From 092494d9de3afa8a4e72d3f5be98c32c5b440186 Mon Sep 17 00:00:00 2001
From: hyginn <boris.steipe@utoronto.ca>
Date: Sat, 26 Sep 2020 00:43:10 +1000
Subject: [PATCH] update validateFASTA() logic to accommodate blanks in spacers

---
 .utilities.R | 85 ++++++++++++++++++++++++++++++++++++++++++++--------
 RPR-FASTA.R  | 11 +++----
 2 files changed, 78 insertions(+), 18 deletions(-)

diff --git a/.utilities.R b/.utilities.R
index 5dde8df..f3db10e 100644
--- a/.utilities.R
+++ b/.utilities.R
@@ -20,7 +20,7 @@
 
 
 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                                       Line
 #TOC> -----------------------------------------------------------
 #TOC>   1        SCRIPTS TO SOURCE                             50
@@ -43,7 +43,7 @@
 #TOC>   5.03       selectPDBrep()                             438
 #TOC>   5.04       selectChi2()                               474
 #TOC>   5.05       selectENSP()                               487
-#TOC> 
+#TOC>
 #TOC> ==========================================================================
 
 
@@ -206,6 +206,7 @@ validateFA <- function(txt) {
   #
   # The function is used for its side-effect of throwing an error of
   # FASTA assumptions are violated in txt.
+  # - There may be spacer-lines matching "^\\s*$"
   # - At least one header line
   # - No adjacent header lines
   # - All header lines followed by at least one sequence line
@@ -216,34 +217,92 @@ validateFA <- function(txt) {
     stop("no header lines in input")
   }
 
-  sel <- grepl("^>", txt)
-  sel <- sel[- length(sel)] & sel[-1]
+  isHeader <- grepl("^>", txt)
+  isSpacer <- grepl("^\\s*$", txt)
+  txtNoSpc <- txt[! isSpacer]
+
+  # adjacent headers
+  sel <- isHeader[- length(isHeader)] & isHeader[-1]
   if ( any(sel) ) {
     i <- which(sel)[1]
     stop(sprintf("adjacent header lines in input (lines %d and %d)",
                  i, i+1))
   }
 
-  selA <- grepl("^>", txt)
+  # invalid character in a line that is not header or spacer
+  selA <- isHeader | isSpacer
   selB <- grepl(sprintf("[^%s]", AAVALID), txt)
   if ( any( (! selA) & selB) ) {  # (not header) AND (has invalid character)
     i <- which( (! selA) & selB)[1]
-    stop(sprintf("invalid character(s) in sequence (cf. line %d)",
-                 i, i+1))
+    stop(sprintf("invalid character(s) in sequence (cf. line %d)", i))
   }
 
-  sel <- grep("^>", txt) + 1
-  sel <- grepl(sprintf("[%s]+", AAVALID), txt[sel])
-  if ( ! all(sel) ) {
-    i <- which( ! sel)[1]
-    stop(sprintf("a header has no adjacent sequence (line %d)",
-                 grep("^>", txt)[i]))
+  # no spacer should immediately follow a header
+  sel <- c(isSpacer, FALSE) & c(FALSE, isHeader)
+  if ( any(sel) ) {
+    i <- which(sel)[1]
+    stop(sprintf("a header has no adjacent sequence (line %d)", i - 1))
+  }
+
+  # no header alone on the last line
+  if ( which(isHeader)[sum(isHeader)] == length(txt)) {
+    stop(sprintf("a header is alone on the last line (line %d)", length(txt)))
+  }
+
+  # no spacer in a block of sequence
+  # (tested as: not followed by spacer or header)
+
+  if (sum(isSpacer) > 0) {
+    sel <- which(isSpacer) + 1
+    sel <- sel[sel <= length(txt)]
+    if ( ! all(isSpacer[sel] | isHeader[sel]) ) {
+      i <- which(! (isSpacer[sel] | isHeader[sel]))[1]
+      stop(sprintf("a spacer is followed by sequence (line %d)",
+                   which(isSpacer)[i]))
+    }
+
   }
 
   # all good, if we get to here.
   return(invisible(NULL))
 }
 
+if (FALSE) {
+
+  # Should validate
+  fa <- c(">abc", "de")
+  validateFA(fa)
+
+  fa <- c(">abc", "de", "fgh", ">ojz", "ikl")
+  validateFA(fa)
+
+  fa <- c("", ">abc", "de", "fgh", "", ">ojz", "ikl", "")
+  validateFA(fa)
+
+  fa <- c(" ", ">abc", "de", "fgh", "   ", ">ojz", "ikl", "\t")
+  validateFA(fa)
+
+  # should fail
+  fa <- c("abc", "de")
+  validateFA(fa)
+
+  fa <- c(">abc", "de", ">fgh", ">ojz", "ikl")
+  validateFA(fa)
+
+  fa <- c("", ">abc", "de", "f_h", "", ">ojz", "ikl", "")
+  validateFA(fa)
+
+  fa <- c(">abc", " ", "de", "fgh", "   ", ">ojz", "ikl", "\t")
+  validateFA(fa)
+
+  fa <- c(" ", ">abc", "de", "fgh", "   ", ">ojz")
+  validateFA(fa)
+
+  fa <- c(" ", ">abc", "de", "   ", "fgh", ">ojz", "ikl", "\t")
+  validateFA(fa)
+
+}
+
 
 # ==   4.05  readFASTA()  ======================================================
 
diff --git a/RPR-FASTA.R b/RPR-FASTA.R
index 41a8ea8..c933fc2 100644
--- a/RPR-FASTA.R
+++ b/RPR-FASTA.R
@@ -32,8 +32,8 @@
 #TOC> 
 #TOC>   Section  Title                                 Line
 #TOC> -----------------------------------------------------
-#TOC>   1        Reading and validating FASTA            45
-#TOC>   1.1        Validating FASTA                      81
+#TOC>   1        Reading and validating FASTA            44
+#TOC>   1.1        Validating FASTA                      80
 #TOC>   2        Parsing FASTA                          225
 #TOC>   3        Interpreting FASTA                     245
 #TOC>   4        Writing FASTA                          272
@@ -218,8 +218,9 @@ FA <- c(">head1",
 validate(FA)     # ... should not create an error
 
 
-# a somewhat more elaborate validateFA() function was loaded with the
-# ./utilities.R script
+# A somewhat more elaborate validateFA() function was loaded with the
+# ./utilities.R script. It needs a bit more bookkeeping, since NCBI multi-
+# fasta files have space-characters in their spacer lines.
 
 # =    2  Parsing FASTA  =======================================================
 
@@ -271,7 +272,7 @@ pie(table(x)[names(AACOLS)], col = AACOLS)
 # =    4  Writing FASTA  =======================================================
 
 
-# Writing FASTA files mostly just the revrese reverse of reading, with one
+# Writing FASTA files is mostly just the reverse of reading, with one
 # twist: we need to break the long sequence string into chunks of the desired
 # width. The FASTA specification calls for a maximum of 120 characters per line,
 # but writing out much less than that is common, since it allows to comfortably