update validateFASTA() logic to accommodate blanks in spacers

2020-09-26 00:43:10 +10:00
parent 169ca58b92
commit 092494d9de
2 changed files with 78 additions and 18 deletions
--- a/.utilities.R
+++ b/.utilities.R
@@ -20,7 +20,7 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                                       Line
 #TOC> -----------------------------------------------------------
 #TOC>   1        SCRIPTS TO SOURCE                             50
@@ -43,7 +43,7 @@
 #TOC>   5.03       selectPDBrep()                             438
 #TOC>   5.04       selectChi2()                               474
 #TOC>   5.05       selectENSP()                               487
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


@@ -206,6 +206,7 @@ validateFA <- function(txt) {
  #
  # The function is used for its side-effect of throwing an error of
  # FASTA assumptions are violated in txt.
+  # - There may be spacer-lines matching "^\\s*$"
  # - At least one header line
  # - No adjacent header lines
  # - All header lines followed by at least one sequence line
@@ -216,34 +217,92 @@ validateFA <- function(txt) {
    stop("no header lines in input")
  }

-  sel <- grepl("^>", txt)
-  sel <- sel[- length(sel)] & sel[-1]
+  isHeader <- grepl("^>", txt)
+  isSpacer <- grepl("^\\s*$", txt)
+  txtNoSpc <- txt[! isSpacer]
+
+  # adjacent headers
+  sel <- isHeader[- length(isHeader)] & isHeader[-1]
  if ( any(sel) ) {
    i <- which(sel)[1]
    stop(sprintf("adjacent header lines in input (lines %d and %d)",
                 i, i+1))
  }

-  selA <- grepl("^>", txt)
+  # invalid character in a line that is not header or spacer
+  selA <- isHeader | isSpacer
  selB <- grepl(sprintf("[^%s]", AAVALID), txt)
  if ( any( (! selA) & selB) ) {  # (not header) AND (has invalid character)
    i <- which( (! selA) & selB)[1]
-    stop(sprintf("invalid character(s) in sequence (cf. line %d)",
-                 i, i+1))
+    stop(sprintf("invalid character(s) in sequence (cf. line %d)", i))
  }

-  sel <- grep("^>", txt) + 1
-  sel <- grepl(sprintf("[%s]+", AAVALID), txt[sel])
-  if ( ! all(sel) ) {
-    i <- which( ! sel)[1]
-    stop(sprintf("a header has no adjacent sequence (line %d)",
-                 grep("^>", txt)[i]))
+  # no spacer should immediately follow a header
+  sel <- c(isSpacer, FALSE) & c(FALSE, isHeader)
+  if ( any(sel) ) {
+    i <- which(sel)[1]
+    stop(sprintf("a header has no adjacent sequence (line %d)", i - 1))
+  }
+
+  # no header alone on the last line
+  if ( which(isHeader)[sum(isHeader)] == length(txt)) {
+    stop(sprintf("a header is alone on the last line (line %d)", length(txt)))
+  }
+
+  # no spacer in a block of sequence
+  # (tested as: not followed by spacer or header)
+
+  if (sum(isSpacer) > 0) {
+    sel <- which(isSpacer) + 1
+    sel <- sel[sel <= length(txt)]
+    if ( ! all(isSpacer[sel] | isHeader[sel]) ) {
+      i <- which(! (isSpacer[sel] | isHeader[sel]))[1]
+      stop(sprintf("a spacer is followed by sequence (line %d)",
+                   which(isSpacer)[i]))
+    }
+
  }

  # all good, if we get to here.
  return(invisible(NULL))
 }

+if (FALSE) {
+
+  # Should validate
+  fa <- c(">abc", "de")
+  validateFA(fa)
+
+  fa <- c(">abc", "de", "fgh", ">ojz", "ikl")
+  validateFA(fa)
+
+  fa <- c("", ">abc", "de", "fgh", "", ">ojz", "ikl", "")
+  validateFA(fa)
+
+  fa <- c(" ", ">abc", "de", "fgh", "   ", ">ojz", "ikl", "\t")
+  validateFA(fa)
+
+  # should fail
+  fa <- c("abc", "de")
+  validateFA(fa)
+
+  fa <- c(">abc", "de", ">fgh", ">ojz", "ikl")
+  validateFA(fa)
+
+  fa <- c("", ">abc", "de", "f_h", "", ">ojz", "ikl", "")
+  validateFA(fa)
+
+  fa <- c(">abc", " ", "de", "fgh", "   ", ">ojz", "ikl", "\t")
+  validateFA(fa)
+
+  fa <- c(" ", ">abc", "de", "fgh", "   ", ">ojz")
+  validateFA(fa)
+
+  fa <- c(" ", ">abc", "de", "   ", "fgh", ">ojz", "ikl", "\t")
+  validateFA(fa)
+
+}
+

 # ==   4.05  readFASTA()  ======================================================

--- a/RPR-FASTA.R
+++ b/RPR-FASTA.R
@@ -32,8 +32,8 @@
 #TOC> 
 #TOC>   Section  Title                                 Line
 #TOC> -----------------------------------------------------
-#TOC>   1        Reading and validating FASTA            45
-#TOC>   1.1        Validating FASTA                      81
+#TOC>   1        Reading and validating FASTA            44
+#TOC>   1.1        Validating FASTA                      80
 #TOC>   2        Parsing FASTA                          225
 #TOC>   3        Interpreting FASTA                     245
 #TOC>   4        Writing FASTA                          272
@@ -218,8 +218,9 @@ FA <- c(">head1",
 validate(FA)     # ... should not create an error


-# a somewhat more elaborate validateFA() function was loaded with the
-# ./utilities.R script
+# A somewhat more elaborate validateFA() function was loaded with the
+# ./utilities.R script. It needs a bit more bookkeeping, since NCBI multi-
+# fasta files have space-characters in their spacer lines.

 # =    2  Parsing FASTA  =======================================================

@@ -271,7 +272,7 @@ pie(table(x)[names(AACOLS)], col = AACOLS)
 # =    4  Writing FASTA  =======================================================


-# Writing FASTA files mostly just the revrese reverse of reading, with one
+# Writing FASTA files is mostly just the reverse of reading, with one
 # twist: we need to break the long sequence string into chunks of the desired
 # width. The FASTA specification calls for a maximum of 120 characters per line,
 # but writing out much less than that is common, since it allows to comfortably