update validateFASTA() logic to accommodate blanks in spacers

2020-09-26 00:43:10 +10:00
parent 169ca58b92
commit 092494d9de
2 changed files with 78 additions and 18 deletions
--- a/.utilities.R
+++ b/.utilities.R
@@ -206,6 +206,7 @@ validateFA <- function(txt) {
  #
  # The function is used for its side-effect of throwing an error of
  # FASTA assumptions are violated in txt.
  # - There may be spacer-lines matching "^\\s*$"
  # - At least one header line
  # - No adjacent header lines
  # - All header lines followed by at least one sequence line
@@ -216,34 +217,92 @@ validateFA <- function(txt) {
    stop("no header lines in input")
  }
-  sel <- grepl("^>", txt)
+  isHeader <- grepl("^>", txt)
-  sel <- sel[- length(sel)] & sel[-1]
+  isSpacer <- grepl("^\\s*$", txt)
  txtNoSpc <- txt[! isSpacer]
  # adjacent headers
  sel <- isHeader[- length(isHeader)] & isHeader[-1]
  if ( any(sel) ) {
    i <- which(sel)[1]
    stop(sprintf("adjacent header lines in input (lines %d and %d)",
                 i, i+1))
  }
-  selA <- grepl("^>", txt)
+  # invalid character in a line that is not header or spacer
  selA <- isHeader | isSpacer
  selB <- grepl(sprintf("[^%s]", AAVALID), txt)
  if ( any( (! selA) & selB) ) {  # (not header) AND (has invalid character)
    i <- which( (! selA) & selB)[1]
-    stop(sprintf("invalid character(s) in sequence (cf. line %d)",
+    stop(sprintf("invalid character(s) in sequence (cf. line %d)", i))
-                 i, i+1))
+  }
  # no spacer should immediately follow a header
  sel <- c(isSpacer, FALSE) & c(FALSE, isHeader)
  if ( any(sel) ) {
    i <- which(sel)[1]
    stop(sprintf("a header has no adjacent sequence (line %d)", i - 1))
  }
  # no header alone on the last line
  if ( which(isHeader)[sum(isHeader)] == length(txt)) {
    stop(sprintf("a header is alone on the last line (line %d)", length(txt)))
  }
  # no spacer in a block of sequence
  # (tested as: not followed by spacer or header)
  if (sum(isSpacer) > 0) {
    sel <- which(isSpacer) + 1
    sel <- sel[sel <= length(txt)]
    if ( ! all(isSpacer[sel] | isHeader[sel]) ) {
      i <- which(! (isSpacer[sel] | isHeader[sel]))[1]
      stop(sprintf("a spacer is followed by sequence (line %d)",
                   which(isSpacer)[i]))
    }
  sel <- grep("^>", txt) + 1
  sel <- grepl(sprintf("[%s]+", AAVALID), txt[sel])
  if ( ! all(sel) ) {
    i <- which( ! sel)[1]
    stop(sprintf("a header has no adjacent sequence (line %d)",
                 grep("^>", txt)[i]))
  }
  # all good, if we get to here.
  return(invisible(NULL))
 }
 if (FALSE) {
  # Should validate
  fa <- c(">abc", "de")
  validateFA(fa)
  fa <- c(">abc", "de", "fgh", ">ojz", "ikl")
  validateFA(fa)
  fa <- c("", ">abc", "de", "fgh", "", ">ojz", "ikl", "")
  validateFA(fa)
  fa <- c(" ", ">abc", "de", "fgh", "   ", ">ojz", "ikl", "\t")
  validateFA(fa)
  # should fail
  fa <- c("abc", "de")
  validateFA(fa)
  fa <- c(">abc", "de", ">fgh", ">ojz", "ikl")
  validateFA(fa)
  fa <- c("", ">abc", "de", "f_h", "", ">ojz", "ikl", "")
  validateFA(fa)
  fa <- c(">abc", " ", "de", "fgh", "   ", ">ojz", "ikl", "\t")
  validateFA(fa)
  fa <- c(" ", ">abc", "de", "fgh", "   ", ">ojz")
  validateFA(fa)
  fa <- c(" ", ">abc", "de", "   ", "fgh", ">ojz", "ikl", "\t")
  validateFA(fa)
 }
 # ==   4.05  readFASTA()  ======================================================
--- a/RPR-FASTA.R
+++ b/RPR-FASTA.R
@@ -32,8 +32,8 @@
 #TOC> 
 #TOC>   Section  Title                                 Line
 #TOC> -----------------------------------------------------
-#TOC>   1        Reading and validating FASTA            45
+#TOC>   1        Reading and validating FASTA            44
-#TOC>   1.1        Validating FASTA                      81
+#TOC>   1.1        Validating FASTA                      80
 #TOC>   2        Parsing FASTA                          225
 #TOC>   3        Interpreting FASTA                     245
 #TOC>   4        Writing FASTA                          272
@@ -218,8 +218,9 @@ FA <- c(">head1",
 validate(FA)     # ... should not create an error
-# a somewhat more elaborate validateFA() function was loaded with the
+# A somewhat more elaborate validateFA() function was loaded with the
-# ./utilities.R script
+# ./utilities.R script. It needs a bit more bookkeeping, since NCBI multi-
 # fasta files have space-characters in their spacer lines.
 # =    2  Parsing FASTA  =======================================================
@@ -271,7 +272,7 @@ pie(table(x)[names(AACOLS)], col = AACOLS)
 # =    4  Writing FASTA  =======================================================
-# Writing FASTA files mostly just the revrese reverse of reading, with one
+# Writing FASTA files is mostly just the reverse of reading, with one
 # twist: we need to break the long sequence string into chunks of the desired
 # width. The FASTA specification calls for a maximum of 120 characters per line,
 # but writing out much less than that is common, since it allows to comfortably