From 092494d9de3afa8a4e72d3f5be98c32c5b440186 Mon Sep 17 00:00:00 2001 From: hyginn Date: Sat, 26 Sep 2020 00:43:10 +1000 Subject: [PATCH] update validateFASTA() logic to accommodate blanks in spacers --- .utilities.R | 85 ++++++++++++++++++++++++++++++++++++++++++++-------- RPR-FASTA.R | 11 +++---- 2 files changed, 78 insertions(+), 18 deletions(-) diff --git a/.utilities.R b/.utilities.R index 5dde8df..f3db10e 100644 --- a/.utilities.R +++ b/.utilities.R @@ -20,7 +20,7 @@ #TOC> ========================================================================== -#TOC> +#TOC> #TOC> Section Title Line #TOC> ----------------------------------------------------------- #TOC> 1 SCRIPTS TO SOURCE 50 @@ -43,7 +43,7 @@ #TOC> 5.03 selectPDBrep() 438 #TOC> 5.04 selectChi2() 474 #TOC> 5.05 selectENSP() 487 -#TOC> +#TOC> #TOC> ========================================================================== @@ -206,6 +206,7 @@ validateFA <- function(txt) { # # The function is used for its side-effect of throwing an error of # FASTA assumptions are violated in txt. + # - There may be spacer-lines matching "^\\s*$" # - At least one header line # - No adjacent header lines # - All header lines followed by at least one sequence line @@ -216,34 +217,92 @@ validateFA <- function(txt) { stop("no header lines in input") } - sel <- grepl("^>", txt) - sel <- sel[- length(sel)] & sel[-1] + isHeader <- grepl("^>", txt) + isSpacer <- grepl("^\\s*$", txt) + txtNoSpc <- txt[! isSpacer] + + # adjacent headers + sel <- isHeader[- length(isHeader)] & isHeader[-1] if ( any(sel) ) { i <- which(sel)[1] stop(sprintf("adjacent header lines in input (lines %d and %d)", i, i+1)) } - selA <- grepl("^>", txt) + # invalid character in a line that is not header or spacer + selA <- isHeader | isSpacer selB <- grepl(sprintf("[^%s]", AAVALID), txt) if ( any( (! selA) & selB) ) { # (not header) AND (has invalid character) i <- which( (! selA) & selB)[1] - stop(sprintf("invalid character(s) in sequence (cf. line %d)", - i, i+1)) + stop(sprintf("invalid character(s) in sequence (cf. line %d)", i)) } - sel <- grep("^>", txt) + 1 - sel <- grepl(sprintf("[%s]+", AAVALID), txt[sel]) - if ( ! all(sel) ) { - i <- which( ! sel)[1] - stop(sprintf("a header has no adjacent sequence (line %d)", - grep("^>", txt)[i])) + # no spacer should immediately follow a header + sel <- c(isSpacer, FALSE) & c(FALSE, isHeader) + if ( any(sel) ) { + i <- which(sel)[1] + stop(sprintf("a header has no adjacent sequence (line %d)", i - 1)) + } + + # no header alone on the last line + if ( which(isHeader)[sum(isHeader)] == length(txt)) { + stop(sprintf("a header is alone on the last line (line %d)", length(txt))) + } + + # no spacer in a block of sequence + # (tested as: not followed by spacer or header) + + if (sum(isSpacer) > 0) { + sel <- which(isSpacer) + 1 + sel <- sel[sel <= length(txt)] + if ( ! all(isSpacer[sel] | isHeader[sel]) ) { + i <- which(! (isSpacer[sel] | isHeader[sel]))[1] + stop(sprintf("a spacer is followed by sequence (line %d)", + which(isSpacer)[i])) + } + } # all good, if we get to here. return(invisible(NULL)) } +if (FALSE) { + + # Should validate + fa <- c(">abc", "de") + validateFA(fa) + + fa <- c(">abc", "de", "fgh", ">ojz", "ikl") + validateFA(fa) + + fa <- c("", ">abc", "de", "fgh", "", ">ojz", "ikl", "") + validateFA(fa) + + fa <- c(" ", ">abc", "de", "fgh", " ", ">ojz", "ikl", "\t") + validateFA(fa) + + # should fail + fa <- c("abc", "de") + validateFA(fa) + + fa <- c(">abc", "de", ">fgh", ">ojz", "ikl") + validateFA(fa) + + fa <- c("", ">abc", "de", "f_h", "", ">ojz", "ikl", "") + validateFA(fa) + + fa <- c(">abc", " ", "de", "fgh", " ", ">ojz", "ikl", "\t") + validateFA(fa) + + fa <- c(" ", ">abc", "de", "fgh", " ", ">ojz") + validateFA(fa) + + fa <- c(" ", ">abc", "de", " ", "fgh", ">ojz", "ikl", "\t") + validateFA(fa) + +} + # == 4.05 readFASTA() ====================================================== diff --git a/RPR-FASTA.R b/RPR-FASTA.R index 41a8ea8..c933fc2 100644 --- a/RPR-FASTA.R +++ b/RPR-FASTA.R @@ -32,8 +32,8 @@ #TOC> #TOC> Section Title Line #TOC> ----------------------------------------------------- -#TOC> 1 Reading and validating FASTA 45 -#TOC> 1.1 Validating FASTA 81 +#TOC> 1 Reading and validating FASTA 44 +#TOC> 1.1 Validating FASTA 80 #TOC> 2 Parsing FASTA 225 #TOC> 3 Interpreting FASTA 245 #TOC> 4 Writing FASTA 272 @@ -218,8 +218,9 @@ FA <- c(">head1", validate(FA) # ... should not create an error -# a somewhat more elaborate validateFA() function was loaded with the -# ./utilities.R script +# A somewhat more elaborate validateFA() function was loaded with the +# ./utilities.R script. It needs a bit more bookkeeping, since NCBI multi- +# fasta files have space-characters in their spacer lines. # = 2 Parsing FASTA ======================================================= @@ -271,7 +272,7 @@ pie(table(x)[names(AACOLS)], col = AACOLS) # = 4 Writing FASTA ======================================================= -# Writing FASTA files mostly just the revrese reverse of reading, with one +# Writing FASTA files is mostly just the reverse of reading, with one # twist: we need to break the long sequence string into chunks of the desired # width. The FASTA specification calls for a maximum of 120 characters per line, # but writing out much less than that is common, since it allows to comfortably