update validateFASTA() logic to accommodate blanks in spacers

This commit is contained in:
hyginn 2020-09-26 00:43:10 +10:00
parent 169ca58b92
commit 092494d9de
2 changed files with 78 additions and 18 deletions

View File

@ -20,7 +20,7 @@
#TOC> ==========================================================================
#TOC>
#TOC>
#TOC> Section Title Line
#TOC> -----------------------------------------------------------
#TOC> 1 SCRIPTS TO SOURCE 50
@ -43,7 +43,7 @@
#TOC> 5.03 selectPDBrep() 438
#TOC> 5.04 selectChi2() 474
#TOC> 5.05 selectENSP() 487
#TOC>
#TOC>
#TOC> ==========================================================================
@ -206,6 +206,7 @@ validateFA <- function(txt) {
#
# The function is used for its side-effect of throwing an error of
# FASTA assumptions are violated in txt.
# - There may be spacer-lines matching "^\\s*$"
# - At least one header line
# - No adjacent header lines
# - All header lines followed by at least one sequence line
@ -216,34 +217,92 @@ validateFA <- function(txt) {
stop("no header lines in input")
}
sel <- grepl("^>", txt)
sel <- sel[- length(sel)] & sel[-1]
isHeader <- grepl("^>", txt)
isSpacer <- grepl("^\\s*$", txt)
txtNoSpc <- txt[! isSpacer]
# adjacent headers
sel <- isHeader[- length(isHeader)] & isHeader[-1]
if ( any(sel) ) {
i <- which(sel)[1]
stop(sprintf("adjacent header lines in input (lines %d and %d)",
i, i+1))
}
selA <- grepl("^>", txt)
# invalid character in a line that is not header or spacer
selA <- isHeader | isSpacer
selB <- grepl(sprintf("[^%s]", AAVALID), txt)
if ( any( (! selA) & selB) ) { # (not header) AND (has invalid character)
i <- which( (! selA) & selB)[1]
stop(sprintf("invalid character(s) in sequence (cf. line %d)",
i, i+1))
stop(sprintf("invalid character(s) in sequence (cf. line %d)", i))
}
sel <- grep("^>", txt) + 1
sel <- grepl(sprintf("[%s]+", AAVALID), txt[sel])
if ( ! all(sel) ) {
i <- which( ! sel)[1]
stop(sprintf("a header has no adjacent sequence (line %d)",
grep("^>", txt)[i]))
# no spacer should immediately follow a header
sel <- c(isSpacer, FALSE) & c(FALSE, isHeader)
if ( any(sel) ) {
i <- which(sel)[1]
stop(sprintf("a header has no adjacent sequence (line %d)", i - 1))
}
# no header alone on the last line
if ( which(isHeader)[sum(isHeader)] == length(txt)) {
stop(sprintf("a header is alone on the last line (line %d)", length(txt)))
}
# no spacer in a block of sequence
# (tested as: not followed by spacer or header)
if (sum(isSpacer) > 0) {
sel <- which(isSpacer) + 1
sel <- sel[sel <= length(txt)]
if ( ! all(isSpacer[sel] | isHeader[sel]) ) {
i <- which(! (isSpacer[sel] | isHeader[sel]))[1]
stop(sprintf("a spacer is followed by sequence (line %d)",
which(isSpacer)[i]))
}
}
# all good, if we get to here.
return(invisible(NULL))
}
if (FALSE) {
# Should validate
fa <- c(">abc", "de")
validateFA(fa)
fa <- c(">abc", "de", "fgh", ">ojz", "ikl")
validateFA(fa)
fa <- c("", ">abc", "de", "fgh", "", ">ojz", "ikl", "")
validateFA(fa)
fa <- c(" ", ">abc", "de", "fgh", " ", ">ojz", "ikl", "\t")
validateFA(fa)
# should fail
fa <- c("abc", "de")
validateFA(fa)
fa <- c(">abc", "de", ">fgh", ">ojz", "ikl")
validateFA(fa)
fa <- c("", ">abc", "de", "f_h", "", ">ojz", "ikl", "")
validateFA(fa)
fa <- c(">abc", " ", "de", "fgh", " ", ">ojz", "ikl", "\t")
validateFA(fa)
fa <- c(" ", ">abc", "de", "fgh", " ", ">ojz")
validateFA(fa)
fa <- c(" ", ">abc", "de", " ", "fgh", ">ojz", "ikl", "\t")
validateFA(fa)
}
# == 4.05 readFASTA() ======================================================

View File

@ -32,8 +32,8 @@
#TOC>
#TOC> Section Title Line
#TOC> -----------------------------------------------------
#TOC> 1 Reading and validating FASTA 45
#TOC> 1.1 Validating FASTA 81
#TOC> 1 Reading and validating FASTA 44
#TOC> 1.1 Validating FASTA 80
#TOC> 2 Parsing FASTA 225
#TOC> 3 Interpreting FASTA 245
#TOC> 4 Writing FASTA 272
@ -218,8 +218,9 @@ FA <- c(">head1",
validate(FA) # ... should not create an error
# a somewhat more elaborate validateFA() function was loaded with the
# ./utilities.R script
# A somewhat more elaborate validateFA() function was loaded with the
# ./utilities.R script. It needs a bit more bookkeeping, since NCBI multi-
# fasta files have space-characters in their spacer lines.
# = 2 Parsing FASTA =======================================================
@ -271,7 +272,7 @@ pie(table(x)[names(AACOLS)], col = AACOLS)
# = 4 Writing FASTA =======================================================
# Writing FASTA files mostly just the revrese reverse of reading, with one
# Writing FASTA files is mostly just the reverse of reading, with one
# twist: we need to break the long sequence string into chunks of the desired
# width. The FASTA specification calls for a maximum of 120 characters per line,
# but writing out much less than that is common, since it allows to comfortably