update validateFASTA() logic to accommodate blanks in spacers
This commit is contained in:
parent
169ca58b92
commit
092494d9de
81
.utilities.R
81
.utilities.R
@ -206,6 +206,7 @@ validateFA <- function(txt) {
|
|||||||
#
|
#
|
||||||
# The function is used for its side-effect of throwing an error of
|
# The function is used for its side-effect of throwing an error of
|
||||||
# FASTA assumptions are violated in txt.
|
# FASTA assumptions are violated in txt.
|
||||||
|
# - There may be spacer-lines matching "^\\s*$"
|
||||||
# - At least one header line
|
# - At least one header line
|
||||||
# - No adjacent header lines
|
# - No adjacent header lines
|
||||||
# - All header lines followed by at least one sequence line
|
# - All header lines followed by at least one sequence line
|
||||||
@ -216,34 +217,92 @@ validateFA <- function(txt) {
|
|||||||
stop("no header lines in input")
|
stop("no header lines in input")
|
||||||
}
|
}
|
||||||
|
|
||||||
sel <- grepl("^>", txt)
|
isHeader <- grepl("^>", txt)
|
||||||
sel <- sel[- length(sel)] & sel[-1]
|
isSpacer <- grepl("^\\s*$", txt)
|
||||||
|
txtNoSpc <- txt[! isSpacer]
|
||||||
|
|
||||||
|
# adjacent headers
|
||||||
|
sel <- isHeader[- length(isHeader)] & isHeader[-1]
|
||||||
if ( any(sel) ) {
|
if ( any(sel) ) {
|
||||||
i <- which(sel)[1]
|
i <- which(sel)[1]
|
||||||
stop(sprintf("adjacent header lines in input (lines %d and %d)",
|
stop(sprintf("adjacent header lines in input (lines %d and %d)",
|
||||||
i, i+1))
|
i, i+1))
|
||||||
}
|
}
|
||||||
|
|
||||||
selA <- grepl("^>", txt)
|
# invalid character in a line that is not header or spacer
|
||||||
|
selA <- isHeader | isSpacer
|
||||||
selB <- grepl(sprintf("[^%s]", AAVALID), txt)
|
selB <- grepl(sprintf("[^%s]", AAVALID), txt)
|
||||||
if ( any( (! selA) & selB) ) { # (not header) AND (has invalid character)
|
if ( any( (! selA) & selB) ) { # (not header) AND (has invalid character)
|
||||||
i <- which( (! selA) & selB)[1]
|
i <- which( (! selA) & selB)[1]
|
||||||
stop(sprintf("invalid character(s) in sequence (cf. line %d)",
|
stop(sprintf("invalid character(s) in sequence (cf. line %d)", i))
|
||||||
i, i+1))
|
}
|
||||||
|
|
||||||
|
# no spacer should immediately follow a header
|
||||||
|
sel <- c(isSpacer, FALSE) & c(FALSE, isHeader)
|
||||||
|
if ( any(sel) ) {
|
||||||
|
i <- which(sel)[1]
|
||||||
|
stop(sprintf("a header has no adjacent sequence (line %d)", i - 1))
|
||||||
|
}
|
||||||
|
|
||||||
|
# no header alone on the last line
|
||||||
|
if ( which(isHeader)[sum(isHeader)] == length(txt)) {
|
||||||
|
stop(sprintf("a header is alone on the last line (line %d)", length(txt)))
|
||||||
|
}
|
||||||
|
|
||||||
|
# no spacer in a block of sequence
|
||||||
|
# (tested as: not followed by spacer or header)
|
||||||
|
|
||||||
|
if (sum(isSpacer) > 0) {
|
||||||
|
sel <- which(isSpacer) + 1
|
||||||
|
sel <- sel[sel <= length(txt)]
|
||||||
|
if ( ! all(isSpacer[sel] | isHeader[sel]) ) {
|
||||||
|
i <- which(! (isSpacer[sel] | isHeader[sel]))[1]
|
||||||
|
stop(sprintf("a spacer is followed by sequence (line %d)",
|
||||||
|
which(isSpacer)[i]))
|
||||||
}
|
}
|
||||||
|
|
||||||
sel <- grep("^>", txt) + 1
|
|
||||||
sel <- grepl(sprintf("[%s]+", AAVALID), txt[sel])
|
|
||||||
if ( ! all(sel) ) {
|
|
||||||
i <- which( ! sel)[1]
|
|
||||||
stop(sprintf("a header has no adjacent sequence (line %d)",
|
|
||||||
grep("^>", txt)[i]))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# all good, if we get to here.
|
# all good, if we get to here.
|
||||||
return(invisible(NULL))
|
return(invisible(NULL))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (FALSE) {
|
||||||
|
|
||||||
|
# Should validate
|
||||||
|
fa <- c(">abc", "de")
|
||||||
|
validateFA(fa)
|
||||||
|
|
||||||
|
fa <- c(">abc", "de", "fgh", ">ojz", "ikl")
|
||||||
|
validateFA(fa)
|
||||||
|
|
||||||
|
fa <- c("", ">abc", "de", "fgh", "", ">ojz", "ikl", "")
|
||||||
|
validateFA(fa)
|
||||||
|
|
||||||
|
fa <- c(" ", ">abc", "de", "fgh", " ", ">ojz", "ikl", "\t")
|
||||||
|
validateFA(fa)
|
||||||
|
|
||||||
|
# should fail
|
||||||
|
fa <- c("abc", "de")
|
||||||
|
validateFA(fa)
|
||||||
|
|
||||||
|
fa <- c(">abc", "de", ">fgh", ">ojz", "ikl")
|
||||||
|
validateFA(fa)
|
||||||
|
|
||||||
|
fa <- c("", ">abc", "de", "f_h", "", ">ojz", "ikl", "")
|
||||||
|
validateFA(fa)
|
||||||
|
|
||||||
|
fa <- c(">abc", " ", "de", "fgh", " ", ">ojz", "ikl", "\t")
|
||||||
|
validateFA(fa)
|
||||||
|
|
||||||
|
fa <- c(" ", ">abc", "de", "fgh", " ", ">ojz")
|
||||||
|
validateFA(fa)
|
||||||
|
|
||||||
|
fa <- c(" ", ">abc", "de", " ", "fgh", ">ojz", "ikl", "\t")
|
||||||
|
validateFA(fa)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
# == 4.05 readFASTA() ======================================================
|
# == 4.05 readFASTA() ======================================================
|
||||||
|
|
||||||
|
11
RPR-FASTA.R
11
RPR-FASTA.R
@ -32,8 +32,8 @@
|
|||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> -----------------------------------------------------
|
#TOC> -----------------------------------------------------
|
||||||
#TOC> 1 Reading and validating FASTA 45
|
#TOC> 1 Reading and validating FASTA 44
|
||||||
#TOC> 1.1 Validating FASTA 81
|
#TOC> 1.1 Validating FASTA 80
|
||||||
#TOC> 2 Parsing FASTA 225
|
#TOC> 2 Parsing FASTA 225
|
||||||
#TOC> 3 Interpreting FASTA 245
|
#TOC> 3 Interpreting FASTA 245
|
||||||
#TOC> 4 Writing FASTA 272
|
#TOC> 4 Writing FASTA 272
|
||||||
@ -218,8 +218,9 @@ FA <- c(">head1",
|
|||||||
validate(FA) # ... should not create an error
|
validate(FA) # ... should not create an error
|
||||||
|
|
||||||
|
|
||||||
# a somewhat more elaborate validateFA() function was loaded with the
|
# A somewhat more elaborate validateFA() function was loaded with the
|
||||||
# ./utilities.R script
|
# ./utilities.R script. It needs a bit more bookkeeping, since NCBI multi-
|
||||||
|
# fasta files have space-characters in their spacer lines.
|
||||||
|
|
||||||
# = 2 Parsing FASTA =======================================================
|
# = 2 Parsing FASTA =======================================================
|
||||||
|
|
||||||
@ -271,7 +272,7 @@ pie(table(x)[names(AACOLS)], col = AACOLS)
|
|||||||
# = 4 Writing FASTA =======================================================
|
# = 4 Writing FASTA =======================================================
|
||||||
|
|
||||||
|
|
||||||
# Writing FASTA files mostly just the revrese reverse of reading, with one
|
# Writing FASTA files is mostly just the reverse of reading, with one
|
||||||
# twist: we need to break the long sequence string into chunks of the desired
|
# twist: we need to break the long sequence string into chunks of the desired
|
||||||
# width. The FASTA specification calls for a maximum of 120 characters per line,
|
# width. The FASTA specification calls for a maximum of 120 characters per line,
|
||||||
# but writing out much less than that is common, since it allows to comfortably
|
# but writing out much less than that is common, since it allows to comfortably
|
||||||
|
Loading…
Reference in New Issue
Block a user