Major refactoring to simplify logic and clean code

This commit is contained in:
hyginn 2020-09-24 18:45:22 +10:00
parent f48b6bf3b7
commit 528dc91407

View File

@ -1,20 +1,17 @@
# tocID <- "RPR-FASTA.R" # tocID <- "RPR-FASTA.R"
# #
# ---------------------------------------------------------------------------- #
# PATIENCE ... #
# Do not yet work with this code. Updates in progress. Thank you. #
# boris.steipe@utoronto.ca #
# ---------------------------------------------------------------------------- #
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the RPR-FASTA unit. # R code accompanying the RPR-FASTA unit.
# #
# Version: 1.0 # Version: 1.1
# #
# Date: 2017 10 14 # Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.1 2020 Maintenance. Rewrite validation logic. Add data
# to utilities. Define AACOLS
# 1.0 New unit. # 1.0 New unit.
# #
# #
@ -30,55 +27,61 @@
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> ------------------------------------- #TOC> -----------------------------------------------------
#TOC> 1 Reading FASTA 39 #TOC> 1 Reading and validating FASTA 45
#TOC> 2 Interpreting FASTA 227 #TOC> 1.1 Validating FASTA 81
#TOC> 3 Writing FASTA 248 #TOC> 2 Parsing FASTA 225
#TOC> 3 Interpreting FASTA 245
#TOC> 4 Writing FASTA 272
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 Reading and validating FASTA ========================================
# = 1 Reading FASTA =======================================================
# FASTA is a text based format, structured in lines that are separated by # FASTA is a text based format, structured in lines that are separated by
# line-feed or paragraph-break characters. Which one of these is used, depends # line-feed or paragraph-break characters. Which one of these is used, depends
# on your operating system. But Rs readLines() function knows how to handle # on your operating system. But R's readLines() function knows how to handle
# these correctly, across platforms. Don't try to read such files "by hand". # these correctly, across platforms. Don't try to read such files "by hand".
# Here is the yeast Mbp1 gene, via SGD. # Here is the yeast Mbp1 gene, via SGD.
file.show("./data/S288C_YDL056W_MBP1_coding.fsa") file.show("./data/S288C_YDL056W_MBP1_coding.fsa")
myFASTA <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa") faMBP1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
# The warning is generated because the programmer who implemented the code to # The warning is generated because the programmer at the NCBI who implemented
# write this FASTA file neglected to place a line-break character after the last # the code to write this FASTA file neglected to place a line-break character
# sequence character. # after the last sequence character. While this is not technically incorrect,
# it is poor practice.
head(myFASTA) head(faMBP1)
# Note that there are NO line-break characters ("\n") at the end of these # Note that there are NO line-break characters ("\n") at the end of these
# strings, readLines() has "consumed" them while reading. # strings, even though they were present in the original file. readLines()
# has "consumed" these characters while reading - but every single line is
# an element of its own in the resulting character vector.
tail(myFASTA) tail(faMBP1)
# Also note that the last line has fewer characters - this means readLines() # Also note that the last line has fewer characters - this means readLines()
# imported the whole line, despite it not being terminated. # imported the whole line, despite it not being terminated by "\n".
# It's very straightforward to work with such data, for example by collapsing # It's very straightforward to work with such data, for example by collapsing
# everything after the first line into a single string ... # everything except the first line into a single string ...
f <- c(myFASTA[1], paste(myFASTA[-1], sep = "", collapse = "")) f <- c(faMBP1[1], paste(faMBP1[-1], sep = "", collapse = ""))
f[1] f[1]
nchar(f[2]) nchar(f[2])
# ... but this is making assumptions that everything in line 2 until the end IS # == 1.1 Validating FASTA ==================================================
# sequence, the whole sequence and nothing but sequence. That assumption can
# break down in many ways: # The code above is making the assumption that everything from line 2 until
# the end IS sequence, the whole sequence and nothing but sequence.
# That assumption can break down in many ways:
# #
# - there could be more than one header line. The specification says otherwise, # - there could be more than one header line. The specification says otherwise,
# but some older files use multiple, consecutive header lines. You don't # but some older files use multiple, consecutive header lines. You don't
@ -95,143 +98,150 @@ nchar(f[2])
# #
# Data "from the wild" can (and usually does) have the most unexpected # Data "from the wild" can (and usually does) have the most unexpected
# variations and it is really, really important to be clear about the # variations and it is really, really important to be clear about the
# assumptions that you are making. Here is the structure of a FASTA file, # assumptions that you are making. It is possible to "fix" things, according
# specified with as few assumptions as possible. # to the "Robustness Principle" :
# "Be conservative in what you send,
# be liberal in what you accept".
# (cf. https://en.wikipedia.org/wiki/Robustness_principle )
# ... but if you think about this, that's actually a really poor idea,
# which is much more likely to dilute standards, make unwarranted
# assumptions, and allow errors to pass silently and corrupt data.
# #
# (1) it contains characters; # Let's discard this principle on the trash-heap of
# (2) there might be lines that begin with characters other than # things-that-sound-like-a-good-idea-but-aren't. What we do instead is test,
# ">", these should be discarded; # identify problems, and follow the principle: "crash early, crash often". Of
# (3) it contains one or more consecutive lines that are sequence blocks; # course I can write code that would reformat any possible input as a FASTA
# (4) each sequence block has one or more header lines; # file - but what good will it do me if it parses the file I receive
# (5) header lines start with ">"; # from a server into FASTA format like:
# (6) no actual sequence data begins with a ">";
# (7) header lines can contain any character;
# (8) sequence lines only contain letters, "-" (gap characters), or "*" (stop).
# #
# This suggests to parse as follows: # >404- Page Not Found</title</head>
# - drop all lines that don't begin with ">" or a letter # dyh-PagentfndhpThepageyreqesteddesnteistnthisserverCheckthe
# - identify consecutive lines that begin ">" and consecutive lines # spellingrcntacttheadministratrsdyhtml
# that do not begin ">" #
# - collapse each set of consecutive lines in-place # Therefore, we write ourselves a FASTA checker that will enforce the following:
# - drop all remaining lines. In this result the odd-indexed elements # (1) a FASTA file contains one or more sequences separated by zero or
# are headers, and the even-indexed elements are sequences. # more empty lines
# (2) a sequence contains one header line followed by
# one or more sequence lines
# (3) a sequence line contains one or more uppercase or lowercase single
# letter amino acid codes, hyphens (gap character), or * (stop).
#
# Anything else should generate an error.
# Let's code this as a function. We need some tool that identifies consecutive # (Case 1): Header(s) exist
# lines of something. The rle() (run-length encoding) function does this. It fX <- c("ABC",
# returns a vector of the length of "runs" in its input: "defghi",
"klmnpq")
sel <- grepl("^>", fX) # "^>" is a regular expression that
# means: the exact character ">" at the
# beginning ("^") of the line.
if ( ! any(sel) ) { stop("no header lines in input.") }
myPets <- c("ant", "bat", "bat", "bat", "cat", "ant", "ant")
(runs <- rle(myPets))
# The cumsum() (cumulative sum) function turns these numbers into indices # (Case 2) No adjacent header lines
# on our original vector. fX <- c(">ABC",
">123",
"defghi",
"klmnpq")
sel <- grepl("^>", fX)
sel <- sel[- length(sel)] & sel[-1] # comparing shifted vectors
if ( any(sel)) { stop("adjacent header lines in input.") }
(idx <- cumsum(runs$lengths)) # (Case 3.1) all sequence lines contain only valid characters
myPets[idx] # note that this is NOT unique ... "ant" appears twice, because # (constants for valid characters AAVALID, NUCVALID, and NUCAMBIG
# there were two separate runs of ants in our input. # are defined with the .utilities.R script)
AAVALID
# So far so good. But our FASTA file's lines are ALL different, so all the runs fX <- c(">ABC",
# will only have length 1 ... "def ;-) ghi",
"klmnpq")
rle(myFASTA)$lengths myRegex <- sprintf("[^%s]", AAVALID) # NOT a valid character
sel <- ! grepl("^>", fX) # NOT headers
# How do we deal with that? Obviously we need to actually analyze the strings we if (any(grepl(myRegex, fX[sel]))) {
# are working with. grepl(<pattern>, <x>) is exactly what we need here. It stop("invalid chracter(s) outside of header lines.")
# produces a vector of booleans, of the same length as the input vector <x>,
# which is TRUE if the element matches the <pattern>, FALSE if not.
grepl("^>", myFASTA) # "^>" is a regular expression that means: ">" at the
# beginning ("^") of the line.
(runs <- rle(grepl("^>", myFASTA)))
# Translating that into start positions of blocks takes a bit of bookkeeping:
# the first start has index 1, the following starts can be calculated from
# cumsum()'s and $length's.
(starts <- c(1, (cumsum(runs$lengths)[-length(runs$lengths)] + 1)))
# ... and with that, we can parse our FASTA data. We take the specification
# above and translate it into code. That's how we develop code: write up step-by-step
# instructions as comments, then implement them one by one.
# Here is an example
FA <- c(">head1 part a", ">head1 part b", "abcdef", "ghi", # two headers
"", # empty line
">head2", "jkl", # one header
">head3", "mno", "pqrs") # two sequence lines
# - drop all lines that don't begin with ">" or a letter, "-", or "*"
FA <- FA[grepl("^[A-Za-z>*-]", FA)]
# - identify consecutive lines that begin ">" and consecutive lines
# that do not begin ">"
runs <- rle(grepl("^>", FA))
starts <- c(1, (cumsum(runs$lengths)[-length(runs$lengths)] + 1))
# - collapse each set of consecutive lines in-place
for (i in seq_along(starts)) {
FA[starts[i]] <- paste(FA[starts[i]:(starts[i] + runs$lengths[i] - 1)],
sep ="",
collapse = "")
} }
# - drop all remaining lines. # (Case 3.2) all headers are followed directly by
FA <- FA[starts] # at least one letter of sequence
fX <- c(">ABC",
"",
">123",
"defghi",
"klmnpq")
sel <- grep("^>", fX) + 1 # indexes of headers + 1
myRegex <- sprintf("[%s]+", AAVALID) # at least one valid character
if (! all(grepl(myRegex, fX[sel]))) {
stop("a header has no adjacent sequence.")
}
# Ah, you might ask - couldn't we just have dropped all empty lines, and
# then caught this in Case 2? No - for two reasons: we would still miss headers
# at the end of file, and, we would have changed the line numbering - and
# ideally our "production" function will create information about where the
# error is to be found.
# In this resulting vector the odd-indexed elements
# are headers, and the even-indexed elements are sequences.
# As a function: # Now combine this into a function ...
readFASTA <- function(IN) { val <- function(fa) {
# Read a FASTA formatted file from IN, remove all non-header, non-sequence
# element, return collapsed sequences.
# Parameters:
# IN chr Input file name (or connection)
# Value:
# chr vector in which the odd-indexed elements are headers, and the
# even-indexed elements are sequences.
FA <- readLines(IN) if ( ! any(grepl("^>", fa)) ) {
FA <- FA[grepl("^[A-Za-z>*-]", FA)] stop("no header lines in input.")
runs <- rle(grepl("^>", FA))
starts <- c(1, (cumsum(runs$lengths)[-length(runs$lengths)] + 1))
for (i in seq_along(starts)) { # collapse runs in-place
FA[starts[i]] <- paste(FA[starts[i]:(starts[i] + runs$lengths[i] - 1)],
sep ="",
collapse = "")
} }
# return collapsed lines sel <- grepl("^>", fa)
return(FA[starts]) if ( any(sel[- length(sel)] & sel[-1])) {
stop("adjacent header lines in input.")
}
sel <- ! grepl("^>", fa)
if ( any(grepl(sprintf("[^%s]", AAVALID), fa[sel]))) {
stop("invalid chracter(s) outside of header lines.")
}
sel <- grep("^>", fa) + 1
if (! all(grepl(sprintf("[%s]+", AAVALID), fa[sel]))) {
stop("a header has no adjacent sequence.")
}
return(invisible(NULL))
} }
# Try this: Let's try to use only the first 3 elements of myFASTA ... it's a # Here is an example
# lengthy sequence. But how? We don't have a file with that contents and the FA <- c(">head1",
# function expects to read from a file. Do we need to write myFASTA[1:3] to a "acdef",
# temporary file and then read it? We could - but wherever a file is expected we "ghi",
# can also pass in a "text connection" from an object in memory, with the "",
# textConnection() function, like so: ">head2",
"kl",
">head3",
"mn",
"pqrs")
validate(FA) # ... should not create an error
readFASTA(textConnection(myFASTA[1:3]))
# Here is a "real" example - a multi FASTA file of aligned APSES domain # a somewhat more elaborate validateFA() function was loaded with the
# sequences: # ./.utilities.R script
(refAPSES <- readFASTA("./data/refAPSES.mfa")) # = 2 Parsing FASTA =======================================================
# Subset all headers: # Once we have validated our assumptions about our input, it's quite
refAPSES[seq(1, length(refAPSES), by = 2)] # painless to parse it. I have put this together as a function and the function
# gets loaded from ./.utilities.R
#
# Let's try this:
# - the first 3 elements of faMBP1:
readFASTA(faMBP1[1:3])
# - a multi FASTA file of aligned APSES domain sequences:
refAPSES <- readFASTA("./data/refAPSES.mfa")
# Subset the sequence with "P39678" in the header # Subset the sequence with "P39678" in the header
refAPSES[grep("P39678", refAPSES) + 1] # grep() the string and add 1 refAPSES[grep("P39678", refAPSES$head) ,]
# = 2 Interpreting FASTA ================================================== # = 3 Interpreting FASTA ==================================================
# FASTA files are straightforward to interpret - just one thing may be of note: # FASTA files are straightforward to interpret - just one thing may be of note:
@ -243,22 +253,28 @@ refAPSES[grep("P39678", refAPSES) + 1] # grep() the string and add 1
# Example: How many positive charged residues in "MBP1_SACCE"? # Example: How many positive charged residues in "MBP1_SACCE"?
s <- unlist(strsplit(refAPSES[grep("MBP1_SACCE", refAPSES) + 1], "")) s <- unlist(strsplit(refAPSES$seq[grep("MBP1_SACCE", refAPSES$head)], ""))
head(s) s
sum(grepl("[HKR]", s)) # 20 (+) charged residues. grepl() returns TRUE and FALSE sum(grepl("[HKR]", s)) # 20 (+) charged residues. grepl() returns TRUE and FALSE
# for the characters, sum() coerces to 1 and 0 # for the characters, sum() coerces to 1 and 0
# respectively, and that gives us the result. # respectively, and that gives us the result.
100 * sum(grepl("[HKR]", s)) / length(s) # in percent: 20.2 % 100 * sum(grepl("[HKR]", s)) / length(s) # in percent: 20.2 %
# residue distribution
x <- factor(s, levels = names(AACOLS))
pie(table(x)[names(AACOLS)], col = AACOLS)
# = 3 Writing FASTA =======================================================
# = 4 Writing FASTA =======================================================
# Writing FASTA files is mostly just the reverse of reading, with one # Writing FASTA files is mostly just the reverse of reading, with one
# twist: we need to break the long sequence string into chunks of the desired # twist: we need to break the long sequence string into chunks of the desired
# width. The FASTA specification calls for a maximum of 120 characters per line, # width. The FASTA specification calls for a maximum of 120 characters per line,
# but writing out much less than that is common since it allows to comfortably # but writing out much less than that is common, since it allows to comfortably
# view lines on the console, or printing them on a sheet of paper (do we still # view lines on the console, or printing them on a sheet of paper (do we still
# do that actually?). How do we break a string into chunks? A combination of # do that actually?). How do we break a string into chunks? A combination of
# seq(<from>, <to>, <by>) with substring(<string>, <start>, <stop>) will work # seq(<from>, <to>, <by>) with substring(<string>, <start>, <stop>) will work
@ -268,7 +284,7 @@ sum(grepl("[HKR]", s)) # 20 (+) charged residues. grepl() returns TRUE and FALSE
# be slow - in that case, we might want to precalculate the size of the output # be slow - in that case, we might want to precalculate the size of the output
# object. But that's more of a hypothetical consideration. # object. But that's more of a hypothetical consideration.
s <- refAPSES[2] ( s <- refAPSES$seq[2] )
nchar(s) nchar(s)
w <- 30 # width of chunk w <- 30 # width of chunk
(starts <- seq(1, nchar(s), by = w)) # starting index of chunk (starts <- seq(1, nchar(s), by = w)) # starting index of chunk
@ -278,37 +294,24 @@ w <- 30 # width of chunk
# What happens if nchar(s) is an exact multiple of w? # What happens if nchar(s) is an exact multiple of w?
substring(s, starts, ends) substring(s, starts, ends)
# confirm that the output contains the first and last residue, and both
# residues adjacent to the breaks
# Here's the function ... # As always, the function has been defined in ".utilities.R" for you to use
# any time... type writeFASTA to examine it.
writeFASTA <- function(s, OUT = stdout(), width = 60) { # Let's try this...
# Write an object "s" that contains one or more header/sequence pairs to file.
# Parameters:
# s chr Vector with a FASTA header string in odd elements,
# sequence in one-letter code in even elements.
# OUT chr connection to be written to; defaults to stdout() i.e.
# output is written console.
# width int max number of sequence characters per line of output.
# Value:
# NA Invoked for side effect of writing data to file
txt <- character() writeFASTA(refAPSES, width = 40)
idx <- seq(1, length(s), by = 2)
for (i in idx) {
txt <- c(txt, s[i]) # add header line to txt
starts <- seq(1, nchar(s[i + 1]), by = width) # starting indices of chunks
ends <- c((starts - 1)[-1], nchar(s[i + 1])) # ending indices of chunks
txt <- c(txt, substring(s[i + 1], starts, ends)) # add chunks to txt
}
writeLines(txt, OUT)
} # roundtrip for validation: write refAPSES with a different format,
# read it back in - the new dataframe must be identical
# Let's try this. If we don't specify OUT, the result is written to the console # to the original dataframe.
# by default. Default width for sequence is 60 characters fname <- tempfile()
writeFASTA(refAPSES, fn = fname, width = 30)
writeFASTA(refAPSES) identical(refAPSES, readFASTA(fname))
# ...works for me :-)
# [END] # [END]