Major refactoring to simplify logic and clean code

2020-09-24 18:45:22 +10:00
parent f48b6bf3b7
commit 528dc91407
1 changed files with 177 additions and 174 deletions
--- a/RPR-FASTA.R
+++ b/RPR-FASTA.R
@@ -1,20 +1,17 @@
 # tocID <- "RPR-FASTA.R"
 #
-# ---------------------------------------------------------------------------- #
-#  PATIENCE  ...                                                               #
-#    Do not yet work wih this code. Updates in progress. Thank you.            #
-#    boris.steipe@utoronto.ca                                                  #
-# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the RPR-FASTA unit.
 #
-# Version:  1.0
+# Version:  1.1
 #
-# Date:     2017  10  14
+# Date:     2017-10  -  2020-09
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
+#           1.1    2020 Maintenance. Rewrite validation logic. Add data
+#                  to utilities. Define AACOLS
 #           1.0    New unit.
 #
 #
@@ -30,55 +27,61 @@
 #
 # ==============================================================================

+
 #TOC> ==========================================================================
-#TOC>
-#TOC>   Section  Title                 Line
-#TOC> -------------------------------------
-#TOC>   1        Reading FASTA           39
-#TOC>   2        Interpreting FASTA     227
-#TOC>   3        Writing FASTA          248
-#TOC>
+#TOC> 
+#TOC>   Section  Title                                 Line
+#TOC> -----------------------------------------------------
+#TOC>   1        Reading and validating FASTA            45
+#TOC>   1.1        Validating FASTA                      81
+#TOC>   2        Parsing FASTA                          225
+#TOC>   3        Interpreting FASTA                     245
+#TOC>   4        Writing FASTA                          272
+#TOC> 
 #TOC> ==========================================================================


-
-
-# =    1  Reading FASTA  =======================================================
+# =    1  Reading and validating FASTA  ========================================

 # FASTA is a text based format, structured in lines that are separated by
 # line-feed or paragraph-break characters. Which one of these is used, depends
-# on your operating system. But Rs readLines() function knows how to handle
+# on your operating system. But R's readLines() function knows how to handle
 # these correctly, across platforms. Don't try to read such files "by hand".
 # Here is the yeast Mbp1 gene, via SGD.

 file.show("./data/S288C_YDL056W_MBP1_coding.fsa")
-myFASTA <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
+faMBP1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")

-# The warning is generated because the programmer who implemented the code to
-# write this FASTA file neglected to place a line-break character after the last
-# sequence character.
+# The warning is generated because the programmer at the NCBI who implemented
+# the code to write this FASTA file neglected to place a line-break character
+# after the last sequence character. While this is not technically incorrect,
+# it is poor practice.

-head(myFASTA)
+head(faMBP1)

 # Note that there are NO line-break characters ("\n") at the end of these
-# strings, readLines() has "consumed" them while reading.
+# strings, even though they were present in the original file. readLines()
+# has "consumed" these characters while reading - but every single line is
+# an element of its own in the resulting character vector.

-tail(myFASTA)
+tail(faMBP1)

 # Also note that the last line has fewer characters - this means readLines()
-# imported the whole line, despite it not being terminated.
+# imported the whole line, despite it not being terminated by "\n".

 # It's very straightforward to work with such data, for example by collapsing
-# everything after the first line into a single string ...
+# everything except the first line into a single string ...

-f <- c(myFASTA[1], paste(myFASTA[-1], sep = "", collapse = ""))
+f <- c(faMBP1[1], paste(faMBP1[-1], sep = "", collapse = ""))

 f[1]
 nchar(f[2])

-# ... but this is making assumptions that everything in line 2 until the end IS
-# sequence, the whole sequence and nothing but sequence. That assumption can
-# break down in many ways:
+# ==   1.1  Validating FASTA  ==================================================
+
+# The code above is making the assumption that everything from line 2 until
+#  the end IS sequence, the whole sequence and nothing but sequence.
+#  That assumption can break down in many ways:
 #
 #  - there could be more than one header line. The specification says otherwise,
 #       but some older files use multiple, consecutive header lines. You don't
@@ -95,143 +98,150 @@ nchar(f[2])
 #
 # Data "from the wild" can (and usually does) have the most unexpected
 # variations and it is really, really important to be clear about the
-# assumptions that you are making. Here is the structure of a FASTA file,
-# specified with as few assumptions as possible.
+# assumptions that you are making. It is possible to "fix" things, according
+# to the "Robustness Principle" :
+#      "Be conservative in what you send,
+#       be liberal in what you accept".
+#       (cf. https://en.wikipedia.org/wiki/Robustness_principle )
+# ... but if you think about this, that's actually a really poor idea,
+# which is much more likely to dilute standards, make unwarranted
+# assumptions, and allow errors to pass silently and corrupt data.
 #
-#  (1) it contains characters;
-#  (2) there might be lines that begin with characters other than
-#         ">", these should be discarded;
-#  (3) it contains one or more consecutive lines that are sequence blocks;
-#  (4) each sequence block has one or more header lines;
-#  (5) header lines start with ">";
-#  (6) no actual sequence data begins with a ">";
-#  (7) header lines can contain any character;
-#  (8) sequence lines only contain letters, "-" (gap characters), or "*" (stop).
+# Let's discard this principle on the trash-heap of
+# things-that-sound-like-a-good-idea-but-aren't. What we do instead is test,
+# identify problems, and follow the principle: "crash early, crash often". Of
+# course I can write code that would reformat any possible input as a FASTA
+# file - but what good will it do me if it parses the file I receive
+# from a server into FASTA format like:
 #
-# This suggests to parse as follows:
-# - drop all lines that don't begin with ">" or a letter
-# - identify consecutive lines that begin ">" and consecutive lines
-#     that do not begin ">"
-# - collapse each set of consecutive lines in-place
-# - drop all remaining lines. In this result the odd-indexed elements
-#     are headers, and the even-indexed elements are sequences.
+#   >404- Page Not Found</title</head>
+#   dyh-PagentfndhpThepageyreqesteddesnteistnthisserverCheckthe
+#   spellingrcntacttheadministratrsdyhtml
+#
+# Therefore, we write ourselves a FASTA checker that will enforce the following:
+#   (1) a FASTA file contains one or more sequences separated by zero or
+#       more empty lines
+#   (2) a sequence contains one header line followed by
+#       one or more sequence lines
+#   (3) a sequence line contains one or more uppercase or lowercase single
+#       letter amino acid codes, hyphens (gap character), or * (stop).
+#
+#   Anything else should generate an error.

-# Let's code this as a function. We need some tool that identifies consecutive
-# lines of something. The rle() (run-length encoding) function does this. It
-# returns a vector of the length of "runs" in its input:
+#   (Case 1): Header(s) exist
+fX <- c("ABC",
+        "defghi",
+        "klmnpq")
+sel <- grepl("^>", fX)  # "^>" is a regular expression that
+                        # means: the exact character ">" at the
+                        # beginning ("^") of the line.
+if ( ! any(sel) ) { stop("no header lines in input.") }

-myPets <- c("ant", "bat", "bat", "bat", "cat", "ant", "ant")
-(runs <- rle(myPets))

-# The cumsum() (cumulative sum) function turns these numbers into indices
-# on our original vector.
+#   (Case 2) No adjacent header lines
+fX <- c(">ABC",
+        ">123",
+        "defghi",
+        "klmnpq")
+sel <- grepl("^>", fX)
+sel <- sel[- length(sel)] & sel[-1] # comparing shifted vectors
+if ( any(sel)) { stop("adjacent header lines in input.") }

-(idx <- cumsum(runs$lengths))
-myPets[idx]   # note that this is NOT unique ... "ant" appears twice, because
-              # there were two separate runs of ants in our input.
-
-# So far so good. But our FASTA file's lines are ALL different, so all the runs
-# will only have length 1 ...
-
-rle(myFASTA)$lengths
-
-# How do we deal with that? Obviously we need to actually analyze the strings we
-# are working with. grepl(<pattern>, <x>) is exactly what we need here. It
-# produces a vector of booleans, of the same length as the input vector <x>,
-# which is TRUE if the element matches the <pattern>, FALSE if not.
-
-grepl("^>", myFASTA)  # "^>" is a regular expression that means: ">" at the
-                      # beginning ("^") of the line.
-
-(runs <- rle(grepl("^>", myFASTA)))
-
-# Translating that into start positions of blocks takes a bit of bookkeeping:
-# the first start has index 1, the following starts can be calculated from
-# cumsum()'s and $length's.
-(starts <- c(1, (cumsum(runs$lengths)[-length(runs$lengths)] + 1)))
-
-# ... and with that, we can parse our FASTA data. We take the specification
-# above and translate it into code. That's how we develop code: write up step by
-# instructions as comments, then implement them one by one.
-
-# Here is an example
-FA <- c(">head1 part a", ">head1 part b", "abcdef", "ghi", # two headers
-        "",                                                # empty line
-        ">head2", "jkl",                                   # one header
-        ">head3", "mno", "pqrs")                           # two sequence lines
-
-# - drop all lines that don't begin with ">" or a letter, "-", or "*"
-FA <- FA[grepl("^[A-Za-z>*-]", FA)]
-
-# - identify consecutive lines that begin ">" and consecutive lines
-#     that do not begin ">"
-runs <- rle(grepl("^>", FA))
-starts <- c(1, (cumsum(runs$lengths)[-length(runs$lengths)] + 1))
-
-# - collapse each set of consecutive lines in-place
-
-for (i in seq_along(starts)) {
-  FA[starts[i]] <- paste(FA[starts[i]:(starts[i] + runs$lengths[i] - 1)],
-                         sep ="",
-                         collapse = "")
+#   (Case 3.1) all sequence lines contain only valid characters
+#              (constants for valid characters AAVALID, NUCVALID, and NUCAMBIG
+#               are defined with the .utilities.R script)
+AAVALID                               # show the set of valid characters
+fX <- c(">ABC",
+        "def ;-) ghi",
+        "klmnpq")
+myRegex <- sprintf("[^%s]", AAVALID)  # matches any character NOT in AAVALID
+sel <- ! grepl("^>", fX)              # TRUE for lines that are NOT headers
+if (any(grepl(myRegex, fX[sel]))) {
+  stop("invalid character(s) outside of header lines.")
 }

-# - drop all remaining lines.
-FA <- FA[starts]
+#   (Case 3.2) all headers are followed directly by
+#              at least one letter of sequence
+fX <- c(">ABC",
+        "",
+        ">123",
+        "defghi",
+        "klmnpq")
+sel <- grep("^>", fX) + 1             # indexes of headers + 1
+myRegex <- sprintf("[%s]+", AAVALID)  # at least one valid character
+if (! all(grepl(myRegex, fX[sel]))) {
+  stop("a header has no adjacent sequence.")
+}
+# Ah, you might ask - couldn't we just have dropped all empty lines, and
+# then caught this in Case 2? No - for two reasons: we would still miss headers
+# at the end of file, and, we would have changed the line numbering - and
+# ideally our "production" function will create information about where the
+# error is to be found.

-# In this resulting vector the odd-indexed elements
-#     are headers, and the even-indexed elements are sequences.

-# As a function:
+# Now combine this into a function ...

-readFASTA <- function(IN) {
-  # Read a FASTA formatted file from IN, remove all non-header, non-sequence
-  # element, return collapsed sequences.
-  # Parameters:
-  #    IN  chr   Input file name (or connection)
-  # Value:
-  #    chr vector  in which the odd-indexed elements are headers, and the
-  #                even-indexed elements are sequences.
+val <- function(fa) {
+  # Validate a character vector of FASTA lines; stop() at the first violation.
+  # Parameters:
+  #    fa  chr   vector of lines, e.g. as returned by readLines()
+  # Value:
+  #    NULL (invisible). Invoked for its side effect of raising an error.
+  # Note: the set of valid sequence characters is taken from the constant
+  #       AAVALID, which must be defined before val() is called.

-  FA <- readLines(IN)
-  FA <- FA[grepl("^[A-Za-z>*-]", FA)]
-
-  runs <- rle(grepl("^>", FA))
-  starts <- c(1, (cumsum(runs$lengths)[-length(runs$lengths)] + 1))
-
-  for (i in seq_along(starts)) { # collapse runs in-place
-    FA[starts[i]] <- paste(FA[starts[i]:(starts[i] + runs$lengths[i] - 1)],
-                           sep ="",
-                           collapse = "")
+  isHead <- grepl("^>", fa)   # TRUE for header lines; computed only once
+
+  if ( ! any(isHead) ) {
+    stop("no header lines in input.")
   }

-  # return collapsed lines
- return(FA[starts])
+  # a header directly followed by another header (compare shifted vectors)
+  if ( any(isHead[- length(isHead)] & isHead[-1])) {
+    stop("adjacent header lines in input.")
+  }
+
+  # any character that is not in AAVALID on a non-header line
+  if ( any(grepl(sprintf("[^%s]", AAVALID), fa[ ! isHead]))) {
+    stop("invalid character(s) outside of header lines.")
+  }
+
+  # The line after each header must hold at least one valid character. A
+  # header on the last line indexes past the end (NA), which does not match.
+  if (! all(grepl(sprintf("[%s]+", AAVALID), fa[which(isHead) + 1]))) {
+    stop("a header has no adjacent sequence.")
+  }
+
+  return(invisible(NULL))
 }

-# Try this: Let's try to use only the first 3 elements of myFASTA ... it's a
-# lengthy sequence. But how? We don't have a file with that contents and the
-# function expects to read from a file. Do we need to write myFASTA[1:3] to a
-# temporary file and then read it? We could - but wherever a file is expected we
-# can also pass in a "text connection" from an object in memory, with the
-# textConnection() function, like so:
+# Here is an example
+FA <- c(">head1",
+        "acdef",
+        "ghi",
+        "",
+        ">head2",
+        "kl",
+        ">head3",
+        "mn",
+        "pqrs")
+val(FA)          # ... should not create an error

-readFASTA(textConnection(myFASTA[1:3]))

-# Here is a "real" example - a multi FASTA file of aligned APSES domain
-# sequences:
+# a somewhat more elaborate validateFA() function was loaded with the
+# ./.utilities.R script

-(refAPSES <- readFASTA("./data/refAPSES.mfa"))
+# =    2  Parsing FASTA  =======================================================

-# Subset all headers:
-refAPSES[seq(1, length(refAPSES), by = 2)]
+# Once we have validated our assumptions about our input, it's quite
+# painless to parse it. I have put this together as a function and the function
+# gets loaded from ./.utilities.R
+#
+
+# Let's try this:
+#   - the first 3 elements of faMBP1:
+readFASTA(faMBP1[1:3])
+
+#   - a multi FASTA file of aligned APSES domain sequences:
+
+refAPSES <- readFASTA("./data/refAPSES.mfa")

 # Subset the sequence with "P39678" in the header
-refAPSES[grep("P39678", refAPSES) + 1]  # grep() the string and add 1
+refAPSES[grep("P39678", refAPSES$head) ,]



-# =    2  Interpreting FASTA  ==================================================
+# =    3  Interpreting FASTA  ==================================================


 # FASTA files are straightforward to interpret - just one thing may be of note:
@@ -243,22 +253,28 @@ refAPSES[grep("P39678", refAPSES) + 1]  # grep() the string and add 1

 # Example: How many positive charged residues in "MBP1_SACCE"?

-s <- unlist(strsplit(refAPSES[grep("MBP1_SACCE", refAPSES) + 1], ""))
-head(s)
+s <- unlist(strsplit(refAPSES$seq[grep("MBP1_SACCE", refAPSES$head)], ""))
+s
+
 sum(grepl("[HKR]", s)) # 20 (+) charged residues. grepl() returns TRUE and FALSE
                       # for the characters, sum() coerces to 1 and 0
                       # respectively, and that gives us the result.

 100 * sum(grepl("[HKR]", s)) / length(s) # in percent: 20.2 %

+# residue distribution
+x <- factor(s, levels = names(AACOLS))
+pie(table(x)[names(AACOLS)], col = AACOLS)

-# =    3  Writing FASTA  =======================================================
+
+
+# =    4  Writing FASTA  =======================================================


 # Writing FASTA files is mostly just the reverse of reading, with one
 # twist: we need to break the long sequence string into chunks of the desired
 # width. The FASTA specification calls for a maximum of 120 characters per line,
-# but writing out much less than that is common since it allows to comfortably
+# but writing out much less than that is common, since it allows to comfortably
 # view lines on the console, or printing them on a sheet of paper (do we still
 # do that actually?). How do we break a string into chunks? A combination of
 # seq(<from>, <to>, <by>) with substring(<string>, <start>, <stop>) will work
@@ -268,7 +284,7 @@ sum(grepl("[HKR]", s)) # 20 (+) charged residues. grepl() returns TRUE and FALSE
 # be slow - in that case, we might want to precalculate the size of the output
 # object. But that's more of a hypothetical consideration.

-s <- refAPSES[2]
+( s <- refAPSES$seq[2] )
 nchar(s)
 w <- 30     # width of chunk
 (starts <- seq(1, nchar(s), by = w))      # starting index of chunk
@@ -278,37 +294,24 @@ w <- 30     # width of chunk
 #       What happens if nchar(s) is an exact multiple of w?

 substring(s, starts, ends)
+# confirm that the output contains the first and last residue, and both
+# residues adjacent to the breaks

-# Here's the function ...
+# As always, the function has been defined in ".utilities.R" for you to use
+# any time...  type   writeFASTA  to examine it.

-writeFASTA <- function(s, OUT = stdout(), width = 60) {
-  # Write an object "s" that contains one or more header/sequence pairs to file.
-  # Parameters:
-  #    s      chr   Vector with a FASTA header string in odd elements,
-  #                    sequence in one-letter code in even elements.
-  #    OUT    chr   connection to be written to; defaults to stdout() i.e.
-  #                 output is written console.
-  #    width  int   max number of sequence characters per line of output.
-  # Value:
-  #           NA    Invoked for side effect of writing data to file
+# Let's try this...

-  txt <- character()
-  idx <- seq(1, length(s), by = 2)
-  for (i in idx) {
-    txt <- c(txt, s[i])                              # add header line to txt
-    starts <- seq(1, nchar(s[i + 1]), by = width)    # starting indices of chunks
-    ends <- c((starts - 1)[-1], nchar(s[i + 1]))     # ending indices of chunks
-    txt <- c(txt, substring(s[i + 1], starts, ends)) # add chunks to txt
-  }
-  writeLines(txt, OUT)
+writeFASTA(refAPSES, width = 40)

-}
-
-# Let's try this. If we don't specify OUT, the result is written to the console
-# by default. Default width for sequence is 60 characters
-
-writeFASTA(refAPSES)
+# roundtrip for validation: write refAPSES with a different format,
+# read it back in - the new dataframe must be identical
+# to the original dataframe.
+fname <- tempfile()
+writeFASTA(refAPSES, fn = fname, width = 30)
+identical(refAPSES, readFASTA(fname))

+# ...works for me  :-)


 # [END]