Major refactoring to simplify logic and clean code

2020-09-24 18:45:22 +10:00
parent f48b6bf3b7
commit 528dc91407
1 changed files with 177 additions and 174 deletions
--- a/RPR-FASTA.R
+++ b/RPR-FASTA.R
@@ -1,20 +1,17 @@
 # tocID <- "RPR-FASTA.R"
 #
 # ---------------------------------------------------------------------------- #
 #  PATIENCE  ...                                                               #
 #    Do not yet work with this code. Updates in progress. Thank you.           #
 #    boris.steipe@utoronto.ca                                                  #
 # ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the RPR-FASTA unit.
 #
-# Version:  1.0
+# Version:  1.1
 #
-# Date:     2017  10  14
+# Date:     2017-10  -  2020-09
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
 #           1.1    2020 Maintenance. Rewrite validation logic. Add data
 #                  to utilities. Define AACOLS
 #           1.0    New unit.
 #
 #
@@ -30,55 +27,61 @@
 #
 # ==============================================================================
 #TOC> ==========================================================================
 #TOC> 
-#TOC>   Section  Title                 Line
+#TOC>   Section  Title                                 Line
-#TOC> -------------------------------------
+#TOC> -----------------------------------------------------
-#TOC>   1        Reading FASTA           39
+#TOC>   1        Reading and validating FASTA            45
-#TOC>   2        Interpreting FASTA     227
+#TOC>   1.1        Validating FASTA                      81
-#TOC>   3        Writing FASTA          248
+#TOC>   2        Parsing FASTA                          225
 #TOC>   3        Interpreting FASTA                     245
 #TOC>   4        Writing FASTA                          272
 #TOC> 
 #TOC> ==========================================================================
-
+# =    1  Reading and validating FASTA  ========================================
 # =    1  Reading FASTA  =======================================================
 # FASTA is a text based format, structured in lines that are separated by
 # line-feed or paragraph-break characters. Which one of these is used, depends
-# on your operating system. But Rs readLines() function knows how to handle
+# on your operating system. But R's readLines() function knows how to handle
 # these correctly, across platforms. Don't try to read such files "by hand".
 # Here is the yeast Mbp1 gene, via SGD.
 file.show("./data/S288C_YDL056W_MBP1_coding.fsa")
-myFASTA <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
+faMBP1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
-# The warning is generated because the programmer who implemented the code to
+# The warning is generated because the programmer at the NCBI who implemented
-# write this FASTA file neglected to place a line-break character after the last
+# the code to write this FASTA file neglected to place a line-break character
-# sequence character.
+# after the last sequence character. While this is not technically incorrect,
 # it is poor practice.
-head(myFASTA)
+head(faMBP1)
 # Note that there are NO line-break characters ("\n") at the end of these
-# strings, readLines() has "consumed" them while reading.
+# strings, even though they were present in the original file. readLines()
 # has "consumed" these characters while reading - but every single line is in
 # a vector element of its own.
-tail(myFASTA)
+tail(faMBP1)
 # Also note that the last line has fewer characters - this means readLines()
-# imported the whole line, despite it not being terminated.
+# imported the whole line, despite it not being terminated by "\n".
 # It's very straightforward to work with such data, for example by collapsing
-# everything after the first line into a single string ...
+# everything except the first line into a single string ...
-f <- c(myFASTA[1], paste(myFASTA[-1], sep = "", collapse = ""))
+f <- c(faMBP1[1], paste(faMBP1[-1], sep = "", collapse = ""))
 f[1]
 nchar(f[2])
-# ... but this is making assumptions that everything in line 2 until the end IS
+# ==   1.1  Validating FASTA  ==================================================
-# sequence, the whole sequence and nothing but sequence. That assumption can
+
-# break down in many ways:
+# The code above is making the assumption that everything from line 2 until
 #  the end IS sequence, the whole sequence and nothing but sequence.
 #  That assumption can break down in many ways:
 #
 #  - there could be more than one header line. The specification says otherwise,
 #       but some older files use multiple, consecutive header lines. You don't
@@ -95,143 +98,150 @@ nchar(f[2])
 #
 # Data "from the wild" can (and usually does) have the most unexpected
 # variations and it is really, really important to be clear about the
-# assumptions that you are making. Here is the structure of a FASTA file,
+# assumptions that you are making. It is possible to "fix" things, according
-# specified with as few assumptions as possible.
+# to the "Robustness Principle" :
 #      "Be conservative in what you send,
 #       be liberal in what you accept".
 #       (cf. https://en.wikipedia.org/wiki/Robustness_principle )
 # ... but if you think about this, that's actually a really poor idea,
 # which is much more likely to dilute standards, make unwarranted
 # assumptions, and allow errors to pass silently and corrupt data.
 #
-#  (1) it contains characters;
+# Let's discard this principle on the trash-heap of
-#  (2) there might be lines that begin with characters other than
+# things-that-sound-like-a-good-idea-but-aren't. What we do instead is test,
-#         ">", these should be discarded;
+# identify problems, and follow the principle: "crash early, crash often". Of
-#  (3) it contains one or more consecutive lines that are sequence blocks;
+# course I can write code that would reformat any possible input as a FASTA
-#  (4) each sequence block has one or more header lines;
+# file - but what good will it do me if it parses the file I receive
-#  (5) header lines start with ">";
+# from a server into FASTA format like:
 #  (6) no actual sequence data begins with a ">";
 #  (7) header lines can contain any character;
 #  (8) sequence lines only contain letters, "-" (gap characters), or "*" (stop).
 #
-# This suggests to parse as follows:
+#   >404- Page Not Found</title</head>
-# - drop all lines that don't begin with ">" or a letter
+#   dyh-PagentfndhpThepageyreqesteddesnteistnthisserverCheckthe
-# - identify consecutive lines that begin ">" and consecutive lines
+#   spellingrcntacttheadministratrsdyhtml
-#     that do not begin ">"
+#
-# - collapse each set of consecutive lines in-place
+# Therefore, we write ourselves a FASTA checker that will enforce the following:
-# - drop all remaining lines. In this result the odd-indexed elements
+#   (1) a FASTA file contains one or more sequences separated by zero or
-#     are headers, and the even-indexed elements are sequences.
+#       more empty lines
 #   (2) a sequence contains one header line followed by
 #       one or more sequence lines
 #   (3) a sequence line contains one or more uppercase or lowercase single
 #       letter amino acid codes, hyphens (gap character), or * (stop).
 #
 #   Anything else should generate an error.
-# Let's code this as a function. We need some tool that identifies consecutive
+#   (Case 1): Header(s) exist
-# lines of something. The rle() (run-length encoding) function does this. It
+fX <- c("ABC",
-# returns a vector of the length of "runs" in its input:
+        "defghi",
        "klmnpq")
 sel <- grepl("^>", fX)  # "^>" is a regular expression that
                        # means: the exact character ">" at the
                        # beginning ("^") of the line.
 if ( ! any(sel) ) { stop("no header lines in input.") }
 myPets <- c("ant", "bat", "bat", "bat", "cat", "ant", "ant")
 (runs <- rle(myPets))
-# The cumsum() (cumulative sum) function turns these numbers into indices
+#   (Case 2) No adjacent header lines
-# on our original vector.
+fX <- c(">ABC",
        ">123",
        "defghi",
        "klmnpq")
 sel <- grepl("^>", fX)
 sel <- sel[- length(sel)] & sel[-1] # comparing shifted vectors
 if ( any(sel)) { stop("adjacent header lines in input.") }
-(idx <- cumsum(runs$lengths))
+#   (Case 3.1) all sequence lines contain only valid characters
-myPets[idx]   # note that this is NOT unique ... "ant" appears twice, because
+#              (constants for valid characters AAVALID, NUCVALID, and NUCAMBIG
-              # there were two separate runs of ants in our input.
+#               are defined with the .utilities.R script)
-
+AAVALID
-# So far so good. But our FASTA file's lines are ALL different, so all the runs
+fX <- c(">ABC",
-# will only have length 1 ...
+        "def ;-) ghi",
-
+        "klmnpq")
-rle(myFASTA)$lengths
+myRegex <- sprintf("[^%s]", AAVALID)  # NOT a valid character
-
+sel <- ! grepl("^>", fX)              # NOT headers
-# How do we deal with that? Obviously we need to actually analyze the strings we
+if (any(grepl(myRegex, fX[sel]))) {
-# are working with. grepl(<pattern>, <x>) is exactly what we need here. It
+  stop("invalid chracter(s) outside of header lines.")
 # produces a vector of booleans, of the same length as the input vector <x>,
 # which is TRUE if the element matches the <pattern>, FALSE if not.
 grepl("^>", myFASTA)  # "^>" is a regular expression that means: ">" at the
                      # beginning ("^") of the line.
 (runs <- rle(grepl("^>", myFASTA)))
 # Translating that into start positions of blocks takes a bit of bookkeeping:
 # the first start has index 1, the following starts can be calculated from
 # cumsum()'s and $length's.
 (starts <- c(1, (cumsum(runs$lengths)[-length(runs$lengths)] + 1)))
 # ... and with that, we can parse our FASTA data. We take the specification
 # above and translate it into code. That's how we develop code: write up
 # step-by-step instructions as comments, then implement them one by one.
 # Here is an example
 FA <- c(">head1 part a", ">head1 part b", "abcdef", "ghi", # two headers
        "",                                                # empty line
        ">head2", "jkl",                                   # one header
        ">head3", "mno", "pqrs")                           # two sequence lines
 # - drop all lines that don't begin with ">" or a letter, "-", or "*"
 FA <- FA[grepl("^[A-Za-z>*-]", FA)]
 # - identify consecutive lines that begin ">" and consecutive lines
 #     that do not begin ">"
 runs <- rle(grepl("^>", FA))
 starts <- c(1, (cumsum(runs$lengths)[-length(runs$lengths)] + 1))
 # - collapse each set of consecutive lines in-place
 for (i in seq_along(starts)) {
  FA[starts[i]] <- paste(FA[starts[i]:(starts[i] + runs$lengths[i] - 1)],
                         sep ="",
                         collapse = "")
 }
-# - drop all remaining lines.
+#   (Case 3.2) all headers are followed directly by
-FA <- FA[starts]
+#              at least one letter of sequence
 fX <- c(">ABC",
        "",
        ">123",
        "defghi",
        "klmnpq")
 sel <- grep("^>", fX) + 1             # indexes of headers + 1
 myRegex <- sprintf("[%s]+", AAVALID)  # at least one valid character
 if (! all(grepl(myRegex, fX[sel]))) {
  stop("a header has no adjacent sequence.")
 }
 # Ah, you might ask - couldn't we just have dropped all empty lines, and
 # then caught this in Case 2? No - for two reasons: we would still miss headers
 # at the end of file, and, we would have changed the line numbering - and
 # ideally our "production" function will create information about where the
 # error is to be found.
 # In this resulting vector the odd-indexed elements
 #     are headers, and the even-indexed elements are sequences.
-# As a function:
+# Now combine this into a function ...
-readFASTA <- function(IN) {
+val <- function(fa) {
  # Read a FASTA formatted file from IN, remove all non-header, non-sequence
  # element, return collapsed sequences.
  # Parameters:
  #    IN  chr   Input file name (or connection)
  # Value:
  #    chr vector  in which the odd-indexed elements are headers, and the
  #                even-indexed elements are sequences.
-  FA <- readLines(IN)
+  if ( ! any(grepl("^>", fa)) ) {
-  FA <- FA[grepl("^[A-Za-z>*-]", FA)]
+    stop("no header lines in input.")
  runs <- rle(grepl("^>", FA))
  starts <- c(1, (cumsum(runs$lengths)[-length(runs$lengths)] + 1))
  for (i in seq_along(starts)) { # collapse runs in-place
    FA[starts[i]] <- paste(FA[starts[i]:(starts[i] + runs$lengths[i] - 1)],
                           sep ="",
                           collapse = "")
  }
-  # return collapsed lines
+  sel <- grepl("^>", fa)
- return(FA[starts])
+  if ( any(sel[- length(sel)] & sel[-1])) {
    stop("adjacent header lines in input.")
  }
  sel <- ! grepl("^>", fa)
  if ( any(grepl(sprintf("[^%s]", AAVALID), fa[sel]))) {
    stop("invalid chracter(s) outside of header lines.")
  }
  sel <- grep("^>", fa) + 1
  if (! all(grepl(sprintf("[%s]+", AAVALID), fa[sel]))) {
    stop("a header has no adjacent sequence.")
  }
  return(invisible(NULL))
 }
-# Try this: Let's try to use only the first 3 elements of myFASTA ... it's a
+# Here is an example
-# lengthy sequence. But how? We don't have a file with that contents and the
+FA <- c(">head1",
-# function expects to read from a file. Do we need to write myFASTA[1:3] to a
+        "acdef",
-# temporary file and then read it? We could - but wherever a file is expected we
+        "ghi",
-# can also pass in a "text connection" from an object in memory, with the
+        "",
-# textConnection() function, like so:
+        ">head2",
        "kl",
        ">head3",
        "mn",
        "pqrs")
 validate(FA)     # ... should not create an error
 readFASTA(textConnection(myFASTA[1:3]))
-# Here is a "real" example - a multi FASTA file of aligned APSES domain
+# a somewhat more elaborate validateFA() function was loaded with the
-# sequences:
+# ./.utilities.R script
-(refAPSES <- readFASTA("./data/refAPSES.mfa"))
+# =    2  Parsing FASTA  =======================================================
-# Subset all headers:
+# Once we have validated our assumptions about our input, it's quite
-refAPSES[seq(1, length(refAPSES), by = 2)]
+# painless to parse it. I have put this together as a function and the function
 # gets loaded from ./.utilities.R
 #
 # Let's try this:
 #   - the first 3 elements of faMBP1:
 readFASTA(faMBP1[1:3])
 #   - a multi FASTA file of aligned APSES domain sequences:
 refAPSES <- readFASTA("./data/refAPSES.mfa")
 # Subset the sequence with "P39678" in the header
-refAPSES[grep("P39678", refAPSES) + 1]  # grep() the string and add 1
+refAPSES[grep("P39678", refAPSES$head) ,]
-# =    2  Interpreting FASTA  ==================================================
+# =    3  Interpreting FASTA  ==================================================
 # FASTA files are straightforward to interpret - just one thing may be of note:
@@ -243,22 +253,28 @@ refAPSES[grep("P39678", refAPSES) + 1]  # grep() the string and add 1
 # Example: How many positive charged residues in "MBP1_SACCE"?
-s <- unlist(strsplit(refAPSES[grep("MBP1_SACCE", refAPSES) + 1], ""))
+s <- unlist(strsplit(refAPSES$seq[grep("MBP1_SACCE", refAPSES$head)], ""))
-head(s)
+s
 sum(grepl("[HKR]", s)) # 20 (+) charged residues. grepl() returns TRUE and FALSE
                       # for the characters, sum() coerces to 1 and 0
                       # respectively, and that gives us the result.
 100 * sum(grepl("[HKR]", s)) / length(s) # in percent: 20.2 %
 # residue distribution
 x <- factor(s, levels = names(AACOLS))
 pie(table(x)[names(AACOLS)], col = AACOLS)
-# =    3  Writing FASTA  =======================================================
+
 # =    4  Writing FASTA  =======================================================
 # Writing FASTA files is mostly just the reverse of reading, with one
 # twist: we need to break the long sequence string into chunks of the desired
 # width. The FASTA specification calls for a maximum of 120 characters per line,
-# but writing out much less than that is common since it allows to comfortably
+# but writing out much less than that is common, since it lets us comfortably
 # view lines on the console, or print them on a sheet of paper (do we still
 # do that actually?). How do we break a string into chunks? A combination of
 # seq(<from>, <to>, <by>) with substring(<string>, <start>, <stop>) will work
@@ -268,7 +284,7 @@ sum(grepl("[HKR]", s)) # 20 (+) charged residues. grepl() returns TRUE and FALSE
 # be slow - in that case, we might want to precalculate the size of the output
 # object. But that's more of a hypothetical consideration.
-s <- refAPSES[2]
+( s <- refAPSES$seq[2] )
 nchar(s)
 w <- 30     # width of chunk
 (starts <- seq(1, nchar(s), by = w))      # starting index of chunk
@@ -278,37 +294,24 @@ w <- 30     # width of chunk
 #       What happens if nchar(s) is an exact multiple of w?
 substring(s, starts, ends)
 # confirm that the output contains the first and last residue, and both
 # residues adjacent to the breaks
-# Here's the function ...
+# As always, the function has been defined in ".utilities.R" for you to use
 # any time...  type   writeFASTA  to examine it.
-writeFASTA <- function(s, OUT = stdout(), width = 60) {
+# Let's try this...
  # Write an object "s" that contains one or more header/sequence pairs to file.
  # Parameters:
  #    s      chr   Vector with a FASTA header string in odd elements,
  #                    sequence in one-letter code in even elements.
  #    OUT    chr   connection to be written to; defaults to stdout() i.e.
  #                 output is written console.
  #    width  int   max number of sequence characters per line of output.
  # Value:
  #           NA    Invoked for side effect of writing data to file
-  txt <- character()
+writeFASTA(refAPSES, width = 40)
  idx <- seq(1, length(s), by = 2)
  for (i in idx) {
    txt <- c(txt, s[i])                              # add header line to txt
    starts <- seq(1, nchar(s[i + 1]), by = width)    # starting indices of chunks
    ends <- c((starts - 1)[-1], nchar(s[i + 1]))     # ending indices of chunks
    txt <- c(txt, substring(s[i + 1], starts, ends)) # add chunks to txt
  }
  writeLines(txt, OUT)
-}
+# roundtrip for validation: write refAPSES with a different format,
-
+# read it back in - the new dataframe must be identical
-# Let's try this. If we don't specify OUT, the result is written to the console
+# to the original dataframe.
-# by default. Default width for sequence is 60 characters
+fname <- tempfile()
-
+writeFASTA(refAPSES, fn = fname, width = 30)
-writeFASTA(refAPSES)
+identical(refAPSES, readFASTA(fname))
 # ...works for me  :-)
 # [END]