Maintenace, and add a Fibonacci-sequence example

2020-09-23 23:21:46 +10:00 · 2020-09-23 23:21:46 +10:00 · 4c793a6074
commit 4c793a6074
parent 76b18a8a35
1 changed files with 38 additions and 32 deletions
--- a/BIN-Sequence.R
+++ b/BIN-Sequence.R
@ -1,20 +1,15 @@
 # tocID <- "BIN-Sequence.R"
 #
 # ---------------------------------------------------------------------------- #
 #  PATIENCE  ...                                                               #
 #    Do not yet work wih this code. Updates in progress. Thank you.            #
 #    boris.steipe@utoronto.ca                                                  #
 # ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-Sequence unit.
 #
-# Version:  1.4
+# Version:  1.5
 #
-# Date:     2017  09  - 2019  01
+# Date:     2017-09  - 2020-09
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
 #           1.5    2020 Updates
 #           1.4    Change from require() to requireNamespace(),
 #                      use <package>::<function>() idiom throughout,
 #                      use Biocmanager:: not biocLite()
@ -60,12 +55,6 @@
 #TOC> ==========================================================================
 #
 #
 #
 #
 # =    1  Prepare  =============================================================
 # Much basic sequence handling is supported by the Bioconductor package
@ -116,7 +105,7 @@ as.character(a)
 length(s) # why ???
-nchar(s)  # aha
+nchar(s)  # Aha!
 # =    4  Substrings  ==========================================================
@ -134,10 +123,10 @@ substr(   myBiCodes, 1, 3)
 substring(myBiCodes, 1, 3)
 # ... however only substring() will also use vectors for start and stop
-s <- "gatattgtgatgacccagtaa"     # a DNA sequence
+s <- "gatattgtgatgacccagtaa"       # a DNA sequence
-(i <- seq(1, nchar(s), by = 3))  # an index vector
+(vI <- seq(1, nchar(s), by = 3))   # an index vector
-substr(   s, i, i+2)             # ... returns only the first nucleotide triplet
+substr(   s, vI, vI+2)             # ... returns only the first nucleotide triplet
-substring(s, i, i+2)             # ... returns all triplets
+substring(s, vI, vI+2)             # ... returns all triplets
 # =    5  Creating strings: sprintf()  =========================================
@ -183,12 +172,22 @@ toupper(tolower(s))
 # ===   6.1.2  Reverse
-reverse(s)
+# (This used to work in Biostrings, apparently it doesn't work anymore. Why?)
 # Biostrings::str_rev(s)
 # The following works, of course, but awkward:
 s
 paste0(rev(unlist(strsplit(s, ""))), collapse = "")
 # reverse complement
 COMP <- c("t", "g", "c", "a")
 names(COMP) <- c("a", "c", "g", "t")     # mapping the complement via names
 s
 paste0(COMP[rev(unlist(strsplit(s, "")))], collapse = "")
 # ===   6.1.3  Change characters
 # chartr(old, new, x) maps all characters in x that appear in "old" to the
-# correpsonding character in "new."
+# correpsonding character in "new." Kind of like the COMP vector above ...
 chartr("aeio", "uuuu", "We hold these truths to be self-evident ...")
@ -200,25 +199,32 @@ chartr(paste0(letters, collapse = ""),
 # One amusing way to use the function  is for a reversible substitution
 # cypher.
 alBet <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .,;:?0123456789"
 set.seed(112358)                       # set RNG seed for repeatable randomness
-(myCypher <- paste0(sample(letters), collapse = ""))
+( myCypher <- paste0(sample(unlist(strsplit(alBet, ""))), collapse = "") )
 set.seed(NULL)                         # reset the RNG
 (lett <- paste0(letters, collapse = ""))
 # encode ...
-(x <- chartr(lett, myCypher, "... seven for a secret, never to be told."))
+(x <- chartr(alBet, myCypher, "... seven for a secret, never to be told."))
 # decode ...
-chartr(myCypher, lett, x)
+chartr(myCypher, alBet, x)
 # (Nb. substitution cyphers are easy to crack!)
 # ===   6.1.4  Substitute characters
-(s <- gsub("IV", "i-v", s))  # gsub can change length, first argument is
+# gsub can change lengths.
-                             # a "regular expression"!
+#   Example: implementing the binary Fibonacci sequence:
 #   0 -> 1; 1 -> 10 , in three nested gsub() statements
 ( s <- 1 )
 ( s <- gsub("2", "10", gsub("0", "1", gsub("1", "2", s))) )
-# I use it often to delete characters I don't want ...
+# Iterate this line a few times ...
 #
 # cf. http://www.maths.surrey.ac.uk/hosted-sites/R.Knott/Fibonacci/fibrab.html
 # for the features of the sequence.
 # I use gsub() often to delete unwanted characters ...
 # ... select something, and substitute the empty string for it.
 (s <- gsub("-", "", s))
@ -249,9 +255,9 @@ MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")
 # In our learning units, we use a function dbSanitizeSequence() to clean up
 # sequences that may be copy/pasted from Web-sources
-s <- ">FASTA header will be removed
+cat( s <- ">FASTA header will be removed
 10         20         30         40         50
-MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR "
+MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR " )
 dbSanitizeSequence(s)
@ -341,7 +347,7 @@ if (! requireNamespace("stringi", quietly = TRUE)) {
 #  data(package = "stringi")     # available datasets
-(x <- stri::stri_match_all(mySeq, regex = "CG"))
+(x <- stringi::stri_match_all(mySeq, regex = "CG"))
 length(unlist(x))
 # Now you could compare that number with yeast DNA sequences, and determine