From 4c793a6074d3b10b3d5ffa40f2cc52b1754b8514 Mon Sep 17 00:00:00 2001
From: hyginn <boris.steipe@utoronto.ca>
Date: Wed, 23 Sep 2020 23:21:46 +1000
Subject: [PATCH] Maintenance, and add a Fibonacci-sequence example

---
 BIN-Sequence.R | 70 +++++++++++++++++++++++++++-----------------------
 1 file changed, 38 insertions(+), 32 deletions(-)
diff --git a/BIN-Sequence.R b/BIN-Sequence.R
index 00315f3..4ce3e51 100644
--- a/BIN-Sequence.R
+++ b/BIN-Sequence.R
@@ -1,20 +1,15 @@
 # tocID <- "BIN-Sequence.R"
 #
-# ---------------------------------------------------------------------------- #
-#  PATIENCE  ...                                                               #
-#    Do not yet work wih this code. Updates in progress. Thank you.            #
-#    boris.steipe@utoronto.ca                                                  #
-# ---------------------------------------------------------------------------- #
-#
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-Sequence unit.
 #
-# Version:  1.4
+# Version:  1.5
 #
-# Date:     2017  09  - 2019  01
+# Date:     2017-09  - 2020-09
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
+#           1.5    2020 Maintenance; add a Fibonacci-sequence example
 #           1.4    Change from require() to requireNamespace(),
 #                      use <package>::<function>() idiom throughout,
 #                      use Biocmanager:: not biocLite()
@@ -60,12 +55,6 @@
 #TOC> ==========================================================================
 
 
-#
-#
-#
-#
-
-
 # =    1  Prepare  =============================================================
 
 # Much basic sequence handling is supported by the Bioconductor package
@@ -116,7 +105,7 @@ as.character(a)
 
 
 length(s) # why ???
-nchar(s)  # aha
+nchar(s)  # Aha!
 
 
 # =    4  Substrings  ==========================================================
@@ -134,10 +123,10 @@ substr(   myBiCodes, 1, 3)
 substring(myBiCodes, 1, 3)
 
 # ... however only substring() will also use vectors for start and stop
-s <- "gatattgtgatgacccagtaa"     # a DNA sequence
-(i <- seq(1, nchar(s), by = 3))  # an index vector
-substr(   s, i, i+2)             # ... returns only the first nucleotide triplet
-substring(s, i, i+2)             # ... returns all triplets
+s <- "gatattgtgatgacccagtaa"       # a DNA sequence
+(vI <- seq(1, nchar(s), by = 3))   # an index vector
+substr(   s, vI, vI+2)             # ... returns only the first nucleotide triplet
+substring(s, vI, vI+2)             # ... returns all triplets
 
 
 # =    5  Creating strings: sprintf()  =========================================
@@ -183,12 +172,22 @@ toupper(tolower(s))
 
 
 # ===   6.1.2  Reverse
-reverse(s)
+# (This used to work in Biostrings, apparently it doesn't work anymore. Why?)
+# Biostrings::str_rev(s)
+# The following works, of course, but it is awkward:
+s
+paste0(rev(unlist(strsplit(s, ""))), collapse = "")
+
+# reverse complement
+COMP <- c("t", "g", "c", "a")
+names(COMP) <- c("a", "c", "g", "t")     # mapping the complement via names
+s
+paste0(COMP[rev(unlist(strsplit(s, "")))], collapse = "")
 
 
 # ===   6.1.3  Change characters
 # chartr(old, new, x) maps all characters in x that appear in "old" to the
-# correpsonding character in "new."
+# corresponding character in "new." Kind of like the COMP vector above ...
 
 chartr("aeio", "uuuu", "We hold these truths to be self-evident ...")
 
@@ -200,25 +199,32 @@ chartr(paste0(letters, collapse = ""),
 
 # One amusing way to use the function  is for a reversible substitution
 # cypher.
+alBet <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .,;:?0123456789"
 set.seed(112358)                       # set RNG seed for repeatable randomness
-(myCypher <- paste0(sample(letters), collapse = ""))
+( myCypher <- paste0(sample(unlist(strsplit(alBet, ""))), collapse = "") )
 set.seed(NULL)                         # reset the RNG
 
-(lett <- paste0(letters, collapse = ""))
-
 # encode ...
-(x <- chartr(lett, myCypher, "... seven for a secret, never to be told."))
+(x <- chartr(alBet, myCypher, "... seven for a secret, never to be told."))
 
 # decode ...
-chartr(myCypher, lett, x)
+chartr(myCypher, alBet, x)
 # (Nb. substitution cyphers are easy to crack!)
 
 
 # ===   6.1.4  Substitute characters
-(s <- gsub("IV", "i-v", s))  # gsub can change length, first argument is
-                             # a "regular expression"!
+# gsub can change lengths.
+#   Example: implementing the binary Fibonacci sequence:
+#   0 -> 1; 1 -> 10, in three nested gsub() calls
+( s <- 1 )
+( s <- gsub("2", "10", gsub("0", "1", gsub("1", "2", s))) )
 
-# I use it often to delete characters I don't want ...
+# Iterate this line a few times ...
+#
+# cf. http://www.maths.surrey.ac.uk/hosted-sites/R.Knott/Fibonacci/fibrab.html
+# for the features of the sequence.
+
+# I use gsub() often to delete unwanted characters ...
 # ... select something, and substitute the empty string for it.
 (s <- gsub("-", "", s))
 
@@ -249,9 +255,9 @@ MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")
 # In our learning units, we use a function dbSanitizeSequence() to clean up
 # sequences that may be copy/pasted from Web-sources
 
-s <- ">FASTA header will be removed
+cat( s <- ">FASTA header will be removed
 10         20         30         40         50
-MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR "
+MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR " )
 
 dbSanitizeSequence(s)
 
@@ -341,7 +347,7 @@ if (! requireNamespace("stringi", quietly = TRUE)) {
 #  data(package = "stringi")     # available datasets
 
 
-(x <- stri::stri_match_all(mySeq, regex = "CG"))
+(x <- stringi::stri_match_all(mySeq, regex = "CG"))
 length(unlist(x))
 
 # Now you could compare that number with yeast DNA sequences, and determine