From 4c793a6074d3b10b3d5ffa40f2cc52b1754b8514 Mon Sep 17 00:00:00 2001 From: hyginn Date: Wed, 23 Sep 2020 23:21:46 +1000 Subject: [PATCH] Maintenace, and add a Fibonacci-sequence example --- BIN-Sequence.R | 70 +++++++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/BIN-Sequence.R b/BIN-Sequence.R index 00315f3..4ce3e51 100644 --- a/BIN-Sequence.R +++ b/BIN-Sequence.R @@ -1,20 +1,15 @@ # tocID <- "BIN-Sequence.R" # -# ---------------------------------------------------------------------------- # -# PATIENCE ... # -# Do not yet work wih this code. Updates in progress. Thank you. # -# boris.steipe@utoronto.ca # -# ---------------------------------------------------------------------------- # -# # Purpose: A Bioinformatics Course: # R code accompanying the BIN-Sequence unit. # -# Version: 1.4 +# Version: 1.5 # -# Date: 2017 09 - 2019 01 +# Date: 2017-09 - 2020-09 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: +# 1.5 2020 Updates # 1.4 Change from require() to requireNamespace(), # use ::() idiom throughout, # use Biocmanager:: not biocLite() @@ -60,12 +55,6 @@ #TOC> ========================================================================== -# -# -# -# - - # = 1 Prepare ============================================================= # Much basic sequence handling is supported by the Bioconductor package @@ -116,7 +105,7 @@ as.character(a) length(s) # why ??? -nchar(s) # aha +nchar(s) # Aha! # = 4 Substrings ========================================================== @@ -134,10 +123,10 @@ substr( myBiCodes, 1, 3) substring(myBiCodes, 1, 3) # ... however only substring() will also use vectors for start and stop -s <- "gatattgtgatgacccagtaa" # a DNA sequence -(i <- seq(1, nchar(s), by = 3)) # an index vector -substr( s, i, i+2) # ... returns only the first nucleotide triplet -substring(s, i, i+2) # ... returns all triplets +s <- "gatattgtgatgacccagtaa" # a DNA sequence +(vI <- seq(1, nchar(s), by = 3)) # an index vector +substr( s, vI, vI+2) # ... returns only the first nucleotide triplet +substring(s, vI, vI+2) # ... returns all triplets # = 5 Creating strings: sprintf() ========================================= @@ -183,12 +172,22 @@ toupper(tolower(s)) # === 6.1.2 Reverse -reverse(s) +# (This used to work in Biostrings, apparently it doesn't work anymore. Why?) +# Biostrings::str_rev(s) +# The following works, of course, but awkward: +s +paste0(rev(unlist(strsplit(s, ""))), collapse = "") + +# reverse complement +COMP <- c("t", "g", "c", "a") +names(COMP) <- c("a", "c", "g", "t") # mapping the complement via names +s +paste0(COMP[rev(unlist(strsplit(s, "")))], collapse = "") # === 6.1.3 Change characters # chartr(old, new, x) maps all characters in x that appear in "old" to the -# correpsonding character in "new." +# correpsonding character in "new." Kind of like the COMP vector above ... chartr("aeio", "uuuu", "We hold these truths to be self-evident ...") @@ -200,25 +199,32 @@ chartr(paste0(letters, collapse = ""), # One amusing way to use the function is for a reversible substitution # cypher. +alBet <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .,;:?0123456789" set.seed(112358) # set RNG seed for repeatable randomness -(myCypher <- paste0(sample(letters), collapse = "")) +( myCypher <- paste0(sample(unlist(strsplit(alBet, ""))), collapse = "") ) set.seed(NULL) # reset the RNG -(lett <- paste0(letters, collapse = "")) - # encode ... -(x <- chartr(lett, myCypher, "... seven for a secret, never to be told.")) +(x <- chartr(alBet, myCypher, "... seven for a secret, never to be told.")) # decode ... -chartr(myCypher, lett, x) +chartr(myCypher, alBet, x) # (Nb. substitution cyphers are easy to crack!) # === 6.1.4 Substitute characters -(s <- gsub("IV", "i-v", s)) # gsub can change length, first argument is - # a "regular expression"! +# gsub can change lengths. +# Example: implementing the binary Fibonacci sequence: +# 0 -> 1; 1 -> 10 , in three nested gsub() statements +( s <- 1 ) +( s <- gsub("2", "10", gsub("0", "1", gsub("1", "2", s))) ) -# I use it often to delete characters I don't want ... +# Iterate this line a few times ... +# +# cf. http://www.maths.surrey.ac.uk/hosted-sites/R.Knott/Fibonacci/fibrab.html +# for the features of the sequence. + +# I use gsub() often to delete unwanted characters ... # ... select something, and substitute the empty string for it. (s <- gsub("-", "", s)) @@ -249,9 +255,9 @@ MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ") # In our learning units, we use a function dbSanitizeSequence() to clean up # sequences that may be copy/pasted from Web-sources -s <- ">FASTA header will be removed +cat( s <- ">FASTA header will be removed 10 20 30 40 50 -MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR " +MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR " ) dbSanitizeSequence(s) @@ -341,7 +347,7 @@ if (! requireNamespace("stringi", quietly = TRUE)) { # data(package = "stringi") # available datasets -(x <- stri::stri_match_all(mySeq, regex = "CG")) +(x <- stringi::stri_match_all(mySeq, regex = "CG")) length(unlist(x)) # Now you could compare that number with yeast DNA sequences, and determine