Maintenance, and add a Fibonacci-sequence example
This commit is contained in:
parent
76b18a8a35
commit
4c793a6074
@ -1,20 +1,15 @@
|
|||||||
# tocID <- "BIN-Sequence.R"
|
# tocID <- "BIN-Sequence.R"
|
||||||
#
|
#
|
||||||
# ---------------------------------------------------------------------------- #
|
|
||||||
# PATIENCE ... #
|
|
||||||
# Do not yet work with this code. Updates in progress. Thank you. #
|
|
||||||
# boris.steipe@utoronto.ca #
|
|
||||||
# ---------------------------------------------------------------------------- #
|
|
||||||
#
|
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-Sequence unit.
|
# R code accompanying the BIN-Sequence unit.
|
||||||
#
|
#
|
||||||
# Version: 1.4
|
# Version: 1.5
|
||||||
#
|
#
|
||||||
# Date: 2017 09 - 2019 01
|
# Date: 2017-09 - 2020-09
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.5 2020 Updates
|
||||||
# 1.4 Change from require() to requireNamespace(),
|
# 1.4 Change from require() to requireNamespace(),
|
||||||
# use <package>::<function>() idiom throughout,
|
# use <package>::<function>() idiom throughout,
|
||||||
# use BiocManager:: not biocLite()
|
# use BiocManager:: not biocLite()
|
||||||
@ -60,12 +55,6 @@
|
|||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
#
|
|
||||||
#
|
|
||||||
#
|
|
||||||
#
|
|
||||||
|
|
||||||
|
|
||||||
# = 1 Prepare =============================================================
|
# = 1 Prepare =============================================================
|
||||||
|
|
||||||
# Much basic sequence handling is supported by the Bioconductor package
|
# Much basic sequence handling is supported by the Bioconductor package
|
||||||
@ -116,7 +105,7 @@ as.character(a)
|
|||||||
|
|
||||||
|
|
||||||
length(s) # why ???
|
length(s) # why ???
|
||||||
nchar(s) # aha
|
nchar(s) # Aha!
|
||||||
|
|
||||||
|
|
||||||
# = 4 Substrings ==========================================================
|
# = 4 Substrings ==========================================================
|
||||||
@ -134,10 +123,10 @@ substr( myBiCodes, 1, 3)
|
|||||||
substring(myBiCodes, 1, 3)
|
substring(myBiCodes, 1, 3)
|
||||||
|
|
||||||
# ... however only substring() will also use vectors for start and stop
|
# ... however only substring() will also use vectors for start and stop
|
||||||
s <- "gatattgtgatgacccagtaa" # a DNA sequence
|
s <- "gatattgtgatgacccagtaa" # a DNA sequence
|
||||||
(i <- seq(1, nchar(s), by = 3)) # an index vector
|
(vI <- seq(1, nchar(s), by = 3)) # an index vector
|
||||||
substr( s, i, i+2) # ... returns only the first nucleotide triplet
|
substr( s, vI, vI+2) # ... returns only the first nucleotide triplet
|
||||||
substring(s, i, i+2) # ... returns all triplets
|
substring(s, vI, vI+2) # ... returns all triplets
|
||||||
|
|
||||||
|
|
||||||
# = 5 Creating strings: sprintf() =========================================
|
# = 5 Creating strings: sprintf() =========================================
|
||||||
@ -183,12 +172,22 @@ toupper(tolower(s))
|
|||||||
|
|
||||||
|
|
||||||
# === 6.1.2 Reverse
|
# === 6.1.2 Reverse
|
||||||
reverse(s)
|
# (This used to work in Biostrings, apparently it doesn't work anymore. Why?)
|
||||||
|
# Biostrings::str_rev(s)
|
||||||
|
# The following works, of course, but is awkward:
|
||||||
|
s
|
||||||
|
paste0(rev(unlist(strsplit(s, ""))), collapse = "")
|
||||||
|
|
||||||
|
# reverse complement
|
||||||
|
COMP <- c("t", "g", "c", "a")
|
||||||
|
names(COMP) <- c("a", "c", "g", "t") # mapping the complement via names
|
||||||
|
s
|
||||||
|
paste0(COMP[rev(unlist(strsplit(s, "")))], collapse = "")
|
||||||
|
|
||||||
|
|
||||||
# === 6.1.3 Change characters
|
# === 6.1.3 Change characters
|
||||||
# chartr(old, new, x) maps all characters in x that appear in "old" to the
|
# chartr(old, new, x) maps all characters in x that appear in "old" to the
|
||||||
# corresponding character in "new."
|
# corresponding character in "new." Kind of like the COMP vector above ...
|
||||||
|
|
||||||
chartr("aeio", "uuuu", "We hold these truths to be self-evident ...")
|
chartr("aeio", "uuuu", "We hold these truths to be self-evident ...")
|
||||||
|
|
||||||
@ -200,25 +199,32 @@ chartr(paste0(letters, collapse = ""),
|
|||||||
|
|
||||||
# One amusing way to use the function is for a reversible substitution
|
# One amusing way to use the function is for a reversible substitution
|
||||||
# cypher.
|
# cypher.
|
||||||
|
alBet <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .,;:?0123456789"
|
||||||
set.seed(112358) # set RNG seed for repeatable randomness
|
set.seed(112358) # set RNG seed for repeatable randomness
|
||||||
(myCypher <- paste0(sample(letters), collapse = ""))
|
( myCypher <- paste0(sample(unlist(strsplit(alBet, ""))), collapse = "") )
|
||||||
set.seed(NULL) # reset the RNG
|
set.seed(NULL) # reset the RNG
|
||||||
|
|
||||||
(lett <- paste0(letters, collapse = ""))
|
|
||||||
|
|
||||||
# encode ...
|
# encode ...
|
||||||
(x <- chartr(lett, myCypher, "... seven for a secret, never to be told."))
|
(x <- chartr(alBet, myCypher, "... seven for a secret, never to be told."))
|
||||||
|
|
||||||
# decode ...
|
# decode ...
|
||||||
chartr(myCypher, lett, x)
|
chartr(myCypher, alBet, x)
|
||||||
# (Nb. substitution cyphers are easy to crack!)
|
# (Nb. substitution cyphers are easy to crack!)
|
||||||
|
|
||||||
|
|
||||||
# === 6.1.4 Substitute characters
|
# === 6.1.4 Substitute characters
|
||||||
(s <- gsub("IV", "i-v", s)) # gsub can change length, first argument is
|
# gsub can change lengths.
|
||||||
# a "regular expression"!
|
# Example: implementing the binary Fibonacci sequence:
|
||||||
|
# 0 -> 1; 1 -> 10 , in three nested gsub() statements
|
||||||
|
( s <- 1 )
|
||||||
|
( s <- gsub("2", "10", gsub("0", "1", gsub("1", "2", s))) )
|
||||||
|
|
||||||
# I use it often to delete characters I don't want ...
|
# Iterate this line a few times ...
|
||||||
|
#
|
||||||
|
# cf. http://www.maths.surrey.ac.uk/hosted-sites/R.Knott/Fibonacci/fibrab.html
|
||||||
|
# for the features of the sequence.
|
||||||
|
|
||||||
|
# I use gsub() often to delete unwanted characters ...
|
||||||
# ... select something, and substitute the empty string for it.
|
# ... select something, and substitute the empty string for it.
|
||||||
(s <- gsub("-", "", s))
|
(s <- gsub("-", "", s))
|
||||||
|
|
||||||
@ -249,9 +255,9 @@ MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")
|
|||||||
# In our learning units, we use a function dbSanitizeSequence() to clean up
|
# In our learning units, we use a function dbSanitizeSequence() to clean up
|
||||||
# sequences that may be copy/pasted from Web-sources
|
# sequences that may be copy/pasted from Web-sources
|
||||||
|
|
||||||
s <- ">FASTA header will be removed
|
cat( s <- ">FASTA header will be removed
|
||||||
10 20 30 40 50
|
10 20 30 40 50
|
||||||
MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR "
|
MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR " )
|
||||||
|
|
||||||
dbSanitizeSequence(s)
|
dbSanitizeSequence(s)
|
||||||
|
|
||||||
@ -341,7 +347,7 @@ if (! requireNamespace("stringi", quietly = TRUE)) {
|
|||||||
# data(package = "stringi") # available datasets
|
# data(package = "stringi") # available datasets
|
||||||
|
|
||||||
|
|
||||||
(x <- stri::stri_match_all(mySeq, regex = "CG"))
|
(x <- stringi::stri_match_all(mySeq, regex = "CG"))
|
||||||
length(unlist(x))
|
length(unlist(x))
|
||||||
|
|
||||||
# Now you could compare that number with yeast DNA sequences, and determine
|
# Now you could compare that number with yeast DNA sequences, and determine
|
||||||
|
Loading…
Reference in New Issue
Block a user