bch441-work-abc-units/BIN-Sequence.R

# tocID <- "BIN-Sequence.R"
#
# Purpose:  A Bioinformatics Course:
#              R code accompanying the BIN-Sequence unit.
#
# Version:  1.5
#
# Date:     2017-09  - 2020-09
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
#           1.5    2020 Updates
#           1.4    Change from require() to requireNamespace(),
#                      use <package>::<function>() idiom throughout,
#                      use Biocmanager:: not biocLite()
#           1.3    Update set.seed() usage
#           1.2    Removed irrelevant task. How did that even get in there? smh
#           1.1    Add chartr()
#           1.0    First live version 2017.
#
# TODO:
#
#
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================


#TOC> ==========================================================================
#TOC>
#TOC>   Section  Title                                Line
#TOC> ----------------------------------------------------
#TOC>   1        Prepare                                63
#TOC>   2        Storing Sequence                       80
#TOC>   3        String properties                     109
#TOC>   4        Substrings                            116
#TOC>   5        Creating strings: sprintf()           137
#TOC>   6        Changing strings                      172
#TOC>   6.1.1          Changing case                   174
#TOC>   6.1.2          Reverse                         179
#TOC>   6.1.3          Change characters               183
#TOC>   6.1.4          Substitute characters           211
#TOC>   6.2        stringi and stringr                 231
#TOC>   6.3        dbSanitizeSequence()                241
#TOC>   7        Permuting and sampling                253
#TOC>   7.1        Permutations                        260
#TOC>   7.2        Sampling                            306
#TOC>   7.2.1          Equiprobable characters         308
#TOC>   7.2.2          Defined probability vector      350
#TOC>
#TOC> ==========================================================================


# =    1  Prepare  =============================================================

# Much basic sequence handling is supported by the Bioconductor package
# Biostrings.

if (! requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
if (! requireNamespace("Biostrings", quietly = TRUE)) {
  BiocManager::install("Biostrings")
}
# Package information:
#  library(help = Biostrings)       # basic information
#  browseVignettes("Biostrings")    # available vignettes
#  data(package = "Biostrings")     # available datasets


# =    2  Storing Sequence  ====================================================


# Sequences can be represented and stored as vectors of single characters ...
(v <- c("D", "I", "V", "M", "T", "Q"))

# ... as strings ...
(s <- "DIVMTQ")

# ... or as more complex objects with rich metadata e.g. as a Biostrings
# DNAstring, RNAstring, AAString, etc.
(a <- Biostrings::AAString("DIVMTQ"))

# ... and all of these representations can be interconverted:

# string to vector ...
unlist(strsplit(s, ""))

# vector to string ...
paste(v, sep = "", collapse = "")

# ... and AAstring to plain string.
as.character(a)

# Since operations with character vectors trivially follow all other vector
# conventions and syntax, and we will look at Biostrings methods in more
# detail in a later unit, we will focus on basic strings in the following.


# =    3  String properties  ===================================================


length(s) # why ???
nchar(s)  # Aha!


# =    4  Substrings  ==========================================================

# Use the substr() function
substr(s, 2, 4)

# or the similar substring()
substring(s, 2, 4)

# Note: both functions are vectorized (i.e. they operate on vectors
# of arguments, you don't need to loop over input)...
myBiCodes <- c("HOMSA", "MUSMU", "FUGRU", "XENLA")
substr(   myBiCodes, 1, 3)
substring(myBiCodes, 1, 3)

# ... however only substring() will also use vectors for start and stop
s <- "gatattgtgatgacccagtaa"       # a DNA sequence
(vI <- seq(1, nchar(s), by = 3))   # an index vector
substr(   s, vI, vI+2)             # ... returns only the first nucleotide triplet
substring(s, vI, vI+2)             # ... returns all triplets


# =    5  Creating strings: sprintf()  =========================================


# Sprintf is a very smart, very powerful function and has cognates in all
# other programming languages. It has a bit of a  learning curve, but this is
# totally worth it:
# the function takes a format string, and a list of other arguments. It returns
# a formatted string. Here are some examples - watch carefully for sprintf()
# calls elsewhere in the code.

sprintf("Just a string.")
sprintf("A string and the number %d.", 5)
sprintf("More numbers: %d ate %d.", 7, 9) # Sorry
sprintf("Pi is ~ %1.2f ...", pi)
sprintf("or more accurately ~ %1.11f.", pi)
x <- "bottles of beer"
N <- 99
sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.",
        N, x, N, x, "one down, and pass it around", N - 1, x)

# Note that in the last example, the value of the string was displayed with
# R's usual print-formatting function and therefore the line-break "\n" did
# not actually break the line. To have line breaks, tabs etc, you need to use
# cat() to display the string:

for (i in N:(N-4)) {
  cat(sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.\n\n",
              i, x, i, x, "one down, and pass it around", i - 1, x))
}

# sprintf() is vectorized: if one of its parameters is a vector, it
# will generate one output string for each of the vector's elements:
cat(sprintf("\n%s fish", c("one", "two", "red", "blue")))


# =    6  Changing strings  ====================================================

# ===   6.1.1  Changing case
tolower(s)
toupper(tolower(s))


# ===   6.1.2  Reverse
# (This used to work in Biostrings, apparently it doesn't work anymore. Why?)
# Biostrings::str_rev(s)
# The following works, of course, but awkward:
s
paste0(rev(unlist(strsplit(s, ""))), collapse = "")

# reverse complement
COMP <- c("t", "g", "c", "a")
names(COMP) <- c("a", "c", "g", "t")     # mapping the complement via names
s
paste0(COMP[rev(unlist(strsplit(s, "")))], collapse = "")


# ===   6.1.3  Change characters
# chartr(old, new, x) maps all characters in x that appear in "old" to the
# correpsonding character in "new." Kind of like the COMP vector above ...

chartr("aeio", "uuuu", "We hold these truths to be self-evident ...")

# One could implement toupper() and tolower() with this - remember that R has
# character vectors of uppercase and lowercase letters as language constants.
chartr(paste0(letters, collapse = ""),
       paste0(LETTERS, collapse = ""),
       "Twinkle, twinkle little star, how I wonder what you are.")

# One amusing way to use the function  is for a reversible substitution
# cypher.
alBet <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .,;:?0123456789"
set.seed(112358)                       # set RNG seed for repeatable randomness
( myCypher <- paste0(sample(unlist(strsplit(alBet, ""))), collapse = "") )
set.seed(NULL)                         # reset the RNG

# encode ...
(x <- chartr(alBet, myCypher, "... seven for a secret, never to be told."))

# decode ...
chartr(myCypher, alBet, x)
# (Nb. substitution cyphers are easy to crack!)


# ===   6.1.4  Substitute characters
# gsub can change lengths.
#   Example: implementing the binary Fibonacci sequence:
#   0 -> 1; 1 -> 10 , in three nested gsub() statements
( s <- 1 )
( s <- gsub("2", "10", gsub("0", "1", gsub("1", "2", s))) )

# Iterate this line a few times ...
#
# cf. http://www.maths.surrey.ac.uk/hosted-sites/R.Knott/Fibonacci/fibrab.html
# for the features of the sequence.

# I use gsub() often to delete unwanted characters ...
# ... select something, and substitute the empty string for it.
(s <- gsub("-", "", s))

# For example: clean up a sequence
# copy/paste from UniProt
(s <- "        10         20         30         40         50
MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")


# remove numbers
(s <- gsub("[0-9]", "", s))

# remove "whitespace" (spaces, tabs, line breaks)...
(s <- gsub("\\s", "", s))

# ==   6.2  stringi and stringr  ===============================================

# But there are also specialized functions eg. to remove leading/trailing
# whitespace which may be important to sanitize user input etc. Have a look at
# the function descriptions for the stringr and the stringi package. stringr is
# part of the tidyverse, and for the most part a wrapper for stringi functions.
# https://github.com/tidyverse/stringr


# ==   6.3  dbSanitizeSequence()  ==============================================

# In our learning units, we use a function dbSanitizeSequence() to clean up
# sequences that may be copy/pasted from Web-sources

cat( s <- ">FASTA header will be removed
10         20         30         40         50
MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR " )

dbSanitizeSequence(s)


# =    7  Permuting and sampling  ==============================================


# An important aspect of working with strings is generating random strings
# with given statistical properties: reference items to evaluate significance.


# ==   7.1  Permutations  ======================================================


# One way to produce such reference items is to permute a string. A permuted
# string has the same composition as the original, but all positional
# information is lost. The sample() function can be used to permute:

# This is the sequence of the ompA secretion signal
(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))

(x <- sample(s, length(s)))  # permuted

# Here's a small example how such permuted strings may be useful. As you look
# at the ompA sequence, you suspect that the two lysines near the +-charged
# N-terminus may not be accidental, but selected for a positively charged
# N-terminus. What is the chance that such a sequence has two lysines close to
# the N-terminus simply by chance? Or put differently: what is the average
# distance of two lysines in such a sequence to the N-terminus. First, we
# need an expression that measures the distance. A simple use of the which()
# function will do just fine.

which(s == "K")        # shows they are in position 2 and 3, so ...
mean(which(s == "K"))  # ... gives us the average, and ...
mean(which(x == "K"))  # ... gives us the average of the permuted sequence.

# So what does the distribution look like? Lets do 10,000 trials.

(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
N <- 10000
d <- numeric(N)

set.seed(112358)                       # set RNG seed for repeatable randomness
for (i in 1:N) {
  d[i] <- mean(which(sample(s, length(s)) == "K"))
}
set.seed(NULL)                         # reset the RNG

hist(d, breaks = 20)
abline(v = 2.5, lwd = 2, col = "firebrick")
sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the
              # N-terminus or more. That's just below the signifcance
              # threshold of 5 %. It's a trend, but to be sure we are looking
              # at a biological effect we would need to see more
              # sequences.


# ==   7.2  Sampling  ==========================================================

# ===   7.2.1  Equiprobable characters

# Assume you need a large random-nucleotide string for some statistical model.
# How to create such a string? sample() can easily create it:

nuc <- c("A", "C", "G", "T")
N <- 100

set.seed(16818)                        # set RNG seed for repeatable randomness
v <- sample(nuc, N, replace = TRUE)
set.seed(NULL)                         # reset the RNG

(mySeq <- paste(v, collapse = ""))

# What's the GC content?
table(v)
sum(table(v)[c("G", "C")]) # 51 is close to expected

# What's the number of CpG motifs? Easy to check with the stringi
# stri_match_all() function

if (! requireNamespace("stringi", quietly = TRUE)) {
  install.packages("stringi")
}
# Package information:
#  library(help = stringi)       # basic information
#  browseVignettes("stringi")    # available vignettes
#  data(package = "stringi")     # available datasets


(x <- stringi::stri_match_all(mySeq, regex = "CG"))
length(unlist(x))

# Now you could compare that number with yeast DNA sequences, and determine
# whether there are more or less CpG motifs than expected by chance.
# (cf. https://en.wikipedia.org/wiki/CpG_site)
# But hold on: is that a fair comparison? sample() gives us all four nucleotides
# with the same probability. But the yeast genomic DNA GC content is only
# 38%. So you would expect fewer CpG motifs based on the statistical properties
# of the smaller number of Cs and Gs - before biology even comes into play. How
# do we account for that?

# ===   7.2.2  Defined probability vector

# This is where we need to know how to create samples with specific probability
# distributions. A crude hack would be to create a sampling source vector with
# 19 C, 19 G, 31 A and 31 T
c(rep("C", 19), rep("G", 19), rep(c("A"), 31), rep(c("T"), 31))
# ... but that doesn't scale if the numeric accuracy needs to be higher.
#
# However sample() has an argument that takes care of that: you can explicitly
# specify the probabilities with which each element of the the sampling vector
# should be chosen:

nuc <- c("A", "C", "G", "T")
N <- 100
myProb <- c(0.31, 0.19, 0.19, 0.31)    # sampling probabilities

set.seed(16818)                       # set RNG seed for repeatable randomness
v <- sample(nuc, N, prob = myProb, replace = TRUE)
set.seed(NULL)                         # reset the RNG

(mySeq <- paste(v, collapse = ""))

# What's the GC content?
table(v)
sum(table(v)[c("G", "C")]) # Close to expected

# What's the number of CpG motifs?
(x <- stringi::stri_match_all(mySeq, regex = "CG"))
# ... not a single one in this case.


# [END]
2020 updates - deactivate for maintenance 2020-09-18 11:56:30 +00:00			`# tocID <- "BIN-Sequence.R"`
			`#`
New unit 2017-09-29 16:23:57 +00:00			`# Purpose: A Bioinformatics Course:`
			`# R code accompanying the BIN-Sequence unit.`
			`#`
Maintenace, and add a Fibonacci-sequence example 2020-09-23 13:21:46 +00:00			`# Version: 1.5`
New unit 2017-09-29 16:23:57 +00:00			`#`
Maintenace, and add a Fibonacci-sequence example 2020-09-23 13:21:46 +00:00			`# Date: 2017-09 - 2020-09`
New unit 2017-09-29 16:23:57 +00:00			`# Author: Boris Steipe (boris.steipe@utoronto.ca)`
			`#`
			`# Versions:`
Maintenace, and add a Fibonacci-sequence example 2020-09-23 13:21:46 +00:00			`# 1.5 2020 Updates`
Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`# 1.4 Change from require() to requireNamespace(),`
			`# use <package>::<function>() idiom throughout,`
			`# use Biocmanager:: not biocLite()`
Maintenance updates, and revised set.seed() usage 2019-01-07 06:17:23 +00:00			`# 1.3 Update set.seed() usage`
Update 2017-11-02 03:28:12 +00:00			`# 1.2 Removed irrelevant task. How did that even get in there? smh`
Update: chartr() 2017-10-02 18:50:33 +00:00			`# 1.1 Add chartr()`
Update version 2017-09-29 16:26:22 +00:00			`# 1.0 First live version 2017.`
New unit 2017-09-29 16:23:57 +00:00			`#`
			`# TODO:`
			`#`
			`#`
			`# == DO NOT SIMPLY source() THIS FILE! =======================================`
			`#`
			`# If there are portions you don't understand, use R's help system, Google for an`
			`# answer, or ask your instructor. Don't continue if you don't understand what's`
			`# going on. That's not how it works ...`
			`#`
			`# ==============================================================================`

Added package information code after every library() call. 2017-10-29 03:05:53 +00:00
New unit 2017-09-29 16:23:57 +00:00			`#TOC> ==========================================================================`
2020 updates - deactivate for maintenance 2020-09-18 11:56:30 +00:00			`#TOC>`
Maintenance updates, and revised set.seed() usage 2019-01-07 06:17:23 +00:00			`#TOC> Section Title Line`
			`#TOC> ----------------------------------------------------`
Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`#TOC> 1 Prepare 63`
			`#TOC> 2 Storing Sequence 80`
			`#TOC> 3 String properties 109`
			`#TOC> 4 Substrings 116`
			`#TOC> 5 Creating strings: sprintf() 137`
			`#TOC> 6 Changing strings 172`
			`#TOC> 6.1.1 Changing case 174`
			`#TOC> 6.1.2 Reverse 179`
			`#TOC> 6.1.3 Change characters 183`
			`#TOC> 6.1.4 Substitute characters 211`
			`#TOC> 6.2 stringi and stringr 231`
			`#TOC> 6.3 dbSanitizeSequence() 241`
			`#TOC> 7 Permuting and sampling 253`
			`#TOC> 7.1 Permutations 260`
			`#TOC> 7.2 Sampling 306`
			`#TOC> 7.2.1 Equiprobable characters 308`
			`#TOC> 7.2.2 Defined probability vector 350`
2020 updates - deactivate for maintenance 2020-09-18 11:56:30 +00:00			`#TOC>`
New unit 2017-09-29 16:23:57 +00:00			`#TOC> ==========================================================================`
Added package information code after every library() call. 2017-10-29 03:05:53 +00:00

New unit 2017-09-29 16:23:57 +00:00			`# = 1 Prepare =============================================================`

			`# Much basic sequence handling is supported by the Bioconductor package`
			`# Biostrings.`

Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`if (! requireNamespace("BiocManager", quietly = TRUE)) {`
			`install.packages("BiocManager")`
			`}`
			`if (! requireNamespace("Biostrings", quietly = TRUE)) {`
			`BiocManager::install("Biostrings")`
New unit 2017-09-29 16:23:57 +00:00			`}`
Added package information code after every library() call. 2017-10-29 03:05:53 +00:00			`# Package information:`
			`# library(help = Biostrings) # basic information`
			`# browseVignettes("Biostrings") # available vignettes`
			`# data(package = "Biostrings") # available datasets`
New unit 2017-09-29 16:23:57 +00:00

			`# = 2 Storing Sequence ====================================================`


			`# Sequences can be represented and stored as vectors of single characters ...`
			`(v <- c("D", "I", "V", "M", "T", "Q"))`

			`# ... as strings ...`
			`(s <- "DIVMTQ")`

			`# ... or as more complex objects with rich metadata e.g. as a Biostrings`
			`# DNAstring, RNAstring, AAString, etc.`
Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`(a <- Biostrings::AAString("DIVMTQ"))`
New unit 2017-09-29 16:23:57 +00:00
			`# ... and all of these representations can be interconverted:`

			`# string to vector ...`
			`unlist(strsplit(s, ""))`

			`# vector to string ...`
			`paste(v, sep = "", collapse = "")`

			`# ... and AAstring to plain string.`
			`as.character(a)`

			`# Since operations with character vectors trivially follow all other vector`
			`# conventions and syntax, and we will look at Biostrings methods in more`
			`# detail in a later unit, we will focus on basic strings in the following.`


			`# = 3 String properties ===================================================`


			`length(s) # why ???`
Maintenace, and add a Fibonacci-sequence example 2020-09-23 13:21:46 +00:00			`nchar(s) # Aha!`
New unit 2017-09-29 16:23:57 +00:00

			`# = 4 Substrings ==========================================================`

			`# Use the substr() function`
			`substr(s, 2, 4)`

Maintenance updates, and revised set.seed() usage 2019-01-07 06:17:23 +00:00			`# or the similar substring()`
			`substring(s, 2, 4)`

			`# Note: both functions are vectorized (i.e. they operate on vectors`
			`# of arguments, you don't need to loop over input)...`
			`myBiCodes <- c("HOMSA", "MUSMU", "FUGRU", "XENLA")`
			`substr( myBiCodes, 1, 3)`
			`substring(myBiCodes, 1, 3)`

			`# ... however only substring() will also use vectors for start and stop`
Maintenace, and add a Fibonacci-sequence example 2020-09-23 13:21:46 +00:00			`s <- "gatattgtgatgacccagtaa" # a DNA sequence`
			`(vI <- seq(1, nchar(s), by = 3)) # an index vector`
			`substr( s, vI, vI+2) # ... returns only the first nucleotide triplet`
			`substring(s, vI, vI+2) # ... returns all triplets`
Maintenance updates, and revised set.seed() usage 2019-01-07 06:17:23 +00:00
New unit 2017-09-29 16:23:57 +00:00
			`# = 5 Creating strings: sprintf() =========================================`


Update: chartr() 2017-10-02 18:50:33 +00:00			`# Sprintf is a very smart, very powerful function and has cognates in all`
Maintenance updates, and revised set.seed() usage 2019-01-07 06:17:23 +00:00			`# other programming languages. It has a bit of a learning curve, but this is`
New unit 2017-09-29 16:23:57 +00:00			`# totally worth it:`
			`# the function takes a format string, and a list of other arguments. It returns`
			`# a formatted string. Here are some examples - watch carefully for sprintf()`
Maintenance updates, and revised set.seed() usage 2019-01-07 06:17:23 +00:00			`# calls elsewhere in the code.`
New unit 2017-09-29 16:23:57 +00:00
			`sprintf("Just a string.")`
			`sprintf("A string and the number %d.", 5)`
			`sprintf("More numbers: %d ate %d.", 7, 9) # Sorry`
			`sprintf("Pi is ~ %1.2f ...", pi)`
			`sprintf("or more accurately ~ %1.11f.", pi)`
			`x <- "bottles of beer"`
Maintenance updates, and revised set.seed() usage 2019-01-07 06:17:23 +00:00			`N <- 99`
New unit 2017-09-29 16:23:57 +00:00			`sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.",`
Maintenance updates, and revised set.seed() usage 2019-01-07 06:17:23 +00:00			`N, x, N, x, "one down, and pass it around", N - 1, x)`
New unit 2017-09-29 16:23:57 +00:00
			`# Note that in the last example, the value of the string was displayed with`
			`# R's usual print-formatting function and therefore the line-break "\n" did`
			`# not actually break the line. To have line breaks, tabs etc, you need to use`
			`# cat() to display the string:`

Maintenance updates, and revised set.seed() usage 2019-01-07 06:17:23 +00:00			`for (i in N:(N-4)) {`
New unit 2017-09-29 16:23:57 +00:00			`cat(sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.\n\n",`
Maintenance updates, and revised set.seed() usage 2019-01-07 06:17:23 +00:00			`i, x, i, x, "one down, and pass it around", i - 1, x))`
New unit 2017-09-29 16:23:57 +00:00			`}`

Maintenance updates, and revised set.seed() usage 2019-01-07 06:17:23 +00:00			`# sprintf() is vectorized: if one of its parameters is a vector, it`
			`# will generate one output string for each of the vector's elements:`
			`cat(sprintf("\n%s fish", c("one", "two", "red", "blue")))`

New unit 2017-09-29 16:23:57 +00:00
			`# = 6 Changing strings ====================================================`

2020 updates - deactivate for maintenance 2020-09-18 11:56:30 +00:00			`# === 6.1.1 Changing case`
New unit 2017-09-29 16:23:57 +00:00			`tolower(s)`
			`toupper(tolower(s))`

Update: chartr() 2017-10-02 18:50:33 +00:00
2020 updates - deactivate for maintenance 2020-09-18 11:56:30 +00:00			`# === 6.1.2 Reverse`
Maintenace, and add a Fibonacci-sequence example 2020-09-23 13:21:46 +00:00			`# (This used to work in Biostrings, apparently it doesn't work anymore. Why?)`
			`# Biostrings::str_rev(s)`
			`# The following works, of course, but awkward:`
			`s`
			`paste0(rev(unlist(strsplit(s, ""))), collapse = "")`

			`# reverse complement`
			`COMP <- c("t", "g", "c", "a")`
			`names(COMP) <- c("a", "c", "g", "t") # mapping the complement via names`
			`s`
			`paste0(COMP[rev(unlist(strsplit(s, "")))], collapse = "")`
New unit 2017-09-29 16:23:57 +00:00
Update: chartr() 2017-10-02 18:50:33 +00:00
2020 updates - deactivate for maintenance 2020-09-18 11:56:30 +00:00			`# === 6.1.3 Change characters`
Update: chartr() 2017-10-02 18:50:33 +00:00			`# chartr(old, new, x) maps all characters in x that appear in "old" to the`
Maintenace, and add a Fibonacci-sequence example 2020-09-23 13:21:46 +00:00			`# correpsonding character in "new." Kind of like the COMP vector above ...`
Update: chartr() 2017-10-02 18:50:33 +00:00
			`chartr("aeio", "uuuu", "We hold these truths to be self-evident ...")`

			`# One could implement toupper() and tolower() with this - remember that R has`
			`# character vectors of uppercase and lowercase letters as language constants.`
			`chartr(paste0(letters, collapse = ""),`
			`paste0(LETTERS, collapse = ""),`
			`"Twinkle, twinkle little star, how I wonder what you are.")`

			`# One amusing way to use the function is for a reversible substitution`
			`# cypher.`
Maintenace, and add a Fibonacci-sequence example 2020-09-23 13:21:46 +00:00			`alBet <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .,;:?0123456789"`
Maintenance updates, and revised set.seed() usage 2019-01-07 06:17:23 +00:00			`set.seed(112358) # set RNG seed for repeatable randomness`
Maintenace, and add a Fibonacci-sequence example 2020-09-23 13:21:46 +00:00			`( myCypher <- paste0(sample(unlist(strsplit(alBet, ""))), collapse = "") )`
Maintenance updates, and revised set.seed() usage 2019-01-07 06:17:23 +00:00			`set.seed(NULL) # reset the RNG`

			`# encode ...`
Maintenace, and add a Fibonacci-sequence example 2020-09-23 13:21:46 +00:00			`(x <- chartr(alBet, myCypher, "... seven for a secret, never to be told."))`
Maintenance updates, and revised set.seed() usage 2019-01-07 06:17:23 +00:00
			`# decode ...`
Maintenace, and add a Fibonacci-sequence example 2020-09-23 13:21:46 +00:00			`chartr(myCypher, alBet, x)`
Update: chartr() 2017-10-02 18:50:33 +00:00			`# (Nb. substitution cyphers are easy to crack!)`


2020 updates - deactivate for maintenance 2020-09-18 11:56:30 +00:00			`# === 6.1.4 Substitute characters`
Maintenace, and add a Fibonacci-sequence example 2020-09-23 13:21:46 +00:00			`# gsub can change lengths.`
			`# Example: implementing the binary Fibonacci sequence:`
			`# 0 -> 1; 1 -> 10 , in three nested gsub() statements`
			`( s <- 1 )`
			`( s <- gsub("2", "10", gsub("0", "1", gsub("1", "2", s))) )`

			`# Iterate this line a few times ...`
			`#`
			`# cf. http://www.maths.surrey.ac.uk/hosted-sites/R.Knott/Fibonacci/fibrab.html`
			`# for the features of the sequence.`
New unit 2017-09-29 16:23:57 +00:00
Maintenace, and add a Fibonacci-sequence example 2020-09-23 13:21:46 +00:00			`# I use gsub() often to delete unwanted characters ...`
Update: chartr() 2017-10-02 18:50:33 +00:00			`# ... select something, and substitute the empty string for it.`
New unit 2017-09-29 16:23:57 +00:00			`(s <- gsub("-", "", s))`

			`# For example: clean up a sequence`
			`# copy/paste from UniProt`
			`(s <- " 10 20 30 40 50`
			`MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")`


			`# remove numbers`
			`(s <- gsub("[0-9]", "", s))`

			`# remove "whitespace" (spaces, tabs, line breaks)...`
			`(s <- gsub("\\s", "", s))`

Maintenance updates, and revised set.seed() usage 2019-01-07 06:17:23 +00:00			`# == 6.2 stringi and stringr ===============================================`
New unit 2017-09-29 16:23:57 +00:00
			`# But there are also specialized functions eg. to remove leading/trailing`
			`# whitespace which may be important to sanitize user input etc. Have a look at`
			`# the function descriptions for the stringr and the stringi package. stringr is`
			`# part of the tidyverse, and for the most part a wrapper for stringi functions.`
			`# https://github.com/tidyverse/stringr`



Maintenance updates, and revised set.seed() usage 2019-01-07 06:17:23 +00:00			`# == 6.3 dbSanitizeSequence() ==============================================`
New unit 2017-09-29 16:23:57 +00:00
			`# In our learning units, we use a function dbSanitizeSequence() to clean up`
			`# sequences that may be copy/pasted from Web-sources`

Maintenace, and add a Fibonacci-sequence example 2020-09-23 13:21:46 +00:00			`cat( s <- ">FASTA header will be removed`
New unit 2017-09-29 16:23:57 +00:00			`10 20 30 40 50`
Maintenace, and add a Fibonacci-sequence example 2020-09-23 13:21:46 +00:00			`MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR " )`
New unit 2017-09-29 16:23:57 +00:00
			`dbSanitizeSequence(s)`


			`# = 7 Permuting and sampling ==============================================`


			`# An important aspect of working with strings is generating random strings`
			`# with given statistical properties: reference items to evaluate significance.`


			`# == 7.1 Permutations ======================================================`


			`# One way to produce such reference items is to permute a string. A permuted`
			`# string has the same composition as the original, but all positional`
			`# information is lost. The sample() function can be used to permute:`

			`# This is the sequence of the ompA secretion signal`
			`(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))`

			`(x <- sample(s, length(s))) # permuted`

			`# Here's a small example how such permuted strings may be useful. As you look`
			`# at the ompA sequence, you suspect that the two lysines near the +-charged`
			`# N-terminus may not be accidental, but selected for a positively charged`
			`# N-terminus. What is the chance that such a sequence has two lysines close to`
			`# the N-terminus simply by chance? Or put differently: what is the average`
			`# distance of two lysines in such a sequence to the N-terminus. First, we`
			`# need an expression that measures the distance. A simple use of the which()`
			`# function will do just fine.`

			`which(s == "K") # shows they are in position 2 and 3, so ...`
			`mean(which(s == "K")) # ... gives us the average, and ...`
			`mean(which(x == "K")) # ... gives us the average of the permuted sequence.`

			`# So what does the distribution look like? Lets do 10,000 trials.`

			`(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))`
			`N <- 10000`
			`d <- numeric(N)`
Maintenance updates, and revised set.seed() usage 2019-01-07 06:17:23 +00:00
			`set.seed(112358) # set RNG seed for repeatable randomness`
New unit 2017-09-29 16:23:57 +00:00			`for (i in 1:N) {`
			`d[i] <- mean(which(sample(s, length(s)) == "K"))`
			`}`
Maintenance updates, and revised set.seed() usage 2019-01-07 06:17:23 +00:00			`set.seed(NULL) # reset the RNG`

New unit 2017-09-29 16:23:57 +00:00			`hist(d, breaks = 20)`
			`abline(v = 2.5, lwd = 2, col = "firebrick")`
			`sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the`
			`# N-terminus or more. That's just below the signifcance`
			`# threshold of 5 %. It's a trend, but to be sure we are looking`
			`# at a biological effect we would need to see more`
			`# sequences.`


			`# == 7.2 Sampling ==========================================================`

2020 updates - deactivate for maintenance 2020-09-18 11:56:30 +00:00			`# === 7.2.1 Equiprobable characters`
New unit 2017-09-29 16:23:57 +00:00
			`# Assume you need a large random-nucleotide string for some statistical model.`
			`# How to create such a string? sample() can easily create it:`

			`nuc <- c("A", "C", "G", "T")`
			`N <- 100`
Maintenance updates, and revised set.seed() usage 2019-01-07 06:17:23 +00:00
			`set.seed(16818) # set RNG seed for repeatable randomness`
New unit 2017-09-29 16:23:57 +00:00			`v <- sample(nuc, N, replace = TRUE)`
Maintenance updates, and revised set.seed() usage 2019-01-07 06:17:23 +00:00			`set.seed(NULL) # reset the RNG`
Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00
New unit 2017-09-29 16:23:57 +00:00			`(mySeq <- paste(v, collapse = ""))`

			`# What's the GC content?`
			`table(v)`
			`sum(table(v)[c("G", "C")]) # 51 is close to expected`

			`# What's the number of CpG motifs? Easy to check with the stringi`
			`# stri_match_all() function`

Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`if (! requireNamespace("stringi", quietly = TRUE)) {`
New unit 2017-09-29 16:23:57 +00:00			`install.packages("stringi")`
			`}`
Added package information code after every library() call. 2017-10-29 03:05:53 +00:00			`# Package information:`
			`# library(help = stringi) # basic information`
			`# browseVignettes("stringi") # available vignettes`
			`# data(package = "stringi") # available datasets`

New unit 2017-09-29 16:23:57 +00:00
Maintenace, and add a Fibonacci-sequence example 2020-09-23 13:21:46 +00:00			`(x <- stringi::stri_match_all(mySeq, regex = "CG"))`
New unit 2017-09-29 16:23:57 +00:00			`length(unlist(x))`

			`# Now you could compare that number with yeast DNA sequences, and determine`
			`# whether there are more or less CpG motifs than expected by chance.`
			`# (cf. https://en.wikipedia.org/wiki/CpG_site)`
			`# But hold on: is that a fair comparison? sample() gives us all four nucleotides`
			`# with the same probability. But the yeast genomic DNA GC content is only`
			`# 38%. So you would expect fewer CpG motifs based on the statistical properties`
			`# of the smaller number of Cs and Gs - before biology even comes into play. How`
			`# do we account for that?`

2020 updates - deactivate for maintenance 2020-09-18 11:56:30 +00:00			`# === 7.2.2 Defined probability vector`
New unit 2017-09-29 16:23:57 +00:00
			`# This is where we need to know how to create samples with specific probability`
			`# distributions. A crude hack would be to create a sampling source vector with`
			`# 19 C, 19 G, 31 A and 31 T`
			`c(rep("C", 19), rep("G", 19), rep(c("A"), 31), rep(c("T"), 31))`
			`# ... but that doesn't scale if the numeric accuracy needs to be higher.`
			`#`
			`# However sample() has an argument that takes care of that: you can explicitly`
			`# specify the probabilities with which each element of the the sampling vector`
			`# should be chosen:`

			`nuc <- c("A", "C", "G", "T")`
			`N <- 100`
Maintenance updates, and revised set.seed() usage 2019-01-07 06:17:23 +00:00			`myProb <- c(0.31, 0.19, 0.19, 0.31) # sampling probabilities`

			`set.seed(16818) # set RNG seed for repeatable randomness`
New unit 2017-09-29 16:23:57 +00:00			`v <- sample(nuc, N, prob = myProb, replace = TRUE)`
Maintenance updates, and revised set.seed() usage 2019-01-07 06:17:23 +00:00			`set.seed(NULL) # reset the RNG`

New unit 2017-09-29 16:23:57 +00:00			`(mySeq <- paste(v, collapse = ""))`

			`# What's the GC content?`
			`table(v)`
			`sum(table(v)[c("G", "C")]) # Close to expected`

			`# What's the number of CpG motifs?`
Maintenance updates, and revised set.seed() usage 2019-01-07 06:17:23 +00:00			`(x <- stringi::stri_match_all(mySeq, regex = "CG"))`
New unit 2017-09-29 16:23:57 +00:00			`# ... not a single one in this case.`



			`# [END]`