bch441-work-abc-units/RPR-Biostrings.R

# tocID <- "RPR-Biostrings.R"
#
# Purpose:  A Bioinformatics Course:
#              R code accompanying the RPR-Biostrings unit.
#
# Version:  1.2
#
# Date:     2017-10  -  2020-09
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
#           1.2    2020 Updates
#           1.1    Change from require() to requireNamespace(),
#                      use <package>::<function>() idiom throughout,
#                      use Biocmanager:: not biocLite()
#           1.0    2017 Revisions
#           0.1    First code copied from 2016 material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================


#TOC> ==========================================================================
#TOC> 
#TOC>   Section  Title                                             Line
#TOC> -----------------------------------------------------------------
#TOC>   1        The Biostrings:: Package                            56
#TOC>   2        Getting Data into Biostrings:: Objects              88
#TOC>   3        Working with Biostrings:: Objects                  110
#TOC>   3.1        Properties                                       127
#TOC>   3.2        Subsetting                                       168
#TOC>   3.3        Operators                                        180
#TOC>   3.4        Transformations                                  187
#TOC>   4        Getting Data out of Biostrings:: Objects           194
#TOC>   5        More                                               203
#TOC>   5.1        Views                                            205
#TOC>   5.2        Iranges                                          219
#TOC>   5.3        StringSets                                       225
#TOC> 
#TOC> ==========================================================================


# This is a very brief introduction to the Biostrings:: package, other units will
# be using more of the Biostrings:: functions.


# =    1  The Biostrings:: Package  ============================================


# First, we install and load the Biostrings:: package from bioconductor (if we
# haven't done so already).

if (! requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
if (! requireNamespace("Biostrings", quietly = TRUE)) {
  BiocManager::install("Biostrings")
}
# Examine the package information:
library(help = Biostrings)       # basic information
browseVignettes("Biostrings")    # available vignettes
data(package = "Biostrings")     # available datasets


# At its core, Biostrings:: objects are "classes" of type XString (you can think
# of a "class" in R as a special kind of list), that can take on particular
# flavours for RNA, DNA or amino acid sequence information.

class(Biostrings::RNAString("AUG"))
class(Biostrings::DNAString("ATG"))
class(Biostrings::AAString("M"))

# An essential property of Biostrings:: objects is that they only allow letters
# from the applicable IUPAC alphabet:
Biostrings::RNAString("AUG")
Biostrings::DNAString("AUG")  # Error! No "U" in IUPAC DNA codes


# =    2  Getting Data into Biostrings:: Objects  ==============================


# Example: read FASTA. Extract sequence. Convert to DNAString object.
rawSeq <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
rawSeq <- dbSanitizeSequence(rawSeq)
biosDNAseq <- Biostrings::DNAString(rawSeq) # converts the nucleotide sequence
                                            # into an object of class DNAstring

# Multi FASTA files can be read directly as a "XStringSet) ...
rawMFAfile <- "./data/S288C_YDL056W_MBP1_coding.fsa"
(biosDNASet <- Biostrings::readDNAStringSet(rawMFAfile))

# ... and if you subset one sequence from the set, you get an XString object
# back again.
(Xseq <- biosDNASet[[1]])

biosDNAseq == Xseq           # the comparison evaluates to TRUE ...
identical(biosDNAseq, Xseq)  # ... and indeed the objects are deemed identical.


# =    3  Working with Biostrings:: Objects  ===================================

# Biostrings:: is a highly engineered package that is tightly integrated into
# the Bioconductor world - unfortunately that brings with it a somewhat
# undesirable level of computational overhead and dependencies. Using the
# package as we normally do - i.e. calling required functions with their
# explicit package prefix is therefore not advisable. There are generics
# that won't be propery dispatched. If you only need a small number of
# functions for a very specific context, you will probably get away with
# Biostrings::<function>() - but even in the demonstration code of this script
# not everything works out of the box. We'll therefore load the library,
# but we'll (redundantly) use the prefix anyway so as to emphasize where
# the functions come from.

library(Biostrings)


# ==   3.1  Properties  ========================================================
str(rawSeq)
str(biosDNAseq)

length(rawSeq)       # ... is 1: one string only. To get the number of
                     # characters in a string, you need nchar().
length(biosDNAseq)   # but the length of a "Bstring" is the number of elements
nchar(rawSeq)
nchar(biosDNAseq)    # ... but nchar() works too.

(uL <- Biostrings::uniqueLetters(biosDNAseq))

# Count frequencies - with strings, you would strsplit() into a character
# vector and then use table(). biost
Biostrings::alphabetFrequency(biosDNAseq)

# letterFrequency() works with a defined alphabet - such as what uniqueLetters()
# returns.
Biostrings::letterFrequency(biosDNAseq, uL)
sum(Biostrings::letterFrequency(biosDNAseq, c("G", "C"))) /
  length(biosDNAseq) # GC contents

Biostrings::dinucleotideFrequency(biosDNAseq)
barplot(sort(Biostrings::dinucleotideFrequency(biosDNAseq)), cex.names = 0.5)

(triNuc <- Biostrings::trinucleotideFrequency(biosDNAseq))
barplot(sort(triNuc), col="#4499EE33")
triNuc[triNuc == max(triNuc)]
triNuc[triNuc == min(triNuc)]
max(triNuc) / min(triNuc)  # AAA is more than 13 times as frequent as CGT

# compare to a shuffled sequence:
(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))
barplot(sort(triNuc), col="#EEEE4433", add = TRUE)
max(triNuc)
# Interpret this plot.
(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))
barplot(sort(triNuc), col="#EEEE4433")
max(triNuc)


# ==   3.2  Subsetting  ========================================================

# Subsetting any XString object works as expected:
biosDNAseq[4:15]

# ... well - maybe not expected, because rawSeq[4:15] would not work.

# Alternatively to the "[" operator, use the subseq() function - especially for
# long sequences. This is far more efficient.
Biostrings::subseq(biosDNAseq, start = 1, end = 30)


# ==   3.3  Operators  =========================================================

# RNAstring() and DNAstring() objects compare U and T as equals!
  Biostrings::RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") ==
  Biostrings::DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT")


# ==   3.4  Transformations  ===================================================

biosDNAseq[4:15]
Biostrings::reverseComplement(biosDNAseq[4:15])
Biostrings::translate(biosDNAseq[4:15])


# =    4  Getting Data out of Biostrings:: Objects  ============================

# If you need a character object, use toString():

Biostrings::toString(biosDNAseq[4:15])

# saveRDS() and readRDS() works like on all other R objects.


# =    5  More  ================================================================

# ==   5.1  Views  =============================================================

# Biostring "Views" are objects that store multiple substrings of one
# Biostring object.

(myView <- Biostrings::Views(biosDNAseq,
                             start = c(1, 19, 37),
                             end = c(15, 30, 45)))

# Views are convenient to store feature annotations
names(myView) <- c("Feature-A", "Feature-B", "Feature-C")
cat(sprintf("\n%s\t(%d)\t%s", names(myView), width(myView), myView ))


# ==   5.2  Iranges  ===========================================================

# Biostrings:: Iranges are like Views with a common start point. These can be
# useful for feature annotations. Instead of start/end you store start/width.


# ==   5.3  StringSets  ========================================================

# Biostring "StringSets" store multiple sequences.
#
ompA <- Biostrings::AAString("MKKTAIAIAVALAGFATVAQA")
sample(ompA) # sample can work directly on a Biostring object to shuffle it

x <- Biostrings::toString(ompA)
for (i in 2:10) {
  x[i] <- Biostrings::toString(sample(ompA))
}
shuffledPeptideSet <- Biostrings::AAStringSet(x)
names(shuffledPeptideSet) <- c("ompA", paste("shuffle.", 1:9, sep=""))
shuffledPeptideSet

length(shuffledPeptideSet)
Biostrings::width(shuffledPeptideSet)
Biostrings::alphabetFrequency(shuffledPeptideSet)


# [END]
Line termination change and old code. 2021-11-16 05:31:48 +00:00			`# tocID <- "RPR-Biostrings.R"`
			`#`
			`# Purpose: A Bioinformatics Course:`
			`# R code accompanying the RPR-Biostrings unit.`
			`#`
			`# Version: 1.2`
			`#`
			`# Date: 2017-10 - 2020-09`
			`# Author: Boris Steipe (boris.steipe@utoronto.ca)`
			`#`
			`# Versions:`
			`# 1.2 2020 Updates`
			`# 1.1 Change from require() to requireNamespace(),`
			`# use <package>::<function>() idiom throughout,`
			`# use Biocmanager:: not biocLite()`
			`# 1.0 2017 Revisions`
			`# 0.1 First code copied from 2016 material.`
			`#`
			`#`
			`# TODO:`
			`#`
			`#`
			`# == DO NOT SIMPLY source() THIS FILE! =======================================`
			`#`
			`# If there are portions you don't understand, use R's help system, Google for an`
			`# answer, or ask your instructor. Don't continue if you don't understand what's`
			`# going on. That's not how it works ...`
			`#`
			`# ==============================================================================`


			`#TOC> ==========================================================================`
			`#TOC>`
			`#TOC> Section Title Line`
			`#TOC> -----------------------------------------------------------------`
			`#TOC> 1 The Biostrings:: Package 56`
			`#TOC> 2 Getting Data into Biostrings:: Objects 88`
			`#TOC> 3 Working with Biostrings:: Objects 110`
			`#TOC> 3.1 Properties 127`
			`#TOC> 3.2 Subsetting 168`
			`#TOC> 3.3 Operators 180`
			`#TOC> 3.4 Transformations 187`
			`#TOC> 4 Getting Data out of Biostrings:: Objects 194`
			`#TOC> 5 More 203`
			`#TOC> 5.1 Views 205`
			`#TOC> 5.2 Iranges 219`
			`#TOC> 5.3 StringSets 225`
			`#TOC>`
			`#TOC> ==========================================================================`


			`# This is a very brief introduction to the Biostrings:: package, other units will`
			`# be using more of the Biostrings:: functions.`


			`# = 1 The Biostrings:: Package ============================================`


			`# First, we install and load the Biostrings:: package from bioconductor (if we`
			`# haven't done so already).`

			`if (! requireNamespace("BiocManager", quietly = TRUE)) {`
			`install.packages("BiocManager")`
			`}`
			`if (! requireNamespace("Biostrings", quietly = TRUE)) {`
			`BiocManager::install("Biostrings")`
			`}`
			`# Examine the package information:`
			`library(help = Biostrings) # basic information`
			`browseVignettes("Biostrings") # available vignettes`
			`data(package = "Biostrings") # available datasets`


			`# At its core, Biostrings:: objects are "classes" of type XString (you can think`
			`# of a "class" in R as a special kind of list), that can take on particular`
			`# flavours for RNA, DNA or amino acid sequence information.`

			`class(Biostrings::RNAString("AUG"))`
			`class(Biostrings::DNAString("ATG"))`
			`class(Biostrings::AAString("M"))`

			`# An essential property of Biostrings:: objects is that they only allow letters`
			`# from the applicable IUPAC alphabet:`
			`Biostrings::RNAString("AUG")`
			`Biostrings::DNAString("AUG") # Error! No "U" in IUPAC DNA codes`


			`# = 2 Getting Data into Biostrings:: Objects ==============================`


			`# Example: read FASTA. Extract sequence. Convert to DNAString object.`
			`rawSeq <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")`
			`rawSeq <- dbSanitizeSequence(rawSeq)`
			`biosDNAseq <- Biostrings::DNAString(rawSeq) # converts the nucleotide sequence`
			`# into an object of class DNAstring`

			`# Multi FASTA files can be read directly as a "XStringSet) ...`
			`rawMFAfile <- "./data/S288C_YDL056W_MBP1_coding.fsa"`
			`(biosDNASet <- Biostrings::readDNAStringSet(rawMFAfile))`

			`# ... and if you subset one sequence from the set, you get an XString object`
			`# back again.`
			`(Xseq <- biosDNASet[[1]])`

			`biosDNAseq == Xseq # the comparison evaluates to TRUE ...`
			`identical(biosDNAseq, Xseq) # ... and indeed the objects are deemed identical.`



			`# = 3 Working with Biostrings:: Objects ===================================`

			`# Biostrings:: is a highly engineered package that is tightly integrated into`
			`# the Bioconductor world - unfortunately that brings with it a somewhat`
			`# undesirable level of computational overhead and dependencies. Using the`
			`# package as we normally do - i.e. calling required functions with their`
			`# explicit package prefix is therefore not advisable. There are generics`
			`# that won't be propery dispatched. If you only need a small number of`
			`# functions for a very specific context, you will probably get away with`
			`# Biostrings::<function>() - but even in the demonstration code of this script`
			`# not everything works out of the box. We'll therefore load the library,`
			`# but we'll (redundantly) use the prefix anyway so as to emphasize where`
			`# the functions come from.`

			`library(Biostrings)`


			`# == 3.1 Properties ========================================================`
			`str(rawSeq)`
			`str(biosDNAseq)`

			`length(rawSeq) # ... is 1: one string only. To get the number of`
			`# characters in a string, you need nchar().`
			`length(biosDNAseq) # but the length of a "Bstring" is the number of elements`
			`nchar(rawSeq)`
			`nchar(biosDNAseq) # ... but nchar() works too.`

			`(uL <- Biostrings::uniqueLetters(biosDNAseq))`

			`# Count frequencies - with strings, you would strsplit() into a character`
			`# vector and then use table(). biost`
			`Biostrings::alphabetFrequency(biosDNAseq)`

			`# letterFrequency() works with a defined alphabet - such as what uniqueLetters()`
			`# returns.`
			`Biostrings::letterFrequency(biosDNAseq, uL)`
			`sum(Biostrings::letterFrequency(biosDNAseq, c("G", "C"))) /`
			`length(biosDNAseq) # GC contents`

			`Biostrings::dinucleotideFrequency(biosDNAseq)`
			`barplot(sort(Biostrings::dinucleotideFrequency(biosDNAseq)), cex.names = 0.5)`

			`(triNuc <- Biostrings::trinucleotideFrequency(biosDNAseq))`
			`barplot(sort(triNuc), col="#4499EE33")`
			`triNuc[triNuc == max(triNuc)]`
			`triNuc[triNuc == min(triNuc)]`
			`max(triNuc) / min(triNuc) # AAA is more than 13 times as frequent as CGT`

			`# compare to a shuffled sequence:`
			`(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))`
			`barplot(sort(triNuc), col="#EEEE4433", add = TRUE)`
			`max(triNuc)`
			`# Interpret this plot.`
			`(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))`
			`barplot(sort(triNuc), col="#EEEE4433")`
			`max(triNuc)`


			`# == 3.2 Subsetting ========================================================`

			`# Subsetting any XString object works as expected:`
			`biosDNAseq[4:15]`

			`# ... well - maybe not expected, because rawSeq[4:15] would not work.`

			`# Alternatively to the "[" operator, use the subseq() function - especially for`
			`# long sequences. This is far more efficient.`
			`Biostrings::subseq(biosDNAseq, start = 1, end = 30)`


			`# == 3.3 Operators =========================================================`

			`# RNAstring() and DNAstring() objects compare U and T as equals!`
			`Biostrings::RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") ==`
			`Biostrings::DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT")`


			`# == 3.4 Transformations ===================================================`

			`biosDNAseq[4:15]`
			`Biostrings::reverseComplement(biosDNAseq[4:15])`
			`Biostrings::translate(biosDNAseq[4:15])`


			`# = 4 Getting Data out of Biostrings:: Objects ============================`

			`# If you need a character object, use toString():`

			`Biostrings::toString(biosDNAseq[4:15])`

			`# saveRDS() and readRDS() works like on all other R objects.`


			`# = 5 More ================================================================`

			`# == 5.1 Views =============================================================`

			`# Biostring "Views" are objects that store multiple substrings of one`
			`# Biostring object.`

			`(myView <- Biostrings::Views(biosDNAseq,`
			`start = c(1, 19, 37),`
			`end = c(15, 30, 45)))`

			`# Views are convenient to store feature annotations`
			`names(myView) <- c("Feature-A", "Feature-B", "Feature-C")`
			`cat(sprintf("\n%s\t(%d)\t%s", names(myView), width(myView), myView ))`


			`# == 5.2 Iranges ===========================================================`

			`# Biostrings:: Iranges are like Views with a common start point. These can be`
			`# useful for feature annotations. Instead of start/end you store start/width.`


			`# == 5.3 StringSets ========================================================`

			`# Biostring "StringSets" store multiple sequences.`
			`#`
			`ompA <- Biostrings::AAString("MKKTAIAIAVALAGFATVAQA")`
			`sample(ompA) # sample can work directly on a Biostring object to shuffle it`

			`x <- Biostrings::toString(ompA)`
			`for (i in 2:10) {`
			`x[i] <- Biostrings::toString(sample(ompA))`
			`}`
			`shuffledPeptideSet <- Biostrings::AAStringSet(x)`
			`names(shuffledPeptideSet) <- c("ompA", paste("shuffle.", 1:9, sep=""))`
			`shuffledPeptideSet`

			`length(shuffledPeptideSet)`
			`Biostrings::width(shuffledPeptideSet)`
			`Biostrings::alphabetFrequency(shuffledPeptideSet)`


			`# [END]`