New unit RPR-Biostrings.R

2017-10-21 21:00:55 -04:00
parent b748c1264c
commit 46285224d6
1 changed files with 176 additions and 10 deletions
--- a/RPR-Biostrings.R
+++ b/RPR-Biostrings.R
@@ -3,49 +3,215 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the RPR-Biostrings unit.
 #
-# Version:  0.1
+# Version:  1.0
 #
-# Date:     2017  08  28
+# Date:     2017  10  20
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
+#           1.0    2017 Revisions
 #           0.1    First code copied from 2016 material.
-
+#
 #
 # TODO:
 #
 #
 # == DO NOT SIMPLY  source()  THIS FILE! =======================================
-
+#
 # If there are portions you don't understand, use R's help system, Google for an
 # answer, or ask your instructor. Don't continue if you don't understand what's
 # going on. That's not how it works ...
-
+#
 # ==============================================================================
 
-# = 1 The Biostrings package
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                                     Line
+#TOC> ---------------------------------------------------------
+#TOC>   1        The Biostrings package                      53
+#TOC>   2        Getting Data into Biostrings Objects        82
+#TOC>   3        Working with Biostrings Objects            102
+#TOC>   3.1      Properties                                 105
+#TOC>   3.2      Subsetting                                 142
+#TOC>   3.3      Operators                                  154
+#TOC>   3.4      Transformations                            161
+#TOC>   4        Getting Data out of Biostrings Objects     168
+#TOC>   5        More                                       177
+#TOC>   5.1      Views                                      179
+#TOC>   5.2      Iranges                                    191
+#TOC>   5.3      StringSets                                 197
+#TOC> 
+#TOC> ==========================================================================
+ 
+
+
+
+# This is a very brief introduction to the biostrings package, other units will
+# be using more of the biostrings functions.
+
+
+# =    1  The Biostrings package  ==============================================
+
+
+# First, we install and load the Biostrings package from bioconductor

-# First, we install and load the Biostrings package.
 if (!require(Biostrings, quietly=TRUE)) {
  source("https://bioconductor.org/biocLite.R")
  biocLite("Biostrings")
  library(Biostrings)
 }

+
 # This is a large collection of tools ...
 help(package = "Biostrings")

+# At its core, Biostrings objects are "classes" of type XString (you can think
+# of a "class" in R as a special kind of list), that can take on particular
+# flavours for RNA, DNA or amino acid sequence information.
+
+class(RNAString("AUG"))
+class(DNAString("ATG"))
+class(AAString("M"))
+
+# An essential property of Biostrings objects is that they only allow letters
+# from the applicable IUPAC alphabet:
+RNAString("AUG")
+DNAString("AUG")  # Error! No "U" in IUPAC DNA codes
+
+
+# =    2  Getting Data into Biostrings Objects  ================================
+
+
+# Example: read FASTA. Extract sequence. Convert to DNAString object.
+x <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
+x <- dbSanitizeSequence(x)
+myDNAseq <- DNAString(x)   # takes the nucleotide sequence and conerts into a
+                           # object of class DNAstring
+
+# Multi FASTA files can be read directly ...
+readDNAStringSet("./data/S288C_YDL056W_MBP1_coding.fsa") # Note: XStringSet
+
+# ... and if you subset one sequence from the set, you get an XString object
+(  x <- readDNAStringSet("./data/S288C_YDL056W_MBP1_coding.fsa")[[1]]  )
+
+myDNAseq == x
+identical(myDNAseq, x)



-
-# = 1.1 <<<Subsection>>>
+# =    3  Working with Biostrings Objects  =====================================


+# ==   3.1  Properties  ========================================================
+str(myDNAseq)
+length(nchar(x))  # This gives you the _number of nucleotides_!
+                  # By comparison ...
+length(x)         # ... is 1: one string only. To get the number of
+                  # characters in a string, you need nchar().
+nchar(x)          # However ...
+nchar(myDNAseq)   # ... also works.

-# = 1 Tasks
+uniqueLetters(myDNAseq)
+
+# Count frequencies - with strings, you would strsplit() into a character
+# vector and then use table(). biost
+alphabetFrequency(myDNAseq)
+
+# letterFrequency() works with a defined alphabet - such as what uniqueLetters()
+# returns.
+letterFrequency(myDNAseq, uniqueLetters(myDNAseq))
+
+sum(letterFrequency(myDNAseq, c("G", "C"))) / length(myDNAseq) # GC contents
+
+dinucleotideFrequency(myDNAseq)
+barplot(sort(dinucleotideFrequency(myDNAseq)), cex.names = 0.5)
+
+(x <- trinucleotideFrequency(myDNAseq))
+barplot(sort(x), col="#4499EE33")
+x[x == max(x)]
+x[x == min(x)]
+max(x) / min(x)  # AAA is more than 13 times as frequent as CGT
+
+# compare to a shuffled sequence:
+(x <- trinucleotideFrequency(sample(myDNAseq)))
+barplot(sort(x), col="#EEEE4433", add = TRUE)
+
+# Interpret this plot.


+# ==   3.2  Subsetting  ========================================================
+
+# Subsetting any XString object works as expected:
+myDNAseq[4:15]
+
+# ... well - maybe not expected, because x[4:15] would not work.
+
+# Alternatively to the "[" operator, use the subseq() function - especially for
+# long sequences. This is far more efficient.
+subseq(myDNAseq, start = 1, end = 30)
+
+
+# ==   3.3  Operators  =========================================================
+
+# RNAstring() and DNAstring() objects compare U and T as equals!
+RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") ==
+DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT")
+
+
+# ==   3.4  Transformations  ===================================================
+
+myDNAseq[4:15]
+reverseComplement(myDNAseq[4:15])
+translate(myDNAseq[4:15])
+
+
+# =    4  Getting Data out of Biostrings Objects  ==============================
+
+# If you need a character object, use toString():
+
+toString(myDNAseq[4:15])
+
+# save() and load() works like on all other R objects.
+
+
+# =    5  More  ================================================================
+
+# ==   5.1  Views  =============================================================
+
+# Biostring "Views" are objects that store mutliple substrings of one
+# Biostring object.
+
+(myView <- Views(myDNAseq, start = c(1, 19, 37), end = c(15, 30, 45)))
+
+# Views are convenient to store feature annotations
+names(myView) <- c("Feature-A", "Feature-B", "Feature-C")
+cat(sprintf("\n%s\t(%d)\t%s", names(myView), width(myView), myView ))
+
+
+# ==   5.2  Iranges  ===========================================================
+
+# Biostrings Iranges are like Views with a common start point. These can be
+# useful for feature annotations. Instead of start/end you store start/width.
+
+
+# ==   5.3  StringSets  ========================================================
+
+# Biostring "StringSets" store multiple sequences.
+#
+ompA <- AAString("MKKTAIAIAVALAGFATVAQA")
+sample(ompA) # sample can work directly on a Biostring object to shuffle it
+
+x[1] <- toString(ompA)
+for (i in 2:10) {
+  x[i] <- toString(sample(ompA))
+}
+shuffledPeptideSet <- AAStringSet(x)
+names(shuffledPeptideSet) <- c("ompA", paste("shuffle.", 1:9, sep=""))
+shuffledPeptideSet
+
+length(shuffledPeptideSet)
+width(shuffledPeptideSet)
+alphabetFrequency(shuffledPeptideSet)


 # [END]