New unit

2017-11-17 23:43:50 -05:00
parent 40b9e6fc04
commit 450aefa119
2 changed files with 262 additions and 202 deletions
--- a/BIN-SEQA-Comparison.R
+++ b/BIN-SEQA-Comparison.R
@@ -1,202 +0,0 @@
-# BIN-SEQA-Comparison.R
-#
-# Purpose: A Bioinformatics Course:
-#              R code accompanying the BIN-SEQA-Comparison unit
-#
-# Version: 0.1
-#
-# Date:    2017  08  25
-# Author:  Boris Steipe (boris.steipe@utoronto.ca)
-#
-# V 0.1    First code copied from BCH441_A03_makeYFOlist.R
-#
-# TODO:
-#
-#
-# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
-#
-# DO NOT SIMPLY  source()  THESE FILES!
-
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-#  going on. That's not how it works ...
-#
-# ==============================================================================
-
-# ==============================================================================
-#        PART THREE: Sequence Analysis
-# ==============================================================================
-
-
-if (!require(seqinr, quietly=TRUE)) {
-  install.packages("seqinr")
-  library(seqinr)
-}
-# Package information:
-#  library(help = seqinr)       # basic information
-#  browseVignettes("seqinr")    # available vignettes
-#  data(package = "seqinr")     # available datasets
-
-
-# Let's try a simple function
-?computePI
-
-# This takes as input a vector of upper-case AA codes
-# Let's retrieve the MYSPE sequence from our datamodel
-# (assuming it is the last one that was added):
-
-db$protein[nrow(db$protein), "sequence"]
-
-# We can use the function strsplit() to split the string
-# into single characters
-
-s <- db$protein[nrow(db$protein), "sequence"]
-s <- strsplit(s, "") # splitting on the empty spring
-# splits into single characters
-s <- unlist(s)       # strsplit() returns a list! Why?
-# (But we don't need a list now...)
-
-# Alternatively, seqinr provides
-# the function s2c() to convert strings into
-# character vectors (and c2s to convert them back).
-
-s <- s2c(db$protein[nrow(db$protein), "sequence"])
-s
-
-computePI(s)  # isoelectric point
-pmw(s)        # molecular weight
-AAstat(s)     # This also plots the distribution of
-# values along the sequence
-
-# A true Labor of Love has gone into the
-# compilation of the "aaindex" data:
-
-?aaindex
-data(aaindex)  # "attach" the dataset - i.e. make it accessible as an
-# R object
-
-length(aaindex)
-
-# Here are all the index descriptions
-for (i in 1:length(aaindex)) {
-  cat(paste(i, ": ", aaindex[[i]]$D, "\n", sep=""))
-}
-
-
-# Lets use one of the indices to calculate and plot amino-acid
-# composition enrichment:
-aaindex[[459]]
-
-# === Sequence Composition Enrichment
-#
-# Let's construct an enrichment plot to compare one of the amino acid indices
-# with the situation in our sequence.
-
-refData <- aaindex[[459]]$I   # reference frequencies in %
-names(refData) <- a(names(refData))  # change names to single-letter
-# code using seqinr's "a()" function
-refData
-
-
-# tabulate our sequence of interest and normalize
-obsData <- table(s)                # count occurrences
-obsData = 100 * (obsData / sum(obsData))   # Normalize
-obsData
-
-len <- length(refData)
-
-logRatio <- numeric() # create an empty vector
-
-# loop over all elements of the reference, calculate log-ratios
-# and store them in the vector
-for (i in 1:len) {
-  aa <- names(refData)[i] # get the name of that amino acid
-  fObs <- obsData[aa]  # retrieve the frequency for that name
-  fRef <- refData[aa]
-  logRatio[aa] <- log(fObs / fRef) / log(2)  # remember log Ratio from
-  # the lecture?
-}
-
-barplot(logRatio)
-
-# Sort by frequency, descending
-logRatio <- sort(logRatio, decreasing = TRUE)
-
-barplot(logRatio)  # If you can't see all of the amino acid letters in the
-# x-axis legend, make the plot wider by dragging the
-# vertical pane-separator to the left
-
-
-
-# label the y-axis
-# (see text() for details)
-label <- expression(paste(log[2],"( f(obs) / f(ref) )", sep = ""))
-
-barplot(logRatio,
-        main = paste("AA composition enrichment"),
-        ylab = label,
-        cex.names=0.9)
-
-
-
-# color the bars by type.
-# define colors
-chargePlus  <- "#404580"
-chargeMinus <- "#ab3853"
-hydrophilic <- "#9986bf"
-hydrophobic <- "#d5eeb1"
-plain       <- "#f2f7f7"
-
-# Assign the colors to the different amino acid names
-barColors <- character(len)
-
-for (i in 1:length(refData)) {
-  AA <- names(logRatio[i])
-  if (grepl("[HKR]",      AA)) {barColors[i] <- chargePlus }
-  else if (grepl("[DE]",       AA)) {barColors[i] <- chargeMinus}
-  else if (grepl("[NQST]",     AA)) {barColors[i] <- hydrophilic}
-  else if (grepl("[FAMILYVW]", AA)) {barColors[i] <- hydrophobic}
-  else                               barColors[i] <- plain
-}
-
-barplot(logRatio,
-        main = paste("AA composition enrichment"),
-        ylab = label,
-        col = barColors,
-        cex.names=0.9)
-
-
-# draw a horizontal line at y = 0
-abline(h=0)
-
-# add a legend that indicates what the colours mean
-legend (x = 1,
-        y = -1,
-        legend = c("charged (+)",
-                   "charged (-)",
-                   "hydrophilic",
-                   "hydrophobic",
-                   "plain"),
-        bty = "n",
-        fill = c(chargePlus,
-                 chargeMinus,
-                 hydrophilic,
-                 hydrophobic,
-                 plain)
-)
-
-# == TASK ==
-# Interpret this plot. (Can you?)
-
-#
-#
-# ==============================================================================
-
-
-
-
-
-
-
-
-# [END]
--- a/BIN-SEQA-Composition.R
+++ b/BIN-SEQA-Composition.R
@@ -0,0 +1,262 @@
+# BIN-SEQA-Composition.R
+#
+# Purpose: A Bioinformatics Course:
+#              R code accompanying the BIN-SEQA-Comparison unit
+#
+# Version: 1.0
+#
+# Date:    2017  11  17
+# Author:  Boris Steipe (boris.steipe@utoronto.ca)
+#
+# V 1.0    First live version 2017
+# V 0.1    First code copied from BCH441_A03_makeYFOlist.R
+#
+# TODO:
+#
+#
+# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
+#
+# DO NOT SIMPLY  source()  THESE FILES!
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+#  going on. That's not how it works ...
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC>
+#TOC>   Section  Title                                Line
+#TOC> ----------------------------------------------------
+#TOC>   1        Preparation                            41
+#TOC>   2        Aggregate properties                   63
+#TOC>   3        Sequence Composition Enrichment       106
+#TOC>   3.1      Barplot, and side-by-side barplot     129
+#TOC>   3.2      Plotting ratios                       164
+#TOC>   3.3      Plotting log ratios                   180
+#TOC>   3.4      Sort by frequency                     195
+#TOC>   3.5      Color by amino acid type              210
+#TOC>
+#TOC> ==========================================================================
+
+
+# =    1  Preparation  =========================================================
+
+if (!require(seqinr, quietly=TRUE)) {
+  install.packages("seqinr")
+  library(seqinr)
+}
+# Package information:
+#  library(help = seqinr)       # basic information
+#  browseVignettes("seqinr")    # available vignettes
+#  data(package = "seqinr")     # available datasets
+
+# Load a reference sequence to work with:
+
+# If you have done the BIN-Storing_data unit:
+   source("makeProteinDB.R")
+   sel <- which(myDB$protein$name == sprintf("MBP1_%s", biCode(MYSPE)))
+   mySeq <- myDB$protein$sequence[sel]
+
+# If not, use the yeast Mbp1 sequence:
+   mySeq <- dbSanitizeSequence(fromJSON("./data/MBP1_SACCE.json")$sequence)
+
+
+# =    2  Aggregate properties  ================================================
+
+
+# Let's try a simple function from seqinr: computing the pI of the sequence
+?computePI
+
+# This takes as input a vector of upper-case AA codes
+
+# We can use the function strsplit() to split the string
+# into single characters
+
+(s <- strsplit(mySeq, "")) # splitting on the empty spring
+                           # splits into single characters
+s <- unlist(s)             # strsplit() returns a list! Why?
+                           # (But we don't need a list now...)
+
+# Alternatively, seqinr provides
+# the function s2c() to convert strings into
+# character vectors (and c2s to convert them back).
+
+s2c(mySeq)
+
+
+computePI(s2c(mySeq))  # isoelectric point
+pmw(s2c(mySeq))        # molecular weight
+AAstat(s2c(mySeq))     # This also plots the distribution of
+                       # values along the sequence
+
+# A true Labor of Love has gone into the
+# compilation of the "aaindex" data:
+
+?aaindex
+data(aaindex)  # "attach" the dataset - i.e. make it accessible as an
+               # R object
+
+length(aaindex)
+
+# Here are all the index descriptions
+for (i in 1:length(aaindex)) {
+  cat(paste(i, ": ", aaindex[[i]]$D, "\n", sep=""))
+}
+
+
+# =    3  Sequence Composition Enrichment  =====================================
+
+
+# Lets use one of the indices to calculate and plot amino-acid
+# composition enrichment:
+aaindex[[459]]$D
+
+#
+# Let's construct an enrichment plot to compare average frequencies
+# with the amino acid counts in our sequence.
+
+(refData <- aaindex[[459]]$I)        # reference frequencies in %
+names(refData) <- a(names(refData))  # change names to single-letter
+                                     # code using seqinr's "a()" function
+sum(refData)
+refData        # ... in %
+
+
+# tabulate the amino acid counts in mySeq
+(obsData <- table(s2c(mySeq)))                # counts
+(obsData <- 100 * (obsData / sum(obsData)))   # frequencies
+
+
+# ==   3.1  Barplot, and side-by-side barplot  =================================
+
+barplot(obsData, col = "#CCCCCC", cex.names = 0.7)
+abline(h = 100/20, col="#BB0000")
+
+barplot(refData, col = "#BB0000", cex.names = 0.7)
+abline(h = 100/20, col="#555555")
+
+# Ok: first problem - the values in obsData are in alphabetical order. But the
+# values in refData are in alphabetical order of amino acid name: alanine,
+# arginine, asparagine, aspartic acid ... A, R, N, D, E ... you will see this
+# order a lot - one of the old biochemistry tropes in the field. So we need to
+# re-order one of the vectors to match the other. That's easy though:
+refData
+(refData <- refData[names(obsData)])
+
+barplot(refData, col = "#BB0000", cex.names = 0.7)
+abline(h = 100/20, col="#555555")
+
+# To compare the values, we want to see them in a barplot, side-by-side ...
+barplot(rbind(obsData, refData),
+        ylim = c(0, 12),
+        beside = TRUE,
+        col = c("#CCCCCC", "#BB0000"),
+        cex.names = 0.7)
+abline(h = 100/20, col="#00000044")
+
+# ... and add a legend
+legend (x = 1, y = 12,
+        legend = c("mySeq", "Average composition"),
+        fill = c("#CCCCCC", "#BB0000"),
+        cex = 0.7,
+        bty = "n")
+
+
+# ==   3.2  Plotting ratios  ===================================================
+
+# To better compare the values, we'll calculate ratios between
+# obsData and refData
+
+barplot(obsData / refData,
+        col = "#CCCCCC",
+        ylab = "Sequence / Average",
+        cex.names = 0.7)
+abline(h = 1, col="#BB0000")
+abline(h = c(1/3, 3), lty = 2, col="#BB000055")
+
+# ... but  ratios are not very good here, since the difference in height on the
+# plot now depends on the order we compare in: ratios of 1/3 and 3 (dotted
+# lines) are exactly the same fold-difference !
+
+# ==   3.3  Plotting log ratios  ===============================================
+
+# A better way to display this
+# is to plot log(ratios).
+
+barplot(log(obsData / refData),
+        col = "#CCCCCC",
+        ylab = "log(Sequence / Average)",
+        cex.names = 0.7)
+abline(h = log(1), col="#BB0000")
+abline(h = log(c(1/3, 3)), lty = 2, col="#BB000055")
+
+# Note how the three-fold difference lines are now the same distance from the
+# line of equal ratio.
+
+# ==   3.4  Sort by frequency  =================================================
+
+barplot(sort(log(obsData / refData), decreasing = TRUE),
+        ylim = c(-3.5,2),
+        col = "#CCCCCC",
+        ylab = "log(Sequence / Average)",
+        cex.names = 0.7)
+abline(h = log(1), col="#BB0000")
+abline(h = log(c(1/3, 3)), lty = 2, col="#BB000055")
+
+arrows(4, 1.8, 0, 1.8, length = 0.07)
+text(5.5, 1.8, "Enriched", cex = 0.7)
+arrows(20, 1.8, 24, 1.8, length = 0.07)
+text(19.5, 1.8, "Depleted", pos = 2, cex = 0.7)
+
+# ==   3.5  Color by amino acid type  ==========================================
+
+# color the bars by type.
+# define colors
+AAcol <- character()
+AAcol["A"] <- "#AABBAA"
+AAcol["C"] <- "#FFEE77"
+AAcol["D"] <- "#DD6600"
+AAcol["E"] <- "#DD3300"
+AAcol["F"] <- "#767D38"
+AAcol["G"] <- "#BBBBCC"
+AAcol["H"] <- "#A2A1FD"
+AAcol["I"] <- "#70B6C6"
+AAcol["K"] <- "#4563BB"
+AAcol["L"] <- "#80C6B6"
+AAcol["M"] <- "#AFCC34"
+AAcol["N"] <- "#BB88CC"
+AAcol["P"] <- "#7292B7"
+AAcol["Q"] <- "#8866BB"
+AAcol["R"] <- "#74A0FF"
+AAcol["S"] <- "#9999CC"
+AAcol["T"] <- "#99AADD"
+AAcol["V"] <- "#9DB500"
+AAcol["W"] <- "#76AD48"
+AAcol["Y"] <- "#44CA97"
+
+barplot(rep(1, 20), names.arg = names(AAcol), col = AAcol, cex.names = 0.5)
+
+lR <- sort(log(obsData / refData), decreasing = TRUE)
+barplot(lR,
+        ylim = c(-3.5,2),
+        col = AAcol[names(lR)],
+        ylab = "log(Sequence / Average)",
+        cex.names = 0.7)
+abline(h = log(1), col="#00000055")
+abline(h = log(c(1/3, 3)), lty = 2, col="#00000033")
+
+arrows(4, 1.8, 0, 1.8, length = 0.07)
+text(5.5, 1.8, "Enriched", cex = 0.7)
+arrows(20, 1.8, 24, 1.8, length = 0.07)
+text(19.5, 1.8, "Depleted", pos = 2, cex = 0.7)
+
+
+# Task:
+#   Interpret this plot. (Can you?) Which types of amino acids are enriched?
+#   Depleted?
+
+
+
+
+# [END]