From 450aefa1190f32129abdcd00d1c50917e901947a Mon Sep 17 00:00:00 2001 From: hyginn Date: Fri, 17 Nov 2017 23:43:50 -0500 Subject: [PATCH] New unit --- BIN-SEQA-Comparison.R | 202 ------------------------------- BIN-SEQA-Composition.R | 262 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 262 insertions(+), 202 deletions(-) delete mode 100644 BIN-SEQA-Comparison.R create mode 100644 BIN-SEQA-Composition.R diff --git a/BIN-SEQA-Comparison.R b/BIN-SEQA-Comparison.R deleted file mode 100644 index 2e5f8c2..0000000 --- a/BIN-SEQA-Comparison.R +++ /dev/null @@ -1,202 +0,0 @@ -# BIN-SEQA-Comparison.R -# -# Purpose: A Bioinformatics Course: -# R code accompanying the BIN-SEQA-Comparison unit -# -# Version: 0.1 -# -# Date: 2017 08 25 -# Author: Boris Steipe (boris.steipe@utoronto.ca) -# -# V 0.1 First code copied from BCH441_A03_makeYFOlist.R -# -# TODO: -# -# -# == HOW TO WORK WITH LEARNING UNIT FILES ====================================== -# -# DO NOT SIMPLY source() THESE FILES! - -# If there are portions you don't understand, use R's help system, Google for an -# answer, or ask your instructor. Don't continue if you don't understand what's -# going on. That's not how it works ... -# -# ============================================================================== - -# ============================================================================== -# PART THREE: Sequence Analysis -# ============================================================================== - - -if (!require(seqinr, quietly=TRUE)) { - install.packages("seqinr") - library(seqinr) -} -# Package information: -# library(help = seqinr) # basic information -# browseVignettes("seqinr") # available vignettes -# data(package = "seqinr") # available datasets - - -# Let's try a simple function -?computePI - -# This takes as input a vector of upper-case AA codes -# Let's retrieve the MYSPE sequence from our datamodel -# (assuming it is the last one that was added): - -db$protein[nrow(db$protein), "sequence"] - -# We can use the function strsplit() to split the string -# into single characters - -s <- db$protein[nrow(db$protein), "sequence"] -s <- strsplit(s, "") # splitting on the empty spring -# splits into single characters -s <- unlist(s) # strsplit() returns a list! Why? -# (But we don't need a list now...) - -# Alternatively, seqinr provides -# the function s2c() to convert strings into -# character vectors (and c2s to convert them back). - -s <- s2c(db$protein[nrow(db$protein), "sequence"]) -s - -computePI(s) # isoelectric point -pmw(s) # molecular weight -AAstat(s) # This also plots the distribution of -# values along the sequence - -# A true Labor of Love has gone into the -# compilation of the "aaindex" data: - -?aaindex -data(aaindex) # "attach" the dataset - i.e. make it accessible as an -# R object - -length(aaindex) - -# Here are all the index descriptions -for (i in 1:length(aaindex)) { - cat(paste(i, ": ", aaindex[[i]]$D, "\n", sep="")) -} - - -# Lets use one of the indices to calculate and plot amino-acid -# composition enrichment: -aaindex[[459]] - -# === Sequence Composition Enrichment -# -# Let's construct an enrichment plot to compare one of the amino acid indices -# with the situation in our sequence. - -refData <- aaindex[[459]]$I # reference frequencies in % -names(refData) <- a(names(refData)) # change names to single-letter -# code using seqinr's "a()" function -refData - - -# tabulate our sequence of interest and normalize -obsData <- table(s) # count occurrences -obsData = 100 * (obsData / sum(obsData)) # Normalize -obsData - -len <- length(refData) - -logRatio <- numeric() # create an empty vector - -# loop over all elements of the reference, calculate log-ratios -# and store them in the vector -for (i in 1:len) { - aa <- names(refData)[i] # get the name of that amino acid - fObs <- obsData[aa] # retrieve the frequency for that name - fRef <- refData[aa] - logRatio[aa] <- log(fObs / fRef) / log(2) # remember log Ratio from - # the lecture? -} - -barplot(logRatio) - -# Sort by frequency, descending -logRatio <- sort(logRatio, decreasing = TRUE) - -barplot(logRatio) # If you can't see all of the amino acid letters in the -# x-axis legend, make the plot wider by dragging the -# vertical pane-separator to the left - - - -# label the y-axis -# (see text() for details) -label <- expression(paste(log[2],"( f(obs) / f(ref) )", sep = "")) - -barplot(logRatio, - main = paste("AA composition enrichment"), - ylab = label, - cex.names=0.9) - - - -# color the bars by type. -# define colors -chargePlus <- "#404580" -chargeMinus <- "#ab3853" -hydrophilic <- "#9986bf" -hydrophobic <- "#d5eeb1" -plain <- "#f2f7f7" - -# Assign the colors to the different amino acid names -barColors <- character(len) - -for (i in 1:length(refData)) { - AA <- names(logRatio[i]) - if (grepl("[HKR]", AA)) {barColors[i] <- chargePlus } - else if (grepl("[DE]", AA)) {barColors[i] <- chargeMinus} - else if (grepl("[NQST]", AA)) {barColors[i] <- hydrophilic} - else if (grepl("[FAMILYVW]", AA)) {barColors[i] <- hydrophobic} - else barColors[i] <- plain -} - -barplot(logRatio, - main = paste("AA composition enrichment"), - ylab = label, - col = barColors, - cex.names=0.9) - - -# draw a horizontal line at y = 0 -abline(h=0) - -# add a legend that indicates what the colours mean -legend (x = 1, - y = -1, - legend = c("charged (+)", - "charged (-)", - "hydrophilic", - "hydrophobic", - "plain"), - bty = "n", - fill = c(chargePlus, - chargeMinus, - hydrophilic, - hydrophobic, - plain) -) - -# == TASK == -# Interpret this plot. (Can you?) - -# -# -# ============================================================================== - - - - - - - - -# [END] diff --git a/BIN-SEQA-Composition.R b/BIN-SEQA-Composition.R new file mode 100644 index 0000000..0942c56 --- /dev/null +++ b/BIN-SEQA-Composition.R @@ -0,0 +1,262 @@ +# BIN-SEQA-Composition.R +# +# Purpose: A Bioinformatics Course: +# R code accompanying the BIN-SEQA-Comparison unit +# +# Version: 1.0 +# +# Date: 2017 11 17 +# Author: Boris Steipe (boris.steipe@utoronto.ca) +# +# V 1.0 First live version 2017 +# V 0.1 First code copied from BCH441_A03_makeYFOlist.R +# +# TODO: +# +# +# == HOW TO WORK WITH LEARNING UNIT FILES ====================================== +# +# DO NOT SIMPLY source() THESE FILES! +# If there are portions you don't understand, use R's help system, Google for an +# answer, or ask your instructor. Don't continue if you don't understand what's +# going on. That's not how it works ... +# +# ============================================================================== + + +#TOC> ========================================================================== +#TOC> +#TOC> Section Title Line +#TOC> ---------------------------------------------------- +#TOC> 1 Preparation 41 +#TOC> 2 Aggregate properties 63 +#TOC> 3 Sequence Composition Enrichment 106 +#TOC> 3.1 Barplot, and side-by-side barplot 129 +#TOC> 3.2 Plotting ratios 164 +#TOC> 3.3 Plotting log ratios 180 +#TOC> 3.4 Sort by frequency 195 +#TOC> 3.5 Color by amino acid type 210 +#TOC> +#TOC> ========================================================================== + + +# = 1 Preparation ========================================================= + +if (!require(seqinr, quietly=TRUE)) { + install.packages("seqinr") + library(seqinr) +} +# Package information: +# library(help = seqinr) # basic information +# browseVignettes("seqinr") # available vignettes +# data(package = "seqinr") # available datasets + +# Load a reference sequence to work with: + +# If you have done the BIN-Storing_data unit: + source("makeProteinDB.R") + sel <- which(myDB$protein$name == sprintf("MBP1_%s", biCode(MYSPE))) + mySeq <- myDB$protein$sequence[sel] + +# If not, use the yeast Mbp1 sequence: + mySeq <- dbSanitizeSequence(fromJSON("./data/MBP1_SACCE.json")$sequence) + + +# = 2 Aggregate properties ================================================ + + +# Let's try a simple function from seqinr: computing the pI of the sequence +?computePI + +# This takes as input a vector of upper-case AA codes + +# We can use the function strsplit() to split the string +# into single characters + +(s <- strsplit(mySeq, "")) # splitting on the empty spring + # splits into single characters +s <- unlist(s) # strsplit() returns a list! Why? + # (But we don't need a list now...) + +# Alternatively, seqinr provides +# the function s2c() to convert strings into +# character vectors (and c2s to convert them back). + +s2c(mySeq) + + +computePI(s2c(mySeq)) # isoelectric point +pmw(s2c(mySeq)) # molecular weight +AAstat(s2c(mySeq)) # This also plots the distribution of + # values along the sequence + +# A true Labor of Love has gone into the +# compilation of the "aaindex" data: + +?aaindex +data(aaindex) # "attach" the dataset - i.e. make it accessible as an + # R object + +length(aaindex) + +# Here are all the index descriptions +for (i in 1:length(aaindex)) { + cat(paste(i, ": ", aaindex[[i]]$D, "\n", sep="")) +} + + +# = 3 Sequence Composition Enrichment ===================================== + + +# Lets use one of the indices to calculate and plot amino-acid +# composition enrichment: +aaindex[[459]]$D + +# +# Let's construct an enrichment plot to compare average frequencies +# with the amino acid counts in our sequence. + +(refData <- aaindex[[459]]$I) # reference frequencies in % +names(refData) <- a(names(refData)) # change names to single-letter + # code using seqinr's "a()" function +sum(refData) +refData # ... in % + + +# tabulate the amino acid counts in mySeq +(obsData <- table(s2c(mySeq))) # counts +(obsData <- 100 * (obsData / sum(obsData))) # frequencies + + +# == 3.1 Barplot, and side-by-side barplot ================================= + +barplot(obsData, col = "#CCCCCC", cex.names = 0.7) +abline(h = 100/20, col="#BB0000") + +barplot(refData, col = "#BB0000", cex.names = 0.7) +abline(h = 100/20, col="#555555") + +# Ok: first problem - the values in obsData are in alphabetical order. But the +# values in refData are in alphabetical order of amino acid name: alanine, +# arginine, asparagine, aspartic acid ... A, R, N, D, E ... you will see this +# order a lot - one of the old biochemistry tropes in the field. So we need to +# re-order one of the vectors to match the other. That's easy though: +refData +(refData <- refData[names(obsData)]) + +barplot(refData, col = "#BB0000", cex.names = 0.7) +abline(h = 100/20, col="#555555") + +# To compare the values, we want to see them in a barplot, side-by-side ... +barplot(rbind(obsData, refData), + ylim = c(0, 12), + beside = TRUE, + col = c("#CCCCCC", "#BB0000"), + cex.names = 0.7) +abline(h = 100/20, col="#00000044") + +# ... and add a legend +legend (x = 1, y = 12, + legend = c("mySeq", "Average composition"), + fill = c("#CCCCCC", "#BB0000"), + cex = 0.7, + bty = "n") + + +# == 3.2 Plotting ratios =================================================== + +# To better compare the values, we'll calculate ratios between +# obsData and refData + +barplot(obsData / refData, + col = "#CCCCCC", + ylab = "Sequence / Average", + cex.names = 0.7) +abline(h = 1, col="#BB0000") +abline(h = c(1/3, 3), lty = 2, col="#BB000055") + +# ... but ratios are not very good here, since the difference in height on the +# plot now depends on the order we compare in: ratios of 1/3 and 3 (dotted +# lines) are exactly the same fold-difference ! + +# == 3.3 Plotting log ratios =============================================== + +# A better way to display this +# is to plot log(ratios). + +barplot(log(obsData / refData), + col = "#CCCCCC", + ylab = "log(Sequence / Average)", + cex.names = 0.7) +abline(h = log(1), col="#BB0000") +abline(h = log(c(1/3, 3)), lty = 2, col="#BB000055") + +# Note how the three-fold difference lines are now the same distance from the +# line of equal ratio. + +# == 3.4 Sort by frequency ================================================= + +barplot(sort(log(obsData / refData), decreasing = TRUE), + ylim = c(-3.5,2), + col = "#CCCCCC", + ylab = "log(Sequence / Average)", + cex.names = 0.7) +abline(h = log(1), col="#BB0000") +abline(h = log(c(1/3, 3)), lty = 2, col="#BB000055") + +arrows(4, 1.8, 0, 1.8, length = 0.07) +text(5.5, 1.8, "Enriched", cex = 0.7) +arrows(20, 1.8, 24, 1.8, length = 0.07) +text(19.5, 1.8, "Depleted", pos = 2, cex = 0.7) + +# == 3.5 Color by amino acid type ========================================== + +# color the bars by type. +# define colors +AAcol <- character() +AAcol["A"] <- "#AABBAA" +AAcol["C"] <- "#FFEE77" +AAcol["D"] <- "#DD6600" +AAcol["E"] <- "#DD3300" +AAcol["F"] <- "#767D38" +AAcol["G"] <- "#BBBBCC" +AAcol["H"] <- "#A2A1FD" +AAcol["I"] <- "#70B6C6" +AAcol["K"] <- "#4563BB" +AAcol["L"] <- "#80C6B6" +AAcol["M"] <- "#AFCC34" +AAcol["N"] <- "#BB88CC" +AAcol["P"] <- "#7292B7" +AAcol["Q"] <- "#8866BB" +AAcol["R"] <- "#74A0FF" +AAcol["S"] <- "#9999CC" +AAcol["T"] <- "#99AADD" +AAcol["V"] <- "#9DB500" +AAcol["W"] <- "#76AD48" +AAcol["Y"] <- "#44CA97" + +barplot(rep(1, 20), names.arg = names(AAcol), col = AAcol, cex.names = 0.5) + +lR <- sort(log(obsData / refData), decreasing = TRUE) +barplot(lR, + ylim = c(-3.5,2), + col = AAcol[names(lR)], + ylab = "log(Sequence / Average)", + cex.names = 0.7) +abline(h = log(1), col="#00000055") +abline(h = log(c(1/3, 3)), lty = 2, col="#00000033") + +arrows(4, 1.8, 0, 1.8, length = 0.07) +text(5.5, 1.8, "Enriched", cex = 0.7) +arrows(20, 1.8, 24, 1.8, length = 0.07) +text(19.5, 1.8, "Depleted", pos = 2, cex = 0.7) + + +# Task: +# Interpret this plot. (Can you?) Which types of amino acids are enriched? +# Depleted? + + + + +# [END]