New unit
This commit is contained in:
parent
40b9e6fc04
commit
450aefa119
@ -1,202 +0,0 @@
|
||||
# BIN-SEQA-Comparison.R
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-SEQA-Comparison unit
|
||||
#
|
||||
# Version: 0.1
|
||||
#
|
||||
# Date: 2017 08 25
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# V 0.1 First code copied from BCH441_A03_makeYFOlist.R
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
|
||||
#
|
||||
# DO NOT SIMPLY source() THESE FILES!
|
||||
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
# ==============================================================================
|
||||
# PART THREE: Sequence Analysis
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
if (!require(seqinr, quietly=TRUE)) {
|
||||
install.packages("seqinr")
|
||||
library(seqinr)
|
||||
}
|
||||
# Package information:
|
||||
# library(help = seqinr) # basic information
|
||||
# browseVignettes("seqinr") # available vignettes
|
||||
# data(package = "seqinr") # available datasets
|
||||
|
||||
|
||||
# Let's try a simple function
|
||||
?computePI
|
||||
|
||||
# This takes as input a vector of upper-case AA codes
|
||||
# Let's retrieve the MYSPE sequence from our datamodel
|
||||
# (assuming it is the last one that was added):
|
||||
|
||||
db$protein[nrow(db$protein), "sequence"]
|
||||
|
||||
# We can use the function strsplit() to split the string
|
||||
# into single characters
|
||||
|
||||
s <- db$protein[nrow(db$protein), "sequence"]
|
||||
s <- strsplit(s, "") # splitting on the empty spring
|
||||
# splits into single characters
|
||||
s <- unlist(s) # strsplit() returns a list! Why?
|
||||
# (But we don't need a list now...)
|
||||
|
||||
# Alternatively, seqinr provides
|
||||
# the function s2c() to convert strings into
|
||||
# character vectors (and c2s to convert them back).
|
||||
|
||||
s <- s2c(db$protein[nrow(db$protein), "sequence"])
|
||||
s
|
||||
|
||||
computePI(s) # isoelectric point
|
||||
pmw(s) # molecular weight
|
||||
AAstat(s) # This also plots the distribution of
|
||||
# values along the sequence
|
||||
|
||||
# A true Labor of Love has gone into the
|
||||
# compilation of the "aaindex" data:
|
||||
|
||||
?aaindex
|
||||
data(aaindex) # "attach" the dataset - i.e. make it accessible as an
|
||||
# R object
|
||||
|
||||
length(aaindex)
|
||||
|
||||
# Here are all the index descriptions
|
||||
for (i in 1:length(aaindex)) {
|
||||
cat(paste(i, ": ", aaindex[[i]]$D, "\n", sep=""))
|
||||
}
|
||||
|
||||
|
||||
# Lets use one of the indices to calculate and plot amino-acid
|
||||
# composition enrichment:
|
||||
aaindex[[459]]
|
||||
|
||||
# === Sequence Composition Enrichment
|
||||
#
|
||||
# Let's construct an enrichment plot to compare one of the amino acid indices
|
||||
# with the situation in our sequence.
|
||||
|
||||
refData <- aaindex[[459]]$I # reference frequencies in %
|
||||
names(refData) <- a(names(refData)) # change names to single-letter
|
||||
# code using seqinr's "a()" function
|
||||
refData
|
||||
|
||||
|
||||
# tabulate our sequence of interest and normalize
|
||||
obsData <- table(s) # count occurrences
|
||||
obsData = 100 * (obsData / sum(obsData)) # Normalize
|
||||
obsData
|
||||
|
||||
len <- length(refData)
|
||||
|
||||
logRatio <- numeric() # create an empty vector
|
||||
|
||||
# loop over all elements of the reference, calculate log-ratios
|
||||
# and store them in the vector
|
||||
for (i in 1:len) {
|
||||
aa <- names(refData)[i] # get the name of that amino acid
|
||||
fObs <- obsData[aa] # retrieve the frequency for that name
|
||||
fRef <- refData[aa]
|
||||
logRatio[aa] <- log(fObs / fRef) / log(2) # remember log Ratio from
|
||||
# the lecture?
|
||||
}
|
||||
|
||||
barplot(logRatio)
|
||||
|
||||
# Sort by frequency, descending
|
||||
logRatio <- sort(logRatio, decreasing = TRUE)
|
||||
|
||||
barplot(logRatio) # If you can't see all of the amino acid letters in the
|
||||
# x-axis legend, make the plot wider by dragging the
|
||||
# vertical pane-separator to the left
|
||||
|
||||
|
||||
|
||||
# label the y-axis
|
||||
# (see text() for details)
|
||||
label <- expression(paste(log[2],"( f(obs) / f(ref) )", sep = ""))
|
||||
|
||||
barplot(logRatio,
|
||||
main = paste("AA composition enrichment"),
|
||||
ylab = label,
|
||||
cex.names=0.9)
|
||||
|
||||
|
||||
|
||||
# color the bars by type.
|
||||
# define colors
|
||||
chargePlus <- "#404580"
|
||||
chargeMinus <- "#ab3853"
|
||||
hydrophilic <- "#9986bf"
|
||||
hydrophobic <- "#d5eeb1"
|
||||
plain <- "#f2f7f7"
|
||||
|
||||
# Assign the colors to the different amino acid names
|
||||
barColors <- character(len)
|
||||
|
||||
for (i in 1:length(refData)) {
|
||||
AA <- names(logRatio[i])
|
||||
if (grepl("[HKR]", AA)) {barColors[i] <- chargePlus }
|
||||
else if (grepl("[DE]", AA)) {barColors[i] <- chargeMinus}
|
||||
else if (grepl("[NQST]", AA)) {barColors[i] <- hydrophilic}
|
||||
else if (grepl("[FAMILYVW]", AA)) {barColors[i] <- hydrophobic}
|
||||
else barColors[i] <- plain
|
||||
}
|
||||
|
||||
barplot(logRatio,
|
||||
main = paste("AA composition enrichment"),
|
||||
ylab = label,
|
||||
col = barColors,
|
||||
cex.names=0.9)
|
||||
|
||||
|
||||
# draw a horizontal line at y = 0
|
||||
abline(h=0)
|
||||
|
||||
# add a legend that indicates what the colours mean
|
||||
legend (x = 1,
|
||||
y = -1,
|
||||
legend = c("charged (+)",
|
||||
"charged (-)",
|
||||
"hydrophilic",
|
||||
"hydrophobic",
|
||||
"plain"),
|
||||
bty = "n",
|
||||
fill = c(chargePlus,
|
||||
chargeMinus,
|
||||
hydrophilic,
|
||||
hydrophobic,
|
||||
plain)
|
||||
)
|
||||
|
||||
# == TASK ==
|
||||
# Interpret this plot. (Can you?)
|
||||
|
||||
#
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
262
BIN-SEQA-Composition.R
Normal file
262
BIN-SEQA-Composition.R
Normal file
@ -0,0 +1,262 @@
|
||||
# BIN-SEQA-Composition.R
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-SEQA-Comparison unit
|
||||
#
|
||||
# Version: 1.0
|
||||
#
|
||||
# Date: 2017 11 17
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# V 1.0 First live version 2017
|
||||
# V 0.1 First code copied from BCH441_A03_makeYFOlist.R
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
|
||||
#
|
||||
# DO NOT SIMPLY source() THESE FILES!
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ----------------------------------------------------
|
||||
#TOC> 1 Preparation 41
|
||||
#TOC> 2 Aggregate properties 63
|
||||
#TOC> 3 Sequence Composition Enrichment 106
|
||||
#TOC> 3.1 Barplot, and side-by-side barplot 129
|
||||
#TOC> 3.2 Plotting ratios 164
|
||||
#TOC> 3.3 Plotting log ratios 180
|
||||
#TOC> 3.4 Sort by frequency 195
|
||||
#TOC> 3.5 Color by amino acid type 210
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Preparation =========================================================
|
||||
|
||||
if (!require(seqinr, quietly=TRUE)) {
|
||||
install.packages("seqinr")
|
||||
library(seqinr)
|
||||
}
|
||||
# Package information:
|
||||
# library(help = seqinr) # basic information
|
||||
# browseVignettes("seqinr") # available vignettes
|
||||
# data(package = "seqinr") # available datasets
|
||||
|
||||
# Load a reference sequence to work with:
|
||||
|
||||
# If you have done the BIN-Storing_data unit:
|
||||
source("makeProteinDB.R")
|
||||
sel <- which(myDB$protein$name == sprintf("MBP1_%s", biCode(MYSPE)))
|
||||
mySeq <- myDB$protein$sequence[sel]
|
||||
|
||||
# If not, use the yeast Mbp1 sequence:
|
||||
mySeq <- dbSanitizeSequence(fromJSON("./data/MBP1_SACCE.json")$sequence)
|
||||
|
||||
|
||||
# = 2 Aggregate properties ================================================
|
||||
|
||||
|
||||
# Let's try a simple function from seqinr: computing the pI of the sequence
|
||||
?computePI
|
||||
|
||||
# This takes as input a vector of upper-case AA codes
|
||||
|
||||
# We can use the function strsplit() to split the string
|
||||
# into single characters
|
||||
|
||||
(s <- strsplit(mySeq, "")) # splitting on the empty spring
|
||||
# splits into single characters
|
||||
s <- unlist(s) # strsplit() returns a list! Why?
|
||||
# (But we don't need a list now...)
|
||||
|
||||
# Alternatively, seqinr provides
|
||||
# the function s2c() to convert strings into
|
||||
# character vectors (and c2s to convert them back).
|
||||
|
||||
s2c(mySeq)
|
||||
|
||||
|
||||
computePI(s2c(mySeq)) # isoelectric point
|
||||
pmw(s2c(mySeq)) # molecular weight
|
||||
AAstat(s2c(mySeq)) # This also plots the distribution of
|
||||
# values along the sequence
|
||||
|
||||
# A true Labor of Love has gone into the
|
||||
# compilation of the "aaindex" data:
|
||||
|
||||
?aaindex
|
||||
data(aaindex) # "attach" the dataset - i.e. make it accessible as an
|
||||
# R object
|
||||
|
||||
length(aaindex)
|
||||
|
||||
# Here are all the index descriptions
|
||||
for (i in 1:length(aaindex)) {
|
||||
cat(paste(i, ": ", aaindex[[i]]$D, "\n", sep=""))
|
||||
}
|
||||
|
||||
|
||||
# = 3 Sequence Composition Enrichment =====================================
|
||||
|
||||
|
||||
# Lets use one of the indices to calculate and plot amino-acid
|
||||
# composition enrichment:
|
||||
aaindex[[459]]$D
|
||||
|
||||
#
|
||||
# Let's construct an enrichment plot to compare average frequencies
|
||||
# with the amino acid counts in our sequence.
|
||||
|
||||
(refData <- aaindex[[459]]$I) # reference frequencies in %
|
||||
names(refData) <- a(names(refData)) # change names to single-letter
|
||||
# code using seqinr's "a()" function
|
||||
sum(refData)
|
||||
refData # ... in %
|
||||
|
||||
|
||||
# tabulate the amino acid counts in mySeq
|
||||
(obsData <- table(s2c(mySeq))) # counts
|
||||
(obsData <- 100 * (obsData / sum(obsData))) # frequencies
|
||||
|
||||
|
||||
# == 3.1 Barplot, and side-by-side barplot =================================
|
||||
|
||||
barplot(obsData, col = "#CCCCCC", cex.names = 0.7)
|
||||
abline(h = 100/20, col="#BB0000")
|
||||
|
||||
barplot(refData, col = "#BB0000", cex.names = 0.7)
|
||||
abline(h = 100/20, col="#555555")
|
||||
|
||||
# Ok: first problem - the values in obsData are in alphabetical order. But the
|
||||
# values in refData are in alphabetical order of amino acid name: alanine,
|
||||
# arginine, asparagine, aspartic acid ... A, R, N, D, E ... you will see this
|
||||
# order a lot - one of the old biochemistry tropes in the field. So we need to
|
||||
# re-order one of the vectors to match the other. That's easy though:
|
||||
refData
|
||||
(refData <- refData[names(obsData)])
|
||||
|
||||
barplot(refData, col = "#BB0000", cex.names = 0.7)
|
||||
abline(h = 100/20, col="#555555")
|
||||
|
||||
# To compare the values, we want to see them in a barplot, side-by-side ...
|
||||
barplot(rbind(obsData, refData),
|
||||
ylim = c(0, 12),
|
||||
beside = TRUE,
|
||||
col = c("#CCCCCC", "#BB0000"),
|
||||
cex.names = 0.7)
|
||||
abline(h = 100/20, col="#00000044")
|
||||
|
||||
# ... and add a legend
|
||||
legend (x = 1, y = 12,
|
||||
legend = c("mySeq", "Average composition"),
|
||||
fill = c("#CCCCCC", "#BB0000"),
|
||||
cex = 0.7,
|
||||
bty = "n")
|
||||
|
||||
|
||||
# == 3.2 Plotting ratios ===================================================
|
||||
|
||||
# To better compare the values, we'll calculate ratios between
|
||||
# obsData and refData
|
||||
|
||||
barplot(obsData / refData,
|
||||
col = "#CCCCCC",
|
||||
ylab = "Sequence / Average",
|
||||
cex.names = 0.7)
|
||||
abline(h = 1, col="#BB0000")
|
||||
abline(h = c(1/3, 3), lty = 2, col="#BB000055")
|
||||
|
||||
# ... but ratios are not very good here, since the difference in height on the
|
||||
# plot now depends on the order we compare in: ratios of 1/3 and 3 (dotted
|
||||
# lines) are exactly the same fold-difference !
|
||||
|
||||
# == 3.3 Plotting log ratios ===============================================
|
||||
|
||||
# A better way to display this
|
||||
# is to plot log(ratios).
|
||||
|
||||
barplot(log(obsData / refData),
|
||||
col = "#CCCCCC",
|
||||
ylab = "log(Sequence / Average)",
|
||||
cex.names = 0.7)
|
||||
abline(h = log(1), col="#BB0000")
|
||||
abline(h = log(c(1/3, 3)), lty = 2, col="#BB000055")
|
||||
|
||||
# Note how the three-fold difference lines are now the same distance from the
|
||||
# line of equal ratio.
|
||||
|
||||
# == 3.4 Sort by frequency =================================================
|
||||
|
||||
barplot(sort(log(obsData / refData), decreasing = TRUE),
|
||||
ylim = c(-3.5,2),
|
||||
col = "#CCCCCC",
|
||||
ylab = "log(Sequence / Average)",
|
||||
cex.names = 0.7)
|
||||
abline(h = log(1), col="#BB0000")
|
||||
abline(h = log(c(1/3, 3)), lty = 2, col="#BB000055")
|
||||
|
||||
arrows(4, 1.8, 0, 1.8, length = 0.07)
|
||||
text(5.5, 1.8, "Enriched", cex = 0.7)
|
||||
arrows(20, 1.8, 24, 1.8, length = 0.07)
|
||||
text(19.5, 1.8, "Depleted", pos = 2, cex = 0.7)
|
||||
|
||||
# == 3.5 Color by amino acid type ==========================================
|
||||
|
||||
# color the bars by type.
|
||||
# define colors
|
||||
AAcol <- character()
|
||||
AAcol["A"] <- "#AABBAA"
|
||||
AAcol["C"] <- "#FFEE77"
|
||||
AAcol["D"] <- "#DD6600"
|
||||
AAcol["E"] <- "#DD3300"
|
||||
AAcol["F"] <- "#767D38"
|
||||
AAcol["G"] <- "#BBBBCC"
|
||||
AAcol["H"] <- "#A2A1FD"
|
||||
AAcol["I"] <- "#70B6C6"
|
||||
AAcol["K"] <- "#4563BB"
|
||||
AAcol["L"] <- "#80C6B6"
|
||||
AAcol["M"] <- "#AFCC34"
|
||||
AAcol["N"] <- "#BB88CC"
|
||||
AAcol["P"] <- "#7292B7"
|
||||
AAcol["Q"] <- "#8866BB"
|
||||
AAcol["R"] <- "#74A0FF"
|
||||
AAcol["S"] <- "#9999CC"
|
||||
AAcol["T"] <- "#99AADD"
|
||||
AAcol["V"] <- "#9DB500"
|
||||
AAcol["W"] <- "#76AD48"
|
||||
AAcol["Y"] <- "#44CA97"
|
||||
|
||||
barplot(rep(1, 20), names.arg = names(AAcol), col = AAcol, cex.names = 0.5)
|
||||
|
||||
lR <- sort(log(obsData / refData), decreasing = TRUE)
|
||||
barplot(lR,
|
||||
ylim = c(-3.5,2),
|
||||
col = AAcol[names(lR)],
|
||||
ylab = "log(Sequence / Average)",
|
||||
cex.names = 0.7)
|
||||
abline(h = log(1), col="#00000055")
|
||||
abline(h = log(c(1/3, 3)), lty = 2, col="#00000033")
|
||||
|
||||
arrows(4, 1.8, 0, 1.8, length = 0.07)
|
||||
text(5.5, 1.8, "Enriched", cex = 0.7)
|
||||
arrows(20, 1.8, 24, 1.8, length = 0.07)
|
||||
text(19.5, 1.8, "Depleted", pos = 2, cex = 0.7)
|
||||
|
||||
|
||||
# Task:
|
||||
# Interpret this plot. (Can you?) Which types of amino acids are enriched?
|
||||
# Depleted?
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
Loading…
Reference in New Issue
Block a user