bch441-work-abc-units/BIN-FUNC-Domain_annotation.R
2017-09-12 16:09:20 -04:00

148 lines
5.2 KiB
R

# BIN-FUNC-Domain_annotation.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-FUNC-Domain_annotation unit.
#
# Version: 0.1
#
# Date: 2017 08 28
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 0.1 First code copied from 2016 material.
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
# ==============================================================================
# = 1 SMART Domain annotations
# Plot domain annotations as colored rectangles on a sequence.
# Step one: enter your domain annotations as features into the database.
#
# == Update myDB
# If the reference database has changed, we need to merge it in with myDB.
load("myDB.03.RData") # load the previous version of myDB
# the new version of refDB was loaded when you
# pulled it from GitHub, and then typed init()
myDB <- dbMerge(myDB) # merge the two databases and update myDB with the result
save(myDB, file = "myDB.04.RData") # save the new version
# == Update myDB
# Every annotated feature requires its own entry in the database. You have added
# the feature for the "APSES fold" before, so you can copy and edit that code
# from your myCode.R script. Here is again the table of feature IDs:
myDB$feature[ , c("ID", "name", "description")]
# Add every SMART annotated feaure for MBP1_YFO to the database. If you make
# mistakes, just reload the latest version (probably "myDB.04.RData"), then run
# your corrected annotation script again. Execute ...
myDB$proteinAnnotation
# ... to confirm.
#
# Once you are sure your annotations are correct, save the database again.
save(myDB, file = "myDB.05.RData") # save the new version
#
# Now let's plot the annotations.
#
# We need a small utility function that draws the annotation boxes on a
# representation of sequence. It will accept the left and right boundaries, the
# height and the color of the box and plot it using R's rect() function.
drawBox <- function(xLeft, xRight, y, colour) {
# Draw a box from xLeft to xRight at y, filled with colour
rect(xLeft, (y - 0.1), xRight, (y + 0.1),
border = "black", col = colour)
}
# test this:
plot(c(-1.5, 1.5), c(0, 0), type = "l")
drawBox(-1, 1, 0.0, "peachpuff")
# Next, we define a function to plot annotations for one protein: the name of
# the protein, a horizontal grey line for its length, and all of its features.
plotProtein <- function(DB, ID, y) {
# DB: protein database, probably you want myDB
# ID: the ID of the protein to plot.
# y: where to draw the plot
#
# Define colors: we create a vector of color values, one for
# each feature, and we give it names of the feature ID. Then we
# can easily get the color value from the feature name.
# A: make a vector of color values. The syntax may appear unusual -
# colorRampPalette() returns a function, and we simply append
# the parameter (number-of-features) without assigning the function
# to its own variable name.
ftrCol <- colorRampPalette(c("#f2003c", "#F0A200", "#f0ea00",
"#62C923", "#0A9A9B", "#1958C3",
"#8000D3", "#D0007F"),
space="Lab",
interpolate="linear")(nrow(DB$feature))
# B: Features may overlap, so we make the colors transparent by setting
# their "alpha channel" to 1/2 (hex: 7F)
ftrCol <- paste(ftrCol, "7F", sep = "")
# C: we asssign names
names(ftrCol) <- DB$feature$ID
# E.g. color for the third feature: ftrCol[ DB$feature$ID[3] ]
# find the row-index of the protein ID in the protein table of DB
iProtein <- which(DB$protein$ID == ID)
# write the name of the protein
text(-30, y, adj=1, labels=DB$protein$name[iProtein], cex=0.75 )
#draw a line from 0 to nchar(sequence-of-the-protein)
lines(c(0, nchar(DB$protein$sequence[iProtein])), c(y, y),
lwd=3, col="#999999")
# get the rows of feature annotations for the protein
iFtr <- which(DB$proteinAnnotation$protein.ID == ID)
# draw a colored box for each feature
for (i in iFtr) {
drawBox(DB$proteinAnnotation$start[i],
DB$proteinAnnotation$end[i],
y,
ftrCol[ DB$proteinAnnotation$feature.ID[i] ])
}
}
# Plot each annotated protein:
# Get the rows of all unique annotated protein IDs in the protein table
iRows <- which(myDB$protein$ID %in% unique(myDB$proteinAnnotation$protein.ID))
# define the size of the plot-frame to accomodate all proteins
yMax <- length(iRows) * 1.1
xMax <- max(nchar(myDB$protein$sequence[iRows])) * 1.1 # longest sequence
# plot an empty frame
plot(1,1, xlim=c(-200, xMax), ylim=c(0, yMax),
type="n", axes=FALSE, bty="n", xlab="sequence position", ylab="")
axis(1, at = seq(0, xMax, by = 100))
# Finally, iterate over all proteins and call plotProtein()
for (i in 1:length(iRows)) {
plotProtein(myDB, myDB$protein$ID[iRows[i]], i)
}
# The plot shows clearly what is variable and what is constant about the
# annotations in a group of related proteins. Print the plot and bring it to
# class for the next quiz.
#
# = 1 Tasks
# [END]