# BIN-FUNC-Domain_annotation.R
#
# Purpose:  A Bioinformatics Course:
#              R code accompanying the BIN-FUNC-Domain_annotation unit.
#
# Version:  0.1
#
# Date:     2017 08 28
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
#           0.1    First code copied from 2016 material.

#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================

# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...

# ==============================================================================

# = 1 SMART Domain annotations

# Plot domain annotations as colored rectangles on a sequence.
# Step one: enter your domain annotations as features into the database.
#
# == Update myDB
# If the reference database has changed, we need to merge it in with myDB.
load("myDB.03.RData")  # load the previous version of myDB
                       # the new version of refDB was loaded when you
                       # pulled it from GitHub, and then typed init()
myDB <- dbMerge(myDB)  # merge the two databases and update myDB with the result
save(myDB, file = "myDB.04.RData")  # save the new version

# == Update myDB

# Every annotated feature requires its own entry in the database. You have added
# the feature for the "APSES fold" before, so you can copy and edit that code
# from your myCode.R script. Here is again the table of feature IDs:
myDB$feature[ , c("ID", "name", "description")]

# Add every SMART annotated feature for MBP1_MYSPE to the database. If you make
# mistakes, just reload the latest version (probably "myDB.04.RData"), then run
# your corrected annotation script again. Execute ...
myDB$proteinAnnotation
# ... to confirm.
#
# Once you are sure your annotations are correct, save the database again.
save(myDB, file = "myDB.05.RData")  # save the new version
#
# Now let's plot the annotations.
#
# We need a small utility function that draws the annotation boxes on a
# representation of sequence. It will accept the left and right boundaries, the
# height and the color of the box and plot it using R's rect() function.

drawBox <- function(xLeft, xRight, y, colour, halfHeight = 0.1) {
  # Purpose:
  #   Draw a filled rectangle with a black border on the current plot,
  #   vertically centred on y.
  # Parameters:
  #   xLeft:      x-coordinate of the left edge of the box
  #   xRight:     x-coordinate of the right edge of the box
  #   y:          y-coordinate of the horizontal centre line of the box
  #   colour:     fill colour, passed to rect() as col
  #   halfHeight: distance from y to the top and to the bottom edge of the
  #               box; the default of 0.1 gives a box spanning y - 0.1 to
  #               y + 0.1
  # Value:
  #   The value of rect(); the function is called for its side effect of
  #   drawing on the active graphics device, so a plot must already exist.
  rect(xLeft, y - halfHeight, xRight, y + halfHeight,
       border = "black", col = colour)
}

# test this: draw a horizontal line from x = -1.5 to x = 1.5 at y = 0, then
# overlay a box from x = -1 to x = 1 that is centred on that line
plot(c(-1.5, 1.5), c(0, 0), type = "l")
drawBox(-1, 1, 0.0, "peachpuff")

# Next, we define a function to plot annotations for one protein: the name of
# the protein, a horizontal grey line for its length, and all of its features.

plotProtein <- function(DB, ID, y) {
  # Purpose:
  #   Add one protein to the current plot: its name, a horizontal grey line
  #   as long as its sequence, and one coloured box per annotated feature.
  # Parameters:
  #   DB: protein database, probably you want myDB. The function reads
  #       DB$feature$ID, DB$protein (ID, name, sequence) and
  #       DB$proteinAnnotation (protein.ID, feature.ID, start, end).
  #   ID: the ID of the protein to plot.
  #   y:  where to draw the plot (y-coordinate of the protein's line)
  # Value:
  #   NULL, invisibly. Called for its side effect of drawing on the active
  #   graphics device; a plot frame must already exist.
  # Errors:
  #   Stops if ID does not match exactly one row of DB$protein.

  # find the row-index of the protein ID in the protein table of DB. Check
  # this before drawing anything: a missing or duplicated ID would otherwise
  # surface as an obscure error from text() or lines().
  iProtein <- which(DB$protein$ID == ID)
  if (length(iProtein) != 1) {
    stop("Expected exactly one protein with ID \"", ID,
         "\" in DB$protein, found ", length(iProtein), ".",
         call. = FALSE)
  }

  # Define colors: we create a vector of color values, one for
  # each feature, and we give it names of the feature ID. Then we
  # can easily get the color value from the feature name.
  # A: make a vector of color values. The syntax may appear unusual -
  #    colorRampPalette() returns a function, and we simply append
  #    the parameter (number-of-features) without assigning the function
  #    to its own variable name.
  ftrCol <- colorRampPalette(c("#f2003c", "#F0A200", "#f0ea00",
                               "#62C923", "#0A9A9B", "#1958C3",
                               "#8000D3", "#D0007F"),
                             space = "Lab",
                             interpolate = "linear")(nrow(DB$feature))
  # B: Features may overlap, so we make the colors transparent by setting
  #    their "alpha channel" to 1/2 (hex: 7F)
  ftrCol <- paste0(ftrCol, "7F")
  # C: we assign names
  names(ftrCol) <- DB$feature$ID
  # E.g. color for the third feature: ftrCol[ DB$feature$ID[3] ]

  # write the name of the protein, right-aligned to the left of the line
  text(-30, y, adj = 1, labels = DB$protein$name[iProtein], cex = 0.75)

  # draw a line from 0 to nchar(sequence-of-the-protein)
  lines(c(0, nchar(DB$protein$sequence[iProtein])), c(y, y),
        lwd = 3, col = "#999999")

  # get the rows of feature annotations for the protein
  iFtr <- which(DB$proteinAnnotation$protein.ID == ID)

  # draw a colored box for each feature
  for (i in iFtr) {
    # NOTE(review): as.character() forces a lookup by name; a factor-typed
    # feature.ID column would otherwise index ftrCol by position - confirm
    # the column type in DB.
    ftrID <- as.character(DB$proteinAnnotation$feature.ID[i])
    drawBox(DB$proteinAnnotation$start[i],
            DB$proteinAnnotation$end[i],
            y,
            ftrCol[ftrID])
  }

  invisible(NULL)
}

# Plot each annotated protein:
# Get the rows of all unique annotated protein IDs in the protein table
iRows <- which(myDB$protein$ID %in% unique(myDB$proteinAnnotation$protein.ID))
# define the size of the plot-frame to accommodate all proteins
yMax <- length(iRows) * 1.1
xMax <- max(nchar(myDB$protein$sequence[iRows])) * 1.1  # longest sequence

# plot an empty frame; xlim starts at -200 to leave room for the protein
# names, which plotProtein() writes right-aligned at x = -30
plot(1, 1, xlim = c(-200, xMax), ylim = c(0, yMax),
     type = "n", axes = FALSE, bty = "n", xlab = "sequence position", ylab = "")
axis(1, at = seq(0, xMax, by = 100))

# Finally, iterate over all proteins and call plotProtein().
# seq_along() rather than 1:length(): with no annotated proteins the latter
# would iterate over c(1, 0) instead of not at all.
for (i in seq_along(iRows)) {
  plotProtein(myDB, myDB$protein$ID[iRows[i]], i)
}

# The plot shows clearly what is variable and what is constant about the
# annotations in a group of related proteins. Print the plot and bring it to
# class for the next quiz.
#

# = 2 Tasks




# [END]