# tocID <- "BIN-FUNC-Domain_annotation.R"
#
# Purpose: A Bioinformatics Course:
#          R code accompanying the BIN-FUNC-Domain_annotation unit.
#
# Version: 1.2
#
# Date: 2017-11 - 2020-10
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
#          1.2 Consistently: data in ./myScripts/ ;
#              begin SHARING DATA section
#          1.1 2020 Updates
#          1.0 Live version 2017
#          0.1 First code copied from 2016 material.
#
# TODO:
#          Complete SHARING DATA section ...
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================


#TOC> ==========================================================================
#TOC>
#TOC>   Section  Title                                               Line
#TOC> ---------------------------------------------------------------------
#TOC>   1        Update your database script                           42
#TOC>   1.1        Preparing an annotation file ...                    49
#TOC>   1.1.1          BEFORE "BIN-ALI-Optimal_sequence_alignment"     52
#TOC>   1.1.2          AFTER "BIN-ALI-Optimal_sequence_alignment"      97
#TOC>   1.2        Execute and Validate                               124
#TOC>   2        Plot Annotations                                     149
#TOC>
#TOC> ==========================================================================


# = 1 Update your database script =========================================


# Since you have recorded domain features at the SMART database, we can store
# the feature annotations in myDB ...


# == 1.1 Preparing an annotation file ... ==================================


# === 1.1.1 BEFORE "BIN-ALI-Optimal_sequence_alignment"
#
#   IF YOU HAVE NOT YET COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT:
#
#   You DON'T already have a file called "<MYSPE>-Annotations.json" in the
#   ./myScripts/ directory:
#
#   - Make a copy of the file "./data/refAnnotations.json" and put it in your
#     myScripts/ directory.
#
#   - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.
#     if MYSPE is called "Cryptococcus neoformans", your file should be called
#     "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is
#     "MBP1_CRYNE").
#
#   - Open the file in the RStudio editor and delete all blocks for
#     the Mbp1 protein annotations except the first one.
#
#   - From that block, delete all lines that have annotations you did not
#     find in SMART for MBP1_MYSPE.
#
#   - Make enough copies of the "Ankyrin fold" and "low complexity" region
#     lines to have a line for each feature you found.
#
#   - Then delete the comma at the end of the last line.
#
#   - Edit the annotations: change MBP1_SACCE to MBP1_<MYSPE> everywhere
#     and change the "start" and "end" features to the coordinates you
#     recorded in the SMART database.
#
#   - Save your file in the ./myScripts/ folder.
#
#   - Validate your file online at https://jsonlint.com/
#
#   - Update your "./myScripts/makeProteinDB.R" script to load your new
#     annotation when you recreate the database. Open the script in the
#     RStudio editor, and add the following command at the end:
#
#     myDB <- dbAddAnnotation(myDB,
#               jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json"))
#                                               ^^^^^^^
#                                               edit this!
#
#   - save and close the file.
#
#   Then SKIP the next section.
#
#
# === 1.1.2 AFTER "BIN-ALI-Optimal_sequence_alignment"
#
#   IF YOU HAVE ALREADY COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT:
#
#   You SHOULD have a file called "<MYSPE>-Annotations.json" in the
#   ./myScripts/ directory:
#
#   - Open the file in the RStudio editor.
#
#   - Make as many copies of the "APSES fold" line as you have found
#     features in SMART.
#
#   - Add a comma after every line except for the last one
#
#   - Edit the annotations but include only features that are in the
#     myDB$feature table. Check which features are in the database by executing
#
#       myDB$feature$name
#
#   - Update the "start" and "end" coordinates for each feature to the
#     values you found.
#
#   - Save your file.
#
#   - Validate your file online at https://jsonlint.com/
#
#
# == 1.2 Execute and Validate ==============================================
#
#   - source() your database creation script:
#
#     source("./myScripts/makeProteinDB.R")
#
#     This should run without errors or warnings. If it doesn't work and you
#     can't figure out quickly what's happening, ask for help on the
#     Discussion Board.
#
#   - Confirm
#     The following commands should retrieve all of the features that have been
#     annotated for MBP1_MYSPE

# Row of MBP1_MYSPE in the protein table, and its database ID
sel <- myDB$protein$name == paste0("MBP1_", biCode(MYSPE))
(proID <- myDB$protein$ID[sel])

# Rows of the annotation table that belong to this protein. We keep the
# row selection and use it for both columns: the ID values themselves are keys,
# not row numbers, and must not be used as positional subscripts.
sel <- myDB$annotation$proteinID %in% proID
(fanIDs <- myDB$annotation$ID[sel])
(ftrIDs <- unique(myDB$annotation$featureID[sel]))

# Look the feature IDs up in the key column of the feature table.
myDB$feature$name[match(ftrIDs, myDB$feature$ID)]
                           # This should list ALL of your annotated features
                           # (once). If not, consider what could have gone wrong
                           # and ask on the list if you have difficulties fixing
                           # it.


# = 2 Plot Annotations ====================================================

# In this section we will plot domain annotations as colored rectangles on a
# sequence, as an example of using the R plotting system for generic, data
# driven images.

# We need a small utility function that draws the annotation boxes on a
# representation of sequence. It should accept the start and end coordinates,
# the y value where it should be plotted and the color of the box, and plot a
# rectangle using R's rect() function.

drawBox <- function(xStart, xEnd, y, myCol, DELTA = 0.2) {
  # Draw a box from xStart to xEnd at y, filled with colour myCol
  # The height of the box is y +- DELTA
  # Parameters:
  #   xStart, xEnd:  left and right x-coordinates of the box
  #   y:             y-coordinate of the centre line of the box
  #   myCol:         fill colour
  #   DELTA:         half-height of the box (default 0.2)
  # Value: none; the box is added to the currently active plot.
  rect(xStart, (y - DELTA), xEnd, (y + DELTA),
       border = "black", col = myCol)
}

# test this:
plot(c(-1.5, 1.5), c(0, 0), type = "l")
drawBox(-1, 1, 0.0, "peachpuff")

# Next, we define a function to plot annotations for one protein: the name of
# the protein, a horizontal grey line for its length, and all of its features.
plotProtein <- function(DB, name, y) {
  # Plot the annotations of one protein onto the currently active plot: its
  # name, a horizontal grey line for its length, and one box per feature.
  # Parameters:
  #   DB:    protein database
  #   name:  the name of the protein in the database.
  #   y:     height where to draw the plot
  # Value: none (invisible NULL); called for its drawing side effect.
  #
  # Define colors: we create a vector of color values, one for
  # each feature, and we give it names of the feature ID. Then we
  # can easily get the color value from the feature name.

  # A: make a vector of color values. The syntax may appear unusual -
  #    colorRampPalette() returns a function, and we simply append
  #    the parameter (number-of-features) without assigning the function
  #    to its own variable name.
  ftrCol <- colorRampPalette(c("#f2003c", "#F0A200", "#f0ea00",
                               "#62C923", "#0A9A9B", "#1958C3",
                               "#8000D3", "#D0007F"),
                             space = "Lab",
                             interpolate = "linear")(nrow(DB$feature))
  # B: Features may overlap, so we make the colors transparent by setting
  #    their "alpha channel" to 1/3  (hex: 55)
  ftrCol <- paste0(ftrCol, "55")
  # C: we assign names
  names(ftrCol) <- DB$feature$ID
  # E.g. color for the third feature:
  #   ftrCol[ as.character(DB$feature$ID[3]) ]

  # find the row-index of the protein ID in the protein table of DB
  iProtein <- which(DB$protein$name == name)
  if (length(iProtein) != 1) {
    # Without this check a missing or duplicated name only surfaces later as
    # a length-mismatch error from lines().
    stop(sprintf("Expected exactly one protein named \"%s\" in DB, found %d.",
                 name, length(iProtein)),
         call. = FALSE)
  }

  # write the name of the protein
  text(-30, y, adj = 1, labels = name, cex = 0.75)

  # draw a line from 0 to nchar(sequence-of-the-protein)
  lines(c(0, nchar(DB$protein$sequence[iProtein])), c(y, y),
        lwd = 3, col = "#999999")

  # get the rows of feature annotations for the protein
  iFtr <- which(DB$annotation$proteinID == DB$protein$ID[iProtein])

  # draw a colored box for each feature
  for (i in iFtr) {
    # as.character() forces a lookup by NAME: a numeric or factor ID would
    # otherwise be interpreted as a position in ftrCol.
    drawBox(DB$annotation$start[i],
            DB$annotation$end[i],
            y,
            ftrCol[as.character(DB$annotation$featureID[i])])
  }

  invisible(NULL)
}

# Plot each annotated protein:
# Get the rows of all unique annotated Mbp1 proteins in myDB
iRows <- grep("^MBP1_", myDB$protein$name)

# define the size of the plot-frame to accommodate all proteins
yMax <- length(iRows) * 1.1
xMax <- max(nchar(myDB$protein$sequence[iRows])) * 1.1  # longest sequence

# plot an empty frame
oPar <- par(mar = c(4.2, 0.1, 3, 0.1)) # save the current plot parameters and
                                       # decrease margins
# Set up an empty coordinate system: no points, no axes, no box. Extra room on
# the left (-200) is for the protein names, on the right for the legend.
plot(1, 1,
     xlim = c(-200, xMax + 100),
     ylim = c(0, yMax),
     type = "n",
     axes = FALSE,
     bty = "n",
     main = "Mbp1 orthologue domain annotations",
     xlab = "sequence position",
     ylab = "")
# cex.axis has to be set here: plot() was called with axes = FALSE, so an
# axis parameter passed to plot() would never be used.
axis(1, at = seq(0, xMax, by = 100), cex.axis = 0.8)

# Legend colors: the same palette and transparency that plotProtein() uses,
# one color per row of the feature table.
myCol <- colorRampPalette(c("#f2003c", "#F0A200", "#f0ea00",
                            "#62C923", "#0A9A9B", "#1958C3",
                            "#8000D3", "#D0007F"),
                          space = "Lab",
                          interpolate = "linear")(nrow(myDB$feature))
myCol <- paste0(myCol, "55")
# Top-left corner of the legend: y = 6, but never above the plot frame when
# only a few proteins are plotted.
legend(xMax - 150, min(6, yMax),
       legend = myDB$feature$name,
       cex = 0.7,
       fill = myCol)

# Finally, iterate over all proteins and call plotProtein()
for (i in seq_along(iRows)) {
  plotProtein(myDB, myDB$protein$name[iRows[i]], i)
}
par(oPar)  # reset the plot parameters


# The plot shows what is variable and what is constant about the annotations in
# a group of related proteins. Your MBP1_MYSPE annotations should appear at the
# top.

# Task:
#   Put a copy of the plot into your journal and interpret it with respect
#   to MBP1_MYSPE, i.e. note what you learn about MBP1_MYSPE from the plot.

# Task:
#   It would be better to align the motif borders, at least approximately (not
#   all proteins have all motifs). How would you go about doing that?


# = 3 SHARING DATA ======

# It's particularly interesting to compare such annotations across many
# homologous proteins. I have created a file on the student Wiki that you can
# edit, and then download the data from the entire class directly to your
# RStudio project.
#
# Task:
# =====
#   Navigate to
#


# [END]