2020-09-18 11:56:30 +00:00
|
|
|
# tocID <- "BIN-FUNC-Domain_annotation.R"
|
|
|
|
#
|
2017-09-12 20:09:20 +00:00
|
|
|
# Purpose: A Bioinformatics Course:
|
|
|
|
# R code accompanying the BIN-FUNC-Domain_annotation unit.
|
|
|
|
#
|
2021-09-16 05:29:19 +00:00
|
|
|
# ==============================================================================
|
2020-10-13 12:37:31 +00:00
|
|
|
# Version: 1.4
|
2017-09-12 20:09:20 +00:00
|
|
|
#
|
2020-10-02 08:50:36 +00:00
|
|
|
# Date: 2017-11 - 2020-10
|
2017-09-12 20:09:20 +00:00
|
|
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
|
|
|
#
|
|
|
|
# Versions:
|
2020-10-13 12:37:31 +00:00
|
|
|
# 1.4 Add code for shared data import from the Wiki
|
2020-10-13 06:08:35 +00:00
|
|
|
# 1.3 Add code for database export to JSON and instructions
|
|
|
|
# for uploading annotations to the Public Student Wiki page
|
2020-10-02 08:50:36 +00:00
|
|
|
# 1.2 Consistently: data in ./myScripts/ ;
|
|
|
|
# begin SHARING DATA section
|
2020-09-25 05:24:51 +00:00
|
|
|
# 1.1 2020 Updates
|
2017-11-14 07:57:13 +00:00
|
|
|
# 1.0 Live version 2017
|
2017-09-12 20:09:20 +00:00
|
|
|
# 0.1 First code copied from 2016 material.
|
|
|
|
#
|
|
|
|
# TODO:
|
2020-10-13 12:37:31 +00:00
|
|
|
# Put the domain plot into a function
|
2017-09-12 20:09:20 +00:00
|
|
|
#
|
|
|
|
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
2017-11-14 07:57:13 +00:00
|
|
|
#
|
2017-09-12 20:09:20 +00:00
|
|
|
# If there are portions you don't understand, use R's help system, Google for an
|
|
|
|
# answer, or ask your instructor. Don't continue if you don't understand what's
|
|
|
|
# going on. That's not how it works ...
|
2017-11-14 07:57:13 +00:00
|
|
|
#
|
2017-09-12 20:09:20 +00:00
|
|
|
# ==============================================================================
|
|
|
|
|
|
|
|
|
2017-11-14 07:57:13 +00:00
|
|
|
#TOC> ==========================================================================
|
2021-10-26 15:04:31 +00:00
|
|
|
#TOC>
|
2020-09-25 05:24:51 +00:00
|
|
|
#TOC> Section Title Line
|
|
|
|
#TOC> ---------------------------------------------------------------------
|
2021-10-26 15:04:31 +00:00
|
|
|
#TOC> 1 Update your database script 51
|
|
|
|
#TOC> 1.1 Preparing an annotation file ... 58
|
|
|
|
#TOC> 1.1.1 BEFORE "BIN-ALI-Optimal_sequence_alignment" 61
|
|
|
|
#TOC> 1.1.2 AFTER "BIN-ALI-Optimal_sequence_alignment" 109
|
|
|
|
#TOC> 1.2 Execute and Validate 136
|
|
|
|
#TOC> 2 Plot Annotations 161
|
|
|
|
#TOC> 3 SHARING DATA 287
|
|
|
|
#TOC> 3.1 Post MBP1_MYSPE as JSON data 303
|
|
|
|
#TOC> 3.2 Import shared MBP1_MYSPE from the Wiki 326
|
|
|
|
#TOC>
|
2017-11-14 07:57:13 +00:00
|
|
|
#TOC> ==========================================================================
|
|
|
|
|
2017-09-12 20:09:20 +00:00
|
|
|
|
2017-11-14 07:57:13 +00:00
|
|
|
# = 1 Update your database script =========================================
|
2017-09-12 20:09:20 +00:00
|
|
|
|
|
|
|
|
2017-11-14 07:57:13 +00:00
|
|
|
# Since you have recorded domain features at the SMART database, we can store
|
2020-09-25 05:24:51 +00:00
|
|
|
# the feature annotations in myDB ...
|
|
|
|
|
2017-11-14 07:57:13 +00:00
|
|
|
|
|
|
|
# == 1.1 Preparing an annotation file ... ==================================
|
2020-09-25 05:24:51 +00:00
|
|
|
|
|
|
|
|
|
|
|
# === 1.1.1 BEFORE "BIN-ALI-Optimal_sequence_alignment"
|
2017-11-14 07:57:13 +00:00
|
|
|
#
|
2020-09-25 05:24:51 +00:00
|
|
|
# IF YOU HAVE NOT YET COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT:
|
2017-11-14 07:57:13 +00:00
|
|
|
#
|
|
|
|
# You DON'T already have a file called "<MYSPE>-Annotations.json" in the
|
2020-10-02 08:50:36 +00:00
|
|
|
# ./myScripts/ directory:
|
2017-11-14 07:57:13 +00:00
|
|
|
#
|
|
|
|
# - Make a copy of the file "./data/refAnnotations.json" and put it in your
|
2020-10-02 08:50:36 +00:00
|
|
|
# myScripts/ directory.
|
2017-11-14 07:57:13 +00:00
|
|
|
#
|
|
|
|
# - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.
|
|
|
|
#      if MYSPE is called "Cryptococcus neoformans", your file should be called
|
|
|
|
# "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is
|
|
|
|
# "MBP1_CRYNE").
|
|
|
|
#
|
|
|
|
# - Open the file in the RStudio editor and delete all blocks for
|
|
|
|
# the Mbp1 protein annotations except the first one.
|
|
|
|
#
|
|
|
|
# - From that block, delete all lines that have annotations you did not
|
|
|
|
# find in SMART for MBP1_MYSPE.
|
|
|
|
#
|
|
|
|
# - Make enough copies of the "Ankyrin fold" and "low complexity" region
|
|
|
|
# lines to have a line for each feature you found.
|
|
|
|
#
|
|
|
|
# - Then delete the comma at the end of the last line.
|
|
|
|
#
|
|
|
|
# - Edit the annotations: change MBP1_SACCE to MBP1_<MYSPE> everywhere
|
|
|
|
# and change the "start" and "end" features to the coordinates you
|
|
|
|
# recorded in the SMART database.
|
|
|
|
#
|
2020-10-02 08:50:36 +00:00
|
|
|
# - Save your file in the ./myScripts/ folder.
|
2017-11-14 07:57:13 +00:00
|
|
|
#
|
|
|
|
# - Validate your file online at https://jsonlint.com/
|
|
|
|
#
|
2020-10-02 08:50:36 +00:00
|
|
|
# - Update your "./myScripts/makeProteinDB.R" script to load your new
|
2017-11-14 07:57:13 +00:00
|
|
|
# annotation when you recreate the database. Open the script in the
|
|
|
|
# RStudio editor, and add the following command at the end:
|
|
|
|
#
|
2020-10-02 08:50:36 +00:00
|
|
|
# myDB <- dbAddAnnotation(myDB,
|
|
|
|
# jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json"))
|
|
|
|
# ^^^^^^^
|
|
|
|
# edit this!
|
2017-11-14 07:57:13 +00:00
|
|
|
#
|
|
|
|
# - save and close the file.
|
|
|
|
#
|
|
|
|
# Then SKIP the next section.
|
|
|
|
#
|
2017-09-12 20:09:20 +00:00
|
|
|
#
|
2021-10-26 15:04:31 +00:00
|
|
|
# === 1.1.2 AFTER "BIN-ALI-Optimal_sequence_alignment"
|
2017-09-12 20:09:20 +00:00
|
|
|
#
|
2020-09-25 05:24:51 +00:00
|
|
|
# IF YOU HAVE ALREADY COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT:
|
2017-09-12 20:09:20 +00:00
|
|
|
#
|
2020-09-25 05:24:51 +00:00
|
|
|
# You SHOULD have a file called "<MYSPE>-Annotations.json" in the
|
2020-10-02 08:50:36 +00:00
|
|
|
# ./myScripts/ directory:
|
2017-11-14 07:57:13 +00:00
|
|
|
#
|
|
|
|
# - Open the file in the RStudio editor.
|
|
|
|
#
|
|
|
|
# - Make as many copies of the "APSES fold" line as you have found
|
|
|
|
# features in SMART.
|
|
|
|
#
|
|
|
|
# - Add a comma after every line except for the last one
|
|
|
|
#
|
|
|
|
# - Edit the annotations but include only features that are in the
|
2020-10-02 08:50:36 +00:00
|
|
|
# myDB$feature table. Check which features are in the database by executing
|
2017-11-14 07:57:13 +00:00
|
|
|
#
|
|
|
|
# myDB$feature$name
|
|
|
|
#
|
|
|
|
# - Update the "start" and "end" coordinates for each feature to the
|
|
|
|
# values you found.
|
|
|
|
#
|
|
|
|
# - Save your file.
|
|
|
|
#
|
|
|
|
# - Validate your file online at https://jsonlint.com/
|
|
|
|
#
|
|
|
|
#
|
|
|
|
# == 1.2 Execute and Validate ==============================================
|
|
|
|
#
|
|
|
|
# - source() your database creation script:
|
|
|
|
#
|
2020-10-02 08:50:36 +00:00
|
|
|
# source("./myScripts/makeProteinDB.R")
|
2017-11-14 07:57:13 +00:00
|
|
|
#
|
|
|
|
# This should run without errors or warnings. If it doesn't work and you
|
2020-09-25 05:24:51 +00:00
|
|
|
# can't figure out quickly what's happening, ask for help on the
|
|
|
|
# Discussion Board.
|
2017-11-14 07:57:13 +00:00
|
|
|
#
|
|
|
|
# - Confirm
|
|
|
|
# The following commands should retrieve all of the features that have been
|
|
|
|
# annotated for MBP1_MYSPE
|
|
|
|
|
|
|
|
# Find the protein by name, then collect the IDs of its annotations and of the
# features these annotations refer to. Every step selects rows by matching ID
# *values* - not by using an ID as a row number - so the result is the same
# whether the IDs in the tables are numbers or strings.
sel <- myDB$protein$name == paste0("MBP1_", biCode(MYSPE))

(proID <- myDB$protein$ID[sel])
(fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID %in% proID])
(ftrIDs <- unique(myDB$annotation$featureID[myDB$annotation$ID %in% fanIDs]))
myDB$feature$name[myDB$feature$ID %in% ftrIDs]
                           # This should list ALL of your annotated features
                           # (once). If not, consider what could have gone wrong
                           # and ask on the list if you have difficulties fixing
                           # it.
|
|
|
|
|
|
|
|
|
|
|
|
# = 2 Plot Annotations ====================================================
|
|
|
|
|
|
|
|
# In this section we will plot domain annotations as colored rectangles on a
|
2020-09-25 05:24:51 +00:00
|
|
|
# sequence, as an example of using the R plotting system for generic, data
|
2017-11-14 07:57:13 +00:00
|
|
|
# driven images.
|
|
|
|
|
2017-09-12 20:09:20 +00:00
|
|
|
# We need a small utility function that draws the annotation boxes on a
|
2017-11-14 07:57:13 +00:00
|
|
|
# representation of sequence. It should accept the start and end coordinates,
|
|
|
|
# the y value where it should be plotted and the color of the box, and plot a
|
|
|
|
# rectangle using R's rect() function.
|
2017-09-12 20:09:20 +00:00
|
|
|
|
2020-09-25 05:24:51 +00:00
|
|
|
drawBox <- function(xStart, xEnd, y, myCol, DELTA = 0.2) {
  # Purpose: add one filled rectangle to the currently active plot.
  # Parameters:
  #   xStart, xEnd: x-coordinates of the left and right edge of the box
  #   y:            y-coordinate of the horizontal centre line of the box
  #   myCol:        fill colour (passed on to rect() as col)
  #   DELTA:        half the height of the box: it extends from y - DELTA
  #                 to y + DELTA
  # Value: the return value of rect(); the function is called for its
  #        side effect of drawing.
  yBottom <- y - DELTA
  yTop    <- y + DELTA
  rect(xleft = xStart, ybottom = yBottom, xright = xEnd, ytop = yTop,
       col = myCol, border = "black")
}
|
|
|
|
|
|
|
|
# test this:
|
|
|
|
# a horizontal line from -1.5 to 1.5 at height 0 gives us something to draw on
plot(c(-1.5, 1.5), c(0, 0), type = "l")
# a box from -1 to 1, centred on that line
drawBox(xStart = -1, xEnd = 1, y = 0.0, myCol = "peachpuff")
|
|
|
|
|
|
|
|
# Next, we define a function to plot annotations for one protein: the name of
|
|
|
|
# the protein, a horizontal grey line for its length, and all of its features.
|
|
|
|
|
2017-11-14 07:57:13 +00:00
|
|
|
plotProtein <- function(DB, name, y) {
  # Purpose: add one protein to the currently active plot: its name, a grey
  #          horizontal line for its sequence length, and one coloured box
  #          for each of its feature annotations.
  # Parameters:
  #   DB:   protein database: a list that contains the tables "protein"
  #         (columns ID, name, sequence), "feature" (column ID) and
  #         "annotation" (columns proteinID, featureID, start, end)
  #   name: the name of the protein in the database. It must match exactly
  #         one row of DB$protein.
  #   y:    height where to draw the plot
  # Value: NULL, invisibly. The function is called for its side effect of
  #        drawing.

  # Find the row-index of the protein in the protein table of DB. We check
  # this first, so a wrong name gives a clear message, and not an obscure
  # error from one of the plotting functions further down.
  iProtein <- which(DB$protein$name == name)
  if (length(iProtein) != 1) {
    stop(sprintf("Expected exactly one protein named \"%s\" in DB, found %d.",
                 name, length(iProtein)),
         call. = FALSE)
  }

  # Define colors: we create a vector of color values, one for
  # each feature, and we give it names of the feature ID. Then we
  # can easily get the color value from the feature ID.
  # A: make a vector of color values. The syntax may appear unusual -
  #    colorRampPalette() returns a function, and we simply append
  #    the parameter (number-of-features) without assigning the function
  #    to its own variable name.
  ftrCol <- colorRampPalette(c("#f2003c", "#F0A200", "#f0ea00",
                               "#62C923", "#0A9A9B", "#1958C3",
                               "#8000D3", "#D0007F"),
                             space="Lab",
                             interpolate="linear")(nrow(DB$feature))
  # B: Features may overlap, so we make the colors transparent by setting
  #    their "alpha channel" to 1/3  (hex: 55)
  ftrCol <- paste0(ftrCol, "55")
  # C: we assign names
  names(ftrCol) <- DB$feature$ID
  # E.g. color for the third feature: ftrCol[ as.character(DB$feature$ID[3]) ]

  # write the name of the protein
  text(-30, y, adj=1, labels=name, cex=0.75 )

  # draw a line from 0 to nchar(sequence-of-the-protein)
  lines(c(0, nchar(DB$protein$sequence[iProtein])), c(y, y),
        lwd=3, col="#999999")

  # get the rows of feature annotations for the protein
  iFtr <- which(DB$annotation$proteinID == DB$protein$ID[iProtein])

  # draw a colored box for each feature
  for (i in iFtr) {
    # as.character() makes sure that the color is looked up by its NAME (the
    # feature ID). A numeric ID would otherwise be used as a position in
    # ftrCol.
    drawBox(DB$annotation$start[i],
            DB$annotation$end[i],
            y,
            ftrCol[ as.character(DB$annotation$featureID[i]) ])
  }

  invisible(NULL)
}
|
|
|
|
|
|
|
|
# Plot each annotated protein:
|
2017-11-14 07:57:13 +00:00
|
|
|
# Get the rows of all unique annotated Mbp1 proteins in myDB
|
|
|
|
|
|
|
|
iRows <- grep("^MBP1_", myDB$protein$name)
if (length(iRows) == 0) {
  stop("No proteins with names starting \"MBP1_\" were found in myDB.")
}

# define the size of the plot-frame to accommodate all proteins
yMax <- length(iRows) * 1.1
xMax <- max(nchar(myDB$protein$sequence[iRows])) * 1.1  # longest sequence

# plot an empty frame
oPar <- par(mar = c(4.2, 0.1, 3, 0.1))  # save the current plot parameters and
                                        # decrease margins
plot(1, 1,
     xlim = c(-200, xMax + 100),
     ylim = c(0, yMax),
     type = "n",
     axes = FALSE,
     bty = "n",
     main = "Mbp1 orthologue domain annotations",
     xlab = "sequence position",
     cex.axis = 0.8,
     ylab="")
axis(1, at = seq(0, xMax, by = 100))

# The legend colors are computed in the same way as the box colors in
# plotProtein(): one color per row of the feature table, with transparency.
myCol <- colorRampPalette(c("#f2003c", "#F0A200",
                            "#f0ea00", "#62C923",
                            "#0A9A9B", "#1958C3",
                            "#8000D3", "#D0007F"),
                          space="Lab",
                          interpolate="linear")(nrow(myDB$feature))
myCol <- paste0(myCol, "55")
# The top-left corner of the legend goes to y = 7, but not higher than the
# top of the frame if there are only a few proteins to plot.
legend(xMax - 150, min(7, yMax),
       legend = myDB$feature$name,
       cex = 0.7,
       fill = myCol,
       bty = "n")

# Finally, iterate over all proteins and call plotProtein()
for (i in seq_along(iRows)) {
  plotProtein(myDB, myDB$protein$name[iRows[i]], i)
}
par(oPar)  # reset the plot parameters
|
|
|
|
|
2017-09-12 20:09:20 +00:00
|
|
|
|
2017-11-14 07:57:13 +00:00
|
|
|
# The plot shows what is variable and what is constant about the annotations in
|
|
|
|
# a group of related proteins. Your MBP1_MYSPE annotations should appear at the
|
|
|
|
# top.
|
2017-09-12 20:09:20 +00:00
|
|
|
|
2017-11-14 07:57:13 +00:00
|
|
|
# Task:
|
|
|
|
# Put a copy of the plot into your journal and interpret it with respect
|
|
|
|
#   to MBP1_MYSPE, i.e. note what you learn about MBP1_MYSPE from the plot.
|
2017-09-12 20:09:20 +00:00
|
|
|
|
2020-09-25 05:24:51 +00:00
|
|
|
# Task:
|
|
|
|
# It would be better to align the motif borders, at least approximately (not
|
|
|
|
# all proteins have all motifs). How would you go about doing that?
|
2017-09-12 20:09:20 +00:00
|
|
|
|
2020-10-13 06:08:35 +00:00
|
|
|
# = 3 SHARING DATA ========================================================
|
2020-10-02 08:50:36 +00:00
|
|
|
|
|
|
|
# It's particularly interesting to compare such annotations across many
|
2020-10-13 06:08:35 +00:00
|
|
|
# homologous proteins. I have created a Student Wiki page (URL below) that you can
|
2020-10-02 08:50:36 +00:00
|
|
|
# edit, and then download the data from the entire class directly to your
|
|
|
|
# RStudio project.
|
|
|
|
#
|
2020-10-13 06:08:35 +00:00
|
|
|
|
|
|
|
# I have provided a function that extracts all information that refers to a
|
|
|
|
# single protein from the database, and prints it out as well-formatted JSON,
|
|
|
|
# suitable to be pasted into our shareable Wiki-page. There is a fair amount of
|
|
|
|
# bookkeeping involved, but the code is not otherwise very enlightening so I
|
|
|
|
# will spare you the details - it's in "./scripts/ABC-dbUtilities.R" if you
|
|
|
|
# would want to have a look.
|
|
|
|
|
2020-10-13 12:37:31 +00:00
|
|
|
|
2020-10-13 06:08:35 +00:00
|
|
|
# == 3.1 Post MBP1_MYSPE as JSON data ======================================
|
|
|
|
|
2020-10-02 08:50:36 +00:00
|
|
|
# Task:
|
|
|
|
# =====
|
2020-10-13 06:08:35 +00:00
|
|
|
# 1: Run the following code:
|
|
|
|
|
|
|
|
# Print the wiki markup for MBP1_MYSPE: the JSON produced by dbProt2JSON(),
# wrapped in a <pre class="protein-data"> element and marker comments, with
# one item per line.
cat(
  "{{Vspace}}",
  "<!-- ==== BEGIN PROTEIN ==== -->",
  '<pre class="protein-data">',
  dbProt2JSON(paste0("MBP1_", biCode(MYSPE))),
  "</pre>",
  "<!-- ===== END PROTEIN ====== -->",
  "",
  sep = "\n"
)
|
|
|
|
|
2020-10-13 12:37:31 +00:00
|
|
|
# 2: Copy the entire output from the console.
|
2020-10-13 06:08:35 +00:00
|
|
|
# 3: Navigate to
|
|
|
|
# http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public
|
|
|
|
# ... edit the page, and paste your output at the top.
|
|
|
|
# 4: Save your edits.
|
|
|
|
|
|
|
|
|
2020-10-02 08:50:36 +00:00
|
|
|
|
2020-10-13 12:37:31 +00:00
|
|
|
# == 3.2 Import shared MBP1_MYSPE from the Wiki ============================
|
|
|
|
|
|
|
|
# Once we have collected a number of protein annotations, we can access the
|
|
|
|
# Wiki-page and import the data into our database. The Wiki page is an html
|
|
|
|
# document with lots of MediaWiki-specific stuff - but the content we are
|
|
|
|
# interested in is enclosed in <pre class="protein-data"> ... </pre> tags. These
|
|
|
|
# work like normal HTML <pre> tags, but we have defined a special class for them
|
|
|
|
# to make it easy to parse out the contents we want. The rvest:: package in
|
|
|
|
# combination with xml2:: provides us with all the tools we need for such
|
|
|
|
# "Webscraping" of data....
|
|
|
|
|
|
|
|
# Install rvest and xml2 (in this order), but only if they are not available
# yet. We do this inside a function, so no helper variable is left behind in
# the workspace.
invisible(lapply(c("rvest", "xml2"), function(pkgName) {
  if (! requireNamespace(pkgName, quietly = TRUE)) {
    install.packages(pkgName)
  }
}))
|
|
|
|
|
|
|
|
# Here's the process:
|
|
|
|
# The URL is an "open" page on the student Wiki. Users that are not logged in
|
|
|
|
# can view the contents, but you can only edit if you are logged in.
|
|
|
|
myURL <- "http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public"

# Step one: fetch the HTML of the page at myURL ...
x <- xml2::read_html(x = myURL)
|
|
|
|
|
|
|
|
# This retrieves the page source, but that still needs to be parsed into its
|
|
|
|
# logical elements. HTML, like XML, describes documents that are structured as
|
|
|
|
# trees, that have "nodes" which are demarcated with "tags". rvest::html_nodes()
|
|
|
|
# parses out the document structure and then uses a so-called "xpath" expression
|
|
|
|
# to select nodes we are interested in. Now, xpath is one of those specialized
|
|
|
|
# languages of which there are a few more to learn than one would care for. You
|
|
|
|
# MUST know how to format sprintf() expressions, and you SHOULD be competent
|
|
|
|
# with regular expressions. But if you want to be really competent in your work,
|
|
|
|
# basic HTML and CSS is required ... and enough knowledge about xpath to be able
|
|
|
|
# to search on Stackoverflow for what you need for parsing data out of Web
|
|
|
|
# documents...
|
|
|
|
|
|
|
|
# The expression we use below is:
|
|
|
|
# - get any node anywhere in the tree ("//*") ...
|
|
|
|
# - that has a particular attribute("[@ ... ]").
|
|
|
|
# - The attribute we want is that the class of the node is "protein-data";
|
|
|
|
# that is the class we have defined for our <pre> tags.
|
|
|
|
# As a result of this selection, we get a list of pointers to the document tree.
|
|
|
|
y <- rvest::html_nodes(x = x, xpath = '//*[@class="protein-data"]')

# Now we pull the actual payload - the text - out of the selected nodes:
# rvest::html_text() is applied to the set of nodes in y, and the result is
# an ordinary character vector with one string per node.
z <- rvest::html_text(x = y)
|
|
|
|
|
|
|
|
# Finally we can iterate over the list, and add all proteins we don't already
|
|
|
|
# have to our database. There may well be items that are rejected because they
|
|
|
|
# are already present in the database - for example, unless somebody has
|
|
|
|
# annotated new features, all of the features are already there. Don't worry -
|
|
|
|
# that is intended; we don't want duplicate entries.
|
|
|
|
|
|
|
|
# The Wiki page is edited by hand by many people, so we must expect that some
# entries are broken. We skip such entries with a message, and do not let one
# bad entry abort the import of all others.
for (thisJSON in z) {
  thisData <- tryCatch(jsonlite::fromJSON(thisJSON),
                       error = function(e) {
                         message("Skipping an entry that is not valid JSON: ",
                                 conditionMessage(e))
                         NULL
                       })
  if (! is.list(thisData)) {   # parsing failed, or this is not a JSON object
    next
  }

  thisName <- thisData$protein$name
  if (length(thisName) != 1) { # we expect exactly one protein per entry
    message("Skipping an entry that does not describe exactly one protein.")
    next
  }

  # add the data only if we don't have a protein of that name already
  if (! thisName %in% myDB$protein$name) {
    myDB <- dbAddProtein(myDB, thisData$protein)
    myDB <- dbAddTaxonomy(myDB, thisData$taxonomy)
    myDB <- dbAddFeature(myDB, thisData$feature)
    myDB <- dbAddAnnotation(myDB, thisData$annotation)
  }
}
|
|
|
|
|
|
|
|
# Finally, we can repeat our domain plot with the results - which now include
# the shared proteins:
|
|
|
|
|
|
|
|
iRows <- grep("^MBP1_", myDB$protein$name)
if (length(iRows) == 0) {
  stop("No proteins with names starting \"MBP1_\" were found in myDB.")
}

# define the size of the plot-frame so that it holds all proteins
yMax <- length(iRows) * 1.1
xMax <- max(nchar(myDB$protein$sequence[iRows])) * 1.1  # longest sequence

# plot an empty frame
oPar <- par(mar = c(4.2, 0.1, 3, 0.1))  # save the current plot parameters and
                                        # decrease margins
plot(1, 1,
     xlim = c(-200, xMax + 100),
     ylim = c(0, yMax),
     type = "n",
     axes = FALSE,
     bty = "n",
     main = "Mbp1 orthologue domain annotations",
     xlab = "sequence position",
     cex.axis = 0.8,
     ylab="")
axis(1, at = seq(0, xMax, by = 100))

# The legend colors are computed in the same way as the box colors in
# plotProtein(): one color per row of the feature table, with transparency.
myCol <- colorRampPalette(c("#f2003c", "#F0A200",
                            "#f0ea00", "#62C923",
                            "#0A9A9B", "#1958C3",
                            "#8000D3", "#D0007F"),
                          space="Lab",
                          interpolate="linear")(nrow(myDB$feature))
myCol <- paste0(myCol, "55")
# The top-left corner of the legend goes to y = 7, but not higher than the
# top of the frame if there are only a few proteins to plot.
legend(xMax - 150, min(7, yMax),
       legend = myDB$feature$name,
       cex = 0.7,
       fill = myCol,
       bty = "n")

# draw every protein at its own height
for (i in seq_along(iRows)) {
  plotProtein(myDB, myDB$protein$name[iRows[i]], i)
}
par(oPar)  # reset the plot parameters
|
|
|
|
|
|
|
|
# ... the more proteins we can compare, the more we learn about the
|
|
|
|
# architectural principles of this family's domains.
|
|
|
|
|
2017-09-12 20:09:20 +00:00
|
|
|
|
|
|
|
# [END]
|