Add code for shared protein data import
This commit is contained in:
parent
5524832a75
commit
d2fffdf0b8
@ -3,12 +3,13 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-FUNC-Domain_annotation unit.
|
# R code accompanying the BIN-FUNC-Domain_annotation unit.
|
||||||
#
|
#
|
||||||
# Version: 1.3
|
# Version: 1.4
|
||||||
#
|
#
|
||||||
# Date: 2017-11 - 2020-10
|
# Date: 2017-11 - 2020-10
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.4 Add code for shared data import from the Wiki
|
||||||
# 1.3 Add code for database export to JSON and instructions
|
# 1.3 Add code for database export to JSON and instructions
|
||||||
# for uploading annotations to the Public Student Wiki page
|
# for uploading annotations to the Public Student Wiki page
|
||||||
# 1.2 Consistently: data in ./myScripts/ ;
|
# 1.2 Consistently: data in ./myScripts/ ;
|
||||||
@ -18,7 +19,7 @@
|
|||||||
# 0.1 First code copied from 2016 material.
|
# 0.1 First code copied from 2016 material.
|
||||||
#
|
#
|
||||||
# TODO:
|
# TODO:
|
||||||
# Complete SHARING DATA section ...
|
# Put the domain plot into a function
|
||||||
#
|
#
|
||||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||||
#
|
#
|
||||||
@ -33,14 +34,15 @@
|
|||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ---------------------------------------------------------------------
|
#TOC> ---------------------------------------------------------------------
|
||||||
#TOC> 1 Update your database script 48
|
#TOC> 1 Update your database script 50
|
||||||
#TOC> 1.1 Preparing an annotation file ... 55
|
#TOC> 1.1 Preparing an annotation file ... 57
|
||||||
#TOC> 1.1.1 BEFORE "BIN-ALI-Optimal_sequence_alignment" 58
|
#TOC> 1.1.1 BEFORE "BIN-ALI-Optimal_sequence_alignment" 60
|
||||||
#TOC> 1.1.2 AFTER "BIN-ALI-Optimal_sequence_alignment" 106
|
#TOC> 1.1.2 AFTER "BIN-ALI-Optimal_sequence_alignment" 108
|
||||||
#TOC> 1.2 Execute and Validate 133
|
#TOC> 1.2 Execute and Validate 135
|
||||||
#TOC> 2 Plot Annotations 158
|
#TOC> 2 Plot Annotations 160
|
||||||
#TOC> 3 SHARING DATA 283
|
#TOC> 3 SHARING DATA 286
|
||||||
#TOC> 3.1 Post MBP1_MYSPE as JSON data 298
|
#TOC> 3.1 Post MBP1_MYSPE as JSON data 302
|
||||||
|
#TOC> 3.2 Import shared MBP1_MYSPE from the Wiki 325
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -256,10 +258,11 @@ myCol <- colorRampPalette(c("#f2003c", "#F0A200",
|
|||||||
space="Lab",
|
space="Lab",
|
||||||
interpolate="linear")(nrow(myDB$feature))
|
interpolate="linear")(nrow(myDB$feature))
|
||||||
myCol <- paste0(myCol, "55")
|
myCol <- paste0(myCol, "55")
|
||||||
legend(xMax - 150, 6,
|
legend(xMax - 150, 7,
|
||||||
legend = myDB$feature$name,
|
legend = myDB$feature$name,
|
||||||
cex = 0.7,
|
cex = 0.7,
|
||||||
fill = myCol)
|
fill = myCol,
|
||||||
|
bty = "n")
|
||||||
|
|
||||||
# Finally, iterate over all proteins and call plotProtein()
|
# Finally, iterate over all proteins and call plotProtein()
|
||||||
for (i in seq_along(iRows)) {
|
for (i in seq_along(iRows)) {
|
||||||
@ -295,6 +298,7 @@ par(oPar) # reset the plot parameters
|
|||||||
# will spare you the details - it's in "./scripts/ABC-dbUtilities.R" if you
|
# will spare you the details - it's in "./scripts/ABC-dbUtilities.R" if you
|
||||||
# would want to have a look.
|
# would want to have a look.
|
||||||
|
|
||||||
|
|
||||||
# == 3.1 Post MBP1_MYSPE as JSON data ======================================
|
# == 3.1 Post MBP1_MYSPE as JSON data ======================================
|
||||||
|
|
||||||
# Task:
|
# Task:
|
||||||
@ -303,24 +307,128 @@ par(oPar) # reset the plot parameters
|
|||||||
|
|
||||||
cat("{{Vspace}}",
|
cat("{{Vspace}}",
|
||||||
"<!-- ==== BEGIN PROTEIN ==== -->",
|
"<!-- ==== BEGIN PROTEIN ==== -->",
|
||||||
"<pre>",
|
"<pre class=\"protein-data\">",
|
||||||
dbProt2JSON(sprintf("MBP1_%s", biCode(MYSPE))),
|
dbProt2JSON(sprintf("MBP1_%s", biCode(MYSPE))),
|
||||||
"</pre>",
|
"</pre>",
|
||||||
"<!-- ===== END PROTEIN ====== -->",
|
"<!-- ===== END PROTEIN ====== -->",
|
||||||
"", sep = "\n"
|
"", sep = "\n"
|
||||||
)
|
)
|
||||||
|
|
||||||
# 2: Copy the entire output,
|
# 2: Copy the entire output from the console.
|
||||||
# 3: Navigate to
|
# 3: Navigate to
|
||||||
# http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public
|
# http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public
|
||||||
# ... edit the page, and paste your output at the top.
|
# ... edit the page, and paste your output at the top.
|
||||||
# 4: Save your edits.
|
# 4: Save your edits.
|
||||||
|
|
||||||
# Next, once we have collected a number of protein annotations, we can access
|
|
||||||
# the page and import the data into our database.
|
|
||||||
#
|
|
||||||
# Code to come soon ...
|
|
||||||
|
|
||||||
|
|
||||||
|
# == 3.2 Import shared MBP1_MYSPE from the Wiki ============================
|
||||||
|
|
||||||
|
# Once we have collected a number of protein annotations, we can access the
|
||||||
|
# Wiki-page and import the data into our database. The Wiki page is an html
|
||||||
|
# document with lots of MediaWiki specific stuff - but the contents we are
|
||||||
|
# interested in is enclosed in <pre class="protein-data"> ... </pre> tags. These
|
||||||
|
# work like normal HTML <pre> tags, but we have defined a special class for them
|
||||||
|
# to make it easy to parse out the contents we want. The rvest:: package in
|
||||||
|
# combination with xml2:: provides us with all the tools we need for such
|
||||||
|
# "Webscraping" of data....
|
||||||
|
|
||||||
|
if (! requireNamespace("rvest", quietly=TRUE)) {
|
||||||
|
install.packages("rvest")
|
||||||
|
}
|
||||||
|
|
||||||
|
if (! requireNamespace("xml2", quietly=TRUE)) {
|
||||||
|
install.packages("xml2")
|
||||||
|
}
|
||||||
|
|
||||||
|
# Here's the process:
|
||||||
|
# The URL is an "open" page on the student Wiki. Users that are not logged in
|
||||||
|
# can view the contents, but you can only edit if you are logged in.
|
||||||
|
myURL <- "http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public"
|
||||||
|
|
||||||
|
# First thing is to retrieve the HTML from the url...
|
||||||
|
x <- xml2::read_html(myURL)
|
||||||
|
|
||||||
|
# This retrieves the page source, but that still needs to be parsed into its
|
||||||
|
# logical elements. HTML is a subset of XML and such documents are structured as
|
||||||
|
# trees, that have "nodes" which are demarcated with "tags". rvest::html_nodes()
|
||||||
|
# parses out the document structure and then uses a so-called "xpath" expression
|
||||||
|
# to select nodes we are interested in. Now, xpath is one of those specialized
|
||||||
|
# languages of which there are a few more to learn than one would care for. You
|
||||||
|
# MUST know how to format sprintf() expressions, and you SHOULD be competent
|
||||||
|
# with regular expressions. But if you want to be really competent in your work,
|
||||||
|
# basic HTML and CSS is required ... and enough knowledge about xpath to be able
|
||||||
|
# to search on Stackoverflow for what you need for parsing data out of Web
|
||||||
|
# documents...
|
||||||
|
|
||||||
|
# The expression we use below is:
|
||||||
|
# - get any node anywhere in the tree ("//*") ...
|
||||||
|
# - that has a particular attribute("[@ ... ]").
|
||||||
|
# - The attribute we want is that the class of the node is "protein-data";
|
||||||
|
# that is the class we have defined for our <pre> tags.
|
||||||
|
# As a result of this selection, we get a list of pointers to the document tree.
|
||||||
|
y <- rvest::html_nodes(x, xpath ='//*[@class="protein-data"]')
|
||||||
|
|
||||||
|
# Next we fetch the actual payload - the text - from the tree:
|
||||||
|
# rvest::html_text() gets the text from the list of pointers. The result is a
|
||||||
|
# normal list of character strings.
|
||||||
|
z <- rvest::html_text(y)
|
||||||
|
|
||||||
|
# Finally we can iterate over the list, and add all proteins we don't already
|
||||||
|
# have to our database. There may well be items that are rejected because they
|
||||||
|
# are already present in the database - for example, unless somebody has
|
||||||
|
# annotated new features, all of the features are already there. Don't worry -
|
||||||
|
# that is intended; we don't want duplicate entries.
|
||||||
|
|
||||||
|
for (thisJSON in z) {
|
||||||
|
thisData <- jsonlite::fromJSON(thisJSON)
|
||||||
|
if (! thisData$protein$name %in% myDB$protein$name) {
|
||||||
|
myDB <- dbAddProtein(myDB, thisData$protein)
|
||||||
|
myDB <- dbAddTaxonomy(myDB, thisData$taxonomy)
|
||||||
|
myDB <- dbAddFeature(myDB, thisData$feature)
|
||||||
|
myDB <- dbAddAnnotation(myDB, thisData$annotation)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Finally, we can repeat our domain plot with the results - which now includes the shared proteins:
|
||||||
|
|
||||||
|
iRows <- grep("^MBP1_", myDB$protein$name)
|
||||||
|
yMax <- length(iRows) * 1.1
|
||||||
|
xMax <- max(nchar(myDB$protein$sequence[iRows])) * 1.1 # longest sequence
|
||||||
|
|
||||||
|
# plot an empty frame
|
||||||
|
oPar <- par(mar = c(4.2, 0.1, 3, 0.1))
|
||||||
|
plot(1, 1,
|
||||||
|
xlim = c(-200, xMax + 100),
|
||||||
|
ylim = c(0, yMax),
|
||||||
|
type = "n",
|
||||||
|
axes = FALSE,
|
||||||
|
bty = "n",
|
||||||
|
main = "Mbp1 orthologue domain annotations",
|
||||||
|
xlab = "sequence position",
|
||||||
|
cex.axis = 0.8,
|
||||||
|
ylab="")
|
||||||
|
axis(1, at = seq(0, xMax, by = 100))
|
||||||
|
myCol <- colorRampPalette(c("#f2003c", "#F0A200",
|
||||||
|
"#f0ea00", "#62C923",
|
||||||
|
"#0A9A9B", "#1958C3",
|
||||||
|
"#8000D3", "#D0007F"),
|
||||||
|
space="Lab",
|
||||||
|
interpolate="linear")(nrow(myDB$feature))
|
||||||
|
myCol <- paste0(myCol, "55")
|
||||||
|
legend(xMax - 150, 7,
|
||||||
|
legend = myDB$feature$name,
|
||||||
|
cex = 0.7,
|
||||||
|
fill = myCol,
|
||||||
|
bty = "n")
|
||||||
|
|
||||||
|
for (i in seq_along(iRows)) {
|
||||||
|
plotProtein(myDB, myDB$protein$name[iRows[i]], i)
|
||||||
|
}
|
||||||
|
par(oPar) # reset the plot parameters
|
||||||
|
|
||||||
|
# ... the more proteins we can compare, the more we learn about the
|
||||||
|
# architectural principles of this family's domains.
|
||||||
|
|
||||||
|
|
||||||
# [END]
|
# [END]
|
||||||
|
Loading…
x
Reference in New Issue
Block a user