Add code for scripting data download unit
This commit is contained in:
parent
4fb2506e58
commit
b33feed50a
154
RPR-PROSITE_POST.R
Normal file
154
RPR-PROSITE_POST.R
Normal file
@ -0,0 +1,154 @@
|
||||
# RPR-PROSITE_POST.R
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-Scripting_data_downloads unit.
|
||||
#
|
||||
# Version: 1.0
|
||||
#
|
||||
# Date: 2017 10 05
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.0 First ABC units version
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------------------------------
|
||||
#TOC> 1 Constructing a POST command from a Web query 40
|
||||
#TOC> 1.1 Task - fetchPrositeFeatures() function 134
|
||||
#TOC> 2 Task solutions 142
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
|
||||
|
||||
# = 1 Constructing a POST command from a Web query ========================
|
||||
|
||||
|
||||
# Load httr; install it first if it is not present.
# requireNamespace() tests availability without attaching; library()
# then attaches the package and fails loudly if loading did not work.
# (require() is discouraged for dependency loading because it merely
# returns FALSE on failure.)
if (!requireNamespace("httr", quietly = TRUE)) {
  install.packages("httr")
}
library(httr)
|
||||
|
||||
# We have reverse engineered the Web form for a ScanProsite request, and can now
|
||||
# construct a POST request. The command is similar to GET(), but we need an
|
||||
# explicit request body: a list of key/value pairs
|
||||
|
||||
# UniProt ID of the query protein
UniProtID <- "P39678"

# ScanProsite CGI endpoint. Use https: the server redirects plain
# http, and a redirected POST request may lose its request body.
URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"

# Assemble the POST request. The body is the list of key/value pairs
# we reverse engineered from the ScanProsite Web form.
response <- POST(URL,
                 body = list(meta = "opt1",
                             meta1_protein = "opt1",
                             seq = UniProtID,
                             skip = "on",
                             output = "tabular"))
|
||||
|
||||
# Send off this request, and you should have a response in a few
|
||||
# seconds. Let's check the status first:
|
||||
|
||||
status_code(response)  # If this is not 200, something went wrong and
                       # it makes no sense to continue. If this
                       # persists, ask on the mailing list what to do.
|
||||
|
||||
|
||||
# The text contents of the response are available with the
# content() function:
content(response, "text")
|
||||
|
||||
# ... should show you the same as the page contents that
|
||||
# you have seen in the browser. Now we need to extract
|
||||
# the data from the page: we need regular expressions, but
|
||||
# only simple ones. First, we strsplit() the response into
|
||||
# individual lines, since each of our data elements is on
|
||||
# its own line. We simply split on the "\\n" newline character.
|
||||
|
||||
# Split the response text on newline characters; the first (and only)
# element of the strsplit() result is the vector of lines.
lines <- strsplit(content(response, "text"), "\\n")[[1]]
head(lines)
|
||||
|
||||
# Now we define a query pattern for the lines we want: the UniProt ID,
# bracketed by two literal "|" pipe characters:

patt <- paste0("\\|", UniProtID, "\\|")

# ... and keep only those lines that match the pattern:

lines <- lines[grepl(patt, lines)]
lines
|
||||
|
||||
# ... captures the four lines of output.
|
||||
|
||||
# Now we break the lines apart into tokens: this is another application of
|
||||
# strsplit(), but this time we split either on "pipe" characters, "|" OR on tabs
|
||||
# "\t". Look at the regex "\\t|\\|" in the strsplit() call:
|
||||
|
||||
strsplit(lines[1], "\\t|\\|")[[1]]
|
||||
|
||||
# Its parts are (\\t)=tab (|)=or (\\|)=pipe. Both "t" and "|" need to be escaped
|
||||
# with a backslash. "t" has to be escaped because we want to match a tab (\t),
|
||||
# not the literal character "t". And "|" has to be escaped because we mean the
|
||||
# literal pipe character, not its metacharacter meaning OR. Thus sometimes the
|
||||
# backslash turns a special meaning off, and sometimes it turns a special
|
||||
# meaning on. Unfortunately there's no easy way to tell - you just need to
|
||||
# remember the characters - or have a reference handy. The metacharacters are
|
||||
# (){}[]^$?*+.|&- ... and some of them have different meanings depending on
|
||||
# where in the regex they are.
|
||||
|
||||
# Let's put the tokens into named slots of a data frame
|
||||
|
||||
# Break each selected line into tokens and collect the fields of
# interest into a data frame, one row per feature line. The rows are
# built in a list and bound once: growing a data frame with rbind()
# inside a loop copies it on every iteration.
rows <- lapply(lines, function(line) {
  tokens <- unlist(strsplit(line, "\\t|\\|"))
  data.frame(uID    = tokens[2],
             start  = as.numeric(tokens[4]),
             end    = as.numeric(tokens[5]),
             psID   = tokens[6],
             psName = tokens[7],
             stringsAsFactors = FALSE)
})
features <- do.call(rbind, rows)
if (is.null(features)) {    # no matching lines: keep an empty data
  features <- data.frame()  # frame, as the original loop produced
}
features
|
||||
|
||||
# This forms the base of a function that collects the features automatically
|
||||
# from a PrositeScan result. You can write this!
|
||||
|
||||
|
||||
# == 1.1 Task - fetchPrositeFeatures() function ============================
|
||||
|
||||
|
||||
# Task: write a function that takes as input a UniProt ID, fetches the
|
||||
# features it contains from ScanProsite and returns a list as given above, or
|
||||
# a list of length 0 if there is an error.
|
||||
|
||||
|
||||
# = 2 Task solutions ======================================================
|
||||
|
||||
|
||||
# I have placed such a function into the dbUtilities script: look it up by
|
||||
# clicking on dbFetchPrositeFeatures() in the Environment pane.
|
||||
|
||||
# Test:
|
||||
dbFetchPrositeFeatures("P39678")  # defined in the dbUtilities script (see above)
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
118
RPR-UniProt_GET.R
Normal file
118
RPR-UniProt_GET.R
Normal file
@ -0,0 +1,118 @@
|
||||
# RPR-UniProt_GET.R
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-Scripting_data_downloads unit.
|
||||
#
|
||||
# Version: 1.0
|
||||
#
|
||||
# Date: 2017 10 05
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.0 First ABC units version
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ----------------------------------------------------
|
||||
#TOC> 1 UniProt files via GET 40
|
||||
#TOC> 1.1 Task - fetchUniProtSeq() function 98
|
||||
#TOC> 2 Task solutions 105
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
|
||||
|
||||
# = 1 UniProt files via GET ===============================================
|
||||
|
||||
|
||||
# Perhaps the simplest example of scripted download is to retrieve a protein
|
||||
# FASTA sequence from UniProt. All we need is to construct an URL with the
|
||||
# correct UniProt ID.
|
||||
|
||||
# An interface between R scripts and Web servers is provided by the httr
|
||||
# package. This sends and receives information via the http protocol, just like
|
||||
# a Web browser. Since this is a short and simple request, the GET verb is the
|
||||
# right tool:
|
||||
|
||||
# Load httr; install it first if it is not present.
# requireNamespace() tests availability without attaching; library()
# then attaches the package and fails loudly if loading did not work.
# (require() is discouraged for dependency loading because it merely
# returns FALSE on failure.)
if (!requireNamespace("httr", quietly = TRUE)) {
  install.packages("httr")
}
library(httr)
|
||||
|
||||
# The UniProt ID for Mbp1 is ...

UniProtID <- "P39678"

# ... and the base URL to retrieve data is
# https://www.uniprot.org/uniprot/ . We can construct a simple URL to
# retrieve a FASTA sequence. (Use https: the server redirects plain
# http requests.)

(URL <- sprintf("https://www.uniprot.org/uniprot/%s.fasta", UniProtID))
|
||||
|
||||
# GET() from httr retrieves the data.
response <- GET(URL)

str(response)           # the response object is a bit complex ...

as.character(response)  # ... but it is easy to pull out the data.

# ... and to process: split the FASTA text into its lines and
# sanitize the sequence.
x <- strsplit(as.character(response), "\n")
dbSanitizeSequence(x)
|
||||
|
||||
# Simple.
|
||||
# But what happens if there is an error, e.g. a UniProt ID that does
# not exist?

response <- GET("http://www.uniprot.org/uniprot/X000000.fasta")
as.character(response)
|
||||
# ... this is a large HTML page that tells us the URL was not found.
# So we need to check for errors. The right way to do this is to
# evaluate the status code that every Web server returns for every
# transaction.
#
status_code(response)  # 404 == Page Not Found

# There are many possible codes, but the only code we will be happy
# with is 200 - OK.
# (cf. https://en.wikipedia.org/wiki/List_of_HTTP_status_codes )
||||
|
||||
# Repeat the request for the valid ID and confirm the status is 200.
# (Use https: the server redirects plain http requests.)
URL <- sprintf("https://www.uniprot.org/uniprot/%s.fasta", UniProtID)
response <- GET(URL)
status_code(response)
|
||||
|
||||
|
||||
# == 1.1 Task - fetchUniProtSeq() function =================================
|
||||
|
||||
# Task: write a function that takes as input a UniProt ID, fetches the
|
||||
# FASTA sequence, returns only the sequence if the operation is successful, or
|
||||
# a vector of length 0 if there is an error.
|
||||
|
||||
|
||||
# = 2 Task solutions ======================================================
|
||||
|
||||
|
||||
# I have placed such a function into the dbUtilities script: look it up by
|
||||
# clicking on dbFetchUniProtSeq() in the Environment pane.
|
||||
|
||||
# Test:
|
||||
dbFetchUniProtSeq("P39678")  # defined in the dbUtilities script (see above)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
165
RPR-eUtils_XML.R
Normal file
165
RPR-eUtils_XML.R
Normal file
@ -0,0 +1,165 @@
|
||||
# RPR-eUtils_XML.R
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-Scripting_data_downloads unit.
|
||||
#
|
||||
# Version: 1.0
|
||||
#
|
||||
# Date: 2017 10 05
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.0 First ABC units version
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -----------------------------------------------------
|
||||
#TOC> 1 Working with NCBI eUtils 40
|
||||
#TOC> 1.1 Task - fetchNCBItaxData() function 149
|
||||
#TOC> 2 Task solutions 156
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
|
||||
|
||||
# = 1 Working with NCBI eUtils ============================================
|
||||
|
||||
|
||||
|
||||
# To begin, we load some libraries with functions
|
||||
# we need...
|
||||
|
||||
# httr sends and receives information via the http
|
||||
# protocol, just like a Web browser.
|
||||
# Load httr; install it first if it is not present.
# requireNamespace() tests availability without attaching; library()
# then attaches the package and fails loudly if loading did not work.
if (!requireNamespace("httr", quietly = TRUE)) {
  install.packages("httr")
}
library(httr)
|
||||
|
||||
# NCBI's eUtils send information in XML format; we
|
||||
# need to be able to parse XML.
|
||||
# Load xml2; install it first if it is not present.
# requireNamespace() tests availability without attaching; library()
# then attaches the package and fails loudly if loading did not work.
if (!requireNamespace("xml2", quietly = TRUE)) {
  install.packages("xml2")
}
library(xml2)
|
||||
|
||||
|
||||
|
||||
# We will walk through the process with the RefSeq ID of yeast Mbp1.
refSeqID <- "NP_010227"

# All eUtils queries are built on this base URL ...
eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
|
||||
|
||||
|
||||
# Then we assemble an URL that will search for the unique,
# NCBI-internal identifier - the GI number - for our refSeqID ...
URL <- paste0(eUtilsBase,
              "esearch.fcgi?",   # ... using the esearch program
                                 # that finds an entry in an
                                 # NCBI database
              "db=protein",
              "&term=", refSeqID)
# Copy the URL and paste it into your browser to see
# what the response should look like.
URL
|
||||
|
||||
# To fetch and parse the response in R we pass the URL directly to
# read_xml() from the xml2 package.
myXML <- read_xml(URL)
myXML
|
||||
|
||||
# This is XML. We can take the response apart into its individual
# components with the as_list() function.

as_list(myXML)

# Note how the XML "tree" is represented as a list of lists of
# lists ...
# If we know exactly which element we are looking for, we can
# extract it from this structure:
as_list(myXML)[["IdList"]][["Id"]][[1]]
|
||||
|
||||
# But this is not very robust: it would break with the slightest
# change that the NCBI makes to their response - and the NCBI changes
# things A LOT!

# Somewhat more robust is to specify the type of element we want -
# the text contained in an <Id>...</Id> element - and use the XPath
# XML parsing language to retrieve it.

xml_find_all(myXML, "//Id")            # returns a "node set"

xml_text(xml_find_all(myXML, "//Id"))  # returns the contents of the node set
|
||||
|
||||
# We will need to do this a lot, so we write a function for it ...

node2text <- function(doc, tag) {
  # Extract the contents of all elements with a given tag from an
  # XML document.
  #   doc: an XML document as parsed by xml2.
  #   tag: the element name, without angle brackets.
  # Value: a character vector holding the text of every matching
  #        element (possibly empty).
  xpath <- paste0("//", tag)
  nodeSet <- xml_find_all(doc, xpath)
  xml_text(nodeSet)
}

# Using node2text() ...
(GID <- node2text(myXML, "Id"))
|
||||
|
||||
# The GI is the pivot for all our data requests at the NCBI.

# Let's first get the associated data for this GI.
URL <- paste0(eUtilsBase,
              "esummary.fcgi?",
              "db=protein",
              "&id=", GID,
              "&version=2.0")
(myXML <- read_xml(URL))

(taxID    <- node2text(myXML, "TaxId"))
(organism <- node2text(myXML, "Organism"))
|
||||
|
||||
# This forms the base of a function that gets taxonomy data
|
||||
# from an Entrez result. You can write this!
|
||||
|
||||
|
||||
# == 1.1 Task - fetchNCBItaxData() function ================================
|
||||
|
||||
# Task: write a function that takes as input a RefSeq ID, fetches the taxonomy
|
||||
# information, returns a list with taxID and organism, if the operation is
|
||||
# successful, or a list of length 0 if there is an error.
|
||||
|
||||
|
||||
# = 2 Task solutions ======================================================
|
||||
|
||||
# I have placed such a function into the dbUtilities script: look it up by
|
||||
# clicking on dbFetchNCBItaxData() in the Environment pane.
|
||||
|
||||
# Test:
|
||||
dbFetchNCBItaxData("NP_010227")  # defined in the dbUtilities script (see above)
|
||||
|
||||
|
||||
# [END]
|
Loading…
Reference in New Issue
Block a user