Add code for scripting data download unit
This commit is contained in:
parent
4fb2506e58
commit
b33feed50a
154
RPR-PROSITE_POST.R
Normal file
154
RPR-PROSITE_POST.R
Normal file
@ -0,0 +1,154 @@
|
||||
# RPR-PROSITE_POST.R
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-Scripting_data_downloads unit.
|
||||
#
|
||||
# Version: 1.0
|
||||
#
|
||||
# Date: 2017 10 05
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.0 First ABC units version
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------------------------------
|
||||
#TOC> 1 Constructing a POST command from a Web query 40
|
||||
#TOC> 1.1 Task - fetchPrositeFeatures() function 134
|
||||
#TOC> 2 Task solutions 142
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
|
||||
|
||||
# = 1 Constructing a POST command from a Web query ========================
|
||||
|
||||
|
||||
# Load httr; install it first if it is not present.
# requireNamespace() tests availability without attaching; library()
# then attaches the package and fails loudly if loading did not work.
# (require() is discouraged for dependency loading because it merely
# returns FALSE on failure.)
if (!requireNamespace("httr", quietly = TRUE)) {
  install.packages("httr")
}
library(httr)
|
||||
|
||||
# We have reverse engineered the Web form for a ScanProsite request, and can now
|
||||
# construct a POST request. The command is similar to GET(), but we need an
|
||||
# explicit request body: a list of key/value pairs
|
||||
|
||||
# UniProt ID of the query protein
UniProtID <- "P39678"

# ScanProsite CGI endpoint. Use https: the server redirects plain
# http, and a redirected POST request may lose its request body.
URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"

# Assemble the POST request. The body is the list of key/value pairs
# we reverse engineered from the ScanProsite Web form.
response <- POST(URL,
                 body = list(meta = "opt1",
                             meta1_protein = "opt1",
                             seq = UniProtID,
                             skip = "on",
                             output = "tabular"))
|
||||
|
||||
# Send off this request, and you should have a response in a few
|
||||
# seconds. Let's check the status first:
|
||||
|
||||
status_code(response)  # If this is not 200, something went wrong and
                       # it makes no sense to continue. If this
                       # persists, ask on the mailing list what to do.
|
||||
|
||||
|
||||
# The text contents of the response are available with the
# content() function:
content(response, "text")
|
||||
|
||||
# ... should show you the same as the page contents that
|
||||
# you have seen in the browser. Now we need to extract
|
||||
# the data from the page: we need regular expressions, but
|
||||
# only simple ones. First, we strsplit() the response into
|
||||
# individual lines, since each of our data elements is on
|
||||
# its own line. We simply split on the "\\n" newline character.
|
||||
|
||||
# Split the response text on newline characters; the first (and only)
# element of the strsplit() result is the vector of lines.
lines <- strsplit(content(response, "text"), "\\n")[[1]]
head(lines)
|
||||
|
||||
# Now we define a query pattern for the lines we want: the UniProt ID,
# bracketed by two literal "|" pipe characters:

patt <- paste0("\\|", UniProtID, "\\|")

# ... and keep only those lines that match the pattern:

lines <- lines[grepl(patt, lines)]
lines
|
||||
|
||||
# ... captures the four lines of output.
|
||||
|
||||
# Now we break the lines apart into tokens: this is another application of
|
||||
# strsplit(), but this time we split either on "pipe" characters, "|" OR on tabs
|
||||
# "\t". Look at the regex "\\t|\\|" in the strsplit() call:
|
||||
|
||||
strsplit(lines[1], "\\t|\\|")[[1]]
|
||||
|
||||
# Its parts are (\\t)=tab (|)=or (\\|)=pipe. Both "t" and "|" need to be escaped
|
||||
# with a backslash. "t" has to be escaped because we want to match a tab (\t),
|
||||
# not the literal character "t". And "|" has to be escaped because we mean the
|
||||
# literal pipe character, not its metacharacter meaning OR. Thus sometimes the
|
||||
# backslash turns a special meaning off, and sometimes it turns a special
|
||||
# meaning on. Unfortunately there's no easy way to tell - you just need to
|
||||
# remember the characters - or have a reference handy. The metacharacters are
|
||||
# (){}[]^$?*+.|&- ... and some of them have different meanings depending on
|
||||
# where in the regex they are.
|
||||
|
||||
# Let's put the tokens into named slots of a data frame
|
||||
|
||||
# Break each selected line into tokens and collect the fields of
# interest into a data frame, one row per feature line. The rows are
# built in a list and bound once: growing a data frame with rbind()
# inside a loop copies it on every iteration.
rows <- lapply(lines, function(line) {
  tokens <- unlist(strsplit(line, "\\t|\\|"))
  data.frame(uID    = tokens[2],
             start  = as.numeric(tokens[4]),
             end    = as.numeric(tokens[5]),
             psID   = tokens[6],
             psName = tokens[7],
             stringsAsFactors = FALSE)
})
features <- do.call(rbind, rows)
if (is.null(features)) {    # no matching lines: keep an empty data
  features <- data.frame()  # frame, as the original loop produced
}
features
|
||||
|
||||
# This forms the base of a function that collects the features automatically
|
||||
# from a PrositeScan result. You can write this!
|
||||
|
||||
|
||||
# == 1.1 Task - fetchPrositeFeatures() function ============================
|
||||
|
||||
|
||||
# Task: write a function that takes as input a UniProt ID, fetches the
|
||||
# features it contains from ScanProsite and returns a list as given above, or
|
||||
# a list of length 0 if there is an error.
|
||||
|
||||
|
||||
# = 2 Task solutions ======================================================
|
||||
|
||||
|
||||
# I have placed such a function into the dbUtilities script: look it up by
|
||||
# clicking on dbFetchPrositeFeatures() in the Environment pane.
|
||||
|
||||
# Test:
|
||||
dbFetchPrositeFeatures("P39678")  # defined in the dbUtilities script (see above)
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
118
RPR-UniProt_GET.R
Normal file
118
RPR-UniProt_GET.R
Normal file
@ -0,0 +1,118 @@
|
||||
# RPR-UniProt_GET.R
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-Scripting_data_downloads unit.
|
||||
#
|
||||
# Version: 1.0
|
||||
#
|
||||
# Date: 2017 10 05
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.0 First ABC units version
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ----------------------------------------------------
|
||||
#TOC> 1 UniProt files via GET 40
|
||||
#TOC> 1.1 Task - fetchUniProtSeq() function 98
|
||||
#TOC> 2 Task solutions 105
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
|
||||
|
||||
# = 1 UniProt files via GET ===============================================
|
||||
|
||||
|
||||
# Perhaps the simplest example of scripted download is to retrieve a protein
|
||||
# FASTA sequence from UniProt. All we need is to construct an URL with the
|
||||
# correct UniProt ID.
|
||||
|
||||
# An interface between R scripts and Web servers is provided by the httr
|
||||
# package. This sends and receives information via the http protocol, just like
|
||||
# a Web browser. Since this is a short and simple request, the GET verb is the
|
||||
# right tool:
|
||||
|
||||
# Load httr; install it first if it is not present.
# requireNamespace() tests availability without attaching; library()
# then attaches the package and fails loudly if loading did not work.
# (require() is discouraged for dependency loading because it merely
# returns FALSE on failure.)
if (!requireNamespace("httr", quietly = TRUE)) {
  install.packages("httr")
}
library(httr)
|
||||
|
||||
# The UniProt ID for Mbp1 is ...

UniProtID <- "P39678"

# ... and the base URL to retrieve data is
# https://www.uniprot.org/uniprot/ . We can construct a simple URL to
# retrieve a FASTA sequence. (Use https: the server redirects plain
# http requests.)

(URL <- sprintf("https://www.uniprot.org/uniprot/%s.fasta", UniProtID))
|
||||
|
||||
# GET() from httr retrieves the data.
response <- GET(URL)

str(response)           # the response object is a bit complex ...

as.character(response)  # ... but it is easy to pull out the data.

# ... and to process: split the FASTA text into its lines and
# sanitize the sequence.
x <- strsplit(as.character(response), "\n")
dbSanitizeSequence(x)
|
||||
|
||||
# Simple.
|
||||
# But what happens if there is an error, e.g. a UniProt ID that does
# not exist?

response <- GET("http://www.uniprot.org/uniprot/X000000.fasta")
as.character(response)
|
||||
# ... this is a large HTML page that tells us the URL was not found.
# So we need to check for errors. The right way to do this is to
# evaluate the status code that every Web server returns for every
# transaction.
#
status_code(response)  # 404 == Page Not Found

# There are many possible codes, but the only code we will be happy
# with is 200 - OK.
# (cf. https://en.wikipedia.org/wiki/List_of_HTTP_status_codes )
||||
|
||||
# Repeat the request for the valid ID and confirm the status is 200.
# (Use https: the server redirects plain http requests.)
URL <- sprintf("https://www.uniprot.org/uniprot/%s.fasta", UniProtID)
response <- GET(URL)
status_code(response)
|
||||
|
||||
|
||||
# == 1.1 Task - fetchUniProtSeq() function =================================
|
||||
|
||||
# Task: write a function that takes as input a UniProt ID, fetches the
|
||||
# FASTA sequence, returns only the sequence if the operation is successful, or
|
||||
# a vector of length 0 if there is an error.
|
||||
|
||||
|
||||
# = 2 Task solutions ======================================================
|
||||
|
||||
|
||||
# I have placed such a function into the dbUtilities script: look it up by
|
||||
# clicking on dbFetchUniProtSeq() in the Environment pane.
|
||||
|
||||
# Test:
|
||||
dbFetchUniProtSeq("P39678")  # defined in the dbUtilities script (see above)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
165
RPR-eUtils_XML.R
Normal file
165
RPR-eUtils_XML.R
Normal file
@ -0,0 +1,165 @@
|
||||
# RPR-eUtils_XML.R
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-Scripting_data_downloads unit.
|
||||
#
|
||||
# Version: 1.0
|
||||
#
|
||||
# Date: 2017 10 05
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.0 First ABC units version
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -----------------------------------------------------
|
||||
#TOC> 1 Working with NCBI eUtils 40
|
||||
#TOC> 1.1 Task - fetchNCBItaxData() function 149
|
||||
#TOC> 2 Task solutions 156
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
|
||||
|
||||
# = 1 Working with NCBI eUtils ============================================
|
||||
|
||||
|
||||
|
||||
# To begin, we load some libraries with functions
|
||||
# we need...
|
||||
|
||||
# httr sends and receives information via the http
|
||||
# protocol, just like a Web browser.
|
||||
# Load httr; install it first if it is not present.
# requireNamespace() tests availability without attaching; library()
# then attaches the package and fails loudly if loading did not work.
if (!requireNamespace("httr", quietly = TRUE)) {
  install.packages("httr")
}
library(httr)
|
||||
|
||||
# NCBI's eUtils send information in XML format; we
|
||||
# need to be able to parse XML.
|
||||
# Load xml2; install it first if it is not present.
# requireNamespace() tests availability without attaching; library()
# then attaches the package and fails loudly if loading did not work.
if (!requireNamespace("xml2", quietly = TRUE)) {
  install.packages("xml2")
}
library(xml2)
|
||||
|
||||
|
||||
|
||||
# We will walk through the process with the RefSeq ID of yeast Mbp1.
refSeqID <- "NP_010227"

# All eUtils queries are built on this base URL ...
eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
|
||||
|
||||
|
||||
# Then we assemble an URL that will search for the unique,
# NCBI-internal identifier - the GI number - for our refSeqID ...
URL <- paste0(eUtilsBase,
              "esearch.fcgi?",   # ... using the esearch program
                                 # that finds an entry in an
                                 # NCBI database
              "db=protein",
              "&term=", refSeqID)
# Copy the URL and paste it into your browser to see
# what the response should look like.
URL
|
||||
|
||||
# To fetch and parse the response in R we pass the URL directly to
# read_xml() from the xml2 package.
myXML <- read_xml(URL)
myXML
|
||||
|
||||
# This is XML. We can take the response apart into its individual
# components with the as_list() function.

as_list(myXML)

# Note how the XML "tree" is represented as a list of lists of
# lists ...
# If we know exactly which element we are looking for, we can
# extract it from this structure:
as_list(myXML)[["IdList"]][["Id"]][[1]]
|
||||
|
||||
# But this is not very robust: it would break with the slightest
# change that the NCBI makes to their response - and the NCBI changes
# things A LOT!

# Somewhat more robust is to specify the type of element we want -
# the text contained in an <Id>...</Id> element - and use the XPath
# XML parsing language to retrieve it.

xml_find_all(myXML, "//Id")            # returns a "node set"

xml_text(xml_find_all(myXML, "//Id"))  # returns the contents of the node set
|
||||
|
||||
# We will need to do this a lot, so we write a function for it ...

node2text <- function(doc, tag) {
  # Extract the contents of all elements with a given tag from an
  # XML document.
  #   doc: an XML document as parsed by xml2.
  #   tag: the element name, without angle brackets.
  # Value: a character vector holding the text of every matching
  #        element (possibly empty).
  xpath <- paste0("//", tag)
  nodeSet <- xml_find_all(doc, xpath)
  xml_text(nodeSet)
}

# Using node2text() ...
(GID <- node2text(myXML, "Id"))
|
||||
|
||||
# The GI is the pivot for all our data requests at the NCBI.

# Let's first get the associated data for this GI.
URL <- paste0(eUtilsBase,
              "esummary.fcgi?",
              "db=protein",
              "&id=", GID,
              "&version=2.0")
(myXML <- read_xml(URL))

(taxID    <- node2text(myXML, "TaxId"))
(organism <- node2text(myXML, "Organism"))
|
||||
|
||||
# This forms the base of a function that gets taxonomy data
|
||||
# from an Entrez result. You can write this!
|
||||
|
||||
|
||||
# == 1.1 Task - fetchNCBItaxData() function ================================
|
||||
|
||||
# Task: write a function that takes as input a RefSeq ID, fetches the taxonomy
|
||||
# information, returns a list with taxID and organism, if the operation is
|
||||
# successful, or a list of length 0 if there is an error.
|
||||
|
||||
|
||||
# = 2 Task solutions ======================================================
|
||||
|
||||
# I have placed such a function into the dbUtilities script: look it up by
|
||||
# clicking on dbFetchNCBItaxData() in the Environment pane.
|
||||
|
||||
# Test:
|
||||
dbFetchNCBItaxData("NP_010227")  # defined in the dbUtilities script (see above)
|
||||
|
||||
|
||||
# [END]
|
Loading…
Reference in New Issue
Block a user