diff --git a/RPR-PROSITE_POST.R b/RPR-PROSITE_POST.R
new file mode 100644
index 0000000..4b66124
--- /dev/null
+++ b/RPR-PROSITE_POST.R
@@ -0,0 +1,154 @@
+# RPR-PROSITE_POST.R
+#
+# Purpose:  A Bioinformatics Course:
+#           R code accompanying the RPR-Scripting_data_downloads unit.
+#
+# Version:  1.0
+#
+# Date:     2017 10 05
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.0    First ABC units version
+#           0.1    First code copied from 2016 material.
+#
+#
+# TODO:
+#
+#
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+# going on. That's not how it works ...
+#
+# ==============================================================================
+
+#TOC> ==========================================================================
+#TOC>
+#TOC>   Section  Title                                           Line
+#TOC> ---------------------------------------------------------------
+#TOC>   1        Constructing a POST command from a Web query      40
+#TOC>   1.1      Task - fetchPrositeFeatures() function           134
+#TOC>   2        Task solutions                                   142
+#TOC>
+#TOC> ==========================================================================
+
+
+
+
+# =    1  Constructing a POST command from a Web query  ========================
+
+
+if (!require(httr)) {
+  install.packages("httr")
+  library(httr)
+}
+
+# We have reverse engineered the Web form for a ScanProsite request, and can now
+# construct a POST request. The command is similar to GET(), but we need an
+# explicit request body: a list of key/value pairs.
+
+UniProtID <- "P39678"
+
+URL <- "http://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
+
+response <- POST(URL,
+                 body = list(meta = "opt1",
+                             meta1_protein = "opt1",
+                             seq = UniProtID,
+                             skip = "on",
+                             output = "tabular"))
+
+# Send off this request, and you should have a response in a few
+# seconds. Let's check the status first:
+
+status_code(response)  # If this is not 200, something went wrong and it
+                       # makes no sense to continue. If this persists, ask
+                       # on the mailing list what to do.
+
+
+# The text content of the response is available with the
+# content() function:
+content(response, "text")
+
+# ... should show you the same as the page contents that
+# you have seen in the browser. Now we need to extract
+# the data from the page: we need regular expressions, but
+# only simple ones. First, we strsplit() the response into
+# individual lines, since each of our data elements is on
+# its own line. We simply split on the "\\n" newline character.
+
+lines <- unlist(strsplit(content(response, "text"), "\\n"))
+head(lines)
+
+# Now we define a query pattern for the lines we want:
+# we can use the UniProtID, bracketed by two "|" pipe
+# characters:
+
+patt <- sprintf("\\|%s\\|", UniProtID)
+
+# ... and select only the lines that match this
+# pattern:
+
+lines <- lines[grep(patt, lines)]
+lines
+
+# ... captures the four lines of output.
+
+# Now we break the lines apart into tokens: this is another application of
+# strsplit(), but this time we split either on "pipe" characters, "|" OR on tabs
+# "\t". Look at the regex "\\t|\\|" in the strsplit() call:
+
+unlist(strsplit(lines[1], "\\t|\\|"))
+
+# Its parts are (\\t)=tab (|)=or (\\|)=pipe. Both "t" and "|" need to be escaped
+# with a backslash. "t" has to be escaped because we want to match a tab (\t),
+# not the literal character "t". And "|" has to be escaped because we mean the
+# literal pipe character, not its metacharacter meaning OR. Thus sometimes the
+# backslash turns a special meaning off, and sometimes it turns a special
+# meaning on. Unfortunately there's no easy way to tell - you just need to
+# remember the characters - or have a reference handy. The metacharacters are
+# (){}[]^$?*+.|&- ... and some of them have different meanings depending on
+# where in the regex they are.
+
+# Let's put the tokens into named slots of a data frame:
+
+features <- data.frame()
+for (line in lines) {
+  tokens <- unlist(strsplit(line, "\\t|\\|"))
+  features <- rbind(features,
+                    data.frame(uID    = tokens[2],
+                               start  = as.numeric(tokens[4]),
+                               end    = as.numeric(tokens[5]),
+                               psID   = tokens[6],
+                               psName = tokens[7],
+                               stringsAsFactors = FALSE))
+}
+features
+
+# This forms the base of a function that collects the features automatically
+# from a ScanProsite result. You can write this!
+
+
+# ==   1.1  Task - fetchPrositeFeatures() function  ============================
+
+
+# Task: write a function that takes as input a UniProt ID, fetches the
+# features it contains from ScanProsite and returns a list as given above, or
+# a list of length 0 if there is an error.
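+
+# Before you look up the solution below, here is one possible minimal sketch of
+# such a function. It simply wraps the steps we worked through above; treating
+# any non-200 status code as "error" and returning an empty data frame (a list
+# of length 0) in that case are my assumptions, not necessarily those of the
+# reference solution.
+
+fetchPrositeFeatures <- function(ID) {
+  # Fetch feature annotations for UniProt ID "ID" from ScanProsite.
+  # Return a data frame of features, or a data frame of length 0 on error.
+  features <- data.frame()
+  response <- POST("http://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi",
+                   body = list(meta = "opt1",
+                               meta1_protein = "opt1",
+                               seq = ID,
+                               skip = "on",
+                               output = "tabular"))
+  if (status_code(response) == 200) {
+    lines <- unlist(strsplit(content(response, "text"), "\\n"))
+    lines <- lines[grep(sprintf("\\|%s\\|", ID), lines)]
+    for (line in lines) {
+      tokens <- unlist(strsplit(line, "\\t|\\|"))
+      features <- rbind(features,
+                        data.frame(uID    = tokens[2],
+                                   start  = as.numeric(tokens[4]),
+                                   end    = as.numeric(tokens[5]),
+                                   psID   = tokens[6],
+                                   psName = tokens[7],
+                                   stringsAsFactors = FALSE))
+    }
+  }
+  return(features)
+}
+
+# fetchPrositeFeatures("P39678")   # should reproduce the data frame above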
And "|" has to be escaped because we mean the +# literal pipe character, not its metacharacter meaning OR. Thus sometimes the +# backslash turns a special meaning off, and sometimes it turns a special +# meaning on. Unfortunately there's no easy way to tell - you just need to +# remember the characters - or have a reference handy. The metacharacters are +# (){}[]^$?*+.|&- ... and some of them have different meanings depending on +# where in the regex they are. + +# Let's put the tokens into named slots of a data frame + +features <- data.frame() +for (line in lines) { + tokens <- unlist(strsplit(line, "\\t|\\|")) + features <- rbind(features, + data.frame(uID = tokens[2], + start = as.numeric(tokens[4]), + end = as.numeric(tokens[5]), + psID = tokens[6], + psName = tokens[7], + stringsAsFactors = FALSE)) +} +features + +# This forms the base of a function that collects the features automatically +# from a PrositeScan result. You can write this! + + +# == 1.1 Task - fetchPrositeFeatures() function ============================ + + +# Task: write a function that takes as input a UniProt ID, fetches the +# features it contains from ScanProsite and returns a list as given above, or +# a list of length 0 if there is an error. + + +# = 2 Task solutions ====================================================== + + +# I have placed such a function into the dbUtilities script: look it up by +# clicking on dbFetchPrositeFeatures() in the Environment pane. + +# Test: +dbFetchPrositeFeatures("P39678") + + + + +# [END] diff --git a/RPR-UniProt_GET.R b/RPR-UniProt_GET.R new file mode 100644 index 0000000..95bce7c --- /dev/null +++ b/RPR-UniProt_GET.R @@ -0,0 +1,118 @@ +# RPR-UniProt_GET.R +# +# Purpose: A Bioinformatics Course: +# R code accompanying the RPR-Scripting_data_downloads unit. +# +# Version: 1.0 +# +# Date: 2017 10 05 +# Author: Boris Steipe (boris.steipe@utoronto.ca) +# +# Versions: +# 1.0 First ABC units version +# 0.1 First code copied from 2016 material. +# +# +# TODO: +# +# +# == DO NOT SIMPLY source() THIS FILE! ======================================= +# +# If there are portions you don't understand, use R's help system, Google for an +# answer, or ask your instructor. Don't continue if you don't understand what's +# going on. That's not how it works ... +# +# ============================================================================== + +#TOC> ========================================================================== +#TOC> +#TOC> Section Title Line +#TOC> ---------------------------------------------------- +#TOC> 1 UniProt files via GET 40 +#TOC> 1.1 Task - fetchUniProtSeq() function 98 +#TOC> 2 Task solutions 105 +#TOC> +#TOC> ========================================================================== + + + + +# = 1 UniProt files via GET =============================================== + + +# Perhaps the simplest example of scripted download is to retrieve a protein +# FASTA sequence from UniProt. All we need is to construct an URL with the +# correct UniProt ID. + +# An interface between R scripts and We=b servers is provided by the httr +# package. This sends and receives information via the http protocol, just like +# a Web browser. Since this is a short and simple request, the GET verb is the +# right tool: + +if (!require(httr)) { + install.packages("httr") + library(httr) +} + +# The UniProt ID for Mbp1 is ... + +UniProtID <- "P39678" + +# and the base URL to retrieve data is ... +# http://www.uniprot.org/uniprot/ . 
+
+
+# =    2  Task solutions  ======================================================
+
+
+# I have placed such a function into the dbUtilities script: look it up by
+# clicking on  dbFetchUniProtSeq()  in the Environment pane.
+
+# Test:
+dbFetchUniProtSeq("P39678")
+
+
+
+
+
+# [END]
diff --git a/RPR-eUtils_XML.R b/RPR-eUtils_XML.R
new file mode 100644
index 0000000..34ab151
--- /dev/null
+++ b/RPR-eUtils_XML.R
@@ -0,0 +1,165 @@
+# RPR-eUtils_XML.R
+#
+# Purpose:  A Bioinformatics Course:
+#           R code accompanying the RPR-Scripting_data_downloads unit.
+#
+# Version:  1.0
+#
+# Date:     2017 10 05
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.0    First ABC units version
+#           0.1    First code copied from 2016 material.
+#
+#
+# TODO:
+#
+#
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+# going on. That's not how it works ...
+#
+# ==============================================================================
+
+#TOC> ==========================================================================
+#TOC>
+#TOC>   Section  Title                                 Line
+#TOC> -----------------------------------------------------
+#TOC>   1        Working with NCBI eUtils                40
+#TOC>   1.1      Task - fetchNCBItaxData() function     149
+#TOC>   2        Task solutions                         156
+#TOC>
+#TOC> ==========================================================================
+
+
+
+
+# =    1  Working with NCBI eUtils  ============================================
+
+
+
+# To begin, we load some libraries with functions
+# we need...
+
+# httr sends and receives information via the http
+# protocol, just like a Web browser.
+if (!require(httr, quietly=TRUE)) {
+  install.packages("httr")
+  library(httr)
+}
+
+# NCBI's eUtils send information in XML format; we
+# need to be able to parse XML.
+if (!require(xml2)) {
+  install.packages("xml2")
+  library(xml2)
+}
+
+
+
+# We will walk through the process with the refSeqID
+# of yeast Mbp1.
+refSeqID <- "NP_010227"
+
+
+# First we build a query URL...
+eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+
+
+# Then we assemble a URL that will search for the
+# unique, NCBI-internal identifier, the GI number,
+# for our refSeqID...
+URL <- paste(eUtilsBase,
+             "esearch.fcgi?",     # ...using the esearch program
+                                  # that finds an entry in an
+                                  # NCBI database
+             "db=protein",
+             "&term=", refSeqID,
+             sep="")
+# Copy the URL and paste it into your browser to see
+# what the response should look like.
+URL
+
+# To fetch the response in R we could use GET() from the httr package, but
+# since the response is XML, the read_xml() function from the xml2 package
+# can fetch and parse it in a single step:
+myXML <- read_xml(URL)
+myXML
+
+# This is XML. We can take the response apart into
+# its individual components with the as_list() function.
+
+as_list(myXML)
+
+# Note how the XML "tree" is represented as a list of
+# lists of lists ...
+# If we know exactly what element we are looking for,
+# we can extract it from this structure:
+as_list(myXML)[["IdList"]][["Id"]][[1]]
+
+# But this is not very robust, it would break with the
+# slightest change that the NCBI makes to their response -
+# and the NCBI changes things A LOT!
+
+# Somewhat more robust is to specify the type of element
+# we want - it's the text contained in an <Id> ... </Id>
+# element - and use the XPath XML parsing language to
+# retrieve it.
+
+xml_find_all(myXML, "//Id")             # returns a "node set"
+
+xml_text(xml_find_all(myXML, "//Id"))   # returns the contents of the node set
+
+# We will need to do this a lot, so we write a function
+# for it...
+node2text <- function(doc, tag) {
+  # an extractor function for the contents of elements
+  # between given tags in an XML response.
+  # The contents of all matching elements are returned
+  # in a vector of strings.
+  path <- paste0("//", tag)
+  nodes <- xml_find_all(doc, path)
+  return(xml_text(nodes))
+}
+
+# using node2text() ...
+(GID <- node2text(myXML, "Id"))
+
+# The GI is the pivot for all our data requests at the
+# NCBI.
+
+# Let's first get the associated data for this GI
+URL <- paste0(eUtilsBase,
+              "esummary.fcgi?",
+              "db=protein",
+              "&id=",
+              GID,
+              "&version=2.0")
+(myXML <- read_xml(URL))
+
+(taxID <- node2text(myXML, "TaxId"))
+(organism <- node2text(myXML, "Organism"))
+
+# This forms the base of a function that gets taxonomy data
+# from an Entrez result. You can write this!
+
+
+# ==   1.1  Task - fetchNCBItaxData() function  ================================
+
+# Task: write a function that takes as input a RefSeq ID, fetches the taxonomy
+# information, returns a list with taxID and organism if the operation is
+# successful, or a list of length 0 if there is an error.
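+
+# One possible minimal sketch, before you look up the solution below. It
+# reuses eUtilsBase and node2text() from above; wrapping read_xml() in try()
+# to catch failed requests is my assumption about how errors should be
+# handled here.
+
+fetchNCBItaxData <- function(ID) {
+  # Fetch taxonomy data for RefSeq ID "ID" via NCBI eUtils.
+  # Return list(taxID, organism), or a list of length 0 on error.
+  result <- list()
+  URL <- paste0(eUtilsBase, "esearch.fcgi?db=protein&term=", ID)
+  myXML <- try(read_xml(URL), silent = TRUE)
+  if (inherits(myXML, "try-error")) { return(result) }
+  GID <- node2text(myXML, "Id")
+  if (length(GID) == 0) { return(result) }   # ID was not found
+  URL <- paste0(eUtilsBase,
+                "esummary.fcgi?db=protein&id=", GID[1], "&version=2.0")
+  myXML <- try(read_xml(URL), silent = TRUE)
+  if (inherits(myXML, "try-error")) { return(result) }
+  result$taxID    <- node2text(myXML, "TaxId")
+  result$organism <- node2text(myXML, "Organism")
+  return(result)
+}
+
+# fetchNCBItaxData("NP_010227")   # taxID and organism for yeast Mbp1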
+
+
+# =    2  Task solutions  ======================================================
+
+# I have placed such a function into the dbUtilities script: look it up by
+# clicking on  dbFetchNCBItaxData()  in the Environment pane.
+
+# Test:
+dbFetchNCBItaxData("NP_010227")
+
+
+# [END]