# RPR-eUtils_and_XML.R
#
# Purpose:  A Bioinformatics Course:
#              R code accompanying the RPR-Scripting_data_downloads unit.
#
# Version:  1.0
#
# Date:     2017  10  05
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
#           1.0    First ABC units version
#           0.1    First code copied from 2016 material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================


#TOC> ==========================================================================
#TOC>
#TOC>   Section  Title                                     Line
#TOC> -----------------------------------------------------
#TOC>   1        Working with NCBI eUtils                    40
#TOC>   1.1      Task - fetchNCBItaxData() function         149
#TOC>   2        Task solutions                             156
#TOC>
#TOC> ==========================================================================


# =    1  Working with NCBI eUtils  ============================================


# To begin, we load some libraries with functions
# we need...
#
# For each package we first check whether it is installed, and install it
# only if it is missing. We then attach it with library(), which stops with
# an error if the package still cannot be loaded (require() would merely
# return FALSE and let the script continue).

# httr sends and receives information via the http
# protocol, just like a Web browser.
if (!requireNamespace("httr", quietly = TRUE)) {
  install.packages("httr")
}
library(httr)

# NCBI's eUtils send information in XML format; we
# need to be able to parse XML.
if (!requireNamespace("xml2", quietly = TRUE)) {
  install.packages("xml2")
}
library(xml2)



# We will walk through the process with the refSeqID
# of yeast Mbp1
refSeqID <- "NP_010227"


# First we build a query URL...
eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"


# Then we assemble a URL that will search for, and get, the
# unique, NCBI internal identifier, the GI number,
# for our refSeqID...
URL <- paste0(eUtilsBase,
              "esearch.fcgi?",     # ...using the esearch program
                                   # that finds an entry in an
                                   # NCBI database
              "db=protein",
              "&term=", refSeqID)
# Copy the URL and paste it into your browser to see
# what the response should look like.
URL

# To fetch the response in R, we pass our URL to the function read_xml()
# from the xml2 package: it retrieves the document and parses it.
myXML <- read_xml(URL)
myXML

# This is XML. We can take the response apart into
# its individual components with the as_list() function.

as_list(myXML)

# Note how the XML "tree" is represented as a list of
# lists of lists ...
# If we know exactly what element we are looking for,
# we can extract it from this structure. (Since xml2 version 1.2.0, the
# list returned by as_list() for a document has the root element - here:
# eSearchResult - as its first level.)
as_list(myXML)[["eSearchResult"]][["IdList"]][["Id"]][[1]]

# But this is not very robust, it would break with the
# slightest change that the NCBI makes to their response
# and the NCBI changes things A LOT!

# Somewhat more robust is to specify the type of element
# we want - it's the text contained in an <Id>...</Id>
# element, and use the XPath XML parsing language to
# retrieve it.

xml_find_all(myXML, "//Id") # returns a "node set"

xml_text(xml_find_all(myXML, "//Id")) # returns the contents of the node set

# We will need to do this a lot, so we write a function
# for it...

node2text <- function(doc, tag) {
  # An extractor function for the contents of elements
  # between given tags in an XML response.
  # Parameters:
  #    doc   a parsed XML document (or node), as returned by read_xml()
  #    tag   chr  the name of the elements to extract, without brackets
  # Value:
  #    The contents of all matching elements, returned as
  #    a vector of strings.
  path <- paste0("//", tag)          # XPath: match "tag" anywhere in the tree
  nodes <- xml_find_all(doc, path)
  return(xml_text(nodes))
}

# using node2text() ...
(GID <- node2text(myXML, "Id"))

# The GI is the pivot for all our data requests at the
# NCBI.

# Let's first get the associated data for this GI
URL <- paste0(eUtilsBase,
              "esummary.fcgi?",
              "db=protein",
              "&id=",
              GID,
              "&version=2.0")
(myXML <- read_xml(URL))

(taxID <- node2text(myXML, "TaxId"))
(organism <- node2text(myXML, "Organism"))

# This forms the base of a function that gets taxonomy data
# from an Entrez result. You can write this!

# ==   1.1  Task - fetchNCBItaxData() function  ================================

# Task: write a function that takes as input a RefSeq ID, fetches the taxonomy
# information, and returns a list with taxID and organism if the operation is
# successful, or a list of length 0 if there is an error.


# =    2  Task solutions  ======================================================

# I have placed such a function into the dbUtilities script: look it up by
# clicking on  dbFetchNCBItaxData() in the Environment pane.
# NOTE: dbFetchNCBItaxData() is not defined in this file; it has to be loaded
# into the workspace (from the dbUtilities script) before the test below
# can run.

# Test:
dbFetchNCBItaxData("NP_010227")


# [END]