# tocID <- "RPR-eUtils_and_XML.R" # # ---------------------------------------------------------------------------- # # PATIENCE ... # # Do not yet work wih this code. Updates in progress. Thank you. # # boris.steipe@utoronto.ca # # ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the RPR-Scripting_data_downloads unit. # # Version: 1.1 # # Date: 2017 10 05 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: # 1.1 Change from require() to requireNamespace(), # use ::() idiom throughout # 1.0 First ABC units version # 0.1 First code copied from 2016 material. # # # TODO: # # # == DO NOT SIMPLY source() THIS FILE! ======================================= # # If there are portions you don't understand, use R's help system, Google for an # answer, or ask your instructor. Don't continue if you don't understand what's # going on. That's not how it works ... # # ============================================================================== #TOC> ========================================================================== #TOC> #TOC> Section Title Line #TOC> ----------------------------------------------------------- #TOC> 1 Working with NCBI eUtils 41 #TOC> 1.1 Task - fetchNCBItaxData() function 144 #TOC> 2 Task solutions 151 #TOC> #TOC> ========================================================================== # = 1 Working with NCBI eUtils ============================================ # To begin, we load the xml2 package that contains functions # we need to receive and parse html data. NCBI's eUtils send information in # XML format so we need to be able to parse XML. if (! requireNamespace("xml2", quietly=TRUE)) { install.packages("xml2") } # Package information: # library(help = xml2) # basic information # browseVignettes("xml2") # available vignettes # data(package = "xml2") # available datasets # We will walk through the process with the refSeqID # of yeast Mbp1 refSeqID <- "NP_010227" # First we build a query URL... eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" # Then we assemble an URL that will search for get the # unique, NCBI internal identifier, the GI number, # for our refSeqID... URL <- paste(eUtilsBase, "esearch.fcgi?", # ...using the esearch program # that finds an entry in an # NCBI database "db=protein", "&term=", refSeqID, sep="") # Copy the URL and paste it into your browser to see # what the response should look like. URL # To fetch a response in R, we use the function read_xml() # with our URL as its argument. (myXML <- xml2::read_xml(URL)) # This is XML. We can take the response apart into # its indvidual components with the as_list() function. xml2::as_list(myXML) # Note how the XML "tree" is represented as a list of # lists of lists ... # If we know exactly what elelement we are looking for, # we can extract it from this structure: xml2::as_list(myXML)[["eSearchResult"]][["IdList"]][["Id"]][[1]] # But this is not very robust, it would break with the # slightest change that the NCBI makes to their data format - # and the NCBI changes things A LOT! # Somewhat more robust is to specify the type of element # we want - its the text contained in an ... # element, and use the XPath XML parsing language to # retrieve it. xml2::xml_find_all(myXML, "//Id") # returns a "node set" xml2::xml_text(xml2::xml_find_all(myXML, "//Id")) # returns the contents # of the node set # We will need to do this more than once, so we write a function # for it... 
# We will need to do this more than once, so we write a function
# for it...

node2text <- function(doc, tag) { # an extractor function for the contents of
                                  # elements between given tags in an XML
                                  # response. The contents of all matching
                                  # elements are returned in a vector of
                                  # strings.
  path <- paste0("//", tag)
  nodes <- xml2::xml_find_all(doc, path)
  return(xml2::xml_text(nodes))
}

# using node2text() ...
(GID <- node2text(myXML, "Id"))

# The GI is the pivot for all our data requests at the
# NCBI.

# Let's first get the associated data for this GI
URL <- paste0(eUtilsBase,
              "esummary.fcgi?",
              "db=protein",
              "&id=", GID,
              "&version=2.0")
(myXML <- xml2::read_xml(URL))

(taxID    <- node2text(myXML, "TaxId"))
(organism <- node2text(myXML, "Organism"))

# This forms the base of a function that gets taxonomy data
# from an Entrez result. You can write this!


# ==   1.1  Task - fetchNCBItaxData() function  ===============================

# Task: write a function that takes a RefSeq ID as input, fetches the taxonomy
# information, and returns a list containing taxID and organism if the
# operation is successful, or a list of length 0 if there is an error.


# =    2  Task solutions  =====================================================

# I have placed such a function into the dbUtilities script: look it up by
# clicking on dbFetchNCBItaxData() in the Environment pane.

# Test:
dbFetchNCBItaxData("NP_010227")
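# For comparison, here is a minimal sketch of such a function. This is NOT the
# dbFetchNCBItaxData() solution in the dbUtilities script, just one possible
# way to structure it. It assumes that eUtilsBase and node2text(), defined
# above, are available in the workspace, and it uses try() for error handling.

fetchNCBItaxData <- function(refSeqID) {
  # Fetch taxonomy data for a RefSeq protein ID from NCBI eUtils.
  # Value: a list with elements taxID and organism on success,
  #        a list of length 0 otherwise.
  result <- list()

  # Step 1: esearch - find the GI for the RefSeq ID
  URL <- paste0(eUtilsBase, "esearch.fcgi?db=protein&term=", refSeqID)
  myXML <- try(xml2::read_xml(URL), silent = TRUE)
  if (inherits(myXML, "try-error")) { return(result) }
  GID <- node2text(myXML, "Id")
  if (length(GID) == 0) { return(result) }

  # Step 2: esummary - fetch the record summary for that GI
  URL <- paste0(eUtilsBase, "esummary.fcgi?db=protein&id=", GID[1],
                "&version=2.0")
  myXML <- try(xml2::read_xml(URL), silent = TRUE)
  if (inherits(myXML, "try-error")) { return(result) }

  # Step 3: extract the taxonomy information
  taxID    <- node2text(myXML, "TaxId")
  organism <- node2text(myXML, "Organism")
  if (length(taxID) == 0 || length(organism) == 0) { return(result) }

  result$taxID    <- taxID[1]
  result$organism <- organism[1]
  return(result)
}

# Try it:
#   fetchNCBItaxData("NP_010227")


# [END]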