166 lines
		
	
	
		
			4.7 KiB
		
	
	
	
		
			R
		
	
	
	
	
	
			
		
		
	
	
			166 lines
		
	
	
		
			4.7 KiB
		
	
	
	
		
			R
		
	
	
	
	
	
| # RPR-eUtils_and_XML.R
 | |
| #
 | |
| # Purpose:  A Bioinformatics Course:
 | |
| #              R code accompanying the RPR-Scripting_data_downloads unit.
 | |
| #
 | |
| # Version:  1.0
 | |
| #
 | |
| # Date:     2017  10  05
 | |
| # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | |
| #
 | |
| # Versions:
 | |
| #           1.0    First ABC units version
 | |
| #           0.1    First code copied from 2016 material.
 | |
| #
 | |
| #
 | |
| # TODO:
 | |
| #
 | |
| #
 | |
| # == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | |
| #
 | |
| # If there are portions you don't understand, use R's help system, Google for an
 | |
| # answer, or ask your instructor. Don't continue if you don't understand what's
 | |
| # going on. That's not how it works ...
 | |
| #
 | |
| # ==============================================================================
 | |
|  
 | |
| #TOC> ==========================================================================
 | |
| #TOC> 
 | |
| #TOC>   Section  Title                                 Line
 | |
| #TOC> -----------------------------------------------------
 | |
| #TOC>   1        Working with NCBI eUtils                40
 | |
| #TOC>   1.1      Task - fetchNCBItaxData() function     149
 | |
| #TOC>   2        Task solutions                         156
 | |
| #TOC> 
 | |
| #TOC> ==========================================================================
 | |
|  
 | |
| 
 | |
| 
 | |
| 
 | |
| # =    1  Working with NCBI eUtils  ============================================
 | |
| 
 | |
| 
 | |
| 
 | |
# To begin, we make sure the packages we need are
# installed, and then attach them ...
#
# requireNamespace() only checks whether a package is available (it does not
# attach it), so it is used for the "is it installed?" test. Attaching is left
# to library(), which stops with an error if the package is still missing
# after the installation attempt instead of silently returning FALSE.

# httr sends and receives information via the http
# protocol, just like a Web browser.
if (!requireNamespace("httr", quietly = TRUE)) {
  install.packages("httr")
}
library(httr)

# NCBI's eUtils send information in XML format; we
# need to be able to parse XML.
if (!requireNamespace("xml2", quietly = TRUE)) {
  install.packages("xml2")
}
library(xml2)
 | |
| 
 | |
| 
 | |
| 
 | |
# We will walk through the process with the refSeqID
# of yeast Mbp1
refSeqID <- "NP_010227"


# First we define the base URL that our queries are built on...
eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"


# Then we assemble a URL that will search for the
# unique, NCBI internal identifier, the GI number,
# for our refSeqID...
URL <- paste0(eUtilsBase,
              "esearch.fcgi?",     # ...using the esearch program
                                   # that finds an entry in an
                                   # NCBI database
              "db=protein",
              "&term=", refSeqID)
# Copy the URL and paste it into your browser to see
# what the response should look like.
URL
 | |
| 
 | |
# To fetch a response in R, we pass our URL to the function read_xml() from
# the xml2 package: it retrieves the document and parses it in one step.
myXML <- read_xml(URL)
myXML

# This is XML. We can take the response apart into
# its individual components with the as_list() function.

as_list(myXML)

# Note how the XML "tree" is represented as a list of
# lists of lists ...
# If we know exactly what element we are looking for,
# we can extract it from this structure. Note that as_list() keeps the
# root element of the document (here: <eSearchResult>) as the top level
# of the list (xml2 version 1.2.0 and later), so the path starts there:
as_list(myXML)[["eSearchResult"]][["IdList"]][["Id"]][[1]]

# But this is not very robust, it would break with the
# slightest change that the NCBI makes to their response
# and the NCBI changes things A LOT!

# Somewhat more robust is to specify the type of element
# we want - it's the text contained in an <Id>...</Id>
# element - and use the XPath XML parsing language to
# retrieve it.

xml_find_all(myXML, "//Id") # returns a "node set"

xml_text(xml_find_all(myXML, "//Id")) # returns the contents of the node set
 | |
| 
 | |
# We will need to do this a lot, so we wrap it into a function ...
node2text <- function(doc, tag) {
  # Extract the text contents of XML elements by tag name.
  #   doc   an XML document, e.g. the value returned by read_xml()
  #   tag   the name of the element(s) to look for; the XPath expression
  #         "//<tag>" matches them at any depth of the document
  # Value:  a vector of strings holding the contents of all matching
  #         elements.
  xml_text(xml_find_all(doc, paste0("//", tag)))
}
 | |
| 
 | |
# Applying node2text() to the esearch response gives us the GI number:
GID <- node2text(myXML, "Id")
GID

# The GI is the pivot for all our data requests at the
# NCBI.

# As a first request, we ask the esummary program for the data that is
# associated with this GI ...
URL <- paste0(eUtilsBase, "esummary.fcgi?", "db=protein", "&id=", GID,
              "&version=2.0")
myXML <- read_xml(URL)
myXML

# ... and pull the taxonomy ID and the organism name out of the response.
taxID <- node2text(myXML, "TaxId")
taxID
organism <- node2text(myXML, "Organism")
organism
 | |
| 
 | |
| #  This forms the base of a function that gets taxonomy data
 | |
| #  from an Entrez result. You can write this!
 | |
| 
 | |
| 
 | |
| # ==   1.1  Task - fetchNCBItaxData() function  ================================
 | |
| 
 | |
| # Task: write a function that takes as input a RefSeq ID, fetches the taxonomy
 | |
| # information, returns a list with taxID and organism, if the operation is
 | |
| # successful, or a list of length 0 if there is an error.
 | |
| 
 | |
| 
 | |
| # =    2  Task solutions  ======================================================
 | |
| 
 | |
| # A sample solution, dbFetchNCBItaxData(), has been placed into the dbUtilities
 | |
| # script: look it up by clicking on its name in the Environment pane.
 | |
| 
 | |
| # Test (presumably requires that the dbUtilities script has been loaded):
 | |
| dbFetchNCBItaxData("NP_010227")
 | |
| 
 | |
| 
 | |
| # [END]
 |