bch441-work-abc-units/RPR-eUtils_XML.R

# tocID <- "RPR-eUtils_XML.R"
#
# Purpose:  A Bioinformatics Course:
#              R code accompanying the RPR-Scripting_data_downloads unit.
#
# Version:  1.2
#
# Date:     2017-10  -  2020-09
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
#           1.2    2020 Updates
#           1.1    Change from require() to requireNamespace(),
#                      use <package>::<function>() idiom throughout
#           1.0    First ABC units version
#           0.1    First code copied from 2016 material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================


#TOC> ==========================================================================
#TOC> 
#TOC>   Section  Title                                       Line
#TOC> -----------------------------------------------------------
#TOC>   1        Working with NCBI eUtils                      42
#TOC>   1.1        Task - fetchNCBItaxData() function         144
#TOC>   2        Task solutions                               151
#TOC> 
#TOC> ==========================================================================


# =    1  Working with NCBI eUtils  ============================================


# To begin, we make sure the xml2 package is installed: it contains functions
# we need to retrieve and parse XML (and HTML) data. NCBI's eUtils send
# information in XML format so we need to be able to parse XML.
if (! requireNamespace("xml2", quietly=TRUE)) {
  install.packages("xml2")      # runs only if xml2 is not yet available
}
# Package information:
#  library(help = xml2)       # basic information
#  browseVignettes("xml2")    # available vignettes
#  data(package = "xml2")     # available datasets


# We will walk through the process with the refSeqID
# of yeast Mbp1
refSeqID <- "NP_010227"


# First we define the base URL that all eUtils queries share...
eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"


# Then we assemble a URL that will search for, and return, the
# unique, NCBI internal identifier
# for our refSeqID. paste0() concatenates its arguments without a separator.
URL <- paste0(eUtilsBase,
              "esearch.fcgi?",     # ...using the esearch program
                                   # that finds an entry in an
                                   # NCBI database
              "db=protein",        # ...searching the protein database
              "&term=",            # ...for the search term:
              refSeqID)            #    our refSeqID
# Copy the URL and paste it into your browser to see
# what the response should look like.
URL

# To fetch a response in R, we use the function read_xml()
# with our URL as its argument. (The outer parentheses print the result.)
( myXML <- xml2::read_xml(URL) )

# This is XML. We can take the response apart into
# its individual components with the as_list() function.

xml2::as_list(myXML)

# Note how the XML "tree" is represented as a list of
# lists of lists ...
# If we know exactly what element we are looking for,
# we can extract it from this structure:
xml2::as_list(myXML)[["eSearchResult"]][["IdList"]][["Id"]][[1]]

# But this is not very robust: it would break with the
# slightest change that the NCBI makes to their data format -
# and the NCBI changes things A LOT!

# Somewhat more robust is to specify the type of element
# we want - it's the text contained in an <Id>...</Id>
# element - and use the XPath XML parsing language to
# retrieve it. The XPath "//Id" matches <Id> elements anywhere in the tree.

xml2::xml_find_all(myXML, "//Id") # returns a "node set"

xml2::xml_text(xml2::xml_find_all(myXML, "//Id")) # returns the contents
                                                  # of the node set

# We will need to do this more than once, so we write a function
# for it...
node2text <- function(doc, tag) {
  # Purpose: return the text contents of every element named <tag>
  #          in a parsed XML response.
  # Parameters:
  #    doc   XML document, as returned by xml2::read_xml()
  #    tag   chr   element name, without angle brackets
  # Value:   chr   one string per matching element
  xml2::xml_text(xml2::xml_find_all(doc, paste0("//", tag)))
}

# using node2text() ... GID holds the <Id> text of the esearch response
(GID <- node2text(myXML, "Id"))

# The GI (the NCBI internal identifier we just retrieved) is the pivot for
# data requests at the NCBI.

# Let's first get the associated data for this GI (URL, myXML are overwritten)
URL <- paste0(eUtilsBase,
              "esummary.fcgi?",   # ...using the esummary program
              "db=protein",
              "&id=",
              GID,                # assumes a single Id: paste0() is vectorized
              "&version=2.0")
(myXML <- xml2::read_xml(URL))

(taxID <- node2text(myXML, "TaxId"))
(organism <- node2text(myXML, "Organism"))

#  This forms the base of a function that gets taxonomy data
#  from an Entrez result. You can write this!


# ==   1.1  Task - fetchNCBItaxData() function  ================================

# Task: write a function that takes a RefSeq ID as input, fetches the taxonomy
# information, and returns a list with taxID and organism if the operation is
# successful, or a list of length 0 if there is an error.


# =    2  Task solutions  ======================================================

# I have placed such a function into the dbUtilities script: look it up by
# clicking on  dbFetchNCBItaxData() in the Environment pane.

# Test: (requires dbFetchNCBItaxData() to be loaded; it is not defined here)
dbFetchNCBItaxData("XP_001837394")


# [END]
TOC, and filename typo 2020-09-25 02:03:56 +00:00			`# tocID <- "RPR-eUtils_XML.R"`
2020 updates - deactivate for maintenance 2020-09-18 11:56:30 +00:00			`#`
Add code for scripting data download unit 2017-10-06 12:49:43 +00:00			`# Purpose: A Bioinformatics Course:`
			`# R code accompanying the RPR-Scripting_data_downloads unit.`
			`#`
minor maintenance 2020-09-25 02:01:08 +00:00			`# Version: 1.2`
Add code for scripting data download unit 2017-10-06 12:49:43 +00:00			`#`
minor maintenance 2020-09-25 02:01:08 +00:00			`# Date: 2017-10 - 2020-09`
Add code for scripting data download unit 2017-10-06 12:49:43 +00:00			`# Author: Boris Steipe (boris.steipe@utoronto.ca)`
			`#`
			`# Versions:`
minor maintenance 2020-09-25 02:01:08 +00:00			`# 1.2 2020 Updates`
Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`# 1.1 Change from require() to requireNamespace(),`
			`# use <package>::<function>() idiom throughout`
Add code for scripting data download unit 2017-10-06 12:49:43 +00:00			`# 1.0 First ABC units version`
			`# 0.1 First code copied from 2016 material.`
			`#`
			`#`
			`# TODO:`
			`#`
			`#`
			`# == DO NOT SIMPLY source() THIS FILE! =======================================`
			`#`
			`# If there are portions you don't understand, use R's help system, Google for an`
			`# answer, or ask your instructor. Don't continue if you don't understand what's`
			`# going on. That's not how it works ...`
			`#`
			`# ==============================================================================`
Added package information code after every library() call. 2017-10-29 03:05:53 +00:00

Add code for scripting data download unit 2017-10-06 12:49:43 +00:00			`#TOC> ==========================================================================`
TOC, and filename typo 2020-09-25 02:03:56 +00:00			`#TOC>`
Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`#TOC> Section Title Line`
			`#TOC> -----------------------------------------------------------`
TOC, and filename typo 2020-09-25 02:03:56 +00:00			`#TOC> 1 Working with NCBI eUtils 42`
Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`#TOC> 1.1 Task - fetchNCBItaxData() function 144`
			`#TOC> 2 Task solutions 151`
TOC, and filename typo 2020-09-25 02:03:56 +00:00			`#TOC>`
Add code for scripting data download unit 2017-10-06 12:49:43 +00:00			`#TOC> ==========================================================================`


			`# = 1 Working with NCBI eUtils ============================================`


Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`# To begin, we load the xml2 package that contains functions`
			`# we need to receive and parse html data. NCBI's eUtils send information in`
			`# XML format so we need to be able to parse XML.`
			`if (! requireNamespace("xml2", quietly=TRUE)) {`
Add code for scripting data download unit 2017-10-06 12:49:43 +00:00			`install.packages("xml2")`
			`}`
Added package information code after every library() call. 2017-10-29 03:05:53 +00:00			`# Package information:`
			`# library(help = xml2) # basic information`
			`# browseVignettes("xml2") # available vignettes`
			`# data(package = "xml2") # available datasets`
Add code for scripting data download unit 2017-10-06 12:49:43 +00:00


			`# We will walk through the process with the refSeqID`
			`# of yeast Mbp1`
			`refSeqID <- "NP_010227"`


			`# First we build a query URL...`
			`eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"`


			`# Then we assemble an URL that will search for get the`
minor maintenance 2020-09-25 02:01:08 +00:00			`# unique, NCBI internal identifier,`
Add code for scripting data download unit 2017-10-06 12:49:43 +00:00			`# for our refSeqID...`
			`URL <- paste(eUtilsBase,`
			`"esearch.fcgi?", # ...using the esearch program`
			`# that finds an entry in an`
			`# NCBI database`
			`"db=protein",`
			`"&term=", refSeqID,`
			`sep="")`
			`# Copy the URL and paste it into your browser to see`
			`# what the response should look like.`
			`URL`

Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`# To fetch a response in R, we use the function read_xml()`
Add code for scripting data download unit 2017-10-06 12:49:43 +00:00			`# with our URL as its argument.`
minor maintenance 2020-09-25 02:01:08 +00:00			`( myXML <- xml2::read_xml(URL) )`
Add code for scripting data download unit 2017-10-06 12:49:43 +00:00
			`# This is XML. We can take the response apart into`
minor maintenance 2020-09-25 02:01:08 +00:00			`# its individual components with the as_list() function.`
Add code for scripting data download unit 2017-10-06 12:49:43 +00:00
Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`xml2::as_list(myXML)`
Add code for scripting data download unit 2017-10-06 12:49:43 +00:00
			`# Note how the XML "tree" is represented as a list of`
			`# lists of lists ...`
minor maintenance 2020-09-25 02:01:08 +00:00			`# If we know exactly what element we are looking for,`
Add code for scripting data download unit 2017-10-06 12:49:43 +00:00			`# we can extract it from this structure:`
Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`xml2::as_list(myXML)[["eSearchResult"]][["IdList"]][["Id"]][[1]]`
Add code for scripting data download unit 2017-10-06 12:49:43 +00:00
			`# But this is not very robust, it would break with the`
Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`# slightest change that the NCBI makes to their data format -`
Add code for scripting data download unit 2017-10-06 12:49:43 +00:00			`# and the NCBI changes things A LOT!`

			`# Somewhat more robust is to specify the type of element`
minor maintenance 2020-09-25 02:01:08 +00:00			`# we want - its the text contained in an <Id>...</Id>`
Add code for scripting data download unit 2017-10-06 12:49:43 +00:00			`# element, and use the XPath XML parsing language to`
			`# retrieve it.`

Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`xml2::xml_find_all(myXML, "//Id") # returns a "node set"`
Add code for scripting data download unit 2017-10-06 12:49:43 +00:00
Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`xml2::xml_text(xml2::xml_find_all(myXML, "//Id")) # returns the contents`
			`# of the node set`
Add code for scripting data download unit 2017-10-06 12:49:43 +00:00
Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`# We will need to do this more than once, so we write a function`
Add code for scripting data download unit 2017-10-06 12:49:43 +00:00			`# for it...`
			`node2text <- function(doc, tag) {`
			`# an extractor function for the contents of elements`
			`# between given tags in an XML response.`
			`# Contents of all matching elements is returned in`
			`# a vector of strings.`
			`path <- paste0("//", tag)`
Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`nodes <- xml2::xml_find_all(doc, path)`
			`return(xml2::xml_text(nodes))`
Add code for scripting data download unit 2017-10-06 12:49:43 +00:00			`}`

			`# using node2text() ...`
			`(GID <- node2text(myXML, "Id"))`

minor maintenance 2020-09-25 02:01:08 +00:00			`# The GI is the pivot for data requests at the`
Add code for scripting data download unit 2017-10-06 12:49:43 +00:00			`# NCBI.`

			`# Let's first get the associated data for this GI`
			`URL <- paste0(eUtilsBase,`
			`"esummary.fcgi?",`
			`"db=protein",`
			`"&id=",`
			`GID,`
			`"&version=2.0")`
Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`(myXML <- xml2::read_xml(URL))`
Add code for scripting data download unit 2017-10-06 12:49:43 +00:00
			`(taxID <- node2text(myXML, "TaxId"))`
			`(organism <- node2text(myXML, "Organism"))`

			`# This forms the base of a function that gets taxonomy data`
			`# from an Entrez result. You can write this!`


			`# == 1.1 Task - fetchNCBItaxData() function ================================`

			`# Task: write a function that takes as input a RefSeq ID, fetches the taxonomy`
			`# information, returns a list with taxID and organism, if the operation is`
			`# successful, or a list of length 0 if there is an error.`


			`# = 2 Task solutions ======================================================`

			`# I have placed such a function into the dbUtilities script: look it up by`
			`# clicking on dbFetchNCBItaxData() in the Environment pane.`

			`# Test:`
minor maintenance 2020-09-25 02:01:08 +00:00			`dbFetchNCBItaxData("XP_001837394")`
Add code for scripting data download unit 2017-10-06 12:49:43 +00:00

			`# [END]`