2020-09-25 02:03:56 +00:00
|
|
|
# tocID <- "RPR-eUtils_XML.R"
|
2020-09-18 11:56:30 +00:00
|
|
|
#
|
2017-10-06 12:49:43 +00:00
|
|
|
# Purpose: A Bioinformatics Course:
|
|
|
|
# R code accompanying the RPR-Scripting_data_downloads unit.
|
|
|
|
#
|
2020-09-25 02:01:08 +00:00
|
|
|
# Version: 1.2
|
2017-10-06 12:49:43 +00:00
|
|
|
#
|
2020-09-25 02:01:08 +00:00
|
|
|
# Date: 2017-10 - 2020-09
|
2017-10-06 12:49:43 +00:00
|
|
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
|
|
|
#
|
|
|
|
# Versions:
|
2020-09-25 02:01:08 +00:00
|
|
|
# 1.2 2020 Updates
|
2019-01-08 07:11:25 +00:00
|
|
|
# 1.1 Change from require() to requireNamespace(),
|
|
|
|
# use <package>::<function>() idiom throughout
|
2017-10-06 12:49:43 +00:00
|
|
|
# 1.0 First ABC units version
|
|
|
|
# 0.1 First code copied from 2016 material.
|
|
|
|
#
|
|
|
|
#
|
|
|
|
# TODO:
|
|
|
|
#
|
|
|
|
#
|
|
|
|
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
|
|
|
#
|
|
|
|
# If there are portions you don't understand, use R's help system, Google for an
|
|
|
|
# answer, or ask your instructor. Don't continue if you don't understand what's
|
|
|
|
# going on. That's not how it works ...
|
|
|
|
#
|
|
|
|
# ==============================================================================
|
2017-10-29 03:05:53 +00:00
|
|
|
|
|
|
|
|
2017-10-06 12:49:43 +00:00
|
|
|
#TOC> ==========================================================================
|
2020-09-25 02:03:56 +00:00
|
|
|
#TOC>
|
2019-01-08 07:11:25 +00:00
|
|
|
#TOC> Section Title Line
|
|
|
|
#TOC> -----------------------------------------------------------
|
2020-09-25 02:03:56 +00:00
|
|
|
#TOC> 1 Working with NCBI eUtils 42
|
2019-01-08 07:11:25 +00:00
|
|
|
#TOC> 1.1 Task - fetchNCBItaxData() function 144
|
|
|
|
#TOC> 2 Task solutions 151
|
2020-09-25 02:03:56 +00:00
|
|
|
#TOC>
|
2017-10-06 12:49:43 +00:00
|
|
|
#TOC> ==========================================================================
|
|
|
|
|
|
|
|
|
|
|
|
# = 1 Working with NCBI eUtils ============================================
|
|
|
|
|
|
|
|
|
2019-01-08 07:11:25 +00:00
|
|
|
# To begin, we load the xml2 package that contains functions
|
|
|
|
# we need to receive and parse HTML data. NCBI's eUtils send information in
|
|
|
|
# XML format so we need to be able to parse XML.
|
|
|
|
# Install xml2 only when it is missing. requireNamespace() checks that the
# package can be loaded without attaching it, which fits the
# <package>::<function>() idiom used throughout this script.
if (!requireNamespace("xml2", quietly = TRUE)) {
  install.packages("xml2")
}
|
2017-10-29 03:05:53 +00:00
|
|
|
# Package information:
|
|
|
|
# library(help = xml2) # basic information
|
|
|
|
# browseVignettes("xml2") # available vignettes
|
|
|
|
# data(package = "xml2") # available datasets
|
2017-10-06 12:49:43 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# We will walk through the process with the refSeqID
|
|
|
|
# of yeast Mbp1
|
|
|
|
refSeqID <- "NP_010227"

# First we build a query URL...
eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

# Then we assemble a URL that will search for, and get, the
# unique, NCBI internal identifier,
# for our refSeqID...
# (paste0() concatenates without a separator, the same way the esummary
#  URL is assembled further down in this script.)
URL <- paste0(eUtilsBase,
              "esearch.fcgi?",     # ...using the esearch program
                                   # that finds an entry in an
                                   # NCBI database
              "db=protein",        # ...in the protein database
              "&term=", refSeqID)  # ...with our refSeqID as search term
|
|
|
|
# Copy the URL and paste it into your browser to see
|
|
|
|
# what the response should look like.
|
|
|
|
URL
|
|
|
|
|
2019-01-08 07:11:25 +00:00
|
|
|
# To fetch a response in R, we use the function read_xml()
|
2017-10-06 12:49:43 +00:00
|
|
|
# with our URL as its argument.
|
2020-09-25 02:01:08 +00:00
|
|
|
( myXML <- xml2::read_xml(URL) )
|
2017-10-06 12:49:43 +00:00
|
|
|
|
|
|
|
# This is XML. We can take the response apart into
|
2020-09-25 02:01:08 +00:00
|
|
|
# its individual components with the as_list() function.
|
2017-10-06 12:49:43 +00:00
|
|
|
|
2019-01-08 07:11:25 +00:00
|
|
|
xml2::as_list(myXML)
|
2017-10-06 12:49:43 +00:00
|
|
|
|
|
|
|
# Note how the XML "tree" is represented as a list of
|
|
|
|
# lists of lists ...
|
2020-09-25 02:01:08 +00:00
|
|
|
# If we know exactly what element we are looking for,
|
2017-10-06 12:49:43 +00:00
|
|
|
# we can extract it from this structure:
|
2019-01-08 07:11:25 +00:00
|
|
|
xml2::as_list(myXML)[["eSearchResult"]][["IdList"]][["Id"]][[1]]
|
2017-10-06 12:49:43 +00:00
|
|
|
|
|
|
|
# But this is not very robust, it would break with the
|
2019-01-08 07:11:25 +00:00
|
|
|
# slightest change that the NCBI makes to their data format -
|
2017-10-06 12:49:43 +00:00
|
|
|
# and the NCBI changes things A LOT!
|
|
|
|
|
|
|
|
# Somewhat more robust is to specify the type of element
|
2020-09-25 02:01:08 +00:00
|
|
|
# we want - it's the text contained in an <Id>...</Id>
|
2017-10-06 12:49:43 +00:00
|
|
|
# element, and use the XPath XML parsing language to
|
|
|
|
# retrieve it.
|
|
|
|
|
2019-01-08 07:11:25 +00:00
|
|
|
xml2::xml_find_all(myXML, "//Id") # returns a "node set"
|
2017-10-06 12:49:43 +00:00
|
|
|
|
2019-01-08 07:11:25 +00:00
|
|
|
xml2::xml_text(xml2::xml_find_all(myXML, "//Id")) # returns the contents
|
|
|
|
# of the node set
|
2017-10-06 12:49:43 +00:00
|
|
|
|
2019-01-08 07:11:25 +00:00
|
|
|
# We will need to do this more than once, so we write a function
|
2017-10-06 12:49:43 +00:00
|
|
|
# for it...
|
|
|
|
node2text <- function(doc, tag) {
  # Extractor for the text content of XML elements.
  # Parameters:
  #    doc   the parsed XML response to search (in this script: the
  #          value returned by xml2::read_xml())
  #    tag   chr  name of the element whose contents are wanted, e.g. "Id"
  # Value:
  #    A vector of strings: the contents of all matching elements.

  # "//<tag>" is an XPath expression that matches <tag> elements at any
  # depth of the document, not only directly below the root.
  xml2::xml_text(xml2::xml_find_all(doc, paste0("//", tag)))
}
|
|
|
|
|
|
|
|
# using node2text() ...
|
|
|
|
(GID <- node2text(myXML, "Id"))
|
|
|
|
|
2020-09-25 02:01:08 +00:00
|
|
|
# The GI is the pivot for data requests at the
|
2017-10-06 12:49:43 +00:00
|
|
|
# NCBI.
|
|
|
|
|
|
|
|
# Let's first get the associated data for this GI
|
|
|
|
# Assemble the esummary request: same base URL, the protein database,
# the GI we just retrieved as the id, and version 2.0 of the response.
URL <- paste0(eUtilsBase,
              "esummary.fcgi?db=protein&id=", GID,
              "&version=2.0")

# Fetch the summary document; the outer parentheses print the result.
(myXML <- xml2::read_xml(URL))

# Pull the taxonomy ID and the organism name out of the response.
(taxID <- node2text(myXML, "TaxId"))
(organism <- node2text(myXML, "Organism"))
|
|
|
|
|
|
|
|
# This forms the base of a function that gets taxonomy data
|
|
|
|
# from an Entrez result. You can write this!
|
|
|
|
|
|
|
|
|
|
|
|
# == 1.1 Task - fetchNCBItaxData() function ================================
|
|
|
|
|
|
|
|
# Task: write a function that takes as input a RefSeq ID, fetches the taxonomy
|
|
|
|
# information, returns a list with taxID and organism, if the operation is
|
|
|
|
# successful, or a list of length 0 if there is an error.
|
|
|
|
|
|
|
|
|
|
|
|
# = 2 Task solutions ======================================================
|
|
|
|
|
|
|
|
# I have placed such a function into the dbUtilities script: look it up by
|
|
|
|
# clicking on dbFetchNCBItaxData() in the Environment pane.
|
|
|
|
|
|
|
|
# Test:
|
2020-09-25 02:01:08 +00:00
|
|
|
dbFetchNCBItaxData("XP_001837394")
|
2017-10-06 12:49:43 +00:00
|
|
|
|
|
|
|
|
|
|
|
# [END]
|