add httr and xml2 package, add scripted data download functions

This commit is contained in:
hyginn 2017-10-06 08:51:12 -04:00
parent b33feed50a
commit 27dd73664b

View File

@ -9,12 +9,24 @@
# ====== PACKAGES ============================================================== # ====== PACKAGES ==============================================================
if (! require("jsonlite", quietly = TRUE)) { if (! require(jsonlite, quietly = TRUE)) {
install.packages("jsonlite") install.packages("jsonlite")
library(jsonlite) library(jsonlite)
} }
if (!require(httr, quietly = TRUE)) {
install.packages("httr")
library(httr)
}
if (!require(xml2, quietly = TRUE)) {
install.packages("xml2")
library(xml2)
}
# ====== FUNCTIONS ============================================================= # ====== FUNCTIONS =============================================================
@ -204,4 +216,118 @@ dbAddAnnotation <- function(db, jsonDF) {
} }
dbFetchUniProtSeq <- function(ID) {
# Fetch a protein sequence from UniProt.
# Parameters:
# ID char a UniProt ID (accession number)
# Value:
# char the sequence
# If the operation is not successful, a 0-length string is returned
URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", ID)
response <- GET(URL)
mySeq <- character()
if (status_code(response) == 200) {
x <- as.character(response)
x <- strsplit(x, "\n")
mySeq <- dbSanitizeSequence(x)
}
return(mySeq)
}
dbFetchPrositeFeatures <- function(ID) {
# Fetch feature annotations from ScanProsite.
# Parameters:
# ID char a UniProt ID (accession number)
# Value:
# data frame uID char UniProt ID
# start num start of motif
# end num end of motif
# psID char PROSITE motif ID
# psName char PROSITE motif name
# If the operation is not successful, a 0-length data frame is returned.
URL <- "http://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
response <- POST(URL,
body = list(meta = "opt1",
meta1_protein = "opt1",
seq = ID,
skip = "on",
output = "tabular"))
myFeatures <- data.frame()
if (status_code(response) == 200) {
lines <- unlist(strsplit(content(response, "text"), "\\n"))
patt <- sprintf("\\|%s\\|", UniProtID)
lines <- lines[grep(patt, lines)]
for (line in lines) {
tokens <- unlist(strsplit(line, "\\t|\\|"))
myFeatures <- rbind(myFeatures,
data.frame(uID = tokens[2],
start = as.numeric(tokens[4]),
end = as.numeric(tokens[5]),
psID = tokens[6],
psName = tokens[7],
stringsAsFactors = FALSE))
}
}
return(myFeatures)
}
node2text <- function(doc, tag) {
# an extractor function for the contents of elements
# between given tags in an XML response.
# Contents of all matching elements is returned in
# a vector of strings.
path <- paste0("//", tag)
nodes <- xml_find_all(doc, path)
return(xml_text(nodes))
}
dbFetchNCBItaxData <- function(ID) {
# Fetch feature taxID and Organism from the NCBI.
# Parameters:
# ID char a RefSeq ID (accession number)
# Value:
# data frame taxID num NCBI taxID
# organism char organism for this taxID
# If the operation is not successful, a 0-length data frame is returned.
eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
URL <- paste(eUtilsBase,
"esearch.fcgi?",
"db=protein",
"&term=", ID,
sep="")
myXML <- read_xml(URL)
GID <- node2text(myXML, "Id")
URL <- paste0(eUtilsBase,
"esummary.fcgi?",
"db=protein",
"&id=",
GID,
"&version=2.0")
myXML <- read_xml(URL)
x <- as.integer(node2text(myXML, "TaxId"))
y <- node2text(myXML, "Organism")
tID <- data.frame()
if (length(x) > 0 && length(y) > 0) {
tID <- data.frame(taxID = x, organism = y)
}
return(tID)
}
# [END] # [END]