From 27dd73664b9482e458df04cafa297b5bce128353 Mon Sep 17 00:00:00 2001
From: hyginn
Date: Fri, 6 Oct 2017 08:51:12 -0400
Subject: [PATCH] add httr and xml2 package, add scripted data download functions

---
 scripts/ABC-dbUtilities.R | 128 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 127 insertions(+), 1 deletion(-)

diff --git a/scripts/ABC-dbUtilities.R b/scripts/ABC-dbUtilities.R
index 587d216..735ce84 100644
--- a/scripts/ABC-dbUtilities.R
+++ b/scripts/ABC-dbUtilities.R
@@ -9,12 +9,24 @@
 # ====== PACKAGES ==============================================================
 
 
-if (! require("jsonlite", quietly = TRUE)) {
+if (! require(jsonlite, quietly = TRUE)) {
   install.packages("jsonlite")
   library(jsonlite)
 }
 
 
+if (!require(httr, quietly = TRUE)) {
+  install.packages("httr")
+  library(httr)
+}
+
+
+if (!require(xml2, quietly = TRUE)) {
+  install.packages("xml2")
+  library(xml2)
+}
+
+
 # ====== FUNCTIONS =============================================================
 
 
@@ -204,4 +216,118 @@ dbAddAnnotation <- function(db, jsonDF) {
 }
 
 
+dbFetchUniProtSeq <- function(ID) {
+  # Fetch a protein sequence from UniProt.
+  # Parameters:
+  #     ID   char  a UniProt ID (accession number)
+  # Value:
+  #     char  the sequence, as returned by dbSanitizeSequence()
+  # If the response status is not 200, a 0-length character vector is returned.
+
+  URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", ID)
+
+  response <- GET(URL)
+
+  mySeq <- character()
+  if (status_code(response) == 200) {
+    x <- as.character(response)
+    x <- strsplit(x, "\n")
+    mySeq <- dbSanitizeSequence(x)
+  }
+
+  return(mySeq)
+}
+
+
+dbFetchPrositeFeatures <- function(ID) {
+  # Fetch feature annotations from ScanProsite.
+  # Parameters:
+  #     ID   char  a UniProt ID (accession number)
+  # Value:
+  #     data frame  uID     char  UniProt ID
+  #                 start   num   start of motif
+  #                 end     num   end of motif
+  #                 psID    char  PROSITE motif ID
+  #                 psName  char  PROSITE motif name
+  # If the response status is not 200, a 0-length data frame is returned.
+
+  URL <- "http://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
+
+  response <- POST(URL,
+                   body = list(meta = "opt1",
+                               meta1_protein = "opt1",
+                               seq = ID,
+                               skip = "on",
+                               output = "tabular"))
+
+  myFeatures <- data.frame()
+  if (status_code(response) == 200) {
+    # one element per line of the tabular response text
+    lines <- unlist(strsplit(content(response, "text"), "\\n"))
+    # keep only the lines that contain the requested ID as "|ID|"
+    patt <- sprintf("\\|%s\\|", ID)
+    lines <- lines[grep(patt, lines)]
+    # split each remaining line on tab or "|" and append it as one row
+    for (line in lines) {
+      tokens <- unlist(strsplit(line, "\\t|\\|"))
+      myFeatures <- rbind(myFeatures,
+                          data.frame(uID = tokens[2],
+                                     start = as.numeric(tokens[4]),
+                                     end = as.numeric(tokens[5]),
+                                     psID = tokens[6],
+                                     psName = tokens[7],
+                                     stringsAsFactors = FALSE))
+    }
+  }
+  return(myFeatures)
+}
+
+
+node2text <- function(doc, tag) {
+  # An extractor function for the contents of elements
+  # between given tags in an XML response.
+  # The contents of all matching elements are returned in
+  # a vector of strings.
+  path <- paste0("//", tag)
+  nodes <- xml_find_all(doc, path)
+  return(xml_text(nodes))
+}
+
+
+dbFetchNCBItaxData <- function(ID) {
+  # Fetch taxID and organism name from the NCBI.
+  # Parameters:
+  #     ID   char  a RefSeq ID (accession number)
+  # Value:
+  #     data frame  taxID     num   NCBI taxID
+  #                 organism  char  organism for this taxID
+  # If no TaxId or Organism element is found, a 0-length data frame is returned.
+  # Step 1: esearch for ID in db "protein"; collect the text of all <Id> elements
+  eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+  URL <- paste0(eUtilsBase,
+                "esearch.fcgi?",
+                "db=protein",
+                "&term=",
+                ID)
+  myXML <- read_xml(URL)
+  GID <- node2text(myXML, "Id")
+  # Step 2: esummary for the Id found above
+  URL <- paste0(eUtilsBase,
+                "esummary.fcgi?",
+                "db=protein",
+                "&id=",
+                GID,
+                "&version=2.0")
+  myXML <- read_xml(URL)
+  # text of the <TaxId> and <Organism> elements of the summary
+  x <- as.integer(node2text(myXML, "TaxId"))
+  y <- node2text(myXML, "Organism")
+
+  tID <- data.frame()
+  if (length(x) > 0 && length(y) > 0) {
+    tID <- data.frame(taxID = x, organism = y, stringsAsFactors = FALSE)
+  }
+  return(tID)
+}
+
 # [END]