From 36140bc98437292d444c0ee499fcbbc4f1e6d387 Mon Sep 17 00:00:00 2001 From: hyginn Date: Fri, 25 Sep 2020 10:47:41 +1000 Subject: [PATCH] updates; vectorized dbFetchUniProtSeq() --- RPR-UniProt_GET.R | 52 ++++++++++++++++++++---------------- scripts/ABC-dbUtilities.R | 56 +++++++++++++++++++++++++++------------ 2 files changed, 68 insertions(+), 40 deletions(-) diff --git a/RPR-UniProt_GET.R b/RPR-UniProt_GET.R index b8e9c30..6a87854 100644 --- a/RPR-UniProt_GET.R +++ b/RPR-UniProt_GET.R @@ -1,20 +1,16 @@ # tocID <- "RPR-UniProt_GET.R" # -# ---------------------------------------------------------------------------- # -# PATIENCE ... # -# Do not yet work wih this code. Updates in progress. Thank you. # -# boris.steipe@utoronto.ca # -# ---------------------------------------------------------------------------- # -# # Purpose: A Bioinformatics Course: # R code accompanying the RPR-Scripting_data_downloads unit. # -# Version: 1.1 +# Version: 1.2 # -# Date: 2017 10 - 2019 01 +# Date: 2017-10 - 2020-09 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: +# 1.2 2020 Maintenance. Made dbFetchUniProtSeq() vector-safe and +# added FASTA headers as attribute # 1.1 Change from require() to requireNamespace(), # use ::() idiom throughout # 1.0 First ABC units version @@ -34,13 +30,13 @@ #TOC> ========================================================================== -#TOC> +#TOC> #TOC> Section Title Line #TOC> ---------------------------------------------------------- -#TOC> 1 UniProt files via GET 41 -#TOC> 1.1 Task - fetchUniProtSeq() function 103 -#TOC> 2 Task solutions 110 -#TOC> +#TOC> 1 UniProt files via GET 43 +#TOC> 1.1 Task - fetchUniProtSeq() function 105 +#TOC> 2 Task solutions 118 +#TOC> #TOC> ========================================================================== @@ -51,7 +47,7 @@ # FASTA sequence from UniProt. All we need is to construct an URL with the # correct UniProt ID. -# An interface between R scripts and We=b servers is provided by the httr +# An interface between R scripts and Web servers is provided by the httr:: # package. This sends and receives information via the http protocol, just like # a Web browser. Since this is a short and simple request, the GET verb is the # right tool: @@ -108,21 +104,31 @@ httr::status_code(response) # == 1.1 Task - fetchUniProtSeq() function ================================= -# Task: write a function that takes as input a UniProt ID, fetches the -# FASTA sequence, returns only the sequence if the operation is successful, or -# a vector of length 0 if there is an error. +# Task: write a function that +# - takes as input a vector of UniProt IDs, +# - fetches the FASTA sequence for each +# - returns a vector of the same length as the input, where an element is: +# - ... the sequence, if the query was successful +# - ... NA if there was an error +# - each element has the UniProt ID as the name() +# - bonus: the output has an attribute "headers" that is a vector of the +# FASTA headers ( cf. ?attr ) # = 2 Task solutions ====================================================== -# I have placed such a function into the dbUtilities script: look it up by -# clicking on dbFetchUniProtSeq() in the Environment pane. - -# Test: -dbFetchUniProtSeq("P39678") - +# I have placed such a function - dbFetchUniProtSeq() - into +# "./scripts/ABC-dbUtilities.R": look it up by clicking on dbFetchUniProtSeq() +# in the Environment pane. +# Test this: +( x <- dbFetchUniProtSeq("P39678") ) +names(x)[1] +attr(x, "headers")[1] +x[1] +cat(writeFASTA(data.frame(head = attr(x, "headers")[1], seq =x[1]), + width = 40), sep = "\n") diff --git a/scripts/ABC-dbUtilities.R b/scripts/ABC-dbUtilities.R index 1649494..af37663 100644 --- a/scripts/ABC-dbUtilities.R +++ b/scripts/ABC-dbUtilities.R @@ -20,11 +20,11 @@ #TOC> 2.07 dbAddTaxonomy() 199 #TOC> 2.08 dbAddAnnotation() 215 #TOC> 2.09 dbFetchUniProtSeq() 243 -#TOC> 2.10 dbFetchPrositeFeatures() 267 -#TOC> 2.11 node2text() 311 -#TOC> 2.12 dbFetchNCBItaxData() 323 -#TOC> 2.13 UniProtIDmap() 362 -#TOC> 3 TESTS 401 +#TOC> 2.10 dbFetchPrositeFeatures() 289 +#TOC> 2.11 node2text() 333 +#TOC> 2.12 dbFetchNCBItaxData() 345 +#TOC> 2.13 UniProtIDmap() 384 +#TOC> 3 TESTS 423 #TOC> #TOC> ========================================================================== @@ -241,28 +241,50 @@ dbAddAnnotation <- function(db, jsonDF) { # == 2.09 dbFetchUniProtSeq() ============================================== -dbFetchUniProtSeq <- function(ID) { +dbFetchUniProtSeq <- function(IDs) { # Fetch a protein sequence from UniProt. # Parameters: - # ID char a UniProt ID (accession number) + # IDs char a vector of UniProt IDs (accession number) # Value: - # char the sequence - # If the operation is not successful, a 0-length string is returned + # char a vector of the same length as ID. It contains + # sequences where the retrieval was successful, NA where + # it was not successful. The elements are named with + # the ID, the header lines are set as attribute "header" - URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", ID) - response <- httr::GET(URL) + BASE <- "http://www.uniprot.org/uniprot/" - mySeq <- character() - if (httr::status_code(response) == 200) { - x <- as.character(response) - x <- strsplit(x, "\n") - mySeq <- dbSanitizeSequence(x) + sq <- character() + hd <- character() + for (i in seq_along(IDs)) { + URL <- sprintf("%s%s.fasta", BASE, IDs[i]) + response <- httr::GET(URL) + if (httr::status_code(response) == 200) { + s <- as.character(response) + s <- unlist(strsplit(s, "\n")) + x <- dbSanitizeSequence(s) + } else { + s <- "" + x <- NA + } + hd[i] <- s[1] + sq[i] <- x } + names(sq) <- IDs + attr(sq, "headers") <- hd - return(mySeq) + return(sq) } +if (FALSE) { + inp <- c("P79073", "P0000000", "A0A1W2TKZ7") + s <- dbFetchUniProtSeq(inp) + s[1:3] + str(s) + attr(s, "headers")[1] +} + + # == 2.10 dbFetchPrositeFeatures() ========================================= dbFetchPrositeFeatures <- function(ID) {