updates; vectorized dbFetchUniProtSeq()

This commit is contained in:
hyginn 2020-09-25 10:47:41 +10:00
parent 2775c7c9a8
commit 36140bc984
2 changed files with 68 additions and 40 deletions

View File

@ -1,20 +1,16 @@
# tocID <- "RPR-UniProt_GET.R" # tocID <- "RPR-UniProt_GET.R"
# #
# ---------------------------------------------------------------------------- #
# PATIENCE ... #
# Do not yet work wih this code. Updates in progress. Thank you. #
# boris.steipe@utoronto.ca #
# ---------------------------------------------------------------------------- #
#
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Scripting_data_downloads unit. # R code accompanying the RPR-Scripting_data_downloads unit.
# #
# Version: 1.1 # Version: 1.2
# #
# Date: 2017 10 - 2019 01 # Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.2 2020 Maintenance. Made dbFetchUniProtSeq() vector-safe and
# added FASTA headers as attribute
# 1.1 Change from require() to requireNamespace(), # 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout # use <package>::<function>() idiom throughout
# 1.0 First ABC units version # 1.0 First ABC units version
@ -34,13 +30,13 @@
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> ---------------------------------------------------------- #TOC> ----------------------------------------------------------
#TOC> 1 UniProt files via GET 41 #TOC> 1 UniProt files via GET 43
#TOC> 1.1 Task - fetchUniProtSeq() function 103 #TOC> 1.1 Task - fetchUniProtSeq() function 105
#TOC> 2 Task solutions 110 #TOC> 2 Task solutions 118
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
@ -51,7 +47,7 @@
# FASTA sequence from UniProt. All we need is to construct an URL with the # FASTA sequence from UniProt. All we need is to construct an URL with the
# correct UniProt ID. # correct UniProt ID.
# An interface between R scripts and We=b servers is provided by the httr # An interface between R scripts and Web servers is provided by the httr::
# package. This sends and receives information via the http protocol, just like # package. This sends and receives information via the http protocol, just like
# a Web browser. Since this is a short and simple request, the GET verb is the # a Web browser. Since this is a short and simple request, the GET verb is the
# right tool: # right tool:
@ -108,21 +104,31 @@ httr::status_code(response)
# == 1.1 Task - fetchUniProtSeq() function ================================= # == 1.1 Task - fetchUniProtSeq() function =================================
# Task: write a function that takes as input a UniProt ID, fetches the # Task: write a function that
# FASTA sequence, returns only the sequence if the operation is successful, or # - takes as input a vector of UniProt IDs,
# a vector of length 0 if there is an error. # - fetches the FASTA sequence for each
# - returns a vector of the same length as the input, where an element is:
# - ... the sequence, if the query was successful
# - ... NA if there was an error
# - each element has the UniProt ID as the name()
# - bonus: the output has an attribute "headers" that is a vector of the
# FASTA headers ( cf. ?attr )
# = 2 Task solutions ====================================================== # = 2 Task solutions ======================================================
# I have placed such a function into the dbUtilities script: look it up by # I have placed such a function - dbFetchUniProtSeq() - into
# clicking on dbFetchUniProtSeq() in the Environment pane. # "./scripts/ABC-dbUtilities.R": look it up by clicking on dbFetchUniProtSeq()
# in the Environment pane.
# Test:
dbFetchUniProtSeq("P39678")
# Test this:
( x <- dbFetchUniProtSeq("P39678") )
names(x)[1]
attr(x, "headers")[1]
x[1]
cat(writeFASTA(data.frame(head = attr(x, "headers")[1], seq =x[1]),
width = 40), sep = "\n")

View File

@ -20,11 +20,11 @@
#TOC> 2.07 dbAddTaxonomy() 199 #TOC> 2.07 dbAddTaxonomy() 199
#TOC> 2.08 dbAddAnnotation() 215 #TOC> 2.08 dbAddAnnotation() 215
#TOC> 2.09 dbFetchUniProtSeq() 243 #TOC> 2.09 dbFetchUniProtSeq() 243
#TOC> 2.10 dbFetchPrositeFeatures() 267 #TOC> 2.10 dbFetchPrositeFeatures() 289
#TOC> 2.11 node2text() 311 #TOC> 2.11 node2text() 333
#TOC> 2.12 dbFetchNCBItaxData() 323 #TOC> 2.12 dbFetchNCBItaxData() 345
#TOC> 2.13 UniProtIDmap() 362 #TOC> 2.13 UniProtIDmap() 384
#TOC> 3 TESTS 401 #TOC> 3 TESTS 423
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
@ -241,28 +241,50 @@ dbAddAnnotation <- function(db, jsonDF) {
# == 2.09 dbFetchUniProtSeq() ============================================== # == 2.09 dbFetchUniProtSeq() ==============================================
dbFetchUniProtSeq <- function(ID) { dbFetchUniProtSeq <- function(IDs) {
# Fetch a protein sequence from UniProt. # Fetch a protein sequence from UniProt.
# Parameters: # Parameters:
# ID char a UniProt ID (accession number) # IDs char a vector of UniProt IDs (accession number)
# Value: # Value:
# char the sequence # char a vector of the same length as ID. It contains
# If the operation is not successful, a 0-length string is returned # sequences where the retrieval was successful, NA where
# it was not successful. The elements are named with
# the ID, the header lines are set as attribute "header"
URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", ID)
response <- httr::GET(URL) BASE <- "http://www.uniprot.org/uniprot/"
mySeq <- character() sq <- character()
if (httr::status_code(response) == 200) { hd <- character()
x <- as.character(response) for (i in seq_along(IDs)) {
x <- strsplit(x, "\n") URL <- sprintf("%s%s.fasta", BASE, IDs[i])
mySeq <- dbSanitizeSequence(x) response <- httr::GET(URL)
if (httr::status_code(response) == 200) {
s <- as.character(response)
s <- unlist(strsplit(s, "\n"))
x <- dbSanitizeSequence(s)
} else {
s <- ""
x <- NA
}
hd[i] <- s[1]
sq[i] <- x
} }
names(sq) <- IDs
attr(sq, "headers") <- hd
return(mySeq) return(sq)
} }
if (FALSE) {
inp <- c("P79073", "P0000000", "A0A1W2TKZ7")
s <- dbFetchUniProtSeq(inp)
s[1:3]
str(s)
attr(s, "headers")[1]
}
# == 2.10 dbFetchPrositeFeatures() ========================================= # == 2.10 dbFetchPrositeFeatures() =========================================
dbFetchPrositeFeatures <- function(ID) { dbFetchPrositeFeatures <- function(ID) {