From 36140bc98437292d444c0ee499fcbbc4f1e6d387 Mon Sep 17 00:00:00 2001
From: hyginn <boris.steipe@utoronto.ca>
Date: Fri, 25 Sep 2020 10:47:41 +1000
Subject: [PATCH] updates; vectorized dbFetchUniProtSeq()

---
 RPR-UniProt_GET.R         | 52 ++++++++++++++++++++----------------
 scripts/ABC-dbUtilities.R | 56 +++++++++++++++++++++++++++------------
 2 files changed, 68 insertions(+), 40 deletions(-)
diff --git a/RPR-UniProt_GET.R b/RPR-UniProt_GET.R
index b8e9c30..6a87854 100644
--- a/RPR-UniProt_GET.R
+++ b/RPR-UniProt_GET.R
@@ -1,20 +1,16 @@
 # tocID <- "RPR-UniProt_GET.R"
 #
-# ---------------------------------------------------------------------------- #
-#  PATIENCE  ...                                                               #
-#    Do not yet work wih this code. Updates in progress. Thank you.            #
-#    boris.steipe@utoronto.ca                                                  #
-# ---------------------------------------------------------------------------- #
-#
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the RPR-Scripting_data_downloads unit.
 #
-# Version:  1.1
+# Version:  1.2
 #
-# Date:     2017  10  -  2019  01
+# Date:     2017-10  -  2020-09
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
+#           1.2    2020 Maintenance. Made dbFetchUniProtSeq() vector-safe and
+#                  added FASTA headers as attribute
 #           1.1    Change from require() to requireNamespace(),
 #                      use <package>::<function>() idiom throughout
 #           1.0    First ABC units version
@@ -34,13 +30,13 @@
 
 
 #TOC> ==========================================================================
-#TOC>
+#TOC> 
 #TOC>   Section  Title                                      Line
 #TOC> ----------------------------------------------------------
-#TOC>   1        UniProt files via GET                        41
-#TOC>   1.1        Task - fetchUniProtSeq() function         103
-#TOC>   2        Task solutions                              110
-#TOC>
+#TOC>   1        UniProt files via GET                        43
+#TOC>   1.1        Task - fetchUniProtSeq() function         105
+#TOC>   2        Task solutions                              118
+#TOC> 
 #TOC> ==========================================================================
 
 
@@ -51,7 +47,7 @@
 # FASTA sequence from UniProt. All we need is to construct an URL with the
 # correct UniProt ID.
 
-# An interface between R scripts and We=b servers is provided by the httr
+# An interface between R scripts and Web servers is provided by the httr::
 # package. This sends and receives information via the http protocol, just like
 # a Web browser. Since this is a short and simple request, the GET verb is the
 # right tool:
@@ -108,21 +104,31 @@ httr::status_code(response)
 
 # ==   1.1  Task - fetchUniProtSeq() function  =================================
 
-# Task: write a function that takes as input a UniProt ID, fetches the
-# FASTA sequence, returns only the sequence if the operation is successful, or
-# a vector of length 0 if there is an error.
+# Task: write a function that
+#   - takes as input a vector of UniProt IDs,
+#   - fetches the FASTA sequence for each
+#   - returns a vector of the same length as the input, where an element is:
+#   -  ...  the sequence, if the query was successful
+#   -  ...  NA if there was an error
+#   - each element has the UniProt ID as the name()
+#   - bonus: the output has an attribute "headers" that is a vector of the
+#            FASTA headers ( cf. ?attr )
 
 
 # =    2  Task solutions  ======================================================
 
 
-# I have placed such a function into the dbUtilities script: look it up by
-# clicking on  dbFetchUniProtSeq() in the Environment pane.
-
-# Test:
-dbFetchUniProtSeq("P39678")
-
+# I have placed such a function - dbFetchUniProtSeq() - into
+# "./scripts/ABC-dbUtilities.R": look it up by clicking on  dbFetchUniProtSeq()
+# in the Environment pane.
 
+# Test this:
+( x <- dbFetchUniProtSeq("P39678") )
+names(x)[1]
+attr(x, "headers")[1]
+x[1]
+cat(writeFASTA(data.frame(head = attr(x, "headers")[1], seq  =x[1]),
+               width = 40), sep = "\n")
 
 
 
diff --git a/scripts/ABC-dbUtilities.R b/scripts/ABC-dbUtilities.R
index 1649494..af37663 100644
--- a/scripts/ABC-dbUtilities.R
+++ b/scripts/ABC-dbUtilities.R
@@ -20,11 +20,11 @@
 #TOC>   2.07       dbAddTaxonomy()                  199
 #TOC>   2.08       dbAddAnnotation()                215
 #TOC>   2.09       dbFetchUniProtSeq()              243
-#TOC>   2.10       dbFetchPrositeFeatures()         267
-#TOC>   2.11       node2text()                      311
-#TOC>   2.12       dbFetchNCBItaxData()             323
-#TOC>   2.13       UniProtIDmap()                   362
-#TOC>   3        TESTS                              401
+#TOC>   2.10       dbFetchPrositeFeatures()         289
+#TOC>   2.11       node2text()                      333
+#TOC>   2.12       dbFetchNCBItaxData()             345
+#TOC>   2.13       UniProtIDmap()                   384
+#TOC>   3        TESTS                              423
 #TOC> 
 #TOC> ==========================================================================
 
@@ -241,28 +241,50 @@ dbAddAnnotation <- function(db, jsonDF) {
 
 
 # ==   2.09  dbFetchUniProtSeq()  ==============================================
-dbFetchUniProtSeq <- function(ID) {
+dbFetchUniProtSeq <- function(IDs) {
   # Fetch a protein sequence from UniProt.
   # Parameters:
-  #     ID   char   a UniProt ID (accession number)
+  #     IDs   char  a vector of UniProt IDs (accession number)
   # Value:
-  #     char        the sequence
-  # If the operation is not successful, a 0-length string is returned
+  #     char        a vector of the same length as ID. It contains
+  #                 sequences where the retrieval was successful, NA where
+  #                 it was not successful. The elements are named with
+  #                 the ID, the header lines are set as attribute "header"
 
-  URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", ID)
 
-  response <- httr::GET(URL)
+  BASE <- "http://www.uniprot.org/uniprot/"
 
-  mySeq <- character()
-  if (httr::status_code(response) == 200) {
-    x <- as.character(response)
-    x <- strsplit(x, "\n")
-    mySeq <- dbSanitizeSequence(x)
+  sq <- character()
+  hd <- character()
+  for (i in seq_along(IDs)) {
+    URL <- sprintf("%s%s.fasta", BASE, IDs[i])
+    response <- httr::GET(URL)
+    if (httr::status_code(response) == 200) {
+      s <- as.character(response)
+      s <- unlist(strsplit(s, "\n"))
+      x <- dbSanitizeSequence(s)
+    } else {
+      s <- ""
+      x <- NA
+    }
+    hd[i] <- s[1]
+    sq[i] <- x
   }
+  names(sq) <- IDs
+  attr(sq, "headers") <- hd
 
-  return(mySeq)
+  return(sq)
 }
 
+if (FALSE) {
+  inp <- c("P79073", "P0000000", "A0A1W2TKZ7")
+  s <- dbFetchUniProtSeq(inp)
+  s[1:3]
+  str(s)
+  attr(s, "headers")[1]
+}
+
+
 
 # ==   2.10  dbFetchPrositeFeatures()  =========================================
 dbFetchPrositeFeatures <- function(ID) {