bugfix in dbFetchPrositeFeatures(); add return of actual motif sequence; stylistic updates

2020-09-25 11:29:28 +10:00
parent 36140bc984
commit abb146f828
2 changed files with 42 additions and 36 deletions
--- a/RPR-PROSITE_POST.R
+++ b/RPR-PROSITE_POST.R
@@ -1,20 +1,15 @@
 # tocID <- "RPR-PROSITE_POST.R"
 #
 # ---------------------------------------------------------------------------- #
 #  PATIENCE  ...                                                               #
 #    Do not yet work wih this code. Updates in progress. Thank you.            #
 #    boris.steipe@utoronto.ca                                                  #
 # ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the RPR-Scripting_data_downloads unit.
 #
-# Version:  1.1
+# Version:  1.2
 #
-# Date:     2017  10  -  2019  01
+# Date:     2017-10  -  2020-09
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
 #           1.2    2020 Maintenance
 #           1.1    Change from require() to requireNamespace(),
 #                      use <package>::<function>() idiom throughout,
 #           1.0.1  Updates for slightly changed interfaces
@@ -38,9 +33,9 @@
 #TOC> 
 #TOC>   Section  Title                                                 Line
 #TOC> ---------------------------------------------------------------------
-#TOC>   1        Constructing a POST command from a Web query            42
+#TOC>   1        Constructing a POST command from a Web query            43
-#TOC>   1.1        Task - fetchPrositeFeatures() function               142
+#TOC>   1.1        Task - fetchPrositeFeatures() function               148
-#TOC>   2        Task solutions                                         150
+#TOC>   2        Task solutions                                         156
 #TOC> 
 #TOC> ==========================================================================
@@ -59,9 +54,10 @@ if (! requireNamespace("httr", quietly = TRUE)) {
-# We have reverse engineered the Web form for a ScanProsite request, and can now
+# We have reverse engineered the Web form for a ScanProsite request, and can
-# construct a POST request. The command is similar to GET(), but we need an
+# construct a valid POST request from knowing the required field names. The POST
-# explicit request body: a list of key/value pairs
+# command is similar to GET(), but we need an explicit request body that
 # contains a list of key/value pairs
 UniProtID <- "P39678"
@@ -79,19 +75,24 @@ response <- httr::POST(URL,
 httr::status_code(response)  # If this is not 200, something went wrong and it
                             # makes no sense to continue. If this persists, ask
-                             # on the mailing list what to do.
+                             # on the Discussion Board what to do.
 # The text contents of the response is available with the
 # content() function:
 httr::content(response, "text")
-# ... should show you the same as the page contents that
+# ... should show you the same as the page contents that you have seen in the
-# you have seen in the browser. The date we need Now we need to extract
+# browser. Now we need to extract the data from the page. For this simple
-# the data from the page: we need regular expressions, but
+# example we can get away with using regular expressions, but in general we need
-# only simple ones. First, we strsplit() the response into
+# a real XML parser to parse HTML. We'll cover that in a later unit. Here, we
-# individual lines, since each of our data elements is on
+# strsplit() the response into individual lines, since each of our data elements
-# its own line. We simply split on the "\\n" newline character.
+# is on its own line, and then capture the contents. The way Prosite has
 # formatted their HTML we can simply split on the "\\n" newline character - but
 # they could write the same valid HTML without any newline-characters at all.
 # Understand that we are working with a bit of a "hack" here: exploiting
 # empirical assumptions rather than a formal specification. But sometimes quick
 # and dirty is fine, because quick.
 lines <- unlist(strsplit(httr::content(response, "text"), "\\n"))
 head(lines)
@@ -105,10 +106,9 @@ patt <- sprintf("\\|%s\\|", UniProtID)
 # ... and select only the lines that match this
 # pattern:
-lines <- lines[grep(patt, lines)]
+( lines <- lines[grep(patt, lines)] )
 lines
-# ... captures the four lines of output.
+# ... captures the three lines of output.
 # Now we break the lines apart into tokens: this is another application of
 # strsplit(), but this time we split either on "pipe" characters, "|" OR on tabs
@@ -137,7 +137,7 @@ for (line in lines) {
                               end   =  as.numeric(tokens[5]),
                               psID  =  tokens[6],
                               psName = tokens[7],
-                               stringsAsFactors = FALSE))
+                               psSeq  = tokens[11]))
 }
 features
@@ -149,8 +149,8 @@ features
 # Task: write a function that takes as input a UniProt ID, fetches the
-# features it contains from ScanProsite and returns a list as given above, or
+# features it contains from ScanProsite and returns a data frame as given above, or
-# a list of length 0 if there is an error.
+# an empty data frame if there is an error.
 # =    2  Task solutions  ======================================================
@@ -160,7 +160,7 @@ features
 # clicking on  dbFetchPrositeFeatures() in the Environment pane.
 # Test:
-dbFetchPrositeFeatures("P39678")
+dbFetchPrositeFeatures("Q5KMQ9")
--- a/scripts/ABC-dbUtilities.R
+++ b/scripts/ABC-dbUtilities.R
@@ -21,10 +21,10 @@
 #TOC>   2.08       dbAddAnnotation()                215
 #TOC>   2.09       dbFetchUniProtSeq()              243
 #TOC>   2.10       dbFetchPrositeFeatures()         289
-#TOC>   2.11       node2text()                      333
+#TOC>   2.11       node2text()                      339
-#TOC>   2.12       dbFetchNCBItaxData()             345
+#TOC>   2.12       dbFetchNCBItaxData()             351
-#TOC>   2.13       UniProtIDmap()                   384
+#TOC>   2.13       UniProtIDmap()                   390
-#TOC>   3        TESTS                              423
+#TOC>   3        TESTS                              429
 #TOC> 
 #TOC> ==========================================================================
@@ -297,6 +297,7 @@ dbFetchPrositeFeatures <- function(ID) {
  #                    end    num   end of motif
  #                    psID   char  PROSITE motif ID
  #                    psName char  PROSITE motif name
  #                    psSeq  char  sequence annotated to the feature
  # If the operation is not successful, a 0-length data frame is returned.
  URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
@@ -313,7 +314,7 @@ dbFetchPrositeFeatures <- function(ID) {
    lines <- unlist(strsplit(httr::content(response, "text"), "\\n"))
-    patt <- sprintf("\\|%s\\|", UniProtID)
+    patt <- sprintf("\\|%s\\|", ID)
    lines <- lines[grep(patt, lines)]
    for (line in lines) {
@@ -323,12 +324,17 @@ dbFetchPrositeFeatures <- function(ID) {
                                     start =  as.numeric(tokens[4]),
                                     end   =  as.numeric(tokens[5]),
                                     psID  =  tokens[6],
-                                     psName = tokens[7]))
+                                     psName = tokens[7],
                                     psSeq  = tokens[11]))
    }
  }
  return(myFeatures)
 }
 if (FALSE) {
  dbFetchPrositeFeatures("P33520")  # RES1_SCHPO
 }
 # ==   2.11  node2text()  ======================================================
 node2text <- function(doc, tag) {