Maintenance fixes and UniProt API change bugfix

2018-10-30 21:27:42 -04:00 · 2018-10-30 21:27:42 -04:00 · 1dd87e7473
commit 1dd87e7473
parent ead20d5f18
8 changed files with 51 additions and 30 deletions
--- a/.utilities.R
+++ b/.utilities.R
@ -93,7 +93,7 @@ pBar <- function(i, l, nCh = 50) {
  ticks <- round(seq(1, l-1, length.out = nCh))
  if (i < l) {
    if (any(i == ticks)) {
-      p <- which(i == ticks)
+      p <- which(i == ticks)[1]  # use only first, in case there are ties
      p1 <- paste(rep("#", p), collapse = "")
      p2 <- paste(rep("-", nCh - p), collapse = "")
      cat(sprintf("\r|%s%s|", p1, p2))
--- a/BIN-Data_integration.R
+++ b/BIN-Data_integration.R
@ -3,12 +3,13 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-Data_integration unit.
 #
-# Version:  1.0
+# Version:  1.0.1
 #
-# Date:     2017  10  08
+# Date:     2018  10  30
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
+#           1.0.1  Bugfix: UniProt ID Mapping service API change
 #           1.0    First live version
 #
 #
@ -28,10 +29,10 @@

 #TOC> ==========================================================================
 #TOC> 
-#TOC>   Section  Title                       Line
-#TOC> -------------------------------------------
-#TOC>   1        Identifier mapping            45
-#TOC>   2        Cross-referencing tables     151
+#TOC>   Section  Title                             Line
+#TOC> -------------------------------------------------
+#TOC>   1        Identifier mapping                  40
+#TOC>   2        Cross-referencing tables           164
 #TOC> 
 #TOC> ==========================================================================

@ -73,14 +74,14 @@ myQueryIDs <- "NP_010227 NP_00000 NP_011036"
 # the URL of the server and send a list of items labelled as "query" in the body
 # of the request. GET() and POST() are functions from httr.

-URL <- "http://www.uniprot.org/mapping/"
+URL <- "https://www.uniprot.org/mapping/"
 response <- POST(URL,
                 body = list(from = "P_REFSEQ_AC",   # Refseq Protein
                             to = "ACC",             # UniProt ID
                             format = "tab",
                             query = myQueryIDs))

-response
+cat(content(response))

 # We need to check the status code - if it is not 200, an error ocurred and we
 # can't process the result:
@ -94,6 +95,22 @@ myMappedIDs <- read.delim(file = textConnection(content(response)),
                          stringsAsFactors = FALSE)
 myMappedIDs

+# We actually only need columns 1 and 3, and we can also change the names
+# to "From" and "To":
+
+myMappedIDs <- myMappedIDs[ , c(1,3)]
+colnames(myMappedIDs) <- c("From", "To")
+
+myMappedIDs
+
+# If this works as expected, you should see:
+#        From     To
+# 1 NP_010227 P39678
+# 2 NP_011036 P25302
+#
+# ... and note that there are only two entries, because nothing was returned
+# for the dummy "RefSeq ID" NP_00000
+
 # If the query can't be fulfilled because of a problem with the server, a
 # WebPage is returned. But the server status is also returned and we can check
 # the status code. I have lately gotten many "503" status codes: Server Not
@ -114,7 +131,7 @@ myIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
  #    empty data frame if the mapping was unsuccessful. No rows are returned
  #    for IDs that are not mapped.

-  URL <- "http://www.uniprot.org/mapping/"
+  URL <- "https://www.uniprot.org/uploadlists/"
  response <- POST(URL,
                   body = list(from = mapFrom,
                               to = mapTo,
@ -125,6 +142,8 @@ myIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
    myMap <- read.delim(file = textConnection(content(response)),
                        sep = "\t",
                        stringsAsFactors = FALSE)
+    myMap <- myMap[ , c(1,3)]
+    colnames(myMap) <- c("From", "To")
  } else {
    myMap <- data.frame()
    warning(paste("No uniProt ID mapping returned:",
--- a/FND-STA-Probability_distribution.R
+++ b/FND-STA-Probability_distribution.R
@ -584,8 +584,8 @@ KLdiv(pmfL1, pmfL2)  # 0.1087
 # random samples according to the rL1 distribution, calculate the Kullback
 # Leibler divergence with countsL1, and compare the distribution we get with the
 # value we observed as the difference with discL2. Essentially, this tells us
-# the probability that countsL2 is actually a sample from the L1 function. Here we
-# go:
+# the probability that countsL2 is actually a sample from the L1 function.
+# Here we go:

 N <- 1000
 divs <- numeric(N)
--- a/RPR-Biostrings.R
+++ b/RPR-Biostrings.R
@ -26,7 +26,7 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                                     Line
 #TOC> ---------------------------------------------------------
 #TOC>   1        The Biostrings Package                      52
@ -41,7 +41,7 @@
 #TOC>   5.1      Views                                      183
 #TOC>   5.2      Iranges                                    195
 #TOC>   5.3      StringSets                                 201
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


@ -182,7 +182,7 @@ toString(myDNAseq[4:15])

 # ==   5.1  Views  =============================================================

-# Biostring "Views" are objects that store mutliple substrings of one
+# Biostring "Views" are objects that store multiple substrings of one
 # Biostring object.

 (myView <- Views(myDNAseq, start = c(1, 19, 37), end = c(15, 30, 45)))
--- a/RPR-FASTA.R
+++ b/RPR-FASTA.R
@ -12,7 +12,8 @@
 #           1.0    New unit.
 #
 #
-# TODO:
+# TODO: Make a simple solution first, then extend it to error checking, and
+#       to handle .mfa files.
 #
 #
 # == DO NOT SIMPLY  source()  THIS FILE! =======================================
@ -22,17 +23,17 @@
 # going on. That's not how it works ...
 #
 # ==============================================================================
- 
+
 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                 Line
 #TOC> -------------------------------------
 #TOC>   1        Reading FASTA           39
 #TOC>   2        Interpreting FASTA     227
 #TOC>   3        Writing FASTA          248
-#TOC> 
+#TOC>
 #TOC> ==========================================================================
- 
+



@ -231,7 +232,7 @@ refAPSES[grep("P39678", refAPSES) + 1]  # grep() the string and add 1
 # when working with strings, we can use substr(<string>, <start>, <stop>) to
 # extract substrings, but more often we expand the string into a vector of
 # single characters with strsplit(<string>, ""). strsplit() returns a list,
-# to accommodate that <string> could be a vector of many elements, therafore
+# to accommodate that <string> could be a vector of many elements, therefore
 # we usually unlist() the result if we use it only on a single string.

 # Example: How many positive charged residues in "MBP1_SACCE"?
@ -297,8 +298,8 @@ writeFASTA <- function(s, OUT = stdout(), width = 60) {

 }

-# Let's try this. We don't define OUT, so the result is written to the console
-# by default. Defualt width for sequence is 60 characters
+# Let's try this. If we don't specify OUT, the result is written to the console
+# by default. Default width for sequence is 60 characters

 writeFASTA(refAPSES)

--- a/RPR-RegEx.R
+++ b/RPR-RegEx.R
@ -63,8 +63,8 @@ IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"
 nchar(s)
 # Must be 969

-# Fetch the Uniprot ID by retrieving the first string that appears between two
-# vertical bars in the header line.
+# Task: Fetch the Uniprot ID by retrieving the first string that appears between
+# two vertical bars ("pipes") in the header record.
 #

 # Develop the regular expression:
--- a/RPR-Unit_testing.R
+++ b/RPR-Unit_testing.R
@ -25,13 +25,13 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                       Line
 #TOC> -------------------------------------------
 #TOC>   1        Unit Tests with testthat      43
 #TOC>   2        Organizing your tests        156
 #TOC>   3        Task solutions               181
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


@ -107,8 +107,7 @@ expect_error(log(v[1,2]))                # This appears oK, but ...
 expect_error(log(v[1,2]), "non-numeric") # ... it's actually a different error!

 # Producing unit tests simply means: we define a function, and then we check
-# whether all test pass. Consider a function that is loaded from your utilities
-# file:
+# whether all tests pass. Consider a function that is loaded on startup:

 biCode

--- a/scripts/ABC-dbUtilities.R
+++ b/scripts/ABC-dbUtilities.R
@ -345,7 +345,7 @@ UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
  #    empty data frame if the mapping was unsuccessful. No rows are returned
  #    for IDs that are not mapped.

-  URL <- "http://www.uniprot.org/mapping/"
+  URL <- "https://www.uniprot.org/uploadlists/"
  response <- POST(URL,
                   body = list(from = mapFrom,
                               to = mapTo,
@ -356,6 +356,8 @@ UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
    myMap <- read.delim(file = textConnection(content(response)),
                        sep = "\t",
                        stringsAsFactors = FALSE)
+    myMap <- myMap[ , c(1,3)]
+    colnames(myMap) <- c("From", "To")
  } else {
    myMap <- data.frame()
    warning(paste("No uniProt ID mapping returned:",