bugfix in dbFetchPrositeFeatures(); add return of actual motif sequence; stylistic updates

This commit is contained in:
hyginn 2020-09-25 11:29:28 +10:00
parent 36140bc984
commit abb146f828
2 changed files with 42 additions and 36 deletions

View File

@ -1,20 +1,15 @@
# tocID <- "RPR-PROSITE_POST.R" # tocID <- "RPR-PROSITE_POST.R"
# #
# ---------------------------------------------------------------------------- #
# PATIENCE ... #
# Do not yet work with this code. Updates in progress. Thank you. #
# boris.steipe@utoronto.ca #
# ---------------------------------------------------------------------------- #
#
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Scripting_data_downloads unit. # R code accompanying the RPR-Scripting_data_downloads unit.
# #
# Version: 1.1 # Version: 1.2
# #
# Date: 2017 10 - 2019 01 # Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.2 2020 Maintenance
# 1.1 Change from require() to requireNamespace(), # 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout, # use <package>::<function>() idiom throughout,
# 1.0.1 Updates for slightly changed interfaces # 1.0.1 Updates for slightly changed interfaces
@ -38,9 +33,9 @@
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> --------------------------------------------------------------------- #TOC> ---------------------------------------------------------------------
#TOC> 1 Constructing a POST command from a Web query 42 #TOC> 1 Constructing a POST command from a Web query 43
#TOC> 1.1 Task - fetchPrositeFeatures() function 142 #TOC> 1.1 Task - fetchPrositeFeatures() function 148
#TOC> 2 Task solutions 150 #TOC> 2 Task solutions 156
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
@ -59,9 +54,10 @@ if (! requireNamespace("httr", quietly = TRUE)) {
# We have reverse engineered the Web form for a ScanProsite request, and can now # We have reverse engineered the Web form for a ScanProsite request, and can
# construct a POST request. The command is similar to GET(), but we need an # construct a valid POST request from knowing the required field names. The POST
# explicit request body: a list of key/value pairs # command is similar to GET(), but we need an explicit request body that
# contains a list of key/value pairs
UniProtID <- "P39678" UniProtID <- "P39678"
@ -79,19 +75,24 @@ response <- httr::POST(URL,
httr::status_code(response) # If this is not 200, something went wrong and it httr::status_code(response) # If this is not 200, something went wrong and it
# makes no sense to continue. If this persists, ask # makes no sense to continue. If this persists, ask
# on the mailing list what to do. # on the Discussion Board what to do.
# The text contents of the response is available with the # The text contents of the response is available with the
# content() function: # content() function:
httr::content(response, "text") httr::content(response, "text")
# ... should show you the same as the page contents that # ... should show you the same as the page contents that you have seen in the
# you have seen in the browser. Now we need to extract # browser. Now we need to extract the data from the page. For this simple
# the data from the page: we need regular expressions, but # example we can get away with using regular expressions, but in general we need
# only simple ones. First, we strsplit() the response into # a real XML parser to parse HTML. We'll cover that in a later unit. Here, we
# individual lines, since each of our data elements is on # strsplit() the response into individual lines, since each of our data elements
# its own line. We simply split on the "\\n" newline character. # is on its own line, and then capture the contents. The way Prosite has
# formatted their HTML we can simply split on the "\\n" newline character - but
# they could write the same valid HTML without any newline-characters at all.
# Understand that we are working with a bit of a "hack" here: exploiting
# empirical assumptions rather than a formal specification. But sometimes quick
# and dirty is fine, because quick.
lines <- unlist(strsplit(httr::content(response, "text"), "\\n")) lines <- unlist(strsplit(httr::content(response, "text"), "\\n"))
head(lines) head(lines)
@ -105,10 +106,9 @@ patt <- sprintf("\\|%s\\|", UniProtID)
# ... and select only the lines that match this # ... and select only the lines that match this
# pattern: # pattern:
lines <- lines[grep(patt, lines)] ( lines <- lines[grep(patt, lines)] )
lines
# ... captures the four lines of output. # ... captures the three lines of output.
# Now we break the lines apart into tokens: this is another application of # Now we break the lines apart into tokens: this is another application of
# strsplit(), but this time we split either on "pipe" characters, "|" OR on tabs # strsplit(), but this time we split either on "pipe" characters, "|" OR on tabs
@ -137,7 +137,7 @@ for (line in lines) {
end = as.numeric(tokens[5]), end = as.numeric(tokens[5]),
psID = tokens[6], psID = tokens[6],
psName = tokens[7], psName = tokens[7],
stringsAsFactors = FALSE)) psSeq = tokens[11]))
} }
features features
@ -149,8 +149,8 @@ features
# Task: write a function that takes as input a UniProt ID, fetches the # Task: write a function that takes as input a UniProt ID, fetches the
# features it contains from ScanProsite and returns a list as given above, or # features it contains from ScanProsite and returns a data frame as given above, or
# a list of length 0 if there is an error. # an empty data frame if there is an error.
# = 2 Task solutions ====================================================== # = 2 Task solutions ======================================================
@ -160,7 +160,7 @@ features
# clicking on dbFetchPrositeFeatures() in the Environment pane. # clicking on dbFetchPrositeFeatures() in the Environment pane.
# Test: # Test:
dbFetchPrositeFeatures("P39678") dbFetchPrositeFeatures("Q5KMQ9")

View File

@ -21,10 +21,10 @@
#TOC> 2.08 dbAddAnnotation() 215 #TOC> 2.08 dbAddAnnotation() 215
#TOC> 2.09 dbFetchUniProtSeq() 243 #TOC> 2.09 dbFetchUniProtSeq() 243
#TOC> 2.10 dbFetchPrositeFeatures() 289 #TOC> 2.10 dbFetchPrositeFeatures() 289
#TOC> 2.11 node2text() 333 #TOC> 2.11 node2text() 339
#TOC> 2.12 dbFetchNCBItaxData() 345 #TOC> 2.12 dbFetchNCBItaxData() 351
#TOC> 2.13 UniProtIDmap() 384 #TOC> 2.13 UniProtIDmap() 390
#TOC> 3 TESTS 423 #TOC> 3 TESTS 429
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
@ -297,6 +297,7 @@ dbFetchPrositeFeatures <- function(ID) {
# end num end of motif # end num end of motif
# psID char PROSITE motif ID # psID char PROSITE motif ID
# psName char PROSITE motif name # psName char PROSITE motif name
# psSeq char sequence annotated to the feature
# If the operation is not successful, a 0-length data frame is returned. # If the operation is not successful, a 0-length data frame is returned.
URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi" URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
@ -313,7 +314,7 @@ dbFetchPrositeFeatures <- function(ID) {
lines <- unlist(strsplit(httr::content(response, "text"), "\\n")) lines <- unlist(strsplit(httr::content(response, "text"), "\\n"))
patt <- sprintf("\\|%s\\|", UniProtID) patt <- sprintf("\\|%s\\|", ID)
lines <- lines[grep(patt, lines)] lines <- lines[grep(patt, lines)]
for (line in lines) { for (line in lines) {
@ -323,12 +324,17 @@ dbFetchPrositeFeatures <- function(ID) {
start = as.numeric(tokens[4]), start = as.numeric(tokens[4]),
end = as.numeric(tokens[5]), end = as.numeric(tokens[5]),
psID = tokens[6], psID = tokens[6],
psName = tokens[7])) psName = tokens[7],
psSeq = tokens[11]))
} }
} }
return(myFeatures) return(myFeatures)
} }
if (FALSE) {
dbFetchPrositeFeatures("P33520") # RES1_SCHPO
}
# == 2.11 node2text() ====================================================== # == 2.11 node2text() ======================================================
node2text <- function(doc, tag) { node2text <- function(doc, tag) {