bugfix in dbFetchPrositeFeatures(); add return of actual motif sequence; stylistic updates

This commit is contained in:
hyginn 2020-09-25 11:29:28 +10:00
parent 36140bc984
commit abb146f828
2 changed files with 42 additions and 36 deletions

View File

@ -1,20 +1,15 @@
# tocID <- "RPR-PROSITE_POST.R"
#
# ---------------------------------------------------------------------------- #
# PATIENCE ... #
# Do not yet work with this code. Updates in progress. Thank you. #
# boris.steipe@utoronto.ca #
# ---------------------------------------------------------------------------- #
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Scripting_data_downloads unit.
#
# Version: 1.1
# Version: 1.2
#
# Date: 2017 10 - 2019 01
# Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 2020 Maintenance
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# 1.0.1 Updates for slightly changed interfaces
@ -35,13 +30,13 @@
#TOC> ==========================================================================
#TOC>
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------------------------------
#TOC> 1 Constructing a POST command from a Web query 42
#TOC> 1.1 Task - fetchPrositeFeatures() function 142
#TOC> 2 Task solutions 150
#TOC>
#TOC> 1 Constructing a POST command from a Web query 43
#TOC> 1.1 Task - fetchPrositeFeatures() function 148
#TOC> 2 Task solutions 156
#TOC>
#TOC> ==========================================================================
@ -59,9 +54,10 @@ if (! requireNamespace("httr", quietly = TRUE)) {
# We have reverse engineered the Web form for a ScanProsite request, and can now
# construct a POST request. The command is similar to GET(), but we need an
# explicit request body: a list of key/value pairs
# We have reverse engineered the Web form for a ScanProsite request, and can
# construct a valid POST request from knowing the required field names. The POST
# command is similar to GET(), but we need an explicit request body that
# contains a list of key/value pairs
UniProtID <- "P39678"
@ -79,19 +75,24 @@ response <- httr::POST(URL,
httr::status_code(response) # If this is not 200, something went wrong and it
# makes no sense to continue. If this persists, ask
# on the mailing list what to do.
# on the Discussion Board what to do.
# The text content of the response is available with the
# content() function:
httr::content(response, "text")
# ... should show you the same as the page contents that
# you have seen in the browser. Now we need to extract
# the data from the page: we need regular expressions, but
# only simple ones. First, we strsplit() the response into
# individual lines, since each of our data elements is on
# its own line. We simply split on the "\\n" newline character.
# ... should show you the same as the page contents that you have seen in the
# browser. Now we need to extract the data from the page. For this simple
# example we can get away with using regular expressions, but in general we need
# a real XML parser to parse HTML. We'll cover that in a later unit. Here, we
# strsplit() the response into individual lines, since each of our data elements
# is on its own line, and then capture the contents. The way Prosite has
# formatted their HTML we can simply split on the "\\n" newline character - but
# they could write the same valid HTML without any newline-characters at all.
# Understand that we are working with a bit of a "hack" here: exploiting
# empirical assumptions rather than a formal specification. But sometimes quick
# and dirty is fine, because quick.
lines <- unlist(strsplit(httr::content(response, "text"), "\\n"))
head(lines)
@ -105,10 +106,9 @@ patt <- sprintf("\\|%s\\|", UniProtID)
# ... and select only the lines that match this
# pattern:
lines <- lines[grep(patt, lines)]
lines
( lines <- lines[grep(patt, lines)] )
# ... captures the four lines of output.
# ... captures the three lines of output.
# Now we break the lines apart into tokens: this is another application of
# strsplit(), but this time we split either on "pipe" characters, "|" OR on tabs
@ -137,7 +137,7 @@ for (line in lines) {
end = as.numeric(tokens[5]),
psID = tokens[6],
psName = tokens[7],
stringsAsFactors = FALSE))
psSeq = tokens[11]))
}
features
@ -149,8 +149,8 @@ features
# Task: write a function that takes as input a UniProt ID, fetches the
# features it contains from ScanProsite and returns a list as given above, or
# a list of length 0 if there is an error.
# features it contains from ScanProsite and returns a data frame as given above, or
# an empty data frame if there is an error.
# = 2 Task solutions ======================================================
@ -160,7 +160,7 @@ features
# clicking on dbFetchPrositeFeatures() in the Environment pane.
# Test:
dbFetchPrositeFeatures("P39678")
dbFetchPrositeFeatures("Q5KMQ9")

View File

@ -21,10 +21,10 @@
#TOC> 2.08 dbAddAnnotation() 215
#TOC> 2.09 dbFetchUniProtSeq() 243
#TOC> 2.10 dbFetchPrositeFeatures() 289
#TOC> 2.11 node2text() 333
#TOC> 2.12 dbFetchNCBItaxData() 345
#TOC> 2.13 UniProtIDmap() 384
#TOC> 3 TESTS 423
#TOC> 2.11 node2text() 339
#TOC> 2.12 dbFetchNCBItaxData() 351
#TOC> 2.13 UniProtIDmap() 390
#TOC> 3 TESTS 429
#TOC>
#TOC> ==========================================================================
@ -297,6 +297,7 @@ dbFetchPrositeFeatures <- function(ID) {
# end num end of motif
# psID char PROSITE motif ID
# psName char PROSITE motif name
# psSeq char sequence annotated to the feature
# If the operation is not successful, a 0-length data frame is returned.
URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
@ -313,7 +314,7 @@ dbFetchPrositeFeatures <- function(ID) {
lines <- unlist(strsplit(httr::content(response, "text"), "\\n"))
patt <- sprintf("\\|%s\\|", UniProtID)
patt <- sprintf("\\|%s\\|", ID)
lines <- lines[grep(patt, lines)]
for (line in lines) {
@ -323,12 +324,17 @@ dbFetchPrositeFeatures <- function(ID) {
start = as.numeric(tokens[4]),
end = as.numeric(tokens[5]),
psID = tokens[6],
psName = tokens[7]))
psName = tokens[7],
psSeq = tokens[11]))
}
}
return(myFeatures)
}
if (FALSE) {
dbFetchPrositeFeatures("P33520") # RES1_SCHPO
}
# == 2.11 node2text() ======================================================
node2text <- function(doc, tag) {