bch441-work-abc-units/RPR-PROSITE_POST.R

169 lines
6.3 KiB
R
Raw Permalink Normal View History

2021-11-16 05:31:48 +00:00
# tocID <- "RPR-PROSITE_POST.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Scripting_data_downloads unit.
#
# Version: 1.2
#
# Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 2020 Maintenance
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# 1.0.1 Updates for slightly changed interfaces
# 1.0 First ABC units version
# 0.1 First code copied from 2016 material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------------------------------
#TOC> 1 Constructing a POST command from a Web query 43
#TOC> 1.1 Task - fetchPrositeFeatures() function 148
#TOC> 2 Task solutions 156
#TOC>
#TOC> ==========================================================================
# = 1 Constructing a POST command from a Web query ========================
if (! requireNamespace("httr", quietly = TRUE)) {
install.packages("httr")
}
# Package information:
# library(help = httr) # basic information
# browseVignettes("httr") # available vignettes
# data(package = "httr") # available datasets
# We have reverse engineered the Web form for a ScanProsite request, and can
# construct a valid POST request from knowing the required field names. The POST
# command is similar to GET(), but we need an explicit request body that
# contains a list of key/value pairs
UniProtID <- "P39678"
URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
response <- httr::POST(URL,
body = list(meta = "opt1",
meta1_protein = "opt1",
seq = UniProtID,
skip = "on",
output = "tabular"))
# Send off this request, and you should have a response in a few
# seconds. Let's check the status first:
httr::status_code(response) # If this is not 200, something went wrong and it
# makes no sense to continue. If this persists, ask
# on the Discussion Board what to do.
# The text contents of the response is available with the
# content() function:
httr::content(response, "text")
# ... should show you the same as the page contents that you have seen in the
# browser. Now we need to extract the data from the page. For this simple
# example we can get away with using regular expressions, but in general we need
# a real XML parser to parse HTML. We'll cover that in a later unit. Here, we
# strsplit() the response into individual lines, since each of our data elements
# is on its own line, and then capture the contents. The way Prosite has
# formatted their HTML we can simply split on the "\\n" newline character - but
# they could write the same valid HTML without any newline-characters at all.
# Understand that we are working with a bit of a "hack" here: exploting
# empirical assumptions rather than a formal specification. But sometimes quick
# and dirty is fine, because quick.
lines <- unlist(strsplit(httr::content(response, "text"), "\\n"))
head(lines)
# Now we define a query pattern for the lines we want:
# we can use the uID, bracketed by two "|" pipe
# characters:
patt <- sprintf("\\|%s\\|", UniProtID)
# ... and select only the lines that match this
# pattern:
( lines <- lines[grep(patt, lines)] )
# ... captures the three lines of output.
# Now we break the lines apart into tokens: this is another application of
# strsplit(), but this time we split either on "pipe" characters, "|" OR on tabs
# "\t". Look at the regex "\\t|\\|" in the strsplit() call:
unlist(strsplit(lines[1], "\\t|\\|"))
# Its parts are (\\t)=tab (|)=or (\\|)=pipe. Both "t" and "|" need to be escaped
# with a backslash. "t" has to be escaped because we want to match a tab (\t),
# not the literal character "t". And "|" has to be escaped because we mean the
# literal pipe character, not its metacharacter meaning OR. Thus sometimes the
# backslash turns a special meaning off, and sometimes it turns a special
# meaning on. Unfortunately there's no easy way to tell - you just need to
# remember the characters - or have a reference handy. The metacharacters are
# (){}[]^$?*+.|&- ... and some of them have different meanings depending on
# where in the regex they are.
# Let's put the tokens into named slots of a data frame
features <- data.frame()
for (line in lines) {
tokens <- unlist(strsplit(line, "\\t|\\|"))
features <- rbind(features,
data.frame(uID = tokens[2],
start = as.numeric(tokens[4]),
end = as.numeric(tokens[5]),
psID = tokens[6],
psName = tokens[7],
psSeq = tokens[11]))
}
features
# This forms the base of a function that collects the features automatically
# from a PrositeScan result. You can write this!
# == 1.1 Task - fetchPrositeFeatures() function ============================
# Task: write a function that takes as input a UniProt ID, fetches the
# features it contains from ScanProsite and returns a data frame as given above, or
# an empty data frame if there is an error.
# = 2 Task solutions ======================================================
# I have placed such a function into the ABC-dbUtilities.R script: look it up by
# clicking on dbFetchPrositeFeatures() in the Environment pane.
# Test:
dbFetchPrositeFeatures("Q5KMQ9")
# [END]