2020-09-18 11:56:30 +00:00
|
|
|
# tocID <- "RPR-UniProt_GET.R"
|
|
|
|
#
|
|
|
|
# ---------------------------------------------------------------------------- #
|
|
|
|
# PATIENCE ... #
|
|
|
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
|
|
|
# boris.steipe@utoronto.ca #
|
|
|
|
# ---------------------------------------------------------------------------- #
|
2017-10-06 12:49:43 +00:00
|
|
|
#
|
|
|
|
# Purpose: A Bioinformatics Course:
|
|
|
|
# R code accompanying the RPR-Scripting_data_downloads unit.
|
|
|
|
#
|
2019-01-08 07:11:25 +00:00
|
|
|
# Version: 1.1
|
2017-10-06 12:49:43 +00:00
|
|
|
#
|
2019-01-08 07:11:25 +00:00
|
|
|
# Date: 2017 10 - 2019 01
|
2017-10-06 12:49:43 +00:00
|
|
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
|
|
|
#
|
|
|
|
# Versions:
|
2019-01-08 07:11:25 +00:00
|
|
|
# 1.1 Change from require() to requireNamespace(),
|
|
|
|
# use <package>::<function>() idiom throughout
|
2017-10-06 12:49:43 +00:00
|
|
|
# 1.0 First ABC units version
|
|
|
|
# 0.1 First code copied from 2016 material.
|
|
|
|
#
|
|
|
|
#
|
|
|
|
# TODO:
|
|
|
|
#
|
|
|
|
#
|
|
|
|
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
|
|
|
#
|
|
|
|
# If there are portions you don't understand, use R's help system, Google for an
|
|
|
|
# answer, or ask your instructor. Don't continue if you don't understand what's
|
|
|
|
# going on. That's not how it works ...
|
|
|
|
#
|
|
|
|
# ==============================================================================
|
2017-10-29 03:05:53 +00:00
|
|
|
|
|
|
|
|
2017-10-06 12:49:43 +00:00
|
|
|
#TOC> ==========================================================================
|
2020-09-18 11:56:30 +00:00
|
|
|
#TOC>
|
2019-01-08 07:11:25 +00:00
|
|
|
#TOC> Section Title Line
|
|
|
|
#TOC> ----------------------------------------------------------
|
|
|
|
#TOC> 1 UniProt files via GET 41
|
|
|
|
#TOC> 1.1 Task - fetchUniProtSeq() function 103
|
|
|
|
#TOC> 2 Task solutions 110
|
2020-09-18 11:56:30 +00:00
|
|
|
#TOC>
|
2017-10-06 12:49:43 +00:00
|
|
|
#TOC> ==========================================================================
|
|
|
|
|
|
|
|
|
|
|
|
# = 1 UniProt files via GET ===============================================
|
|
|
|
|
|
|
|
|
|
|
|
# Perhaps the simplest example of scripted download is to retrieve a protein
|
|
|
|
# FASTA sequence from UniProt. All we need is to construct a URL with the
|
|
|
|
# correct UniProt ID.
|
|
|
|
|
|
|
|
# An interface between R scripts and Web servers is provided by the httr
|
|
|
|
# package. This sends and receives information via the http protocol, just like
|
|
|
|
# a Web browser. Since this is a short and simple request, the GET verb is the
|
|
|
|
# right tool:
|
|
|
|
|
2019-01-08 07:11:25 +00:00
|
|
|
if (! requireNamespace("httr", quietly = TRUE)) {  # check whether httr is available, without attaching it
|
2017-10-06 12:49:43 +00:00
|
|
|
install.packages("httr")  # install httr only when it is missing
|
|
|
|
}
|
2017-10-29 03:05:53 +00:00
|
|
|
# Package information:
|
|
|
|
# library(help = httr) # basic information
|
|
|
|
# browseVignettes("httr") # available vignettes
|
|
|
|
# data(package = "httr") # available datasets
|
|
|
|
|
2017-10-06 12:49:43 +00:00
|
|
|
|
|
|
|
# The UniProt ID for Mbp1 is ...
|
|
|
|
|
|
|
|
UniProtID <- "P39678"  # ID that is inserted into the request URLs below
|
|
|
|
|
|
|
|
# and the base URL to retrieve data is ...
|
|
|
|
# http://www.uniprot.org/uniprot/ . We can construct a simple URL to
|
|
|
|
# retrieve a FASTA sequence:
|
|
|
|
|
|
|
|
(URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID))  # outer parentheses print the assigned value
|
|
|
|
|
|
|
|
# the GET() function from httr will get the data.
|
2019-01-08 07:11:25 +00:00
|
|
|
response <- httr::GET(URL)  # send the GET request and keep the server's response object
|
2017-10-06 12:49:43 +00:00
|
|
|
|
|
|
|
str(response) # the response object is a bit complex ...
|
|
|
|
as.character(response) # ... but it is easy to pull out the data.
|
|
|
|
|
|
|
|
# to process ...
|
|
|
|
x <- as.character(response)  # pull the returned data out of the response as text
|
|
|
|
x <- strsplit(x, "\n")  # split the text at line breaks; note that strsplit() returns a list
|
|
|
|
dbSanitizeSequence(x)  # NOTE(review): helper is not defined in this script - presumably from the course's dbUtilities script; confirm it accepts the list produced by strsplit()
|
|
|
|
|
|
|
|
# Simple.
|
|
|
|
# But what happens if there is an error, e.g. the uniprot ID does not exist?
|
|
|
|
|
2019-01-08 07:11:25 +00:00
|
|
|
response <- httr::GET("http://www.uniprot.org/uniprot/X000000.fasta")  # "X000000" stands in for an ID that does not exist
|
2017-10-06 12:49:43 +00:00
|
|
|
as.character(response)  # print what the server sent back for the failed request
|
|
|
|
# this is a large HTML page that tells us the URL was not found. So we need to
|
2019-01-08 07:11:25 +00:00
|
|
|
# check for errors. The Right Way to do this is to evaluate the status code that
|
2017-10-06 12:49:43 +00:00
|
|
|
# every Web server returns for every transaction.
|
|
|
|
#
|
2019-01-08 07:11:25 +00:00
|
|
|
httr::status_code(response) # 404 == Page Not Found
|
2017-10-06 12:49:43 +00:00
|
|
|
|
|
|
|
# There are many possible codes, but the only code we will be happy with
|
|
|
|
# is 200 - OK.
|
|
|
|
# (cf. https://en.wikipedia.org/wiki/List_of_HTTP_status_codes )
|
|
|
|
|
|
|
|
URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID)  # rebuild the URL from the ID stored in UniProtID
|
2019-01-08 07:11:25 +00:00
|
|
|
response <- httr::GET(URL)  # repeat the request, overwriting the failed response from above
|
|
|
|
httr::status_code(response)  # should be 200 if the request succeeded
|
2017-10-06 12:49:43 +00:00
|
|
|
|
|
|
|
|
|
|
|
# == 1.1 Task - fetchUniProtSeq() function =================================
|
|
|
|
|
|
|
|
# Task: write a function that takes as input a UniProt ID, fetches the
|
|
|
|
# FASTA sequence, returns only the sequence if the operation is successful, or
|
|
|
|
# a vector of length 0 if there is an error.
|
|
|
|
|
|
|
|
|
|
|
|
# = 2 Task solutions ======================================================
|
|
|
|
|
|
|
|
|
|
|
|
# I have placed such a function into the dbUtilities script: look it up by
|
|
|
|
# clicking on dbFetchUniProtSeq() in the Environment pane.
|
|
|
|
|
|
|
|
# Test:
|
|
|
|
dbFetchUniProtSeq("P39678")  # same ID as UniProtID above; the function is defined outside this script (see the note above about dbUtilities)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# [END]
|