New unit
This commit is contained in:
parent
96e559429d
commit
59a1341796
192
BIN-Data_integration.R
Normal file
192
BIN-Data_integration.R
Normal file
@ -0,0 +1,192 @@
|
||||
# BIN-Data_integration.R
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-Data_integration unit.
|
||||
#
|
||||
# Version: 1.0
|
||||
#
|
||||
# Date: 2017 10 08
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.0 First live version
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
# Develop a fungi-specific BioMart example.
|
||||
# (cf.
|
||||
# https://cran.r-project.org/web/packages/biomartr/vignettes/Functional_Annotation.html )
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -------------------------------------------
|
||||
#TOC> 1 Identifier mapping 41
|
||||
#TOC> 2 Cross-referencing tables 142
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# = 1 Identifier mapping ==================================================
|
||||
|
||||
# UniProt provides a well-designed ID mapping tool that can be accessed
|
||||
# online at http://www.uniprot.org/mapping/
|
||||
#
|
||||
# Here we will use the UniProt Web API for this tool to map identifiers. The
|
||||
# UniProt ID mapping service supports a "RESTful API": responses can be obtained
|
||||
# simply via a Web- browsers request. Such requests are commonly sent via the
|
||||
# GET or POST verbs that a Webserver responds to, when a client asks for data.
|
||||
# GET requests are visible in the URL of the request; POST requests are not
|
||||
# directly visible, they are commonly used to send the contents of forms, or
|
||||
# when transmitting larger, complex data items. The UniProt ID mapping sevice
|
||||
# can accept long lists of IDs, thus using the POST mechanism makes sense. GET()
|
||||
# and POST() functions are part of the httr package.
|
||||
|
||||
# To begin, we load httr, which supports sending and receiving data via the
|
||||
# http protocol, just like a Web browser.
|
||||
if (!require(httr, quietly=TRUE)) {
|
||||
install.packages("httr")
|
||||
library(httr)
|
||||
}
|
||||
|
||||
# We will walk through the process with the refSeqID
|
||||
# of yeast Mbp1 and Swi4, and we will also enter a dummy ID to check what
|
||||
# happens if the ID can't be mapped:
|
||||
myQueryIDs <- "NP_010227 NP_00000 NP_011036"
|
||||
|
||||
|
||||
# The UniProt ID mapping service API is very straightforward to use: just define
|
||||
# the URL of the server and send a list of items labelled as "query" in the body
|
||||
# of the request.
|
||||
|
||||
URL <- "http://www.uniprot.org/mapping/"
|
||||
response <- POST(URL,
|
||||
body = list(from = "P_REFSEQ_AC", # Refseq Protein
|
||||
to = "ACC", # UniProt ID
|
||||
format = "tab",
|
||||
query = myQueryIDs))
|
||||
|
||||
response
|
||||
|
||||
# We need to check the status code - if it is not 200, an error ocurred and we
|
||||
# can't process the result:
|
||||
status_code(response)
|
||||
|
||||
# If the query is successful, tabbed text is returned. We can assign that to a
|
||||
# data frame. Note that we use textConnection() to read data directly from a char object, which can go in the spot where read.delim() expects a file-name argument.
|
||||
|
||||
myMappedIDs <- read.delim(file = textConnection(content(response)),
|
||||
sep = "\t",
|
||||
stringsAsFactors = FALSE)
|
||||
myMappedIDs
|
||||
|
||||
# If the query can't be fulfilled because of a problem with the server, a
|
||||
# WebPage is returned. But the server status is also returned and we can check
|
||||
# the status code. I have lately gotten many "503" status codes: Server Not
|
||||
# Available...
|
||||
|
||||
# We wrap this into a function:
|
||||
|
||||
myIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
|
||||
# Use UniProt ID mapping service to map one or more IDs
|
||||
# Parameters:
|
||||
# s char A string of separated IDs
|
||||
# mapFrom char the database in which the IDs in s are valid. Default
|
||||
# is RefSeq protein
|
||||
# mapTo char the database in which the target IDs are valid. Default
|
||||
# is UniProtKB
|
||||
# Value
|
||||
# a data frame of mapped IDs, with column names From and To, or an
|
||||
# empty data frame if the mapping was unsuccessful. No rows are returned
|
||||
# for IDs that are not mapped.
|
||||
|
||||
URL <- "http://www.uniprot.org/mapping/"
|
||||
response <- POST(URL,
|
||||
body = list(from = mapFrom,
|
||||
to = mapTo,
|
||||
format = "tab",
|
||||
query = s))
|
||||
|
||||
if (status_code(response) == 200) { # 200: oK
|
||||
myMap <- read.delim(file = textConnection(content(response)),
|
||||
sep = "\t",
|
||||
stringsAsFactors = FALSE)
|
||||
} else {
|
||||
myMap <- data.frame()
|
||||
warning(paste("No uniProt ID mapping returned:",
|
||||
"server sent status",
|
||||
status_code(response)))
|
||||
}
|
||||
|
||||
return(myMap)
|
||||
}
|
||||
|
||||
# Try it out ...
|
||||
myIDmap("NP_010227 NP_011036 NP_012881 NP_013729 NP_012165")
|
||||
|
||||
# A function UniProtIDmap() is in the ABC-dbUtilities.R script and it is loaded
|
||||
# into your workspace on startup.
|
||||
|
||||
|
||||
# = 2 Cross-referencing tables ============================================
|
||||
|
||||
# Sometimes we get the IDs we need to map in a large table, e.g. from a list of
|
||||
# genes in a model organism database such as SGD, or from the Human Genen
|
||||
# Nomenclature commission. How do we map one set of identifiers to another one?
|
||||
|
||||
# The function to use is match().
|
||||
# Here is a tiny set of identifiers taken from a much larger table to illustrate the principle:
|
||||
#
|
||||
|
||||
myIDs <- data.frame(uID = c("P38903", "P31383", "P47177", "P47096", "Q07747",
|
||||
"Q08641", "P47129", "P52910", "P00330", "P81450"),
|
||||
name = c("2A5D", "2AAA", "2NDP", "3HAO", "AAD4",
|
||||
"AB140", "ACF4", "ACS2", "ADH1", "ATP18"),
|
||||
refID = c("NP_014657", "NP_009386",
|
||||
"NP_012683", "NP_012559",
|
||||
"NP_010038", "NP_014882",
|
||||
"NP_012616", "NP_013254",
|
||||
"NP_014555", "NP_013629"),
|
||||
stringsAsFactors = FALSE)
|
||||
|
||||
myIDs
|
||||
|
||||
# Say we want to map "NP_010038", "NP_012559", and "NP_013629", in that order to
|
||||
# their gene names.
|
||||
myQuery <- c("NP_010038", "NP_999999", "NP_013629")
|
||||
|
||||
# %in% will only tell us if these IDs are present in the table:
|
||||
myQuery %in% myIDs$refID
|
||||
|
||||
# ... but not where they are located. But match() does what we need here:
|
||||
match(myQuery, myIDs$refID)
|
||||
|
||||
# ... and we can use the result to subset the column that we want to map to:
|
||||
myIDs$name[match(myQuery, myIDs$refID)]
|
||||
|
||||
# Note that the output preserves the NA - i.e. the length of the mapped
|
||||
# values is exactly the same as the length of the query.
|
||||
|
||||
# task: map the three genes to their UniProt Identifier.
|
||||
|
||||
|
||||
#
|
||||
# Note: if you want to do very many queries in large tables, use the
|
||||
# fmatch() function in the "fastmatch" package for a considerable
|
||||
# speedup.
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
Loading…
Reference in New Issue
Block a user