bch441-work-abc-units/BIN-ALI-BLAST.R
2017-09-12 16:09:20 -04:00

257 lines
7.5 KiB
R

# BIN-ALI-BLAST.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-BLAST unit.
#
# Version: 0.1
#
# Date: 2017 08 28
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 0.1 First code copied from 2016 material.
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
# ==============================================================================
# = 1 ___Section___
# BLAST.R
#
# Purpose: Send off one BLAST search and return parsed list of results
# This script uses the BLAST URL-API
# (Application Programming Interface) at the NCBI.
# Read about the constraints here:
# http://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYP=DeveloperInfo
#
#
# Version: 1.0
# Date: 2016-09
# Author: Boris Steipe
#
#
# ToDo:
# Notes: The bioconducter "annotate" package contains code for BLAST searches,
# in case you need to do something more involved.
#
# ==============================================================================
# Dependencies: myEmail must exist as a global variable with
# your valid email adress
# waitTimer() must be loaded (it should have been loaded from
# .utilities.R, which was sourced via .Rprofile)
# library to interface with WebServers and process their XML/HTML
# responses
if (!require(xml2, quietly = TRUE)) {
install.packages("xml2")
library(xml2)
}
if (!require(httr, quietly = TRUE)) {
install.packages("httr")
library(httr)
}
parseBLAST_XML <- function(hit) {
# parse one BLAST hit XML node with the xml2 package;
# return a list
h <- list()
h$id <- xml_text(xml_find_first(hit, ".//Hit_accession"))
h$def <- xml_text(xml_find_first(hit, ".//Hit_def"))
h$bestE <- Inf
h$sumLen <- 0
h$sumId <- 0
h$sumGap <- 0
hsps <- xml_find_all(hit, ".//Hsp")
h$Hsp <- list()
h$nHsps <- length(hsps)
if (h$nHsps > 0) {
for (i in 1:length(hsps)) {
h$Hsp[[i]] <- list()
h$Hsp[[i]]$e <- xml_numeric(hsps[i], ".//Hsp_evalue")
h$Hsp[[i]]$q_from <- xml_numeric(hsps[i], ".//Hsp_query-from")
h$Hsp[[i]]$q_to <- xml_numeric(hsps[i], ".//Hsp_query-to")
h$Hsp[[i]]$h_from <- xml_numeric(hsps[i], ".//Hsp_hit-from")
h$Hsp[[i]]$h_to <- xml_numeric(hsps[i], ".//Hsp_hit-to")
h$Hsp[[i]]$h_identity <- xml_numeric(hsps[i], ".//Hsp_identity")
h$Hsp[[i]]$h_gaps <- xml_numeric(hsps[i], ".//Hsp_gaps")
h$Hsp[[i]]$h_len <- xml_numeric(hsps[i], ".//Hsp_align-len")
h$Hsp[[i]]$qseq <- xml_text(xml_find_first(hsps[i], ".//Hsp_qseq"))
h$Hsp[[i]]$mid <- xml_text(xml_find_first(hsps[i], ".//Hsp_midline"))
h$Hsp[[i]]$hseq <- xml_text(xml_find_first(hsps[i], ".//Hsp_hseq"))
h$bestE <- min(h$bestE, h$Hsp[[i]]$e)
h$sumLen <- h$sumLen + h$Hsp[[i]]$h_len
h$sumId <- h$sumId + h$Hsp[[i]]$h_identity
h$sumGap <- h$sumGap + h$Hsp[[i]]$h_gaps
}
}
return(h)
}
xml_numeric <- function(n, p) {
# Utility: return first node matching xpath p in XML node n as numeric
return(as.numeric(xml_text(xml_find_first(n, p))))
}
BLAST <- function(q,
db = "refseq_protein",
nHits = 30,
E = 3,
limits = "\"\"",
email = myEMail,
rid = "",
quietly = FALSE) {
# Purpose:
# Basic BLAST search
# Version: 1.0
# Date: 2016-09
# Author: Boris Steipe
#
# Parameters:
# q: query - either a valid ID or a sequence
# db: "refseq_protein" by default,
# other legal valuses include: "nr", "pdb", "swissprot" ...
# nHits: number of hits to maximally return
# E: E-value cutoff. Do not return hits whose score would be expected
# to occur E or more times in a database of random sequence.
# limits: a valid ENTREZ filter
# email: a valid email address, defaults to global value myEMail
# quietly: controls printing of wait-time progress bar
# Value:
# result: list of resulting hits and some metadata
results <- list()
results$rid <- rid
results$rtoe <- 0
if (rid == "") { # prepare, send and analyse query
results$query <- paste(
"https://www.ncbi.nlm.nih.gov/blast/Blast.cgi",
"?",
"QUERY=", q,
"&DATABASE=", db,
"&HITLIST_SIZE=", as.character(nHits),
"&EXPECT=", as.character(E),
"&PROGRAM=", "blastp",
"&ENTREZ_QUERY=", limits,
"&NOHEADER=", "true",
"&EMAIL=", email,
"&CMD=Put",
sep = "")
# send it off ...
response <- read_xml(results$query, as_html = TRUE)
# find the comment node that contains the information we need
# using an xpath expression
info <- xml_find_first(response,
"//comment()[contains(., \"QBlastInfo\")]")
info <- xml_text(info) # extract its contents
# parse
results$rid <- regmatches(info,
regexec("RID = (\\w+)", info))[[1]][2]
results$rtoe <- regmatches(info,
regexec("RTOE = (\\d+)", info))[[1]][2]
results$rtoe <- as.numeric(results$rtoe)
} # done analysing query
# Now we wait ...
if (quietly) {
Sys.sleep(results$rtoe)
} else {
cat(sprintf("BLAST is processing %s:\n", results$rid))
waitTimer(results$rtoe)
}
# retrieve results from BLAST server
URL <- paste("https://www.ncbi.nlm.nih.gov/blast/Blast.cgi",
"?",
"RID=", results$rid,
"&FORMAT_TYPE=", "XML",
"&EMAIL=", email,
"&CMD=Get",
sep = "")
raw <- GET(URL)
timeOut <- 300
nWait <- 0
while (raw$headers["content-type"] == "text/html" && nWait <= (timeOut/10)) {
cat("Doesn't seem to be done. Wait some more (or click STOP to abort)\n")
waitTimer(10)
nWait <- nWait + 1
raw <- GET(URL)
}
# If we get to here, we received some result. But what?
if (raw$headers["content-type"] == "text/html") { # Still HTML? Didn't complete ...
stop(sprintf("Query >>%s<< didn't complete.", results$rid))
} else if (raw$headers["content-type"] == "application/xml") { # Good!
response <- read_xml(raw)
} else { # Unknown, abort.
stop(sprintf("Unknown response type: >>%s<<.", raw$headers["content-type"]))
}
hits <- xml_find_all(response, ".//Hit")
if (length(hits) == 0) {
s <- "No hit returned.\n"
s <- c(s, sprintf("Check your query string:\n>>%s<<\n", results$query))
s <- c(s, sprintf("and/or try again later by typing:\n", results$rid))
s <- c(s, sprintf(" BLAST(rid = \"%s\")\n", results$rid))
stop(paste(s, collapse = ""))
}
results$hits <- list()
for (i in 1:length(hits)) {
results$hits[[i]] <- parseBLAST_XML(hits[i])
}
return(results)
}
# = 1 Tasks
# ==== TESTS ===================================================================
# q <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain
# "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
# "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
# sep="")
# q <- "NP_010227"
# fungi <- "txid4751[ORGN]"
#
# test <- BLAST("NP_010227",
# nHits = 1000,
# E = 0.01,
# limits = fungi)
# length(test$hits)
# [END]