# BIN-ALI-BLAST.R # # Purpose: A Bioinformatics Course: # R code accompanying the BIN-ALI-BLAST unit. # # Version: 0.1 # # Date: 2017 08 28 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: # 0.1 First code copied from 2016 material. # # TODO: # # # == DO NOT SIMPLY source() THIS FILE! ======================================= # If there are portions you don't understand, use R's help system, Google for an # answer, or ask your instructor. Don't continue if you don't understand what's # going on. That's not how it works ... # ============================================================================== # = 1 ___Section___ # BLAST.R # # Purpose: Send off one BLAST search and return parsed list of results # This script uses the BLAST URL-API # (Application Programming Interface) at the NCBI. # Read about the constraints here: # http://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYP=DeveloperInfo # # # Version: 1.0 # Date: 2016-09 # Author: Boris Steipe # # # ToDo: # Notes: The bioconducter "annotate" package contains code for BLAST searches, # in case you need to do something more involved. # # ============================================================================== # Dependencies: myEmail must exist as a global variable with # your valid email adress # waitTimer() must be loaded (it should have been loaded from # .utilities.R, which was sourced via .Rprofile) # library to interface with WebServers and process their XML/HTML # responses if (!require(xml2, quietly = TRUE)) { install.packages("xml2") library(xml2) } if (!require(httr, quietly = TRUE)) { install.packages("httr") library(httr) } parseBLAST_XML <- function(hit) { # parse one BLAST hit XML node with the xml2 package; # return a list h <- list() h$id <- xml_text(xml_find_first(hit, ".//Hit_accession")) h$def <- xml_text(xml_find_first(hit, ".//Hit_def")) h$bestE <- Inf h$sumLen <- 0 h$sumId <- 0 h$sumGap <- 0 hsps <- xml_find_all(hit, ".//Hsp") h$Hsp <- list() h$nHsps <- length(hsps) if (h$nHsps > 0) { for (i in 1:length(hsps)) { h$Hsp[[i]] <- list() h$Hsp[[i]]$e <- xml_numeric(hsps[i], ".//Hsp_evalue") h$Hsp[[i]]$q_from <- xml_numeric(hsps[i], ".//Hsp_query-from") h$Hsp[[i]]$q_to <- xml_numeric(hsps[i], ".//Hsp_query-to") h$Hsp[[i]]$h_from <- xml_numeric(hsps[i], ".//Hsp_hit-from") h$Hsp[[i]]$h_to <- xml_numeric(hsps[i], ".//Hsp_hit-to") h$Hsp[[i]]$h_identity <- xml_numeric(hsps[i], ".//Hsp_identity") h$Hsp[[i]]$h_gaps <- xml_numeric(hsps[i], ".//Hsp_gaps") h$Hsp[[i]]$h_len <- xml_numeric(hsps[i], ".//Hsp_align-len") h$Hsp[[i]]$qseq <- xml_text(xml_find_first(hsps[i], ".//Hsp_qseq")) h$Hsp[[i]]$mid <- xml_text(xml_find_first(hsps[i], ".//Hsp_midline")) h$Hsp[[i]]$hseq <- xml_text(xml_find_first(hsps[i], ".//Hsp_hseq")) h$bestE <- min(h$bestE, h$Hsp[[i]]$e) h$sumLen <- h$sumLen + h$Hsp[[i]]$h_len h$sumId <- h$sumId + h$Hsp[[i]]$h_identity h$sumGap <- h$sumGap + h$Hsp[[i]]$h_gaps } } return(h) } xml_numeric <- function(n, p) { # Utility: return first node matching xpath p in XML node n as numeric return(as.numeric(xml_text(xml_find_first(n, p)))) } BLAST <- function(q, db = "refseq_protein", nHits = 30, E = 3, limits = "\"\"", email = myEMail, rid = "", quietly = FALSE) { # Purpose: # Basic BLAST search # Version: 1.0 # Date: 2016-09 # Author: Boris Steipe # # Parameters: # q: query - either a valid ID or a sequence # db: "refseq_protein" by default, # other legal valuses include: "nr", "pdb", "swissprot" ... # nHits: number of hits to maximally return # E: E-value cutoff. Do not return hits whose score would be expected # to occur E or more times in a database of random sequence. # limits: a valid ENTREZ filter # email: a valid email address, defaults to global value myEMail # quietly: controls printing of wait-time progress bar # Value: # result: list of resulting hits and some metadata results <- list() results$rid <- rid results$rtoe <- 0 if (rid == "") { # prepare, send and analyse query results$query <- paste( "https://www.ncbi.nlm.nih.gov/blast/Blast.cgi", "?", "QUERY=", q, "&DATABASE=", db, "&HITLIST_SIZE=", as.character(nHits), "&EXPECT=", as.character(E), "&PROGRAM=", "blastp", "&ENTREZ_QUERY=", limits, "&NOHEADER=", "true", "&EMAIL=", email, "&CMD=Put", sep = "") # send it off ... response <- read_xml(results$query, as_html = TRUE) # find the comment node that contains the information we need # using an xpath expression info <- xml_find_first(response, "//comment()[contains(., \"QBlastInfo\")]") info <- xml_text(info) # extract its contents # parse results$rid <- regmatches(info, regexec("RID = (\\w+)", info))[[1]][2] results$rtoe <- regmatches(info, regexec("RTOE = (\\d+)", info))[[1]][2] results$rtoe <- as.numeric(results$rtoe) } # done analysing query # Now we wait ... if (quietly) { Sys.sleep(results$rtoe) } else { cat(sprintf("BLAST is processing %s:\n", results$rid)) waitTimer(results$rtoe) } # retrieve results from BLAST server URL <- paste("https://www.ncbi.nlm.nih.gov/blast/Blast.cgi", "?", "RID=", results$rid, "&FORMAT_TYPE=", "XML", "&EMAIL=", email, "&CMD=Get", sep = "") raw <- GET(URL) timeOut <- 300 nWait <- 0 while (raw$headers["content-type"] == "text/html" && nWait <= (timeOut/10)) { cat("Doesn't seem to be done. Wait some more (or click STOP to abort)\n") waitTimer(10) nWait <- nWait + 1 raw <- GET(URL) } # If we get to here, we received some result. But what? if (raw$headers["content-type"] == "text/html") { # Still HTML? Didn't complete ... stop(sprintf("Query >>%s<< didn't complete.", results$rid)) } else if (raw$headers["content-type"] == "application/xml") { # Good! response <- read_xml(raw) } else { # Unknown, abort. stop(sprintf("Unknown response type: >>%s<<.", raw$headers["content-type"])) } hits <- xml_find_all(response, ".//Hit") if (length(hits) == 0) { s <- "No hit returned.\n" s <- c(s, sprintf("Check your query string:\n>>%s<<\n", results$query)) s <- c(s, sprintf("and/or try again later by typing:\n", results$rid)) s <- c(s, sprintf(" BLAST(rid = \"%s\")\n", results$rid)) stop(paste(s, collapse = "")) } results$hits <- list() for (i in 1:length(hits)) { results$hits[[i]] <- parseBLAST_XML(hits[i]) } return(results) } # = 1 Tasks # ==== TESTS =================================================================== # q <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain # "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ", # "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP", # sep="") # q <- "NP_010227" # fungi <- "txid4751[ORGN]" # # test <- BLAST("NP_010227", # nHits = 1000, # E = 0.01, # limits = fungi) # length(test$hits) # [END]