# BLAST.R # # Purpose: Send off one BLAST search and return parsed list of results # This script uses the BLAST URL-API # (Application Programming Interface) at the NCBI. # Read about the constraints here: # https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo # # # Version: 3.2 # Date: 2016 09 - 2020 09 # Author: Boris Steipe # # Versions: # 3.2 2020 updates # 3.1 Change from require() to requireNamespace(), # use ::() idiom throughout # 3.0 parsing logic had not been fully implemented; Fixed. # 2.1 bugfix in BLAST(), bug was blanking non-split deflines; # refactored parseBLASTalignment() to handle lists with multiple hits. # 2.0 Completely rewritten because the interface completely changed. # Code adpated in part from NCBI Perl sample code: # $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $ # 1.0 first version posted for BCH441 2016, based on BLAST - API # # ToDo: Return the organism/strain name in the output, and propagate # into MYSPE selection script. # # Notes: This is somewhat pedestrian, but apparently there are currently # no R packages that contain such code. # # ============================================================================== if (! requireNamespace("httr", quietly = TRUE)) { install.packages("httr") } BLAST <- function(Q, db = "refseq_protein", nHits = 30, E = 0.1, limits = "", rid = "", query = "", quietly = FALSE, myTimeout = 120) { # Purpose: # Basic BLAST search # # Parameters: # Q: query - either a valid ID or a sequence # db: "refseq_protein" by default, # other legal values include: "nr", "pdb", "swissprot" ... # nHits: number of hits to maximally return # E: E-value cutoff. Do not return hits whose score would be expected # to occur E or more times in a database of random sequence. # limits: a valid ENTREZ filter # rid: a request ID - to retrieve earlier search results # query: the actual query string (needed when retrieving results # with an rid) # quietly: controls printing of wait-time progress bar # timeout: how much longer _after_ rtoe to wait for a result # before giving up (seconds) # Value: # result: list of process status or resulting hits, and some metadata EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done results <- list() results$query = query results$rid <- rid results$rtoe <- 0 if (rid == "") { # If no rid is available, spawn a search. # Else, proceed directly to retrieval. # prepare query, GET(), and parse rid and rtoe from BLAST server response results$query <- paste0("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi", "?", "CMD=Put", "&PROGRAM=", "blastp", "&QUERY=", URLencode(Q), "&DATABASE=", db, "&MATRIX=", "BLOSUM62", "&EXPECT=", as.character(E), "&HITLIST_SIZE=", as.character(nHits), "&ALIGNMENTS=", as.character(nHits), "&FORMAT_TYPE=Text") if (limits != "") { results$query <- paste0( results$query, "&ENTREZ_QUERY=", limits) } # send it off ... response <- httr::GET(results$query) if (httr::http_status(response)$category != "Success" ) { stop(sprintf("PANIC: Can't send query. BLAST server status error: %s", httr::http_status(response)$message)) } txt <- httr::content(response, "text", encoding = "UTF-8") patt <- "RID = (\\w+)" # match the request id results$rid <- regmatches(txt, regexec(patt, txt))[[1]][2] patt <- "RTOE = (\\d+)" # match the expected completion time results$rtoe <- as.numeric(regmatches(txt, regexec(patt, txt))[[1]][2]) # Now we wait ... if (quietly) { Sys.sleep(results$rtoe) } else { cat(sprintf("BLAST is processing %s:\n", results$rid)) waitTimer(results$rtoe) } } # done sending query and retrieving rid, rtoe # Enter an infinite loop to check for result availability checkStatus <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi", "?", "CMD=Get", "&RID=", results$rid, "&FORMAT_TYPE=Text", "&FORMAT_OBJECT=SearchInfo", sep = "") while (TRUE) { # Check whether the result is ready response <- httr::GET(checkStatus) if (httr::http_status(response)$category != "Success" ) { stop(sprintf("PANIC: Can't check status. BLAST server status error: %s", httr::http_status(response)$message)) } txt <- httr::content(response, "text", encoding = "UTF-8") if (length(grep("Status=WAITING", txt)) > 0) { myTimeout <- myTimeout - EXTRAWAIT if (myTimeout <= 0) { # abort cat("BLAST search not concluded before timeout. Aborting.\n") cat(sprintf("%s BLASThits <- BLAST(rid=\"%s\")\n", "Trying checking back later with >", results$rid)) return(results) } if (quietly) { Sys.sleep(EXTRAWAIT) } else { cat(sprintf("Status: Waiting. Wait %d more seconds (max. %d more)", EXTRAWAIT, myTimeout)) waitTimer(EXTRAWAIT) next } } else if (length(grep("Status=FAILED", txt)) > 0) { cat("BLAST search returned status \"FAILED\". Aborting.\n") return(results) } else if (length(grep("Status=UNKNOWN", txt)) > 0) { cat("BLAST search returned status \"UNKNOWN\".\n") cat("This probably means the rid has expired. Aborting.\n") return(results) } else if (length(grep("Status=READY", txt)) > 0) { # Done if (length(grep("ThereAreHits=yes", txt)) == 0) { # No hits cat("BLAST search ready but no hits found. Aborting.\n") return(results) } else { break # done ... retrieve search result } } } # end result-check loop # retrieve results from BLAST server retrieve <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi", "?", "&CMD=Get", "&RID=", results$rid, "&FORMAT_TYPE=Text", sep = "") response <- httr::GET(retrieve) if (httr::http_status(response)$category != "Success" ) { stop(sprintf("PANIC: Can't retrieve. BLAST server status error: %s", httr::http_status(response)$message)) } txt <- httr::content(response, "text", encoding = "UTF-8") # txt contains the whole set of results. Process: # First, we strsplit() on linebreaks: txt <- unlist(strsplit(txt, "\n")) # The alignments range from the first line that begins with ">" ... iFirst <- grep("^>", txt)[1] # ... to the last line that begins with "Sbjct" x <- grep("^Sbjct", txt) iLast <- x[length(x)] # Get the alignments block txt <- txt[iFirst:iLast] # Drop empty lines txt <- txt[!(nchar(txt) == 0)] # A line that ends "]" but does not begin ">" seems to be a split # defline ... eg. # [1] ">XP_013349208.1 AUEXF2481DRAFT_695809 [Aureobasidium subglaciale " # [2] "EXF-2481]" # Merge these lines to the preceding lines and delete them. # x <- which(grepl("]$", txt) & !(grepl("^>", txt))) if (length(x) > 0) { txt[x-1] <- paste0(txt[x-1], txt[x]) txt <- txt[-x] } # Special case: there may be multiple deflines when the BLAST hit is to # redundant, identical sequences. Keep only the first instance. iKeep <- ! grepl("^>", txt) x <- rle(iKeep) x$positions <- cumsum(x$lengths) i <- which(x$lengths > 1 & x$values == FALSE) if (length(i) > 0) { firsts <- x$positions[i] - x$lengths[i] + 1 iKeep[firsts] <- TRUE txt <- txt[iKeep] } # After this preprocessing the following should be true: # - Every alignment block begins with a defline in which the # first character is ">" # - There is only one defline in each block. # - Lines are not split. # Make a dataframe of first and last indices of alignment blocks x <- grep("^>", txt) blocks <- data.frame(iFirst = x, iLast = c((x[-1] - 1), length(txt))) # Build the hits list by parsing the blocks results$hits <- list() for (i in seq_len(nrow(blocks))) { thisBlock <- txt[blocks$iFirst[i]:blocks$iLast[i]] results$hits[[i]] <- parseBLASTalignment(thisBlock) } return(results) } parseBLASTalignment <- function(hit) { # Parse data from a character vector containing a BLAST hit # Parameters: # hit char one BLAST hit as char vector # Value: # list $def chr defline # $accession chr accession number # $organism chr complete organism definition # $species chr binomial species # $E num E value # $lengthAli num length of the alignment # $nIdentitites num number of identities # $nGaps num number of gaps # $Qbounds num 2-element vector of query start-end # $Sbounds num 2-element vector of subject start-end # $Qseq chr query sequence # $midSeq chr midline string # $Sseq chr subject sequence getToken <- function(patt, v) { # get the first token identified by pattern patt in character vector v v <- v[grep(patt, v)] if (length(v) > 1) { v <- v[1] } if (length(v) == 0) { token <- NA } else { token <- regmatches(v, regexec(patt, v))[[1]][2] } return(token) } h <- list() # FASTA defline h$def <- hit[1] # accesion number (ID), use the first if there are several, separated by "|" patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|" h$accession <- regmatches(h$def, regexec(patt, h$def))[[1]][2] # organism patt <- "\\[(.+)]" h$organism <- regmatches(h$def, regexec(patt, h$def))[[1]][2] # species x <- unlist(strsplit(h$organism, "\\s+")) if (length(x) >= 2) { h$species <- paste(x[1], x[2]) } else if (length(x) == 1) { h$species <- paste(x[1], "sp.") } else { h$species <- NA } # E-value h$E <- as.numeric(getToken("Expect\\s*=(.+?), Method", hit)) # length of alignment h$lengthAli <- as.numeric(getToken("^\\s*Length\\s*=(.+)$", hit)) # number of identities h$nIdentities <- as.numeric(getToken("^\\s*Identities\\s*=(.+?)/", hit)) # number of gaps h$nGaps <- as.numeric(getToken("\\s*Gaps\\s*=(.+?)/", hit)) # split up alignment section idx <- grep("^Query ", hit) Que <- hit[idx] Mid <- hit[idx + 1] Sbj <- hit[idx + 2] # first and last positions h$Qbounds <- c(start = 0, end = 0) h$Qbounds[1] <- as.numeric(getToken("^Query\\s*(\\d+)", Que[1])) h$Qbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Que[length(Que)])) h$Sbounds <- c(start = 0, end = 0) h$Sbounds[1] <- as.numeric(getToken("^Sbjct\\s*(\\d+)", Sbj[1])) h$Sbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Sbj[length(Sbj)])) # aligned sequences for (i in seq_along(Que)) { patt <- ("^\\s*Query\\s*\\d+\\s*([A-Za-z-]+)") # capture aligned string m <- regexec(patt, Que[i]) iFirst <- m[[1]][2] iLast <- iFirst + attr(m[[1]], which = "match.length")[2] - 1 Que[i] <- substring(Que[i], iFirst, iLast) Mid[i] <- substring(Mid[i], iFirst, iLast) Sbj[i] <- substring(Sbj[i], iFirst, iLast) } h$Qseq <- paste0(Que, collapse = "") h$midSeq <- paste0(Mid, collapse = "") h$Sseq <- paste0(Sbj, collapse = "") return(h) } # ==== TESTS =================================================================== if (FALSE) { # define query: q <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ", "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP", sep="") # or ... q <- "NP_010227" # refseq ID test <- BLAST(q, nHits = 100, E = 0.001, rid = "", limits = "txid4751[ORGN]") # Fungi str(test) length(test$hits) } # [END]