385 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			R
		
	
	
	
	
	
			
		
		
	
	
			385 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			R
		
	
	
	
	
	
| # BLAST.R
 | |
| #
 | |
| # Purpose: Send off one BLAST search and return parsed list of results
 | |
| #          This script uses the BLAST URL-API
 | |
| #          (Application Programming Interface) at the NCBI.
 | |
| #          Read about the constraints here:
 | |
| #          https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
 | |
| #
 | |
| #
 | |
| # Version: 3.2
 | |
| # Date:    2016 09 - 2020 09
 | |
| # Author:  Boris Steipe
 | |
| #
 | |
| # Versions:
 | |
| #    3.2   2020 updates
 | |
| #    3.1   Change from require() to requireNamespace(),
 | |
| #          use <package>::<function>() idiom throughout
 | |
| #    3.0   parsing logic had not been fully implemented; Fixed.
 | |
| #    2.1   bugfix in BLAST(), bug was blanking non-split deflines;
 | |
| #          refactored parseBLASTalignment() to handle lists with multiple hits.
 | |
| #    2.0   Completely rewritten because the interface completely changed.
 | |
| #          Code adpated in part from NCBI Perl sample code:
 | |
| #          $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
 | |
| #    1.0   first version posted for BCH441 2016, based on BLAST - API
 | |
| #
 | |
| # ToDo:    Return the organism/strain name in the output, and propagate
 | |
| #          into MYSPE selection script.
 | |
| #
 | |
| # Notes:   This is somewhat pedestrian, but apparently there are currently
 | |
| #          no R packages that contain such code.
 | |
| #
 | |
| # ==============================================================================
 | |
| 
 | |
| 
 | |
| if (! requireNamespace("httr", quietly = TRUE)) {
 | |
|   install.packages("httr")
 | |
| }
 | |
| 
 | |
| 
 | |
| BLAST <- function(Q,
 | |
|                   db = "refseq_protein",
 | |
|                   nHits = 30,
 | |
|                   E = 0.1,
 | |
|                   limits = "",
 | |
|                   rid = "",
 | |
|                   query = "",
 | |
|                   quietly = FALSE,
 | |
|                   myTimeout = 120) {
 | |
|     # Purpose:
 | |
|     #     Basic BLAST search
 | |
|     #
 | |
|     # Parameters:
 | |
|     #     Q: query - either a valid ID or a sequence
 | |
|     #     db: "refseq_protein" by default,
 | |
|     #         other legal values include: "nr", "pdb", "swissprot" ...
 | |
|     #     nHits: number of hits to maximally return
 | |
|     #     E: E-value cutoff. Do not return hits whose score would be expected
 | |
|     #        to occur E or more times in a database of random sequence.
 | |
|     #     limits: a valid ENTREZ filter
 | |
|     #     rid: a request ID - to retrieve earlier search results
 | |
|     #     query: the actual query string (needed when retrieving results
 | |
|     #            with an rid)
 | |
|     #     quietly: controls printing of wait-time progress bar
 | |
|     #     timeout: how much longer _after_ rtoe to wait for a result
 | |
|     #              before giving up (seconds)
 | |
|     # Value:
 | |
|     #     result: list of process status or resulting hits, and some metadata
 | |
| 
 | |
| 
 | |
|     EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done
 | |
| 
 | |
|     results <- list()
 | |
|     results$query = query
 | |
|     results$rid <- rid
 | |
|     results$rtoe <- 0
 | |
| 
 | |
|     if (rid == "") {  # If no rid is available, spawn a search.
 | |
|                       # Else, proceed directly to retrieval.
 | |
| 
 | |
|       # prepare query, GET(), and parse rid and rtoe from BLAST server response
 | |
|       results$query <- paste0("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
 | |
|                               "?",
 | |
|                               "CMD=Put",
 | |
|                               "&PROGRAM=", "blastp",
 | |
|                               "&QUERY=", URLencode(Q),
 | |
|                               "&DATABASE=", db,
 | |
|                               "&MATRIX=", "BLOSUM62",
 | |
|                               "&EXPECT=", as.character(E),
 | |
|                               "&HITLIST_SIZE=", as.character(nHits),
 | |
|                               "&ALIGNMENTS=", as.character(nHits),
 | |
|                               "&FORMAT_TYPE=Text")
 | |
| 
 | |
|       if (limits != "") {
 | |
|         results$query <- paste0(
 | |
|           results$query,
 | |
|           "&ENTREZ_QUERY=", limits)
 | |
|       }
 | |
| 
 | |
|       # send it off ...
 | |
|       response <- httr::GET(results$query)
 | |
|       if (httr::http_status(response)$category != "Success" ) {
 | |
|         stop(sprintf("PANIC: Can't send query. BLAST server status error: %s",
 | |
|                      httr::http_status(response)$message))
 | |
|       }
 | |
| 
 | |
|       txt <- httr::content(response, "text", encoding = "UTF-8")
 | |
| 
 | |
|       patt <- "RID = (\\w+)" # match the request id
 | |
|       results$rid  <- regmatches(txt, regexec(patt,  txt))[[1]][2]
 | |
| 
 | |
|       patt <- "RTOE = (\\d+)" # match the expected completion time
 | |
|       results$rtoe <- as.numeric(regmatches(txt, regexec(patt, txt))[[1]][2])
 | |
| 
 | |
|       # Now we wait ...
 | |
|       if (quietly) {
 | |
|         Sys.sleep(results$rtoe)
 | |
|       } else {
 | |
|         cat(sprintf("BLAST is processing %s:\n", results$rid))
 | |
|         waitTimer(results$rtoe)
 | |
|       }
 | |
| 
 | |
|     } # done sending query and retrieving rid, rtoe
 | |
| 
 | |
|     # Enter an infinite loop to check for result availability
 | |
|     checkStatus <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
 | |
|                          "?",
 | |
|                          "CMD=Get",
 | |
|                          "&RID=", results$rid,
 | |
|                          "&FORMAT_TYPE=Text",
 | |
|                          "&FORMAT_OBJECT=SearchInfo",
 | |
|                          sep = "")
 | |
| 
 | |
|     while (TRUE) {
 | |
|       # Check whether the result is ready
 | |
|       response <- httr::GET(checkStatus)
 | |
|       if (httr::http_status(response)$category != "Success" ) {
 | |
|         stop(sprintf("PANIC: Can't check status. BLAST server status error: %s",
 | |
|                      httr::http_status(response)$message))
 | |
|       }
 | |
| 
 | |
|       txt <- httr::content(response, "text", encoding = "UTF-8")
 | |
| 
 | |
|       if (length(grep("Status=WAITING",  txt)) > 0) {
 | |
|         myTimeout <- myTimeout - EXTRAWAIT
 | |
| 
 | |
|         if (myTimeout <= 0) { # abort
 | |
|           cat("BLAST search not concluded before timeout. Aborting.\n")
 | |
|           cat(sprintf("%s  BLASThits <- BLAST(rid=\"%s\")\n",
 | |
|                       "Trying checking back later with >",
 | |
|                       results$rid))
 | |
|           return(results)
 | |
|         }
 | |
| 
 | |
|         if (quietly) {
 | |
|           Sys.sleep(EXTRAWAIT)
 | |
|         } else {
 | |
|           cat(sprintf("Status: Waiting. Wait %d more seconds (max. %d more)",
 | |
|                       EXTRAWAIT,
 | |
|                       myTimeout))
 | |
|           waitTimer(EXTRAWAIT)
 | |
|           next
 | |
|         }
 | |
| 
 | |
|       } else if (length(grep("Status=FAILED",  txt)) > 0) {
 | |
|           cat("BLAST search returned status \"FAILED\". Aborting.\n")
 | |
|           return(results)
 | |
| 
 | |
|       } else if (length(grep("Status=UNKNOWN",  txt)) > 0) {
 | |
|           cat("BLAST search returned status \"UNKNOWN\".\n")
 | |
|           cat("This probably means the rid has expired. Aborting.\n")
 | |
|           return(results)
 | |
| 
 | |
|       } else if (length(grep("Status=READY",  txt)) > 0) {  # Done
 | |
| 
 | |
|           if (length(grep("ThereAreHits=yes",  txt)) == 0) {  # No hits
 | |
|             cat("BLAST search ready but no hits found. Aborting.\n")
 | |
|             return(results)
 | |
| 
 | |
|           } else {
 | |
|             break  # done ... retrieve search result
 | |
|           }
 | |
|       }
 | |
|     } # end result-check loop
 | |
| 
 | |
|     # retrieve results from BLAST server
 | |
|     retrieve <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
 | |
|                       "?",
 | |
|                       "&CMD=Get",
 | |
|                       "&RID=", results$rid,
 | |
|                       "&FORMAT_TYPE=Text",
 | |
|                       sep = "")
 | |
| 
 | |
|     response <- httr::GET(retrieve)
 | |
|     if (httr::http_status(response)$category != "Success" ) {
 | |
|       stop(sprintf("PANIC: Can't retrieve. BLAST server status error: %s",
 | |
|                    httr::http_status(response)$message))
 | |
|     }
 | |
| 
 | |
|     txt <- httr::content(response, "text", encoding = "UTF-8")
 | |
| 
 | |
|     # txt contains the whole set of results. Process:
 | |
| 
 | |
|     # First, we strsplit() on linebreaks:
 | |
|     txt <- unlist(strsplit(txt, "\n"))
 | |
| 
 | |
|     # The alignments range from the first line that begins with ">" ...
 | |
|     iFirst <- grep("^>", txt)[1]
 | |
| 
 | |
|     # ... to the last line that begins with "Sbjct"
 | |
|     x <- grep("^Sbjct", txt)
 | |
|     iLast <- x[length(x)]
 | |
| 
 | |
|     # Get the alignments block
 | |
|     txt <- txt[iFirst:iLast]
 | |
| 
 | |
|     # Drop empty lines
 | |
|     txt <- txt[!(nchar(txt) == 0)]
 | |
| 
 | |
|     # A line that ends "]" but does not begin ">" seems to be a split
 | |
|     # defline ... eg.
 | |
|     #  [1] ">XP_013349208.1 AUEXF2481DRAFT_695809 [Aureobasidium subglaciale "
 | |
|     #  [2] "EXF-2481]"
 | |
|     #  Merge these lines to the preceding lines and delete them.
 | |
|     #
 | |
|     x <- which(grepl("]$", txt) & !(grepl("^>", txt)))
 | |
|     if (length(x) > 0) {
 | |
|       txt[x-1] <- paste0(txt[x-1], txt[x])
 | |
|       txt <- txt[-x]
 | |
|     }
 | |
| 
 | |
|     # Special case: there may be multiple deflines when the BLAST hit is to
 | |
|     # redundant, identical sequences. Keep only the first instance.
 | |
|     iKeep <- ! grepl("^>", txt)
 | |
|     x <- rle(iKeep)
 | |
|     x$positions <- cumsum(x$lengths)
 | |
|     i <- which(x$lengths > 1 & x$values == FALSE)
 | |
|     if (length(i) > 0) {
 | |
|       firsts <- x$positions[i] - x$lengths[i] + 1
 | |
|       iKeep[firsts] <- TRUE
 | |
|       txt <- txt[iKeep]
 | |
|     }
 | |
| 
 | |
|     # After this preprocessing the following should be true:
 | |
|     # - Every alignment block begins with a defline in which the
 | |
|     #   first character is ">"
 | |
|     # - There is only one defline in each block.
 | |
|     # - Lines are not split.
 | |
| 
 | |
|     # Make a dataframe of first and last indices of alignment blocks
 | |
|     x <- grep("^>", txt)
 | |
|     blocks <- data.frame(iFirst = x,
 | |
|                          iLast  = c((x[-1] - 1), length(txt)))
 | |
| 
 | |
|     # Build the hits list by parsing the blocks
 | |
|     results$hits <- list()
 | |
| 
 | |
|     for (i in seq_len(nrow(blocks))) {
 | |
|       thisBlock <- txt[blocks$iFirst[i]:blocks$iLast[i]]
 | |
|       results$hits[[i]] <- parseBLASTalignment(thisBlock)
 | |
|     }
 | |
| 
 | |
|     return(results)
 | |
| }
 | |
| 
 | |
| parseBLASTalignment <- function(hit) {
 | |
|   # Parse data from a character vector containing a BLAST hit
 | |
|   # Parameters:
 | |
|   #    hit  char   one BLAST hit as char vector
 | |
|   # Value:
 | |
|   #          list   $def          chr   defline
 | |
|   #                 $accession    chr   accession number
 | |
|   #                 $organism     chr   complete organism definition
 | |
|   #                 $species      chr   binomial species
 | |
|   #                 $E            num   E value
 | |
|   #                 $lengthAli    num   length of the alignment
 | |
|   #                 $nIdentitites num   number of identities
 | |
|   #                 $nGaps        num   number of gaps
 | |
|   #                 $Qbounds      num   2-element vector of query start-end
 | |
|   #                 $Sbounds      num   2-element vector of subject start-end
 | |
|   #                 $Qseq         chr   query sequence
 | |
|   #                 $midSeq       chr   midline string
 | |
|   #                 $Sseq         chr   subject sequence
 | |
| 
 | |
|   getToken <- function(patt, v) {
 | |
|     # get the first token identified by pattern patt in character vector v
 | |
|     v <- v[grep(patt, v)]
 | |
|     if (length(v) > 1) { v <- v[1] }
 | |
|     if (length(v) == 0) { token <- NA
 | |
|     } else {
 | |
|       token <- regmatches(v, regexec(patt, v))[[1]][2] }
 | |
|     return(token)
 | |
|   }
 | |
| 
 | |
|   h <- list()
 | |
| 
 | |
|   # FASTA defline
 | |
|   h$def <- hit[1]
 | |
| 
 | |
|   # accesion number (ID), use the first if there are several, separated by "|"
 | |
|   patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
 | |
|   h$accession <-  regmatches(h$def, regexec(patt, h$def))[[1]][2]
 | |
| 
 | |
|   # organism
 | |
|   patt <- "\\[(.+)]"
 | |
|   h$organism <-  regmatches(h$def, regexec(patt, h$def))[[1]][2]
 | |
| 
 | |
|   # species
 | |
|   x <- unlist(strsplit(h$organism, "\\s+"))
 | |
|   if (length(x) >= 2) {
 | |
|     h$species <- paste(x[1], x[2])
 | |
|   } else if (length(x) == 1) {
 | |
|     h$species <- paste(x[1], "sp.")
 | |
|   } else {
 | |
|     h$species <- NA
 | |
|   }
 | |
| 
 | |
|   # E-value
 | |
|   h$E <- as.numeric(getToken("Expect\\s*=(.+?), Method", hit))
 | |
| 
 | |
|   # length of alignment
 | |
|   h$lengthAli <- as.numeric(getToken("^\\s*Length\\s*=(.+)$", hit))
 | |
| 
 | |
|   # number of identities
 | |
|   h$nIdentities <- as.numeric(getToken("^\\s*Identities\\s*=(.+?)/", hit))
 | |
| 
 | |
|   # number of gaps
 | |
|   h$nGaps <- as.numeric(getToken("\\s*Gaps\\s*=(.+?)/", hit))
 | |
| 
 | |
|   # split up alignment section
 | |
|   idx <- grep("^Query ", hit)
 | |
|   Que <- hit[idx]
 | |
|   Mid <- hit[idx + 1]
 | |
|   Sbj <- hit[idx + 2]
 | |
| 
 | |
|   # first and last positions
 | |
|   h$Qbounds <- c(start = 0, end = 0)
 | |
|   h$Qbounds[1] <- as.numeric(getToken("^Query\\s*(\\d+)", Que[1]))
 | |
|   h$Qbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Que[length(Que)]))
 | |
| 
 | |
|   h$Sbounds <- c(start = 0, end = 0)
 | |
|   h$Sbounds[1] <- as.numeric(getToken("^Sbjct\\s*(\\d+)", Sbj[1]))
 | |
|   h$Sbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Sbj[length(Sbj)]))
 | |
| 
 | |
|   # aligned sequences
 | |
|   for (i in seq_along(Que)) {
 | |
|     patt <- ("^\\s*Query\\s*\\d+\\s*([A-Za-z-]+)") # capture aligned string
 | |
|     m <- regexec(patt, Que[i])
 | |
|     iFirst <- m[[1]][2]
 | |
|     iLast <- iFirst + attr(m[[1]], which = "match.length")[2] - 1
 | |
|     Que[i] <- substring(Que[i], iFirst, iLast)
 | |
|     Mid[i] <- substring(Mid[i], iFirst, iLast)
 | |
|     Sbj[i] <- substring(Sbj[i], iFirst, iLast)
 | |
|   }
 | |
| 
 | |
|   h$Qseq   <- paste0(Que, collapse = "")
 | |
|   h$midSeq <- paste0(Mid, collapse = "")
 | |
|   h$Sseq   <- paste0(Sbj, collapse = "")
 | |
| 
 | |
|   return(h)
 | |
| }
 | |
| 
 | |
| 
 | |
| # ==== TESTS ===================================================================
 | |
| 
 | |
| if (FALSE) {
 | |
|   # define query:
 | |
|   q   <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain
 | |
|                "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
 | |
|                "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
 | |
|                sep="")
 | |
|   # or ...
 | |
|   q <- "NP_010227" # refseq ID
 | |
| 
 | |
|   test <- BLAST(q,
 | |
|                 nHits = 100,
 | |
|                 E = 0.001,
 | |
|                 rid = "",
 | |
|                 limits = "txid4751[ORGN]")  # Fungi
 | |
|   str(test)
 | |
|   length(test$hits)
 | |
| }
 | |
| 
 | |
| # [END]
 | |
| 
 |