bch441-work-abc-units/scripts/BLAST.R

# BLAST.R
#
# Purpose: Send off one BLAST search and return parsed list of results
#          This script uses the BLAST URL-API
#          (Application Programming Interface) at the NCBI.
#          Read about the constraints here:
#          https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
#
#
# Version: 3.2
# Date:    2016 09 - 2020 09
# Author:  Boris Steipe
#
# Versions:
#    3.2   2020 updates
#    3.1   Change from require() to requireNamespace(),
#          use <package>::<function>() idiom throughout
#    3.0   parsing logic had not been fully implemented; Fixed.
#    2.1   bugfix in BLAST(), bug was blanking non-split deflines;
#          refactored parseBLASTalignment() to handle lists with multiple hits.
#    2.0   Completely rewritten because the interface completely changed.
#          Code adapted in part from NCBI Perl sample code:
#          $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
#    1.0   first version posted for BCH441 2016, based on BLAST - API
#
# ToDo:    Return the organism/strain name in the output, and propagate
#          into MYSPE selection script.
#
# Notes:   This is somewhat pedestrian, but apparently there are currently
#          no R packages that contain such code.
#
# ==============================================================================


if (! requireNamespace("httr", quietly = TRUE)) {
  install.packages("httr")
}


BLAST <- function(Q,
                  db = "refseq_protein",
                  nHits = 30,
                  E = 0.1,
                  limits = "",
                  rid = "",
                  query = "",
                  quietly = FALSE,
                  myTimeout = 120) {
    # Purpose:
    #     Basic BLAST (blastp) search via the NCBI BLAST URL-API: submit a
    #     search (or pick up an earlier one through its rid), poll the server
    #     until the search has finished, retrieve the text report and parse
    #     every alignment block with parseBLASTalignment().
    #
    # Parameters:
    #     Q: query - either a valid ID or a sequence. Only evaluated when
    #        rid is "" (i.e. when a new search is submitted).
    #     db: "refseq_protein" by default,
    #         other legal values include: "nr", "pdb", "swissprot" ...
    #     nHits: number of hits to maximally return
    #     E: E-value cutoff. Do not return hits whose score would be expected
    #        to occur E or more times in a database of random sequence.
    #     limits: a valid ENTREZ filter (URL-encoded before it is sent)
    #     rid: a request ID - to retrieve earlier search results
    #     query: the actual query string (needed when retrieving results
    #            with an rid); stored unchanged in the result in that case
    #     quietly: if TRUE, wait silently with Sys.sleep(); if FALSE, print
    #              progress messages and wait with waitTimer()
    #     myTimeout: budget (seconds) for waiting _after_ rtoe has elapsed.
    #                It is reduced by EXTRAWAIT for every additional wait
    #                cycle; the function gives up once it reaches zero.
    # Value:
    #     result: list with
    #               $query  the request URL that was sent (or the "query"
    #                       argument if an rid was supplied)
    #               $rid    request ID of the search
    #               $rtoe   estimated time to completion reported by the
    #                       server (0 if an rid was supplied)
    #               $hits   list of parsed hits; only present if a result
    #                       report was actually retrieved
    #     If the search fails, expires, times out or has no hits, a message
    #     is printed and the list is returned without $hits.
    #     HTTP errors and a submit response without a request ID stop().


    EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done
    BLASTURL  <- "https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi"

    fetchText <- function(url, action) {
      # GET url and return the response body as text; stop() on HTTP errors.
      response <- httr::GET(url)
      if (httr::http_status(response)$category != "Success" ) {
        stop(sprintf("PANIC: Can't %s. BLAST server status error: %s",
                     action,
                     httr::http_status(response)$message))
      }
      httr::content(response, "text", encoding = "UTF-8")
    }

    results <- list()
    results$query <- query
    results$rid <- rid
    results$rtoe <- 0

    if (rid == "") {  # If no rid is available, spawn a search.
                      # Else, proceed directly to retrieval.

      # prepare query, GET(), and parse rid and rtoe from BLAST server response
      results$query <- paste0(BLASTURL,
                              "?",
                              "CMD=Put",
                              "&PROGRAM=", "blastp",
                              "&QUERY=", utils::URLencode(Q),
                              "&DATABASE=", db,
                              "&MATRIX=", "BLOSUM62",
                              "&EXPECT=", as.character(E),
                              "&HITLIST_SIZE=", as.character(nHits),
                              "&ALIGNMENTS=", as.character(nHits),
                              "&FORMAT_TYPE=Text")

      if (limits != "") {
        # Entrez filters routinely contain "[", "]" and blanks: encode them.
        # URLencode() leaves strings that already look encoded untouched.
        results$query <- paste0(
          results$query,
          "&ENTREZ_QUERY=", utils::URLencode(limits, reserved = TRUE))
      }

      # send it off ...
      txt <- fetchText(results$query, "send query")

      patt <- "RID = (\\w+)" # match the request id
      results$rid  <- regmatches(txt, regexec(patt,  txt))[[1]][2]
      if (is.na(results$rid)) {
        # Without a request ID there is nothing we could poll for.
        stop("PANIC: BLAST server response did not contain a request ID (RID).")
      }

      patt <- "RTOE = (\\d+)" # match the expected completion time
      rtoe <- as.numeric(regmatches(txt, regexec(patt, txt))[[1]][2])
      if (! is.na(rtoe)) {    # keep the default of 0 if no RTOE was sent
        results$rtoe <- rtoe
      }

      # Now we wait ...
      if (quietly) {
        Sys.sleep(results$rtoe)
      } else {
        cat(sprintf("BLAST is processing %s:\n", results$rid))
        waitTimer(results$rtoe)
      }

    } # done sending query and retrieving rid, rtoe

    # Poll the server until the result is available (or we give up)
    checkStatus <- paste0(BLASTURL,
                          "?",
                          "CMD=Get",
                          "&RID=", results$rid,
                          "&FORMAT_TYPE=Text",
                          "&FORMAT_OBJECT=SearchInfo")

    while (TRUE) {
      # Check whether the result is ready
      txt <- fetchText(checkStatus, "check status")

      isWaiting <- grepl("Status=WAITING", txt, fixed = TRUE)

      if (! isWaiting) {
        if (grepl("Status=FAILED", txt, fixed = TRUE)) {
          cat("BLAST search returned status \"FAILED\". Aborting.\n")
          return(results)
        }

        if (grepl("Status=UNKNOWN", txt, fixed = TRUE)) {
          cat("BLAST search returned status \"UNKNOWN\".\n")
          cat("This probably means the rid has expired. Aborting.\n")
          return(results)
        }

        if (grepl("Status=READY", txt, fixed = TRUE)) {  # Done
          if (! grepl("ThereAreHits=yes", txt, fixed = TRUE)) {  # No hits
            cat("BLAST search ready but no hits found. Aborting.\n")
            return(results)
          }
          break  # done ... retrieve search result
        }
      }

      # Still here: the search is either WAITING, or the server sent a
      # response without a recognizable status. In both cases wait one more
      # cycle against the timeout budget - never re-query without a pause.
      myTimeout <- myTimeout - EXTRAWAIT

      if (myTimeout <= 0) { # abort
        cat("BLAST search not concluded before timeout. Aborting.\n")
        cat(sprintf("%s  BLASThits <- BLAST(rid=\"%s\")\n",
                    "Trying checking back later with >",
                    results$rid))
        return(results)
      }

      if (quietly) {
        Sys.sleep(EXTRAWAIT)
      } else {
        cat(sprintf("Status: %s. Wait %d more seconds (max. %d more)",
                    if (isWaiting) "Waiting" else "Not recognized",
                    EXTRAWAIT,
                    myTimeout))
        waitTimer(EXTRAWAIT)
      }
    } # end result-check loop

    # retrieve results from BLAST server
    retrieve <- paste0(BLASTURL,
                       "?",
                       "CMD=Get",
                       "&RID=", results$rid,
                       "&FORMAT_TYPE=Text")

    txt <- fetchText(retrieve, "retrieve")

    # txt contains the whole set of results. Process:

    # First, we strsplit() on linebreaks (tolerating CRLF):
    txt <- unlist(strsplit(txt, "\r?\n"))

    results$hits <- list()

    # The alignments range from the first line that begins with ">" ...
    iDef <- grep("^>", txt)
    # ... to the last line that begins with "Sbjct"
    iSbj <- grep("^Sbjct", txt)

    if (length(iDef) == 0 || length(iSbj) == 0) {
      # Nothing to parse: return an empty hits list rather than failing
      # on an NA index range.
      cat("BLAST result did not contain any alignments.\n")
      return(results)
    }

    # Get the alignments block
    txt <- txt[iDef[1]:iSbj[length(iSbj)]]

    # Drop empty lines
    txt <- txt[nchar(txt) > 0]

    # A line that ends "]" but does not begin ">" seems to be a split
    # defline ... eg.
    #  [1] ">XP_013349208.1 AUEXF2481DRAFT_695809 [Aureobasidium subglaciale "
    #  [2] "EXF-2481]"
    #  Merge these lines to the preceding lines and delete them.
    #
    x <- which(grepl("]$", txt) & !(grepl("^>", txt)))
    if (length(x) > 0) {
      txt[x-1] <- paste0(txt[x-1], txt[x])
      txt <- txt[-x]
    }

    # Special case: there may be multiple deflines when the BLAST hit is to
    # redundant, identical sequences. Keep only the first instance, i.e.
    # drop every defline that immediately follows another defline. Deflines
    # that stand alone must of course be kept.
    isDef <- grepl("^>", txt)
    followsDef <- c(FALSE, isDef[-length(isDef)])
    txt <- txt[!(isDef & followsDef)]

    # After this preprocessing the following should be true:
    # - Every alignment block begins with a defline in which the
    #   first character is ">"
    # - There is only one defline in each block.
    # - Lines are not split.

    # First and last indices of the alignment blocks
    iFirst <- grep("^>", txt)
    iLast  <- c((iFirst[-1] - 1), length(txt))

    # Build the hits list by parsing the blocks
    results$hits <- lapply(seq_along(iFirst),
                           function(i) {
                             parseBLASTalignment(txt[iFirst[i]:iLast[i]])
                           })

    return(results)
}

parseBLASTalignment <- function(hit) {
  # Parse data from a character vector containing one BLAST hit, i.e. one
  # alignment block of a BLAST text report: the defline comes first, empty
  # lines have been removed, and every "Query" line is directly followed by
  # its midline and its "Sbjct" line.
  #
  # If the block contains more than one HSP (more than one "Score =" line),
  # only the first HSP is parsed, so that the sequences and bounds belong to
  # the same alignment as the E-value, identities and gaps.
  #
  # Parameters:
  #    hit  char   one BLAST hit as char vector
  # Value:
  #          list   $def          chr   defline
  #                 $accession    chr   accession number
  #                 $organism     chr   complete organism definition
  #                                     (NA if the defline has no [...])
  #                 $species      chr   binomial species (NA if no organism)
  #                 $E            num   E value
  #                 $lengthAli    num   number parsed from the "Length=" line
  #                 $nIdentities  num   number of identities
  #                 $nGaps        num   number of gaps
  #                 $Qbounds      num   2-element vector of query start-end
  #                 $Sbounds      num   2-element vector of subject start-end
  #                 $Qseq         chr   query sequence
  #                 $midSeq       chr   midline string
  #                 $Sseq         chr   subject sequence

  getToken <- function(patt, v) {
    # get the first token (first capture group) identified by pattern patt
    # in character vector v; NA if no element of v matches
    v <- v[grep(patt, v)]
    if (length(v) == 0) {
      return(NA)
    }
    regmatches(v[1], regexec(patt, v[1]))[[1]][2]
  }

  # Restrict the block to its first HSP: everything before a second
  # "Score =" line.
  iScore <- grep("^\\s*Score\\s*=", hit)
  if (length(iScore) > 1) {
    hit <- hit[seq_len(iScore[2] - 1)]
  }

  h <- list()

  # FASTA defline
  h$def <- hit[1]

  # accession number (ID), use the first if there are several, separated by "|"
  patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
  h$accession <-  regmatches(h$def, regexec(patt, h$def))[[1]][2]

  # organism
  patt <- "\\[(.+)]"
  h$organism <-  regmatches(h$def, regexec(patt, h$def))[[1]][2]

  # species: first two words of the organism. Without an organism there is
  # no species either - don't build the string "NA sp." from a missing value.
  if (is.na(h$organism)) {
    h$species <- NA_character_
  } else {
    x <- unlist(strsplit(h$organism, "\\s+"))
    if (length(x) >= 2) {
      h$species <- paste(x[1], x[2])
    } else if (length(x) == 1) {
      h$species <- paste(x[1], "sp.")
    } else {
      h$species <- NA_character_
    }
  }

  # E-value
  h$E <- as.numeric(getToken("Expect\\s*=(.+?), Method", hit))

  # length, as given on the "Length=" line
  h$lengthAli <- as.numeric(getToken("^\\s*Length\\s*=(.+)$", hit))

  # number of identities
  h$nIdentities <- as.numeric(getToken("^\\s*Identities\\s*=(.+?)/", hit))

  # number of gaps
  h$nGaps <- as.numeric(getToken("\\s*Gaps\\s*=(.+?)/", hit))

  # split up alignment section: each "Query" line is followed by the midline
  # and the "Sbjct" line
  idx <- grep("^Query ", hit)
  Que <- hit[idx]
  Mid <- hit[idx + 1]
  Sbj <- hit[idx + 2]

  # first and last positions
  h$Qbounds <- c(start = 0, end = 0)
  h$Qbounds[1] <- as.numeric(getToken("^Query\\s*(\\d+)", Que[1]))
  h$Qbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Que[length(Que)]))

  h$Sbounds <- c(start = 0, end = 0)
  h$Sbounds[1] <- as.numeric(getToken("^Sbjct\\s*(\\d+)", Sbj[1]))
  h$Sbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Sbj[length(Sbj)]))

  # aligned sequences: the columns that the aligned string occupies in the
  # "Query" line are the same columns to cut from midline and "Sbjct" line
  patt <- "^\\s*Query\\s*\\d+\\s*([A-Za-z-]+)" # capture aligned string
  m <- regexec(patt, Que)
  for (i in seq_along(Que)) {
    iFirst <- m[[i]][2]
    iLast <- iFirst + attr(m[[i]], which = "match.length")[2] - 1
    Que[i] <- substring(Que[i], iFirst, iLast)
    Mid[i] <- substring(Mid[i], iFirst, iLast)
    Sbj[i] <- substring(Sbj[i], iFirst, iLast)
  }

  h$Qseq   <- paste0(Que, collapse = "")
  h$midSeq <- paste0(Mid, collapse = "")
  h$Sseq   <- paste0(Sbj, collapse = "")

  return(h)
}


# ==== TESTS ===================================================================

if (FALSE) {
  # This block is never executed when the file is sourced. Run the lines
  # by hand to try out BLAST().

  # Query option 1: a sequence (Mbp1 APSES domain) ...
  q <- paste0("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI",
              "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
              "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP")

  # Query option 2: ... or a RefSeq ID (this assignment replaces option 1)
  q <- "NP_010227"

  # Search, restricted to Fungi (txid4751); then inspect the result
  test <- BLAST(q,
                limits = "txid4751[ORGN]",
                rid = "",
                E = 0.001,
                nHits = 100)
  str(test)
  length(test$hits)
}

# [END]
Line termination change and old code. 2021-11-16 05:31:48 +00:00			`# BLAST.R`
			`#`
			`# Purpose: Send off one BLAST search and return parsed list of results`
			`# This script uses the BLAST URL-API`
			`# (Application Programming Interface) at the NCBI.`
			`# Read about the constraints here:`
			`# https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo`
			`#`
			`#`
			`# Version: 3.2`
			`# Date: 2016 09 - 2020 09`
			`# Author: Boris Steipe`
			`#`
			`# Versions:`
			`# 3.2 2020 updates`
			`# 3.1 Change from require() to requireNamespace(),`
			`# use <package>::<function>() idiom throughout`
			`# 3.0 parsing logic had not been fully implemented; Fixed.`
			`# 2.1 bugfix in BLAST(), bug was blanking non-split deflines;`
			`# refactored parseBLASTalignment() to handle lists with multiple hits.`
			`# 2.0 Completely rewritten because the interface completely changed.`
			`# Code adpated in part from NCBI Perl sample code:`
			`# $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $`
			`# 1.0 first version posted for BCH441 2016, based on BLAST - API`
			`#`
			`# ToDo: Return the organism/strain name in the output, and propagate`
			`# into MYSPE selection script.`
			`#`
			`# Notes: This is somewhat pedestrian, but apparently there are currently`
			`# no R packages that contain such code.`
			`#`
			`# ==============================================================================`


			`if (! requireNamespace("httr", quietly = TRUE)) {`
			`install.packages("httr")`
			`}`


			`BLAST <- function(Q,`
			`db = "refseq_protein",`
			`nHits = 30,`
			`E = 0.1,`
			`limits = "",`
			`rid = "",`
			`query = "",`
			`quietly = FALSE,`
			`myTimeout = 120) {`
			`# Purpose:`
			`# Basic BLAST search`
			`#`
			`# Parameters:`
			`# Q: query - either a valid ID or a sequence`
			`# db: "refseq_protein" by default,`
			`# other legal values include: "nr", "pdb", "swissprot" ...`
			`# nHits: number of hits to maximally return`
			`# E: E-value cutoff. Do not return hits whose score would be expected`
			`# to occur E or more times in a database of random sequence.`
			`# limits: a valid ENTREZ filter`
			`# rid: a request ID - to retrieve earlier search results`
			`# query: the actual query string (needed when retrieving results`
			`# with an rid)`
			`# quietly: controls printing of wait-time progress bar`
			`# timeout: how much longer _after_ rtoe to wait for a result`
			`# before giving up (seconds)`
			`# Value:`
			`# result: list of process status or resulting hits, and some metadata`


			`EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done`

			`results <- list()`
			`results$query = query`
			`results$rid <- rid`
			`results$rtoe <- 0`

			`if (rid == "") { # If no rid is available, spawn a search.`
			`# Else, proceed directly to retrieval.`

			`# prepare query, GET(), and parse rid and rtoe from BLAST server response`
			`results$query <- paste0("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",`
			`"?",`
			`"CMD=Put",`
			`"&PROGRAM=", "blastp",`
			`"&QUERY=", URLencode(Q),`
			`"&DATABASE=", db,`
			`"&MATRIX=", "BLOSUM62",`
			`"&EXPECT=", as.character(E),`
			`"&HITLIST_SIZE=", as.character(nHits),`
			`"&ALIGNMENTS=", as.character(nHits),`
			`"&FORMAT_TYPE=Text")`

			`if (limits != "") {`
			`results$query <- paste0(`
			`results$query,`
			`"&ENTREZ_QUERY=", limits)`
			`}`

			`# send it off ...`
			`response <- httr::GET(results$query)`
			`if (httr::http_status(response)$category != "Success" ) {`
			`stop(sprintf("PANIC: Can't send query. BLAST server status error: %s",`
			`httr::http_status(response)$message))`
			`}`

			`txt <- httr::content(response, "text", encoding = "UTF-8")`

			`patt <- "RID = (\\w+)" # match the request id`
			`results$rid <- regmatches(txt, regexec(patt, txt))[[1]][2]`

			`patt <- "RTOE = (\\d+)" # match the expected completion time`
			`results$rtoe <- as.numeric(regmatches(txt, regexec(patt, txt))[[1]][2])`

			`# Now we wait ...`
			`if (quietly) {`
			`Sys.sleep(results$rtoe)`
			`} else {`
			`cat(sprintf("BLAST is processing %s:\n", results$rid))`
			`waitTimer(results$rtoe)`
			`}`

			`} # done sending query and retrieving rid, rtoe`

			`# Enter an infinite loop to check for result availability`
			`checkStatus <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",`
			`"?",`
			`"CMD=Get",`
			`"&RID=", results$rid,`
			`"&FORMAT_TYPE=Text",`
			`"&FORMAT_OBJECT=SearchInfo",`
			`sep = "")`

			`while (TRUE) {`
			`# Check whether the result is ready`
			`response <- httr::GET(checkStatus)`
			`if (httr::http_status(response)$category != "Success" ) {`
			`stop(sprintf("PANIC: Can't check status. BLAST server status error: %s",`
			`httr::http_status(response)$message))`
			`}`

			`txt <- httr::content(response, "text", encoding = "UTF-8")`

			`if (length(grep("Status=WAITING", txt)) > 0) {`
			`myTimeout <- myTimeout - EXTRAWAIT`

			`if (myTimeout <= 0) { # abort`
			`cat("BLAST search not concluded before timeout. Aborting.\n")`
			`cat(sprintf("%s BLASThits <- BLAST(rid=\"%s\")\n",`
			`"Trying checking back later with >",`
			`results$rid))`
			`return(results)`
			`}`

			`if (quietly) {`
			`Sys.sleep(EXTRAWAIT)`
			`} else {`
			`cat(sprintf("Status: Waiting. Wait %d more seconds (max. %d more)",`
			`EXTRAWAIT,`
			`myTimeout))`
			`waitTimer(EXTRAWAIT)`
			`next`
			`}`

			`} else if (length(grep("Status=FAILED", txt)) > 0) {`
			`cat("BLAST search returned status \"FAILED\". Aborting.\n")`
			`return(results)`

			`} else if (length(grep("Status=UNKNOWN", txt)) > 0) {`
			`cat("BLAST search returned status \"UNKNOWN\".\n")`
			`cat("This probably means the rid has expired. Aborting.\n")`
			`return(results)`

			`} else if (length(grep("Status=READY", txt)) > 0) { # Done`

			`if (length(grep("ThereAreHits=yes", txt)) == 0) { # No hits`
			`cat("BLAST search ready but no hits found. Aborting.\n")`
			`return(results)`

			`} else {`
			`break # done ... retrieve search result`
			`}`
			`}`
			`} # end result-check loop`

			`# retrieve results from BLAST server`
			`retrieve <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",`
			`"?",`
			`"&CMD=Get",`
			`"&RID=", results$rid,`
			`"&FORMAT_TYPE=Text",`
			`sep = "")`

			`response <- httr::GET(retrieve)`
			`if (httr::http_status(response)$category != "Success" ) {`
			`stop(sprintf("PANIC: Can't retrieve. BLAST server status error: %s",`
			`httr::http_status(response)$message))`
			`}`

			`txt <- httr::content(response, "text", encoding = "UTF-8")`

			`# txt contains the whole set of results. Process:`

			`# First, we strsplit() on linebreaks:`
			`txt <- unlist(strsplit(txt, "\n"))`

			`# The alignments range from the first line that begins with ">" ...`
			`iFirst <- grep("^>", txt)[1]`

			`# ... to the last line that begins with "Sbjct"`
			`x <- grep("^Sbjct", txt)`
			`iLast <- x[length(x)]`

			`# Get the alignments block`
			`txt <- txt[iFirst:iLast]`

			`# Drop empty lines`
			`txt <- txt[!(nchar(txt) == 0)]`

			`# A line that ends "]" but does not begin ">" seems to be a split`
			`# defline ... eg.`
			`# [1] ">XP_013349208.1 AUEXF2481DRAFT_695809 [Aureobasidium subglaciale "`
			`# [2] "EXF-2481]"`
			`# Merge these lines to the preceding lines and delete them.`
			`#`
			`x <- which(grepl("]$", txt) & !(grepl("^>", txt)))`
			`if (length(x) > 0) {`
			`txt[x-1] <- paste0(txt[x-1], txt[x])`
			`txt <- txt[-x]`
			`}`

			`# Special case: there may be multiple deflines when the BLAST hit is to`
			`# redundant, identical sequences. Keep only the first instance.`
			`iKeep <- ! grepl("^>", txt)`
			`x <- rle(iKeep)`
			`x$positions <- cumsum(x$lengths)`
			`i <- which(x$lengths > 1 & x$values == FALSE)`
			`if (length(i) > 0) {`
			`firsts <- x$positions[i] - x$lengths[i] + 1`
			`iKeep[firsts] <- TRUE`
			`txt <- txt[iKeep]`
			`}`

			`# After this preprocessing the following should be true:`
			`# - Every alignment block begins with a defline in which the`
			`# first character is ">"`
			`# - There is only one defline in each block.`
			`# - Lines are not split.`

			`# Make a dataframe of first and last indices of alignment blocks`
			`x <- grep("^>", txt)`
			`blocks <- data.frame(iFirst = x,`
			`iLast = c((x[-1] - 1), length(txt)))`

			`# Build the hits list by parsing the blocks`
			`results$hits <- list()`

			`for (i in seq_len(nrow(blocks))) {`
			`thisBlock <- txt[blocks$iFirst[i]:blocks$iLast[i]]`
			`results$hits[[i]] <- parseBLASTalignment(thisBlock)`
			`}`

			`return(results)`
			`}`

			`parseBLASTalignment <- function(hit) {`
			`# Parse data from a character vector containing a BLAST hit`
			`# Parameters:`
			`# hit char one BLAST hit as char vector`
			`# Value:`
			`# list $def chr defline`
			`# $accession chr accession number`
			`# $organism chr complete organism definition`
			`# $species chr binomial species`
			`# $E num E value`
			`# $lengthAli num length of the alignment`
			`# $nIdentitites num number of identities`
			`# $nGaps num number of gaps`
			`# $Qbounds num 2-element vector of query start-end`
			`# $Sbounds num 2-element vector of subject start-end`
			`# $Qseq chr query sequence`
			`# $midSeq chr midline string`
			`# $Sseq chr subject sequence`

			`getToken <- function(patt, v) {`
			`# get the first token identified by pattern patt in character vector v`
			`v <- v[grep(patt, v)]`
			`if (length(v) > 1) { v <- v[1] }`
			`if (length(v) == 0) { token <- NA`
			`} else {`
			`token <- regmatches(v, regexec(patt, v))[[1]][2] }`
			`return(token)`
			`}`

			`h <- list()`

			`# FASTA defline`
			`h$def <- hit[1]`

			`# accesion number (ID), use the first if there are several, separated by "\|"`
			`patt <- "^>(.+?)(\\s\|\\\|)" # from ">" to space or "\|"`
			`h$accession <- regmatches(h$def, regexec(patt, h$def))[[1]][2]`

			`# organism`
			`patt <- "\\[(.+)]"`
			`h$organism <- regmatches(h$def, regexec(patt, h$def))[[1]][2]`

			`# species`
			`x <- unlist(strsplit(h$organism, "\\s+"))`
			`if (length(x) >= 2) {`
			`h$species <- paste(x[1], x[2])`
			`} else if (length(x) == 1) {`
			`h$species <- paste(x[1], "sp.")`
			`} else {`
			`h$species <- NA`
			`}`

			`# E-value`
			`h$E <- as.numeric(getToken("Expect\\s*=(.+?), Method", hit))`

			`# length of alignment`
			`h$lengthAli <- as.numeric(getToken("^\\sLength\\s=(.+)$", hit))`

			`# number of identities`
			`h$nIdentities <- as.numeric(getToken("^\\sIdentities\\s=(.+?)/", hit))`

			`# number of gaps`
			`h$nGaps <- as.numeric(getToken("\\sGaps\\s=(.+?)/", hit))`

			`# split up alignment section`
			`idx <- grep("^Query ", hit)`
			`Que <- hit[idx]`
			`Mid <- hit[idx + 1]`
			`Sbj <- hit[idx + 2]`

			`# first and last positions`
			`h$Qbounds <- c(start = 0, end = 0)`
			`h$Qbounds[1] <- as.numeric(getToken("^Query\\s*(\\d+)", Que[1]))`
			`h$Qbounds[2] <- as.numeric(getToken("\\s(\\d+)\\s$", Que[length(Que)]))`

			`h$Sbounds <- c(start = 0, end = 0)`
			`h$Sbounds[1] <- as.numeric(getToken("^Sbjct\\s*(\\d+)", Sbj[1]))`
			`h$Sbounds[2] <- as.numeric(getToken("\\s(\\d+)\\s$", Sbj[length(Sbj)]))`

			`# aligned sequences`
			`for (i in seq_along(Que)) {`
			`patt <- ("^\\sQuery\\s\\d+\\s*([A-Za-z-]+)") # capture aligned string`
			`m <- regexec(patt, Que[i])`
			`iFirst <- m[[1]][2]`
			`iLast <- iFirst + attr(m[[1]], which = "match.length")[2] - 1`
			`Que[i] <- substring(Que[i], iFirst, iLast)`
			`Mid[i] <- substring(Mid[i], iFirst, iLast)`
			`Sbj[i] <- substring(Sbj[i], iFirst, iLast)`
			`}`

			`h$Qseq <- paste0(Que, collapse = "")`
			`h$midSeq <- paste0(Mid, collapse = "")`
			`h$Sseq <- paste0(Sbj, collapse = "")`

			`return(h)`
			`}`


			`# ==== TESTS ===================================================================`

			`if (FALSE) {`
			`# define query:`
			`q <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain`
			`"LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",`
			`"GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",`
			`sep="")`
			`# or ...`
			`q <- "NP_010227" # refseq ID`

			`test <- BLAST(q,`
			`nHits = 100,`
			`E = 0.001,`
			`rid = "",`
			`limits = "txid4751[ORGN]") # Fungi`
			`str(test)`
			`length(test$hits)`
			`}`

			`# [END]`