Fixed parsing and added missing logic

2017-11-16 11:47:53 -05:00
parent 206e2e14bb
commit a58e35f060
2 changed files with 61 additions and 36 deletions
--- a/BIN-ALI-BLAST.R
+++ b/BIN-ALI-BLAST.R
@@ -3,12 +3,13 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-ALI-BLAST unit.
 #
-# Version:  1.0
+# Version:  1.1
 #
 # Date:     2017  10  23
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
+#           1.1    Fixed parsing logic.
 #           1.0    First live version 2017.
 #           0.1    First code copied from 2016 material.
 #
@@ -26,14 +27,14 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                         Line
 #TOC> ---------------------------------------------
 #TOC>   1        Preparations                    41
 #TOC>   2        Defining the APSES domain       54
 #TOC>   3        Executing the BLAST search      76
 #TOC>   4        Analysing results               98
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


@@ -83,23 +84,20 @@ source("./scripts/BLAST.R")
 # Use BLAST() to find the best match to the MYSPE APSES domain in Saccharomyces
 # cerevisiae:

-BLASThits <- BLAST(apses,                       # MYSPE APSES domain sequence
-                  db = "refseq_protein",        # database to search in
-                  nHits = 10,                   #
-                  E = 0.01,                     #
-                  limits = "txid559292[ORGN]")  # S. cerevisiae S288c
+BLASTresults <- BLAST(apses,                       # MYSPE APSES domain sequence
+                     db = "refseq_protein",        # database to search in
+                     nHits = 10,                   #
+                     E = 0.01,                     #
+                     limits = "txid559292[ORGN]")  # S. cerevisiae S288c


-length(BLASThits)  # There should be at least one hit there. Ask for advice
-                   # in case this step fails.
+length(BLASTresults$hits)  # There should be at least one hit there. Ask for advice
+                           # in case this step fails.


 # =    4  Analysing results  ===================================================

-# The BLAST.R script has defined a convenience function to parse BLAST
-# alignments.
-
-(topHit <- parseBLASTalignment(BLASThits, idx = 1))   # Parse the top hit
+(topHit <- BLASTresults$hits[[1]])   # Get the top hit

 # What is the refseq ID of the top hit
 topHit$accession
--- a/scripts/BLAST.R
+++ b/scripts/BLAST.R
@@ -7,11 +7,12 @@
 #          https://ncbi.github.io/blast-cloud/dev/api.html
 #
 #
-# Version: 2.1
-# Date:    2016 09 - 2017 10
+# Version: 3
+# Date:    2016 09 - 2017 11
 # Author:  Boris Steipe
 #
 # Versions:
+#    3     parsing logic had not been fully implemented; Fixed.
 #    2.1   bugfix in BLAST(), bug was blanking non-split deflines;
 #          refactored parseBLASTalignment() to handle lists with multiple hits.
 #    2.0   Completely rewritten because the interface completely changed.
@@ -44,9 +45,6 @@ BLAST <- function(q,
                  myTimeout = 120) {
    # Purpose:
    #     Basic BLAST search
-    # Version: 2.0
-    # Date:    2017-09
-    # Author:  Boris Steipe
    #
    # Parameters:
    #     q: query - either a valid ID or a sequence
@@ -258,11 +256,10 @@ BLAST <- function(q,
    return(results)
 }

-parseBLASTalignment <- function(hits, idx) {
-  # Parse one BLAST hit from a BLAST result
+parseBLASTalignment <- function(hit) {
+  # Parse data from a character vector containing a BLAST hit
  # Parameters:
-  #    hits  list   contains the BLAST hits
-  #    idx   int    index of the requested hit
+  #    hit  char   one BLAST hit as char vector
  # Value:
  #          list   $def          chr   defline
  #                 $accession    chr   accession number
@@ -278,12 +275,20 @@ parseBLASTalignment <- function(hits, idx) {
  #                 $midSeq       chr   midline string
  #                 $Sseq         chr   subject sequence

+  getToken <- function(patt, v) {
+    # get the first token identified by pattern patt in character vector v
+    v <- v[grep(patt, v)]
+    if (length(v) > 1) { v <- v[1] }
+    if (length(v) == 0) { token <- NA
+    } else {
+      token <- regmatches(v, regexec(patt, v))[[1]][2] }
+    return(token)
+  }
+
  h <- list()

-  hit <- hits$hits[[idx]]
-
  # FASTA defline
-  h$def <- hit$def
+  h$def <- hit[1]

  # accesion number (ID), use the first if there are several, separated by "|"
  patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
@@ -304,24 +309,46 @@ parseBLASTalignment <- function(hits, idx) {
  }

  # E-value
-  h$E <-  hit$E
+  h$E <- as.numeric(getToken("Expect\\s*=(.+?), Method", hit))

-  # length of hit and # identities
-  h$lengthAli   <- hit$lengthAli
-  h$nIdentities <- hit$nIdentities
+  # length of alignment
+  h$lengthAli <- as.numeric(getToken("^\\s*Length\\s*=(.+)$", hit))
+
+  # number of identities
+  h$nIdentities <- as.numeric(getToken("^\\s*Identities\\s*=(.+?)/", hit))

  # number of gaps
-  h$nGaps <- hit$nGaps
+  h$nGaps <- as.numeric(getToken("\\s*Gaps\\s*=(.+?)/", hit))
+
+  # split up alignment section
+  idx <- grep("^Query ", hit)
+  Que <- hit[idx]
+  Mid <- hit[idx + 1]
+  Sbj <- hit[idx + 2]

  # first and last positions
-  h$Qbounds <- hit$Qbounds
-  h$Sbounds <- hit$Sbounds
+  h$Qbounds <- c(start = 0, end = 0)
+  h$Qbounds[1] <- as.numeric(getToken("^Query\\s*(\\d+)", Que[1]))
+  h$Qbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Que[length(Que)]))
+
+  h$Sbounds <- c(start = 0, end = 0)
+  h$Sbounds[1] <- as.numeric(getToken("^Sbjct\\s*(\\d+)", Sbj[1]))
+  h$Sbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Sbj[length(Sbj)]))

  # aligned sequences
+  for (i in seq_along(Que)) {
+    patt <- ("^\\s*Query\\s*\\d+\\s*([A-Za-z-]+)") # capture aligned string
+    m <- regexec(patt, Que[i])
+    iFirst <- m[[1]][2]
+    iLast <- iFirst + attr(m[[1]], which = "match.length")[2] - 1
+    Que[i] <- substring(Que[i], iFirst, iLast)
+    Mid[i] <- substring(Mid[i], iFirst, iLast)
+    Sbj[i] <- substring(Sbj[i], iFirst, iLast)
+  }

-  h$Qseq   <- hit$Qseq
-  h$midSeq <- hit$midSeq
-  h$Sseq   <- hit$Sseq
+  h$Qseq   <- paste0(Que, collapse = "")
+  h$midSeq <- paste0(Mid, collapse = "")
+  h$Sseq   <- paste0(Sbj, collapse = "")

  return(h)
 }