Updates for BIN-ALI-BLAST

2017-10-23 12:37:09 -04:00
parent 59ab6c573f
commit bc4afc97aa
5 changed files with 130 additions and 286 deletions
--- a/BIN-ALI-BLAST.R
+++ b/BIN-ALI-BLAST.R
@@ -3,253 +3,108 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-ALI-BLAST unit.
 #
-# Version:  0.1
+# Version:  1.0
 #
-# Date:     2017  08  28
+# Date:     2017  10  23
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
+#           1.0    First live version 2017.
 #           0.1    First code copied from 2016 material.
-
+#
 #
 # TODO:
 #
 #
 # == DO NOT SIMPLY  source()  THIS FILE! =======================================
-
+#
 # If there are portions you don't understand, use R's help system, Google for an
 # answer, or ask your instructor. Don't continue if you don't understand what's
 # going on. That's not how it works ...
-
-# ==============================================================================
-
-# = 1 ___Section___
-
-# BLAST.R
-#
-# Purpose: Send off one BLAST search and return parsed list of results
-#          This script uses the BLAST URL-API
-#          (Application Programming Interface) at the NCBI.
-#          Read about the constraints here:
-# http://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYP=DeveloperInfo
-#
-#
-# Version: 1.0
-# Date:    2016-09
-# Author:  Boris Steipe
-#
-#
-# ToDo:
-# Notes:   The bioconducter "annotate" package contains code for BLAST searches,
-#          in case you need to do something more involved.
 #
 # ==============================================================================
+ 
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                         Line
+#TOC> ---------------------------------------------
+#TOC>   1        Packages                        41
+#TOC>   2        Defining the APSES domain       50
+#TOC>   3        Executing the BLAST search      72
+#TOC>   4        Analysing results               94
+#TOC> 
+#TOC> ==========================================================================
+ 


-# Dependencies:  myEmail must exist as a global variable with
-#                     your valid email adress
-#                waitTimer() must be loaded (it should have been loaded from
-#                     .utilities.R, which was sourced via .Rprofile)

+# =    1  Packages  ============================================================

-# library to interface with WebServers and process their XML/HTML
-# responses
-if (!require(xml2, quietly = TRUE)) {
-  install.packages("xml2")
-  library(xml2)
-}
-
-if (!require(httr, quietly = TRUE)) {
-  install.packages("httr")
-  library(httr)
+if (!require(Biostrings, quietly=TRUE)) {
+  source("https://bioconductor.org/biocLite.R")
+  biocLite("Biostrings")
+  library(Biostrings)
 }


+# =    2  Defining the APSES domain  ===========================================

-parseBLAST_XML <- function(hit) {
-  # parse one BLAST hit XML node with the xml2 package;
-  # return a list
+# Load your protein database
+source("makeProteinDB.R")

-  h <- list()
-  h$id <-  xml_text(xml_find_first(hit, ".//Hit_accession"))
-  h$def <- xml_text(xml_find_first(hit, ".//Hit_def"))
-  h$bestE <- Inf
-  h$sumLen <- 0
-  h$sumId <- 0
-  h$sumGap <- 0
-  hsps <- xml_find_all(hit, ".//Hsp")
-  h$Hsp <- list()
-  h$nHsps <- length(hsps)
-  if (h$nHsps > 0) {
-    for (i in 1:length(hsps)) {
-      h$Hsp[[i]] <- list()
-      h$Hsp[[i]]$e <-          xml_numeric(hsps[i], ".//Hsp_evalue")
-      h$Hsp[[i]]$q_from <-     xml_numeric(hsps[i], ".//Hsp_query-from")
-      h$Hsp[[i]]$q_to <-       xml_numeric(hsps[i], ".//Hsp_query-to")
-      h$Hsp[[i]]$h_from <-     xml_numeric(hsps[i], ".//Hsp_hit-from")
-      h$Hsp[[i]]$h_to <-       xml_numeric(hsps[i], ".//Hsp_hit-to")
-      h$Hsp[[i]]$h_identity <- xml_numeric(hsps[i], ".//Hsp_identity")
-      h$Hsp[[i]]$h_gaps <-     xml_numeric(hsps[i], ".//Hsp_gaps")
-      h$Hsp[[i]]$h_len <-      xml_numeric(hsps[i], ".//Hsp_align-len")
-      h$Hsp[[i]]$qseq <- xml_text(xml_find_first(hsps[i], ".//Hsp_qseq"))
-      h$Hsp[[i]]$mid <-  xml_text(xml_find_first(hsps[i], ".//Hsp_midline"))
-      h$Hsp[[i]]$hseq <- xml_text(xml_find_first(hsps[i], ".//Hsp_hseq"))
+# Get the APSES domain sequence for MBP1_MYSPE feature annotation. (You have
+# entered this data in the BIN-ALI-Optimal_sequence_alignment unit.)

-      h$bestE <- min(h$bestE, h$Hsp[[i]]$e)
-      h$sumLen <- h$sumLen + h$Hsp[[i]]$h_len
-      h$sumId <- h$sumId + h$Hsp[[i]]$h_identity
-      h$sumGap <- h$sumGap + h$Hsp[[i]]$h_gaps
-    }
-  }
-  return(h)
-}
+(proID <- myDB$protein$ID[myDB$protein$name == "MBP1_<MYSSPE>"]) # <<< EDIT
+(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
+(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
+                               myDB$annotation$featureID == ftrID])
+(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
+(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
+(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
+                 start,
+                 end))

-xml_numeric <- function(n, p) {
-  # Utility: return first node matching xpath p in XML node n as numeric
-  return(as.numeric(xml_text(xml_find_first(n, p))))
-}
+# The MYSPE "apses" sequence is the sequence that we will use for our reverse
+# BLAST search.


-BLAST <- function(q,
-                  db = "refseq_protein",
-                  nHits = 30,
-                  E = 3,
-                  limits = "\"\"",
-                  email = myEMail,
-                  rid = "",
-                  quietly = FALSE) {
-  # Purpose:
-  #     Basic BLAST search
-  # Version: 1.0
-  # Date:    2016-09
-  # Author:  Boris Steipe
-  #
-  # Parameters:
-  #     q: query - either a valid ID or a sequence
-  #     db: "refseq_protein" by default,
-  #         other legal valuses include: "nr", "pdb", "swissprot" ...
-  #     nHits: number of hits to maximally return
-  #     E: E-value cutoff. Do not return hits whose score would be expected
-  #        to occur E or more times in a database of random sequence.
-  #     limits: a valid ENTREZ filter
-  #     email: a valid email address, defaults to global value myEMail
-  #     quietly: controls printing of wait-time progress bar
-  # Value:
-  #     result: list of resulting hits and some metadata
+# =    3  Executing the BLAST search  ==========================================

-  results <- list()
-  results$rid <- rid
-  results$rtoe <- 0
+# The ./scripts/BLAST.R code defines two functions to access the BLAST interface
+# through its Web API, and to parse results. Have a look at the script, then
+# source it:

-  if (rid == "") {  # prepare, send and analyse query
-    results$query <- paste(
-      "https://www.ncbi.nlm.nih.gov/blast/Blast.cgi",
-      "?",
-      "QUERY=", q,
-      "&DATABASE=", db,
-      "&HITLIST_SIZE=", as.character(nHits),
-      "&EXPECT=", as.character(E),
-      "&PROGRAM=", "blastp",
-      "&ENTREZ_QUERY=", limits,
-      "&NOHEADER=", "true",
-      "&EMAIL=", email,
-      "&CMD=Put",
-      sep = "")
+source("./scripts/BLAST.R")

-    # send it off ...
-    response <- read_xml(results$query, as_html = TRUE)
+# Use BLAST() to find the best match to the MYSPE APSES domain in Saccharomyces
+# cerevisiae:

-    # find the comment node that contains the information we need
-    # using an xpath expression
-    info <- xml_find_first(response,
-                           "//comment()[contains(., \"QBlastInfo\")]")
-
-    info <- xml_text(info)  # extract its contents
-
-    # parse
-    results$rid  <- regmatches(info,
-                               regexec("RID = (\\w+)",  info))[[1]][2]
-    results$rtoe <- regmatches(info,
-                               regexec("RTOE = (\\d+)", info))[[1]][2]
-    results$rtoe <- as.numeric(results$rtoe)
-  } # done analysing query
-
-  # Now we wait ...
-  if (quietly) {
-    Sys.sleep(results$rtoe)
-  } else {
-    cat(sprintf("BLAST is processing %s:\n", results$rid))
-    waitTimer(results$rtoe)
-  }
-
-  # retrieve results from BLAST server
-  URL <- paste("https://www.ncbi.nlm.nih.gov/blast/Blast.cgi",
-               "?",
-               "RID=", results$rid,
-               "&FORMAT_TYPE=", "XML",
-               "&EMAIL=", email,
-               "&CMD=Get",
-               sep = "")
-  raw <- GET(URL)
-
-  timeOut <- 300
-  nWait <- 0
-  while (raw$headers["content-type"] == "text/html" && nWait <= (timeOut/10)) {
-    cat("Doesn't seem to be done. Wait some more (or click STOP to abort)\n")
-    waitTimer(10)
-    nWait <- nWait + 1
-    raw <- GET(URL)
-  }
-
-  # If we get to here, we received some result. But what?
-  if (raw$headers["content-type"] == "text/html") { # Still HTML? Didn't complete ...
-    stop(sprintf("Query >>%s<< didn't complete.", results$rid))
-  } else if (raw$headers["content-type"] == "application/xml") { # Good!
-    response <- read_xml(raw)
-  } else { # Unknown, abort.
-    stop(sprintf("Unknown response type: >>%s<<.", raw$headers["content-type"]))
-  }
-
-  hits <- xml_find_all(response, ".//Hit")
-
-  if (length(hits) == 0) {
-    s <- "No hit returned.\n"
-    s <- c(s, sprintf("Check your query string:\n>>%s<<\n", results$query))
-    s <- c(s, sprintf("and/or try again later by typing:\n", results$rid))
-    s <- c(s, sprintf("   BLAST(rid = \"%s\")\n", results$rid))
-    stop(paste(s, collapse = ""))
-  }
-
-  results$hits <- list()
-
-  for (i in 1:length(hits)) {
-    results$hits[[i]] <- parseBLAST_XML(hits[i])
-  }
-
-  return(results)
-}
+BLASThits <- BLAST(apses,                       # MYSPE APSES domain sequence
+                  db = "refseq_protein",        # database to search in
+                  nHits = 10,                   #
+                  E = 0.01,                     #
+                  limits = "txid559292[ORGN]")  # S. cerevisiae S288c


-
-# = 1 Tasks
+length(BLASThits)  # There should be at least one hit there. Ask for advice
+                   # in case this step fails.


+# =    4  Analysing results  ===================================================
+
+# The BLAST.R script has defined a convenience function to parse BLAST
+# alignments.
+
+(topHit <- parseBLASTalignment(BLASThits, idx = 1))   # Parse the top hit
+
+# What is the refseq ID of the top hit?
+topHit$accession
+
+# If this is "NP_010227.1" you have confirmed the RBM of the MYSPE apses
+# domain. If it is not, ask me for advice.


-# ==== TESTS ===================================================================
-
-# q   <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI",   # Mbp1 APSES domain
-#              "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
-#              "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
-#              sep="")
-# q <- "NP_010227"
-# fungi <- "txid4751[ORGN]"
-#
-# test <- BLAST("NP_010227",
-#               nHits = 1000,
-#               E = 0.01,
-#               limits = fungi)
-# length(test$hits)



--- a/BIN-MYSPE.R
+++ b/BIN-MYSPE.R
@@ -3,12 +3,13 @@
 # Purpose: A Bioinformatics Course:
 #              R code accompanying the BIN-MYSPE unit
 #
-# Version: 1.0
+# Version: 1.0.1
 #
 # Date:    2017  09  21
 # Author:  Boris Steipe (boris.steipe@utoronto.ca)
 #
-# V 1.0    Final code, after rewriting BLAST parser and creating current MYSPElist
+# V 1.0.1  Move ABC-makeMYSPElist.R to ./scripts directory
+# V 1.0    Final code, after rewriting BLAST parser and updating MYSPElist
 # V 0.1    First code copied from BCH441_A03_makeMYSPElist.R
 #
 # TODO:
@@ -28,9 +29,9 @@
 #TOC>
 #TOC>   Section  Title                   Line
 #TOC> ---------------------------------------
-#TOC>   1        Preparations              38
-#TOC>   2        Suitable MYSPE Species    50
-#TOC>   3        Adopt "MYSPE"             64
+#TOC>   1        Preparations              39
+#TOC>   2        Suitable MYSPE Species    51
+#TOC>   3        Adopt "MYSPE"             65
 #TOC>
 #TOC> ==========================================================================

@@ -56,10 +57,10 @@ if (! exists("myStudentNumber")) {

 # A detailed description of the process of compiling the list of genome
 # sequenced fungi with protein annotations and Mbp1 homologues is in the file
-# ABC-makeMYSPElist.R
+# ./scripts/ABC-makeMYSPElist.R

-# Task: Study ABC-makeMYSPElist.R, it implements a rather typical workflow of
-# selecting and combining data from various public-domain data resources.
+# Task: Study ./scripts/ABC-makeMYSPElist.R, it implements a typical workflow
+#       of selecting and combining data from  public-domain data resources.

 # =    3  Adopt "MYSPE"  =======================================================

--- a/data/refAnnotations.json
+++ b/data/refAnnotations.json
@@ -71,8 +71,8 @@
  {"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "500", "end" : "570"},
  {"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "651", "end" : "678"},

-  {"pName" : "MBP1_CRYNE", "fName" : "APSES fold", "start" : "113", "end" : "211"},
-  {"pName" : "MBP1_CRYNE", "fName" : "KilA-N", "start" : "131", "end" : "215"},
+  {"pName" : "MBP1_CRYNE", "fName" : "APSES fold", "start" : "16", "end" : "114"},
+  {"pName" : "MBP1_CRYNE", "fName" : "KilA-N", "start" : "34", "end" : "117"},
  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "66", "end" : "85"},
  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "413", "end" : "423"},
  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "633", "end" : "644"},
--- a/scripts/ABC-makeMYSPElist.R
+++ b/scripts/ABC-makeMYSPElist.R
@@ -3,11 +3,12 @@
 # Purpose:  Create a list of genome sequenced fungi with protein annotations and
 #               Mbp1 homologues.
 #
-# Version: 1.1.1
+# Version: 1.1.2
 #
 # Date:    2016 09 - 2017 09
 # Author:  Boris Steipe (boris.steipe@utoronto.ca)
 #
+# V 1.1.2  Moved BLAST.R to ./scripts directory
 # V 1.1    Update 2017
 # V 1.0    First code 2016
 #
@@ -184,12 +185,12 @@ length(GOLDspecies)
 # amount of error handling involved that is not supported by the API in a
 # principled way but requires rather ad hoc solutions. The code I threw together
 # to make a BLAST interface (demo-quality, not research-quality) is in the file
-# BLAST.R Feel encouraged to study how this works. It's a pretty standard task
-# of communicating with servers and parsing responses - everyday fare in the
-# bioinformatics lab. Surprisingly, there seems to be no good BLAST parser
-# in currently available packages.
+# ./scripts/BLAST.R Feel encouraged to study how this works. It's a pretty
+# standard task of communicating with servers and parsing responses - everyday
+# fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST
+# parser in currently available packages.

-# source("BLAST.R")   # load the function and its utilities
+# source("./scripts/BLAST.R")   # load the function and its utilities
 # Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
 # BLASThits <- BLAST("NP_010227",                  # Yeast Mbp1 RefSeq ID
 #                    db = "refseq_protein",        # database to search in
--- a/scripts/BLAST.R
+++ b/scripts/BLAST.R
@@ -4,14 +4,16 @@
 #          This script uses the BLAST URL-API
 #          (Application Programming Interface) at the NCBI.
 #          Read about the constraints here:
-# https://ncbi.github.io/blast-cloud/dev/api.html
+#          https://ncbi.github.io/blast-cloud/dev/api.html
 #
 #
-# Version: 2.0
-# Date:    2016 09 - 2017 09
+# Version: 2.1
+# Date:    2016 09 - 2017 10
 # Author:  Boris Steipe
 #
 # Versions:
+#    2.1   bugfix in BLAST(), bug was blanking non-split deflines;
+#          refactored parseBLASTalignment() to handle lists with multiple hits.
 #    2.0   Completely rewritten because the interface completely changed.
 #          Code adpated in part from NCBI Perl sample code:
 #          $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
@@ -68,8 +70,9 @@ BLAST <- function(q,
    results$rid <- rid
    results$rtoe <- 0

-    if (rid == "") {  # we skip, and proceed directly to retrieval
-                      # if rid is not the empty string
+    if (rid == "") {  # if rid is not the empty string we skip the
+                      # initial search and proceed directly to retrieval
+

      # prepare query, GET(), and parse rid and rtoe from BLAST server response
      results$query <- paste0("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
@@ -216,8 +219,10 @@ BLAST <- function(q,
    #  Merge these lines to the preceding lines and delete them.
    #
    x <- which(grepl("]$", txt) & !(grepl("^>", txt)))
-    txt[x-1] <- paste0(txt[x-1], txt[x])
-    txt <- txt[-x]
+    if (length(x) > 0) {
+      txt[x-1] <- paste0(txt[x-1], txt[x])
+      txt <- txt[-x]
+    }

    # Special case: there may be multiple deflines when the BLAST hit is to
    # redundant, identical sequences. Keep only the first instance.
@@ -253,18 +258,32 @@ BLAST <- function(q,
    return(results)
 }

-parseBLASTalignment <- function(hit) {
-  # parse one BLAST hit;
-  # return a list
-
-  if (length(grep("Length", hit)) > 1) {
-    stop("Parsing function can't handle multiple HSPs (yet).")
-  }
+parseBLASTalignment <- function(hits, idx) {
+  # Parse one BLAST hit from a BLAST result
+  # Parameters:
+  #    hits  list   contains the BLAST hits
+  #    idx   int    index of the requested hit
+  # Value:
+  #          list   $def          chr   defline
+  #                 $accession    chr   accession number
+  #                 $organism     chr   complete organism definition
+  #                 $species      chr   binomial species
+  #                 $E            num   E value
+  #                 $lengthAli    num   length of the alignment
+  #                 $nIdentities  num   number of identities
+  #                 $nGaps        num   number of gaps
+  #                 $Qbounds      num   2-element vector of query start-end
+  #                 $Sbounds      num   2-element vector of subject start-end
+  #                 $Qseq         chr   query sequence
+  #                 $midSeq       chr   midline string
+  #                 $Sseq         chr   subject sequence

  h <- list()

+  hit <- hits$hits[[idx]]
+
  # FASTA defline
-  h$def <- hit[1]
+  h$def <- hit$def

  # accesion number (ID), use the first if there are several, separated by "|"
  patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
@@ -276,70 +295,38 @@ parseBLASTalignment <- function(hit) {

  # species
  x <- unlist(strsplit(h$organism, "\\s+"))
-  if (length(x) < 2) {
-    h$species <- NA
+  if (length(x) >= 2) {
+    h$species <- paste(x[1], x[2])
+  } else if (length(x) == 1) {
+    h$species <- paste(x[1], "sp.")
  } else {
-    h$species <- paste(x[1:2], collapse = " ")
+    h$species <- NA
  }

  # E-value
-  x <- hit[grep("Expect\\s*=", hit)]
-  patt <- "Expect\\s*=\\s*([0-9.eE\\-]+)" #
-  h$E <-  as.numeric(regmatches(x, regexec(patt, x))[[1]][2])
+  h$E <-  hit$E

  # length of hit and # identities
-  x <- hit[grep("Identities\\s*=", hit)]
-  patt <- "Identities\\s*=\\s*([0-9]+)/([0-9]+)"
-  m <- regexec(patt, x)
-  h$lengthAli   <- as.numeric(regmatches(x, m)[[1]][2])
-  h$nIdentities <- as.numeric(regmatches(x, m)[[1]][3])
+  h$lengthAli   <- hit$lengthAli
+  h$nIdentities <- hit$nIdentities

  # number of gaps
-  x <- hit[grep("Gaps\\s*=", hit)]
-  patt <- "Gaps\\s*=\\s*([0-9]+)"
-  h$nGaps <- as.numeric(regmatches(x, regexec(patt, x))[[1]][2])
+  h$nGaps <- hit$nGaps

  # first and last positions
-  iAli <- grep("^Query\\s+", hit)
-  h$Qbounds <- getAliBounds(hit[iAli])
-  h$Sbounds <- getAliBounds(hit[iAli + 2])
+  h$Qbounds <- hit$Qbounds
+  h$Sbounds <- hit$Sbounds

  # aligned sequences

-  h$Qseq   <- character()
-  h$midSeq <- character()
-  h$Sseq   <- character()
-
-  for (i in iAli) {
-    patt <- "^Query\\s+[0-9]+\\s*"
-    first <- attr(regexec(patt, hit[i])[[1]], "match.length") + 1
-
-    patt <- "\\s*[0-9]*\\s*$"
-    last <- regexec(patt, hit[i])[[1]][1] - 1
-
-    h$Qseq   <- paste0(h$Qseq,   substr(hit[i],     first, last))
-    h$midSeq <- paste0(h$midSeq, substr(hit[i + 1], first, last))
-    h$Sseq   <- paste0(h$Sseq,   substr(hit[i + 2], first, last))
-  }
+  h$Qseq   <- hit$Qseq
+  h$midSeq <- hit$midSeq
+  h$Sseq   <- hit$Sseq

  return(h)
 }


-getAliBounds <- function(s) {
-  # get first and last position from a vector of BLAST alignments s
-  # value: numeric vector of first and last position
-  patt <- "^(Query|Sbjct)\\s+([0-9]+)\\s"
-  first <- as.numeric(regmatches(s[1], regexec(patt, s[1]))[[1]][3])
-
-  patt <- "\\s*([0-9]+)\\s*$"
-  last <- as.numeric(regmatches(s[length(s)],
-                                regexec(patt, s[length(s)]))[[1]][2])
-  return(c (first, last))
-}
-
-
-
 # ==== TESTS ===================================================================

 # define query: