Updates for BIN-ALI-BLAST

2017-10-23 12:37:09 -04:00
parent 59ab6c573f
commit bc4afc97aa
5 changed files with 130 additions and 286 deletions
--- a/BIN-ALI-BLAST.R
+++ b/BIN-ALI-BLAST.R
@@ -3,253 +3,108 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-ALI-BLAST unit.
 #
-# Version:  0.1
+# Version:  1.0
 #
-# Date:     2017  08  28
+# Date:     2017  10  23
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
 #           1.0    First live version 2017.
 #           0.1    First code copied from 2016 material.
-
+#
 #
 # TODO:
 #
 #
 # == DO NOT SIMPLY  source()  THIS FILE! =======================================
-
+#
 # If there are portions you don't understand, use R's help system, Google for an
 # answer, or ask your instructor. Don't continue if you don't understand what's
 # going on. That's not how it works ...
 # ==============================================================================
 # = 1 ___Section___
 # BLAST.R
 #
 # Purpose: Send off one BLAST search and return parsed list of results
 #          This script uses the BLAST URL-API
 #          (Application Programming Interface) at the NCBI.
 #          Read about the constraints here:
 # http://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYP=DeveloperInfo
 #
 #
 # Version: 1.0
 # Date:    2016-09
 # Author:  Boris Steipe
 #
 #
 # ToDo:
 # Notes:   The Bioconductor "annotate" package contains code for BLAST searches,
 #          in case you need to do something more involved.
 #
 # ==============================================================================
-
+#TOC> ==========================================================================
-# Dependencies:  myEmail must exist as a global variable with
+#TOC> 
-#                     your valid email adress
+#TOC>   Section  Title                         Line
-#                waitTimer() must be loaded (it should have been loaded from
+#TOC> ---------------------------------------------
-#                     .utilities.R, which was sourced via .Rprofile)
+#TOC>   1        Packages                        41
 #TOC>   2        Defining the APSES domain       50
 #TOC>   3        Executing the BLAST search      72
 #TOC>   4        Analysing results               94
 #TOC> 
 #TOC> ==========================================================================
 # library to interface with WebServers and process their XML/HTML
 # responses
 if (!require(xml2, quietly = TRUE)) {
  install.packages("xml2")
  library(xml2)
 }
-if (!require(httr, quietly = TRUE)) {
+
-  install.packages("httr")
+# =    1  Packages  ============================================================
-  library(httr)
+
 if (!require(Biostrings, quietly=TRUE)) {
  source("https://bioconductor.org/biocLite.R")
  biocLite("Biostrings")
  library(Biostrings)
 }
 # =    2  Defining the APSES domain  ===========================================
-parseBLAST_XML <- function(hit) {
+# Load your protein database
-  # parse one BLAST hit XML node with the xml2 package;
+source("makeProteinDB.R")
  # return a list
-  h <- list()
+# Get the APSES domain sequence for MBP1_MYSPE feature annotation. (You have
-  h$id <-  xml_text(xml_find_first(hit, ".//Hit_accession"))
+# entered this data in the BIN-ALI-Optimal_sequence_alignment unit.)
  h$def <- xml_text(xml_find_first(hit, ".//Hit_def"))
  h$bestE <- Inf
  h$sumLen <- 0
  h$sumId <- 0
  h$sumGap <- 0
  hsps <- xml_find_all(hit, ".//Hsp")
  h$Hsp <- list()
  h$nHsps <- length(hsps)
  if (h$nHsps > 0) {
    for (i in 1:length(hsps)) {
      h$Hsp[[i]] <- list()
      h$Hsp[[i]]$e <-          xml_numeric(hsps[i], ".//Hsp_evalue")
      h$Hsp[[i]]$q_from <-     xml_numeric(hsps[i], ".//Hsp_query-from")
      h$Hsp[[i]]$q_to <-       xml_numeric(hsps[i], ".//Hsp_query-to")
      h$Hsp[[i]]$h_from <-     xml_numeric(hsps[i], ".//Hsp_hit-from")
      h$Hsp[[i]]$h_to <-       xml_numeric(hsps[i], ".//Hsp_hit-to")
      h$Hsp[[i]]$h_identity <- xml_numeric(hsps[i], ".//Hsp_identity")
      h$Hsp[[i]]$h_gaps <-     xml_numeric(hsps[i], ".//Hsp_gaps")
      h$Hsp[[i]]$h_len <-      xml_numeric(hsps[i], ".//Hsp_align-len")
      h$Hsp[[i]]$qseq <- xml_text(xml_find_first(hsps[i], ".//Hsp_qseq"))
      h$Hsp[[i]]$mid <-  xml_text(xml_find_first(hsps[i], ".//Hsp_midline"))
      h$Hsp[[i]]$hseq <- xml_text(xml_find_first(hsps[i], ".//Hsp_hseq"))
-      h$bestE <- min(h$bestE, h$Hsp[[i]]$e)
+(proID <- myDB$protein$ID[myDB$protein$name == "MBP1_<MYSSPE>"]) # <<< EDIT
-      h$sumLen <- h$sumLen + h$Hsp[[i]]$h_len
+(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
-      h$sumId <- h$sumId + h$Hsp[[i]]$h_identity
+(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
-      h$sumGap <- h$sumGap + h$Hsp[[i]]$h_gaps
+                               myDB$annotation$featureID == ftrID])
-    }
+(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
-  }
+(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
-  return(h)
+(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
-}
+                 start,
                 end))
-xml_numeric <- function(n, p) {
+# The MYSPE "apses" sequence is the sequence that we will use for our reverse
-  # Utility: return first node matching xpath p in XML node n as numeric
+# BLAST search.
  return(as.numeric(xml_text(xml_find_first(n, p))))
 }
-BLAST <- function(q,
+# =    3  Executing the BLAST search  ==========================================
                  db = "refseq_protein",
                  nHits = 30,
                  E = 3,
                  limits = "\"\"",
                  email = myEMail,
                  rid = "",
                  quietly = FALSE) {
  # Purpose:
  #     Basic BLAST search
  # Version: 1.0
  # Date:    2016-09
  # Author:  Boris Steipe
  #
  # Parameters:
  #     q: query - either a valid ID or a sequence
  #     db: "refseq_protein" by default,
  #         other legal values include: "nr", "pdb", "swissprot" ...
  #     nHits: number of hits to maximally return
  #     E: E-value cutoff. Do not return hits whose score would be expected
  #        to occur E or more times in a database of random sequence.
  #     limits: a valid ENTREZ filter
  #     email: a valid email address, defaults to global value myEMail
  #     quietly: controls printing of wait-time progress bar
  # Value:
  #     result: list of resulting hits and some metadata
-  results <- list()
+# The ./scripts/BLAST.R code defines two functions to access the BLAST interface
-  results$rid <- rid
+# through its Web API, and to parse results. Have a look at the script, then
-  results$rtoe <- 0
+# source it:
-  if (rid == "") {  # prepare, send and analyse query
+source("./scripts/BLAST.R")
    results$query <- paste(
      "https://www.ncbi.nlm.nih.gov/blast/Blast.cgi",
      "?",
      "QUERY=", q,
      "&DATABASE=", db,
      "&HITLIST_SIZE=", as.character(nHits),
      "&EXPECT=", as.character(E),
      "&PROGRAM=", "blastp",
      "&ENTREZ_QUERY=", limits,
      "&NOHEADER=", "true",
      "&EMAIL=", email,
      "&CMD=Put",
      sep = "")
-    # send it off ...
+# Use BLAST() to find the best match to the MYSPE APSES domain in Saccharomyces
-    response <- read_xml(results$query, as_html = TRUE)
+# cerevisiae:
-    # find the comment node that contains the information we need
+BLASThits <- BLAST(apses,                       # MYSPE APSES domain sequence
-    # using an xpath expression
+                  db = "refseq_protein",        # database to search in
-    info <- xml_find_first(response,
+                  nHits = 10,                   #
-                           "//comment()[contains(., \"QBlastInfo\")]")
+                  E = 0.01,                     #
-
+                  limits = "txid559292[ORGN]")  # S. cerevisiae S288c
    info <- xml_text(info)  # extract its contents
    # parse
    results$rid  <- regmatches(info,
                               regexec("RID = (\\w+)",  info))[[1]][2]
    results$rtoe <- regmatches(info,
                               regexec("RTOE = (\\d+)", info))[[1]][2]
    results$rtoe <- as.numeric(results$rtoe)
  } # done analysing query
  # Now we wait ...
  if (quietly) {
    Sys.sleep(results$rtoe)
  } else {
    cat(sprintf("BLAST is processing %s:\n", results$rid))
    waitTimer(results$rtoe)
  }
  # retrieve results from BLAST server
  URL <- paste("https://www.ncbi.nlm.nih.gov/blast/Blast.cgi",
               "?",
               "RID=", results$rid,
               "&FORMAT_TYPE=", "XML",
               "&EMAIL=", email,
               "&CMD=Get",
               sep = "")
  raw <- GET(URL)
  timeOut <- 300
  nWait <- 0
  while (raw$headers["content-type"] == "text/html" && nWait <= (timeOut/10)) {
    cat("Doesn't seem to be done. Wait some more (or click STOP to abort)\n")
    waitTimer(10)
    nWait <- nWait + 1
    raw <- GET(URL)
  }
  # If we get to here, we received some result. But what?
  if (raw$headers["content-type"] == "text/html") { # Still HTML? Didn't complete ...
    stop(sprintf("Query >>%s<< didn't complete.", results$rid))
  } else if (raw$headers["content-type"] == "application/xml") { # Good!
    response <- read_xml(raw)
  } else { # Unknown, abort.
    stop(sprintf("Unknown response type: >>%s<<.", raw$headers["content-type"]))
  }
  hits <- xml_find_all(response, ".//Hit")
  if (length(hits) == 0) {
    s <- "No hit returned.\n"
    s <- c(s, sprintf("Check your query string:\n>>%s<<\n", results$query))
    s <- c(s, sprintf("and/or try again later by typing:\n", results$rid))
    s <- c(s, sprintf("   BLAST(rid = \"%s\")\n", results$rid))
    stop(paste(s, collapse = ""))
  }
  results$hits <- list()
  for (i in 1:length(hits)) {
    results$hits[[i]] <- parseBLAST_XML(hits[i])
  }
  return(results)
 }
-
+length(BLASThits)  # There should be at least one hit there. Ask for advice
-# = 1 Tasks
+                   # in case this step fails.
 # =    4  Analysing results  ===================================================
 # The BLAST.R script has defined a convenience function to parse BLAST
 # alignments.
 (topHit <- parseBLASTalignment(BLASThits, idx = 1))   # Parse the top hit
 # What is the refseq ID of the top hit?
 topHit$accession
 # If this is "NP_010227.1" you have confirmed the RBM of the MYSPE apses
 # domain. If it is not, ask me for advice.
 # ==== TESTS ===================================================================
 # q   <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI",   # Mbp1 APSES domain
 #              "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
 #              "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
 #              sep="")
 # q <- "NP_010227"
 # fungi <- "txid4751[ORGN]"
 #
 # test <- BLAST("NP_010227",
 #               nHits = 1000,
 #               E = 0.01,
 #               limits = fungi)
 # length(test$hits)
--- a/BIN-MYSPE.R
+++ b/BIN-MYSPE.R
@@ -3,12 +3,13 @@
 # Purpose: A Bioinformatics Course:
 #              R code accompanying the BIN-MYSPE unit
 #
-# Version: 1.0
+# Version: 1.0.1
 #
 # Date:    2017  09  21
 # Author:  Boris Steipe (boris.steipe@utoronto.ca)
 #
-# V 1.0    Final code, after rewriting BLAST parser and creating current MYSPElist
+# V 1.0.1  Move ABC-makeMYSPElist.R to ./scripts directory
 # V 1.0    Final code, after rewriting BLAST parser and updating MYSPElist
 # V 0.1    First code copied from BCH441_A03_makeMYSPElist.R
 #
 # TODO:
@@ -28,9 +29,9 @@
 #TOC>
 #TOC>   Section  Title                   Line
 #TOC> ---------------------------------------
-#TOC>   1        Preparations              38
+#TOC>   1        Preparations              39
-#TOC>   2        Suitable MYSPE Species    50
+#TOC>   2        Suitable MYSPE Species    51
-#TOC>   3        Adopt "MYSPE"             64
+#TOC>   3        Adopt "MYSPE"             65
 #TOC>
 #TOC> ==========================================================================
@@ -56,10 +57,10 @@ if (! exists("myStudentNumber")) {
 # A detailed description of the process of compiling the list of genome
 # sequenced fungi with protein annotations and Mbp1 homologues is in the file
-# ABC-makeMYSPElist.R
+# ./scripts/ABC-makeMYSPElist.R
-# Task: Study ABC-makeMYSPElist.R, it implements a rather typical workflow of
+# Task: Study ./scripts/ABC-makeMYSPElist.R, it implements a typical workflow
-# selecting and combining data from various public-domain data resources.
+#       of selecting and combining data from public-domain data resources.
 # =    3  Adopt "MYSPE"  =======================================================
--- a/data/refAnnotations.json
+++ b/data/refAnnotations.json
@@ -71,8 +71,8 @@
  {"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "500", "end" : "570"},
  {"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "651", "end" : "678"},
-  {"pName" : "MBP1_CRYNE", "fName" : "APSES fold", "start" : "113", "end" : "211"},
+  {"pName" : "MBP1_CRYNE", "fName" : "APSES fold", "start" : "16", "end" : "114"},
-  {"pName" : "MBP1_CRYNE", "fName" : "KilA-N", "start" : "131", "end" : "215"},
+  {"pName" : "MBP1_CRYNE", "fName" : "KilA-N", "start" : "34", "end" : "117"},
  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "66", "end" : "85"},
  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "413", "end" : "423"},
  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "633", "end" : "644"},
--- a/scripts/ABC-makeMYSPElist.R
+++ b/scripts/ABC-makeMYSPElist.R
@@ -3,11 +3,12 @@
 # Purpose:  Create a list of genome sequenced fungi with protein annotations and
 #               Mbp1 homologues.
 #
-# Version: 1.1.1
+# Version: 1.1.2
 #
 # Date:    2016 09 - 2017 09
 # Author:  Boris Steipe (boris.steipe@utoronto.ca)
 #
 # V 1.1.2  Moved BLAST.R to ./scripts directory
 # V 1.1    Update 2017
 # V 1.0    First code 2016
 #
@@ -184,12 +185,12 @@ length(GOLDspecies)
 # amount of error handling involved that is not supported by the API in a
 # principled way but requires rather ad hoc solutions. The code I threw together
 # to make a BLAST interface (demo-quality, not research-quality) is in the file
-# BLAST.R Feel encouraged to study how this works. It's a pretty standard task
+# ./scripts/BLAST.R Feel encouraged to study how this works. It's a pretty
-# of communicating with servers and parsing responses - everyday fare in the
+# standard task of communicating with servers and parsing responses - everyday
-# bioinformatics lab. Surprisingly, there seems to be no good BLAST parser
+# fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST
-# in currently available packages.
+# parser in currently available packages.
-# source("BLAST.R")   # load the function and its utilities
+# source("./scripts/BLAST.R")   # load the function and its utilities
 # Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
 # BLASThits <- BLAST("NP_010227",                  # Yeast Mbp1 RefSeq ID
 #                    db = "refseq_protein",        # database to search in
--- a/scripts/BLAST.R
+++ b/scripts/BLAST.R
@@ -7,11 +7,13 @@
 #          https://ncbi.github.io/blast-cloud/dev/api.html
 #
 #
-# Version: 2.0
+# Version: 2.1
-# Date:    2016 09 - 2017 09
+# Date:    2016 09 - 2017 10
 # Author:  Boris Steipe
 #
 # Versions:
 #    2.1   bugfix in BLAST(), bug was blanking non-split deflines;
 #          refactored parseBLASTalignment() to handle lists with multiple hits.
 #    2.0   Completely rewritten because the interface completely changed.
 #          Code adapted in part from NCBI Perl sample code:
 #          $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
@@ -68,8 +70,9 @@ BLAST <- function(q,
    results$rid <- rid
    results$rtoe <- 0
-    if (rid == "") {  # we skip, and proceed directly to retrieval
+    if (rid == "") {  # if rid is not the empty string we skip the
-                      # if rid is not the empty string
+                      # initial search and proceed directly to retrieval
      # prepare query, GET(), and parse rid and rtoe from BLAST server response
      results$query <- paste0("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
@@ -216,8 +219,10 @@ BLAST <- function(q,
    #  Merge these lines to the preceding lines and delete them.
    #
    x <- which(grepl("]$", txt) & !(grepl("^>", txt)))
    if (length(x) > 0) {
      txt[x-1] <- paste0(txt[x-1], txt[x])
      txt <- txt[-x]
    }
    # Special case: there may be multiple deflines when the BLAST hit is to
    # redundant, identical sequences. Keep only the first instance.
@@ -253,18 +258,32 @@ BLAST <- function(q,
    return(results)
 }
-parseBLASTalignment <- function(hit) {
+parseBLASTalignment <- function(hits, idx) {
-  # parse one BLAST hit;
+  # Parse one BLAST hit from a BLAST result
-  # return a list
+  # Parameters:
-
+  #    hits  list   contains the BLAST hits
-  if (length(grep("Length", hit)) > 1) {
+  #    idx   int    index of the requested hit
-    stop("Parsing function can't handle multiple HSPs (yet).")
+  # Value:
-  }
+  #          list   $def          chr   defline
  #                 $accession    chr   accession number
  #                 $organism     chr   complete organism definition
  #                 $species      chr   binomial species
  #                 $E            num   E value
  #                 $lengthAli    num   length of the alignment
  #                 $nIdentities  num   number of identities
  #                 $nGaps        num   number of gaps
  #                 $Qbounds      num   2-element vector of query start-end
  #                 $Sbounds      num   2-element vector of subject start-end
  #                 $Qseq         chr   query sequence
  #                 $midSeq       chr   midline string
  #                 $Sseq         chr   subject sequence
  h <- list()
  hit <- hits$hits[[idx]]
  # FASTA defline
-  h$def <- hit[1]
+  h$def <- hit$def
  # accession number (ID), use the first if there are several, separated by "|"
  patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
@@ -276,70 +295,38 @@ parseBLASTalignment <- function(hit) {
  # species
  x <- unlist(strsplit(h$organism, "\\s+"))
-  if (length(x) < 2) {
+  if (length(x) >= 2) {
-    h$species <- NA
+    h$species <- paste(x[1], x[2])
  } else if (length(x) == 1) {
    h$species <- paste(x[1], "sp.")
  } else {
-    h$species <- paste(x[1:2], collapse = " ")
+    h$species <- NA
  }
  # E-value
-  x <- hit[grep("Expect\\s*=", hit)]
+  h$E <-  hit$E
  patt <- "Expect\\s*=\\s*([0-9.eE\\-]+)" #
  h$E <-  as.numeric(regmatches(x, regexec(patt, x))[[1]][2])
  # length of hit and # identities
-  x <- hit[grep("Identities\\s*=", hit)]
+  h$lengthAli   <- hit$lengthAli
-  patt <- "Identities\\s*=\\s*([0-9]+)/([0-9]+)"
+  h$nIdentities <- hit$nIdentities
  m <- regexec(patt, x)
  h$lengthAli   <- as.numeric(regmatches(x, m)[[1]][2])
  h$nIdentities <- as.numeric(regmatches(x, m)[[1]][3])
  # number of gaps
-  x <- hit[grep("Gaps\\s*=", hit)]
+  h$nGaps <- hit$nGaps
  patt <- "Gaps\\s*=\\s*([0-9]+)"
  h$nGaps <- as.numeric(regmatches(x, regexec(patt, x))[[1]][2])
  # first and last positions
-  iAli <- grep("^Query\\s+", hit)
+  h$Qbounds <- hit$Qbounds
-  h$Qbounds <- getAliBounds(hit[iAli])
+  h$Sbounds <- hit$Sbounds
  h$Sbounds <- getAliBounds(hit[iAli + 2])
  # aligned sequences
-  h$Qseq   <- character()
+  h$Qseq   <- hit$Qseq
-  h$midSeq <- character()
+  h$midSeq <- hit$midSeq
-  h$Sseq   <- character()
+  h$Sseq   <- hit$Sseq
  for (i in iAli) {
    patt <- "^Query\\s+[0-9]+\\s*"
    first <- attr(regexec(patt, hit[i])[[1]], "match.length") + 1
    patt <- "\\s*[0-9]*\\s*$"
    last <- regexec(patt, hit[i])[[1]][1] - 1
    h$Qseq   <- paste0(h$Qseq,   substr(hit[i],     first, last))
    h$midSeq <- paste0(h$midSeq, substr(hit[i + 1], first, last))
    h$Sseq   <- paste0(h$Sseq,   substr(hit[i + 2], first, last))
  }
  return(h)
 }
 getAliBounds <- function(s) {
   # Extract the start and end positions of an alignment from BLAST
   # alignment text lines.
   # Parameters:
   #    s   chr   vector of alignment lines; the first element is expected
   #              to begin with "Query" or "Sbjct" followed by a number,
   #              the last element is expected to end with a number
   # Value:
   #        num   2-element vector: number following the label on the first
   #              line, and the trailing number on the last line
   headPatt <- "^(Query|Sbjct)\\s+([0-9]+)\\s"
   tailPatt <- "\\s*([0-9]+)\\s*$"
   # The label is capture group 1, so the start position is the third
   # element of the match vector (full match, label, number).
   headLine  <- s[1]
   headMatch <- regmatches(headLine, regexec(headPatt, headLine))[[1]]
   startPos  <- as.numeric(headMatch[3])
   # Only one capture group here: the number is the second element.
   tailLine  <- s[length(s)]
   tailMatch <- regmatches(tailLine, regexec(tailPatt, tailLine))[[1]]
   endPos    <- as.numeric(tailMatch[2])
   return(c(startPos, endPos))
 }
 # ==== TESTS ===================================================================
 # define query: