Updates for BIN-ALI-BLAST

2017-10-23 12:37:09 -04:00
parent 59ab6c573f
commit bc4afc97aa
5 changed files with 130 additions and 286 deletions
--- a/scripts/ABC-makeMYSPElist.R
+++ b/scripts/ABC-makeMYSPElist.R
@@ -0,0 +1,283 @@
+# ABC_makeMYSPElist.R
+#
+# Purpose:  Create a list of genome sequenced fungi with protein annotations and
+#               Mbp1 homologues.
+#
+# Version: 1.1.2
+#
+# Date:    2016 09 - 2017 09
+# Author:  Boris Steipe (boris.steipe@utoronto.ca)
+#
+# V 1.1.2  Moved BLAST.R to ./scripts directory
+# V 1.1    Update 2017
+# V 1.0    First code 2016
+#
+# TODO:
+#
+#   type out workflow
+#
+# ==============================================================================
+#
+# DO NOT  source()  THIS FILE!
+#
+# This file is code I provide for your deeper understanding of a process and
+# to provide you with useful sample code. It is not actually necessary for
+# you to run this code, but I encourage you to read it carefully and discuss
+# if there are parts you don't understand.
+#
+# Run the commands that interact with the NCBI servers only if you want to
+# experiment specifically with the code and/or parameters. I have commented out
+# those parts. If you only want to study the general workflow, just load()
+# the respective intermediate results.
+#
+
+#TOC> ==========================================================================
+#TOC>
+#TOC>   Section  Title                               Line
+#TOC> ---------------------------------------------------
+#TOC>   1        The strategy                          54
+#TOC>   2        GOLD species                          66
+#TOC>   2.1      Initialize                            71
+#TOC>   2.2      Import                                77
+#TOC>   2.3      Unique species                       129
+#TOC>   3        BLAST species                        171
+#TOC>   3.1      find homologous proteins             178
+#TOC>   3.2      Identify species in "hits"           202
+#TOC>   4        Intersect GOLD and BLAST species     247
+#TOC>   5        Cleanup and finish                   265
+#TOC>
+#TOC> ==========================================================================
+
+
+#TOC>
+#TOC>
+
+# =    1  The strategy  ========================================================
+
+# This script will create a list of "MYSPE" species and save it in an R object
+# MYSPEspecies that is stored in the data subdirectory of this project from where
+# it can be loaded. The strategy is as follows: we download a list of all
+# genome projects and then select species for which protein annotations are
+# available - i.e. these are all genome-sequenced species that have been
+# annotated. Then we search for fungal species that have homologues to MBP1.
+# Then we intersect the two lists to give us genome-sequenced species that
+# also have Mbp1 homologues ...
+
+
+# =    2  GOLD species  ========================================================
+
+#  Fetch and parse the Genomes OnLine Database of the Joint Genome Institute
+#  (https://gold.jgi.doe.gov/). Use the data that is hosted at the NCBI.
+
+# ==   2.1  Initialize  ========================================================
+if (!require(httr)) { # httr provides interfaces to Webservers on the Internet
+    install.packages("httr")
+    library(httr)
+}
+
+# ==   2.2  Import  ============================================================
+
+# The URL of the genome data directory at the NCBI:
+# is https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS
+# Note the relative size of the prokaryotes and the eukaryotes data.
+
+# What's in this directory?
+URL <- "ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/README"
+GOLDreadme <- readLines(URL) # read the file into a vector
+cat(GOLDreadme, sep = "\n")  # display the contents
+
+# Retrieve the file "eukaryotes" via ftp from the NCBI ftp server and put it
+# into a dataframe. This will take a few moments.
+# URL <- "ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt"
+# GOLDdata <- read.csv(URL,
+#                      header = TRUE,
+#                      sep = "\t",
+#                      stringsAsFactors = FALSE)
+# save(GOLDdata, file="data/GOLDdata.RData")
+# or ...
+load(file="data/GOLDdata.RData")
+
+
+# What columns does the table have, how is it structured?
+str(GOLDdata)
+
+# What groups of organisms are in the table? How many of each?
+table(GOLDdata$Group)
+
+# What subgroups of fungi do we have?
+table(GOLDdata$SubGroup[GOLDdata$Group == "Fungi"])
+
+# How many of the fungi have protein annotations? The README file told us that
+# the column "Proteins" contains "Number of Proteins annotated in the assembly".
+# Looking at a few ...
+head(GOLDdata$Proteins, 30)
+# ... we see that the number varies, and some have a hyphen, i.e. no
+# annotations. The hyphens make this a char type column (as per: all elements
+# of a vector must have the same type). Therefore we can't read this as numbers
+# and filter by some value > 0. But we can filter for all genomes that don't
+# have the hyphen:
+sum(GOLDdata$Proteins[GOLDdata$Group == "Fungi"] != "-")
+
+# Subset the data, with fungi that have protein annotations
+GOLDfungi <- GOLDdata[GOLDdata$Group == "Fungi" &
+                          GOLDdata$Proteins != "-" , ]
+
+# check what we have in the table
+nrow(GOLDfungi)
+head(GOLDfungi)
+
+
+# ==   2.3  Unique species  ====================================================
+
+# For our purpose of defining species, we will select only species, not strains
+# from this list. To do this, we pick the first two words i.e. the systematic
+# binomial name from the "X.Organism.Name" column, and then we remove redundant
+# species. Here is a function:
+#
+
+getBinom <- function(s) {
+    # Fetch the first two words from a string.
+    # Parameters:
+    #   s: char  a string which is expected to contain a binomial species name
+    #            as the first two words, possibly followed by other text.
+    # Value: char  the first two words separated by a single blank
+    #
+    x <- unlist(strsplit(s, "\\s+"))     # split s on one or more whitespace
+    return(paste(x[1:2], collapse=" "))  # return first two elements
+}
+
+# iterate through GOLDdata and extract species names
+GOLDspecies <- character()
+for (i in 1:nrow(GOLDfungi)) {
+    GOLDspecies[i] <- getBinom(GOLDfungi$X.Organism.Name[i])
+}
+head(GOLDspecies)
+length(GOLDspecies)
+
+# N.b. this would be more efficiently (but perhaps less explicitly) coded with
+# one of the apply() functions, instead of a for-loop.
+# GOLDspecies <- unlist(lapply(GOLDfungi$X.Organism.Name, getBinom))
+
+# Species of great interest may appear more than once, one for each sequenced
+# strain: e.g. brewer's yeast:
+sum(GOLDspecies == "Saccharomyces cerevisiae")
+
+# Therefore we use the function unique() to throw out duplicates. Simple:
+GOLDspecies <- unique(GOLDspecies)
+
+length(GOLDspecies)
+# i.e. we got rid of about 40% of the species by removing duplicates.
+
+
+# =    3  BLAST species  =======================================================
+#
+# Next, we filter our list by species that have homologues to the yeast Mbp1
+# gene. To do this we run a BLAST search to find all related proteins in any
+# fungus. We list the species that appear in that list, and then we select those
+# that appear in our GOLD table as well.
+#
+# ==   3.1  find homologous proteins  ==========================================
+#
+# Use BLAST to fetch proteins related to Mbp1 and identify the species that
+# contain them.
+
+# Scripting against NCBI APIs is not exactly enjoyable - there is usually a fair
+# amount of error handling involved that is not supported by the API in a
+# principled way but requires rather ad hoc solutions. The code I threw together
+# to make a BLAST interface (demo-quality, not research-quality) is in the file
+# ./scripts/BLAST.R Feel encouraged to study how this works. It's a pretty
+# standard task of communicating with servers and parsing responses - everyday
+# fare in thebioinformatics lab. Surprisingly, there seems to be no good BLAST
+# parser in currently available packages.
+
+# source("./scripts/BLAST.R")   # load the function and its utilities
+# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
+# BLASThits <- BLAST("NP_010227",                  # Yeast Mbp1 RefSeq ID
+#                    db = "refseq_protein",        # database to search in
+#                    nHits = 3000,                 # 720 hits in 2017
+#                    E = 0.01,                     #
+#                    limits = "txid4751[ORGN]")    # = fungi
+# save(BLASThits, file="data/BLASThits.RData")
+load(file="data/BLASThits.RData")
+
+# ==   3.2  Identify species in "hits"  ========================================
+
+# This is a very big list that can't be usefully analyzed manually. Here
+# we are only interested in the species names that it contains.
+
+# How many hits in the list?
+length(BLASThits$hits)
+
+# Let's look at a hit somewhere down the list
+str(BLASThits$hit[[277]])
+
+# A fair amount of parsing has gone into the BLAST.R code to prepare the results
+# in a useful way. The species information is in the $species element of every
+# hit.
+
+# Run a loop to extract all the species names into a vector. We subset ...
+# Blasthits$hits                 ... the list of hits, from which we choose ...
+# Blasthits$hits[[i]]            ... the i-th hit, and get ...
+# Blasthits$hits[[i]]$species    ... the species element from that.
+# Subsetting FTW.
+
+BLASTspecies <- character()
+for (i in seq_along(BLASThits$hits)) {
+    BLASTspecies[i] <-BLASThits$hits[[i]]$species
+}
+
+# You can confirm that BLASTspecies has the expected size.
+length(BLASTspecies)
+
+# Again, some species appear more than once, e.g. ...
+sum(BLASTspecies == "Saccharomyces cerevisiae")
+
+# ... corresponding to the five homologous gene sequences (paralogues) of yeast.
+
+# Therefore we use unique() to throw out duplicates:
+BLASTspecies <- unique(BLASTspecies)
+
+length(BLASTspecies)
+# i.e. we got rid of about two thirds of the hits.
+
+# You should think about this: what is the biological interpretation of the
+# finding that on average we have three sequences that are similar to Mbp1 in
+# other species?
+
+
+# =    4  Intersect GOLD and BLAST species  ====================================
+
+# Now we can compare the two lists for species that appear in both sources: the
+# simplest way is to use the set operation functions union(), intersection()
+# etc. See here:
+?union
+
+MYSPEspecies <- intersect(GOLDspecies, BLASTspecies)
+
+# Again: interpret this:
+#  - what is the number of GOLDspecies?
+#  - what is the number of BLAST species?
+#  - how many species are present in both lists?
+#  - what does it mean if a species is in GOLD but not in the BLAST list?
+#  - what does it mean if a species has been found during BLAST, but it
+#    is not in GOLD?
+
+
+# =    5  Cleanup and finish  ==================================================
+
+# One final thing: some of the species will be our so-called "reference" species
+# which we use for model solutions and examples in the course. They are defined
+# in the .utilities.R file of this project. We remove them from the list so that
+# we don't inadvertently assign them.
+#
+
+REFspecies
+
+MYSPEspecies <- sort(setdiff(MYSPEspecies, REFspecies))
+
+# save(MYSPEspecies, file = "data/MYSPEspecies.RData")
+
+
+
+
+# [END]
--- a/scripts/BLAST.R
+++ b/scripts/BLAST.R
@@ -0,0 +1,348 @@
+# BLAST.R
+#
+# Purpose: Send off one BLAST search and return parsed list of results
+#          This script uses the BLAST URL-API
+#          (Application Programming Interface) at the NCBI.
+#          Read about the constraints here:
+#          https://ncbi.github.io/blast-cloud/dev/api.html
+#
+#
+# Version: 2.1
+# Date:    2016 09 - 2017 10
+# Author:  Boris Steipe
+#
+# Versions:
+#    2.1   bugfix in BLAST(), bug was blanking non-split deflines;
+#          refactored parseBLASTalignment() to handle lists with multiple hits.
+#    2.0   Completely rewritten because the interface completely changed.
+#          Code adpated in part from NCBI Perl sample code:
+#          $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
+#
+#    1.0   first version posted for BCH441 2016, based on BLAST - API
+#
+# ToDo:
+#
+# Notes:   This is somewhat pedestrian, but apparently there are currently
+#          no R packages that contain such code.
+#
+# ==============================================================================
+
+
+if (!require(httr, quietly = TRUE)) {
+  install.packages("httr")
+  library(httr)
+}
+
+
+BLAST <- function(q,
+                  db = "refseq_protein",
+                  nHits = 30,
+                  E = 0.1,
+                  limits = "",
+                  rid = "",
+                  quietly = FALSE,
+                  myTimeout = 120) {
+    # Purpose:
+    #     Basic BLAST search
+    # Version: 2.0
+    # Date:    2017-09
+    # Author:  Boris Steipe
+    #
+    # Parameters:
+    #     q: query - either a valid ID or a sequence
+    #     db: "refseq_protein" by default,
+    #         other legal valuses include: "nr", "pdb", "swissprot" ...
+    #     nHits: number of hits to maximally return
+    #     E: E-value cutoff. Do not return hits whose score would be expected
+    #        to occur E or more times in a database of random sequence.
+    #     limits: a valid ENTREZ filter
+    #     rid: a request ID - to retrieve earleir search results
+    #     quietly: controls printing of wait-time progress bar
+    #     timeout: how much longer _after_ rtoe to wait for a result
+    #              before giving up (seconds)
+    # Value:
+    #     result: list of resulting hits and some metadata
+
+
+    EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done
+
+    results <- list()
+    results$rid <- rid
+    results$rtoe <- 0
+
+    if (rid == "") {  # if rid is not the empty string we skip the
+                      # initial search and and proceed directly to retrieval
+
+
+      # prepare query, GET(), and parse rid and rtoe from BLAST server response
+      results$query <- paste0("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
+                              "?",
+                              "CMD=Put",
+                              "&PROGRAM=", "blastp",
+                              "&QUERY=", URLencode(q),
+                              "&DATABASE=", db,
+                              "&MATRIX=", "BLOSUM62",
+                              "&EXPECT=", as.character(E),
+                              "&HITLIST_SIZE=", as.character(nHits),
+                              "&ALIGNMENTS=", as.character(nHits),
+                              "&FORMAT_TYPE=Text")
+
+      if (limits != "") {
+        results$query <- paste0(
+          results$query,
+          "&ENTREZ_QUERY=", limits)
+      }
+
+      # send it off ...
+      response <- GET(results$query)
+      if (http_status(response)$category != "Success" ) {
+        stop(sprintf("PANIC: Can't send query. BLAST server status error: %s",
+                     http_status(response)$message))
+      }
+
+      txt <- content(response, "text", encoding = "UTF-8")
+
+      patt <- "RID = (\\w+)" # match the request id
+      results$rid  <- regmatches(txt, regexec(patt,  txt))[[1]][2]
+
+      patt <- "RTOE = (\\d+)" # match the expected completion time
+      results$rtoe <- as.numeric(regmatches(txt, regexec(patt, txt))[[1]][2])
+
+      # Now we wait ...
+      if (quietly) {
+        Sys.sleep(results$rtoe)
+      } else {
+        cat(sprintf("BLAST is processing %s:\n", results$rid))
+        waitTimer(results$rtoe)
+      }
+
+    } # done sending query and retrieving rid, rtoe
+
+    # Enter an infinite loop to check for result availability
+    checkStatus <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
+                         "?",
+                         "CMD=Get",
+                         "&RID=", results$rid,
+                         "&FORMAT_TYPE=Text",
+                         "&FORMAT_OBJECT=SearchInfo",
+                         sep = "")
+
+    while (TRUE) {
+      # Check whether the result is ready
+      response <- GET(checkStatus)
+      if (http_status(response)$category != "Success" ) {
+        stop(sprintf("PANIC: Can't check status. BLAST server status error: %s",
+                     http_status(response)$message))
+      }
+
+      txt <- content(response, "text", encoding = "UTF-8")
+
+      if (length(grep("Status=WAITING",  txt)) > 0) {
+        myTimeout <- myTimeout - EXTRAWAIT
+
+        if (myTimeout <= 0) { # abort
+          cat("BLAST search not concluded before timeout. Aborting.\n")
+          cat(sprintf("You could check back later with rid \"%s\"\n",
+                      results$rid))
+          return(results)
+        }
+
+        if (quietly) {
+          Sys.sleep(EXTRAWAIT)
+        } else {
+          cat(sprintf("Status: Waiting. Wait %d more seconds (max. %d more)",
+                      EXTRAWAIT,
+                      myTimeout))
+          waitTimer(EXTRAWAIT)
+          next
+        }
+
+      } else if (length(grep("Status=FAILED",  txt)) > 0) {
+          cat("BLAST search returned status \"FAILED\". Aborting.\n")
+          return(results)
+
+      } else if (length(grep("Status=UNKNOWN",  txt)) > 0) {
+          cat("BLAST search returned status \"UNKNOWN\".\n")
+          cat("This probably means the rid has expired. Aborting.\n")
+          return(results)
+
+      } else if (length(grep("Status=READY",  txt)) > 0) {  # Done
+
+          if (length(grep("ThereAreHits=yes",  txt)) == 0) {  # No hits
+            cat("BLAST search ready but no hits found. Aborting.\n")
+            return(results)
+
+          } else {
+            break  # done ... retrieve search result
+          }
+      }
+    } # end result-check loop
+
+    # retrieve results from BLAST server
+    retrieve <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
+                      "?",
+                      "&CMD=Get",
+                      "&RID=", results$rid,
+                      "&FORMAT_TYPE=Text",
+                      sep = "")
+
+    response <- GET(retrieve)
+    if (http_status(response)$category != "Success" ) {
+      stop(sprintf("PANIC: Can't retrieve. BLAST server status error: %s",
+                   http_status(response)$message))
+    }
+
+    txt <- content(response, "text", encoding = "UTF-8")
+
+    # txt contains the whole set of results. Process:
+
+    # First, we strsplit() on linebreaks:
+    txt <- unlist(strsplit(txt, "\n"))
+
+    # The alignments range from the first line that begins with ">" ...
+    iFirst <- grep("^>", txt)[1]
+
+    # ... to the last line that begins with "Sbjct"
+    x <- grep("^Sbjct", txt)
+    iLast <- x[length(x)]
+
+    # Get the alignments block
+    txt <- txt[iFirst:iLast]
+
+    # Drop empty lines
+    txt <- txt[!(nchar(txt) == 0)]
+
+    # A line that ends "]" but does not begin ">" seems to be a split
+    # defline ... eg.
+    #  [1] ">XP_013349208.1 AUEXF2481DRAFT_695809 [Aureobasidium subglaciale "
+    #  [2] "EXF-2481]"
+    #  Merge these lines to the preceding lines and delete them.
+    #
+    x <- which(grepl("]$", txt) & !(grepl("^>", txt)))
+    if (length(x) > 0) {
+      txt[x-1] <- paste0(txt[x-1], txt[x])
+      txt <- txt[-x]
+    }
+
+    # Special case: there may be multiple deflines when the BLAST hit is to
+    # redundant, identical sequences. Keep only the first instance.
+    iKeep <- ! grepl("^>", txt)
+    x <- rle(iKeep)
+    x$positions <- cumsum(x$lengths)
+    i <- which(x$lengths > 1 & x$values == FALSE)
+    if (length(i) > 0) {
+      firsts <- x$positions[i] - x$lengths[i] + 1
+      iKeep[firsts] <- TRUE
+      txt <- txt[iKeep]
+    }
+
+    # After this preprocessing the following should be true:
+    # - Every alignment block begins with a defline in which the
+    #   first character is ">"
+    # - There is only one defline in each block.
+    # - Lines are not split.
+
+    # Make a dataframe of first and last indices of alignment blocks
+    x <- grep("^>", txt)
+    blocks <- data.frame(iFirst = x,
+                         iLast  = c((x[-1] - 1), length(txt)))
+
+    # Build the hits list by parsing the blocks
+    results$hits <- list()
+
+    for (i in seq_len(nrow(blocks))) {
+      thisBlock <- txt[blocks$iFirst[i]:blocks$iLast[i]]
+      results$hits[[i]] <- parseBLASTalignment(thisBlock)
+    }
+
+    return(results)
+}
+
+parseBLASTalignment <- function(hits, idx) {
+  # Parse one BLAST hit from a BLAST result
+  # Parameters:
+  #    hits  list   contains the BLAST hits
+  #    idx   int    index of the requested hit
+  # Value:
+  #          list   $def          chr   defline
+  #                 $accession    chr   accession number
+  #                 $organism     chr   complete organism definition
+  #                 $species      chr   binomial species
+  #                 $E            num   E value
+  #                 $lengthAli    num   length of the alignment
+  #                 $nIdentitites num   number of identities
+  #                 $nGaps        num   number of gaps
+  #                 $Qbounds      num   2-element vector of query start-end
+  #                 $Sbounds      num   2-element vector of subject start-end
+  #                 $Qseq         chr   query sequence
+  #                 $midSeq       chr   midline string
+  #                 $Sseq         chr   subject sequence
+
+  h <- list()
+
+  hit <- hits$hits[[idx]]
+
+  # FASTA defline
+  h$def <- hit$def
+
+  # accesion number (ID), use the first if there are several, separated by "|"
+  patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
+  h$accession <-  regmatches(h$def, regexec(patt, h$def))[[1]][2]
+
+  # organism
+  patt <- "\\[(.+)]"
+  h$organism <-  regmatches(h$def, regexec(patt, h$def))[[1]][2]
+
+  # species
+  x <- unlist(strsplit(h$organism, "\\s+"))
+  if (length(x) >= 2) {
+    h$species <- paste(x[1], x[2])
+  } else if (length(x) == 1) {
+    h$species <- paste(x[1], "sp.")
+  } else {
+    h$species <- NA
+  }
+
+  # E-value
+  h$E <-  hit$E
+
+  # length of hit and # identities
+  h$lengthAli   <- hit$lengthAli
+  h$nIdentities <- hit$nIdentities
+
+  # number of gaps
+  h$nGaps <- hit$nGaps
+
+  # first and last positions
+  h$Qbounds <- hit$Qbounds
+  h$Sbounds <- hit$Sbounds
+
+  # aligned sequences
+
+  h$Qseq   <- hit$Qseq
+  h$midSeq <- hit$midSeq
+  h$Sseq   <- hit$Sseq
+
+  return(h)
+}
+
+
+# ==== TESTS ===================================================================
+
+# define query:
+# q   <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain sequence
+#              "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
+#              "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
+#              sep="")
+# or ...
+# q <- "NP_010227" # refseq ID
+#
+# test <- BLAST(q,
+#               nHits = 100,
+#               E = 0.001,
+#               rid = "",
+#               limits = "txid4751[ORGN]")
+# length(test$hits)
+
+# [END]
+