Updates for BIN-ALI-BLAST

This commit is contained in:
hyginn
2017-10-23 12:37:09 -04:00
parent 59ab6c573f
commit bc4afc97aa
5 changed files with 130 additions and 286 deletions

283
scripts/ABC-makeMYSPElist.R Normal file
View File

@@ -0,0 +1,283 @@
# ABC_makeMYSPElist.R
#
# Purpose: Create a list of genome sequenced fungi with protein annotations and
# Mbp1 homologues.
#
# Version: 1.1.2
#
# Date: 2016 09 - 2017 09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# V 1.1.2 Moved BLAST.R to ./scripts directory
# V 1.1 Update 2017
# V 1.0 First code 2016
#
# TODO:
#
# type out workflow
#
# ==============================================================================
#
# DO NOT source() THIS FILE!
#
# This file is code I provide for your deeper understanding of a process and
# to provide you with useful sample code. It is not actually necessary for
# you to run this code, but I encourage you to read it carefully and discuss
# if there are parts you don't understand.
#
# Run the commands that interact with the NCBI servers only if you want to
# experiment specifically with the code and/or parameters. I have commented out
# those parts. If you only want to study the general workflow, just load()
# the respective intermediate results.
#
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------------
#TOC> 1 The strategy 54
#TOC> 2 GOLD species 66
#TOC> 2.1 Initialize 71
#TOC> 2.2 Import 77
#TOC> 2.3 Unique species 129
#TOC> 3 BLAST species 171
#TOC> 3.1 find homologous proteins 178
#TOC> 3.2 Identify species in "hits" 202
#TOC> 4 Intersect GOLD and BLAST species 247
#TOC> 5 Cleanup and finish 265
#TOC>
#TOC> ==========================================================================
#TOC>
#TOC>
# = 1 The strategy ========================================================
# This script will create a list of "MYSPE" species and save it in an R object
# MYSPEspecies that is stored in the data subdirectory of this project from where
# it can be loaded. The strategy is as follows: we download a list of all
# genome projects and then select species for which protein annotations are
# available - i.e. these are all genome-sequenced species that have been
# annotated. Then we search for fungal species that have homologues to MBP1.
# Then we intersect the two lists to give us genome-sequenced species that
# also have Mbp1 homologues ...
# = 2 GOLD species ========================================================
# Fetch and parse the Genomes OnLine Database of the Joint Genome Institute
# (https://gold.jgi.doe.gov/). Use the data that is hosted at the NCBI.
# == 2.1 Initialize ========================================================
if (!require(httr)) { # httr provides interfaces to Webservers on the Internet
install.packages("httr")
library(httr)
}
# == 2.2 Import ============================================================
# The URL of the genome data directory at the NCBI:
# is https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS
# Note the relative size of the prokaryotes and the eukaryotes data.
# What's in this directory?
URL <- "ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/README"
GOLDreadme <- readLines(URL) # read the file into a vector
cat(GOLDreadme, sep = "\n") # display the contents
# Retrieve the file "eukaryotes" via ftp from the NCBI ftp server and put it
# into a dataframe. This will take a few moments.
# URL <- "ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt"
# GOLDdata <- read.csv(URL,
# header = TRUE,
# sep = "\t",
# stringsAsFactors = FALSE)
# save(GOLDdata, file="data/GOLDdata.RData")
# or ...
load(file="data/GOLDdata.RData")
# What columns does the table have, how is it structured?
str(GOLDdata)
# What groups of organisms are in the table? How many of each?
table(GOLDdata$Group)
# What subgroups of fungi do we have?
table(GOLDdata$SubGroup[GOLDdata$Group == "Fungi"])
# How many of the fungi have protein annotations? The README file told us that
# the column "Proteins" contains "Number of Proteins annotated in the assembly".
# Looking at a few ...
head(GOLDdata$Proteins, 30)
# ... we see that the number varies, and some have a hyphen, i.e. no
# annotations. The hyphens make this a char type column (as per: all elements
# of a vector must have the same type). Therefore we can't read this as numbers
# and filter by some value > 0. But we can filter for all genomes that don't
# have the hyphen:
sum(GOLDdata$Proteins[GOLDdata$Group == "Fungi"] != "-")
# Subset the data, with fungi that have protein annotations
GOLDfungi <- GOLDdata[GOLDdata$Group == "Fungi" &
GOLDdata$Proteins != "-" , ]
# check what we have in the table
nrow(GOLDfungi)
head(GOLDfungi)
# == 2.3 Unique species ====================================================
# For our purpose of defining species, we will select only species, not strains
# from this list. To do this, we pick the first two words i.e. the systematic
# binomial name from the "X.Organism.Name" column, and then we remove redundant
# species. Here is a function:
#
getBinom <- function(s) {
# Fetch the first two words from a string.
# Parameters:
# s: char a string which is expected to contain a binomial species name
# as the first two words, possibly followed by other text.
# Value: char the first two words separated by a single blank
#
x <- unlist(strsplit(s, "\\s+")) # split s on one or more whitespace
return(paste(x[1:2], collapse=" ")) # return first two elements
}
# iterate through GOLDdata and extract species names
GOLDspecies <- character()
for (i in 1:nrow(GOLDfungi)) {
GOLDspecies[i] <- getBinom(GOLDfungi$X.Organism.Name[i])
}
head(GOLDspecies)
length(GOLDspecies)
# N.b. this would be more efficiently (but perhaps less explicitly) coded with
# one of the apply() functions, instead of a for-loop.
# GOLDspecies <- unlist(lapply(GOLDfungi$X.Organism.Name, getBinom))
# Species of great interest may appear more than once, one for each sequenced
# strain: e.g. brewer's yeast:
sum(GOLDspecies == "Saccharomyces cerevisiae")
# Therefore we use the function unique() to throw out duplicates. Simple:
GOLDspecies <- unique(GOLDspecies)
length(GOLDspecies)
# i.e. we got rid of about 40% of the species by removing duplicates.
# = 3 BLAST species =======================================================
#
# Next, we filter our list by species that have homologues to the yeast Mbp1
# gene. To do this we run a BLAST search to find all related proteins in any
# fungus. We list the species that appear in that list, and then we select those
# that appear in our GOLD table as well.
#
# == 3.1 find homologous proteins ==========================================
#
# Use BLAST to fetch proteins related to Mbp1 and identify the species that
# contain them.
# Scripting against NCBI APIs is not exactly enjoyable - there is usually a fair
# amount of error handling involved that is not supported by the API in a
# principled way but requires rather ad hoc solutions. The code I threw together
# to make a BLAST interface (demo-quality, not research-quality) is in the file
# ./scripts/BLAST.R Feel encouraged to study how this works. It's a pretty
# standard task of communicating with servers and parsing responses - everyday
# fare in thebioinformatics lab. Surprisingly, there seems to be no good BLAST
# parser in currently available packages.
# source("./scripts/BLAST.R") # load the function and its utilities
# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
# BLASThits <- BLAST("NP_010227", # Yeast Mbp1 RefSeq ID
# db = "refseq_protein", # database to search in
# nHits = 3000, # 720 hits in 2017
# E = 0.01, #
# limits = "txid4751[ORGN]") # = fungi
# save(BLASThits, file="data/BLASThits.RData")
load(file="data/BLASThits.RData")
# == 3.2 Identify species in "hits" ========================================
# This is a very big list that can't be usefully analyzed manually. Here
# we are only interested in the species names that it contains.
# How many hits in the list?
length(BLASThits$hits)
# Let's look at a hit somewhere down the list
str(BLASThits$hit[[277]])
# A fair amount of parsing has gone into the BLAST.R code to prepare the results
# in a useful way. The species information is in the $species element of every
# hit.
# Run a loop to extract all the species names into a vector. We subset ...
# Blasthits$hits ... the list of hits, from which we choose ...
# Blasthits$hits[[i]] ... the i-th hit, and get ...
# Blasthits$hits[[i]]$species ... the species element from that.
# Subsetting FTW.
BLASTspecies <- character()
for (i in seq_along(BLASThits$hits)) {
BLASTspecies[i] <-BLASThits$hits[[i]]$species
}
# You can confirm that BLASTspecies has the expected size.
length(BLASTspecies)
# Again, some species appear more than once, e.g. ...
sum(BLASTspecies == "Saccharomyces cerevisiae")
# ... corresponding to the five homologous gene sequences (paralogues) of yeast.
# Therefore we use unique() to throw out duplicates:
BLASTspecies <- unique(BLASTspecies)
length(BLASTspecies)
# i.e. we got rid of about two thirds of the hits.
# You should think about this: what is the biological interpretation of the
# finding that on average we have three sequences that are similar to Mbp1 in
# other species?
# = 4 Intersect GOLD and BLAST species ====================================
# Now we can compare the two lists for species that appear in both sources: the
# simplest way is to use the set operation functions union(), intersection()
# etc. See here:
?union
MYSPEspecies <- intersect(GOLDspecies, BLASTspecies)
# Again: interpret this:
# - what is the number of GOLDspecies?
# - what is the number of BLAST species?
# - how many species are present in both lists?
# - what does it mean if a species is in GOLD but not in the BLAST list?
# - what does it mean if a species has been found during BLAST, but it
# is not in GOLD?
# = 5 Cleanup and finish ==================================================
# One final thing: some of the species will be our so-called "reference" species
# which we use for model solutions and examples in the course. They are defined
# in the .utilities.R file of this project. We remove them from the list so that
# we don't inadvertently assign them.
#
REFspecies
MYSPEspecies <- sort(setdiff(MYSPEspecies, REFspecies))
# save(MYSPEspecies, file = "data/MYSPEspecies.RData")
# [END]

348
scripts/BLAST.R Normal file
View File

@@ -0,0 +1,348 @@
# BLAST.R
#
# Purpose: Send off one BLAST search and return parsed list of results
# This script uses the BLAST URL-API
# (Application Programming Interface) at the NCBI.
# Read about the constraints here:
# https://ncbi.github.io/blast-cloud/dev/api.html
#
#
# Version: 2.1
# Date: 2016 09 - 2017 10
# Author: Boris Steipe
#
# Versions:
# 2.1 bugfix in BLAST(), bug was blanking non-split deflines;
# refactored parseBLASTalignment() to handle lists with multiple hits.
# 2.0 Completely rewritten because the interface completely changed.
# Code adpated in part from NCBI Perl sample code:
# $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
#
# 1.0 first version posted for BCH441 2016, based on BLAST - API
#
# ToDo:
#
# Notes: This is somewhat pedestrian, but apparently there are currently
# no R packages that contain such code.
#
# ==============================================================================
if (!require(httr, quietly = TRUE)) {
install.packages("httr")
library(httr)
}
BLAST <- function(q,
db = "refseq_protein",
nHits = 30,
E = 0.1,
limits = "",
rid = "",
quietly = FALSE,
myTimeout = 120) {
# Purpose:
# Basic BLAST search
# Version: 2.0
# Date: 2017-09
# Author: Boris Steipe
#
# Parameters:
# q: query - either a valid ID or a sequence
# db: "refseq_protein" by default,
# other legal valuses include: "nr", "pdb", "swissprot" ...
# nHits: number of hits to maximally return
# E: E-value cutoff. Do not return hits whose score would be expected
# to occur E or more times in a database of random sequence.
# limits: a valid ENTREZ filter
# rid: a request ID - to retrieve earleir search results
# quietly: controls printing of wait-time progress bar
# timeout: how much longer _after_ rtoe to wait for a result
# before giving up (seconds)
# Value:
# result: list of resulting hits and some metadata
EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done
results <- list()
results$rid <- rid
results$rtoe <- 0
if (rid == "") { # if rid is not the empty string we skip the
# initial search and and proceed directly to retrieval
# prepare query, GET(), and parse rid and rtoe from BLAST server response
results$query <- paste0("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
"?",
"CMD=Put",
"&PROGRAM=", "blastp",
"&QUERY=", URLencode(q),
"&DATABASE=", db,
"&MATRIX=", "BLOSUM62",
"&EXPECT=", as.character(E),
"&HITLIST_SIZE=", as.character(nHits),
"&ALIGNMENTS=", as.character(nHits),
"&FORMAT_TYPE=Text")
if (limits != "") {
results$query <- paste0(
results$query,
"&ENTREZ_QUERY=", limits)
}
# send it off ...
response <- GET(results$query)
if (http_status(response)$category != "Success" ) {
stop(sprintf("PANIC: Can't send query. BLAST server status error: %s",
http_status(response)$message))
}
txt <- content(response, "text", encoding = "UTF-8")
patt <- "RID = (\\w+)" # match the request id
results$rid <- regmatches(txt, regexec(patt, txt))[[1]][2]
patt <- "RTOE = (\\d+)" # match the expected completion time
results$rtoe <- as.numeric(regmatches(txt, regexec(patt, txt))[[1]][2])
# Now we wait ...
if (quietly) {
Sys.sleep(results$rtoe)
} else {
cat(sprintf("BLAST is processing %s:\n", results$rid))
waitTimer(results$rtoe)
}
} # done sending query and retrieving rid, rtoe
# Enter an infinite loop to check for result availability
checkStatus <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
"?",
"CMD=Get",
"&RID=", results$rid,
"&FORMAT_TYPE=Text",
"&FORMAT_OBJECT=SearchInfo",
sep = "")
while (TRUE) {
# Check whether the result is ready
response <- GET(checkStatus)
if (http_status(response)$category != "Success" ) {
stop(sprintf("PANIC: Can't check status. BLAST server status error: %s",
http_status(response)$message))
}
txt <- content(response, "text", encoding = "UTF-8")
if (length(grep("Status=WAITING", txt)) > 0) {
myTimeout <- myTimeout - EXTRAWAIT
if (myTimeout <= 0) { # abort
cat("BLAST search not concluded before timeout. Aborting.\n")
cat(sprintf("You could check back later with rid \"%s\"\n",
results$rid))
return(results)
}
if (quietly) {
Sys.sleep(EXTRAWAIT)
} else {
cat(sprintf("Status: Waiting. Wait %d more seconds (max. %d more)",
EXTRAWAIT,
myTimeout))
waitTimer(EXTRAWAIT)
next
}
} else if (length(grep("Status=FAILED", txt)) > 0) {
cat("BLAST search returned status \"FAILED\". Aborting.\n")
return(results)
} else if (length(grep("Status=UNKNOWN", txt)) > 0) {
cat("BLAST search returned status \"UNKNOWN\".\n")
cat("This probably means the rid has expired. Aborting.\n")
return(results)
} else if (length(grep("Status=READY", txt)) > 0) { # Done
if (length(grep("ThereAreHits=yes", txt)) == 0) { # No hits
cat("BLAST search ready but no hits found. Aborting.\n")
return(results)
} else {
break # done ... retrieve search result
}
}
} # end result-check loop
# retrieve results from BLAST server
retrieve <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
"?",
"&CMD=Get",
"&RID=", results$rid,
"&FORMAT_TYPE=Text",
sep = "")
response <- GET(retrieve)
if (http_status(response)$category != "Success" ) {
stop(sprintf("PANIC: Can't retrieve. BLAST server status error: %s",
http_status(response)$message))
}
txt <- content(response, "text", encoding = "UTF-8")
# txt contains the whole set of results. Process:
# First, we strsplit() on linebreaks:
txt <- unlist(strsplit(txt, "\n"))
# The alignments range from the first line that begins with ">" ...
iFirst <- grep("^>", txt)[1]
# ... to the last line that begins with "Sbjct"
x <- grep("^Sbjct", txt)
iLast <- x[length(x)]
# Get the alignments block
txt <- txt[iFirst:iLast]
# Drop empty lines
txt <- txt[!(nchar(txt) == 0)]
# A line that ends "]" but does not begin ">" seems to be a split
# defline ... eg.
# [1] ">XP_013349208.1 AUEXF2481DRAFT_695809 [Aureobasidium subglaciale "
# [2] "EXF-2481]"
# Merge these lines to the preceding lines and delete them.
#
x <- which(grepl("]$", txt) & !(grepl("^>", txt)))
if (length(x) > 0) {
txt[x-1] <- paste0(txt[x-1], txt[x])
txt <- txt[-x]
}
# Special case: there may be multiple deflines when the BLAST hit is to
# redundant, identical sequences. Keep only the first instance.
iKeep <- ! grepl("^>", txt)
x <- rle(iKeep)
x$positions <- cumsum(x$lengths)
i <- which(x$lengths > 1 & x$values == FALSE)
if (length(i) > 0) {
firsts <- x$positions[i] - x$lengths[i] + 1
iKeep[firsts] <- TRUE
txt <- txt[iKeep]
}
# After this preprocessing the following should be true:
# - Every alignment block begins with a defline in which the
# first character is ">"
# - There is only one defline in each block.
# - Lines are not split.
# Make a dataframe of first and last indices of alignment blocks
x <- grep("^>", txt)
blocks <- data.frame(iFirst = x,
iLast = c((x[-1] - 1), length(txt)))
# Build the hits list by parsing the blocks
results$hits <- list()
for (i in seq_len(nrow(blocks))) {
thisBlock <- txt[blocks$iFirst[i]:blocks$iLast[i]]
results$hits[[i]] <- parseBLASTalignment(thisBlock)
}
return(results)
}
parseBLASTalignment <- function(hits, idx) {
# Parse one BLAST hit from a BLAST result
# Parameters:
# hits list contains the BLAST hits
# idx int index of the requested hit
# Value:
# list $def chr defline
# $accession chr accession number
# $organism chr complete organism definition
# $species chr binomial species
# $E num E value
# $lengthAli num length of the alignment
# $nIdentitites num number of identities
# $nGaps num number of gaps
# $Qbounds num 2-element vector of query start-end
# $Sbounds num 2-element vector of subject start-end
# $Qseq chr query sequence
# $midSeq chr midline string
# $Sseq chr subject sequence
h <- list()
hit <- hits$hits[[idx]]
# FASTA defline
h$def <- hit$def
# accesion number (ID), use the first if there are several, separated by "|"
patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
h$accession <- regmatches(h$def, regexec(patt, h$def))[[1]][2]
# organism
patt <- "\\[(.+)]"
h$organism <- regmatches(h$def, regexec(patt, h$def))[[1]][2]
# species
x <- unlist(strsplit(h$organism, "\\s+"))
if (length(x) >= 2) {
h$species <- paste(x[1], x[2])
} else if (length(x) == 1) {
h$species <- paste(x[1], "sp.")
} else {
h$species <- NA
}
# E-value
h$E <- hit$E
# length of hit and # identities
h$lengthAli <- hit$lengthAli
h$nIdentities <- hit$nIdentities
# number of gaps
h$nGaps <- hit$nGaps
# first and last positions
h$Qbounds <- hit$Qbounds
h$Sbounds <- hit$Sbounds
# aligned sequences
h$Qseq <- hit$Qseq
h$midSeq <- hit$midSeq
h$Sseq <- hit$Sseq
return(h)
}
# ==== TESTS ===================================================================
# define query:
# q <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain sequence
# "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
# "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
# sep="")
# or ...
# q <- "NP_010227" # refseq ID
#
# test <- BLAST(q,
# nHits = 100,
# E = 0.001,
# rid = "",
# limits = "txid4751[ORGN]")
# length(test$hits)
# [END]