New code for YFO list and BLAST

This commit is contained in:
hyginn 2017-09-21 17:49:55 -04:00
parent ad783c1254
commit ce3a17da6d
3 changed files with 561 additions and 121 deletions

View File

@ -5,61 +5,95 @@
# #
# Version: 1.1 # Version: 1.1
# #
# Date: 2016 09 - 2017 08 # Date: 2016 09 - 2017 09
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# V 1.1 Update 2017 # V 1.1 Update 2017
# V 1.0 First code 2016 # V 1.0 First code 2016
# #
# TODO: # TODO:
# actually rerun for 2017 #
# type out workflow # type out workflow
# #
# ============================================================================== # ==============================================================================
#
# DO NOT source() THIS FILE! # DO NOT source() THIS FILE!
#
# This file is code I provide for your deeper understanding of a process and # This file is code I provide for your deeper understanding of a process and
# to provide you with useful sample code. It is not actually necessary for # to provide you with useful sample code. It is not actually necessary for
# you to run this code, but I encourage you to read it carefully and discuss # you to run this code, but I encourage you to read it carefully and discuss
# if there are parts you don't understand. # if there are parts you don't understand.
#
# Run the commands that interact with the NCBI servers only if you want to # Run the commands that interact with the NCBI servers only if you want to
# experiment with the code and/or parameters. I have commented out those # experiment specifically with the code and/or parameters. I have commented out
# parts. If you simply want to reproduce the process you can simply # those parts. If you only want to study the general workflow, just load()
# load() the respective intermediate results. # the respective intermediate results.
#
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------------
#TOC> 1 The strategy 54
#TOC> 2 GOLD species 66
#TOC> 2.1 Initialize 71
#TOC> 2.2 Import 77
#TOC> 2.3 Unique species 129
#TOC> 3 BLAST species 171
#TOC> 3.1 find homologous proteins 178
#TOC> 3.2 Identify species in "hits" 202
#TOC> 4 Intersect GOLD and BLAST species 247
#TOC> 5 Cleanup and finish 265
#TOC>
#TOC> ==========================================================================
# ============================================================================== #TOC>
# CREATING A YFO LIST #TOC>
# ==============================================================================
# = 1 The strategy ========================================================
# This script will create a list of "YFO" species and save it in an R object # This script will create a list of "YFO" species and save it in an R object
# YFOspecies that is stored in the data subdirectory of this project from where # YFOspecies that is stored in the data subdirectory of this project from where
# it can be loaded. # it can be loaded. The strategy is as follows: we download a list of all
# genome projects and then select species for which protein annotations are
# available - i.e. these are all genome-sequenced species that have been
# annotated. Then we search for fungal species that have homologues to MBP1.
# Then we intersect the two lists to give us genome-sequenced species that
# also have Mbp1 homologues ...
# ==== GOLD species ============================================================ # = 2 GOLD species ========================================================
#
# Fetch and parse genome data from the NCBI genome project database
# === Initialize # Fetch and parse the Genomes OnLine Database of the Joint Genome Institute
# (https://gold.jgi.doe.gov/). Use the data that is hosted at the NCBI.
# == 2.1 Initialize ========================================================
if (!require(httr)) { # httr provides interfaces to Webservers on the Internet if (!require(httr)) { # httr provides interfaces to Webservers on the Internet
install.packages("httr") install.packages("httr")
library(httr) library(httr)
} }
# The URL where the genome data can be downloaded # == 2.2 Import ============================================================
URL <- "ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt"
# Read the data directly from the NCBI ftp server and put it into a dataframe. # The URL of the genome data directory at the NCBI:
# This will take about a minute. # is https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS
# Note the relative size of the prokaryotes and the eukaryotes data.
# What's in this directory?
URL <- "ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/README"
GOLDreadme <- readLines(URL) # read the file into a vector
cat(GOLDreadme, sep = "\n") # display the contents
# Retrieve the file "eukaryotes" via ftp from the NCBI ftp server and put it
# into a dataframe. This will take a few moments.
# URL <- "ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt"
# GOLDdata <- read.csv(URL, # GOLDdata <- read.csv(URL,
# header = TRUE, # header = TRUE,
# sep = "\t", # sep = "\t",
# stringsAsFactors = FALSE) # stringsAsFactors = FALSE)
# save(GOLDdata, file="data/GOLDdata.RData") # save(GOLDdata, file="data/GOLDdata.RData")
# or ...
load(file="data/GOLDdata.RData") load(file="data/GOLDdata.RData")
@ -72,133 +106,145 @@ table(GOLDdata$Group)
# What subgroups of fungi do we have? # What subgroups of fungi do we have?
table(GOLDdata$SubGroup[GOLDdata$Group == "Fungi"]) table(GOLDdata$SubGroup[GOLDdata$Group == "Fungi"])
# How many of the fungi have protein annotations? # How many of the fungi have protein annotations? The README file told us that
# the column "Proteins" contains "Number of Proteins annotated in the assembly".
# Looking at a few ...
head(GOLDdata$Proteins, 30)
# ... we see that the number varies, and some have a hyphen, i.e. no
# annotations. The hyphens make this a char type column (as per: all elements
# of a vector must have the same type). Therefore we can't read this as numbers
# and filter by some value > 0. But we can filter for all genomes that don't
# have the hyphen:
sum(GOLDdata$Proteins[GOLDdata$Group == "Fungi"] != "-") sum(GOLDdata$Proteins[GOLDdata$Group == "Fungi"] != "-")
# Get a subset of the data, with fungi that have protein annotations # Subset the data, with fungi that have protein annotations
GOLDfungi <- GOLDdata[GOLDdata$Group == "Fungi" & GOLDfungi <- GOLDdata[GOLDdata$Group == "Fungi" &
GOLDdata$Proteins != "-" , ] GOLDdata$Proteins != "-" , ]
# check what we have in the table # check what we have in the table
nrow(GOLDfungi)
head(GOLDfungi) head(GOLDfungi)
# For our purpose of defining species, we pick only the first two words from the
# "X.Organism.Name" column ... here is a function to do this: # == 2.3 Unique species ====================================================
# For our purpose of defining species, we will select only species, not strains
# from this list. To do this, we pick the first two words i.e. the systematic
# binomial name from the "X.Organism.Name" column, and then we remove redundant
# species. Here is a function:
# #
makeBinomial <- function(s) { getBinom <- function(s) {
# input: # Fetch the first two words from a string.
# s: a string which is expected to contain a binomial # Parameters:
# species name as the first two words, followed by other text # s: char a string which is expected to contain a binomial species name
# output: # as the first two words, possibly followed by other text.
# the first two words separeted by a single blank # Value: char the first two words separated by a single blank
# #
x <- unlist(strsplit(s, "\\s+")) # split second element on x <- unlist(strsplit(s, "\\s+")) # split s on one or more whitespace
# one or more whitespace
return(paste(x[1:2], collapse=" ")) # return first two elements return(paste(x[1:2], collapse=" ")) # return first two elements
} }
# iterate through GOLDdata and extract species names # iterate through GOLDdata and extract species names
GOLDspecies <- character() GOLDspecies <- character()
for (i in 1:nrow(GOLDfungi)) { for (i in 1:nrow(GOLDfungi)) {
GOLDspecies[i] <- makeBinomial(GOLDfungi$X.Organism.Name[i]) GOLDspecies[i] <- getBinom(GOLDfungi$X.Organism.Name[i])
} }
head(GOLDspecies)
length(GOLDspecies)
# N.b. this would be more efficiently (but perhaps less explicitly) coded with
# one of the apply() functions, instead of a for-loop.
# GOLDspecies <- unlist(lapply(GOLDfungi$X.Organism.Name, getBinom))
# Species of great interest may may appear more than once, one for each sequenced strain: e.g. brewer's yeast: # Species of great interest may appear more than once, one for each sequenced
# strain: e.g. brewer's yeast:
sum(GOLDspecies == "Saccharomyces cerevisiae") sum(GOLDspecies == "Saccharomyces cerevisiae")
# Therefore we use the function unique() to throw out duplicates. Simple: # Therefore we use the function unique() to throw out duplicates. Simple:
GOLDspecies <- unique(GOLDspecies) GOLDspecies <- unique(GOLDspecies)
length(GOLDspecies) length(GOLDspecies)
# i.e. we got rid of about half of the species. # i.e. we got rid of about 40% of the species by removing duplicates.
# = 3 BLAST species =======================================================
# ==== BLAST species ===========================================================
# #
# Use BLAST to fetch proteins related to Mbp1 and identifying the species that # Next, we filter our list by species that have homologues to the yeast Mbp1
# gene. To do this we run a BLAST search to find all related proteins in any
# fungus. We list the species that appear in that list, and then we select those
# that appear in our GOLD table as well.
#
# == 3.1 find homologous proteins ==========================================
#
# Use BLAST to fetch proteins related to Mbp1 and identify the species that
# contain them. # contain them.
#
# Scripting agains NCBI APIs is not exactly enjoyable - there is usually a fair # Scripting against NCBI APIs is not exactly enjoyable - there is usually a fair
# amount of error handling involved that is not supported by the API in a # amount of error handling involved that is not supported by the API in a
# principled way but requires rather ad hoc solutions. The code I threw # principled way but requires rather ad hoc solutions. The code I threw together
# together to make a BLAST interface for the course is in the file BLAST.R # to make a BLAST interface (demo-quality, not research-quality) is in the file
# Feel encouraged to study how this works. # BLAST.R Feel encouraged to study how this works. It's a pretty standard task
# of communicating with servers and parsing responses - everyday fare in the
source("BLAST.R") # load the function and its utilites # bioinformatics lab. Surprisingly, there seems to be no good BLAST parser
# in currently available packages.
# source("BLAST.R") # load the function and its utilities
# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq # Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
# hits <- BLAST("NP_010227", # Yeast Mbp1 RefSeq ID # BLASThits <- BLAST("NP_010227", # Yeast Mbp1 RefSeq ID
# nHits = 1000, # 633 hits in 2016 # db = "refseq_protein", # database to search in
# E = 0.01, # # nHits = 3000, # 720 hits in 2017
# limits = "txid4751[ORGN]") # = fungi # E = 0.01, #
# save(hits, file="data/BLASThits.RData") # limits = "txid4751[ORGN]") # = fungi
# save(BLASThits, file="data/BLASThits.RData")
load(file="data/BLASThits.RData") load(file="data/BLASThits.RData")
# == 3.2 Identify species in "hits" ========================================
# This is a very big list that can't be usefully analyzed manually. Here # This is a very big list that can't be usefully analyzed manually. Here
# we are only interested in the species names that it contains. # we are only interested in the species names that it contains.
# How many hits in the list? # How many hits in the list?
length(hits$hits) length(BLASThits$hits)
# Let's look at the first one # Let's look at a hit somewhere down the list
str(hits$hit[[1]]) str(BLASThits$hit[[277]])
# the species information is in the $def element - the definition line of the # A fair amount of parsing has gone into the BLAST.R code to prepare the results
# sequence record ... but we need a function to retrieve it. This one is a great # in a useful way. The species information is in the $species element of every
# example, because it is really messy. Have a look: # hit.
hits$hits[[1]]$def
# We get so many species because the exact same hit has been found by BLAST in a # Run a loop to extract all the species names into a vector. We subset ...
# number of RefSeqs sequence records, all of which are different strains of # Blasthits$hits ... the list of hits, from which we choose ...
# yeast. We only need one of those though, any one, so we shall parse out the # Blasthits$hits[[i]] ... the i-th hit, and get ...
# first one. For this, We will simply use the immensely versatile strsplit() # Blasthits$hits[[i]]$species ... the species element from that.
# function, split on square brackets, take the second element of the resulting # Subsetting FTW.
# array and run that through our makeBinomial function.
# define the function (i.e. execute the lines below)
parseDeflineSpecies <- function(s) {
# input:
# s: a string which is expected to contain a binomial
# species name in square brackets embedded in other text
# output:
# the species name found the first bracketed string
#
x <- unlist(strsplit(s, "\\]|\\[")) # split on "]" or "[" characters
return(makeBinomial(x[2])) # return Binomial name from
# second element of vector
}
#test it
parseDeflineSpecies(hits$hits[[1]]$def)
parseDeflineSpecies(hits$hits[[11]]$def)
parseDeflineSpecies(hits$hits[[111]]$def)
# now run a simple loop to extract all the species names into a vector
BLASTspecies <- character() BLASTspecies <- character()
for (i in 1:length(hits$hits)) { for (i in seq_along(BLASThits$hits)) {
BLASTspecies[i] <- parseDeflineSpecies(hits$hits[[i]]$def) BLASTspecies[i] <-BLASThits$hits[[i]]$species
} }
# You can confirm in the Values section of the Environment pane that # You can confirm that BLASTspecies has the expected size.
# BLASTspecies has the expected size. Again, species may appear more than once, length(BLASTspecies)
# e.g.
# Again, some species appear more than once, e.g. ...
sum(BLASTspecies == "Saccharomyces cerevisiae") sum(BLASTspecies == "Saccharomyces cerevisiae")
# Therefore we use the function unique() to throw out duplicates. Simple: # ... corresponding to the five homologous gene sequences (paralogues) of yeast.
# Therefore we use unique() to throw out duplicates:
BLASTspecies <- unique(BLASTspecies) BLASTspecies <- unique(BLASTspecies)
length(BLASTspecies) length(BLASTspecies)
# i.e. we got rid of about one third of the species. # i.e. we got rid of about two thirds of the hits.
#
# You should think about this: what does it mean that on average we have three # You should think about this: what is the biological interpretation of the
# hits by sequence similarity to Mbp1 in other species? # finding that on average we have three sequences that are similar to Mbp1 in
# other species?
# ==== Intersecting BLAST and GOLD species lists =============================== # = 4 Intersect GOLD and BLAST species ====================================
# Now we can compare the two lists for species that appear in both sources: the # Now we can compare the two lists for species that appear in both sources: the
# simplest way is to use the set operation functions union(), intersection() # simplest way is to use the set operation functions union(), intersection()
@ -207,7 +253,21 @@ length(BLASTspecies)
YFOspecies <- intersect(GOLDspecies, BLASTspecies) YFOspecies <- intersect(GOLDspecies, BLASTspecies)
# Just one final thing: some of the species will be our so-called "reference" species for which I will develop model solutions. I have defined them in the .utilities.R file to make them available for future purposes. separately and remove them from the list. # Again: interpret this:
# - what is the number of GOLDspecies?
# - what is the number of BLAST species?
# - how many species are present in both lists?
# - what does it mean if a species is in GOLD but not in the BLAST list?
# - what does it mean if a species has been found during BLAST, but it
# is not in GOLD?
# = 5 Cleanup and finish ==================================================
# One final thing: some of the species will be our so-called "reference" species
# which we use for model solutions and examples in the course. They are defined
# in the .utilities.R file of this project. We remove them from the list so that
# we don't inadvertently assign them.
# #
REFspecies REFspecies
@ -217,4 +277,6 @@ YFOspecies <- sort(setdiff(YFOspecies, REFspecies))
# save(YFOspecies, file = "data/YFOspecies.RData") # save(YFOspecies, file = "data/YFOspecies.RData")
# [END] # [END]

View File

@ -3,11 +3,12 @@
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the BIN-YFO unit # R code accompanying the BIN-YFO unit
# #
# Version: 0.1 # Version: 1.0
# #
# Date: 2017 08 25 # Date: 2017 09 21
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# V 1.0 Final code, after rewriting BLAST parser and creating current YFOlist
# V 0.1 First code copied from BCH441_A03_makeYFOlist.R # V 0.1 First code copied from BCH441_A03_makeYFOlist.R
# #
# TODO: # TODO:
@ -16,27 +17,55 @@
# == HOW TO WORK WITH LEARNING UNIT FILES ====================================== # == HOW TO WORK WITH LEARNING UNIT FILES ======================================
# #
# DO NOT SIMPLY source() THESE FILES! # DO NOT SIMPLY source() THESE FILES!
#
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
# ============================================================================== #TOC> ==========================================================================
# PART ONE: YFO Species #TOC>
# ============================================================================== #TOC> Section Title Line
#TOC> ---------------------------------------
#TOC> 1 Preparations 38
#TOC> 2 Suitable YFO Species 50
#TOC> 3 Adopt "YFO" 64
#TOC>
#TOC> ==========================================================================
# There are some "rabbitholes" that you are encouraged to follow to explore the code that went into generating the YFO species list. The minimal required result however is that you have picked an '''YFO''', and that its name got written into your personalized profile file. # = 1 Preparations ========================================================
#
# Execute the two conditionals below:
if (! file.exists(".myProfile.R")) {
stop("PANIC: profile file does not exist. Fix problem or ask for help.")
}
if (! exists("myStudentNumber")) {
stop("PANIC: profile data wasn't loaded. Fix problem or ask for help.")
}
# = 2 Suitable YFO Species ================================================
# A detailed description of the process of compiling the YFO list of genome
# In this unit we will select one species from a list of genome sequenced fungi
# and write it into your personalized profile file. This species will be called
# "YFO" (Your Favourite Organism) for other learning units and exercises.
# A detailed description of the process of compiling the list of genome
# sequenced fungi with protein annotations and Mbp1 homologues is in the file # sequenced fungi with protein annotations and Mbp1 homologues is in the file
# ABC_makeYFOlist.R I encourage you to study it. Here we load the # ABC-makeYFOlist.R
# resulting vector of species name, then pick one of them in a random but
# reproducible way, determined by your student number. # Task: Study ABC-makeYFOlist.R, it implements a rather typical workflow of
# selecting and combining data from various public-domain data resources.
# = 3 Adopt "YFO" =========================================================
# In the code below, we load the resulting vector of species name, then pick one
# of them in a random but reproducible way, determined by your student number.
load("data/YFOspecies.RData") # load the species names load("data/YFOspecies.RData") # load the species names
set.seed(myStudentNumber) # seed the random number generator set.seed(myStudentNumber) # seed the random number generator
@ -48,21 +77,9 @@ cat(sprintf("YFO <- \"%s\"\n", YFO), file = ".myProfile.R", append = TRUE)
YFO # so, which species is it ... ? YFO # so, which species is it ... ?
biCode(YFO) # and what is it's "BiCode" ... ? biCode(YFO) # and what is it's "BiCode" ... ?
# Task: Note down the species name and its five letter label on your Student
# Wiki user page. Use this species whenever this or future assignments refer
# Note down the species name and its five letter label on your Student Wiki user page. '''Use this species whenever this or future assignments refer to YFO'''. # to YFO. In code, we will automatically load it from your.myProfile.R file.
#
#
# ==============================================================================
# [END] # [END]

361
BLAST.R Normal file
View File

@ -0,0 +1,361 @@
# BLAST.R
#
# Purpose: Send off one BLAST search and return parsed list of results
# This script uses the BLAST URL-API
# (Application Programming Interface) at the NCBI.
# Read about the constraints here:
# https://ncbi.github.io/blast-cloud/dev/api.html
#
#
# Version: 2.0
# Date: 2016 09 - 2017 09
# Author: Boris Steipe
#
# Versions:
# 2.0 Completely rewritten because the interface completely changed.
# Code adpated in part from NCBI Perl sample code:
# $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
#
# 1.0 first version posted for BCH441 2016, based on BLAST - API
#
# ToDo:
#
# Notes: This is somewhat pedestrian, but apparently there are currently
# no R packages that contain such code.
#
# ==============================================================================
if (!require(httr, quietly = TRUE)) {
install.packages("httr")
library(httr)
}
BLAST <- function(q,
db = "refseq_protein",
nHits = 30,
E = 0.1,
limits = "",
rid = "",
quietly = FALSE,
myTimeout = 120) {
# Purpose:
# Basic BLAST search
# Version: 2.0
# Date: 2017-09
# Author: Boris Steipe
#
# Parameters:
# q: query - either a valid ID or a sequence
# db: "refseq_protein" by default,
# other legal valuses include: "nr", "pdb", "swissprot" ...
# nHits: number of hits to maximally return
# E: E-value cutoff. Do not return hits whose score would be expected
# to occur E or more times in a database of random sequence.
# limits: a valid ENTREZ filter
# rid: a request ID - to retrieve earleir search results
# quietly: controls printing of wait-time progress bar
# timeout: how much longer _after_ rtoe to wait for a result
# before giving up (seconds)
# Value:
# result: list of resulting hits and some metadata
EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done
results <- list()
results$rid <- rid
results$rtoe <- 0
if (rid == "") { # we skip, and proceed directly to retrieval
# if rid is not the empty string
# prepare query, GET(), and parse rid and rtoe from BLAST server response
results$query <- paste0("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
"?",
"CMD=Put",
"&PROGRAM=", "blastp",
"&QUERY=", URLencode(q),
"&DATABASE=", db,
"&MATRIX=", "BLOSUM62",
"&EXPECT=", as.character(E),
"&HITLIST_SIZE=", as.character(nHits),
"&ALIGNMENTS=", as.character(nHits),
"&FORMAT_TYPE=Text")
if (limits != "") {
results$query <- paste0(
results$query,
"&ENTREZ_QUERY=", limits)
}
# send it off ...
response <- GET(results$query)
if (http_status(response)$category != "Success" ) {
stop(sprintf("PANIC: Can't send query. BLAST server status error: %s",
http_status(response)$message))
}
txt <- content(response, "text", encoding = "UTF-8")
patt <- "RID = (\\w+)" # match the request id
results$rid <- regmatches(txt, regexec(patt, txt))[[1]][2]
patt <- "RTOE = (\\d+)" # match the expected completion time
results$rtoe <- as.numeric(regmatches(txt, regexec(patt, txt))[[1]][2])
# Now we wait ...
if (quietly) {
Sys.sleep(results$rtoe)
} else {
cat(sprintf("BLAST is processing %s:\n", results$rid))
waitTimer(results$rtoe)
}
} # done sending query and retrieving rid, rtoe
# Enter an infinite loop to check for result availability
checkStatus <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
"?",
"CMD=Get",
"&RID=", results$rid,
"&FORMAT_TYPE=Text",
"&FORMAT_OBJECT=SearchInfo",
sep = "")
while (TRUE) {
# Check whether the result is ready
response <- GET(checkStatus)
if (http_status(response)$category != "Success" ) {
stop(sprintf("PANIC: Can't check status. BLAST server status error: %s",
http_status(response)$message))
}
txt <- content(response, "text", encoding = "UTF-8")
if (length(grep("Status=WAITING", txt)) > 0) {
myTimeout <- myTimeout - EXTRAWAIT
if (myTimeout <= 0) { # abort
cat("BLAST search not concluded before timeout. Aborting.\n")
cat(sprintf("You could check back later with rid \"%s\"\n",
results$rid))
return(results)
}
if (quietly) {
Sys.sleep(EXTRAWAIT)
} else {
cat(sprintf("Status: Waiting. Wait %d more seconds (max. %d more)",
EXTRAWAIT,
myTimeout))
waitTimer(EXTRAWAIT)
next
}
} else if (length(grep("Status=FAILED", txt)) > 0) {
cat("BLAST search returned status \"FAILED\". Aborting.\n")
return(results)
} else if (length(grep("Status=UNKNOWN", txt)) > 0) {
cat("BLAST search returned status \"UNKNOWN\".\n")
cat("This probably means the rid has expired. Aborting.\n")
return(results)
} else if (length(grep("Status=READY", txt)) > 0) { # Done
if (length(grep("ThereAreHits=yes", txt)) == 0) { # No hits
cat("BLAST search ready but no hits found. Aborting.\n")
return(results)
} else {
break # done ... retrieve search result
}
}
} # end result-check loop
# retrieve results from BLAST server
retrieve <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
"?",
"&CMD=Get",
"&RID=", results$rid,
"&FORMAT_TYPE=Text",
sep = "")
response <- GET(retrieve)
if (http_status(response)$category != "Success" ) {
stop(sprintf("PANIC: Can't retrieve. BLAST server status error: %s",
http_status(response)$message))
}
txt <- content(response, "text", encoding = "UTF-8")
# txt contains the whole set of results. Process:
# First, we strsplit() on linebreaks:
txt <- unlist(strsplit(txt, "\n"))
# The alignments range from the first line that begins with ">" ...
iFirst <- grep("^>", txt)[1]
# ... to the last line that begins with "Sbjct"
x <- grep("^Sbjct", txt)
iLast <- x[length(x)]
# Get the alignments block
txt <- txt[iFirst:iLast]
# Drop empty lines
txt <- txt[!(nchar(txt) == 0)]
# A line that ends "]" but does not begin ">" seems to be a split
# defline ... eg.
# [1] ">XP_013349208.1 AUEXF2481DRAFT_695809 [Aureobasidium subglaciale "
# [2] "EXF-2481]"
# Merge these lines to the preceding lines and delete them.
#
x <- which(grepl("]$", txt) & !(grepl("^>", txt)))
txt[x-1] <- paste0(txt[x-1], txt[x])
txt <- txt[-x]
# Special case: there may be multiple deflines when the BLAST hit is to
# redundant, identical sequences. Keep only the first instance.
iKeep <- ! grepl("^>", txt)
x <- rle(iKeep)
x$positions <- cumsum(x$lengths)
i <- which(x$lengths > 1 & x$values == FALSE)
if (length(i) > 0) {
firsts <- x$positions[i] - x$lengths[i] + 1
iKeep[firsts] <- TRUE
txt <- txt[iKeep]
}
# After this preprocessing the following should be true:
# - Every alignment block begins with a defline in which the
# first character is ">"
# - There is only one defline in each block.
# - Lines are not split.
# Make a dataframe of first and last indices of alignment blocks
x <- grep("^>", txt)
blocks <- data.frame(iFirst = x,
iLast = c((x[-1] - 1), length(txt)))
# Build the hits list by parsing the blocks
results$hits <- list()
for (i in seq_len(nrow(blocks))) {
thisBlock <- txt[blocks$iFirst[i]:blocks$iLast[i]]
results$hits[[i]] <- parseBLASTalignment(thisBlock)
}
return(results)
}
parseBLASTalignment <- function(hit) {
# parse one BLAST hit;
# return a list
if (length(grep("Length", hit)) > 1) {
stop("Parsing function can't handle multiple HSPs (yet).")
}
h <- list()
# FASTA defline
h$def <- hit[1]
# accesion number (ID), use the first if there are several, separated by "|"
patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
h$accession <- regmatches(h$def, regexec(patt, h$def))[[1]][2]
# organism
patt <- "\\[(.+)]"
h$organism <- regmatches(h$def, regexec(patt, h$def))[[1]][2]
# species
x <- unlist(strsplit(h$organism, "\\s+"))
if (length(x) < 2) {
h$species <- NA
} else {
h$species <- paste(x[1:2], collapse = " ")
}
# E-value
x <- hit[grep("Expect\\s*=", hit)]
patt <- "Expect\\s*=\\s*([0-9.eE\\-]+)" #
h$E <- as.numeric(regmatches(x, regexec(patt, x))[[1]][2])
# length of hit and # identities
x <- hit[grep("Identities\\s*=", hit)]
patt <- "Identities\\s*=\\s*([0-9]+)/([0-9]+)"
m <- regexec(patt, x)
h$lengthAli <- as.numeric(regmatches(x, m)[[1]][2])
h$nIdentities <- as.numeric(regmatches(x, m)[[1]][3])
# number of gaps
x <- hit[grep("Gaps\\s*=", hit)]
patt <- "Gaps\\s*=\\s*([0-9]+)"
h$nGaps <- as.numeric(regmatches(x, regexec(patt, x))[[1]][2])
# first and last positions
iAli <- grep("^Query\\s+", hit)
h$Qbounds <- getAliBounds(hit[iAli])
h$Sbounds <- getAliBounds(hit[iAli + 2])
# aligned sequences
h$Qseq <- character()
h$midSeq <- character()
h$Sseq <- character()
for (i in iAli) {
patt <- "^Query\\s+[0-9]+\\s*"
first <- attr(regexec(patt, hit[i])[[1]], "match.length") + 1
patt <- "\\s*[0-9]*\\s*$"
last <- regexec(patt, hit[i])[[1]][1] - 1
h$Qseq <- paste0(h$Qseq, substr(hit[i], first, last))
h$midSeq <- paste0(h$midSeq, substr(hit[i + 1], first, last))
h$Sseq <- paste0(h$Sseq, substr(hit[i + 2], first, last))
}
return(h)
}
getAliBounds <- function(s) {
# get first and last position from a vector of BLAST alignments s
# value: numeric vector of first and last position
patt <- "^(Query|Sbjct)\\s+([0-9]+)\\s"
first <- as.numeric(regmatches(s[1], regexec(patt, s[1]))[[1]][3])
patt <- "\\s*([0-9]+)\\s*$"
last <- as.numeric(regmatches(s[length(s)],
regexec(patt, s[length(s)]))[[1]][2])
return(c (first, last))
}
# ==== TESTS ===================================================================
# define query:
# q <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain sequence
# "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
# "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
# sep="")
# or ...
# q <- "NP_010227" # refseq ID
#
# test <- BLAST(q,
# nHits = 100,
# E = 0.001,
# rid = "",
# limits = "txid4751[ORGN]")
# length(test$hits)
# [END]