New code for YFO list and BLAST

This commit is contained in:
hyginn 2017-09-21 17:49:55 -04:00
parent ad783c1254
commit ce3a17da6d
3 changed files with 561 additions and 121 deletions

View File

@ -5,61 +5,95 @@
#
# Version: 1.1
#
# Date: 2016 09 - 2017 08
# Date: 2016 09 - 2017 09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# V 1.1 Update 2017
# V 1.0 First code 2016
#
# TODO:
# actually rerun for 2017
#
# type out workflow
#
# ==============================================================================
#
# DO NOT source() THIS FILE!
#
# This file is code I provide for your deeper understanding of a process and
# to provide you with useful sample code. It is not actually necessary for
# you to run this code, but I encourage you to read it carefully and discuss
# if there are parts you don't understand.
#
# Run the commands that interact with the NCBI servers only if you want to
# experiment with the code and/or parameters. I have commented out those
# parts. If you simply want to reproduce the process you can simply
# load() the respective intermediate results.
# experiment specifically with the code and/or parameters. I have commented out
# those parts. If you only want to study the general workflow, just load()
# the respective intermediate results.
#
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------------
#TOC> 1 The strategy 54
#TOC> 2 GOLD species 66
#TOC> 2.1 Initialize 71
#TOC> 2.2 Import 77
#TOC> 2.3 Unique species 129
#TOC> 3 BLAST species 171
#TOC> 3.1 find homologous proteins 178
#TOC> 3.2 Identify species in "hits" 202
#TOC> 4 Intersect GOLD and BLAST species 247
#TOC> 5 Cleanup and finish 265
#TOC>
#TOC> ==========================================================================
#TOC>
#TOC>
# ==============================================================================
# CREATING A YFO LIST
# ==============================================================================
# = 1 The strategy ========================================================
# This script will create a list of "YFO" species and save it in an R object
# YFOspecies that is stored in the data subdirectory of this project from where
# it can be loaded.
# it can be loaded. The strategy is as follows: we download a list of all
# genome projects and then select species for which protein annotations are
# available - i.e. these are all genome-sequenced species that have been
# annotated. Then we search for fungal species that have homologues to MBP1.
# Then we intersect the two lists to give us genome-sequenced species that
# also have Mbp1 homologues ...
# ==== GOLD species ============================================================
#
# Fetch and parse genome data from the NCBI genome project database
# = 2 GOLD species ========================================================
# === Initialize
# Fetch and parse the Genomes OnLine Database of the Joint Genome Institute
# (https://gold.jgi.doe.gov/). Use the data that is hosted at the NCBI.
# == 2.1 Initialize ========================================================
if (!require(httr)) { # httr provides interfaces to Webservers on the Internet
install.packages("httr")
library(httr)
}
# The URL where the genome data can be downloaded
URL <- "ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt"
# == 2.2 Import ============================================================
# Read the data directly from the NCBI ftp server and put it into a dataframe.
# This will take about a minute.
# The URL of the genome data directory at the NCBI:
# is https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS
# Note the relative size of the prokaryotes and the eukaryotes data.
# What's in this directory?
URL <- "ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/README"
GOLDreadme <- readLines(URL) # read the file into a vector
cat(GOLDreadme, sep = "\n") # display the contents
# Retrieve the file "eukaryotes" via ftp from the NCBI ftp server and put it
# into a dataframe. This will take a few moments.
# URL <- "ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt"
# GOLDdata <- read.csv(URL,
# header = TRUE,
# sep = "\t",
# stringsAsFactors = FALSE)
# save(GOLDdata, file="data/GOLDdata.RData")
# or ...
load(file="data/GOLDdata.RData")
@ -72,133 +106,145 @@ table(GOLDdata$Group)
# What subgroups of fungi do we have?
table(GOLDdata$SubGroup[GOLDdata$Group == "Fungi"])
# How many of the fungi have protein annotations?
# How many of the fungi have protein annotations? The README file told us that
# the column "Proteins" contains "Number of Proteins annotated in the assembly".
# Looking at a few ...
head(GOLDdata$Proteins, 30)
# ... we see that the number varies, and some have a hyphen, i.e. no
# annotations. The hyphens make this a char type column (as per: all elements
# of a vector must have the same type). Therefore we can't read this as numbers
# and filter by some value > 0. But we can filter for all genomes that don't
# have the hyphen:
sum(GOLDdata$Proteins[GOLDdata$Group == "Fungi"] != "-")
# Get a subset of the data, with fungi that have protein annotations
# Subset the data, with fungi that have protein annotations
GOLDfungi <- GOLDdata[GOLDdata$Group == "Fungi" &
GOLDdata$Proteins != "-" , ]
# check what we have in the table
nrow(GOLDfungi)
head(GOLDfungi)
# For our purpose of defining species, we pick only the first two words from the
# "X.Organism.Name" column ... here is a function to do this:
# == 2.3 Unique species ====================================================
# For our purpose of defining species, we will select only species, not strains
# from this list. To do this, we pick the first two words i.e. the systematic
# binomial name from the "X.Organism.Name" column, and then we remove redundant
# species. Here is a function:
#
makeBinomial <- function(s) {
# input:
# s: a string which is expected to contain a binomial
# species name as the first two words, followed by other text
# output:
# the first two words separeted by a single blank
getBinom <- function(s) {
# Fetch the first two words from a string.
# Parameters:
# s: char a string which is expected to contain a binomial species name
# as the first two words, possibly followed by other text.
# Value: char the first two words separated by a single blank
#
x <- unlist(strsplit(s, "\\s+")) # split second element on
# one or more whitespace
x <- unlist(strsplit(s, "\\s+")) # split s on one or more whitespace
return(paste(x[1:2], collapse=" ")) # return first two elements
}
# iterate through GOLDdata and extract species names
GOLDspecies <- character()
for (i in 1:nrow(GOLDfungi)) {
GOLDspecies[i] <- makeBinomial(GOLDfungi$X.Organism.Name[i])
GOLDspecies[i] <- getBinom(GOLDfungi$X.Organism.Name[i])
}
head(GOLDspecies)
length(GOLDspecies)
# N.b. this would be more efficiently (but perhaps less explicitly) coded with
# one of the apply() functions, instead of a for-loop.
# GOLDspecies <- unlist(lapply(GOLDfungi$X.Organism.Name, getBinom))
# Species of great interest may may appear more than once, one for each sequenced strain: e.g. brewer's yeast:
# Species of great interest may appear more than once, one for each sequenced
# strain: e.g. brewer's yeast:
sum(GOLDspecies == "Saccharomyces cerevisiae")
# Therefore we use the function unique() to throw out duplicates. Simple:
GOLDspecies <- unique(GOLDspecies)
length(GOLDspecies)
# i.e. we got rid of about half of the species.
# i.e. we got rid of about 40% of the species by removing duplicates.
# ==== BLAST species ===========================================================
# = 3 BLAST species =======================================================
#
# Use BLAST to fetch proteins related to Mbp1 and identifying the species that
# Next, we filter our list by species that have homologues to the yeast Mbp1
# gene. To do this we run a BLAST search to find all related proteins in any
# fungus. We list the species that appear in that list, and then we select those
# that appear in our GOLD table as well.
#
# == 3.1 find homologous proteins ==========================================
#
# Use BLAST to fetch proteins related to Mbp1 and identify the species that
# contain them.
#
# Scripting agains NCBI APIs is not exactly enjoyable - there is usually a fair
# Scripting against NCBI APIs is not exactly enjoyable - there is usually a fair
# amount of error handling involved that is not supported by the API in a
# principled way but requires rather ad hoc solutions. The code I threw
# together to make a BLAST interface for the course is in the file BLAST.R
# Feel encouraged to study how this works.
source("BLAST.R") # load the function and its utilites
# principled way but requires rather ad hoc solutions. The code I threw together
# to make a BLAST interface (demo-quality, not research-quality) is in the file
# BLAST.R Feel encouraged to study how this works. It's a pretty standard task
# of communicating with servers and parsing responses - everyday fare in the
# bioinformatics lab. Surprisingly, there seems to be no good BLAST parser
# in currently available packages.
# source("BLAST.R") # load the function and its utilities
# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
# hits <- BLAST("NP_010227", # Yeast Mbp1 RefSeq ID
# nHits = 1000, # 633 hits in 2016
# E = 0.01, #
# limits = "txid4751[ORGN]") # = fungi
# save(hits, file="data/BLASThits.RData")
# BLASThits <- BLAST("NP_010227", # Yeast Mbp1 RefSeq ID
# db = "refseq_protein", # database to search in
# nHits = 3000, # 720 hits in 2017
# E = 0.01, #
# limits = "txid4751[ORGN]") # = fungi
# save(BLASThits, file="data/BLASThits.RData")
load(file="data/BLASThits.RData")
# == 3.2 Identify species in "hits" ========================================
# This is a very big list that can't be usefully analyzed manually. Here
# we are only interested in the species names that it contains.
# How many hits in the list?
length(hits$hits)
length(BLASThits$hits)
# Let's look at the first one
str(hits$hit[[1]])
# Let's look at a hit somewhere down the list
str(BLASThits$hit[[277]])
# the species information is in the $def element - the definition line of the
# sequence record ... but we need a function to retrieve it. This one is a great
# example, because it is really messy. Have a look:
hits$hits[[1]]$def
# A fair amount of parsing has gone into the BLAST.R code to prepare the results
# in a useful way. The species information is in the $species element of every
# hit.
# We get so many species because the exact same hit has been found by BLAST in a
# number of RefSeqs sequence records, all of which are different strains of
# yeast. We only need one of those though, any one, so we shall parse out the
# first one. For this, We will simply use the immensely versatile strsplit()
# function, split on square brackets, take the second element of the resulting
# array and run that through our makeBinomial function.
# Run a loop to extract all the species names into a vector. We subset ...
# Blasthits$hits ... the list of hits, from which we choose ...
# Blasthits$hits[[i]] ... the i-th hit, and get ...
# Blasthits$hits[[i]]$species ... the species element from that.
# Subsetting FTW.
# define the function (i.e. execute the lines below)
parseDeflineSpecies <- function(s) {
# input:
# s: a string which is expected to contain a binomial
# species name in square brackets embedded in other text
# output:
# the species name found the first bracketed string
#
x <- unlist(strsplit(s, "\\]|\\[")) # split on "]" or "[" characters
return(makeBinomial(x[2])) # return Binomial name from
# second element of vector
}
#test it
parseDeflineSpecies(hits$hits[[1]]$def)
parseDeflineSpecies(hits$hits[[11]]$def)
parseDeflineSpecies(hits$hits[[111]]$def)
# now run a simple loop to extract all the species names into a vector
BLASTspecies <- character()
for (i in 1:length(hits$hits)) {
BLASTspecies[i] <- parseDeflineSpecies(hits$hits[[i]]$def)
for (i in seq_along(BLASThits$hits)) {
BLASTspecies[i] <-BLASThits$hits[[i]]$species
}
# You can confirm in the Values section of the Environment pane that
# BLASTspecies has the expected size. Again, species may appear more than once,
# e.g.
# You can confirm that BLASTspecies has the expected size.
length(BLASTspecies)
# Again, some species appear more than once, e.g. ...
sum(BLASTspecies == "Saccharomyces cerevisiae")
# Therefore we use the function unique() to throw out duplicates. Simple:
# ... corresponding to the five homologous gene sequences (paralogues) of yeast.
# Therefore we use unique() to throw out duplicates:
BLASTspecies <- unique(BLASTspecies)
length(BLASTspecies)
# i.e. we got rid of about one third of the species.
#
# You should think about this: what does it mean that on average we have three
# hits by sequence similarity to Mbp1 in other species?
# i.e. we got rid of about two thirds of the hits.
# You should think about this: what is the biological interpretation of the
# finding that on average we have three sequences that are similar to Mbp1 in
# other species?
# ==== Intersecting BLAST and GOLD species lists ===============================
# = 4 Intersect GOLD and BLAST species ====================================
# Now we can compare the two lists for species that appear in both sources: the
# simplest way is to use the set operation functions union(), intersection()
@ -207,7 +253,21 @@ length(BLASTspecies)
YFOspecies <- intersect(GOLDspecies, BLASTspecies)
# Just one final thing: some of the species will be our so-called "reference" species for which I will develop model solutions. I have defined them in the .utilities.R file to make them available for future purposes. separately and remove them from the list.
# Again: interpret this:
# - what is the number of GOLDspecies?
# - what is the number of BLAST species?
# - how many species are present in both lists?
# - what does it mean if a species is in GOLD but not in the BLAST list?
# - what does it mean if a species has been found during BLAST, but it
# is not in GOLD?
# = 5 Cleanup and finish ==================================================
# One final thing: some of the species will be our so-called "reference" species
# which we use for model solutions and examples in the course. They are defined
# in the .utilities.R file of this project. We remove them from the list so that
# we don't inadvertently assign them.
#
REFspecies
@ -217,4 +277,6 @@ YFOspecies <- sort(setdiff(YFOspecies, REFspecies))
# save(YFOspecies, file = "data/YFOspecies.RData")
# [END]

View File

@ -3,11 +3,12 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-YFO unit
#
# Version: 0.1
# Version: 1.0
#
# Date: 2017 08 25
# Date: 2017 09 21
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# V 1.0 Final code, after rewriting BLAST parser and creating current YFOlist
# V 0.1 First code copied from BCH441_A03_makeYFOlist.R
#
# TODO:
@ -16,27 +17,55 @@
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
#
# DO NOT SIMPLY source() THESE FILES!
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------
#TOC> 1 Preparations 38
#TOC> 2 Suitable YFO Species 50
#TOC> 3 Adopt "YFO" 64
#TOC>
#TOC> ==========================================================================
# ==============================================================================
# PART ONE: YFO Species
# ==============================================================================
# = 1 Preparations ========================================================
#
# Execute the two conditionals below:
if (! file.exists(".myProfile.R")) {
stop("PANIC: profile file does not exist. Fix problem or ask for help.")
}
if (! exists("myStudentNumber")) {
stop("PANIC: profile data wasn't loaded. Fix problem or ask for help.")
}
# There are some "rabbitholes" that you are encouraged to follow to explore the code that went into generating the YFO species list. The minimal required result however is that you have picked an '''YFO''', and that its name got written into your personalized profile file.
# = 2 Suitable YFO Species ================================================
# In this unit we will select one species from a list of genome sequenced fungi
# and write it into your personalized profile file. This species will be called
# "YFO" (Your Favourite Organism) for other learning units and exercises.
# A detailed description of the process of compiling the YFO list of genome
# A detailed description of the process of compiling the list of genome
# sequenced fungi with protein annotations and Mbp1 homologues is in the file
# ABC_makeYFOlist.R I encourage you to study it. Here we load the
# resulting vector of species name, then pick one of them in a random but
# reproducible way, determined by your student number.
# ABC-makeYFOlist.R
# Task: Study ABC-makeYFOlist.R, it implements a rather typical workflow of
# selecting and combining data from various public-domain data resources.
# = 3 Adopt "YFO" =========================================================
# In the code below, we load the resulting vector of species name, then pick one
# of them in a random but reproducible way, determined by your student number.
load("data/YFOspecies.RData") # load the species names
set.seed(myStudentNumber) # seed the random number generator
@ -48,21 +77,9 @@ cat(sprintf("YFO <- \"%s\"\n", YFO), file = ".myProfile.R", append = TRUE)
YFO # so, which species is it ... ?
biCode(YFO) # and what is it's "BiCode" ... ?
# Note down the species name and its five letter label on your Student Wiki user page. '''Use this species whenever this or future assignments refer to YFO'''.
#
#
# ==============================================================================
# Task: Note down the species name and its five letter label on your Student
# Wiki user page. Use this species whenever this or future assignments refer
# to YFO. In code, we will automatically load it from your.myProfile.R file.
# [END]

361
BLAST.R Normal file
View File

@ -0,0 +1,361 @@
# BLAST.R
#
# Purpose: Send off one BLAST search and return parsed list of results
# This script uses the BLAST URL-API
# (Application Programming Interface) at the NCBI.
# Read about the constraints here:
# https://ncbi.github.io/blast-cloud/dev/api.html
#
#
# Version: 2.0
# Date: 2016 09 - 2017 09
# Author: Boris Steipe
#
# Versions:
# 2.0 Completely rewritten because the interface completely changed.
# Code adpated in part from NCBI Perl sample code:
# $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
#
# 1.0 first version posted for BCH441 2016, based on BLAST - API
#
# ToDo:
#
# Notes: This is somewhat pedestrian, but apparently there are currently
# no R packages that contain such code.
#
# ==============================================================================
if (!require(httr, quietly = TRUE)) {
install.packages("httr")
library(httr)
}
BLAST <- function(q,
db = "refseq_protein",
nHits = 30,
E = 0.1,
limits = "",
rid = "",
quietly = FALSE,
myTimeout = 120) {
# Purpose:
# Basic BLAST search
# Version: 2.0
# Date: 2017-09
# Author: Boris Steipe
#
# Parameters:
# q: query - either a valid ID or a sequence
# db: "refseq_protein" by default,
# other legal valuses include: "nr", "pdb", "swissprot" ...
# nHits: number of hits to maximally return
# E: E-value cutoff. Do not return hits whose score would be expected
# to occur E or more times in a database of random sequence.
# limits: a valid ENTREZ filter
# rid: a request ID - to retrieve earleir search results
# quietly: controls printing of wait-time progress bar
# timeout: how much longer _after_ rtoe to wait for a result
# before giving up (seconds)
# Value:
# result: list of resulting hits and some metadata
EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done
results <- list()
results$rid <- rid
results$rtoe <- 0
if (rid == "") { # we skip, and proceed directly to retrieval
# if rid is not the empty string
# prepare query, GET(), and parse rid and rtoe from BLAST server response
results$query <- paste0("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
"?",
"CMD=Put",
"&PROGRAM=", "blastp",
"&QUERY=", URLencode(q),
"&DATABASE=", db,
"&MATRIX=", "BLOSUM62",
"&EXPECT=", as.character(E),
"&HITLIST_SIZE=", as.character(nHits),
"&ALIGNMENTS=", as.character(nHits),
"&FORMAT_TYPE=Text")
if (limits != "") {
results$query <- paste0(
results$query,
"&ENTREZ_QUERY=", limits)
}
# send it off ...
response <- GET(results$query)
if (http_status(response)$category != "Success" ) {
stop(sprintf("PANIC: Can't send query. BLAST server status error: %s",
http_status(response)$message))
}
txt <- content(response, "text", encoding = "UTF-8")
patt <- "RID = (\\w+)" # match the request id
results$rid <- regmatches(txt, regexec(patt, txt))[[1]][2]
patt <- "RTOE = (\\d+)" # match the expected completion time
results$rtoe <- as.numeric(regmatches(txt, regexec(patt, txt))[[1]][2])
# Now we wait ...
if (quietly) {
Sys.sleep(results$rtoe)
} else {
cat(sprintf("BLAST is processing %s:\n", results$rid))
waitTimer(results$rtoe)
}
} # done sending query and retrieving rid, rtoe
# Enter an infinite loop to check for result availability
checkStatus <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
"?",
"CMD=Get",
"&RID=", results$rid,
"&FORMAT_TYPE=Text",
"&FORMAT_OBJECT=SearchInfo",
sep = "")
while (TRUE) {
# Check whether the result is ready
response <- GET(checkStatus)
if (http_status(response)$category != "Success" ) {
stop(sprintf("PANIC: Can't check status. BLAST server status error: %s",
http_status(response)$message))
}
txt <- content(response, "text", encoding = "UTF-8")
if (length(grep("Status=WAITING", txt)) > 0) {
myTimeout <- myTimeout - EXTRAWAIT
if (myTimeout <= 0) { # abort
cat("BLAST search not concluded before timeout. Aborting.\n")
cat(sprintf("You could check back later with rid \"%s\"\n",
results$rid))
return(results)
}
if (quietly) {
Sys.sleep(EXTRAWAIT)
} else {
cat(sprintf("Status: Waiting. Wait %d more seconds (max. %d more)",
EXTRAWAIT,
myTimeout))
waitTimer(EXTRAWAIT)
next
}
} else if (length(grep("Status=FAILED", txt)) > 0) {
cat("BLAST search returned status \"FAILED\". Aborting.\n")
return(results)
} else if (length(grep("Status=UNKNOWN", txt)) > 0) {
cat("BLAST search returned status \"UNKNOWN\".\n")
cat("This probably means the rid has expired. Aborting.\n")
return(results)
} else if (length(grep("Status=READY", txt)) > 0) { # Done
if (length(grep("ThereAreHits=yes", txt)) == 0) { # No hits
cat("BLAST search ready but no hits found. Aborting.\n")
return(results)
} else {
break # done ... retrieve search result
}
}
} # end result-check loop
# retrieve results from BLAST server
retrieve <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
"?",
"&CMD=Get",
"&RID=", results$rid,
"&FORMAT_TYPE=Text",
sep = "")
response <- GET(retrieve)
if (http_status(response)$category != "Success" ) {
stop(sprintf("PANIC: Can't retrieve. BLAST server status error: %s",
http_status(response)$message))
}
txt <- content(response, "text", encoding = "UTF-8")
# txt contains the whole set of results. Process:
# First, we strsplit() on linebreaks:
txt <- unlist(strsplit(txt, "\n"))
# The alignments range from the first line that begins with ">" ...
iFirst <- grep("^>", txt)[1]
# ... to the last line that begins with "Sbjct"
x <- grep("^Sbjct", txt)
iLast <- x[length(x)]
# Get the alignments block
txt <- txt[iFirst:iLast]
# Drop empty lines
txt <- txt[!(nchar(txt) == 0)]
# A line that ends "]" but does not begin ">" seems to be a split
# defline ... eg.
# [1] ">XP_013349208.1 AUEXF2481DRAFT_695809 [Aureobasidium subglaciale "
# [2] "EXF-2481]"
# Merge these lines to the preceding lines and delete them.
#
x <- which(grepl("]$", txt) & !(grepl("^>", txt)))
txt[x-1] <- paste0(txt[x-1], txt[x])
txt <- txt[-x]
# Special case: there may be multiple deflines when the BLAST hit is to
# redundant, identical sequences. Keep only the first instance.
iKeep <- ! grepl("^>", txt)
x <- rle(iKeep)
x$positions <- cumsum(x$lengths)
i <- which(x$lengths > 1 & x$values == FALSE)
if (length(i) > 0) {
firsts <- x$positions[i] - x$lengths[i] + 1
iKeep[firsts] <- TRUE
txt <- txt[iKeep]
}
# After this preprocessing the following should be true:
# - Every alignment block begins with a defline in which the
# first character is ">"
# - There is only one defline in each block.
# - Lines are not split.
# Make a dataframe of first and last indices of alignment blocks
x <- grep("^>", txt)
blocks <- data.frame(iFirst = x,
iLast = c((x[-1] - 1), length(txt)))
# Build the hits list by parsing the blocks
results$hits <- list()
for (i in seq_len(nrow(blocks))) {
thisBlock <- txt[blocks$iFirst[i]:blocks$iLast[i]]
results$hits[[i]] <- parseBLASTalignment(thisBlock)
}
return(results)
}
parseBLASTalignment <- function(hit) {
# parse one BLAST hit;
# return a list
if (length(grep("Length", hit)) > 1) {
stop("Parsing function can't handle multiple HSPs (yet).")
}
h <- list()
# FASTA defline
h$def <- hit[1]
# accesion number (ID), use the first if there are several, separated by "|"
patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
h$accession <- regmatches(h$def, regexec(patt, h$def))[[1]][2]
# organism
patt <- "\\[(.+)]"
h$organism <- regmatches(h$def, regexec(patt, h$def))[[1]][2]
# species
x <- unlist(strsplit(h$organism, "\\s+"))
if (length(x) < 2) {
h$species <- NA
} else {
h$species <- paste(x[1:2], collapse = " ")
}
# E-value
x <- hit[grep("Expect\\s*=", hit)]
patt <- "Expect\\s*=\\s*([0-9.eE\\-]+)" #
h$E <- as.numeric(regmatches(x, regexec(patt, x))[[1]][2])
# length of hit and # identities
x <- hit[grep("Identities\\s*=", hit)]
patt <- "Identities\\s*=\\s*([0-9]+)/([0-9]+)"
m <- regexec(patt, x)
h$lengthAli <- as.numeric(regmatches(x, m)[[1]][2])
h$nIdentities <- as.numeric(regmatches(x, m)[[1]][3])
# number of gaps
x <- hit[grep("Gaps\\s*=", hit)]
patt <- "Gaps\\s*=\\s*([0-9]+)"
h$nGaps <- as.numeric(regmatches(x, regexec(patt, x))[[1]][2])
# first and last positions
iAli <- grep("^Query\\s+", hit)
h$Qbounds <- getAliBounds(hit[iAli])
h$Sbounds <- getAliBounds(hit[iAli + 2])
# aligned sequences
h$Qseq <- character()
h$midSeq <- character()
h$Sseq <- character()
for (i in iAli) {
patt <- "^Query\\s+[0-9]+\\s*"
first <- attr(regexec(patt, hit[i])[[1]], "match.length") + 1
patt <- "\\s*[0-9]*\\s*$"
last <- regexec(patt, hit[i])[[1]][1] - 1
h$Qseq <- paste0(h$Qseq, substr(hit[i], first, last))
h$midSeq <- paste0(h$midSeq, substr(hit[i + 1], first, last))
h$Sseq <- paste0(h$Sseq, substr(hit[i + 2], first, last))
}
return(h)
}
getAliBounds <- function(s) {
# get first and last position from a vector of BLAST alignments s
# value: numeric vector of first and last position
patt <- "^(Query|Sbjct)\\s+([0-9]+)\\s"
first <- as.numeric(regmatches(s[1], regexec(patt, s[1]))[[1]][3])
patt <- "\\s*([0-9]+)\\s*$"
last <- as.numeric(regmatches(s[length(s)],
regexec(patt, s[length(s)]))[[1]][2])
return(c (first, last))
}
# ==== TESTS ===================================================================
# define query:
# q <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain sequence
# "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
# "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
# sep="")
# or ...
# q <- "NP_010227" # refseq ID
#
# test <- BLAST(q,
# nHits = 100,
# E = 0.001,
# rid = "",
# limits = "txid4751[ORGN]")
# length(test$hits)
# [END]