Updates for BIN-ALI-BLAST
This commit is contained in:
parent
59ab6c573f
commit
bc4afc97aa
273
BIN-ALI-BLAST.R
273
BIN-ALI-BLAST.R
@ -3,253 +3,108 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-ALI-BLAST unit.
|
# R code accompanying the BIN-ALI-BLAST unit.
|
||||||
#
|
#
|
||||||
# Version: 0.1
|
# Version: 1.0
|
||||||
#
|
#
|
||||||
# Date: 2017 08 28
|
# Date: 2017 10 23
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.0 First live version 2017.
|
||||||
# 0.1 First code copied from 2016 material.
|
# 0.1 First code copied from 2016 material.
|
||||||
|
#
|
||||||
#
|
#
|
||||||
# TODO:
|
# TODO:
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||||
|
#
|
||||||
# If there are portions you don't understand, use R's help system, Google for an
|
# If there are portions you don't understand, use R's help system, Google for an
|
||||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||||
# going on. That's not how it works ...
|
# going on. That's not how it works ...
|
||||||
|
|
||||||
# ==============================================================================
|
|
||||||
|
|
||||||
# = 1 ___Section___
|
|
||||||
|
|
||||||
# BLAST.R
|
|
||||||
#
|
|
||||||
# Purpose: Send off one BLAST search and return parsed list of results
|
|
||||||
# This script uses the BLAST URL-API
|
|
||||||
# (Application Programming Interface) at the NCBI.
|
|
||||||
# Read about the constraints here:
|
|
||||||
# http://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYP=DeveloperInfo
|
|
||||||
#
|
|
||||||
#
|
|
||||||
# Version: 1.0
|
|
||||||
# Date: 2016-09
|
|
||||||
# Author: Boris Steipe
|
|
||||||
#
|
|
||||||
#
|
|
||||||
# ToDo:
|
|
||||||
# Notes: The Bioconductor "annotate" package contains code for BLAST searches,
|
|
||||||
# in case you need to do something more involved.
|
|
||||||
#
|
#
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
|
|
||||||
|
#TOC> ==========================================================================
|
||||||
# Dependencies: myEmail must exist as a global variable with
|
#TOC>
|
||||||
# your valid email address
|
#TOC> Section Title Line
|
||||||
# waitTimer() must be loaded (it should have been loaded from
|
#TOC> ---------------------------------------------
|
||||||
# .utilities.R, which was sourced via .Rprofile)
|
#TOC> 1 Packages 41
|
||||||
|
#TOC> 2 Defining the APSES domain 50
|
||||||
|
#TOC> 3 Executing the BLAST search 72
|
||||||
|
#TOC> 4 Analysing results 94
|
||||||
|
#TOC>
|
||||||
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
# library to interface with WebServers and process their XML/HTML
|
|
||||||
# responses
|
|
||||||
if (!require(xml2, quietly = TRUE)) {
|
|
||||||
install.packages("xml2")
|
|
||||||
library(xml2)
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!require(httr, quietly = TRUE)) {
|
|
||||||
install.packages("httr")
|
# = 1 Packages ============================================================
|
||||||
library(httr)
|
|
||||||
|
if (!require(Biostrings, quietly=TRUE)) {
|
||||||
|
source("https://bioconductor.org/biocLite.R")
|
||||||
|
biocLite("Biostrings")
|
||||||
|
library(Biostrings)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# = 2 Defining the APSES domain ===========================================
|
||||||
|
|
||||||
parseBLAST_XML <- function(hit) {
|
# Load your protein database
|
||||||
# parse one BLAST hit XML node with the xml2 package;
|
source("makeProteinDB.R")
|
||||||
# return a list
|
|
||||||
|
|
||||||
h <- list()
|
# Get the APSES domain sequence for MBP1_MYSPE feature annotation. (You have
|
||||||
h$id <- xml_text(xml_find_first(hit, ".//Hit_accession"))
|
# entered this data in the BIN-ALI-Optimal_sequence_alignment unit.)
|
||||||
h$def <- xml_text(xml_find_first(hit, ".//Hit_def"))
|
|
||||||
h$bestE <- Inf
|
|
||||||
h$sumLen <- 0
|
|
||||||
h$sumId <- 0
|
|
||||||
h$sumGap <- 0
|
|
||||||
hsps <- xml_find_all(hit, ".//Hsp")
|
|
||||||
h$Hsp <- list()
|
|
||||||
h$nHsps <- length(hsps)
|
|
||||||
if (h$nHsps > 0) {
|
|
||||||
for (i in 1:length(hsps)) {
|
|
||||||
h$Hsp[[i]] <- list()
|
|
||||||
h$Hsp[[i]]$e <- xml_numeric(hsps[i], ".//Hsp_evalue")
|
|
||||||
h$Hsp[[i]]$q_from <- xml_numeric(hsps[i], ".//Hsp_query-from")
|
|
||||||
h$Hsp[[i]]$q_to <- xml_numeric(hsps[i], ".//Hsp_query-to")
|
|
||||||
h$Hsp[[i]]$h_from <- xml_numeric(hsps[i], ".//Hsp_hit-from")
|
|
||||||
h$Hsp[[i]]$h_to <- xml_numeric(hsps[i], ".//Hsp_hit-to")
|
|
||||||
h$Hsp[[i]]$h_identity <- xml_numeric(hsps[i], ".//Hsp_identity")
|
|
||||||
h$Hsp[[i]]$h_gaps <- xml_numeric(hsps[i], ".//Hsp_gaps")
|
|
||||||
h$Hsp[[i]]$h_len <- xml_numeric(hsps[i], ".//Hsp_align-len")
|
|
||||||
h$Hsp[[i]]$qseq <- xml_text(xml_find_first(hsps[i], ".//Hsp_qseq"))
|
|
||||||
h$Hsp[[i]]$mid <- xml_text(xml_find_first(hsps[i], ".//Hsp_midline"))
|
|
||||||
h$Hsp[[i]]$hseq <- xml_text(xml_find_first(hsps[i], ".//Hsp_hseq"))
|
|
||||||
|
|
||||||
h$bestE <- min(h$bestE, h$Hsp[[i]]$e)
|
(proID <- myDB$protein$ID[myDB$protein$name == "MBP1_<MYSSPE>"]) # <<< EDIT
|
||||||
h$sumLen <- h$sumLen + h$Hsp[[i]]$h_len
|
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
|
||||||
h$sumId <- h$sumId + h$Hsp[[i]]$h_identity
|
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
|
||||||
h$sumGap <- h$sumGap + h$Hsp[[i]]$h_gaps
|
myDB$annotation$featureID == ftrID])
|
||||||
}
|
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
|
||||||
}
|
(end <- myDB$annotation$end[myDB$annotation$ID == fanID])
|
||||||
return(h)
|
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
|
||||||
}
|
start,
|
||||||
|
end))
|
||||||
|
|
||||||
xml_numeric <- function(n, p) {
|
# The MYSPE "apses" sequence is the sequence that we will use for our reverse
|
||||||
# Utility: return first node matching xpath p in XML node n as numeric
|
# BLAST search.
|
||||||
return(as.numeric(xml_text(xml_find_first(n, p))))
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
BLAST <- function(q,
|
# = 3 Executing the BLAST search ==========================================
|
||||||
db = "refseq_protein",
|
|
||||||
nHits = 30,
|
|
||||||
E = 3,
|
|
||||||
limits = "\"\"",
|
|
||||||
email = myEMail,
|
|
||||||
rid = "",
|
|
||||||
quietly = FALSE) {
|
|
||||||
# Purpose:
|
|
||||||
# Basic BLAST search
|
|
||||||
# Version: 1.0
|
|
||||||
# Date: 2016-09
|
|
||||||
# Author: Boris Steipe
|
|
||||||
#
|
|
||||||
# Parameters:
|
|
||||||
# q: query - either a valid ID or a sequence
|
|
||||||
# db: "refseq_protein" by default,
|
|
||||||
# other legal values include: "nr", "pdb", "swissprot" ...
|
|
||||||
# nHits: number of hits to maximally return
|
|
||||||
# E: E-value cutoff. Do not return hits whose score would be expected
|
|
||||||
# to occur E or more times in a database of random sequence.
|
|
||||||
# limits: a valid ENTREZ filter
|
|
||||||
# email: a valid email address, defaults to global value myEMail
|
|
||||||
# quietly: controls printing of wait-time progress bar
|
|
||||||
# Value:
|
|
||||||
# result: list of resulting hits and some metadata
|
|
||||||
|
|
||||||
results <- list()
|
# The ./scripts/BLAST.R code defines two functions to access the BLAST interface
|
||||||
results$rid <- rid
|
# through its Web API, and to parse results. Have a look at the script, then
|
||||||
results$rtoe <- 0
|
# source it:
|
||||||
|
|
||||||
if (rid == "") { # prepare, send and analyse query
|
source("./scripts/BLAST.R")
|
||||||
results$query <- paste(
|
|
||||||
"https://www.ncbi.nlm.nih.gov/blast/Blast.cgi",
|
|
||||||
"?",
|
|
||||||
"QUERY=", q,
|
|
||||||
"&DATABASE=", db,
|
|
||||||
"&HITLIST_SIZE=", as.character(nHits),
|
|
||||||
"&EXPECT=", as.character(E),
|
|
||||||
"&PROGRAM=", "blastp",
|
|
||||||
"&ENTREZ_QUERY=", limits,
|
|
||||||
"&NOHEADER=", "true",
|
|
||||||
"&EMAIL=", email,
|
|
||||||
"&CMD=Put",
|
|
||||||
sep = "")
|
|
||||||
|
|
||||||
# send it off ...
|
# Use BLAST() to find the best match to the MYSPE APSES domain in Saccharomyces
|
||||||
response <- read_xml(results$query, as_html = TRUE)
|
# cerevisiae:
|
||||||
|
|
||||||
# find the comment node that contains the information we need
|
BLASThits <- BLAST(apses, # MYSPE APSES domain sequence
|
||||||
# using an xpath expression
|
db = "refseq_protein", # database to search in
|
||||||
info <- xml_find_first(response,
|
nHits = 10, #
|
||||||
"//comment()[contains(., \"QBlastInfo\")]")
|
E = 0.01, #
|
||||||
|
limits = "txid559292[ORGN]") # S. cerevisiae S288c
|
||||||
info <- xml_text(info) # extract its contents
|
|
||||||
|
|
||||||
# parse
|
|
||||||
results$rid <- regmatches(info,
|
|
||||||
regexec("RID = (\\w+)", info))[[1]][2]
|
|
||||||
results$rtoe <- regmatches(info,
|
|
||||||
regexec("RTOE = (\\d+)", info))[[1]][2]
|
|
||||||
results$rtoe <- as.numeric(results$rtoe)
|
|
||||||
} # done analysing query
|
|
||||||
|
|
||||||
# Now we wait ...
|
|
||||||
if (quietly) {
|
|
||||||
Sys.sleep(results$rtoe)
|
|
||||||
} else {
|
|
||||||
cat(sprintf("BLAST is processing %s:\n", results$rid))
|
|
||||||
waitTimer(results$rtoe)
|
|
||||||
}
|
|
||||||
|
|
||||||
# retrieve results from BLAST server
|
|
||||||
URL <- paste("https://www.ncbi.nlm.nih.gov/blast/Blast.cgi",
|
|
||||||
"?",
|
|
||||||
"RID=", results$rid,
|
|
||||||
"&FORMAT_TYPE=", "XML",
|
|
||||||
"&EMAIL=", email,
|
|
||||||
"&CMD=Get",
|
|
||||||
sep = "")
|
|
||||||
raw <- GET(URL)
|
|
||||||
|
|
||||||
timeOut <- 300
|
|
||||||
nWait <- 0
|
|
||||||
while (raw$headers["content-type"] == "text/html" && nWait <= (timeOut/10)) {
|
|
||||||
cat("Doesn't seem to be done. Wait some more (or click STOP to abort)\n")
|
|
||||||
waitTimer(10)
|
|
||||||
nWait <- nWait + 1
|
|
||||||
raw <- GET(URL)
|
|
||||||
}
|
|
||||||
|
|
||||||
# If we get to here, we received some result. But what?
|
|
||||||
if (raw$headers["content-type"] == "text/html") { # Still HTML? Didn't complete ...
|
|
||||||
stop(sprintf("Query >>%s<< didn't complete.", results$rid))
|
|
||||||
} else if (raw$headers["content-type"] == "application/xml") { # Good!
|
|
||||||
response <- read_xml(raw)
|
|
||||||
} else { # Unknown, abort.
|
|
||||||
stop(sprintf("Unknown response type: >>%s<<.", raw$headers["content-type"]))
|
|
||||||
}
|
|
||||||
|
|
||||||
hits <- xml_find_all(response, ".//Hit")
|
|
||||||
|
|
||||||
if (length(hits) == 0) {
|
|
||||||
s <- "No hit returned.\n"
|
|
||||||
s <- c(s, sprintf("Check your query string:\n>>%s<<\n", results$query))
|
|
||||||
s <- c(s, sprintf("and/or try again later by typing:\n", results$rid))
|
|
||||||
s <- c(s, sprintf(" BLAST(rid = \"%s\")\n", results$rid))
|
|
||||||
stop(paste(s, collapse = ""))
|
|
||||||
}
|
|
||||||
|
|
||||||
results$hits <- list()
|
|
||||||
|
|
||||||
for (i in 1:length(hits)) {
|
|
||||||
results$hits[[i]] <- parseBLAST_XML(hits[i])
|
|
||||||
}
|
|
||||||
|
|
||||||
return(results)
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
# BLAST() returns a list of metadata plus $hits; count the hits themselves.
length(BLASThits$hits) # There should be at least one hit there. Ask for advice
|
||||||
# = 1 Tasks
|
# in case this step fails.
|
||||||
|
|
||||||
|
|
||||||
|
# = 4 Analysing results ===================================================
|
||||||
|
|
||||||
|
# The BLAST.R script has defined a convenience function to parse BLAST
|
||||||
|
# alignments.
|
||||||
|
|
||||||
|
(topHit <- parseBLASTalignment(BLASThits, idx = 1)) # Parse the top hit
|
||||||
|
|
||||||
|
# What is the refseq ID of the top hit?
|
||||||
|
topHit$accession
|
||||||
|
|
||||||
|
# If this is "NP_010227.1" you have confirmed the RBM of the MYSPE apses
|
||||||
|
# domain. If it is not, ask me for advice.
|
||||||
|
|
||||||
|
|
||||||
# ==== TESTS ===================================================================
|
|
||||||
|
|
||||||
# q <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain
|
|
||||||
# "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
|
|
||||||
# "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
|
|
||||||
# sep="")
|
|
||||||
# q <- "NP_010227"
|
|
||||||
# fungi <- "txid4751[ORGN]"
|
|
||||||
#
|
|
||||||
# test <- BLAST("NP_010227",
|
|
||||||
# nHits = 1000,
|
|
||||||
# E = 0.01,
|
|
||||||
# limits = fungi)
|
|
||||||
# length(test$hits)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
17
BIN-MYSPE.R
17
BIN-MYSPE.R
@ -3,12 +3,13 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-MYSPE unit
|
# R code accompanying the BIN-MYSPE unit
|
||||||
#
|
#
|
||||||
# Version: 1.0
|
# Version: 1.0.1
|
||||||
#
|
#
|
||||||
# Date: 2017 09 21
|
# Date: 2017 09 21
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# V 1.0 Final code, after rewriting BLAST parser and creating current MYSPElist
|
# V 1.0.1 Move ABC-makeMYSPElist.R to ./scripts directory
|
||||||
|
# V 1.0 Final code, after rewriting BLAST parser and updating MYSPElist
|
||||||
# V 0.1 First code copied from BCH441_A03_makeMYSPElist.R
|
# V 0.1 First code copied from BCH441_A03_makeMYSPElist.R
|
||||||
#
|
#
|
||||||
# TODO:
|
# TODO:
|
||||||
@ -28,9 +29,9 @@
|
|||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ---------------------------------------
|
#TOC> ---------------------------------------
|
||||||
#TOC> 1 Preparations 38
|
#TOC> 1 Preparations 39
|
||||||
#TOC> 2 Suitable MYSPE Species 50
|
#TOC> 2 Suitable MYSPE Species 51
|
||||||
#TOC> 3 Adopt "MYSPE" 64
|
#TOC> 3 Adopt "MYSPE" 65
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -56,10 +57,10 @@ if (! exists("myStudentNumber")) {
|
|||||||
|
|
||||||
# A detailed description of the process of compiling the list of genome
|
# A detailed description of the process of compiling the list of genome
|
||||||
# sequenced fungi with protein annotations and Mbp1 homologues is in the file
|
# sequenced fungi with protein annotations and Mbp1 homologues is in the file
|
||||||
# ABC-makeMYSPElist.R
|
# ./scripts/ABC-makeMYSPElist.R
|
||||||
|
|
||||||
# Task: Study ABC-makeMYSPElist.R, it implements a rather typical workflow of
|
# Task: Study ./scripts/ABC-makeMYSPElist.R, it implements a typical workflow
|
||||||
# selecting and combining data from various public-domain data resources.
|
# of selecting and combining data from public-domain data resources.
|
||||||
|
|
||||||
# = 3 Adopt "MYSPE" =======================================================
|
# = 3 Adopt "MYSPE" =======================================================
|
||||||
|
|
||||||
|
@ -71,8 +71,8 @@
|
|||||||
{"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "500", "end" : "570"},
|
{"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "500", "end" : "570"},
|
||||||
{"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "651", "end" : "678"},
|
{"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "651", "end" : "678"},
|
||||||
|
|
||||||
{"pName" : "MBP1_CRYNE", "fName" : "APSES fold", "start" : "113", "end" : "211"},
|
{"pName" : "MBP1_CRYNE", "fName" : "APSES fold", "start" : "16", "end" : "114"},
|
||||||
{"pName" : "MBP1_CRYNE", "fName" : "KilA-N", "start" : "131", "end" : "215"},
|
{"pName" : "MBP1_CRYNE", "fName" : "KilA-N", "start" : "34", "end" : "117"},
|
||||||
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "66", "end" : "85"},
|
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "66", "end" : "85"},
|
||||||
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "413", "end" : "423"},
|
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "413", "end" : "423"},
|
||||||
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "633", "end" : "644"},
|
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "633", "end" : "644"},
|
||||||
|
@ -3,11 +3,12 @@
|
|||||||
# Purpose: Create a list of genome sequenced fungi with protein annotations and
|
# Purpose: Create a list of genome sequenced fungi with protein annotations and
|
||||||
# Mbp1 homologues.
|
# Mbp1 homologues.
|
||||||
#
|
#
|
||||||
# Version: 1.1.1
|
# Version: 1.1.2
|
||||||
#
|
#
|
||||||
# Date: 2016 09 - 2017 09
|
# Date: 2016 09 - 2017 09
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
|
# V 1.1.2 Moved BLAST.R to ./scripts directory
|
||||||
# V 1.1 Update 2017
|
# V 1.1 Update 2017
|
||||||
# V 1.0 First code 2016
|
# V 1.0 First code 2016
|
||||||
#
|
#
|
||||||
@ -184,12 +185,12 @@ length(GOLDspecies)
|
|||||||
# amount of error handling involved that is not supported by the API in a
|
# amount of error handling involved that is not supported by the API in a
|
||||||
# principled way but requires rather ad hoc solutions. The code I threw together
|
# principled way but requires rather ad hoc solutions. The code I threw together
|
||||||
# to make a BLAST interface (demo-quality, not research-quality) is in the file
|
# to make a BLAST interface (demo-quality, not research-quality) is in the file
|
||||||
# BLAST.R Feel encouraged to study how this works. It's a pretty standard task
|
# ./scripts/BLAST.R Feel encouraged to study how this works. It's a pretty
|
||||||
# of communicating with servers and parsing responses - everyday fare in the
|
# standard task of communicating with servers and parsing responses - everyday
|
||||||
# bioinformatics lab. Surprisingly, there seems to be no good BLAST parser
|
# fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST
|
||||||
# in currently available packages.
|
# parser in currently available packages.
|
||||||
|
|
||||||
# source("BLAST.R") # load the function and its utilities
|
# source("./scripts/BLAST.R") # load the function and its utilities
|
||||||
# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
|
# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
|
||||||
# BLASThits <- BLAST("NP_010227", # Yeast Mbp1 RefSeq ID
|
# BLASThits <- BLAST("NP_010227", # Yeast Mbp1 RefSeq ID
|
||||||
# db = "refseq_protein", # database to search in
|
# db = "refseq_protein", # database to search in
|
@ -4,14 +4,16 @@
|
|||||||
# This script uses the BLAST URL-API
|
# This script uses the BLAST URL-API
|
||||||
# (Application Programming Interface) at the NCBI.
|
# (Application Programming Interface) at the NCBI.
|
||||||
# Read about the constraints here:
|
# Read about the constraints here:
|
||||||
# https://ncbi.github.io/blast-cloud/dev/api.html
|
# https://ncbi.github.io/blast-cloud/dev/api.html
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
# Version: 2.0
|
# Version: 2.1
|
||||||
# Date: 2016 09 - 2017 09
|
# Date: 2016 09 - 2017 10
|
||||||
# Author: Boris Steipe
|
# Author: Boris Steipe
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 2.1 bugfix in BLAST(), bug was blanking non-split deflines;
|
||||||
|
# refactored parseBLASTalignment() to handle lists with multiple hits.
|
||||||
# 2.0 Completely rewritten because the interface completely changed.
|
# 2.0 Completely rewritten because the interface completely changed.
|
||||||
# Code adapted in part from NCBI Perl sample code:
|
# Code adapted in part from NCBI Perl sample code:
|
||||||
# $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
|
# $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
|
||||||
@ -68,8 +70,9 @@ BLAST <- function(q,
|
|||||||
results$rid <- rid
|
results$rid <- rid
|
||||||
results$rtoe <- 0
|
results$rtoe <- 0
|
||||||
|
|
||||||
if (rid == "") { # we skip, and proceed directly to retrieval
|
if (rid == "") { # if rid is not the empty string we skip the
|
||||||
# if rid is not the empty string
|
# initial search and proceed directly to retrieval
|
||||||
|
|
||||||
|
|
||||||
# prepare query, GET(), and parse rid and rtoe from BLAST server response
|
# prepare query, GET(), and parse rid and rtoe from BLAST server response
|
||||||
results$query <- paste0("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
|
results$query <- paste0("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
|
||||||
@ -216,8 +219,10 @@ BLAST <- function(q,
|
|||||||
# Merge these lines to the preceding lines and delete them.
|
# Merge these lines to the preceding lines and delete them.
|
||||||
#
|
#
|
||||||
x <- which(grepl("]$", txt) & !(grepl("^>", txt)))
|
x <- which(grepl("]$", txt) & !(grepl("^>", txt)))
|
||||||
txt[x-1] <- paste0(txt[x-1], txt[x])
|
if (length(x) > 0) {
|
||||||
txt <- txt[-x]
|
txt[x-1] <- paste0(txt[x-1], txt[x])
|
||||||
|
txt <- txt[-x]
|
||||||
|
}
|
||||||
|
|
||||||
# Special case: there may be multiple deflines when the BLAST hit is to
|
# Special case: there may be multiple deflines when the BLAST hit is to
|
||||||
# redundant, identical sequences. Keep only the first instance.
|
# redundant, identical sequences. Keep only the first instance.
|
||||||
@ -253,18 +258,32 @@ BLAST <- function(q,
|
|||||||
return(results)
|
return(results)
|
||||||
}
|
}
|
||||||
|
|
||||||
parseBLASTalignment <- function(hit) {
|
parseBLASTalignment <- function(hits, idx) {
|
||||||
# parse one BLAST hit;
|
# Parse one BLAST hit from a BLAST result
|
||||||
# return a list
|
# Parameters:
|
||||||
|
# hits list contains the BLAST hits
|
||||||
if (length(grep("Length", hit)) > 1) {
|
# idx int index of the requested hit
|
||||||
stop("Parsing function can't handle multiple HSPs (yet).")
|
# Value:
|
||||||
}
|
# list $def chr defline
|
||||||
|
# $accession chr accession number
|
||||||
|
# $organism chr complete organism definition
|
||||||
|
# $species chr binomial species
|
||||||
|
# $E num E value
|
||||||
|
# $lengthAli num length of the alignment
|
||||||
|
# $nIdentities num number of identities
|
||||||
|
# $nGaps num number of gaps
|
||||||
|
# $Qbounds num 2-element vector of query start-end
|
||||||
|
# $Sbounds num 2-element vector of subject start-end
|
||||||
|
# $Qseq chr query sequence
|
||||||
|
# $midSeq chr midline string
|
||||||
|
# $Sseq chr subject sequence
|
||||||
|
|
||||||
h <- list()
|
h <- list()
|
||||||
|
|
||||||
|
hit <- hits$hits[[idx]]
|
||||||
|
|
||||||
# FASTA defline
|
# FASTA defline
|
||||||
h$def <- hit[1]
|
h$def <- hit$def
|
||||||
|
|
||||||
# accession number (ID), use the first if there are several, separated by "|"
|
# accession number (ID), use the first if there are several, separated by "|"
|
||||||
patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
|
patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
|
||||||
@ -276,70 +295,38 @@ parseBLASTalignment <- function(hit) {
|
|||||||
|
|
||||||
# species
|
# species
|
||||||
x <- unlist(strsplit(h$organism, "\\s+"))
|
x <- unlist(strsplit(h$organism, "\\s+"))
|
||||||
if (length(x) < 2) {
|
if (length(x) >= 2) {
|
||||||
h$species <- NA
|
h$species <- paste(x[1], x[2])
|
||||||
|
} else if (length(x) == 1) {
|
||||||
|
h$species <- paste(x[1], "sp.")
|
||||||
} else {
|
} else {
|
||||||
h$species <- paste(x[1:2], collapse = " ")
|
h$species <- NA
|
||||||
}
|
}
|
||||||
|
|
||||||
# E-value
|
# E-value
|
||||||
x <- hit[grep("Expect\\s*=", hit)]
|
h$E <- hit$E
|
||||||
patt <- "Expect\\s*=\\s*([0-9.eE\\-]+)" #
|
|
||||||
h$E <- as.numeric(regmatches(x, regexec(patt, x))[[1]][2])
|
|
||||||
|
|
||||||
# length of hit and # identities
|
# length of hit and # identities
|
||||||
x <- hit[grep("Identities\\s*=", hit)]
|
h$lengthAli <- hit$lengthAli
|
||||||
patt <- "Identities\\s*=\\s*([0-9]+)/([0-9]+)"
|
h$nIdentities <- hit$nIdentities
|
||||||
m <- regexec(patt, x)
|
|
||||||
h$lengthAli <- as.numeric(regmatches(x, m)[[1]][2])
|
|
||||||
h$nIdentities <- as.numeric(regmatches(x, m)[[1]][3])
|
|
||||||
|
|
||||||
# number of gaps
|
# number of gaps
|
||||||
x <- hit[grep("Gaps\\s*=", hit)]
|
h$nGaps <- hit$nGaps
|
||||||
patt <- "Gaps\\s*=\\s*([0-9]+)"
|
|
||||||
h$nGaps <- as.numeric(regmatches(x, regexec(patt, x))[[1]][2])
|
|
||||||
|
|
||||||
# first and last positions
|
# first and last positions
|
||||||
iAli <- grep("^Query\\s+", hit)
|
h$Qbounds <- hit$Qbounds
|
||||||
h$Qbounds <- getAliBounds(hit[iAli])
|
h$Sbounds <- hit$Sbounds
|
||||||
h$Sbounds <- getAliBounds(hit[iAli + 2])
|
|
||||||
|
|
||||||
# aligned sequences
|
# aligned sequences
|
||||||
|
|
||||||
h$Qseq <- character()
|
h$Qseq <- hit$Qseq
|
||||||
h$midSeq <- character()
|
h$midSeq <- hit$midSeq
|
||||||
h$Sseq <- character()
|
h$Sseq <- hit$Sseq
|
||||||
|
|
||||||
for (i in iAli) {
|
|
||||||
patt <- "^Query\\s+[0-9]+\\s*"
|
|
||||||
first <- attr(regexec(patt, hit[i])[[1]], "match.length") + 1
|
|
||||||
|
|
||||||
patt <- "\\s*[0-9]*\\s*$"
|
|
||||||
last <- regexec(patt, hit[i])[[1]][1] - 1
|
|
||||||
|
|
||||||
h$Qseq <- paste0(h$Qseq, substr(hit[i], first, last))
|
|
||||||
h$midSeq <- paste0(h$midSeq, substr(hit[i + 1], first, last))
|
|
||||||
h$Sseq <- paste0(h$Sseq, substr(hit[i + 2], first, last))
|
|
||||||
}
|
|
||||||
|
|
||||||
return(h)
|
return(h)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
getAliBounds <- function(s) {
|
|
||||||
# get first and last position from a vector of BLAST alignments s
|
|
||||||
# value: numeric vector of first and last position
|
|
||||||
patt <- "^(Query|Sbjct)\\s+([0-9]+)\\s"
|
|
||||||
first <- as.numeric(regmatches(s[1], regexec(patt, s[1]))[[1]][3])
|
|
||||||
|
|
||||||
patt <- "\\s*([0-9]+)\\s*$"
|
|
||||||
last <- as.numeric(regmatches(s[length(s)],
|
|
||||||
regexec(patt, s[length(s)]))[[1]][2])
|
|
||||||
return(c (first, last))
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# ==== TESTS ===================================================================
|
# ==== TESTS ===================================================================
|
||||||
|
|
||||||
# define query:
|
# define query:
|
Loading…
Reference in New Issue
Block a user