Updates for BIN-ALI-BLAST
This commit is contained in:
parent
59ab6c573f
commit
bc4afc97aa
273
BIN-ALI-BLAST.R
273
BIN-ALI-BLAST.R
@ -3,253 +3,108 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-ALI-BLAST unit.
|
||||
#
|
||||
# Version: 0.1
|
||||
# Version: 1.0
|
||||
#
|
||||
# Date: 2017 08 28
|
||||
# Date: 2017 10 23
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.0 First live version 2017.
|
||||
# 0.1 First code copied from 2016 material.
|
||||
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
|
||||
# ==============================================================================
|
||||
|
||||
# = 1 ___Section___
|
||||
|
||||
# BLAST.R
|
||||
#
|
||||
# Purpose: Send off one BLAST search and return parsed list of results
|
||||
# This script uses the BLAST URL-API
|
||||
# (Application Programming Interface) at the NCBI.
|
||||
# Read about the constraints here:
|
||||
# http://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYP=DeveloperInfo
|
||||
#
|
||||
#
|
||||
# Version: 1.0
|
||||
# Date: 2016-09
|
||||
# Author: Boris Steipe
|
||||
#
|
||||
#
|
||||
# ToDo:
|
||||
# Notes: The Bioconductor "annotate" package contains code for BLAST searches,
|
||||
# in case you need to do something more involved.
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------------
|
||||
#TOC> 1 Packages 41
|
||||
#TOC> 2 Defining the APSES domain 50
|
||||
#TOC> 3 Executing the BLAST search 72
|
||||
#TOC> 4 Analysing results 94
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
|
||||
# Dependencies: myEmail must exist as a global variable with
|
||||
# your valid email address
|
||||
# waitTimer() must be loaded (it should have been loaded from
|
||||
# .utilities.R, which was sourced via .Rprofile)
|
||||
|
||||
# = 1 Packages ============================================================
|
||||
|
||||
# library to interface with WebServers and process their XML/HTML
|
||||
# responses
|
||||
if (!require(xml2, quietly = TRUE)) {
|
||||
install.packages("xml2")
|
||||
library(xml2)
|
||||
}
|
||||
|
||||
if (!require(httr, quietly = TRUE)) {
|
||||
install.packages("httr")
|
||||
library(httr)
|
||||
if (!require(Biostrings, quietly=TRUE)) {
|
||||
source("https://bioconductor.org/biocLite.R")
|
||||
biocLite("Biostrings")
|
||||
library(Biostrings)
|
||||
}
|
||||
|
||||
|
||||
# = 2 Defining the APSES domain ===========================================
|
||||
|
||||
parseBLAST_XML <- function(hit) {
|
||||
# parse one BLAST hit XML node with the xml2 package;
|
||||
# return a list
|
||||
# Load your protein database
|
||||
source("makeProteinDB.R")
|
||||
|
||||
h <- list()
|
||||
h$id <- xml_text(xml_find_first(hit, ".//Hit_accession"))
|
||||
h$def <- xml_text(xml_find_first(hit, ".//Hit_def"))
|
||||
h$bestE <- Inf
|
||||
h$sumLen <- 0
|
||||
h$sumId <- 0
|
||||
h$sumGap <- 0
|
||||
hsps <- xml_find_all(hit, ".//Hsp")
|
||||
h$Hsp <- list()
|
||||
h$nHsps <- length(hsps)
|
||||
if (h$nHsps > 0) {
|
||||
for (i in 1:length(hsps)) {
|
||||
h$Hsp[[i]] <- list()
|
||||
h$Hsp[[i]]$e <- xml_numeric(hsps[i], ".//Hsp_evalue")
|
||||
h$Hsp[[i]]$q_from <- xml_numeric(hsps[i], ".//Hsp_query-from")
|
||||
h$Hsp[[i]]$q_to <- xml_numeric(hsps[i], ".//Hsp_query-to")
|
||||
h$Hsp[[i]]$h_from <- xml_numeric(hsps[i], ".//Hsp_hit-from")
|
||||
h$Hsp[[i]]$h_to <- xml_numeric(hsps[i], ".//Hsp_hit-to")
|
||||
h$Hsp[[i]]$h_identity <- xml_numeric(hsps[i], ".//Hsp_identity")
|
||||
h$Hsp[[i]]$h_gaps <- xml_numeric(hsps[i], ".//Hsp_gaps")
|
||||
h$Hsp[[i]]$h_len <- xml_numeric(hsps[i], ".//Hsp_align-len")
|
||||
h$Hsp[[i]]$qseq <- xml_text(xml_find_first(hsps[i], ".//Hsp_qseq"))
|
||||
h$Hsp[[i]]$mid <- xml_text(xml_find_first(hsps[i], ".//Hsp_midline"))
|
||||
h$Hsp[[i]]$hseq <- xml_text(xml_find_first(hsps[i], ".//Hsp_hseq"))
|
||||
# Get the APSES domain sequence for MBP1_MYSPE feature annotation. (You have
|
||||
# entered this data in the BIN-ALI-Optimal_sequence_alignment unit.)
|
||||
|
||||
h$bestE <- min(h$bestE, h$Hsp[[i]]$e)
|
||||
h$sumLen <- h$sumLen + h$Hsp[[i]]$h_len
|
||||
h$sumId <- h$sumId + h$Hsp[[i]]$h_identity
|
||||
h$sumGap <- h$sumGap + h$Hsp[[i]]$h_gaps
|
||||
}
|
||||
}
|
||||
return(h)
|
||||
}
|
||||
(proID <- myDB$protein$ID[myDB$protein$name == "MBP1_<MYSSPE>"]) # <<< EDIT
|
||||
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
|
||||
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
|
||||
myDB$annotation$featureID == ftrID])
|
||||
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
|
||||
(end <- myDB$annotation$end[myDB$annotation$ID == fanID])
|
||||
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
|
||||
start,
|
||||
end))
|
||||
|
||||
xml_numeric <- function(n, p) {
  # Utility: look up the first node below XML node n that matches the
  # XPath expression p and return its text content as a numeric value.
  # Parameters:
  #    n   XML node (or nodeset) to search in
  #    p   chr  XPath expression
  # Value:
  #    num  text of the first matching node, converted with as.numeric()
  firstMatch <- xml_find_first(n, p)
  nodeText <- xml_text(firstMatch)
  return(as.numeric(nodeText))
}
|
||||
# The MYSPE "apses" sequence is the sequence that we will use for our reverse
|
||||
# BLAST search.
|
||||
|
||||
|
||||
BLAST <- function(q,
|
||||
db = "refseq_protein",
|
||||
nHits = 30,
|
||||
E = 3,
|
||||
limits = "\"\"",
|
||||
email = myEMail,
|
||||
rid = "",
|
||||
quietly = FALSE) {
|
||||
# Purpose:
|
||||
# Basic BLAST search
|
||||
# Version: 1.0
|
||||
# Date: 2016-09
|
||||
# Author: Boris Steipe
|
||||
#
|
||||
# Parameters:
|
||||
# q: query - either a valid ID or a sequence
|
||||
# db: "refseq_protein" by default,
|
||||
# other legal values include: "nr", "pdb", "swissprot" ...
|
||||
# nHits: number of hits to maximally return
|
||||
# E: E-value cutoff. Do not return hits whose score would be expected
|
||||
# to occur E or more times in a database of random sequence.
|
||||
# limits: a valid ENTREZ filter
|
||||
# email: a valid email address, defaults to global value myEMail
|
||||
# quietly: controls printing of wait-time progress bar
|
||||
# Value:
|
||||
# result: list of resulting hits and some metadata
|
||||
# = 3 Executing the BLAST search ==========================================
|
||||
|
||||
results <- list()
|
||||
results$rid <- rid
|
||||
results$rtoe <- 0
|
||||
# The ./scripts/BLAST.R code defines two functions to access the BLAST interface
|
||||
# through its Web API, and to parse results. Have a look at the script, then
|
||||
# source it:
|
||||
|
||||
if (rid == "") { # prepare, send and analyse query
|
||||
results$query <- paste(
|
||||
"https://www.ncbi.nlm.nih.gov/blast/Blast.cgi",
|
||||
"?",
|
||||
"QUERY=", q,
|
||||
"&DATABASE=", db,
|
||||
"&HITLIST_SIZE=", as.character(nHits),
|
||||
"&EXPECT=", as.character(E),
|
||||
"&PROGRAM=", "blastp",
|
||||
"&ENTREZ_QUERY=", limits,
|
||||
"&NOHEADER=", "true",
|
||||
"&EMAIL=", email,
|
||||
"&CMD=Put",
|
||||
sep = "")
|
||||
source("./scripts/BLAST.R")
|
||||
|
||||
# send it off ...
|
||||
response <- read_xml(results$query, as_html = TRUE)
|
||||
# Use BLAST() to find the best match to the MYSPE APSES domain in Saccharomyces
|
||||
# cerevisiae:
|
||||
|
||||
# find the comment node that contains the information we need
|
||||
# using an xpath expression
|
||||
info <- xml_find_first(response,
|
||||
"//comment()[contains(., \"QBlastInfo\")]")
|
||||
|
||||
info <- xml_text(info) # extract its contents
|
||||
|
||||
# parse
|
||||
results$rid <- regmatches(info,
|
||||
regexec("RID = (\\w+)", info))[[1]][2]
|
||||
results$rtoe <- regmatches(info,
|
||||
regexec("RTOE = (\\d+)", info))[[1]][2]
|
||||
results$rtoe <- as.numeric(results$rtoe)
|
||||
} # done analysing query
|
||||
|
||||
# Now we wait ...
|
||||
if (quietly) {
|
||||
Sys.sleep(results$rtoe)
|
||||
} else {
|
||||
cat(sprintf("BLAST is processing %s:\n", results$rid))
|
||||
waitTimer(results$rtoe)
|
||||
}
|
||||
|
||||
# retrieve results from BLAST server
|
||||
URL <- paste("https://www.ncbi.nlm.nih.gov/blast/Blast.cgi",
|
||||
"?",
|
||||
"RID=", results$rid,
|
||||
"&FORMAT_TYPE=", "XML",
|
||||
"&EMAIL=", email,
|
||||
"&CMD=Get",
|
||||
sep = "")
|
||||
raw <- GET(URL)
|
||||
|
||||
timeOut <- 300
|
||||
nWait <- 0
|
||||
while (raw$headers["content-type"] == "text/html" && nWait <= (timeOut/10)) {
|
||||
cat("Doesn't seem to be done. Wait some more (or click STOP to abort)\n")
|
||||
waitTimer(10)
|
||||
nWait <- nWait + 1
|
||||
raw <- GET(URL)
|
||||
}
|
||||
|
||||
# If we get to here, we received some result. But what?
|
||||
if (raw$headers["content-type"] == "text/html") { # Still HTML? Didn't complete ...
|
||||
stop(sprintf("Query >>%s<< didn't complete.", results$rid))
|
||||
} else if (raw$headers["content-type"] == "application/xml") { # Good!
|
||||
response <- read_xml(raw)
|
||||
} else { # Unknown, abort.
|
||||
stop(sprintf("Unknown response type: >>%s<<.", raw$headers["content-type"]))
|
||||
}
|
||||
|
||||
hits <- xml_find_all(response, ".//Hit")
|
||||
|
||||
if (length(hits) == 0) {
|
||||
s <- "No hit returned.\n"
|
||||
s <- c(s, sprintf("Check your query string:\n>>%s<<\n", results$query))
|
||||
s <- c(s, sprintf("and/or try again later by typing:\n", results$rid))
|
||||
s <- c(s, sprintf(" BLAST(rid = \"%s\")\n", results$rid))
|
||||
stop(paste(s, collapse = ""))
|
||||
}
|
||||
|
||||
results$hits <- list()
|
||||
|
||||
for (i in 1:length(hits)) {
|
||||
results$hits[[i]] <- parseBLAST_XML(hits[i])
|
||||
}
|
||||
|
||||
return(results)
|
||||
}
|
||||
BLASThits <- BLAST(apses, # MYSPE APSES domain sequence
|
||||
db = "refseq_protein", # database to search in
|
||||
nHits = 10, #
|
||||
E = 0.01, #
|
||||
limits = "txid559292[ORGN]") # S. cerevisiae S288c
|
||||
|
||||
|
||||
|
||||
# = 1 Tasks
|
||||
# BLAST() returns a list of search metadata plus the element $hits, which
# holds the actual hits. Count the hits themselves: the length of the whole
# result list is non-zero even when nothing was found.
length(BLASThits$hits)  # There should be at least one hit there. Ask for
                        # advice in case this step fails.
|
||||
|
||||
|
||||
# = 4 Analysing results ===================================================
|
||||
|
||||
# The BLAST.R script has defined a convenience function to parse BLAST
|
||||
# alignments.
|
||||
|
||||
(topHit <- parseBLASTalignment(BLASThits, idx = 1)) # Parse the top hit
|
||||
|
||||
# What is the refseq ID of the top hit?
|
||||
topHit$accession
|
||||
|
||||
# If this is "NP_010227.1" you have confirmed the RBM of the MYSPE apses
|
||||
# domain. If it is not, ask me for advice.
|
||||
|
||||
|
||||
# ==== TESTS ===================================================================
|
||||
|
||||
# q <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain
|
||||
# "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
|
||||
# "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
|
||||
# sep="")
|
||||
# q <- "NP_010227"
|
||||
# fungi <- "txid4751[ORGN]"
|
||||
#
|
||||
# test <- BLAST("NP_010227",
|
||||
# nHits = 1000,
|
||||
# E = 0.01,
|
||||
# limits = fungi)
|
||||
# length(test$hits)
|
||||
|
||||
|
||||
|
||||
|
17
BIN-MYSPE.R
17
BIN-MYSPE.R
@ -3,12 +3,13 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-MYSPE unit
|
||||
#
|
||||
# Version: 1.0
|
||||
# Version: 1.0.1
|
||||
#
|
||||
# Date: 2017 09 21
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# V 1.0 Final code, after rewriting BLAST parser and creating current MYSPElist
|
||||
# V 1.0.1 Move ABC-makeMYSPElist.R to ./scripts directory
|
||||
# V 1.0 Final code, after rewriting BLAST parser and updating MYSPElist
|
||||
# V 0.1 First code copied from BCH441_A03_makeMYSPElist.R
|
||||
#
|
||||
# TODO:
|
||||
@ -28,9 +29,9 @@
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------
|
||||
#TOC> 1 Preparations 38
|
||||
#TOC> 2 Suitable MYSPE Species 50
|
||||
#TOC> 3 Adopt "MYSPE" 64
|
||||
#TOC> 1 Preparations 39
|
||||
#TOC> 2 Suitable MYSPE Species 51
|
||||
#TOC> 3 Adopt "MYSPE" 65
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
@ -56,10 +57,10 @@ if (! exists("myStudentNumber")) {
|
||||
|
||||
# A detailed description of the process of compiling the list of genome
|
||||
# sequenced fungi with protein annotations and Mbp1 homologues is in the file
|
||||
# ABC-makeMYSPElist.R
|
||||
# ./scripts/ABC-makeMYSPElist.R
|
||||
|
||||
# Task: Study ABC-makeMYSPElist.R, it implements a rather typical workflow of
|
||||
# selecting and combining data from various public-domain data resources.
|
||||
# Task: Study ./scripts/ABC-makeMYSPElist.R, it implements a typical workflow
|
||||
# of selecting and combining data from public-domain data resources.
|
||||
|
||||
# = 3 Adopt "MYSPE" =======================================================
|
||||
|
||||
|
@ -71,8 +71,8 @@
|
||||
{"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "500", "end" : "570"},
|
||||
{"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "651", "end" : "678"},
|
||||
|
||||
{"pName" : "MBP1_CRYNE", "fName" : "APSES fold", "start" : "113", "end" : "211"},
|
||||
{"pName" : "MBP1_CRYNE", "fName" : "KilA-N", "start" : "131", "end" : "215"},
|
||||
{"pName" : "MBP1_CRYNE", "fName" : "APSES fold", "start" : "16", "end" : "114"},
|
||||
{"pName" : "MBP1_CRYNE", "fName" : "KilA-N", "start" : "34", "end" : "117"},
|
||||
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "66", "end" : "85"},
|
||||
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "413", "end" : "423"},
|
||||
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "633", "end" : "644"},
|
||||
|
@ -3,11 +3,12 @@
|
||||
# Purpose: Create a list of genome sequenced fungi with protein annotations and
|
||||
# Mbp1 homologues.
|
||||
#
|
||||
# Version: 1.1.1
|
||||
# Version: 1.1.2
|
||||
#
|
||||
# Date: 2016 09 - 2017 09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# V 1.1.2 Moved BLAST.R to ./scripts directory
|
||||
# V 1.1 Update 2017
|
||||
# V 1.0 First code 2016
|
||||
#
|
||||
@ -184,12 +185,12 @@ length(GOLDspecies)
|
||||
# amount of error handling involved that is not supported by the API in a
|
||||
# principled way but requires rather ad hoc solutions. The code I threw together
|
||||
# to make a BLAST interface (demo-quality, not research-quality) is in the file
|
||||
# BLAST.R Feel encouraged to study how this works. It's a pretty standard task
|
||||
# of communicating with servers and parsing responses - everyday fare in the
|
||||
# bioinformatics lab. Surprisingly, there seems to be no good BLAST parser
|
||||
# in currently available packages.
|
||||
# ./scripts/BLAST.R Feel encouraged to study how this works. It's a pretty
|
||||
# standard task of communicating with servers and parsing responses - everyday
|
||||
# fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST
|
||||
# parser in currently available packages.
|
||||
|
||||
# source("BLAST.R") # load the function and its utilities
|
||||
# source("./scripts/BLAST.R") # load the function and its utilities
|
||||
# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
|
||||
# BLASThits <- BLAST("NP_010227", # Yeast Mbp1 RefSeq ID
|
||||
# db = "refseq_protein", # database to search in
|
@ -4,14 +4,16 @@
|
||||
# This script uses the BLAST URL-API
|
||||
# (Application Programming Interface) at the NCBI.
|
||||
# Read about the constraints here:
|
||||
# https://ncbi.github.io/blast-cloud/dev/api.html
|
||||
# https://ncbi.github.io/blast-cloud/dev/api.html
|
||||
#
|
||||
#
|
||||
# Version: 2.0
|
||||
# Date: 2016 09 - 2017 09
|
||||
# Version: 2.1
|
||||
# Date: 2016 09 - 2017 10
|
||||
# Author: Boris Steipe
|
||||
#
|
||||
# Versions:
|
||||
# 2.1 bugfix in BLAST(), bug was blanking non-split deflines;
|
||||
# refactored parseBLASTalignment() to handle lists with multiple hits.
|
||||
# 2.0 Completely rewritten because the interface completely changed.
|
||||
# Code adapted in part from NCBI Perl sample code:
|
||||
# $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
|
||||
@ -68,8 +70,9 @@ BLAST <- function(q,
|
||||
results$rid <- rid
|
||||
results$rtoe <- 0
|
||||
|
||||
if (rid == "") { # we skip, and proceed directly to retrieval
|
||||
# if rid is not the empty string
|
||||
if (rid == "") { # if rid is not the empty string we skip the
|
||||
# initial search and proceed directly to retrieval
|
||||
|
||||
|
||||
# prepare query, GET(), and parse rid and rtoe from BLAST server response
|
||||
results$query <- paste0("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
|
||||
@ -216,8 +219,10 @@ BLAST <- function(q,
|
||||
# Merge these lines to the preceding lines and delete them.
|
||||
#
|
||||
x <- which(grepl("]$", txt) & !(grepl("^>", txt)))
|
||||
txt[x-1] <- paste0(txt[x-1], txt[x])
|
||||
txt <- txt[-x]
|
||||
if (length(x) > 0) {
|
||||
txt[x-1] <- paste0(txt[x-1], txt[x])
|
||||
txt <- txt[-x]
|
||||
}
|
||||
|
||||
# Special case: there may be multiple deflines when the BLAST hit is to
|
||||
# redundant, identical sequences. Keep only the first instance.
|
||||
@ -253,18 +258,32 @@ BLAST <- function(q,
|
||||
return(results)
|
||||
}
|
||||
|
||||
parseBLASTalignment <- function(hit) {
|
||||
# parse one BLAST hit;
|
||||
# return a list
|
||||
|
||||
if (length(grep("Length", hit)) > 1) {
|
||||
stop("Parsing function can't handle multiple HSPs (yet).")
|
||||
}
|
||||
parseBLASTalignment <- function(hits, idx) {
|
||||
# Parse one BLAST hit from a BLAST result
|
||||
# Parameters:
|
||||
# hits list contains the BLAST hits
|
||||
# idx int index of the requested hit
|
||||
# Value:
|
||||
# list $def chr defline
|
||||
# $accession chr accession number
|
||||
# $organism chr complete organism definition
|
||||
# $species chr binomial species
|
||||
# $E num E value
|
||||
# $lengthAli num length of the alignment
|
||||
# $nIdentities num number of identities
|
||||
# $nGaps num number of gaps
|
||||
# $Qbounds num 2-element vector of query start-end
|
||||
# $Sbounds num 2-element vector of subject start-end
|
||||
# $Qseq chr query sequence
|
||||
# $midSeq chr midline string
|
||||
# $Sseq chr subject sequence
|
||||
|
||||
h <- list()
|
||||
|
||||
hit <- hits$hits[[idx]]
|
||||
|
||||
# FASTA defline
|
||||
h$def <- hit[1]
|
||||
h$def <- hit$def
|
||||
|
||||
# accession number (ID), use the first if there are several, separated by "|"
|
||||
patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
|
||||
@ -276,70 +295,38 @@ parseBLASTalignment <- function(hit) {
|
||||
|
||||
# species
|
||||
x <- unlist(strsplit(h$organism, "\\s+"))
|
||||
if (length(x) < 2) {
|
||||
h$species <- NA
|
||||
if (length(x) >= 2) {
|
||||
h$species <- paste(x[1], x[2])
|
||||
} else if (length(x) == 1) {
|
||||
h$species <- paste(x[1], "sp.")
|
||||
} else {
|
||||
h$species <- paste(x[1:2], collapse = " ")
|
||||
h$species <- NA
|
||||
}
|
||||
|
||||
# E-value
|
||||
x <- hit[grep("Expect\\s*=", hit)]
|
||||
patt <- "Expect\\s*=\\s*([0-9.eE\\-]+)" #
|
||||
h$E <- as.numeric(regmatches(x, regexec(patt, x))[[1]][2])
|
||||
h$E <- hit$E
|
||||
|
||||
# length of hit and # identities
|
||||
x <- hit[grep("Identities\\s*=", hit)]
|
||||
patt <- "Identities\\s*=\\s*([0-9]+)/([0-9]+)"
|
||||
m <- regexec(patt, x)
|
||||
h$lengthAli <- as.numeric(regmatches(x, m)[[1]][2])
|
||||
h$nIdentities <- as.numeric(regmatches(x, m)[[1]][3])
|
||||
h$lengthAli <- hit$lengthAli
|
||||
h$nIdentities <- hit$nIdentities
|
||||
|
||||
# number of gaps
|
||||
x <- hit[grep("Gaps\\s*=", hit)]
|
||||
patt <- "Gaps\\s*=\\s*([0-9]+)"
|
||||
h$nGaps <- as.numeric(regmatches(x, regexec(patt, x))[[1]][2])
|
||||
h$nGaps <- hit$nGaps
|
||||
|
||||
# first and last positions
|
||||
iAli <- grep("^Query\\s+", hit)
|
||||
h$Qbounds <- getAliBounds(hit[iAli])
|
||||
h$Sbounds <- getAliBounds(hit[iAli + 2])
|
||||
h$Qbounds <- hit$Qbounds
|
||||
h$Sbounds <- hit$Sbounds
|
||||
|
||||
# aligned sequences
|
||||
|
||||
h$Qseq <- character()
|
||||
h$midSeq <- character()
|
||||
h$Sseq <- character()
|
||||
|
||||
for (i in iAli) {
|
||||
patt <- "^Query\\s+[0-9]+\\s*"
|
||||
first <- attr(regexec(patt, hit[i])[[1]], "match.length") + 1
|
||||
|
||||
patt <- "\\s*[0-9]*\\s*$"
|
||||
last <- regexec(patt, hit[i])[[1]][1] - 1
|
||||
|
||||
h$Qseq <- paste0(h$Qseq, substr(hit[i], first, last))
|
||||
h$midSeq <- paste0(h$midSeq, substr(hit[i + 1], first, last))
|
||||
h$Sseq <- paste0(h$Sseq, substr(hit[i + 2], first, last))
|
||||
}
|
||||
h$Qseq <- hit$Qseq
|
||||
h$midSeq <- hit$midSeq
|
||||
h$Sseq <- hit$Sseq
|
||||
|
||||
return(h)
|
||||
}
|
||||
|
||||
|
||||
getAliBounds <- function(s) {
  # Get the first and last sequence position from a vector of BLAST
  # alignment lines.
  # Parameters:
  #    s   chr  alignment lines that start with "Query" or "Sbjct", followed
  #             by a start position, the aligned residues, and an end position
  # Value:
  #    num  2-element vector: the start position parsed from the first
  #         element of s and the end position parsed from the last element.
  #         A position that can not be parsed is NA; an empty s yields
  #         c(NA, NA).

  # Guard: without any alignment lines there is nothing to index into.
  # (Subsetting the empty match list would fail with "subscript out of
  # bounds".)
  if (length(s) == 0) {
    return(c(NA_real_, NA_real_))
  }

  # Start position: the number that follows the "Query"/"Sbjct" label.
  # The pattern has two capture groups, so the position is element 3 of
  # the match vector (element 1 is the whole match).
  patt <- "^(Query|Sbjct)\\s+([0-9]+)\\s"
  first <- as.numeric(regmatches(s[1], regexec(patt, s[1]))[[1]][3])

  # End position: the trailing number on the last alignment line.
  patt <- "\\s*([0-9]+)\\s*$"
  sLast <- s[length(s)]
  last <- as.numeric(regmatches(sLast, regexec(patt, sLast))[[1]][2])

  return(c(first, last))
}
|
||||
|
||||
|
||||
|
||||
# ==== TESTS ===================================================================
|
||||
|
||||
# define query:
|
Loading…
Reference in New Issue
Block a user