Fixed parsing and added missing logic

This commit is contained in:
hyginn 2017-11-16 11:47:53 -05:00
parent 206e2e14bb
commit a58e35f060
2 changed files with 61 additions and 36 deletions

View File

@ -3,12 +3,13 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-BLAST unit.
#
# Version: 1.0
# Version: 1.1
#
# Date: 2017 10 23
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.1 Fixed parsing logic.
# 1.0 First live version 2017.
# 0.1 First code copied from 2016 material.
#
@ -26,14 +27,14 @@
#TOC> ==========================================================================
#TOC>
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------
#TOC> 1 Preparations 41
#TOC> 2 Defining the APSES domain 54
#TOC> 3 Executing the BLAST search 76
#TOC> 4 Analysing results 98
#TOC>
#TOC>
#TOC> ==========================================================================
@ -83,23 +84,20 @@ source("./scripts/BLAST.R")
# Use BLAST() to find the best match to the MYSPE APSES domain in Saccharomyces
# cerevisiae:
BLASThits <- BLAST(apses, # MYSPE APSES domain sequence
db = "refseq_protein", # database to search in
nHits = 10, #
E = 0.01, #
limits = "txid559292[ORGN]") # S. cerevisiae S288c
BLASTresults <- BLAST(apses, # MYSPE APSES domain sequence
db = "refseq_protein", # database to search in
nHits = 10, #
E = 0.01, #
limits = "txid559292[ORGN]") # S. cerevisiae S288c
length(BLASThits) # There should be at least one hit there. Ask for advice
# in case this step fails.
length(BLASTresults$hits) # There should be at least one hit there. Ask for advice
# in case this step fails.
# = 4 Analysing results ===================================================
# The BLAST.R script has defined a convenience function to parse BLAST
# alignments.
(topHit <- parseBLASTalignment(BLASThits, idx = 1)) # Parse the top hit
(topHit <- BLASTresults$hits[[1]]) # Get the top hit
# What is the refseq ID of the top hit
topHit$accession

View File

@ -7,11 +7,12 @@
# https://ncbi.github.io/blast-cloud/dev/api.html
#
#
# Version: 2.1
# Date: 2016 09 - 2017 10
# Version: 3
# Date: 2016 09 - 2017 11
# Author: Boris Steipe
#
# Versions:
# 3 parsing logic had not been fully implemented; Fixed.
# 2.1 bugfix in BLAST(), bug was blanking non-split deflines;
# refactored parseBLASTalignment() to handle lists with multiple hits.
# 2.0 Completely rewritten because the interface completely changed.
@ -44,9 +45,6 @@ BLAST <- function(q,
myTimeout = 120) {
# Purpose:
# Basic BLAST search
# Version: 2.0
# Date: 2017-09
# Author: Boris Steipe
#
# Parameters:
# q: query - either a valid ID or a sequence
@ -258,11 +256,10 @@ BLAST <- function(q,
return(results)
}
parseBLASTalignment <- function(hits, idx) {
# Parse one BLAST hit from a BLAST result
parseBLASTalignment <- function(hit) {
# Parse data from a character vector containing a BLAST hit
# Parameters:
# hits list contains the BLAST hits
# idx int index of the requested hit
# hit char one BLAST hit as char vector
# Value:
# list $def chr defline
# $accession chr accession number
@ -278,12 +275,20 @@ parseBLASTalignment <- function(hits, idx) {
# $midSeq chr midline string
# $Sseq chr subject sequence
getToken <- function(patt, v) {
# get the first token identified by pattern patt in character vector v
v <- v[grep(patt, v)]
if (length(v) > 1) { v <- v[1] }
if (length(v) == 0) { token <- NA
} else {
token <- regmatches(v, regexec(patt, v))[[1]][2] }
return(token)
}
h <- list()
hit <- hits$hits[[idx]]
# FASTA defline
h$def <- hit$def
h$def <- hit[1]
# accesion number (ID), use the first if there are several, separated by "|"
patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
@ -304,24 +309,46 @@ parseBLASTalignment <- function(hits, idx) {
}
# E-value
h$E <- hit$E
h$E <- as.numeric(getToken("Expect\\s*=(.+?), Method", hit))
# length of hit and # identities
h$lengthAli <- hit$lengthAli
h$nIdentities <- hit$nIdentities
# length of alignment
h$lengthAli <- as.numeric(getToken("^\\s*Length\\s*=(.+)$", hit))
# number of identities
h$nIdentities <- as.numeric(getToken("^\\s*Identities\\s*=(.+?)/", hit))
# number of gaps
h$nGaps <- hit$nGaps
h$nGaps <- as.numeric(getToken("\\s*Gaps\\s*=(.+?)/", hit))
# split up alignment section
idx <- grep("^Query ", hit)
Que <- hit[idx]
Mid <- hit[idx + 1]
Sbj <- hit[idx + 2]
# first and last positions
h$Qbounds <- hit$Qbounds
h$Sbounds <- hit$Sbounds
h$Qbounds <- c(start = 0, end = 0)
h$Qbounds[1] <- as.numeric(getToken("^Query\\s*(\\d+)", Que[1]))
h$Qbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Que[length(Que)]))
h$Sbounds <- c(start = 0, end = 0)
h$Sbounds[1] <- as.numeric(getToken("^Sbjct\\s*(\\d+)", Sbj[1]))
h$Sbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Sbj[length(Sbj)]))
# aligned sequences
for (i in seq_along(Que)) {
patt <- ("^\\s*Query\\s*\\d+\\s*([A-Za-z-]+)") # capture aligned string
m <- regexec(patt, Que[i])
iFirst <- m[[1]][2]
iLast <- iFirst + attr(m[[1]], which = "match.length")[2] - 1
Que[i] <- substring(Que[i], iFirst, iLast)
Mid[i] <- substring(Mid[i], iFirst, iLast)
Sbj[i] <- substring(Sbj[i], iFirst, iLast)
}
h$Qseq <- hit$Qseq
h$midSeq <- hit$midSeq
h$Sseq <- hit$Sseq
h$Qseq <- paste0(Que, collapse = "")
h$midSeq <- paste0(Mid, collapse = "")
h$Sseq <- paste0(Sbj, collapse = "")
return(h)
}