Fixed parsing and added missing logic
This commit is contained in:
parent
206e2e14bb
commit
a58e35f060
@ -3,12 +3,13 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-ALI-BLAST unit.
|
# R code accompanying the BIN-ALI-BLAST unit.
|
||||||
#
|
#
|
||||||
# Version: 1.0
|
# Version: 1.1
|
||||||
#
|
#
|
||||||
# Date: 2017 10 23
|
# Date: 2017 10 23
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.1 Fixed parsing logic.
|
||||||
# 1.0 First live version 2017.
|
# 1.0 First live version 2017.
|
||||||
# 0.1 First code copied from 2016 material.
|
# 0.1 First code copied from 2016 material.
|
||||||
#
|
#
|
||||||
@ -26,14 +27,14 @@
|
|||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ---------------------------------------------
|
#TOC> ---------------------------------------------
|
||||||
#TOC> 1 Preparations 41
|
#TOC> 1 Preparations 41
|
||||||
#TOC> 2 Defining the APSES domain 54
|
#TOC> 2 Defining the APSES domain 54
|
||||||
#TOC> 3 Executing the BLAST search 76
|
#TOC> 3 Executing the BLAST search 76
|
||||||
#TOC> 4 Analysing results 98
|
#TOC> 4 Analysing results 98
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
@ -83,23 +84,20 @@ source("./scripts/BLAST.R")
|
|||||||
# Use BLAST() to find the best match to the MYSPE APSES domain in Saccharomyces
|
# Use BLAST() to find the best match to the MYSPE APSES domain in Saccharomyces
|
||||||
# cerevisiae:
|
# cerevisiae:
|
||||||
|
|
||||||
BLASThits <- BLAST(apses, # MYSPE APSES domain sequence
|
BLASTresults <- BLAST(apses, # MYSPE APSES domain sequence
|
||||||
db = "refseq_protein", # database to search in
|
db = "refseq_protein", # database to search in
|
||||||
nHits = 10, #
|
nHits = 10, #
|
||||||
E = 0.01, #
|
E = 0.01, #
|
||||||
limits = "txid559292[ORGN]") # S. cerevisiae S288c
|
limits = "txid559292[ORGN]") # S. cerevisiae S288c
|
||||||
|
|
||||||
|
|
||||||
length(BLASThits) # There should be at least one hit there. Ask for advice
|
length(BLASTresults$hits) # There should be at least one hit there. Ask for advice
|
||||||
# in case this step fails.
|
# in case this step fails.
|
||||||
|
|
||||||
|
|
||||||
# = 4 Analysing results ===================================================
|
# = 4 Analysing results ===================================================
|
||||||
|
|
||||||
# The BLAST.R script has defined a convenience function to parse BLAST
|
(topHit <- BLASTresults$hits[[1]]) # Get the top hit
|
||||||
# alignments.
|
|
||||||
|
|
||||||
(topHit <- parseBLASTalignment(BLASThits, idx = 1)) # Parse the top hit
|
|
||||||
|
|
||||||
# What is the refseq ID of the top hit
|
# What is the refseq ID of the top hit
|
||||||
topHit$accession
|
topHit$accession
|
||||||
|
@ -7,11 +7,12 @@
|
|||||||
# https://ncbi.github.io/blast-cloud/dev/api.html
|
# https://ncbi.github.io/blast-cloud/dev/api.html
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
# Version: 2.1
|
# Version: 3
|
||||||
# Date: 2016 09 - 2017 10
|
# Date: 2016 09 - 2017 11
|
||||||
# Author: Boris Steipe
|
# Author: Boris Steipe
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 3 parsing logic had not been fully implemented; Fixed.
|
||||||
# 2.1 bugfix in BLAST(), bug was blanking non-split deflines;
|
# 2.1 bugfix in BLAST(), bug was blanking non-split deflines;
|
||||||
# refactored parseBLASTalignment() to handle lists with multiple hits.
|
# refactored parseBLASTalignment() to handle lists with multiple hits.
|
||||||
# 2.0 Completely rewritten because the interface completely changed.
|
# 2.0 Completely rewritten because the interface completely changed.
|
||||||
@ -44,9 +45,6 @@ BLAST <- function(q,
|
|||||||
myTimeout = 120) {
|
myTimeout = 120) {
|
||||||
# Purpose:
|
# Purpose:
|
||||||
# Basic BLAST search
|
# Basic BLAST search
|
||||||
# Version: 2.0
|
|
||||||
# Date: 2017-09
|
|
||||||
# Author: Boris Steipe
|
|
||||||
#
|
#
|
||||||
# Parameters:
|
# Parameters:
|
||||||
# q: query - either a valid ID or a sequence
|
# q: query - either a valid ID or a sequence
|
||||||
@ -258,11 +256,10 @@ BLAST <- function(q,
|
|||||||
return(results)
|
return(results)
|
||||||
}
|
}
|
||||||
|
|
||||||
parseBLASTalignment <- function(hits, idx) {
|
parseBLASTalignment <- function(hit) {
|
||||||
# Parse one BLAST hit from a BLAST result
|
# Parse data from a character vector containing a BLAST hit
|
||||||
# Parameters:
|
# Parameters:
|
||||||
# hits list contains the BLAST hits
|
# hit char one BLAST hit as char vector
|
||||||
# idx int index of the requested hit
|
|
||||||
# Value:
|
# Value:
|
||||||
# list $def chr defline
|
# list $def chr defline
|
||||||
# $accession chr accession number
|
# $accession chr accession number
|
||||||
@ -278,12 +275,20 @@ parseBLASTalignment <- function(hits, idx) {
|
|||||||
# $midSeq chr midline string
|
# $midSeq chr midline string
|
||||||
# $Sseq chr subject sequence
|
# $Sseq chr subject sequence
|
||||||
|
|
||||||
|
getToken <- function(patt, v) {
|
||||||
|
# get the first token identified by pattern patt in character vector v
|
||||||
|
v <- v[grep(patt, v)]
|
||||||
|
if (length(v) > 1) { v <- v[1] }
|
||||||
|
if (length(v) == 0) { token <- NA
|
||||||
|
} else {
|
||||||
|
token <- regmatches(v, regexec(patt, v))[[1]][2] }
|
||||||
|
return(token)
|
||||||
|
}
|
||||||
|
|
||||||
h <- list()
|
h <- list()
|
||||||
|
|
||||||
hit <- hits$hits[[idx]]
|
|
||||||
|
|
||||||
# FASTA defline
|
# FASTA defline
|
||||||
h$def <- hit$def
|
h$def <- hit[1]
|
||||||
|
|
||||||
# accesion number (ID), use the first if there are several, separated by "|"
|
# accesion number (ID), use the first if there are several, separated by "|"
|
||||||
patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
|
patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
|
||||||
@ -304,24 +309,46 @@ parseBLASTalignment <- function(hits, idx) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
# E-value
|
# E-value
|
||||||
h$E <- hit$E
|
h$E <- as.numeric(getToken("Expect\\s*=(.+?), Method", hit))
|
||||||
|
|
||||||
# length of hit and # identities
|
# length of alignment
|
||||||
h$lengthAli <- hit$lengthAli
|
h$lengthAli <- as.numeric(getToken("^\\s*Length\\s*=(.+)$", hit))
|
||||||
h$nIdentities <- hit$nIdentities
|
|
||||||
|
# number of identities
|
||||||
|
h$nIdentities <- as.numeric(getToken("^\\s*Identities\\s*=(.+?)/", hit))
|
||||||
|
|
||||||
# number of gaps
|
# number of gaps
|
||||||
h$nGaps <- hit$nGaps
|
h$nGaps <- as.numeric(getToken("\\s*Gaps\\s*=(.+?)/", hit))
|
||||||
|
|
||||||
|
# split up alignment section
|
||||||
|
idx <- grep("^Query ", hit)
|
||||||
|
Que <- hit[idx]
|
||||||
|
Mid <- hit[idx + 1]
|
||||||
|
Sbj <- hit[idx + 2]
|
||||||
|
|
||||||
# first and last positions
|
# first and last positions
|
||||||
h$Qbounds <- hit$Qbounds
|
h$Qbounds <- c(start = 0, end = 0)
|
||||||
h$Sbounds <- hit$Sbounds
|
h$Qbounds[1] <- as.numeric(getToken("^Query\\s*(\\d+)", Que[1]))
|
||||||
|
h$Qbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Que[length(Que)]))
|
||||||
|
|
||||||
|
h$Sbounds <- c(start = 0, end = 0)
|
||||||
|
h$Sbounds[1] <- as.numeric(getToken("^Sbjct\\s*(\\d+)", Sbj[1]))
|
||||||
|
h$Sbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Sbj[length(Sbj)]))
|
||||||
|
|
||||||
# aligned sequences
|
# aligned sequences
|
||||||
|
for (i in seq_along(Que)) {
|
||||||
|
patt <- ("^\\s*Query\\s*\\d+\\s*([A-Za-z-]+)") # capture aligned string
|
||||||
|
m <- regexec(patt, Que[i])
|
||||||
|
iFirst <- m[[1]][2]
|
||||||
|
iLast <- iFirst + attr(m[[1]], which = "match.length")[2] - 1
|
||||||
|
Que[i] <- substring(Que[i], iFirst, iLast)
|
||||||
|
Mid[i] <- substring(Mid[i], iFirst, iLast)
|
||||||
|
Sbj[i] <- substring(Sbj[i], iFirst, iLast)
|
||||||
|
}
|
||||||
|
|
||||||
h$Qseq <- hit$Qseq
|
h$Qseq <- paste0(Que, collapse = "")
|
||||||
h$midSeq <- hit$midSeq
|
h$midSeq <- paste0(Mid, collapse = "")
|
||||||
h$Sseq <- hit$Sseq
|
h$Sseq <- paste0(Sbj, collapse = "")
|
||||||
|
|
||||||
return(h)
|
return(h)
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user