Updates for BIN-ALI-BLAST

This commit is contained in:
hyginn 2017-10-23 12:37:09 -04:00
parent 59ab6c573f
commit bc4afc97aa
5 changed files with 130 additions and 286 deletions

View File

@ -3,253 +3,108 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-BLAST unit.
#
# Version: 0.1
# Version: 1.0
#
# Date: 2017 08 28
# Date: 2017 10 23
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.0 First live version 2017.
# 0.1 First code copied from 2016 material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
# ==============================================================================
# = 1 ___Section___
# BLAST.R
#
# Purpose: Send off one BLAST search and return parsed list of results
# This script uses the BLAST URL-API
# (Application Programming Interface) at the NCBI.
# Read about the constraints here:
# http://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYP=DeveloperInfo
#
#
# Version: 1.0
# Date: 2016-09
# Author: Boris Steipe
#
#
# ToDo:
# Notes: The Bioconductor "annotate" package contains code for BLAST searches,
# in case you need to do something more involved.
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------
#TOC> 1 Packages 41
#TOC> 2 Defining the APSES domain 50
#TOC> 3 Executing the BLAST search 72
#TOC> 4 Analysing results 94
#TOC>
#TOC> ==========================================================================
# Dependencies: myEMail must exist as a global variable with
# your valid email address
# waitTimer() must be loaded (it should have been loaded from
# .utilities.R, which was sourced via .Rprofile)
# = 1 Packages ============================================================
# library to interface with WebServers and process their XML/HTML
# responses
if (!require(xml2, quietly = TRUE)) {
install.packages("xml2")
library(xml2)
}
if (!require(httr, quietly = TRUE)) {
install.packages("httr")
library(httr)
if (!require(Biostrings, quietly=TRUE)) {
source("https://bioconductor.org/biocLite.R")
biocLite("Biostrings")
library(Biostrings)
}
# = 2 Defining the APSES domain ===========================================
parseBLAST_XML <- function(hit) {
# Parse one BLAST hit XML node with the xml2 package and return the
# extracted values as a list.
#
# Parameters:
#   hit   XML node(set) for one BLAST hit; BLAST() passes hits[i], i.e. a
#         one-element subset of xml_find_all(response, ".//Hit")
# Value:
#   list  $id      text of the first Hit_accession node
#         $def     text of the first Hit_def node
#         $bestE   smallest Hsp_evalue over all HSPs (Inf if there are none)
#         $sumLen  sum of Hsp_align-len over all HSPs
#         $sumId   sum of Hsp_identity over all HSPs
#         $sumGap  sum of Hsp_gaps over all HSPs
#         $nHsps   number of Hsp nodes found
#         $Hsp     list with one entry per HSP: e, q_from, q_to, h_from,
#                  h_to, h_identity, h_gaps, h_len, qseq, mid, hseq
#
# NOTE(review): the next comment and the source() call look like lines of
# the course script that ended up inside this function in this listing -
# confirm that they belong here before relying on them.
# Load your protein database
source("makeProteinDB.R")
h <- list()
h$id <- xml_text(xml_find_first(hit, ".//Hit_accession"))
h$def <- xml_text(xml_find_first(hit, ".//Hit_def"))
# Accumulators; updated once per HSP in the loop below.
h$bestE <- Inf
h$sumLen <- 0
h$sumId <- 0
h$sumGap <- 0
hsps <- xml_find_all(hit, ".//Hsp")
h$Hsp <- list()
h$nHsps <- length(hsps)
# The guard keeps 1:length(hsps) from iterating over c(1, 0) when no HSP
# was found.
if (h$nHsps > 0) {
for (i in 1:length(hsps)) {
h$Hsp[[i]] <- list()
# Numeric fields are converted via the xml_numeric() utility.
h$Hsp[[i]]$e <- xml_numeric(hsps[i], ".//Hsp_evalue")
h$Hsp[[i]]$q_from <- xml_numeric(hsps[i], ".//Hsp_query-from")
h$Hsp[[i]]$q_to <- xml_numeric(hsps[i], ".//Hsp_query-to")
h$Hsp[[i]]$h_from <- xml_numeric(hsps[i], ".//Hsp_hit-from")
h$Hsp[[i]]$h_to <- xml_numeric(hsps[i], ".//Hsp_hit-to")
h$Hsp[[i]]$h_identity <- xml_numeric(hsps[i], ".//Hsp_identity")
h$Hsp[[i]]$h_gaps <- xml_numeric(hsps[i], ".//Hsp_gaps")
h$Hsp[[i]]$h_len <- xml_numeric(hsps[i], ".//Hsp_align-len")
# Aligned query sequence, midline and hit sequence are kept as text.
h$Hsp[[i]]$qseq <- xml_text(xml_find_first(hsps[i], ".//Hsp_qseq"))
h$Hsp[[i]]$mid <- xml_text(xml_find_first(hsps[i], ".//Hsp_midline"))
h$Hsp[[i]]$hseq <- xml_text(xml_find_first(hsps[i], ".//Hsp_hseq"))
# NOTE(review): the following two comment lines refer to the course script
# (APSES domain annotation), not to this parser - presumably interleaved
# from another part of the listing; confirm.
# Get the APSES domain sequence for MBP1_MYSPE feature annotation. (You have
# entered this data in the BIN-ALI-Optimal_sequence_alignment unit.)
# Update the per-hit summary values with this HSP.
h$bestE <- min(h$bestE, h$Hsp[[i]]$e)
h$sumLen <- h$sumLen + h$Hsp[[i]]$h_len
h$sumId <- h$sumId + h$Hsp[[i]]$h_identity
h$sumGap <- h$sumGap + h$Hsp[[i]]$h_gaps
}
}
return(h)
}
# Look up the APSES domain sequence of the MYSPE Mbp1 protein in myDB.
# Each assignment is wrapped in parentheses so that its value is also printed.
# Replace the placeholder in the protein name below with your own MYSPE code.
(proID <- myDB$protein$ID[myDB$protein$name == "MBP1_<MYSSPE>"]) # <<< EDIT
# ID of the feature named "APSES fold"
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
# ID of the annotation that links this protein to this feature
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
myDB$annotation$featureID == ftrID])
# start and end positions recorded in that annotation
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
(end <- myDB$annotation$end[myDB$annotation$ID == fanID])
# Extract the substring from start to end of the protein sequence.
# NOTE(review): assumes exactly one protein and one annotation match -
# verify the printed values before continuing.
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
start,
end))
xml_numeric <- function(n, p) {
  # Utility: find the first node below XML node n that matches the xpath
  # expression p, and return its text content converted to numeric.
  firstMatch <- xml_find_first(n, p)
  nodeText <- xml_text(firstMatch)
  as.numeric(nodeText)
}
# The MYSPE "apses" sequence is the sequence that we will use for our reverse
# BLAST search.
BLAST <- function(q,
                  db = "refseq_protein",
                  nHits = 30,
                  E = 3,
                  limits = "\"\"",
                  email = myEMail,
                  rid = "",
                  quietly = FALSE) {
  # Purpose:
  #     Basic BLAST search: submit a blastp query to the NCBI BLAST URL-API
  #     (unless a request ID is supplied), wait, retrieve the XML result
  #     and parse every hit with parseBLAST_XML().
  # Version:  1.0
  # Date:     2016-09
  # Author:   Boris Steipe
  #
  # Parameters:
  #     q: query - either a valid ID or a sequence
  #     db: "refseq_protein" by default,
  #         other legal values include: "nr", "pdb", "swissprot" ...
  #     nHits: number of hits to maximally return
  #     E: E-value cutoff. Do not return hits whose score would be expected
  #        to occur E or more times in a database of random sequence.
  #     limits: a valid ENTREZ filter
  #     email: a valid email address, defaults to global value myEMail
  #     rid: a BLAST request ID. If it is not the empty string, no new query
  #          is sent and the results for this rid are retrieved directly.
  #     quietly: controls printing of wait-time progress bar
  # Value:
  #     result: list of resulting hits and some metadata:
  #             $rid, $rtoe, $query (only if a query was sent), $hits
  # Errors:
  #     stop()s if the server response cannot be parsed, if the search does
  #     not complete within the polling time-out, if the response has an
  #     unknown content type, or if no hit was returned.

  # = 3 Executing the BLAST search ==========================================
  results <- list()
  results$rid <- rid
  results$rtoe <- 0

  # The ./scripts/BLAST.R code defines two functions to access the BLAST interface
  # through its Web API, and to parse results. Have a look at the script, then
  # source it:
  if (rid == "") {  # prepare, send and analyse query
    results$query <- paste0(
      "https://www.ncbi.nlm.nih.gov/blast/Blast.cgi",
      "?",
      "QUERY=", q,
      "&DATABASE=", db,
      "&HITLIST_SIZE=", as.character(nHits),
      "&EXPECT=", as.character(E),
      "&PROGRAM=", "blastp",
      "&ENTREZ_QUERY=", limits,
      "&NOHEADER=", "true",
      "&EMAIL=", email,
      "&CMD=Put")

    # NOTE(review): this source() call looks like a line of the course
    # script that ended up inside the function in this listing - confirm
    # that it belongs here.
    source("./scripts/BLAST.R")

    # send it off ...
    response <- read_xml(results$query, as_html = TRUE)

    # Use BLAST() to find the best match to the MYSPE APSES domain in Saccharomyces
    # cerevisiae:
    # find the comment node that contains the information we need
    # using an xpath expression
    info <- xml_find_first(response,
                           "//comment()[contains(., \"QBlastInfo\")]")
    info <- xml_text(info)  # extract its contents

    # parse request ID (RID) and estimated time of execution (RTOE)
    results$rid <- regmatches(info,
                              regexec("RID = (\\w+)", info))[[1]][2]
    results$rtoe <- regmatches(info,
                               regexec("RTOE = (\\d+)", info))[[1]][2]
    results$rtoe <- as.numeric(results$rtoe)

    # Fail with a clear message instead of a cryptic Sys.sleep(NA) error
    # if the server response did not contain the expected information.
    if (is.na(results$rid) || is.na(results$rtoe)) {
      stop("Could not parse RID and RTOE from the BLAST server response.")
    }
  }  # done analysing query

  # Now we wait ...
  if (quietly) {
    Sys.sleep(results$rtoe)
  } else {
    cat(sprintf("BLAST is processing %s:\n", results$rid))
    waitTimer(results$rtoe)
  }

  # retrieve results from BLAST server
  URL <- paste0("https://www.ncbi.nlm.nih.gov/blast/Blast.cgi",
                "?",
                "RID=", results$rid,
                "&FORMAT_TYPE=", "XML",
                "&EMAIL=", email,
                "&CMD=Get")
  raw <- GET(URL)

  # While the server answers with HTML the search is not finished: poll
  # again every 10 seconds until the time-out is used up.
  timeOut <- 300
  nWait <- 0
  while (raw$headers["content-type"] == "text/html" && nWait <= (timeOut/10)) {
    cat("Doesn't seem to be done. Wait some more (or click STOP to abort)\n")
    waitTimer(10)
    nWait <- nWait + 1
    raw <- GET(URL)
  }

  # If we get to here, we received some result. But what?
  if (raw$headers["content-type"] == "text/html") {  # Still HTML? Didn't complete ...
    stop(sprintf("Query >>%s<< didn't complete.", results$rid))
  } else if (raw$headers["content-type"] == "application/xml") {  # Good!
    response <- read_xml(raw)
  } else {  # Unknown, abort.
    stop(sprintf("Unknown response type: >>%s<<.", raw$headers["content-type"]))
  }

  hits <- xml_find_all(response, ".//Hit")

  if (length(hits) == 0) {
    s <- "No hit returned.\n"
    s <- c(s, sprintf("Check your query string:\n>>%s<<\n", results$query))
    # plain string: the format has no conversion, so no sprintf() argument
    s <- c(s, "and/or try again later by typing:\n")
    s <- c(s, sprintf(" BLAST(rid = \"%s\")\n", results$rid))
    stop(paste(s, collapse = ""))
  }

  # parse every hit node into a list
  results$hits <- list()
  for (i in seq_along(hits)) {
    results$hits[[i]] <- parseBLAST_XML(hits[i])
  }

  return(results)
}
BLASThits <- BLAST(apses,                       # MYSPE APSES domain sequence
                   db = "refseq_protein",       # database to search in
                   nHits = 10,                  # max. number of hits to return
                   E = 0.01,                    # E-value cutoff
                   limits = "txid559292[ORGN]") # S. cerevisiae S288c
# = 1 Tasks
# The hits are stored in the $hits element of the result list, so count
# those rather than the elements of the result list itself.
length(BLASThits$hits) # There should be at least one hit there. Ask for advice
                       # in case this step fails.
# = 4 Analysing results ===================================================
# The BLAST.R script has defined a convenience function to parse BLAST
# alignments.
(topHit <- parseBLASTalignment(BLASThits, idx = 1)) # Parse the top hit
# What is the refseq ID of the top hit
topHit$accession
# If this is "NP_010227.1" you have confirmed the RBM of the MYSPE apses
# domain. If it is not, ask me for advice.
# ==== TESTS ===================================================================
# q <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain
# "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
# "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
# sep="")
# q <- "NP_010227"
# fungi <- "txid4751[ORGN]"
#
# test <- BLAST("NP_010227",
# nHits = 1000,
# E = 0.01,
# limits = fungi)
# length(test$hits)

View File

@ -3,12 +3,13 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-MYSPE unit
#
# Version: 1.0
# Version: 1.0.1
#
# Date: 2017 09 21
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# V 1.0 Final code, after rewriting BLAST parser and creating current MYSPElist
# V 1.0.1 Move ABC-makeMYSPElist.R to ./scripts directory
# V 1.0 Final code, after rewriting BLAST parser and updating MYSPElist
# V 0.1 First code copied from BCH441_A03_makeMYSPElist.R
#
# TODO:
@ -28,9 +29,9 @@
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------
#TOC> 1 Preparations 38
#TOC> 2 Suitable MYSPE Species 50
#TOC> 3 Adopt "MYSPE" 64
#TOC> 1 Preparations 39
#TOC> 2 Suitable MYSPE Species 51
#TOC> 3 Adopt "MYSPE" 65
#TOC>
#TOC> ==========================================================================
@ -56,10 +57,10 @@ if (! exists("myStudentNumber")) {
# A detailed description of the process of compiling the list of genome
# sequenced fungi with protein annotations and Mbp1 homologues is in the file
# ABC-makeMYSPElist.R
# ./scripts/ABC-makeMYSPElist.R
# Task: Study ABC-makeMYSPElist.R, it implements a rather typical workflow of
# selecting and combining data from various public-domain data resources.
# Task: Study ./scripts/ABC-makeMYSPElist.R, it implements a typical workflow
# of selecting and combining data from public-domain data resources.
# = 3 Adopt "MYSPE" =======================================================

View File

@ -71,8 +71,8 @@
{"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "500", "end" : "570"},
{"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "651", "end" : "678"},
{"pName" : "MBP1_CRYNE", "fName" : "APSES fold", "start" : "113", "end" : "211"},
{"pName" : "MBP1_CRYNE", "fName" : "KilA-N", "start" : "131", "end" : "215"},
{"pName" : "MBP1_CRYNE", "fName" : "APSES fold", "start" : "16", "end" : "114"},
{"pName" : "MBP1_CRYNE", "fName" : "KilA-N", "start" : "34", "end" : "117"},
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "66", "end" : "85"},
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "413", "end" : "423"},
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "633", "end" : "644"},

View File

@ -3,11 +3,12 @@
# Purpose: Create a list of genome sequenced fungi with protein annotations and
# Mbp1 homologues.
#
# Version: 1.1.1
# Version: 1.1.2
#
# Date: 2016 09 - 2017 09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# V 1.1.2 Moved BLAST.R to ./scripts directory
# V 1.1 Update 2017
# V 1.0 First code 2016
#
@ -184,12 +185,12 @@ length(GOLDspecies)
# amount of error handling involved that is not supported by the API in a
# principled way but requires rather ad hoc solutions. The code I threw together
# to make a BLAST interface (demo-quality, not research-quality) is in the file
# BLAST.R Feel encouraged to study how this works. It's a pretty standard task
# of communicating with servers and parsing responses - everyday fare in the
# bioinformatics lab. Surprisingly, there seems to be no good BLAST parser
# in currently available packages.
# ./scripts/BLAST.R Feel encouraged to study how this works. It's a pretty
# standard task of communicating with servers and parsing responses - everyday
# fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST
# parser in currently available packages.
# source("BLAST.R") # load the function and its utilities
# source("./scripts/BLAST.R") # load the function and its utilities
# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
# BLASThits <- BLAST("NP_010227", # Yeast Mbp1 RefSeq ID
# db = "refseq_protein", # database to search in

View File

@ -4,14 +4,16 @@
# This script uses the BLAST URL-API
# (Application Programming Interface) at the NCBI.
# Read about the constraints here:
# https://ncbi.github.io/blast-cloud/dev/api.html
# https://ncbi.github.io/blast-cloud/dev/api.html
#
#
# Version: 2.0
# Date: 2016 09 - 2017 09
# Version: 2.1
# Date: 2016 09 - 2017 10
# Author: Boris Steipe
#
# Versions:
# 2.1 bugfix in BLAST(), bug was blanking non-split deflines;
# refactored parseBLASTalignment() to handle lists with multiple hits.
# 2.0 Completely rewritten because the interface completely changed.
# Code adapted in part from NCBI Perl sample code:
# $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
@ -68,8 +70,9 @@ BLAST <- function(q,
results$rid <- rid
results$rtoe <- 0
if (rid == "") { # we skip, and proceed directly to retrieval
# if rid is not the empty string
if (rid == "") { # if rid is not the empty string we skip the
# initial search and proceed directly to retrieval
# prepare query, GET(), and parse rid and rtoe from BLAST server response
results$query <- paste0("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
@ -216,8 +219,10 @@ BLAST <- function(q,
# Merge these lines to the preceding lines and delete them.
#
x <- which(grepl("]$", txt) & !(grepl("^>", txt)))
txt[x-1] <- paste0(txt[x-1], txt[x])
txt <- txt[-x]
if (length(x) > 0) {
txt[x-1] <- paste0(txt[x-1], txt[x])
txt <- txt[-x]
}
# Special case: there may be multiple deflines when the BLAST hit is to
# redundant, identical sequences. Keep only the first instance.
@ -253,18 +258,32 @@ BLAST <- function(q,
return(results)
}
parseBLASTalignment <- function(hit) {
# parse one BLAST hit;
# return a list
if (length(grep("Length", hit)) > 1) {
stop("Parsing function can't handle multiple HSPs (yet).")
}
parseBLASTalignment <- function(hits, idx) {
# Parse one BLAST hit from a BLAST result
# Parameters:
# hits list contains the BLAST hits
# idx int index of the requested hit
# Value:
# list $def chr defline
# $accession chr accession number
# $organism chr complete organism definition
# $species chr binomial species
# $E num E value
# $lengthAli num length of the alignment
# $nIdentities num number of identities
# $nGaps num number of gaps
# $Qbounds num 2-element vector of query start-end
# $Sbounds num 2-element vector of subject start-end
# $Qseq chr query sequence
# $midSeq chr midline string
# $Sseq chr subject sequence
h <- list()
hit <- hits$hits[[idx]]
# FASTA defline
h$def <- hit[1]
h$def <- hit$def
# accession number (ID), use the first if there are several, separated by "|"
patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
@ -276,70 +295,38 @@ parseBLASTalignment <- function(hit) {
# species
x <- unlist(strsplit(h$organism, "\\s+"))
if (length(x) < 2) {
h$species <- NA
if (length(x) >= 2) {
h$species <- paste(x[1], x[2])
} else if (length(x) == 1) {
h$species <- paste(x[1], "sp.")
} else {
h$species <- paste(x[1:2], collapse = " ")
h$species <- NA
}
# E-value
x <- hit[grep("Expect\\s*=", hit)]
patt <- "Expect\\s*=\\s*([0-9.eE\\-]+)" #
h$E <- as.numeric(regmatches(x, regexec(patt, x))[[1]][2])
h$E <- hit$E
# length of hit and # identities
x <- hit[grep("Identities\\s*=", hit)]
patt <- "Identities\\s*=\\s*([0-9]+)/([0-9]+)"
m <- regexec(patt, x)
h$lengthAli <- as.numeric(regmatches(x, m)[[1]][2])
h$nIdentities <- as.numeric(regmatches(x, m)[[1]][3])
h$lengthAli <- hit$lengthAli
h$nIdentities <- hit$nIdentities
# number of gaps
x <- hit[grep("Gaps\\s*=", hit)]
patt <- "Gaps\\s*=\\s*([0-9]+)"
h$nGaps <- as.numeric(regmatches(x, regexec(patt, x))[[1]][2])
h$nGaps <- hit$nGaps
# first and last positions
iAli <- grep("^Query\\s+", hit)
h$Qbounds <- getAliBounds(hit[iAli])
h$Sbounds <- getAliBounds(hit[iAli + 2])
h$Qbounds <- hit$Qbounds
h$Sbounds <- hit$Sbounds
# aligned sequences
h$Qseq <- character()
h$midSeq <- character()
h$Sseq <- character()
for (i in iAli) {
patt <- "^Query\\s+[0-9]+\\s*"
first <- attr(regexec(patt, hit[i])[[1]], "match.length") + 1
patt <- "\\s*[0-9]*\\s*$"
last <- regexec(patt, hit[i])[[1]][1] - 1
h$Qseq <- paste0(h$Qseq, substr(hit[i], first, last))
h$midSeq <- paste0(h$midSeq, substr(hit[i + 1], first, last))
h$Sseq <- paste0(h$Sseq, substr(hit[i + 2], first, last))
}
h$Qseq <- hit$Qseq
h$midSeq <- hit$midSeq
h$Sseq <- hit$Sseq
return(h)
}
getAliBounds <- function(s) {
  # Extract the alignment boundaries from a vector s of BLAST alignment
  # lines (rows that start with "Query" or "Sbjct").
  # Value: numeric vector c(first, last), where first is the number that
  #        follows the row label on the first line, and last is the number
  #        at the end of the last line.
  headLine <- s[1]
  tailLine <- s[length(s)]

  # capture groups: (1) row label, (2) start position
  headMatch <- regmatches(headLine,
                          regexec("^(Query|Sbjct)\\s+([0-9]+)\\s", headLine))
  # capture group: (1) trailing end position
  tailMatch <- regmatches(tailLine,
                          regexec("\\s*([0-9]+)\\s*$", tailLine))

  # element 1 of each match is the full match, capture groups follow
  c(as.numeric(headMatch[[1]][3]), as.numeric(tailMatch[[1]][2]))
}
# ==== TESTS ===================================================================
# define query: