add fetchMSAmotif() function

This commit is contained in:
hyginn 2017-11-01 09:43:32 -04:00
parent d3c42da51b
commit 7cc2853d00
2 changed files with 57 additions and 6 deletions

View File

@ -133,6 +133,50 @@ waitTimer <- function(t, nIntervals = 50) {
}
fetchMSAmotif <- function(ali, mot) {
# retrieve a subset from ali that spans the sequence in mot.
# Parameters:
# ali MsaAAMultipleAlignment object
# mot chr substring within ali
# Value: AAStringset
if (class(ali) != "MsaAAMultipleAlignment" &&
class(ali) != "MsaDNAMultipleAlignment" &&
class(ali) != "MsaRNAMultipleAlignment") {
stop("ali has to be an msa multiple alignment object.")
}
if (class(mot) != "character") {
stop("mot has to be a character object.")
}
x <- gsub("-", "", as.character(ali)) # pure sequence, no hyphens
idx <- grep(mot, x)[1] # first sequence containing mot. If no match,
# idx becomes NA
if (is.na(idx)) {
stop("mot is not a subsequence in ali.")
}
# Find the match range
m <- regexpr(mot, x[idx])
motifStart <- as.numeric(m)
motifEnd <- attr(m, "match.length") + motifStart - 1
# Count characters, skip hyphens ...
x <- unlist(strsplit(as.character(ali)[idx], ""))
x <- x != "-"
x <- as.numeric(x)
x <- cumsum(x)
return(subseq(ali@unmasked,
start = which(x == motifStart)[1], # get the first position
end = which(x == motifEnd)[1]))
}
# ====== PDB ID selection ======================================================
selectPDBrep <- function(n) {

View File

@ -3,12 +3,13 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-MSA unit.
#
# Version: 1.0
# Version: 1.1
#
# Date: 2017 10 23
# Date: 2017 10
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.1 Added fetchMSAmotif()
# 1.0 Fully refactored and rewritten for 2017
# 0.1 First code copied from 2016 material.
#
@ -26,7 +27,7 @@
#TOC> ==========================================================================
#TOC>
#TOC>
#TOC> Section Title Line
#TOC> ------------------------------------------------------------
#TOC> 1 Preparations 51
@ -44,7 +45,7 @@
#TOC> 6 Sequence Logos 539
#TOC> 6.1 Subsetting an alignment by motif 548
#TOC> 6.2 Plot a Sequence Logo 591
#TOC>
#TOC>
#TOC> ==========================================================================
@ -53,7 +54,7 @@
# You need to reload you protein database, including changes that might
# have been made to the reference files. If you have worked with the
# prerequiste units, you should have a script named "makeProteinDB.R"
# that will create the myDB object with aprotein and feature database.
# that will create the myDB object with a protein and feature database.
# Ask for advice if not.
source("makeProteinDB.R")
@ -242,7 +243,7 @@ for (i in seq_along(highScoringRanges$lengths)) {
# - adjust the sequence names
# - convert to msaAAMultipleAlignment object
# === 4.1.1 importing an .aln file
# === 4.1.1 importing an .aln file
# The seqinr package has a function to read CLUSTAL W formatted .aln files ...
if (! require(seqinr, quietly=TRUE)) {
@ -587,6 +588,12 @@ x <- cumsum(x)
(motifAli <- subseq(msaM@unmasked, start = aliStart, end = aliEnd))
# Packaging this into a function is convenient to have, therefore I have added
# such a function to the .utilities.R script: fetchMSAmotif(). Try it:
wing <- "HEKVQGGFGKYQGTWV" # the MBP1_SACCE APSES "wing" sequence
writeALN(fetchMSAmotif(msaM, wing))
# == 6.2 Plot a Sequence Logo ==============================================