add fetchMSAmotif() function

This commit is contained in:
hyginn 2017-11-01 09:43:32 -04:00
parent d3c42da51b
commit 7cc2853d00
2 changed files with 57 additions and 6 deletions

View File

@ -133,6 +133,50 @@ waitTimer <- function(t, nIntervals = 50) {
} }
fetchMSAmotif <- function(ali, mot) {
# retrieve a subset from ali that spans the sequence in mot.
# Parameters:
# ali MsaAAMultipleAlignment object
# mot chr substring within ali
# Value: AAStringset
if (class(ali) != "MsaAAMultipleAlignment" &&
class(ali) != "MsaDNAMultipleAlignment" &&
class(ali) != "MsaRNAMultipleAlignment") {
stop("ali has to be an msa multiple alignment object.")
}
if (class(mot) != "character") {
stop("mot has to be a character object.")
}
x <- gsub("-", "", as.character(ali)) # pure sequence, no hyphens
idx <- grep(mot, x)[1] # first sequence containing mot. If no match,
# idx becomes NA
if (is.na(idx)) {
stop("mot is not a subsequence in ali.")
}
# Find the match range
m <- regexpr(mot, x[idx])
motifStart <- as.numeric(m)
motifEnd <- attr(m, "match.length") + motifStart - 1
# Count characters, skip hyphens ...
x <- unlist(strsplit(as.character(ali)[idx], ""))
x <- x != "-"
x <- as.numeric(x)
x <- cumsum(x)
return(subseq(ali@unmasked,
start = which(x == motifStart)[1], # get the first position
end = which(x == motifEnd)[1]))
}
# ====== PDB ID selection ====================================================== # ====== PDB ID selection ======================================================
selectPDBrep <- function(n) { selectPDBrep <- function(n) {

View File

@ -3,12 +3,13 @@
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-MSA unit. # R code accompanying the BIN-ALI-MSA unit.
# #
# Version: 1.0 # Version: 1.1
# #
# Date: 2017 10 23 # Date: 2017 10
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.1 Added fetchMSAmotif()
# 1.0 Fully refactored and rewritten for 2017 # 1.0 Fully refactored and rewritten for 2017
# 0.1 First code copied from 2016 material. # 0.1 First code copied from 2016 material.
# #
@ -26,7 +27,7 @@
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> ------------------------------------------------------------ #TOC> ------------------------------------------------------------
#TOC> 1 Preparations 51 #TOC> 1 Preparations 51
@ -44,7 +45,7 @@
#TOC> 6 Sequence Logos 539 #TOC> 6 Sequence Logos 539
#TOC> 6.1 Subsetting an alignment by motif 548 #TOC> 6.1 Subsetting an alignment by motif 548
#TOC> 6.2 Plot a Sequence Logo 591 #TOC> 6.2 Plot a Sequence Logo 591
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
@ -53,7 +54,7 @@
# You need to reload you protein database, including changes that might # You need to reload you protein database, including changes that might
# have been made to the reference files. If you have worked with the # have been made to the reference files. If you have worked with the
# prerequiste units, you should have a script named "makeProteinDB.R" # prerequiste units, you should have a script named "makeProteinDB.R"
# that will create the myDB object with aprotein and feature database. # that will create the myDB object with a protein and feature database.
# Ask for advice if not. # Ask for advice if not.
source("makeProteinDB.R") source("makeProteinDB.R")
@ -242,7 +243,7 @@ for (i in seq_along(highScoringRanges$lengths)) {
# - adjust the sequence names # - adjust the sequence names
# - convert to msaAAMultipleAlignment object # - convert to msaAAMultipleAlignment object
# === 4.1.1 importing an .aln file # === 4.1.1 importing an .aln file
# The seqinr package has a function to read CLUSTAL W formatted .aln files ... # The seqinr package has a function to read CLUSTAL W formatted .aln files ...
if (! require(seqinr, quietly=TRUE)) { if (! require(seqinr, quietly=TRUE)) {
@ -587,6 +588,12 @@ x <- cumsum(x)
(motifAli <- subseq(msaM@unmasked, start = aliStart, end = aliEnd)) (motifAli <- subseq(msaM@unmasked, start = aliStart, end = aliEnd))
# Packaging this into a function is convenient to have, therefore I have added
# such a function to the .utilities.R script: fetchMSAmotif(). Try it:
wing <- "HEKVQGGFGKYQGTWV" # the MBP1_SACCE APSES "wing" sequence
writeALN(fetchMSAmotif(msaM, wing))
# == 6.2 Plot a Sequence Logo ============================================== # == 6.2 Plot a Sequence Logo ==============================================