MSA unit, and supporting files
This commit is contained in:
parent
044990d1c9
commit
553b042f20
@ -2,10 +2,11 @@
|
|||||||
#
|
#
|
||||||
# Miscellaneous R code to suppport the project
|
# Miscellaneous R code to suppport the project
|
||||||
#
|
#
|
||||||
# Version: 1.2
|
# Version: 1.3
|
||||||
# Date: 2017 09
|
# Date: 2017 09 - 2017 10
|
||||||
# Author: Boris Steipe
|
# Author: Boris Steipe
|
||||||
#
|
#
|
||||||
|
# V 1.3 load msa support functions
|
||||||
# V 1.2 update database utilities to support 2017 version of JSON sources
|
# V 1.2 update database utilities to support 2017 version of JSON sources
|
||||||
# V 1.1 2017 updates for ABC-units
|
# V 1.1 2017 updates for ABC-units
|
||||||
# V 1.0 First code
|
# V 1.0 First code
|
||||||
@ -18,6 +19,8 @@
|
|||||||
# ====== SCRIPTS =============================================================
|
# ====== SCRIPTS =============================================================
|
||||||
|
|
||||||
source("./scripts/ABC-dbUtilities.R")
|
source("./scripts/ABC-dbUtilities.R")
|
||||||
|
source("./scripts/ABC-writeALN.R")
|
||||||
|
source("./scripts/ABC-writeMFA.R")
|
||||||
|
|
||||||
|
|
||||||
# ====== SUPPORT FUNCTIONS =====================================================
|
# ====== SUPPORT FUNCTIONS =====================================================
|
||||||
|
675
BIN-ALI-MSA.R
675
BIN-ALI-MSA.R
@ -3,94 +3,148 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-ALI-MSA unit.
|
# R code accompanying the BIN-ALI-MSA unit.
|
||||||
#
|
#
|
||||||
# Version: 0.1
|
# Version: 1.0
|
||||||
#
|
#
|
||||||
# Date: 2017 08 28
|
# Date: 2017 10 23
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.0 Fully refactored and rewritten for 2017
|
||||||
# 0.1 First code copied from 2016 material.
|
# 0.1 First code copied from 2016 material.
|
||||||
|
#
|
||||||
#
|
#
|
||||||
# TODO:
|
# TODO:
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||||
|
#
|
||||||
# If there are portions you don't understand, use R's help system, Google for an
|
# If there are portions you don't understand, use R's help system, Google for an
|
||||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||||
# going on. That's not how it works ...
|
# going on. That's not how it works ...
|
||||||
|
#
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
|
|
||||||
# = 1 Multiple sequence alignment
|
|
||||||
|
|
||||||
# We will compute a multiple sequence alignment using the "muscle" algorithm
|
#TOC> ==========================================================================
|
||||||
# which is available throught the Bioconductor msa package.
|
#TOC>
|
||||||
|
#TOC> Section Title Line
|
||||||
|
#TOC> ------------------------------------------------------------
|
||||||
|
#TOC> 1 Preparations 51
|
||||||
|
#TOC> 2 Aligning full length MBP1 proteins 99
|
||||||
|
#TOC> 2.1 Preparing Sequences 110
|
||||||
|
#TOC> 2.2 Compute the MSA 135
|
||||||
|
#TOC> 3 Analyzing an MSA 156
|
||||||
|
#TOC> 4 Comparing MSAs 227
|
||||||
|
#TOC> 4.1 Importing an alignment to msa 236
|
||||||
|
#TOC> 4.1.1 importing an .aln file 245
|
||||||
|
#TOC> 4.1.2 Creating an MsaAAMultipleAlignment object 276
|
||||||
|
#TOC> 4.2 More alignments 313
|
||||||
|
#TOC> 4.3 Computing comparison metrics 325
|
||||||
|
#TOC> 5 Profile-Profile alignments 462
|
||||||
|
#TOC> 6 Sequence Logos 539
|
||||||
|
#TOC> 6.1 Subsetting an alignment by motif 548
|
||||||
|
#TOC> 6.2 Plot a Sequence Logo 591
|
||||||
|
#TOC>
|
||||||
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
if (!require(Biostrings, quietly=TRUE)) {
|
|
||||||
source("https://bioconductor.org/biocLite.R")
|
# = 1 Preparations ========================================================
|
||||||
|
|
||||||
|
# You need to reload you protein database, including changes that might
|
||||||
|
# have been made to the reference files. If you have worked with the
|
||||||
|
# prerequiste units, you should have a script named "makeProteinDB.R"
|
||||||
|
# that will create the myDB object with aprotein and feature database.
|
||||||
|
# Ask for advice if not.
|
||||||
|
source("makeProteinDB.R")
|
||||||
|
|
||||||
|
|
||||||
|
# Multiple sequence alignment algorithms are provided in
|
||||||
|
# the Bioconductor msa package.
|
||||||
|
|
||||||
|
if (! require(Biostrings, quietly=TRUE)) {
|
||||||
|
if (! exists("biocLite")) {
|
||||||
|
source("https://bioconductor.org/biocLite.R")
|
||||||
|
}
|
||||||
biocLite("Biostrings")
|
biocLite("Biostrings")
|
||||||
|
library(Biostrings)
|
||||||
}
|
}
|
||||||
data(BLOSUM62)
|
# Package information:
|
||||||
|
# library(help = Biostrings) # basic information
|
||||||
|
# browseVignettes("Biostrings") # available vignettes
|
||||||
|
# data(package = "Biostrings") # available datasets
|
||||||
|
|
||||||
if (!require(msa, quietly=TRUE)) {
|
|
||||||
source("https://bioconductor.org/biocLite.R")
|
if (! require(msa, quietly=TRUE)) {
|
||||||
|
if (! exists("biocLite")) {
|
||||||
|
source("https://bioconductor.org/biocLite.R")
|
||||||
|
}
|
||||||
biocLite("msa")
|
biocLite("msa")
|
||||||
library(msa)
|
library(msa)
|
||||||
}
|
}
|
||||||
|
# Package information:
|
||||||
|
# library(help=msa) # basic information
|
||||||
|
# browseVignettes("msa") # available vignettes
|
||||||
|
# data(package = "msa") # available datasets
|
||||||
|
|
||||||
# If the installation asks you if you want to update older packages, always
|
|
||||||
# answer "a" for "all" unless you have an important reason not to. But if the
|
# If an installation asks you if you want to update older packages, I recommend
|
||||||
# installer asks you whether you want to compile from source, answer"n" for "no"
|
# to always answer "a" for "all" unless you have an important reason not to. But
|
||||||
# unless you need new functionality in a particular bleeding-edge version of a
|
# if the installer asks you whether you want to compile from source, answer "n"
|
||||||
# package.
|
# for "no" unless you need new functionality in a particular bleeding-edge
|
||||||
|
# version of a package.
|
||||||
|
|
||||||
help(package = "msa")
|
help(package = "msa")
|
||||||
|
|
||||||
# We have used biostrings' AAString() function before; for multiple
|
|
||||||
# alignments we need an AAStringSet(). We can simply feed it
|
|
||||||
# a vector of sequences:
|
|
||||||
|
|
||||||
# Let's make a shorthand for selection of Mbp1 proteins from our database: a
|
# = 2 Aligning full length MBP1 proteins ==================================
|
||||||
# vector of indices for all of the rows in the protein table that contain
|
|
||||||
# proteins whose name begins with MBP1.
|
|
||||||
iMbp1Proteins <- grep("^MBP1_", myDB$protein$name)
|
|
||||||
|
|
||||||
# Next we create an AAStringSet for all of those proteins
|
# In the Wiki part of this unit you have
|
||||||
seqSet <- AAStringSet(myDB$protein$sequence[iMbp1Proteins])
|
# - aligned full length MBP1 protein sequences at the EBI using T-Coffee
|
||||||
|
# - saved the resulting alignment in CLUSTAL format
|
||||||
|
# to the file "MBP1orthologuesTC.aln"
|
||||||
|
|
||||||
# ... and align (which is very simple) ...
|
# In this section we will calculate an MSA of the same sequences using
|
||||||
msaMuscle(
|
# algorithms in the msa packages, and we will compare the two alignments.
|
||||||
seqSet,
|
|
||||||
order = "aligned")
|
|
||||||
|
|
||||||
# ... but to help us make sense of the alignment we need to add the names for
|
|
||||||
# the sequences. names for a seqSet object are held in the ranges slot...
|
|
||||||
|
|
||||||
seqSet@ranges@NAMES <- myDB$protein$name[iMbp1Proteins]
|
|
||||||
|
|
||||||
seqSet
|
|
||||||
|
|
||||||
# This little step of adding names is actually really
|
|
||||||
# very important. That's because the aligned sequences
|
|
||||||
# are meaningless strings of characters unless we can
|
|
||||||
# easily identify their biological relationships.
|
|
||||||
# Creating MSAs that are only identified by e.g. their
|
|
||||||
# RefSeq ids is a type of cargo-cult bioinformatics
|
|
||||||
# that we encounter a lot. The point of the alignment
|
|
||||||
# is not to create it, but to interpret it!
|
|
||||||
|
|
||||||
|
|
||||||
# Let's align again, and assign the result ...
|
# == 2.1 Preparing Sequences ===============================================
|
||||||
msa1 <- msaMuscle(
|
|
||||||
seqSet,
|
|
||||||
order = "aligned")
|
|
||||||
|
|
||||||
msa1
|
# We have used the biostrings' AAString() function before; for multiple
|
||||||
|
# alignments we need an AAStringSet(). AAStringSets are produced from vectors
|
||||||
|
# of sequence.
|
||||||
|
|
||||||
|
sel <- grep("MBP1", myDB$protein$name)
|
||||||
|
MBP1set <- AAStringSet(myDB$protein$sequence[sel])
|
||||||
|
|
||||||
|
# To help us make sense of the alignment we need to add the names for
|
||||||
|
# the sequences. Names for a seqSet object are held in the ranges slot...
|
||||||
|
|
||||||
|
MBP1set@ranges@NAMES <- myDB$protein$name[sel]
|
||||||
|
|
||||||
|
MBP1set
|
||||||
|
|
||||||
|
# You should have eleven sequences in this set, ask for advice if you don't.
|
||||||
|
|
||||||
|
# The little step of adding names is actually really very important. That's
|
||||||
|
# because the aligned sequences are meaningless strings of characters unless we
|
||||||
|
# can easily identify their biological relationships. Creating MSAs that are
|
||||||
|
# only identified by e.g. their RefSeq ids is a type of cargo-cult
|
||||||
|
# bioinformatics that we encounter a lot. The point of the alignment is not to
|
||||||
|
# create it, but to interpret it!
|
||||||
|
|
||||||
|
# == 2.2 Compute the MSA ===================================================
|
||||||
|
|
||||||
|
# The alignment itself is very simple. msa has msaMuscle() and msaClustalOmega()
|
||||||
|
# to produce alignments. (It also has msaClustalW() which is kind of
|
||||||
|
# embarrassing since that hasn't been the algorithm of first choice for
|
||||||
|
# decades. Don't use that one for real work.)
|
||||||
|
|
||||||
|
|
||||||
|
# Let's run an alignment with "Muscle"
|
||||||
|
(msaM <- msaMuscle( MBP1set, order = "aligned"))
|
||||||
|
|
||||||
# ... or to see the whole thing (cf. ?MsaAAMultipleAlignment ... print method):
|
# ... or to see the whole thing (cf. ?MsaAAMultipleAlignment ... print method):
|
||||||
print(msa1, show=c("alignment", "complete"), showConsensus=FALSE)
|
print(msaM, show=c("alignment", "complete"), showConsensus=FALSE)
|
||||||
|
|
||||||
|
|
||||||
# You see that the alignment object has sequence strings with hyphens as
|
# You see that the alignment object has sequence strings with hyphens as
|
||||||
@ -98,93 +152,460 @@ print(msa1, show=c("alignment", "complete"), showConsensus=FALSE)
|
|||||||
# the order has not been preserved, but the most similar sequences are now
|
# the order has not been preserved, but the most similar sequences are now
|
||||||
# adjacent to each other.
|
# adjacent to each other.
|
||||||
|
|
||||||
# Lets write the alignment to one of the common file formats: a multi-fasta
|
|
||||||
# file.
|
|
||||||
|
|
||||||
# Why oh why does the msa package not have a function to do this !!!
|
# = 3 Analyzing an MSA ====================================================
|
||||||
# Like, seriously ...
|
|
||||||
|
|
||||||
# ==== writeMFA
|
# You probabaly realize that computing an MSA is not that hard. It's not
|
||||||
|
# entirely trivial to collect meaningful sequences via e.g. PSI-BLAST ... but
|
||||||
|
# then computing the alignment gives you a result quickly. But what does it
|
||||||
|
# mean? What information does the MSA contain?
|
||||||
|
|
||||||
# Here is our own function to write a AAStringSet object to a multi-FASTA file.
|
# Let's have a first look at conserved vs. diverged regions of the MSA. msa
|
||||||
writeMFA <- function(ali, file, blockSize = 50) {
|
# provides the function msaConservationScore() which outputs a vector of scores.
|
||||||
if (missing(ali)) {
|
# The scores are the sum of pairscores for the column: for example a perfectly
|
||||||
stop("Input object missing from arguments with no default.")
|
# conserved column of histidines would have the following score in our MSA
|
||||||
}
|
# of eleven sequences:
|
||||||
if (missing(file)) {
|
# - one (H, H) pair score is 8 in BLOSUM62;
|
||||||
writeToFile <- FALSE
|
# - there are (n^2 - n) / 2 pairs that can be formed between amino acids
|
||||||
}
|
# in a column from n sequences;
|
||||||
else {
|
# - therefore the column score is 8 * (11^2 - 11) / 2 == 440
|
||||||
writeToFile <- TRUE
|
|
||||||
sink(file) # divert output to file
|
data("BLOSUM62") # fetch the BLOSUM62 package from the Biostrings package
|
||||||
}
|
|
||||||
# Extract the raw data from the objects depending on
|
msaMScores <- msaConservationScore(msaM, substitutionMatrix = BLOSUM62)
|
||||||
# their respective class and put this
|
plot(msaMScores, type = "l", col = "#205C5E", xlab = "Alignment Position")
|
||||||
# into a named vector of strings.
|
|
||||||
if (class(ali)[1] == "MsaAAMultipleAlignment") {
|
# That plot shows the well-aligned regions (domains ?) of the sequence, but it
|
||||||
strings <- character(nrow(ali))
|
# could use some smoothing. Options for smoothing such plots include calculating
|
||||||
for (i in 1:nrow(ali)) {
|
# averages in sliding windows ("moving average"), and lowess() smoothing. Here
|
||||||
strings[i] <- as.character(ali@unmasked[i])
|
# is a quick demo of a moving average smoothing, to illustrate the principle.
|
||||||
names(strings)[i] <- ali@unmasked@ranges@NAMES[i]
|
|
||||||
}
|
wRadius <- 15 # we take the mean of all values around a point +- wRadius
|
||||||
}
|
len <- length(msaMScores)
|
||||||
else if (class(ali)[1] == "AAStringSet") {
|
v <- msaMScores
|
||||||
strings <- character(length(ali))
|
for (i in (1 + wRadius):(len - wRadius)) {
|
||||||
for (i in 1:length(ali)) {
|
v[i] <- mean(msaMScores[(i - wRadius):(i + wRadius)]) # mean of values in
|
||||||
strings[i] <- as.character(ali[i])
|
# window around i
|
||||||
names(strings)[i] <- ali@ranges@NAMES[i]
|
}
|
||||||
}
|
points(v, col = "#FFFFFF", type = "l", lwd = 4.5)
|
||||||
}
|
points(v, col = "#3DAEB2", type = "l", lwd = 3)
|
||||||
else {
|
|
||||||
stop(paste("Input object of class",
|
|
||||||
class(ali)[1],
|
|
||||||
"can't be handled by this function."))
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
for (i in 1:length(strings)) {
|
# You can set a threshold and use rle() to define ranges of values that fall
|
||||||
# output FASTA header
|
# above and below the threshold, and thus approximate domain boundaries:
|
||||||
cat(paste(">",
|
thrsh <- 30
|
||||||
names(strings)[i],
|
(highScoringRanges <- rle(v > thrsh))
|
||||||
"\n",
|
(idx <- c(1, cumsum(highScoringRanges$lengths)))
|
||||||
sep=""))
|
for (i in seq_along(highScoringRanges$lengths)) {
|
||||||
# output the sequence block by block ...
|
if (highScoringRanges$values[i] == TRUE) { # If this range is above threshold,
|
||||||
nLine <- 1
|
rect(idx[i], thrsh, idx[i+1], max(v), # ... draw a rectangle
|
||||||
from <- 1
|
col = "#205C5E33") # ... with a transparent color.
|
||||||
while (from < nchar(strings[i])) {
|
cat(sprintf("Possible domain from %d to %d\n", idx[i], idx[i+1]))
|
||||||
to <- from + blockSize - 1
|
|
||||||
cat(paste(substr(strings[i], from, to), "\n", sep=""))
|
|
||||||
from <- to + 1
|
|
||||||
}
|
|
||||||
cat("\n") # output an empty line
|
|
||||||
}
|
|
||||||
if (writeToFile) {
|
|
||||||
sink() # Done. Close the diversion.
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
# confirm that the function works
|
# Getting this right requires a bit of fiddling with the window radius and
|
||||||
writeMFA(seqSet)
|
# threshold (experiment with that a bit), but once we are satisfied, we can use
|
||||||
writeMFA(msa1)
|
# the boundaries to print the MSA alignments separately for domains.
|
||||||
|
# Unfortunately the msa package provides no native way to extract blocks of the
|
||||||
|
# alignment for further processing, but your .utilities file loads a function to
|
||||||
|
# write alignment objects and sequence sets to .aln formatted output. Have a
|
||||||
|
# look:
|
||||||
|
|
||||||
# We could use this function to write the raw and aligned sequences to file like
|
writeALN
|
||||||
# so:
|
|
||||||
|
|
||||||
# writeMFA(seqSet, file = "APSES_proteins.mfa")
|
# Print out the aligned blocks
|
||||||
# writeMFA(msa1, file = "APSES_proteins_muscle.mfa")
|
for (i in seq_along(highScoringRanges$lengths)) {
|
||||||
|
if (highScoringRanges$values[i] == TRUE) { # If this range is above threshold,
|
||||||
# ... but since we don't actually need the data on file now, just copy the code
|
cat(sprintf("\n\nPossible domain from %d to %d\n", idx[i], idx[i+1]))
|
||||||
# that would create the msa to your myCode file so you can quickly reproduce it.
|
writeALN(msaM, range = c(idx[i], idx[i+1]))
|
||||||
|
}
|
||||||
# == Task:
|
}
|
||||||
# Print the output of print(msa1) on a sheet of paper and bring it to
|
|
||||||
# class for the next quiz.
|
|
||||||
|
|
||||||
# That'll do.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# = 1 Tasks
|
# = 4 Comparing MSAs ======================================================
|
||||||
|
|
||||||
|
# Let's compare the results of different alignment algorithms. We computed a
|
||||||
|
# vector of scores above, and we can compare that for different alignment
|
||||||
|
# algorithms. This is not trivial, so we'll need to look at that data in
|
||||||
|
# different ways and explore it. But first, let's get more alignments to compare
|
||||||
|
# with.
|
||||||
|
|
||||||
|
|
||||||
|
# == 4.1 Importing an alignment to msa =====================================
|
||||||
|
|
||||||
|
# We computed a T-Coffee alignment at the EBI. msa has no native import function
|
||||||
|
# so we need to improvise, and it's a bit of a pain to do - but a good
|
||||||
|
# illustration of startegies to convert data into any kind of object:
|
||||||
|
# - read an .aln file
|
||||||
|
# - adjust the sequence names
|
||||||
|
# - convert to msaAAMultipleAlignment object
|
||||||
|
|
||||||
|
# === 4.1.1 importing an .aln file
|
||||||
|
|
||||||
|
# The seqinr package has a function to read CLUSTAL W formatted .aln files ...
|
||||||
|
if (! require(seqinr, quietly=TRUE)) {
|
||||||
|
install.packages(seqinr)
|
||||||
|
library(seqinr)
|
||||||
|
}
|
||||||
|
# Package information:
|
||||||
|
# library(help=seqinr) # basic information
|
||||||
|
# browseVignettes("seqinr") # available vignettes
|
||||||
|
# data(package = "seqinr") # available datasets
|
||||||
|
|
||||||
|
# read the donwloaded file
|
||||||
|
tmp <- read.alignment("msaT.aln", format = "clustal")
|
||||||
|
|
||||||
|
# read.alignment() returns a list. $seq is a list of strings, one for each
|
||||||
|
# complete alignment. However, they are converted to lower case.
|
||||||
|
x <- toupper(unlist(tmp$seq)) # get the sequences, uppercase
|
||||||
|
|
||||||
|
# $nam contains the names.
|
||||||
|
(names(x) <- tmp$nam)
|
||||||
|
|
||||||
|
# Note that the names in the file are refseq IDs, we need to replace the
|
||||||
|
# RefSeqIDs with the database names:
|
||||||
|
for (i in seq_along(x)) {
|
||||||
|
# find the index of the RefSeqID
|
||||||
|
id <- gsub("\\..*$", "", names(x)[i]) # fetch the name, drop the version
|
||||||
|
sel <- which(myDB$protein$RefSeqID == id) # get the index
|
||||||
|
names(x)[i] <- myDB$protein$name[sel] # get the name
|
||||||
|
}
|
||||||
|
|
||||||
|
# === 4.1.2 Creating an MsaAAMultipleAlignment object
|
||||||
|
|
||||||
|
# MsaAAMultipleAlignment objects are S4 objects that contain AAStringSet objects
|
||||||
|
# in their @unmasked slot, and a few additional items. Rather then build the
|
||||||
|
# object from scratch, we copy an axisting object, and overwrite the dta in its
|
||||||
|
# slots with what we need. Our goal is pragmatic, we want an object that msa's
|
||||||
|
# functions will accept as input.
|
||||||
|
|
||||||
|
# First: convert our named char vector into an AAstringSet
|
||||||
|
x <- AAStringSet(x)
|
||||||
|
|
||||||
|
# Then: create a new MsaAAMultipleAlignment S4 object. The msa package has
|
||||||
|
# defined what such an object should look like, with the SetClass() function. To
|
||||||
|
# create a new object, simply use the new() function, define the class that the
|
||||||
|
# object would have, and fill the slots with something that has the right type.
|
||||||
|
# How do we know the right type and the slot names? Easy! We just use
|
||||||
|
# str(<object>) to get the information.
|
||||||
|
|
||||||
|
str(msaM)
|
||||||
|
|
||||||
|
msaT <- new("MsaAAMultipleAlignment", # create new MsaAAMultipleAlignment object
|
||||||
|
unmasked = x, # "unmasked" slot takes an AASringSet
|
||||||
|
version = "T-Coffee", # "version" slot takes a string
|
||||||
|
params = list(), # "params" takes a list(), we leave the
|
||||||
|
# list empty, but we could add the
|
||||||
|
# alignment parameters that we used at
|
||||||
|
# the EBI here.
|
||||||
|
call = "imported from T-coffee alignment") # also a string
|
||||||
|
|
||||||
|
str(msaT)
|
||||||
|
|
||||||
|
|
||||||
|
msaT # Now we have fabricated an msaAAMultipleAlignment object, and we can
|
||||||
|
# use the msa package functions on it
|
||||||
|
|
||||||
|
msaTScores <- msaConservationScore(msaT, substitutionMatrix = BLOSUM62)
|
||||||
|
|
||||||
|
# == 4.2 More alignments ===================================================
|
||||||
|
|
||||||
|
# Next, we calculate alignments with msa's two other alignment options:
|
||||||
|
# CLUSTAL Omega
|
||||||
|
(msaO <- msaClustalOmega( MBP1set, order = "aligned"))
|
||||||
|
msaOScores <- msaConservationScore(msaO, substitutionMatrix = BLOSUM62)
|
||||||
|
|
||||||
|
# CLUSTAL W
|
||||||
|
(msaW <- msaClustalW( MBP1set, order = "aligned"))
|
||||||
|
msaWScores <- msaConservationScore(msaW, substitutionMatrix = BLOSUM62)
|
||||||
|
|
||||||
|
|
||||||
|
# == 4.3 Computing comparison metrics ======================================
|
||||||
|
|
||||||
|
# Ready to compare.
|
||||||
|
|
||||||
|
# ... sum of alignment scores of alignment divided by sum of alignment scores
|
||||||
|
# of reference alignment (arbitrarily using CLUSTAL W as reference)
|
||||||
|
|
||||||
|
sRef <- sum(msaWScores)
|
||||||
|
sum(msaWScores) / sRef # CLUSTAl W
|
||||||
|
sum(msaOScores) / sRef # CLUSTAL O
|
||||||
|
sum(msaTScores) / sRef # T-COFFEE
|
||||||
|
sum(msaMScores) / sRef # MUSCLE
|
||||||
|
|
||||||
|
# ... mean alignment scores (higher is better)
|
||||||
|
|
||||||
|
mean(msaWScores) # CLUSTAl W
|
||||||
|
mean(msaOScores) # CLUSTAL O
|
||||||
|
mean(msaTScores) # T-COFFEE
|
||||||
|
mean(msaMScores) # MUSCLE
|
||||||
|
|
||||||
|
# total number of gaps (lower is better)
|
||||||
|
countGaps <- function(ali) {
|
||||||
|
x <- paste0(as.character(ali), collapse = "")
|
||||||
|
aa <- nchar(gsub("-", "", x))
|
||||||
|
return(nchar(x) - aa)
|
||||||
|
}
|
||||||
|
|
||||||
|
countGaps(msaW) # CLUSTAl W
|
||||||
|
countGaps(msaO) # CLUSTAL O
|
||||||
|
countGaps(msaT) # T-COFFEE
|
||||||
|
countGaps(msaM) # MUSCLE
|
||||||
|
|
||||||
|
# number of indels in alignment (lower is less fragmented)
|
||||||
|
countIndels <- function(ali) {
|
||||||
|
x <- paste0(as.character(ali), collapse = "") # collapse into single string
|
||||||
|
x <- unlist(strsplit(x, "")) # split into characters
|
||||||
|
x <- x == "-" # convert into boolean
|
||||||
|
x <- rle(x) # calculate rle
|
||||||
|
# every run of TRUE is one indel event
|
||||||
|
return(sum(x$values))
|
||||||
|
}
|
||||||
|
|
||||||
|
countIndels(msaW) # CLUSTAl W
|
||||||
|
countIndels(msaO) # CLUSTAL O
|
||||||
|
countIndels(msaT) # T-COFFEE
|
||||||
|
countIndels(msaM) # MUSCLE
|
||||||
|
|
||||||
|
# Let's look at the distribution of alignment scores:
|
||||||
|
boxplot(list(CLUSTAL.W = msaWScores,
|
||||||
|
CLUSTAL.O = msaOScores,
|
||||||
|
T.COFFEE = msaTScores,
|
||||||
|
MUSCLE = msaMScores),
|
||||||
|
col = c("#7D556488", "#74628F88", "#5E78A188", "#3DAEB288"))
|
||||||
|
|
||||||
|
# CLUSTAL W and CLUSTAL O don't look all that different. T-Coffee tends to have
|
||||||
|
# a tighter distribution with less negative scores. Muscle has a slightly higher
|
||||||
|
# mean and generally higher scores.
|
||||||
|
|
||||||
|
# Boxplots are convenient, but don't give us much detail about the shape of the
|
||||||
|
# distribution. For that, we need histograms, or density plots.
|
||||||
|
|
||||||
|
plot(density(msaWScores),
|
||||||
|
type = "l",
|
||||||
|
col = "#7D5564",
|
||||||
|
lwd = 1.5,
|
||||||
|
ylim = c(0, (max(density(msaWScores)$y) * 1.3)),
|
||||||
|
main = "Comparing MSA algorithms",
|
||||||
|
xlab = "Alignment Score",
|
||||||
|
ylab = "Density")
|
||||||
|
points(density(msaOScores),
|
||||||
|
type = "l",
|
||||||
|
lwd = 1.5,
|
||||||
|
col = "#74628F")
|
||||||
|
points(density(msaTScores),
|
||||||
|
type = "l",
|
||||||
|
lwd = 1.5,
|
||||||
|
col = "#5E78A1")
|
||||||
|
points(density(msaMScores),
|
||||||
|
type = "l",
|
||||||
|
lwd = 1.5,
|
||||||
|
col = "#3DAEB2")
|
||||||
|
legend("topright",
|
||||||
|
legend = c("MUSCLE", "T-COFFEE", "CLUSTAL O", "CLUSTAL W"),
|
||||||
|
col = c("#3DAEB2", "#5E78A1", "#74628F", "#7D5564"),
|
||||||
|
lwd = 2,
|
||||||
|
cex = 0.7,
|
||||||
|
bty = "n")
|
||||||
|
|
||||||
|
# The desnity plots confirm in more detail that CLUSTAL W misses some of the
|
||||||
|
# higher-scoring possibilities, that wherever CLUSTAL O is bad, it is quite bad,
|
||||||
|
# that T-COFFEE has fewer poorly scoring columns but misses some of the better
|
||||||
|
# scoring possibilities, and that MUSCLE appears to do best overall.
|
||||||
|
|
||||||
|
# Can we attribute these differences to sections of the alignment in which the
|
||||||
|
# algorithms did better or worse? Let's plot the scores cumulatively. The
|
||||||
|
# alignments have different lengths, so we plot the scores on the respective
|
||||||
|
# fraction of the alignement length.
|
||||||
|
|
||||||
|
plot(seq(0, 1, length.out = length(msaWScores)), # x- axis: fraction of length
|
||||||
|
cumsum(msaWScores),
|
||||||
|
type = "l",
|
||||||
|
col = "#7D5564",
|
||||||
|
lwd = 1.5,
|
||||||
|
ylim = c(0, max(cumsum(msaMScores))),
|
||||||
|
main = "Comparing MSA algorithms",
|
||||||
|
xlab = "Alignment Position",
|
||||||
|
ylab = "Cumulative Alignment Score")
|
||||||
|
points(seq(0, 1, length.out = length(msaOScores)), # x- axis: fraction of length
|
||||||
|
cumsum(msaOScores),
|
||||||
|
type = "l",
|
||||||
|
lwd = 1.5,
|
||||||
|
col = "#74628F")
|
||||||
|
points(seq(0, 1, length.out = length(msaTScores)), # x- axis: fraction of length
|
||||||
|
cumsum(msaTScores),
|
||||||
|
type = "l",
|
||||||
|
lwd = 1.5,
|
||||||
|
col = "#5E78A1")
|
||||||
|
points(seq(0, 1, length.out = length(msaMScores)), # x- axis: fraction of length
|
||||||
|
cumsum(msaMScores),
|
||||||
|
type = "l",
|
||||||
|
lwd = 1.5,
|
||||||
|
col = "#3DAEB2")
|
||||||
|
legend("bottomright",
|
||||||
|
legend = c("MUSCLE", "T-COFFEE", "CLUSTAL O", "CLUSTAL W"),
|
||||||
|
col = c("#3DAEB2", "#5E78A1", "#74628F", "#7D5564"),
|
||||||
|
lwd = 2,
|
||||||
|
cex = 0.7,
|
||||||
|
bty = "n")
|
||||||
|
|
||||||
|
# Your alignment is going to be differnte from mine, due to the inclusion of
|
||||||
|
# MYSPE - but what I see is that MUSCLE gives the highest score overall, and
|
||||||
|
# achieves this with fewer indels then most, and the lowest number of gaps of
|
||||||
|
# all algorithms.
|
||||||
|
|
||||||
|
# To actually compare regions of alignments, we need to align alignments.
|
||||||
|
|
||||||
|
|
||||||
|
# = 5 Profile-Profile alignments ==========================================
|
||||||
|
|
||||||
|
|
||||||
|
# Profile-profile alignments are the most powerful way to pick up distant
|
||||||
|
# relationships between sequence families. The can be used, for example to build
|
||||||
|
# a profile from structural superpositions of crystal structures, and then map a
|
||||||
|
# MSA alignment onto those features. Here we will use profile-profile comparison
|
||||||
|
# to compare two MSAs with each other, by aligning them. The algorithm is
|
||||||
|
# provided by the DECIPHER package.
|
||||||
|
|
||||||
|
if (! require(DECIPHER, quietly=TRUE)) {
|
||||||
|
if (! exists("biocLite")) {
|
||||||
|
source("https://bioconductor.org/biocLite.R")
|
||||||
|
}
|
||||||
|
biocLite("DECIPHER")
|
||||||
|
library(DECIPHER)
|
||||||
|
}
|
||||||
|
# Package information:
|
||||||
|
# library(help = DECIPHER) # basic information
|
||||||
|
# browseVignettes("DECIPHER") # available vignettes
|
||||||
|
# data(package = "DECIPHER") # available datasets
|
||||||
|
|
||||||
|
# AlignProfiles() takes two AAStringSets as input. Let's compare the MUSCLE and
|
||||||
|
# CLUSTAL W alignments: we could do this directly ...
|
||||||
|
AlignProfiles(msaW@unmasked, msaM@unmasked)
|
||||||
|
|
||||||
|
# But for ease of comparison, we'll reorder the sequences of the CLUSTAL W
|
||||||
|
# alignment into the same order as the MUSCLE alignment:
|
||||||
|
m <- as.character(msaM)
|
||||||
|
w <- as.character(msaW)[names(m)]
|
||||||
|
|
||||||
|
(ppa <- AlignProfiles(AAStringSet(w), AAStringSet(m)))
|
||||||
|
|
||||||
|
# Conveniently, AlignProfiles() returns an AAStringSet, so we can use our
|
||||||
|
# writeALN function to show it. Here is an arbitrary block, from somewhere in
|
||||||
|
# the middle of the alignment:
|
||||||
|
|
||||||
|
writeALN(ppa, range = c(751, 810))
|
||||||
|
|
||||||
|
# If you look at this for a while, you can begin to figue out where the
|
||||||
|
# algorithms made different decisions about where to insert gaps, and how to
|
||||||
|
# move segments of sequence around. But matters become more clear if we
|
||||||
|
# post-process this profile-profile alignment. Let's replace all hyphens that
|
||||||
|
# the pp-alignment has inserted with blanks, and let's add a separator line down
|
||||||
|
# the middle between the two alignments.
|
||||||
|
|
||||||
|
x <- unlist(strsplit(as.character(ppa), "")) # unlist all
|
||||||
|
dim(x) <- c(width(ppa)[1], length(ppa)) # form into matrix by columns
|
||||||
|
x <- t(x) # transpose the matrix
|
||||||
|
(a1 <- 1:(nrow(x)/2)) # rows of alignment 1
|
||||||
|
(a2 <- ((nrow(x)/2) + 1):nrow(x)) # rows of alignment 2
|
||||||
|
for (i in 1:ncol(x)) {
|
||||||
|
if (all(x[a1, i] == "-")) { x[a1, i] <- " " } # blank hyphens that shift
|
||||||
|
if (all(x[a2, i] == "-")) { x[a2, i] <- " " } # original alignment blocks
|
||||||
|
}
|
||||||
|
|
||||||
|
# collapse the matrix into strings
|
||||||
|
ppa2 <- character()
|
||||||
|
for (i in 1:nrow(x)) {
|
||||||
|
ppa2[i] <- paste0(x[i, ], collapse = "")
|
||||||
|
names(ppa2)[i] <- names(ppa)[i]
|
||||||
|
}
|
||||||
|
|
||||||
|
# add a separator line
|
||||||
|
x <- paste0(rep("-", width(ppa)[1]), collapse = "")
|
||||||
|
ppa2 <- c(ppa2[a1], x, ppa2[a2])
|
||||||
|
|
||||||
|
# inspect
|
||||||
|
writeALN(ppa2, range = c(800, 960))
|
||||||
|
|
||||||
|
# Again, go explore, and get a sense of what's going on. You may find that
|
||||||
|
# CLUSTAL W has a tendency to insert short gaps all over the alignment, whereas
|
||||||
|
# MUSCLE keeps indels in blocks. CLUSTAL's behaviour is exactly what I would
|
||||||
|
# expect from an algorithm that builds alignments from pairwise local
|
||||||
|
# alignments, without global refinement.
|
||||||
|
|
||||||
|
|
||||||
|
# = 6 Sequence Logos ======================================================
|
||||||
|
|
||||||
|
# To visualize the information that we can get about structure and function with
|
||||||
|
# an MSA, we'll calculate a sequence logo of the Mbp1 recognition helix - the
|
||||||
|
# part of the structure that inserts into the major groove of the DNA and
|
||||||
|
# provides sequence specificity to the DNA binding of this transcription factor.
|
||||||
|
# Helix-B in Mbp1 with four residues upstream and downstream spans the sequence
|
||||||
|
# 43-ANFAKAKRTRILEKEVLKE-61
|
||||||
|
|
||||||
|
# == 6.1 Subsetting an alignment by motif ==================================
|
||||||
|
|
||||||
|
# Finding the location of such an substring in an alignment is not entirely
|
||||||
|
# trivial, because the alignment might have produced indels in that sequence.
|
||||||
|
# Our strategy can be:
|
||||||
|
# - fetch the sequence from the alignment
|
||||||
|
# - remove all hyphens
|
||||||
|
# - find the range where the target sequence matches
|
||||||
|
# - count how many characters in all there are in the aligned sequence, up
|
||||||
|
# to the start and end of the match
|
||||||
|
# - these numbers define the range of the match in the alignment.
|
||||||
|
|
||||||
|
x <- as.character(msaM)["MBP1_SACCE"]
|
||||||
|
xAA <- gsub("-", "", x)
|
||||||
|
|
||||||
|
motif <- "ANFAKAKRTRILEKEVLKE"
|
||||||
|
(m <- regexpr(motif, xAA)) # matched in position 43, with a length of 19
|
||||||
|
(motifStart <- as.numeric(m))
|
||||||
|
(motifEnd <- attr(m, "match.length") + motifStart - 1)
|
||||||
|
|
||||||
|
# To count characters, we split the string into single characters ...
|
||||||
|
x <- unlist(strsplit(x, ""))
|
||||||
|
|
||||||
|
# ... convert this into a boolean, which is true if the character is not
|
||||||
|
# a hyphen ...
|
||||||
|
x <- x != "-"
|
||||||
|
|
||||||
|
# ... cast this into a numeric, which turns TRUE into 1 and FALSE into 0 ...
|
||||||
|
x <- as.numeric(x)
|
||||||
|
|
||||||
|
# ... and sum up the cumulative sum.
|
||||||
|
x <- cumsum(x)
|
||||||
|
|
||||||
|
# Now we can find where the 43'd and 61'st characters are located in the
|
||||||
|
# alignment string ...
|
||||||
|
(aliStart <- which(x == motifStart)[1]) # get the first hit if there are more
|
||||||
|
(aliEnd <- which(x == motifEnd)[1])
|
||||||
|
|
||||||
|
# ... and subset the alignment
|
||||||
|
|
||||||
|
(motifAli <- subseq(msaM@unmasked, start = aliStart, end = aliEnd))
|
||||||
|
|
||||||
|
|
||||||
|
# == 6.2 Plot a Sequence Logo ==============================================
|
||||||
|
|
||||||
|
# There are now several good options to plot sequence logos in R, these include
|
||||||
|
# dagLogo, DiffLogo, Logolas, and motifStack. For our example we will use
|
||||||
|
# ggseqlogo written by by Omar Waghi, a former UofT BCB student who is now at
|
||||||
|
# the EBI.
|
||||||
|
|
||||||
|
if (! require(ggseqlogo, quietly=TRUE)) {
|
||||||
|
install.packages(("ggseqlogo"))
|
||||||
|
library(ggseqlogo)
|
||||||
|
}
|
||||||
|
# Package information:
|
||||||
|
# library(help=ggseqlogo) # basic information
|
||||||
|
# browseVignettes("ggseqlogo") # available vignettes
|
||||||
|
# data(package = "ggseqlogo") # available datasets
|
||||||
|
|
||||||
|
ggseqlogo(as.character(motifAli))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
135
scripts/ABC-writeALN.R
Normal file
135
scripts/ABC-writeALN.R
Normal file
@ -0,0 +1,135 @@
|
|||||||
|
# ABC-writeALN.R
|
||||||
|
#
|
||||||
|
# ToDo: calculate consensus line
|
||||||
|
# append sequence numbers
|
||||||
|
# Notes:
|
||||||
|
#
|
||||||
|
# ==============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
writeALN <- function(ali,
|
||||||
|
range,
|
||||||
|
note = "",
|
||||||
|
myCon = stdout(),
|
||||||
|
blockWidth = 60) {
|
||||||
|
# Purpose:
|
||||||
|
# Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or
|
||||||
|
# a file in multi-FASTA format.
|
||||||
|
# Version: 2.0
|
||||||
|
# Date: 2017 10
|
||||||
|
# Author: Boris Steipe
|
||||||
|
#
|
||||||
|
# Parameters:
|
||||||
|
# ali MsaAAMultipleAlignment or AAStringSet or character
|
||||||
|
# vector.
|
||||||
|
# range num a two-integer vector of start and end positions if
|
||||||
|
# only a range of the MSA should be written, e.g.
|
||||||
|
# a domain. Defaults to the full alignment length.
|
||||||
|
# note chr a vector of character that is appended to the name
|
||||||
|
# of a sequence in the FASTA header. Recycling of
|
||||||
|
# shorter vectors applies, thus a vector of length one
|
||||||
|
# is added to all headers.
|
||||||
|
# myCon a connection (cf. the con argument for writeLines).
|
||||||
|
# Defaults to stdout()
|
||||||
|
# blockWidth int width of sequence block. Default 80 characters.
|
||||||
|
# Value:
|
||||||
|
# NA the function is invoked for its side effect of printing an
|
||||||
|
# alignment to stdout() or file.
|
||||||
|
|
||||||
|
blockWidth <- as.integer(blockWidth)
|
||||||
|
if (is.na(blockWidth)) {
|
||||||
|
stop("PANIC: parameter \"blockWidth\" must be numeric.")
|
||||||
|
}
|
||||||
|
if (blockWidth < 1) {
|
||||||
|
stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
|
||||||
|
}
|
||||||
|
if (blockWidth > 60) {
|
||||||
|
stop("PANIC: \"blockWidth\" for CLUSTAL format can't be greater than 60.")
|
||||||
|
}
|
||||||
|
|
||||||
|
# Extract the raw data from the objects depending on their respective class
|
||||||
|
# and put it into a named vector of strings.
|
||||||
|
|
||||||
|
# Extract XStringSet from MsaXMultipleAlignment ...
|
||||||
|
if (class(ali) == "MsaAAMultipleAlignment" |
|
||||||
|
class(ali) == "MsaDNAMultipleAlignment" |
|
||||||
|
class(ali) == "MsaRNAMultipleAlignment") {
|
||||||
|
ali <- ali@unmasked
|
||||||
|
}
|
||||||
|
|
||||||
|
# Process XStringSet
|
||||||
|
if (class(ali) == "AAStringSet" |
|
||||||
|
class(ali) == "DNAStringSet" |
|
||||||
|
class(ali) == "RNAStringSet") {
|
||||||
|
sSet <- as.character(ali) # we use as.character(), not toString() thus
|
||||||
|
# we don't _have_ to load Biostrings
|
||||||
|
} else if (class(ali) == "character") {
|
||||||
|
sSet <- ali
|
||||||
|
} else {
|
||||||
|
stop(paste("Input object of class",
|
||||||
|
class(ali),
|
||||||
|
"can't be handled by this function."))
|
||||||
|
}
|
||||||
|
|
||||||
|
if (missing(range)) {
|
||||||
|
range <- 1
|
||||||
|
range[2] <- max(nchar(sSet))
|
||||||
|
} else {
|
||||||
|
range <- as.integer(range)
|
||||||
|
if(length(range) != 2 ||
|
||||||
|
any(is.na(range)) ||
|
||||||
|
range[1] > range[2] ||
|
||||||
|
range[1] < 1) {
|
||||||
|
stop("PANIC: \"range\" parameter must contain valid start and end index.")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Right-pad any sequence with "-" that is shorter than ranges[2]
|
||||||
|
for (i in seq_along(sSet)) {
|
||||||
|
if (nchar(sSet[i]) < range[2]) {
|
||||||
|
sSet[i] <- paste0(sSet[i],
|
||||||
|
paste0(rep("-", range[2] - nchar(sSet[i])),
|
||||||
|
collapse = ""))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Right-pad sequence names
|
||||||
|
sNames <- names(sSet)
|
||||||
|
len <- max(nchar(sNames)) + 2 # longest name plus two spaces
|
||||||
|
for (i in seq_along(sNames)) {
|
||||||
|
sNames[i] <- paste0(sNames[i],
|
||||||
|
paste0(rep(" ", len - nchar(sNames[i])),
|
||||||
|
collapse = ""))
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# Process each sequence
|
||||||
|
txt <- paste0("CLUSTAL W format. ", note)
|
||||||
|
txt[2] <- ""
|
||||||
|
|
||||||
|
iStarts <- seq(range[1], range[2], by = blockWidth)
|
||||||
|
iEnds <- c((iStarts[-1] - 1), range[2])
|
||||||
|
|
||||||
|
for (i in seq_along(iStarts)) {
|
||||||
|
for (j in seq_along(sSet)) {
|
||||||
|
txt <- c(txt,
|
||||||
|
paste0(sNames[j], substring(sSet[j], iStarts[i], iEnds[i])))
|
||||||
|
}
|
||||||
|
txt <- c(txt, "") # append a blank consenus line
|
||||||
|
txt <- c(txt, "") # append a separator line
|
||||||
|
}
|
||||||
|
|
||||||
|
writeLines(txt, con= myCon)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
# ==== TESTS =================================================================
|
||||||
|
# Enter your function tests here...
|
||||||
|
|
||||||
|
if (FALSE) {
|
||||||
|
# test ...
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# [END]
|
115
scripts/ABC-writeMFA.R
Normal file
115
scripts/ABC-writeMFA.R
Normal file
@ -0,0 +1,115 @@
|
|||||||
|
# ABC-writeMFA.R
|
||||||
|
#
|
||||||
|
# ToDo:
|
||||||
|
# Notes:
|
||||||
|
#
|
||||||
|
# ==============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
writeMFA <- function(ali,
|
||||||
|
range,
|
||||||
|
note = "",
|
||||||
|
myCon = stdout(),
|
||||||
|
blockWidth = 80) {
|
||||||
|
# Purpose:
|
||||||
|
# Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or
|
||||||
|
# a file in multi-FASTA format.
|
||||||
|
# Version: 2.0
|
||||||
|
# Date: 2017 10
|
||||||
|
# Author: Boris Steipe
|
||||||
|
#
|
||||||
|
# Parameters:
|
||||||
|
# ali MsaAAMultipleAlignment or AAStringSet or character
|
||||||
|
# vector
|
||||||
|
# range num a two-integer vector of start and end positions if
|
||||||
|
# only a range of the MSA should be written, e.g.
|
||||||
|
# a domain. Defaults to the full sequence length.
|
||||||
|
# note chr a vector of character that is appended to the name
|
||||||
|
# of a sequence in the FASTA header. Recycling of
|
||||||
|
# shorter vectors applies, thus a vector of length one
|
||||||
|
# is added to all headers.
|
||||||
|
# myCon a connection (cf. the con argument for writeLines).
|
||||||
|
# Defaults to stdout()
|
||||||
|
# blockWidth int width of sequence block. Default 80 characters.
|
||||||
|
# Value:
|
||||||
|
# NA the function is invoked for its side effect of printing an
|
||||||
|
# alignment to stdout() or file.
|
||||||
|
|
||||||
|
blockWidth <- as.integer(blockWidth)
|
||||||
|
if (is.na(blockWidth)) {
|
||||||
|
stop("PANIC: parameter \"blockWidth\" must be numeric.")
|
||||||
|
}
|
||||||
|
if (blockWidth < 1){
|
||||||
|
stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
|
||||||
|
}
|
||||||
|
|
||||||
|
# Extract the raw data from the objects depending on their respective class
|
||||||
|
# and put it into a named vector of strings.
|
||||||
|
|
||||||
|
# Extract XStringSet from MsaXMultipleAlignment ...
|
||||||
|
if (class(ali) == "MsaAAMultipleAlignment" |
|
||||||
|
class(ali) == "MsaDNAMultipleAlignment" |
|
||||||
|
class(ali) == "MsaRNAMultipleAlignment") {
|
||||||
|
ali <- ali@unmasked
|
||||||
|
}
|
||||||
|
|
||||||
|
# Process XStringSet
|
||||||
|
if (class(ali) == "AAStringSet" |
|
||||||
|
class(ali) == "DNAStringSet" |
|
||||||
|
class(ali) == "RNAStringSet") {
|
||||||
|
sSet <- as.character(ali) # we use as.character(), not toString() thus
|
||||||
|
# we don't _have_ to load Biostrings
|
||||||
|
} else if (class(ali) == "character") {
|
||||||
|
sSet <- ali
|
||||||
|
} else {
|
||||||
|
stop(paste("Input object of class",
|
||||||
|
class(ali),
|
||||||
|
"can't be handled by this function."))
|
||||||
|
}
|
||||||
|
|
||||||
|
if (missing(range)) {
|
||||||
|
range <- 1
|
||||||
|
range[2] <- max(nchar(sSet))
|
||||||
|
} else {
|
||||||
|
range <- as.integer(range)
|
||||||
|
if(length(range) != 2 ||
|
||||||
|
any(is.na(range)) ||
|
||||||
|
range[1] > range[2] ||
|
||||||
|
range[1] < 1) {
|
||||||
|
stop("PANIC: \"range\" parameter must contain valid start and end index.")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Process each sequence
|
||||||
|
txt <- character()
|
||||||
|
headers <- paste(names(sSet), note)
|
||||||
|
for (i in seq_along(sSet)) {
|
||||||
|
|
||||||
|
# output FASTA header
|
||||||
|
txt <- c(txt, sprintf(">%s", headers[i]))
|
||||||
|
|
||||||
|
# output the sequence in blocks of blockWidth per line ...
|
||||||
|
iStarts <- seq(range[1], range[2], by = blockWidth)
|
||||||
|
iEnds <- c((iStarts[-1] - 1), range[2])
|
||||||
|
|
||||||
|
thisSeq <- substring(sSet[i], iStarts, iEnds) # collect all blocks
|
||||||
|
thisSeq <- thisSeq[! nchar(thisSeq) == 0] # drop empty blocks
|
||||||
|
txt <- c(txt, thisSeq)
|
||||||
|
|
||||||
|
txt <- c(txt, "") # append an empty line for readability
|
||||||
|
}
|
||||||
|
|
||||||
|
writeLines(txt, con= myCon)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
# ==== TESTS =================================================================
|
||||||
|
# Enter your function tests here...
|
||||||
|
|
||||||
|
if (FALSE) {
|
||||||
|
# test ...
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# [END]
|
Loading…
Reference in New Issue
Block a user