From 553b042f2094788c5e7c9ed4774141d7143cc42b Mon Sep 17 00:00:00 2001
From: hyginn <boris.steipe@utoronto.ca>
Date: Sat, 28 Oct 2017 23:09:21 -0400
Subject: [PATCH] MSA unit, and supporting files

---
 .utilities.R           |   7 +-
 BIN-ALI-MSA.R          | 675 +++++++++++++++++++++++++++++++++--------
 scripts/ABC-writeALN.R | 135 +++++++++
 scripts/ABC-writeMFA.R | 115 +++++++
 4 files changed, 803 insertions(+), 129 deletions(-)
 create mode 100644 scripts/ABC-writeALN.R
 create mode 100644 scripts/ABC-writeMFA.R

diff --git a/.utilities.R b/.utilities.R
index ad19221..4339672 100644
--- a/.utilities.R
+++ b/.utilities.R
@@ -2,10 +2,11 @@
 #
 # Miscellaneous R code to suppport the project
 #
-# Version: 1.2
-# Date:    2017 09
+# Version: 1.3
+# Date:    2017  09 - 2017  10
 # Author:  Boris Steipe
 #
+# V 1.3    load msa support functions
 # V 1.2    update database utilities to support 2017 version of JSON sources
 # V 1.1    2017 updates for ABC-units
 # V 1.0    First code
@@ -18,6 +19,8 @@
 # ====== SCRIPTS =============================================================
 
 source("./scripts/ABC-dbUtilities.R")
+source("./scripts/ABC-writeALN.R")
+source("./scripts/ABC-writeMFA.R")
 
 
 # ====== SUPPORT FUNCTIONS =====================================================
diff --git a/BIN-ALI-MSA.R b/BIN-ALI-MSA.R
index 8f1a28b..5723148 100644
--- a/BIN-ALI-MSA.R
+++ b/BIN-ALI-MSA.R
@@ -3,94 +3,148 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-ALI-MSA unit.
 #
-# Version:  0.1
+# Version:  1.0
 #
-# Date:     2017  08  28
+# Date:     2017  10  23
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
+#           1.0    Fully refactored and rewritten for 2017
 #           0.1    First code copied from 2016 material.
-
+#
 #
 # TODO:
 #
 #
 # == DO NOT SIMPLY  source()  THIS FILE! =======================================
-
+#
 # If there are portions you don't understand, use R's help system, Google for an
 # answer, or ask your instructor. Don't continue if you don't understand what's
 # going on. That's not how it works ...
-
+#
 # ==============================================================================
 
-# = 1 Multiple sequence alignment
 
-# We will compute a multiple sequence alignment using the "muscle" algorithm
-# which is available throught the Bioconductor msa package.
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                                        Line
+#TOC> ------------------------------------------------------------
+#TOC>   1        Preparations                                   51
+#TOC>   2        Aligning full length MBP1 proteins             99
+#TOC>   2.1      Preparing Sequences                           110
+#TOC>   2.2      Compute the MSA                               135
+#TOC>   3        Analyzing an MSA                              156
+#TOC>   4        Comparing MSAs                                227
+#TOC>   4.1      Importing an alignment to msa                 236
+#TOC>   4.1.1    importing an .aln file                        245
+#TOC>   4.1.2    Creating an MsaAAMultipleAlignment object     276
+#TOC>   4.2      More alignments                               313
+#TOC>   4.3      Computing comparison metrics                  325
+#TOC>   5        Profile-Profile alignments                    462
+#TOC>   6        Sequence Logos                                539
+#TOC>   6.1      Subsetting an alignment by motif              548
+#TOC>   6.2      Plot a Sequence Logo                          591
+#TOC> 
+#TOC> ==========================================================================
 
-if (!require(Biostrings, quietly=TRUE)) {
-  source("https://bioconductor.org/biocLite.R")
+
+# =    1  Preparations  ========================================================
+
+# You need to reload you protein database, including changes that might
+# have been made to the reference files. If you have worked with the
+# prerequiste units, you should have a script named "makeProteinDB.R"
+# that will create the myDB object with aprotein and feature database.
+# Ask for advice if not.
+source("makeProteinDB.R")
+
+
+# Multiple sequence alignment algorithms are provided in
+# the Bioconductor msa package.
+
+if (! require(Biostrings, quietly=TRUE)) {
+  if (! exists("biocLite")) {
+    source("https://bioconductor.org/biocLite.R")
+  }
   biocLite("Biostrings")
+  library(Biostrings)
 }
-data(BLOSUM62)
+# Package information:
+#  library(help = Biostrings)       # basic information
+#  browseVignettes("Biostrings")    # available vignettes
+#  data(package = "Biostrings")     # available datasets
 
-if (!require(msa, quietly=TRUE)) {
-  source("https://bioconductor.org/biocLite.R")
+
+if (! require(msa, quietly=TRUE)) {
+  if (! exists("biocLite")) {
+    source("https://bioconductor.org/biocLite.R")
+  }
   biocLite("msa")
   library(msa)
 }
+# Package information:
+#  library(help=msa)       # basic information
+#  browseVignettes("msa")  # available vignettes
+#  data(package = "msa")   # available datasets
 
-# If the installation asks you if you want to update older packages, always
-# answer "a" for "all" unless you have an important reason not to. But if the
-# installer asks you whether you want to compile from source, answer"n" for "no"
-# unless you need new functionality in a particular bleeding-edge version of a
-# package.
+
+# If an installation asks you if you want to update older packages, I recommend
+# to always answer "a" for "all" unless you have an important reason not to. But
+# if the installer asks you whether you want to compile from source, answer "n"
+# for "no" unless you need new functionality in a particular bleeding-edge
+# version of a package.
 
 help(package = "msa")
 
-# We have used biostrings' AAString() function before; for multiple
-# alignments we need an AAStringSet(). We can simply feed it
-# a vector of sequences:
 
-# Let's make a shorthand for selection of Mbp1 proteins from our database: a
-# vector of indices for all of the rows in the protein table that contain
-# proteins whose name begins with MBP1.
-iMbp1Proteins <- grep("^MBP1_", myDB$protein$name)
+# =    2  Aligning full length MBP1 proteins  ==================================
 
-# Next we create an AAStringSet for all of those proteins
-seqSet <- AAStringSet(myDB$protein$sequence[iMbp1Proteins])
+# In the Wiki part of this unit you have
+#   - aligned full length MBP1 protein sequences at the EBI using T-Coffee
+#   - saved the resulting alignment in CLUSTAL format
+#     to the file "MBP1orthologuesTC.aln"
 
-# ... and align (which is very simple) ...
-msaMuscle(
-  seqSet,
-  order = "aligned")
-
-# ... but to help us make sense of the alignment we need to add the names for
-# the sequences. names for a seqSet object are held in the ranges slot...
-
-seqSet@ranges@NAMES <- myDB$protein$name[iMbp1Proteins]
-
-seqSet
-
-# This little step of adding names is actually really
-# very important. That's because the aligned sequences
-# are meaningless strings of characters unless we can
-# easily identify their biological relationships.
-# Creating MSAs that are only identified by e.g. their
-# RefSeq ids is a type of cargo-cult bioinformatics
-# that we encounter a lot. The point of the alignment
-# is not to create it, but to interpret it!
+# In this section we will calculate an MSA of the same sequences using
+# algorithms in the msa packages, and we will compare the two alignments.
 
 
-# Let's align again, and assign the result ...
-msa1 <-  msaMuscle(
-  seqSet,
-  order = "aligned")
+# ==   2.1  Preparing Sequences  ===============================================
 
-msa1
+# We have used the biostrings' AAString() function before; for multiple
+# alignments we need an AAStringSet(). AAStringSets are produced from vectors
+# of sequence.
+
+sel <- grep("MBP1", myDB$protein$name)
+MBP1set <- AAStringSet(myDB$protein$sequence[sel])
+
+# To help us make sense of the alignment we need to add the names for
+# the sequences. Names for a seqSet object are held in the ranges slot...
+
+MBP1set@ranges@NAMES <- myDB$protein$name[sel]
+
+MBP1set
+
+# You should have eleven sequences in this set, ask for advice if you don't.
+
+# The little step of adding names is actually really very important. That's
+# because the aligned sequences are meaningless strings of characters unless we
+# can easily identify their biological relationships. Creating MSAs that are
+# only identified by e.g. their RefSeq ids is a type of cargo-cult
+# bioinformatics that we encounter a lot. The point of the alignment is not to
+# create it, but to interpret it!
+
+# ==   2.2  Compute the MSA  ===================================================
+
+# The alignment itself is very simple. msa has msaMuscle() and msaClustalOmega()
+# to produce alignments. (It also has msaClustalW() which is kind of
+# embarrassing since that hasn't been the algorithm of first choice for
+# decades. Don't use that one for real work.)
+
+
+# Let's run an alignment with "Muscle"
+(msaM <-  msaMuscle( MBP1set, order = "aligned"))
 
 # ... or to see the whole thing (cf. ?MsaAAMultipleAlignment ... print method):
-print(msa1, show=c("alignment", "complete"), showConsensus=FALSE)
+print(msaM, show=c("alignment", "complete"), showConsensus=FALSE)
 
 
 # You see that the alignment object has sequence strings with hyphens as
@@ -98,93 +152,460 @@ print(msa1, show=c("alignment", "complete"), showConsensus=FALSE)
 # the order has not been preserved, but the most similar sequences are now
 # adjacent to each other.
 
-# Lets write the alignment to one of the common file formats: a multi-fasta
-# file.
 
-# Why oh why does the msa package not have a function to do this !!!
-# Like, seriously ...
+# =    3  Analyzing an MSA  ====================================================
 
-# ==== writeMFA
+# You probabaly realize that computing an MSA is not that hard. It's not
+# entirely trivial to collect meaningful sequences via e.g. PSI-BLAST ... but
+# then computing the alignment gives you a result quickly. But what does it
+# mean? What information does the MSA contain?
 
-# Here is our own function to write a AAStringSet object to a multi-FASTA file.
-writeMFA <- function(ali, file, blockSize = 50) {
-  if (missing(ali)) {
-    stop("Input object missing from arguments with no default.")
-  }
-  if (missing(file)) {
-    writeToFile <- FALSE
-  }
-  else {
-    writeToFile <- TRUE
-    sink(file) # divert output to file
-  }
-  # Extract the raw data from the objects depending on
-  # their respective class and put this
-  # into a named vector of strings.
-  if (class(ali)[1] == "MsaAAMultipleAlignment") {
-    strings <- character(nrow(ali))
-    for (i in 1:nrow(ali)) {
-      strings[i] <- as.character(ali@unmasked[i])
-      names(strings)[i] <- ali@unmasked@ranges@NAMES[i]
-    }
-  }
-  else if (class(ali)[1] == "AAStringSet") {
-    strings <- character(length(ali))
-    for (i in 1:length(ali)) {
-      strings[i] <- as.character(ali[i])
-      names(strings)[i] <- ali@ranges@NAMES[i]
-    }
-  }
-  else {
-    stop(paste("Input object of class",
-               class(ali)[1],
-               "can't be handled by this function."))
-  }
+# Let's have a first look at conserved vs. diverged regions of the MSA. msa
+# provides the function msaConservationScore() which outputs a vector of scores.
+# The scores are the sum of pairscores for the column: for example a perfectly
+# conserved column of histidines would have the following score in our MSA
+# of eleven sequences:
+#   -  one (H, H) pair score is 8 in BLOSUM62;
+#   -  there are (n^2 - n) / 2 pairs that can be formed between amino acids
+#        in a column from n sequences;
+#   -  therefore the column score is 8 * (11^2 - 11) / 2 == 440
+
+data("BLOSUM62")  # fetch the BLOSUM62 package from the Biostrings package
+
+msaMScores <- msaConservationScore(msaM, substitutionMatrix = BLOSUM62)
+plot(msaMScores, type = "l", col = "#205C5E", xlab = "Alignment Position")
+
+# That plot shows the well-aligned regions (domains ?) of the sequence, but it
+# could use some smoothing. Options for smoothing such plots include calculating
+# averages in sliding windows ("moving average"), and lowess() smoothing. Here
+# is a quick demo of a moving average smoothing, to illustrate the principle.
+
+wRadius <- 15     # we take the mean of all values around a point +- wRadius
+len <- length(msaMScores)
+v <- msaMScores
+for (i in (1 + wRadius):(len - wRadius)) {
+  v[i] <- mean(msaMScores[(i - wRadius):(i + wRadius)]) # mean of values in
+                                                        # window around i
+}
+points(v, col = "#FFFFFF", type = "l", lwd = 4.5)
+points(v, col = "#3DAEB2", type = "l", lwd = 3)
 
 
-  for (i in 1:length(strings)) {
-    # output FASTA header
-    cat(paste(">",
-              names(strings)[i],
-              "\n",
-              sep=""))
-    # output the sequence block by block ...
-    nLine <- 1
-    from <- 1
-    while (from < nchar(strings[i])) {
-      to <- from + blockSize - 1
-      cat(paste(substr(strings[i], from, to), "\n", sep=""))
-      from <- to + 1
-    }
-    cat("\n") # output an empty line
-  }
-  if (writeToFile) {
-    sink()  # Done. Close the diversion.
+# You can set a threshold and use rle() to define ranges of values that fall
+# above and below the threshold, and thus approximate domain boundaries:
+thrsh <- 30
+(highScoringRanges <- rle(v > thrsh))
+(idx <- c(1, cumsum(highScoringRanges$lengths)))
+for (i in seq_along(highScoringRanges$lengths)) {
+  if (highScoringRanges$values[i] == TRUE) { # If this range is above threshold,
+    rect(idx[i], thrsh, idx[i+1], max(v),    # ... draw a rectangle
+         col = "#205C5E33")                  # ... with a transparent color.
+    cat(sprintf("Possible domain from %d to %d\n", idx[i], idx[i+1]))
   }
 }
 
-# confirm that the function works
-writeMFA(seqSet)
-writeMFA(msa1)
+# Getting this right requires a bit of fiddling with the window radius and
+# threshold (experiment with that a bit), but once we are satisfied, we can use
+# the boundaries to print the MSA alignments separately for domains.
+# Unfortunately the msa package provides no native way to extract blocks of the
+# alignment for further processing, but your .utilities file loads a function to
+# write alignment objects and sequence sets to .aln formatted output. Have a
+# look:
 
-# We could use this function to write the raw and aligned sequences to file like
-# so:
+writeALN
 
-# writeMFA(seqSet, file = "APSES_proteins.mfa")
-# writeMFA(msa1, file = "APSES_proteins_muscle.mfa")
-
-# ...  but since we don't actually need the data on file now, just copy the code
-# that would create the msa to your myCode file so you can quickly reproduce it.
-
-# == Task:
-# Print the output of print(msa1) on a sheet of paper and bring it to
-# class for the next quiz.
-
-# That'll do.
+# Print out the aligned blocks
+for (i in seq_along(highScoringRanges$lengths)) {
+  if (highScoringRanges$values[i] == TRUE) { # If this range is above threshold,
+    cat(sprintf("\n\nPossible domain from %d to %d\n", idx[i], idx[i+1]))
+    writeALN(msaM, range = c(idx[i], idx[i+1]))
+  }
+}
 
 
 
-# = 1 Tasks
+# =    4  Comparing MSAs  ======================================================
+
+# Let's compare the results of different alignment algorithms. We computed a
+# vector of scores above, and we can compare that for different alignment
+# algorithms. This is not trivial, so we'll need to look at that data in
+# different ways and explore it. But first, let's get more alignments to compare
+# with.
+
+
+# ==   4.1  Importing an alignment to msa  =====================================
+
+# We computed a T-Coffee alignment at the EBI. msa has no native import function
+# so we need to improvise, and it's a bit of a pain to do - but a good
+# illustration of startegies to convert data into any kind of object:
+#   -  read an .aln file
+#   -  adjust the sequence names
+#   -  convert to msaAAMultipleAlignment object
+
+# ===  4.1.1  importing an .aln file                   
+
+# The seqinr package has a function to read CLUSTAL W formatted .aln files ...
+if (! require(seqinr, quietly=TRUE)) {
+  install.packages(seqinr)
+  library(seqinr)
+}
+# Package information:
+#  library(help=seqinr)       # basic information
+#  browseVignettes("seqinr")  # available vignettes
+#  data(package = "seqinr")   # available datasets
+
+# read the donwloaded file
+tmp <- read.alignment("msaT.aln", format = "clustal")
+
+# read.alignment() returns a list. $seq is a list of strings, one for each
+# complete alignment. However, they are converted to lower case.
+x <- toupper(unlist(tmp$seq)) # get the sequences, uppercase
+
+# $nam contains the names.
+(names(x) <- tmp$nam)
+
+#  Note that the names in the file are refseq IDs, we need to replace the
+#  RefSeqIDs with the database names:
+for (i in seq_along(x)) {
+  # find the index of the RefSeqID
+  id <- gsub("\\..*$", "", names(x)[i])     # fetch the name, drop the version
+  sel <- which(myDB$protein$RefSeqID == id) # get the index
+  names(x)[i] <- myDB$protein$name[sel]     # get the name
+}
+
+# ===  4.1.2  Creating an MsaAAMultipleAlignment object
+
+# MsaAAMultipleAlignment objects are S4 objects that contain AAStringSet objects
+# in their @unmasked slot, and a few additional items. Rather then build the
+# object from scratch, we copy an axisting object, and overwrite the dta in its
+# slots with what we need. Our goal is pragmatic, we want an object that msa's
+# functions will accept as input.
+
+# First: convert our named char vector into an AAstringSet
+x <- AAStringSet(x)
+
+# Then: create a new MsaAAMultipleAlignment S4 object. The msa package has
+# defined what such an object should look like, with the SetClass() function. To
+# create a new object, simply use the new() function, define the class that the
+# object would have, and fill the slots with something that has the right type.
+# How do we know the right type and the slot names? Easy! We just use
+# str(<object>) to get the information.
+
+str(msaM)
+
+msaT <- new("MsaAAMultipleAlignment", # create new MsaAAMultipleAlignment object
+            unmasked = x,             # "unmasked" slot takes an AASringSet
+            version = "T-Coffee",     # "version" slot takes a string
+            params = list(),          # "params" takes a list(), we leave the
+                                      #   list empty, but we could add the
+                                      #   alignment parameters that we used at
+                                      #   the EBI here.
+            call = "imported from T-coffee alignment") # also a string
+
+str(msaT)
+
+
+msaT # Now we have fabricated an msaAAMultipleAlignment object, and we can
+     # use the msa package functions on it
+
+msaTScores <- msaConservationScore(msaT, substitutionMatrix = BLOSUM62)
+
+# ==   4.2  More alignments  ===================================================
+
+# Next, we calculate alignments with msa's two other alignment options:
+# CLUSTAL Omega
+(msaO <- msaClustalOmega( MBP1set, order = "aligned"))
+msaOScores <- msaConservationScore(msaO, substitutionMatrix = BLOSUM62)
+
+# CLUSTAL W
+(msaW <- msaClustalW( MBP1set, order = "aligned"))
+msaWScores <- msaConservationScore(msaW, substitutionMatrix = BLOSUM62)
+
+
+# ==   4.3  Computing comparison metrics  ======================================
+
+# Ready to compare.
+
+# ... sum of alignment scores of alignment divided by sum of alignment scores
+#  of reference alignment (arbitrarily using CLUSTAL W as reference)
+
+sRef <- sum(msaWScores)
+sum(msaWScores) / sRef  # CLUSTAl W
+sum(msaOScores) / sRef  # CLUSTAL O
+sum(msaTScores) / sRef  # T-COFFEE
+sum(msaMScores) / sRef  # MUSCLE
+
+# ... mean alignment scores (higher is better)
+
+mean(msaWScores)  # CLUSTAl W
+mean(msaOScores)  # CLUSTAL O
+mean(msaTScores)  # T-COFFEE
+mean(msaMScores)  # MUSCLE
+
+# total number of gaps (lower is better)
+countGaps <- function(ali) {
+  x <- paste0(as.character(ali), collapse = "")
+  aa <- nchar(gsub("-", "", x))
+  return(nchar(x) - aa)
+}
+
+countGaps(msaW)   # CLUSTAl W
+countGaps(msaO)   # CLUSTAL O
+countGaps(msaT)   # T-COFFEE
+countGaps(msaM)   # MUSCLE
+
+# number of indels in alignment (lower is less fragmented)
+countIndels <- function(ali) {
+  x <- paste0(as.character(ali), collapse = "")  # collapse into single string
+  x <- unlist(strsplit(x, ""))                   # split into characters
+  x <- x == "-"                                  # convert into boolean
+  x <- rle(x)                                    # calculate rle
+  # every run of TRUE is one indel event
+  return(sum(x$values))
+}
+
+countIndels(msaW)   # CLUSTAl W
+countIndels(msaO)   # CLUSTAL O
+countIndels(msaT)   # T-COFFEE
+countIndels(msaM)   # MUSCLE
+
+# Let's look at the distribution of alignment scores:
+boxplot(list(CLUSTAL.W = msaWScores,
+             CLUSTAL.O = msaOScores,
+             T.COFFEE  = msaTScores,
+             MUSCLE    = msaMScores),
+        col = c("#7D556488", "#74628F88", "#5E78A188", "#3DAEB288"))
+
+# CLUSTAL W and CLUSTAL O don't look all that different. T-Coffee tends to have
+# a tighter distribution with less negative scores. Muscle has a slightly higher
+# mean and generally higher scores.
+
+# Boxplots are convenient, but don't give us much detail about the shape of the
+# distribution. For that, we need histograms, or density plots.
+
+plot(density(msaWScores),
+     type = "l",
+     col = "#7D5564",
+     lwd = 1.5,
+     ylim = c(0, (max(density(msaWScores)$y) * 1.3)),
+     main = "Comparing MSA algorithms",
+     xlab = "Alignment Score",
+     ylab = "Density")
+points(density(msaOScores),
+       type = "l",
+       lwd = 1.5,
+       col = "#74628F")
+points(density(msaTScores),
+       type = "l",
+       lwd = 1.5,
+       col = "#5E78A1")
+points(density(msaMScores),
+       type = "l",
+       lwd = 1.5,
+       col = "#3DAEB2")
+legend("topright",
+       legend = c("MUSCLE", "T-COFFEE", "CLUSTAL O", "CLUSTAL W"),
+       col = c("#3DAEB2", "#5E78A1", "#74628F", "#7D5564"),
+       lwd = 2,
+       cex = 0.7,
+       bty = "n")
+
+# The desnity plots confirm in more detail that CLUSTAL W misses some of the
+# higher-scoring possibilities, that wherever CLUSTAL O is bad, it is quite bad,
+# that T-COFFEE has fewer poorly scoring columns but misses some of the better
+# scoring possibilities, and that MUSCLE appears to do best overall.
+
+# Can we attribute these differences to sections of the alignment in which the
+# algorithms did better or worse? Let's plot the scores cumulatively. The
+# alignments have different lengths, so we plot the scores on the respective
+# fraction of the alignement length.
+
+plot(seq(0, 1, length.out = length(msaWScores)), # x- axis: fraction of length
+     cumsum(msaWScores),
+     type = "l",
+     col = "#7D5564",
+     lwd = 1.5,
+     ylim = c(0, max(cumsum(msaMScores))),
+     main = "Comparing MSA algorithms",
+     xlab = "Alignment Position",
+     ylab = "Cumulative Alignment Score")
+points(seq(0, 1, length.out = length(msaOScores)), # x- axis: fraction of length
+       cumsum(msaOScores),
+       type = "l",
+       lwd = 1.5,
+       col = "#74628F")
+points(seq(0, 1, length.out = length(msaTScores)), # x- axis: fraction of length
+       cumsum(msaTScores),
+       type = "l",
+       lwd = 1.5,
+       col = "#5E78A1")
+points(seq(0, 1, length.out = length(msaMScores)), # x- axis: fraction of length
+       cumsum(msaMScores),
+       type = "l",
+       lwd = 1.5,
+       col = "#3DAEB2")
+legend("bottomright",
+       legend = c("MUSCLE", "T-COFFEE", "CLUSTAL O", "CLUSTAL W"),
+       col = c("#3DAEB2", "#5E78A1", "#74628F", "#7D5564"),
+       lwd = 2,
+       cex = 0.7,
+       bty = "n")
+
+# Your alignment is going to be differnte from mine, due to the inclusion of
+# MYSPE - but what I see is that MUSCLE gives the highest score overall, and
+# achieves this with fewer indels then most, and the lowest number of gaps of
+# all algorithms.
+
+# To actually compare regions of alignments, we need to align alignments.
+
+
+# =    5  Profile-Profile alignments  ==========================================
+
+
+# Profile-profile alignments are the most powerful way to pick up distant
+# relationships between sequence families. The can be used, for example to build
+# a profile from structural superpositions of crystal structures, and then map a
+# MSA alignment onto those features. Here we will use profile-profile comparison
+# to compare two MSAs with each other, by aligning them. The algorithm is
+# provided by the DECIPHER package.
+
+if (! require(DECIPHER, quietly=TRUE)) {
+  if (! exists("biocLite")) {
+    source("https://bioconductor.org/biocLite.R")
+  }
+  biocLite("DECIPHER")
+  library(DECIPHER)
+}
+# Package information:
+#  library(help = DECIPHER)       # basic information
+#  browseVignettes("DECIPHER")    # available vignettes
+#  data(package = "DECIPHER")     # available datasets
+
+# AlignProfiles() takes two AAStringSets as input. Let's compare the MUSCLE and
+# CLUSTAL W alignments: we could do this directly ...
+AlignProfiles(msaW@unmasked, msaM@unmasked)
+
+# But for ease of comparison, we'll reorder the sequences of the CLUSTAL W
+# alignment into the same order as the MUSCLE alignment:
+m <- as.character(msaM)
+w <- as.character(msaW)[names(m)]
+
+(ppa <- AlignProfiles(AAStringSet(w), AAStringSet(m)))
+
+# Conveniently, AlignProfiles() returns an AAStringSet, so we can use our
+# writeALN function to show it. Here is an arbitrary block, from somewhere in
+# the middle of the alignment:
+
+writeALN(ppa, range = c(751, 810))
+
+# If you look at this for a while, you can begin to figue out where the
+# algorithms made different decisions about where to insert gaps, and how to
+# move segments of sequence around. But matters become more clear if we
+# post-process this profile-profile alignment. Let's replace all hyphens that
+# the pp-alignment has inserted with blanks, and let's add a separator line down
+# the middle between the two alignments.
+
+x <- unlist(strsplit(as.character(ppa), ""))  # unlist all
+dim(x) <- c(width(ppa)[1], length(ppa))       # form into matrix by columns
+x <- t(x)                                     # transpose the matrix
+(a1 <- 1:(nrow(x)/2))                         # rows of alignment 1
+(a2 <- ((nrow(x)/2) + 1):nrow(x))             # rows of alignment 2
+for (i in 1:ncol(x)) {
+  if (all(x[a1, i] == "-")) { x[a1, i] <- " " }  # blank hyphens that shift
+  if (all(x[a2, i] == "-")) { x[a2, i] <- " " }  # original alignment blocks
+}
+
+# collapse the matrix into strings
+ppa2 <- character()
+for (i in 1:nrow(x)) {
+  ppa2[i] <- paste0(x[i, ], collapse = "")
+  names(ppa2)[i] <- names(ppa)[i]
+}
+
+# add a separator line
+x <- paste0(rep("-", width(ppa)[1]), collapse = "")
+ppa2 <- c(ppa2[a1], x, ppa2[a2])
+
+# inspect
+writeALN(ppa2, range = c(800, 960))
+
+# Again, go explore, and get a sense of what's going on. You may find that
+# CLUSTAL W has a tendency to insert short gaps all over the alignment, whereas
+# MUSCLE keeps indels in blocks. CLUSTAL's behaviour is exactly what I would
+# expect from an algorithm that builds alignments from pairwise local
+# alignments, without global refinement.
+
+
+# =    6  Sequence Logos  ======================================================
+
+# To visualize the information that we can get about structure and function with
+# an MSA, we'll calculate a sequence logo of the Mbp1 recognition helix - the
+# part of the structure that inserts into the major groove of the DNA and
+# provides sequence specificity to the DNA binding of this transcription factor.
+# Helix-B in Mbp1 with four residues upstream and downstream spans the sequence
+#     43-ANFAKAKRTRILEKEVLKE-61
+
+# ==   6.1  Subsetting an alignment by motif  ==================================
+
+# Finding the location of such an substring in an alignment is not entirely
+# trivial, because the alignment might have produced indels in that sequence.
+# Our strategy can be:
+#   -  fetch the sequence from the alignment
+#   -  remove all hyphens
+#   -  find the range where the target sequence matches
+#   -  count how many characters in all there are in the aligned sequence, up
+#        to the start and end of the match
+#   -  these numbers define the range of the match in the alignment.
+
+x <- as.character(msaM)["MBP1_SACCE"]
+xAA <- gsub("-", "", x)
+
+motif <- "ANFAKAKRTRILEKEVLKE"
+(m <- regexpr(motif, xAA))  # matched in position 43, with a length of 19
+(motifStart <- as.numeric(m))
+(motifEnd <- attr(m, "match.length") + motifStart - 1)
+
+# To count characters, we split the string into single characters ...
+x <- unlist(strsplit(x, ""))
+
+# ... convert this into a boolean, which is true if the character is not
+# a hyphen ...
+x <- x != "-"
+
+# ... cast this into a numeric, which turns TRUE into 1 and FALSE into 0 ...
+x <- as.numeric(x)
+
+# ... and sum up the cumulative sum.
+x <- cumsum(x)
+
+# Now we can find where the 43'd and 61'st characters are located in the
+# alignment string ...
+(aliStart <- which(x == motifStart)[1])   # get the first hit if there are more
+(aliEnd   <- which(x == motifEnd)[1])
+
+# ... and subset the alignment
+
+(motifAli <- subseq(msaM@unmasked, start = aliStart, end = aliEnd))
+
+
+# ==   6.2  Plot a Sequence Logo  ==============================================
+
+# There are now several good options to plot sequence logos in R, these include
+# dagLogo, DiffLogo, Logolas, and motifStack. For our example we will use
+# ggseqlogo written by by Omar Waghi, a former UofT BCB student who is now at
+# the EBI.
+
+if (! require(ggseqlogo, quietly=TRUE)) {
+  install.packages(("ggseqlogo"))
+  library(ggseqlogo)
+}
+# Package information:
+#  library(help=ggseqlogo)       # basic information
+#  browseVignettes("ggseqlogo")  # available vignettes
+#  data(package = "ggseqlogo")   # available datasets
+
+ggseqlogo(as.character(motifAli))
+
 
 
 
diff --git a/scripts/ABC-writeALN.R b/scripts/ABC-writeALN.R
new file mode 100644
index 0000000..2a6c77b
--- /dev/null
+++ b/scripts/ABC-writeALN.R
@@ -0,0 +1,135 @@
+# ABC-writeALN.R
+#
+# ToDo:    calculate consensus line
+#          append sequence numbers
+# Notes:
+#
+# ==============================================================================
+
+
+writeALN <- function(ali,
+                     range,
+                     note = "",
+                     myCon = stdout(),
+                     blockWidth = 60) {
+  # Purpose:
+  #     Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or
+  #     a file in multi-FASTA format.
+  # Version: 2.0
+  # Date:    2017 10
+  # Author:  Boris Steipe
+  #
+  # Parameters:
+  #     ali             MsaAAMultipleAlignment or AAStringSet or character
+  #                       vector.
+  #     range      num  a two-integer vector of start and end positions if
+  #                       only a range of the MSA should be written, e.g.
+  #                       a domain. Defaults to the full alignment length.
+  #     note       chr  a vector of character that is appended to the name
+  #                       of a sequence in the FASTA header. Recycling of
+  #                       shorter vectors applies, thus a vector of length one
+  #                       is added to all headers.
+  #     myCon           a connection (cf. the con argument for writeLines).
+  #                       Defaults to stdout()
+  #     blockWidth int  width of sequence block. Default 80 characters.
+  # Value:
+  #     NA   the function is invoked for its side effect of printing an
+  #          alignment to stdout() or file.
+
+  blockWidth <- as.integer(blockWidth)
+  if (is.na(blockWidth)) {
+    stop("PANIC: parameter \"blockWidth\" must be numeric.")
+  }
+  if (blockWidth < 1) {
+    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
+  }
+  if (blockWidth > 60) {
+    stop("PANIC: \"blockWidth\" for CLUSTAL format can't be greater than 60.")
+  }
+
+  # Extract the raw data from the objects depending on their respective class
+  # and put it into a named vector of strings.
+
+  # Extract XStringSet from MsaXMultipleAlignment ...
+  if (class(ali) == "MsaAAMultipleAlignment" |
+      class(ali) == "MsaDNAMultipleAlignment" |
+      class(ali) == "MsaRNAMultipleAlignment") {
+      ali <- ali@unmasked
+  }
+
+  # Process XStringSet
+  if (class(ali) == "AAStringSet" |
+      class(ali) == "DNAStringSet" |
+      class(ali) == "RNAStringSet") {
+    sSet <- as.character(ali) # we use as.character(), not toString() thus
+                              # we don't _have_ to load Biostrings
+  } else if (class(ali) == "character") {
+    sSet <- ali
+  } else {
+    stop(paste("Input object of class",
+               class(ali),
+               "can't be handled by this function."))
+  }
+
+  if (missing(range)) {
+    range <- 1
+    range[2] <- max(nchar(sSet))
+  } else {
+    range <- as.integer(range)
+    if(length(range) != 2 ||
+       any(is.na(range)) ||
+       range[1] > range[2] ||
+       range[1] < 1) {
+      stop("PANIC: \"range\" parameter must contain valid start and end index.")
+    }
+  }
+
+  # Right-pad any sequence with "-" that is shorter than ranges[2]
+    for (i in seq_along(sSet)) {
+      if (nchar(sSet[i]) < range[2]) {
+        sSet[i] <- paste0(sSet[i],
+                          paste0(rep("-", range[2] - nchar(sSet[i])),
+                                 collapse = ""))
+      }
+    }
+
+  # Right-pad sequence names
+  sNames <- names(sSet)
+  len <- max(nchar(sNames)) + 2 # longest name plus two spaces
+  for (i in seq_along(sNames)) {
+    sNames[i] <- paste0(sNames[i],
+                      paste0(rep(" ", len - nchar(sNames[i])),
+                             collapse = ""))
+  }
+
+
+  # Process each sequence
+  txt <- paste0("CLUSTAL W format. ", note)
+  txt[2] <- ""
+
+  iStarts <- seq(range[1], range[2], by = blockWidth)
+  iEnds <- c((iStarts[-1] - 1), range[2])
+
+  for (i in seq_along(iStarts)) {
+    for (j in seq_along(sSet)) {
+      txt <- c(txt,
+               paste0(sNames[j], substring(sSet[j], iStarts[i], iEnds[i])))
+    }
+    txt <- c(txt, "")  # append a blank consenus line
+    txt <- c(txt, "")  # append a separator line
+  }
+
+  writeLines(txt, con= myCon)
+
+}
+
+# ====  TESTS  =================================================================
+# Enter your function tests here...
+
+if (FALSE) {
+  # test ...
+}
+
+
+
+# [END]
diff --git a/scripts/ABC-writeMFA.R b/scripts/ABC-writeMFA.R
new file mode 100644
index 0000000..1599912
--- /dev/null
+++ b/scripts/ABC-writeMFA.R
@@ -0,0 +1,115 @@
+# ABC-writeMFA.R
+#
+# ToDo:
+# Notes:
+#
+# ==============================================================================
+
+
+writeMFA <- function(ali,
+                     range,
+                     note = "",
+                     myCon = stdout(),
+                     blockWidth = 80) {
+  # Purpose:
+  #     Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or
+  #     a file in multi-FASTA format.
+  # Version: 2.0
+  # Date:    2017  10
+  # Author:  Boris Steipe
+  #
+  # Parameters:
+  #     ali             MsaAAMultipleAlignment or AAStringSet or character
+  #                       vector
+  #     range      num  a two-integer vector of start and end positions if
+  #                       only a range of the MSA should be written, e.g.
+  #                       a domain. Defaults to the full sequence length.
+  #     note       chr  a vector of character that is appended to the name
+  #                       of a sequence in the FASTA header. Recycling of
+  #                       shorter vectors applies, thus a vector of length one
+  #                       is added to all headers.
+  #     myCon           a connection (cf. the con argument for writeLines).
+  #                       Defaults to stdout()
+  #     blockWidth int  width of sequence block. Default 80 characters.
+  # Value:
+  #     NA   the function is invoked for its side effect of printing an
+  #          alignment to stdout() or file.
+
+  blockWidth <- as.integer(blockWidth)
+  if (is.na(blockWidth)) {
+    stop("PANIC: parameter \"blockWidth\" must be numeric.")
+  }
+  if (blockWidth < 1){
+    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
+  }
+
+  # Extract the raw data from the objects depending on their respective class
+  # and put it into a named vector of strings.
+
+  # Extract XStringSet from MsaXMultipleAlignment ...
+  if (class(ali) == "MsaAAMultipleAlignment" |
+      class(ali) == "MsaDNAMultipleAlignment" |
+      class(ali) == "MsaRNAMultipleAlignment") {
+      ali <- ali@unmasked
+  }
+
+  # Process XStringSet
+  if (class(ali) == "AAStringSet" |
+      class(ali) == "DNAStringSet" |
+      class(ali) == "RNAStringSet") {
+    sSet <- as.character(ali) # we use as.character(), not toString() thus
+                              # we don't _have_ to load Biostrings
+  } else if (class(ali) == "character") {
+    sSet <- ali
+  } else {
+    stop(paste("Input object of class",
+               class(ali),
+               "can't be handled by this function."))
+  }
+
+  if (missing(range)) {
+    range <- 1
+    range[2] <- max(nchar(sSet))
+  } else {
+    range <- as.integer(range)
+    if(length(range) != 2 ||
+       any(is.na(range)) ||
+       range[1] > range[2] ||
+       range[1] < 1) {
+      stop("PANIC: \"range\" parameter must contain valid start and end index.")
+    }
+  }
+
+  # Process each sequence
+  txt <- character()
+  headers <- paste(names(sSet), note)
+  for (i in seq_along(sSet)) {
+
+    # output FASTA header
+    txt <- c(txt, sprintf(">%s", headers[i]))
+
+    # output the sequence in blocks of blockWidth per line ...
+    iStarts <- seq(range[1], range[2], by = blockWidth)
+    iEnds <- c((iStarts[-1] - 1), range[2])
+
+    thisSeq <- substring(sSet[i], iStarts, iEnds)  # collect all blocks
+    thisSeq <- thisSeq[! nchar(thisSeq) == 0]      # drop empty blocks
+    txt <- c(txt, thisSeq)
+
+    txt <- c(txt, "")  # append an empty line for readability
+  }
+
+  writeLines(txt, con= myCon)
+
+}
+
+# ====  TESTS  =================================================================
+# Enter your function tests here...
+
+if (FALSE) {
+  # test ...
+}
+
+
+
+# [END]