MSA unit, and supporting files

2017-10-28 23:09:21 -04:00
parent 044990d1c9
commit 553b042f20
4 changed files with 803 additions and 129 deletions
--- a/.utilities.R
+++ b/.utilities.R
@@ -2,10 +2,11 @@
 #
 # Miscellaneous R code to suppport the project
 #
-# Version: 1.2
+# Version: 1.3
-# Date:    2017 09
+# Date:    2017  09 - 2017  10
 # Author:  Boris Steipe
 #
 # V 1.3    load msa support functions
 # V 1.2    update database utilities to support 2017 version of JSON sources
 # V 1.1    2017 updates for ABC-units
 # V 1.0    First code
@@ -18,6 +19,8 @@
 # ====== SCRIPTS =============================================================
 source("./scripts/ABC-dbUtilities.R")
 source("./scripts/ABC-writeALN.R")
 source("./scripts/ABC-writeMFA.R")
 # ====== SUPPORT FUNCTIONS =====================================================
--- a/BIN-ALI-MSA.R
+++ b/BIN-ALI-MSA.R
@@ -3,94 +3,148 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-ALI-MSA unit.
 #
-# Version:  0.1
+# Version:  1.0
 #
-# Date:     2017  08  28
+# Date:     2017  10  23
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
 #           1.0    Fully refactored and rewritten for 2017
 #           0.1    First code copied from 2016 material.
-
+#
 #
 # TODO:
 #
 #
 # == DO NOT SIMPLY  source()  THIS FILE! =======================================
-
+#
 # If there are portions you don't understand, use R's help system, Google for an
 # answer, or ask your instructor. Don't continue if you don't understand what's
 # going on. That's not how it works ...
-
+#
 # ==============================================================================
 # = 1 Multiple sequence alignment
-# We will compute a multiple sequence alignment using the "muscle" algorithm
+#TOC> ==========================================================================
-# which is available throught the Bioconductor msa package.
+#TOC> 
 #TOC>   Section  Title                                        Line
 #TOC> ------------------------------------------------------------
 #TOC>   1        Preparations                                   51
 #TOC>   2        Aligning full length MBP1 proteins             99
 #TOC>   2.1      Preparing Sequences                           110
 #TOC>   2.2      Compute the MSA                               135
 #TOC>   3        Analyzing an MSA                              156
 #TOC>   4        Comparing MSAs                                227
 #TOC>   4.1      Importing an alignment to msa                 236
 #TOC>   4.1.1    importing an .aln file                        245
 #TOC>   4.1.2    Creating an MsaAAMultipleAlignment object     276
 #TOC>   4.2      More alignments                               313
 #TOC>   4.3      Computing comparison metrics                  325
 #TOC>   5        Profile-Profile alignments                    462
 #TOC>   6        Sequence Logos                                539
 #TOC>   6.1      Subsetting an alignment by motif              548
 #TOC>   6.2      Plot a Sequence Logo                          591
 #TOC> 
 #TOC> ==========================================================================
-if (!require(Biostrings, quietly=TRUE)) {
+
-  source("https://bioconductor.org/biocLite.R")
+# =    1  Preparations  ========================================================
 # You need to reload you protein database, including changes that might
 # have been made to the reference files. If you have worked with the
 # prerequiste units, you should have a script named "makeProteinDB.R"
 # that will create the myDB object with aprotein and feature database.
 # Ask for advice if not.
 source("makeProteinDB.R")
 # Multiple sequence alignment algorithms are provided in
 # the Bioconductor msa package.
 if (! require(Biostrings, quietly=TRUE)) {
  if (! exists("biocLite")) {
    source("https://bioconductor.org/biocLite.R")
  }
  biocLite("Biostrings")
  library(Biostrings)
 }
-data(BLOSUM62)
+# Package information:
 #  library(help = Biostrings)       # basic information
 #  browseVignettes("Biostrings")    # available vignettes
 #  data(package = "Biostrings")     # available datasets
-if (!require(msa, quietly=TRUE)) {
+
-  source("https://bioconductor.org/biocLite.R")
+if (! require(msa, quietly=TRUE)) {
  if (! exists("biocLite")) {
    source("https://bioconductor.org/biocLite.R")
  }
  biocLite("msa")
  library(msa)
 }
 # Package information:
 #  library(help=msa)       # basic information
 #  browseVignettes("msa")  # available vignettes
 #  data(package = "msa")   # available datasets
-# If the installation asks you if you want to update older packages, always
+
-# answer "a" for "all" unless you have an important reason not to. But if the
+# If an installation asks you if you want to update older packages, I recommend
-# installer asks you whether you want to compile from source, answer"n" for "no"
+# to always answer "a" for "all" unless you have an important reason not to. But
-# unless you need new functionality in a particular bleeding-edge version of a
+# if the installer asks you whether you want to compile from source, answer "n"
-# package.
+# for "no" unless you need new functionality in a particular bleeding-edge
 # version of a package.
 help(package = "msa")
 # We have used biostrings' AAString() function before; for multiple
 # alignments we need an AAStringSet(). We can simply feed it
 # a vector of sequences:
-# Let's make a shorthand for selection of Mbp1 proteins from our database: a
+# =    2  Aligning full length MBP1 proteins  ==================================
 # vector of indices for all of the rows in the protein table that contain
 # proteins whose name begins with MBP1.
 iMbp1Proteins <- grep("^MBP1_", myDB$protein$name)
-# Next we create an AAStringSet for all of those proteins
+# In the Wiki part of this unit you have
-seqSet <- AAStringSet(myDB$protein$sequence[iMbp1Proteins])
+#   - aligned full length MBP1 protein sequences at the EBI using T-Coffee
 #   - saved the resulting alignment in CLUSTAL format
 #     to the file "MBP1orthologuesTC.aln"
-# ... and align (which is very simple) ...
+# In this section we will calculate an MSA of the same sequences using
-msaMuscle(
+# algorithms in the msa packages, and we will compare the two alignments.
  seqSet,
  order = "aligned")
 # ... but to help us make sense of the alignment we need to add the names for
 # the sequences. names for a seqSet object are held in the ranges slot...
 seqSet@ranges@NAMES <- myDB$protein$name[iMbp1Proteins]
 seqSet
 # This little step of adding names is actually really
 # very important. That's because the aligned sequences
 # are meaningless strings of characters unless we can
 # easily identify their biological relationships.
 # Creating MSAs that are only identified by e.g. their
 # RefSeq ids is a type of cargo-cult bioinformatics
 # that we encounter a lot. The point of the alignment
 # is not to create it, but to interpret it!
-# Let's align again, and assign the result ...
+# ==   2.1  Preparing Sequences  ===============================================
 msa1 <-  msaMuscle(
  seqSet,
  order = "aligned")
-msa1
+# We have used the biostrings' AAString() function before; for multiple
 # alignments we need an AAStringSet(). AAStringSets are produced from vectors
 # of sequence.
 sel <- grep("MBP1", myDB$protein$name)
 MBP1set <- AAStringSet(myDB$protein$sequence[sel])
 # To help us make sense of the alignment we need to add the names for
 # the sequences. Names for a seqSet object are held in the ranges slot...
 MBP1set@ranges@NAMES <- myDB$protein$name[sel]
 MBP1set
 # You should have eleven sequences in this set, ask for advice if you don't.
 # The little step of adding names is actually really very important. That's
 # because the aligned sequences are meaningless strings of characters unless we
 # can easily identify their biological relationships. Creating MSAs that are
 # only identified by e.g. their RefSeq ids is a type of cargo-cult
 # bioinformatics that we encounter a lot. The point of the alignment is not to
 # create it, but to interpret it!
 # ==   2.2  Compute the MSA  ===================================================
 # The alignment itself is very simple. msa has msaMuscle() and msaClustalOmega()
 # to produce alignments. (It also has msaClustalW() which is kind of
 # embarrassing since that hasn't been the algorithm of first choice for
 # decades. Don't use that one for real work.)
 # Let's run an alignment with "Muscle"
 (msaM <-  msaMuscle( MBP1set, order = "aligned"))
 # ... or to see the whole thing (cf. ?MsaAAMultipleAlignment ... print method):
-print(msa1, show=c("alignment", "complete"), showConsensus=FALSE)
+print(msaM, show=c("alignment", "complete"), showConsensus=FALSE)
 # You see that the alignment object has sequence strings with hyphens as
@@ -98,93 +152,460 @@ print(msa1, show=c("alignment", "complete"), showConsensus=FALSE)
 # the order has not been preserved, but the most similar sequences are now
 # adjacent to each other.
 # Lets write the alignment to one of the common file formats: a multi-fasta
 # file.
-# Why oh why does the msa package not have a function to do this !!!
+# =    3  Analyzing an MSA  ====================================================
 # Like, seriously ...
-# ==== writeMFA
+# You probabaly realize that computing an MSA is not that hard. It's not
 # entirely trivial to collect meaningful sequences via e.g. PSI-BLAST ... but
 # then computing the alignment gives you a result quickly. But what does it
 # mean? What information does the MSA contain?
-# Here is our own function to write a AAStringSet object to a multi-FASTA file.
+# Let's have a first look at conserved vs. diverged regions of the MSA. msa
-writeMFA <- function(ali, file, blockSize = 50) {
+# provides the function msaConservationScore() which outputs a vector of scores.
-  if (missing(ali)) {
+# The scores are the sum of pairscores for the column: for example a perfectly
-    stop("Input object missing from arguments with no default.")
+# conserved column of histidines would have the following score in our MSA
-  }
+# of eleven sequences:
-  if (missing(file)) {
+#   -  one (H, H) pair score is 8 in BLOSUM62;
-    writeToFile <- FALSE
+#   -  there are (n^2 - n) / 2 pairs that can be formed between amino acids
-  }
+#        in a column from n sequences;
-  else {
+#   -  therefore the column score is 8 * (11^2 - 11) / 2 == 440
-    writeToFile <- TRUE
+
-    sink(file) # divert output to file
+data("BLOSUM62")  # fetch the BLOSUM62 package from the Biostrings package
-  }
+
-  # Extract the raw data from the objects depending on
+msaMScores <- msaConservationScore(msaM, substitutionMatrix = BLOSUM62)
-  # their respective class and put this
+plot(msaMScores, type = "l", col = "#205C5E", xlab = "Alignment Position")
-  # into a named vector of strings.
+
-  if (class(ali)[1] == "MsaAAMultipleAlignment") {
+# That plot shows the well-aligned regions (domains ?) of the sequence, but it
-    strings <- character(nrow(ali))
+# could use some smoothing. Options for smoothing such plots include calculating
-    for (i in 1:nrow(ali)) {
+# averages in sliding windows ("moving average"), and lowess() smoothing. Here
-      strings[i] <- as.character(ali@unmasked[i])
+# is a quick demo of a moving average smoothing, to illustrate the principle.
-      names(strings)[i] <- ali@unmasked@ranges@NAMES[i]
+
-    }
+wRadius <- 15     # we take the mean of all values around a point +- wRadius
-  }
+len <- length(msaMScores)
-  else if (class(ali)[1] == "AAStringSet") {
+v <- msaMScores
-    strings <- character(length(ali))
+for (i in (1 + wRadius):(len - wRadius)) {
-    for (i in 1:length(ali)) {
+  v[i] <- mean(msaMScores[(i - wRadius):(i + wRadius)]) # mean of values in
-      strings[i] <- as.character(ali[i])
+                                                        # window around i
-      names(strings)[i] <- ali@ranges@NAMES[i]
+}
-    }
+points(v, col = "#FFFFFF", type = "l", lwd = 4.5)
-  }
+points(v, col = "#3DAEB2", type = "l", lwd = 3)
  else {
    stop(paste("Input object of class",
               class(ali)[1],
               "can't be handled by this function."))
  }
-  for (i in 1:length(strings)) {
+# You can set a threshold and use rle() to define ranges of values that fall
-    # output FASTA header
+# above and below the threshold, and thus approximate domain boundaries:
-    cat(paste(">",
+thrsh <- 30
-              names(strings)[i],
+(highScoringRanges <- rle(v > thrsh))
-              "\n",
+(idx <- c(1, cumsum(highScoringRanges$lengths)))
-              sep=""))
+for (i in seq_along(highScoringRanges$lengths)) {
-    # output the sequence block by block ...
+  if (highScoringRanges$values[i] == TRUE) { # If this range is above threshold,
-    nLine <- 1
+    rect(idx[i], thrsh, idx[i+1], max(v),    # ... draw a rectangle
-    from <- 1
+         col = "#205C5E33")                  # ... with a transparent color.
-    while (from < nchar(strings[i])) {
+    cat(sprintf("Possible domain from %d to %d\n", idx[i], idx[i+1]))
      to <- from + blockSize - 1
      cat(paste(substr(strings[i], from, to), "\n", sep=""))
      from <- to + 1
    }
    cat("\n") # output an empty line
  }
  if (writeToFile) {
    sink()  # Done. Close the diversion.
  }
 }
-# confirm that the function works
+# Getting this right requires a bit of fiddling with the window radius and
-writeMFA(seqSet)
+# threshold (experiment with that a bit), but once we are satisfied, we can use
-writeMFA(msa1)
+# the boundaries to print the MSA alignments separately for domains.
 # Unfortunately the msa package provides no native way to extract blocks of the
 # alignment for further processing, but your .utilities file loads a function to
 # write alignment objects and sequence sets to .aln formatted output. Have a
 # look:
-# We could use this function to write the raw and aligned sequences to file like
+writeALN
 # so:
-# writeMFA(seqSet, file = "APSES_proteins.mfa")
+# Print out the aligned blocks
-# writeMFA(msa1, file = "APSES_proteins_muscle.mfa")
+for (i in seq_along(highScoringRanges$lengths)) {
-
+  if (highScoringRanges$values[i] == TRUE) { # If this range is above threshold,
-# ...  but since we don't actually need the data on file now, just copy the code
+    cat(sprintf("\n\nPossible domain from %d to %d\n", idx[i], idx[i+1]))
-# that would create the msa to your myCode file so you can quickly reproduce it.
+    writeALN(msaM, range = c(idx[i], idx[i+1]))
-
+  }
-# == Task:
+}
 # Print the output of print(msa1) on a sheet of paper and bring it to
 # class for the next quiz.
 # That'll do.
-# = 1 Tasks
+# =    4  Comparing MSAs  ======================================================
 # Let's compare the results of different alignment algorithms. We computed a
 # vector of scores above, and we can compare that for different alignment
 # algorithms. This is not trivial, so we'll need to look at that data in
 # different ways and explore it. But first, let's get more alignments to compare
 # with.
 # ==   4.1  Importing an alignment to msa  =====================================
 # We computed a T-Coffee alignment at the EBI. msa has no native import function
 # so we need to improvise, and it's a bit of a pain to do - but a good
 # illustration of startegies to convert data into any kind of object:
 #   -  read an .aln file
 #   -  adjust the sequence names
 #   -  convert to msaAAMultipleAlignment object
 # ===  4.1.1  importing an .aln file                   
 # The seqinr package has a function to read CLUSTAL W formatted .aln files ...
 if (! require(seqinr, quietly=TRUE)) {
  install.packages(seqinr)
  library(seqinr)
 }
 # Package information:
 #  library(help=seqinr)       # basic information
 #  browseVignettes("seqinr")  # available vignettes
 #  data(package = "seqinr")   # available datasets
 # read the donwloaded file
 tmp <- read.alignment("msaT.aln", format = "clustal")
 # read.alignment() returns a list. $seq is a list of strings, one for each
 # complete alignment. However, they are converted to lower case.
 x <- toupper(unlist(tmp$seq)) # get the sequences, uppercase
 # $nam contains the names.
 (names(x) <- tmp$nam)
 #  Note that the names in the file are refseq IDs, we need to replace the
 #  RefSeqIDs with the database names:
 for (i in seq_along(x)) {
  # find the index of the RefSeqID
  id <- gsub("\\..*$", "", names(x)[i])     # fetch the name, drop the version
  sel <- which(myDB$protein$RefSeqID == id) # get the index
  names(x)[i] <- myDB$protein$name[sel]     # get the name
 }
 # ===  4.1.2  Creating an MsaAAMultipleAlignment object
 # MsaAAMultipleAlignment objects are S4 objects that contain AAStringSet objects
 # in their @unmasked slot, and a few additional items. Rather then build the
 # object from scratch, we copy an axisting object, and overwrite the dta in its
 # slots with what we need. Our goal is pragmatic, we want an object that msa's
 # functions will accept as input.
 # First: convert our named char vector into an AAstringSet
 x <- AAStringSet(x)
 # Then: create a new MsaAAMultipleAlignment S4 object. The msa package has
 # defined what such an object should look like, with the SetClass() function. To
 # create a new object, simply use the new() function, define the class that the
 # object would have, and fill the slots with something that has the right type.
 # How do we know the right type and the slot names? Easy! We just use
 # str(<object>) to get the information.
 str(msaM)
 msaT <- new("MsaAAMultipleAlignment", # create new MsaAAMultipleAlignment object
            unmasked = x,             # "unmasked" slot takes an AASringSet
            version = "T-Coffee",     # "version" slot takes a string
            params = list(),          # "params" takes a list(), we leave the
                                      #   list empty, but we could add the
                                      #   alignment parameters that we used at
                                      #   the EBI here.
            call = "imported from T-coffee alignment") # also a string
 str(msaT)
 msaT # Now we have fabricated an msaAAMultipleAlignment object, and we can
     # use the msa package functions on it
 msaTScores <- msaConservationScore(msaT, substitutionMatrix = BLOSUM62)
 # ==   4.2  More alignments  ===================================================
 # Next, we calculate alignments with msa's two other alignment options:
 # CLUSTAL Omega
 (msaO <- msaClustalOmega( MBP1set, order = "aligned"))
 msaOScores <- msaConservationScore(msaO, substitutionMatrix = BLOSUM62)
 # CLUSTAL W
 (msaW <- msaClustalW( MBP1set, order = "aligned"))
 msaWScores <- msaConservationScore(msaW, substitutionMatrix = BLOSUM62)
 # ==   4.3  Computing comparison metrics  ======================================
 # Ready to compare.
 # ... sum of alignment scores of alignment divided by sum of alignment scores
 #  of reference alignment (arbitrarily using CLUSTAL W as reference)
 sRef <- sum(msaWScores)
 sum(msaWScores) / sRef  # CLUSTAl W
 sum(msaOScores) / sRef  # CLUSTAL O
 sum(msaTScores) / sRef  # T-COFFEE
 sum(msaMScores) / sRef  # MUSCLE
 # ... mean alignment scores (higher is better)
 mean(msaWScores)  # CLUSTAl W
 mean(msaOScores)  # CLUSTAL O
 mean(msaTScores)  # T-COFFEE
 mean(msaMScores)  # MUSCLE
 # total number of gaps (lower is better)
 countGaps <- function(ali) {
  x <- paste0(as.character(ali), collapse = "")
  aa <- nchar(gsub("-", "", x))
  return(nchar(x) - aa)
 }
 countGaps(msaW)   # CLUSTAl W
 countGaps(msaO)   # CLUSTAL O
 countGaps(msaT)   # T-COFFEE
 countGaps(msaM)   # MUSCLE
 # number of indels in alignment (lower is less fragmented)
 countIndels <- function(ali) {
  x <- paste0(as.character(ali), collapse = "")  # collapse into single string
  x <- unlist(strsplit(x, ""))                   # split into characters
  x <- x == "-"                                  # convert into boolean
  x <- rle(x)                                    # calculate rle
  # every run of TRUE is one indel event
  return(sum(x$values))
 }
 countIndels(msaW)   # CLUSTAl W
 countIndels(msaO)   # CLUSTAL O
 countIndels(msaT)   # T-COFFEE
 countIndels(msaM)   # MUSCLE
 # Let's look at the distribution of alignment scores:
 boxplot(list(CLUSTAL.W = msaWScores,
             CLUSTAL.O = msaOScores,
             T.COFFEE  = msaTScores,
             MUSCLE    = msaMScores),
        col = c("#7D556488", "#74628F88", "#5E78A188", "#3DAEB288"))
 # CLUSTAL W and CLUSTAL O don't look all that different. T-Coffee tends to have
 # a tighter distribution with less negative scores. Muscle has a slightly higher
 # mean and generally higher scores.
 # Boxplots are convenient, but don't give us much detail about the shape of the
 # distribution. For that, we need histograms, or density plots.
 plot(density(msaWScores),
     type = "l",
     col = "#7D5564",
     lwd = 1.5,
     ylim = c(0, (max(density(msaWScores)$y) * 1.3)),
     main = "Comparing MSA algorithms",
     xlab = "Alignment Score",
     ylab = "Density")
 points(density(msaOScores),
       type = "l",
       lwd = 1.5,
       col = "#74628F")
 points(density(msaTScores),
       type = "l",
       lwd = 1.5,
       col = "#5E78A1")
 points(density(msaMScores),
       type = "l",
       lwd = 1.5,
       col = "#3DAEB2")
 legend("topright",
       legend = c("MUSCLE", "T-COFFEE", "CLUSTAL O", "CLUSTAL W"),
       col = c("#3DAEB2", "#5E78A1", "#74628F", "#7D5564"),
       lwd = 2,
       cex = 0.7,
       bty = "n")
 # The desnity plots confirm in more detail that CLUSTAL W misses some of the
 # higher-scoring possibilities, that wherever CLUSTAL O is bad, it is quite bad,
 # that T-COFFEE has fewer poorly scoring columns but misses some of the better
 # scoring possibilities, and that MUSCLE appears to do best overall.
 # Can we attribute these differences to sections of the alignment in which the
 # algorithms did better or worse? Let's plot the scores cumulatively. The
 # alignments have different lengths, so we plot the scores on the respective
 # fraction of the alignement length.
 plot(seq(0, 1, length.out = length(msaWScores)), # x- axis: fraction of length
     cumsum(msaWScores),
     type = "l",
     col = "#7D5564",
     lwd = 1.5,
     ylim = c(0, max(cumsum(msaMScores))),
     main = "Comparing MSA algorithms",
     xlab = "Alignment Position",
     ylab = "Cumulative Alignment Score")
 points(seq(0, 1, length.out = length(msaOScores)), # x- axis: fraction of length
       cumsum(msaOScores),
       type = "l",
       lwd = 1.5,
       col = "#74628F")
 points(seq(0, 1, length.out = length(msaTScores)), # x- axis: fraction of length
       cumsum(msaTScores),
       type = "l",
       lwd = 1.5,
       col = "#5E78A1")
 points(seq(0, 1, length.out = length(msaMScores)), # x- axis: fraction of length
       cumsum(msaMScores),
       type = "l",
       lwd = 1.5,
       col = "#3DAEB2")
 legend("bottomright",
       legend = c("MUSCLE", "T-COFFEE", "CLUSTAL O", "CLUSTAL W"),
       col = c("#3DAEB2", "#5E78A1", "#74628F", "#7D5564"),
       lwd = 2,
       cex = 0.7,
       bty = "n")
 # Your alignment is going to be differnte from mine, due to the inclusion of
 # MYSPE - but what I see is that MUSCLE gives the highest score overall, and
 # achieves this with fewer indels then most, and the lowest number of gaps of
 # all algorithms.
 # To actually compare regions of alignments, we need to align alignments.
 # =    5  Profile-Profile alignments  ==========================================
 # Profile-profile alignments are the most powerful way to pick up distant
 # relationships between sequence families. The can be used, for example to build
 # a profile from structural superpositions of crystal structures, and then map a
 # MSA alignment onto those features. Here we will use profile-profile comparison
 # to compare two MSAs with each other, by aligning them. The algorithm is
 # provided by the DECIPHER package.
 if (! require(DECIPHER, quietly=TRUE)) {
  if (! exists("biocLite")) {
    source("https://bioconductor.org/biocLite.R")
  }
  biocLite("DECIPHER")
  library(DECIPHER)
 }
 # Package information:
 #  library(help = DECIPHER)       # basic information
 #  browseVignettes("DECIPHER")    # available vignettes
 #  data(package = "DECIPHER")     # available datasets
 # AlignProfiles() takes two AAStringSets as input. Let's compare the MUSCLE and
 # CLUSTAL W alignments: we could do this directly ...
 AlignProfiles(msaW@unmasked, msaM@unmasked)
 # But for ease of comparison, we'll reorder the sequences of the CLUSTAL W
 # alignment into the same order as the MUSCLE alignment:
 m <- as.character(msaM)
 w <- as.character(msaW)[names(m)]
 (ppa <- AlignProfiles(AAStringSet(w), AAStringSet(m)))
 # Conveniently, AlignProfiles() returns an AAStringSet, so we can use our
 # writeALN function to show it. Here is an arbitrary block, from somewhere in
 # the middle of the alignment:
 writeALN(ppa, range = c(751, 810))
 # If you look at this for a while, you can begin to figue out where the
 # algorithms made different decisions about where to insert gaps, and how to
 # move segments of sequence around. But matters become more clear if we
 # post-process this profile-profile alignment. Let's replace all hyphens that
 # the pp-alignment has inserted with blanks, and let's add a separator line down
 # the middle between the two alignments.
 x <- unlist(strsplit(as.character(ppa), ""))  # unlist all
 dim(x) <- c(width(ppa)[1], length(ppa))       # form into matrix by columns
 x <- t(x)                                     # transpose the matrix
 (a1 <- 1:(nrow(x)/2))                         # rows of alignment 1
 (a2 <- ((nrow(x)/2) + 1):nrow(x))             # rows of alignment 2
 for (i in 1:ncol(x)) {
  if (all(x[a1, i] == "-")) { x[a1, i] <- " " }  # blank hyphens that shift
  if (all(x[a2, i] == "-")) { x[a2, i] <- " " }  # original alignment blocks
 }
 # collapse the matrix into strings
 ppa2 <- character()
 for (i in 1:nrow(x)) {
  ppa2[i] <- paste0(x[i, ], collapse = "")
  names(ppa2)[i] <- names(ppa)[i]
 }
 # add a separator line
 x <- paste0(rep("-", width(ppa)[1]), collapse = "")
 ppa2 <- c(ppa2[a1], x, ppa2[a2])
 # inspect
 writeALN(ppa2, range = c(800, 960))
 # Again, go explore, and get a sense of what's going on. You may find that
 # CLUSTAL W has a tendency to insert short gaps all over the alignment, whereas
 # MUSCLE keeps indels in blocks. CLUSTAL's behaviour is exactly what I would
 # expect from an algorithm that builds alignments from pairwise local
 # alignments, without global refinement.
 # =    6  Sequence Logos  ======================================================
 # To visualize the information that we can get about structure and function with
 # an MSA, we'll calculate a sequence logo of the Mbp1 recognition helix - the
 # part of the structure that inserts into the major groove of the DNA and
 # provides sequence specificity to the DNA binding of this transcription factor.
 # Helix-B in Mbp1 with four residues upstream and downstream spans the sequence
 #     43-ANFAKAKRTRILEKEVLKE-61
 # ==   6.1  Subsetting an alignment by motif  ==================================
 # Finding the location of such an substring in an alignment is not entirely
 # trivial, because the alignment might have produced indels in that sequence.
 # Our strategy can be:
 #   -  fetch the sequence from the alignment
 #   -  remove all hyphens
 #   -  find the range where the target sequence matches
 #   -  count how many characters in all there are in the aligned sequence, up
 #        to the start and end of the match
 #   -  these numbers define the range of the match in the alignment.
 x <- as.character(msaM)["MBP1_SACCE"]
 xAA <- gsub("-", "", x)
 motif <- "ANFAKAKRTRILEKEVLKE"
 (m <- regexpr(motif, xAA))  # matched in position 43, with a length of 19
 (motifStart <- as.numeric(m))
 (motifEnd <- attr(m, "match.length") + motifStart - 1)
 # To count characters, we split the string into single characters ...
 x <- unlist(strsplit(x, ""))
 # ... convert this into a boolean, which is true if the character is not
 # a hyphen ...
 x <- x != "-"
 # ... cast this into a numeric, which turns TRUE into 1 and FALSE into 0 ...
 x <- as.numeric(x)
 # ... and sum up the cumulative sum.
 x <- cumsum(x)
 # Now we can find where the 43'd and 61'st characters are located in the
 # alignment string ...
 (aliStart <- which(x == motifStart)[1])   # get the first hit if there are more
 (aliEnd   <- which(x == motifEnd)[1])
 # ... and subset the alignment
 (motifAli <- subseq(msaM@unmasked, start = aliStart, end = aliEnd))
 # ==   6.2  Plot a Sequence Logo  ==============================================
 # There are now several good options to plot sequence logos in R, these include
 # dagLogo, DiffLogo, Logolas, and motifStack. For our example we will use
 # ggseqlogo written by by Omar Waghi, a former UofT BCB student who is now at
 # the EBI.
 if (! require(ggseqlogo, quietly=TRUE)) {
  install.packages(("ggseqlogo"))
  library(ggseqlogo)
 }
 # Package information:
 #  library(help=ggseqlogo)       # basic information
 #  browseVignettes("ggseqlogo")  # available vignettes
 #  data(package = "ggseqlogo")   # available datasets
 ggseqlogo(as.character(motifAli))
--- a/scripts/ABC-writeALN.R
+++ b/scripts/ABC-writeALN.R
@@ -0,0 +1,135 @@
 # ABC-writeALN.R
 #
 # ToDo:    calculate consensus line
 #          append sequence numbers
 # Notes:
 #
 # ==============================================================================
 writeALN <- function(ali,
                     range,
                     note = "",
                     myCon = stdout(),
                     blockWidth = 60) {
  # Purpose:
  #     Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or
  #     a file in multi-FASTA format.
  # Version: 2.0
  # Date:    2017 10
  # Author:  Boris Steipe
  #
  # Parameters:
  #     ali             MsaAAMultipleAlignment or AAStringSet or character
  #                       vector.
  #     range      num  a two-integer vector of start and end positions if
  #                       only a range of the MSA should be written, e.g.
  #                       a domain. Defaults to the full alignment length.
  #     note       chr  a vector of character that is appended to the name
  #                       of a sequence in the FASTA header. Recycling of
  #                       shorter vectors applies, thus a vector of length one
  #                       is added to all headers.
  #     myCon           a connection (cf. the con argument for writeLines).
  #                       Defaults to stdout()
  #     blockWidth int  width of sequence block. Default 80 characters.
  # Value:
  #     NA   the function is invoked for its side effect of printing an
  #          alignment to stdout() or file.
  blockWidth <- as.integer(blockWidth)
  if (is.na(blockWidth)) {
    stop("PANIC: parameter \"blockWidth\" must be numeric.")
  }
  if (blockWidth < 1) {
    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
  }
  if (blockWidth > 60) {
    stop("PANIC: \"blockWidth\" for CLUSTAL format can't be greater than 60.")
  }
  # Extract the raw data from the objects depending on their respective class
  # and put it into a named vector of strings.
  # Extract XStringSet from MsaXMultipleAlignment ...
  if (class(ali) == "MsaAAMultipleAlignment" |
      class(ali) == "MsaDNAMultipleAlignment" |
      class(ali) == "MsaRNAMultipleAlignment") {
      ali <- ali@unmasked
  }
  # Process XStringSet
  if (class(ali) == "AAStringSet" |
      class(ali) == "DNAStringSet" |
      class(ali) == "RNAStringSet") {
    sSet <- as.character(ali) # we use as.character(), not toString() thus
                              # we don't _have_ to load Biostrings
  } else if (class(ali) == "character") {
    sSet <- ali
  } else {
    stop(paste("Input object of class",
               class(ali),
               "can't be handled by this function."))
  }
  if (missing(range)) {
    range <- 1
    range[2] <- max(nchar(sSet))
  } else {
    range <- as.integer(range)
    if(length(range) != 2 ||
       any(is.na(range)) ||
       range[1] > range[2] ||
       range[1] < 1) {
      stop("PANIC: \"range\" parameter must contain valid start and end index.")
    }
  }
  # Right-pad any sequence with "-" that is shorter than ranges[2]
    for (i in seq_along(sSet)) {
      if (nchar(sSet[i]) < range[2]) {
        sSet[i] <- paste0(sSet[i],
                          paste0(rep("-", range[2] - nchar(sSet[i])),
                                 collapse = ""))
      }
    }
  # Right-pad sequence names
  sNames <- names(sSet)
  len <- max(nchar(sNames)) + 2 # longest name plus two spaces
  for (i in seq_along(sNames)) {
    sNames[i] <- paste0(sNames[i],
                      paste0(rep(" ", len - nchar(sNames[i])),
                             collapse = ""))
  }
  # Process each sequence
  txt <- paste0("CLUSTAL W format. ", note)
  txt[2] <- ""
  iStarts <- seq(range[1], range[2], by = blockWidth)
  iEnds <- c((iStarts[-1] - 1), range[2])
  for (i in seq_along(iStarts)) {
    for (j in seq_along(sSet)) {
      txt <- c(txt,
               paste0(sNames[j], substring(sSet[j], iStarts[i], iEnds[i])))
    }
    txt <- c(txt, "")  # append a blank consenus line
    txt <- c(txt, "")  # append a separator line
  }
  writeLines(txt, con= myCon)
 }
 # ====  TESTS  =================================================================
 # Enter your function tests here...
 if (FALSE) {
  # test ...
 }
 # [END]
--- a/scripts/ABC-writeMFA.R
+++ b/scripts/ABC-writeMFA.R
@@ -0,0 +1,115 @@
 # ABC-writeMFA.R
 #
 # ToDo:
 # Notes:
 #
 # ==============================================================================
 writeMFA <- function(ali,
                     range,
                     note = "",
                     myCon = stdout(),
                     blockWidth = 80) {
  # Purpose:
  #     Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or
  #     a file in multi-FASTA format.
  # Version: 2.0
  # Date:    2017  10
  # Author:  Boris Steipe
  #
  # Parameters:
  #     ali             MsaAAMultipleAlignment or AAStringSet or character
  #                       vector
  #     range      num  a two-integer vector of start and end positions if
  #                       only a range of the MSA should be written, e.g.
  #                       a domain. Defaults to the full sequence length.
  #     note       chr  a vector of character that is appended to the name
  #                       of a sequence in the FASTA header. Recycling of
  #                       shorter vectors applies, thus a vector of length one
  #                       is added to all headers.
  #     myCon           a connection (cf. the con argument for writeLines).
  #                       Defaults to stdout()
  #     blockWidth int  width of sequence block. Default 80 characters.
  # Value:
  #     NA   the function is invoked for its side effect of printing an
  #          alignment to stdout() or file.
  blockWidth <- as.integer(blockWidth)
  if (is.na(blockWidth)) {
    stop("PANIC: parameter \"blockWidth\" must be numeric.")
  }
  if (blockWidth < 1){
    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
  }
  # Extract the raw data from the objects depending on their respective class
  # and put it into a named vector of strings.
  # Extract XStringSet from MsaXMultipleAlignment ...
  if (class(ali) == "MsaAAMultipleAlignment" |
      class(ali) == "MsaDNAMultipleAlignment" |
      class(ali) == "MsaRNAMultipleAlignment") {
      ali <- ali@unmasked
  }
  # Process XStringSet
  if (class(ali) == "AAStringSet" |
      class(ali) == "DNAStringSet" |
      class(ali) == "RNAStringSet") {
    sSet <- as.character(ali) # we use as.character(), not toString() thus
                              # we don't _have_ to load Biostrings
  } else if (class(ali) == "character") {
    sSet <- ali
  } else {
    stop(paste("Input object of class",
               class(ali),
               "can't be handled by this function."))
  }
  if (missing(range)) {
    range <- 1
    range[2] <- max(nchar(sSet))
  } else {
    range <- as.integer(range)
    if(length(range) != 2 ||
       any(is.na(range)) ||
       range[1] > range[2] ||
       range[1] < 1) {
      stop("PANIC: \"range\" parameter must contain valid start and end index.")
    }
  }
  # Process each sequence
  txt <- character()
  headers <- paste(names(sSet), note)
  for (i in seq_along(sSet)) {
    # output FASTA header
    txt <- c(txt, sprintf(">%s", headers[i]))
    # output the sequence in blocks of blockWidth per line ...
    iStarts <- seq(range[1], range[2], by = blockWidth)
    iEnds <- c((iStarts[-1] - 1), range[2])
    thisSeq <- substring(sSet[i], iStarts, iEnds)  # collect all blocks
    thisSeq <- thisSeq[! nchar(thisSeq) == 0]      # drop empty blocks
    txt <- c(txt, thisSeq)
    txt <- c(txt, "")  # append an empty line for readability
  }
  writeLines(txt, con= myCon)
 }
 # ====  TESTS  =================================================================
 # Enter your function tests here...
 if (FALSE) {
  # test ...
 }
 # [END]