bch441-work-abc-units/BIN-ALI-MSA.R

# BIN-ALI-MSA.R
#
# Purpose:  A Bioinformatics Course:
#              R code accompanying the BIN-ALI-MSA unit.
#
# Version:  0.1
#
# Date:     2017  08  28
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
#           0.1    First code copied from 2016 material.

#
# TODO:
#
#
# == DO NOT SIMPLY  source()  THIS FILE! =======================================

# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...

# ==============================================================================

# = 1 Multiple sequence alignment

# We will compute a multiple sequence alignment using the "muscle" algorithm
# which is available throught the Bioconductor msa package.

if (!require(Biostrings, quietly=TRUE)) {
  source("https://bioconductor.org/biocLite.R")
  biocLite("Biostrings")
}
data(BLOSUM62)

if (!require(msa, quietly=TRUE)) {
  source("https://bioconductor.org/biocLite.R")
  biocLite("msa")
  library(msa)
}

# If the installation asks you if you want to update older packages, always
# answer "a" for "all" unless you have an important reason not to. But if the
# installer asks you whether you want to compile from source, answer"n" for "no"
# unless you need new functionality in a particular bleeding-edge version of a
# package.

help(package = "msa")

# We have used biostrings' AAString() function before; for multiple
# alignments we need an AAStringSet(). We can simply feed it
# a vector of sequences:

# Let's make a shorthand for selection of Mbp1 proteins from our database: a
# vector of indices for all of the rows in the protein table that contain
# proteins whose name begins with MBP1.
iMbp1Proteins <- grep("^MBP1_", myDB$protein$name)

# Next we create an AAStringSet for all of those proteins
seqSet <- AAStringSet(myDB$protein$sequence[iMbp1Proteins])

# ... and align (which is very simple) ...
msaMuscle(
  seqSet,
  order = "aligned")

# ... but to help us make sense of the alignment we need to add the names for
# the sequences. names for a seqSet object are held in the ranges slot...

seqSet@ranges@NAMES <- myDB$protein$name[iMbp1Proteins]

seqSet

# This little step of adding names is actually really
# very important. That's because the aligned sequences
# are meaningless strings of characters unless we can
# easily identify their biological relationships.
# Creating MSAs that are only identified by e.g. their
# RefSeq ids is a type of cargo-cult bioinformatics
# that we encounter a lot. The point of the alignment
# is not to create it, but to interpret it!


# Let's align again, and assign the result ...
msa1 <-  msaMuscle(
  seqSet,
  order = "aligned")

msa1

# ... or to see the whole thing (cf. ?MsaAAMultipleAlignment ... print method):
print(msa1, show=c("alignment", "complete"), showConsensus=FALSE)


# You see that the alignment object has sequence strings with hyphens as
# indel-characters. The names are printed to the console. And you also see that
# the order has not been preserved, but the most similar sequences are now
# adjacent to each other.

# Lets write the alignment to one of the common file formats: a multi-fasta
# file.

# Why oh why does the msa package not have a function to do this !!!
# Like, seriously ...

# ==== writeMFA

# Here is our own function to write a AAStringSet object to a multi-FASTA file.
writeMFA <- function(ali, file, blockSize = 50) {
  if (missing(ali)) {
    stop("Input object missing from arguments with no default.")
  }
  if (missing(file)) {
    writeToFile <- FALSE
  }
  else {
    writeToFile <- TRUE
    sink(file) # divert output to file
  }
  # Extract the raw data from the objects depending on
  # their respective class and put this
  # into a named vector of strings.
  if (class(ali)[1] == "MsaAAMultipleAlignment") {
    strings <- character(nrow(ali))
    for (i in 1:nrow(ali)) {
      strings[i] <- as.character(ali@unmasked[i])
      names(strings)[i] <- ali@unmasked@ranges@NAMES[i]
    }
  }
  else if (class(ali)[1] == "AAStringSet") {
    strings <- character(length(ali))
    for (i in 1:length(ali)) {
      strings[i] <- as.character(ali[i])
      names(strings)[i] <- ali@ranges@NAMES[i]
    }
  }
  else {
    stop(paste("Input object of class",
               class(ali)[1],
               "can't be handled by this function."))
  }


  for (i in 1:length(strings)) {
    # output FASTA header
    cat(paste(">",
              names(strings)[i],
              "\n",
              sep=""))
    # output the sequence block by block ...
    nLine <- 1
    from <- 1
    while (from < nchar(strings[i])) {
      to <- from + blockSize - 1
      cat(paste(substr(strings[i], from, to), "\n", sep=""))
      from <- to + 1
    }
    cat("\n") # output an empty line
  }
  if (writeToFile) {
    sink()  # Done. Close the diversion.
  }
}

# confirm that the function works
writeMFA(seqSet)
writeMFA(msa1)

# We could use this function to write the raw and aligned sequences to file like
# so:

# writeMFA(seqSet, file = "APSES_proteins.mfa")
# writeMFA(msa1, file = "APSES_proteins_muscle.mfa")

# ...  but since we don't actually need the data on file now, just copy the code
# that would create the msa to your myCode file so you can quickly reproduce it.

# == Task:
# Print the output of print(msa1) on a sheet of paper and bring it to
# class for the next quiz.

# That'll do.


# = 1 Tasks


# [END]