bch441-work-abc-units/BIN-ALI-MSA.R
2017-09-12 16:09:20 -04:00

193 lines
5.4 KiB
R

# BIN-ALI-MSA.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-MSA unit.
#
# Version: 0.1
#
# Date: 2017 08 28
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 0.1 First code copied from 2016 material.
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
# ==============================================================================
# = 1 Multiple sequence alignment
# We will compute a multiple sequence alignment using the "muscle" algorithm
# which is available throught the Bioconductor msa package.
if (!require(Biostrings, quietly=TRUE)) {
source("https://bioconductor.org/biocLite.R")
biocLite("Biostrings")
}
data(BLOSUM62)
if (!require(msa, quietly=TRUE)) {
source("https://bioconductor.org/biocLite.R")
biocLite("msa")
library(msa)
}
# If the installation asks you if you want to update older packages, always
# answer "a" for "all" unless you have an important reason not to. But if the
# installer asks you whether you want to compile from source, answer"n" for "no"
# unless you need new functionality in a particular bleeding-edge version of a
# package.
help(package = "msa")
# We have used biostrings' AAString() function before; for multiple
# alignments we need an AAStringSet(). We can simply feed it
# a vector of sequences:
# Let's make a shorthand for selection of Mbp1 proteins from our database: a
# vector of indices for all of the rows in the protein table that contain
# proteins whose name begins with MBP1.
iMbp1Proteins <- grep("^MBP1_", myDB$protein$name)
# Next we create an AAStringSet for all of those proteins
seqSet <- AAStringSet(myDB$protein$sequence[iMbp1Proteins])
# ... and align (which is very simple) ...
msaMuscle(
seqSet,
order = "aligned")
# ... but to help us make sense of the alignment we need to add the names for
# the sequences. names for a seqSet object are held in the ranges slot...
seqSet@ranges@NAMES <- myDB$protein$name[iMbp1Proteins]
seqSet
# This little step of adding names is actually really
# very important. That's because the aligned sequences
# are meaningless strings of characters unless we can
# easily identify their biological relationships.
# Creating MSAs that are only identified by e.g. their
# RefSeq ids is a type of cargo-cult bioinformatics
# that we encounter a lot. The point of the alignment
# is not to create it, but to interpret it!
# Let's align again, and assign the result ...
msa1 <- msaMuscle(
seqSet,
order = "aligned")
msa1
# ... or to see the whole thing (cf. ?MsaAAMultipleAlignment ... print method):
print(msa1, show=c("alignment", "complete"), showConsensus=FALSE)
# You see that the alignment object has sequence strings with hyphens as
# indel-characters. The names are printed to the console. And you also see that
# the order has not been preserved, but the most similar sequences are now
# adjacent to each other.
# Lets write the alignment to one of the common file formats: a multi-fasta
# file.
# Why oh why does the msa package not have a function to do this !!!
# Like, seriously ...
# ==== writeMFA
# Here is our own function to write a AAStringSet object to a multi-FASTA file.
writeMFA <- function(ali, file, blockSize = 50) {
if (missing(ali)) {
stop("Input object missing from arguments with no default.")
}
if (missing(file)) {
writeToFile <- FALSE
}
else {
writeToFile <- TRUE
sink(file) # divert output to file
}
# Extract the raw data from the objects depending on
# their respective class and put this
# into a named vector of strings.
if (class(ali)[1] == "MsaAAMultipleAlignment") {
strings <- character(nrow(ali))
for (i in 1:nrow(ali)) {
strings[i] <- as.character(ali@unmasked[i])
names(strings)[i] <- ali@unmasked@ranges@NAMES[i]
}
}
else if (class(ali)[1] == "AAStringSet") {
strings <- character(length(ali))
for (i in 1:length(ali)) {
strings[i] <- as.character(ali[i])
names(strings)[i] <- ali@ranges@NAMES[i]
}
}
else {
stop(paste("Input object of class",
class(ali)[1],
"can't be handled by this function."))
}
for (i in 1:length(strings)) {
# output FASTA header
cat(paste(">",
names(strings)[i],
"\n",
sep=""))
# output the sequence block by block ...
nLine <- 1
from <- 1
while (from < nchar(strings[i])) {
to <- from + blockSize - 1
cat(paste(substr(strings[i], from, to), "\n", sep=""))
from <- to + 1
}
cat("\n") # output an empty line
}
if (writeToFile) {
sink() # Done. Close the diversion.
}
}
# confirm that the function works
writeMFA(seqSet)
writeMFA(msa1)
# We could use this function to write the raw and aligned sequences to file like
# so:
# writeMFA(seqSet, file = "APSES_proteins.mfa")
# writeMFA(msa1, file = "APSES_proteins_muscle.mfa")
# ... but since we don't actually need the data on file now, just copy the code
# that would create the msa to your myCode file so you can quickly reproduce it.
# == Task:
# Print the output of print(msa1) on a sheet of paper and bring it to
# class for the next quiz.
# That'll do.
# = 1 Tasks
# [END]