# BIN-PHYLO-Data_preparation.R
#
# Purpose:  A Bioinformatics Course:
#              R code accompanying the BIN-PHYLO-Data_preparation unit.
#
# Version:  0.1
#
# Date:     2017  08  28
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
#           0.1    First code copied from 2016 material.

#
# TODO:
#
#
# == DO NOT SIMPLY  source()  THIS FILE! =======================================

# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...

# ==============================================================================

# = 1 ___Section___

# ==============================================================================
#        PART ONE: Choosing sequences
# ==============================================================================

# Start by loading libraries. You already have the packages installed.
library(Biostrings)
library(msa)
library(stringr)

# What is the latest version of myDB that you have saved?
list.files(pattern = "myDB.*")

# ... load it (probably myDB.05.RData - if not, change the code below).
load("myDB.05.RData")

# The database contains the ten Mbp1 orthologues from the reference species
# and the Mbp1 RBM for MYSPE.
#
# We will construct a phylogenetic tree from the proteins' APSES domains.
# You have annotated their ranges as a feature.

# Collect APSES domain sequences from your database. The function
# dbGetFeatureSequence() retrieves the sequence that is annotated for a feature
# from its start and end coordinates. Try:

dbGetFeatureSequence(myDB, "MBP1_SACCE", "APSES fold")

# Lets put all APSES sequences into a vector:
APSESnames <- myDB$protein$name[grep("^MBP1_", myDB$protein$name)]
APSES <- character(length(APSESnames))

for (i in 1:length(APSESnames)) {
  APSES[i] <- dbGetFeatureSequence(myDB, APSESnames[i], "APSES fold")
}

# Let's name the rows of our vector with the BiCode part of the protein name.
# This is important so we can keep track of which sequence is which. We use the
# gsub() funcion to substitute "" for "MBP1_", thereby deleting this prefix.
names(APSES) <- gsub("^MBP1_", "", APSESnames)

# inspect the result: what do you expect? Is this what you expect?
head(APSES)

# Let's add the E.coli Kila-N domain sequence as an outgroup, for rooting our
# phylogegetic tree (see the Assignment Course Wiki page for details on the
# sequence).

APSES[length(APSES) + 1] <-
  "IDGEIIHLRAKDGYINATSMCRTAGKLLSDYTRLKTTQEFFDELSRDMGIPISELIQSFKGGRPENQGTWVHPDIAINLAQ"
names(APSES)[length(APSES)] <- "ESCCO"


# ==============================================================================
#        PART TWO: Multiple sequence alignment
# ==============================================================================

# This vector of sequences with named elements fulfills the requirements to be
# imported as a Biostrings object - an AAStringSet - which we need as input for
# the MSA algorithms in Biostrings.
#

APSESSeqSet <- AAStringSet(APSES)

APSESMsaSet <- msaMuscle(APSESSeqSet, order = "aligned")

# inspect the alignment.
writeSeqSet(APSESMsaSet, format = "ali")


# What do you think? Is this a good alignment for phylogenetic inference?

# ==============================================================================
#        PART THREE: reviewing and editing alignments
# ==============================================================================

# Head back to the assignment 7 course wiki page and read up on the background
# first.
#


# Let's mask out all columns that have observations for
# less than 1/3 of the sequences in the dataset. This
# means they have more than round(nrow(msaSet) * (2/3))
# hyphens in a column.
#
# We take all sequences, split them into single
# characters, and put them into a matrix. Then we
# go through the matrix, column by column and decide
# whether we want to include that column.

# Step 1. Go through this by hand...

# get the length of the alignment
lenAli <- APSESMsaSet@unmasked@ranges@width[1]

# initialize a matrix that can hold all characters
# individually
msaMatrix <- matrix(character(nrow(APSESMsaSet) * lenAli),
                    ncol = lenAli)

# assign the correct rownames
rownames(msaMatrix) <- APSESMsaSet@unmasked@ranges@NAMES
for (i in 1:nrow(APSESMsaSet)) {
  seq <- as.character(APSESMsaSet@unmasked[i])
  msaMatrix[i, ] <- unlist(strsplit(seq, ""))
}

# inspect the result
msaMatrix[1:5, ]

# Now let's make a logical vector with an element
# for each column that selects which columns should
# be masked out.

# To count the number of elements in a vector, R has
# the table() function. For example ...
table(msaMatrix[ , 1])
table(msaMatrix[ , 10])
table(msaMatrix[ , 20])
table(msaMatrix[ , 30])


# Since the return value of table() is a named vector, where
# the name is the element that was counted in each slot,
# we can simply get the counts for hyphens from the
# return value of table(). We don't even need to assign
# the result to an intermediate variable, but we
# can attach the selection via square brackets,
# i.e.: ["-"],  directly to the function call:
table(msaMatrix[ , 1])["-"]

# ... to get the number of hyphens. And we can compare
# whether it is eg. > 4.
table(msaMatrix[ , 1])["-"] > 4

# Thus filling our logical vector is really simple:

# initialize the mask
colMask <- logical(lenAli)

# define the threshold for rejecting a column
limit <- round(nrow(APSESMsaSet) * (2/3))

# iterate over all columns, and write TRUE if there are less-or-equal to "limit"
# hyphens, FALSE if there are more.
for (i in 1:lenAli) {
  count <- table(msaMatrix[ , i])["-"]
  if (is.na(count)) { # No hyphen
    count <- 0
  }
  colMask[i] <- count <= limit
}

# inspect the mask
colMask

# How many positions were masked? R has a simple trick
# to count the number of TRUE and FALSE in a logical
# vector. If a logical TRUE or FALSE is converted into
# a number, it becomes 1 or 0 respectively. If we use
# the sum() function on the vector, the conversion is
# done implicitly. Thus ...
sum(colMask)

# ... gives the number of TRUE elements.

cat(sprintf("We are masking %4.2f %% of alignment columns.\n",
            100 * (1 - (sum(colMask) / length(colMask)))))


# Next, we use colMask to remove the masked columns from the matrix
# in one step:
maskedMatrix <- msaMatrix[ , colMask]

# check:
ncol(maskedMatrix)


# ... then collapse each row back into a sequence ...

apsMaskedSeq <- character()
for (i in 1:nrow(maskedMatrix)) {
  apsMaskedSeq[i] <- paste(maskedMatrix[i, ], collapse="")
}
names(apsMaskedSeq) <- rownames(maskedMatrix)

# ... and read it back into an AAStringSet object

apsMaskedSet <- AAStringSet(apsMaskedSeq)

# inspect ...
writeSeqSet(apsMaskedSet, format = "ali")


# Step 2. Turn this code into a function...

# Even though the procedure is simple, doing this more than once is tedious and
# prone to errors. I have assembled the steps we just went through into a
# function maskSet() and put it into the utilities.R file, from where it has
# been loaded when you started this sesssion.

maskSet

# Check that the function gives identical results
# to what we did before by hand:
identical(apsMaskedSet, maskSet(APSESMsaSet))

# The result must be TRUE. If it's not TRUE you have
# an error somewhere.

# We save the aligned, masked domains to a file in multi-FASTA format.
writeSeqSet(maskSet(APSESMsaSet), file = "APSES.mfa",   format = "mfa")


# = 1 Tasks


# [END]