bch441-work-abc-units/BIN-PHYLO-Data_preparation.R
2017-10-03 23:38:48 -04:00

251 lines
7.4 KiB
R

# BIN-PHYLO-Data_preparation.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-PHYLO-Data_preparation unit.
#
# Version: 0.1
#
# Date: 2017 08 28
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 0.1 First code copied from 2016 material.
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
# ==============================================================================
# = 1 ___Section___
# ==============================================================================
# PART ONE: Choosing sequences
# ==============================================================================
# Start by loading libraries. You already have the packages installed.
library(Biostrings)
library(msa)
library(stringr)
# What is the latest version of myDB that you have saved?
list.files(pattern = "myDB.*")
# ... load it (probably myDB.05.RData - if not, change the code below).
load("myDB.05.RData")
# The database contains the ten Mbp1 orthologues from the reference species
# and the Mbp1 RBM for MYSPE.
#
# We will construct a phylogenetic tree from the proteins' APSES domains.
# You have annotated their ranges as a feature.
# Collect APSES domain sequences from your database. The function
# dbGetFeatureSequence() retrieves the sequence that is annotated for a feature
# from its start and end coordinates. Try:
dbGetFeatureSequence(myDB, "MBP1_SACCE", "APSES fold")
# Lets put all APSES sequences into a vector:
APSESnames <- myDB$protein$name[grep("^MBP1_", myDB$protein$name)]
APSES <- character(length(APSESnames))
for (i in 1:length(APSESnames)) {
APSES[i] <- dbGetFeatureSequence(myDB, APSESnames[i], "APSES fold")
}
# Let's name the rows of our vector with the BiCode part of the protein name.
# This is important so we can keep track of which sequence is which. We use the
# gsub() funcion to substitute "" for "MBP1_", thereby deleting this prefix.
names(APSES) <- gsub("^MBP1_", "", APSESnames)
# inspect the result: what do you expect? Is this what you expect?
head(APSES)
# Let's add the E.coli Kila-N domain sequence as an outgroup, for rooting our
# phylogegetic tree (see the Assignment Course Wiki page for details on the
# sequence).
APSES[length(APSES) + 1] <-
"IDGEIIHLRAKDGYINATSMCRTAGKLLSDYTRLKTTQEFFDELSRDMGIPISELIQSFKGGRPENQGTWVHPDIAINLAQ"
names(APSES)[length(APSES)] <- "ESCCO"
# ==============================================================================
# PART TWO: Multiple sequence alignment
# ==============================================================================
# This vector of sequences with named elements fulfills the requirements to be
# imported as a Biostrings object - an AAStringSet - which we need as input for
# the MSA algorithms in Biostrings.
#
APSESSeqSet <- AAStringSet(APSES)
APSESMsaSet <- msaMuscle(APSESSeqSet, order = "aligned")
# inspect the alignment.
writeSeqSet(APSESMsaSet, format = "ali")
# What do you think? Is this a good alignment for phylogenetic inference?
# ==============================================================================
# PART THREE: reviewing and editing alignments
# ==============================================================================
# Head back to the assignment 7 course wiki page and read up on the background
# first.
#
# Let's mask out all columns that have observations for
# less than 1/3 of the sequences in the dataset. This
# means they have more than round(nrow(msaSet) * (2/3))
# hyphens in a column.
#
# We take all sequences, split them into single
# characters, and put them into a matrix. Then we
# go through the matrix, column by column and decide
# whether we want to include that column.
# Step 1. Go through this by hand...
# get the length of the alignment
lenAli <- APSESMsaSet@unmasked@ranges@width[1]
# initialize a matrix that can hold all characters
# individually
msaMatrix <- matrix(character(nrow(APSESMsaSet) * lenAli),
ncol = lenAli)
# assign the correct rownames
rownames(msaMatrix) <- APSESMsaSet@unmasked@ranges@NAMES
for (i in 1:nrow(APSESMsaSet)) {
seq <- as.character(APSESMsaSet@unmasked[i])
msaMatrix[i, ] <- unlist(strsplit(seq, ""))
}
# inspect the result
msaMatrix[1:5, ]
# Now let's make a logical vector with an element
# for each column that selects which columns should
# be masked out.
# To count the number of elements in a vector, R has
# the table() function. For example ...
table(msaMatrix[ , 1])
table(msaMatrix[ , 10])
table(msaMatrix[ , 20])
table(msaMatrix[ , 30])
# Since the return value of table() is a named vector, where
# the name is the element that was counted in each slot,
# we can simply get the counts for hyphens from the
# return value of table(). We don't even need to assign
# the result to an intermediate variable, but we
# can attach the selection via square brackets,
# i.e.: ["-"], directly to the function call:
table(msaMatrix[ , 1])["-"]
# ... to get the number of hyphens. And we can compare
# whether it is eg. > 4.
table(msaMatrix[ , 1])["-"] > 4
# Thus filling our logical vector is really simple:
# initialize the mask
colMask <- logical(lenAli)
# define the threshold for rejecting a column
limit <- round(nrow(APSESMsaSet) * (2/3))
# iterate over all columns, and write TRUE if there are less-or-equal to "limit"
# hyphens, FALSE if there are more.
for (i in 1:lenAli) {
count <- table(msaMatrix[ , i])["-"]
if (is.na(count)) { # No hyphen
count <- 0
}
colMask[i] <- count <= limit
}
# inspect the mask
colMask
# How many positions were masked? R has a simple trick
# to count the number of TRUE and FALSE in a logical
# vector. If a logical TRUE or FALSE is converted into
# a number, it becomes 1 or 0 respectively. If we use
# the sum() function on the vector, the conversion is
# done implicitly. Thus ...
sum(colMask)
# ... gives the number of TRUE elements.
cat(sprintf("We are masking %4.2f %% of alignment columns.\n",
100 * (1 - (sum(colMask) / length(colMask)))))
# Next, we use colMask to remove the masked columns from the matrix
# in one step:
maskedMatrix <- msaMatrix[ , colMask]
# check:
ncol(maskedMatrix)
# ... then collapse each row back into a sequence ...
apsMaskedSeq <- character()
for (i in 1:nrow(maskedMatrix)) {
apsMaskedSeq[i] <- paste(maskedMatrix[i, ], collapse="")
}
names(apsMaskedSeq) <- rownames(maskedMatrix)
# ... and read it back into an AAStringSet object
apsMaskedSet <- AAStringSet(apsMaskedSeq)
# inspect ...
writeSeqSet(apsMaskedSet, format = "ali")
# Step 2. Turn this code into a function...
# Even though the procedure is simple, doing this more than once is tedious and
# prone to errors. I have assembled the steps we just went through into a
# function maskSet() and put it into the utilities.R file, from where it has
# been loaded when you started this sesssion.
maskSet
# Check that the function gives identical results
# to what we did before by hand:
identical(apsMaskedSet, maskSet(APSESMsaSet))
# The result must be TRUE. If it's not TRUE you have
# an error somewhere.
# We save the aligned, masked domains to a file in multi-FASTA format.
writeSeqSet(maskSet(APSESMsaSet), file = "APSES.mfa", format = "mfa")
# = 1 Tasks
# [END]