251 lines
7.4 KiB
R
251 lines
7.4 KiB
R
# BIN-PHYLO-Data_preparation.R
|
|
#
|
|
# Purpose: A Bioinformatics Course:
|
|
# R code accompanying the BIN-PHYLO-Data_preparation unit.
|
|
#
|
|
# Version: 0.1
|
|
#
|
|
# Date: 2017 08 28
|
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
|
#
|
|
# Versions:
|
|
# 0.1 First code copied from 2016 material.
|
|
|
|
#
|
|
# TODO:
|
|
#
|
|
#
|
|
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
|
|
|
# If there are portions you don't understand, use R's help system, Google for an
|
|
# answer, or ask your instructor. Don't continue if you don't understand what's
|
|
# going on. That's not how it works ...
|
|
|
|
# ==============================================================================
|
|
|
|
# = 1 ___Section___
|
|
|
|
# ==============================================================================
|
|
# PART ONE: Choosing sequences
|
|
# ==============================================================================
|
|
|
|
# Start by loading libraries. You already have the packages installed.
|
|
library(Biostrings)
|
|
library(msa)
|
|
library(stringr)
|
|
|
|
# What is the latest version of myDB that you have saved?
|
|
list.files(pattern = "myDB.*")
|
|
|
|
# ... load it (probably myDB.05.RData - if not, change the code below).
|
|
load("myDB.05.RData")
|
|
|
|
# The database contains the ten Mbp1 orthologues from the reference species
|
|
# and the Mbp1 RBM for MYSPE.
|
|
#
|
|
# We will construct a phylogenetic tree from the proteins' APSES domains.
|
|
# You have annotated their ranges as a feature.
|
|
|
|
# Collect APSES domain sequences from your database. The function
|
|
# dbGetFeatureSequence() retrieves the sequence that is annotated for a feature
|
|
# from its start and end coordinates. Try:
|
|
|
|
dbGetFeatureSequence(myDB, "MBP1_SACCE", "APSES fold")
|
|
|
|
# Lets put all APSES sequences into a vector:
|
|
APSESnames <- myDB$protein$name[grep("^MBP1_", myDB$protein$name)]
|
|
APSES <- character(length(APSESnames))
|
|
|
|
for (i in 1:length(APSESnames)) {
|
|
APSES[i] <- dbGetFeatureSequence(myDB, APSESnames[i], "APSES fold")
|
|
}
|
|
|
|
# Let's name the rows of our vector with the BiCode part of the protein name.
|
|
# This is important so we can keep track of which sequence is which. We use the
|
|
# gsub() funcion to substitute "" for "MBP1_", thereby deleting this prefix.
|
|
names(APSES) <- gsub("^MBP1_", "", APSESnames)
|
|
|
|
# inspect the result: what do you expect? Is this what you expect?
|
|
head(APSES)
|
|
|
|
# Let's add the E.coli Kila-N domain sequence as an outgroup, for rooting our
|
|
# phylogegetic tree (see the Assignment Course Wiki page for details on the
|
|
# sequence).
|
|
|
|
APSES[length(APSES) + 1] <-
|
|
"IDGEIIHLRAKDGYINATSMCRTAGKLLSDYTRLKTTQEFFDELSRDMGIPISELIQSFKGGRPENQGTWVHPDIAINLAQ"
|
|
names(APSES)[length(APSES)] <- "ESCCO"
|
|
|
|
|
|
# ==============================================================================
|
|
# PART TWO: Multiple sequence alignment
|
|
# ==============================================================================
|
|
|
|
# This vector of sequences with named elements fulfills the requirements to be
|
|
# imported as a Biostrings object - an AAStringSet - which we need as input for
|
|
# the MSA algorithms in Biostrings.
|
|
#
|
|
|
|
APSESSeqSet <- AAStringSet(APSES)
|
|
|
|
APSESMsaSet <- msaMuscle(APSESSeqSet, order = "aligned")
|
|
|
|
# inspect the alignment.
|
|
writeSeqSet(APSESMsaSet, format = "ali")
|
|
|
|
|
|
# What do you think? Is this a good alignment for phylogenetic inference?
|
|
|
|
# ==============================================================================
|
|
# PART THREE: reviewing and editing alignments
|
|
# ==============================================================================
|
|
|
|
# Head back to the assignment 7 course wiki page and read up on the background
|
|
# first.
|
|
#
|
|
|
|
|
|
|
|
# Let's mask out all columns that have observations for
|
|
# less than 1/3 of the sequences in the dataset. This
|
|
# means they have more than round(nrow(msaSet) * (2/3))
|
|
# hyphens in a column.
|
|
#
|
|
# We take all sequences, split them into single
|
|
# characters, and put them into a matrix. Then we
|
|
# go through the matrix, column by column and decide
|
|
# whether we want to include that column.
|
|
|
|
# Step 1. Go through this by hand...
|
|
|
|
# get the length of the alignment
|
|
lenAli <- APSESMsaSet@unmasked@ranges@width[1]
|
|
|
|
# initialize a matrix that can hold all characters
|
|
# individually
|
|
msaMatrix <- matrix(character(nrow(APSESMsaSet) * lenAli),
|
|
ncol = lenAli)
|
|
|
|
# assign the correct rownames
|
|
rownames(msaMatrix) <- APSESMsaSet@unmasked@ranges@NAMES
|
|
for (i in 1:nrow(APSESMsaSet)) {
|
|
seq <- as.character(APSESMsaSet@unmasked[i])
|
|
msaMatrix[i, ] <- unlist(strsplit(seq, ""))
|
|
}
|
|
|
|
# inspect the result
|
|
msaMatrix[1:5, ]
|
|
|
|
# Now let's make a logical vector with an element
|
|
# for each column that selects which columns should
|
|
# be masked out.
|
|
|
|
# To count the number of elements in a vector, R has
|
|
# the table() function. For example ...
|
|
table(msaMatrix[ , 1])
|
|
table(msaMatrix[ , 10])
|
|
table(msaMatrix[ , 20])
|
|
table(msaMatrix[ , 30])
|
|
|
|
|
|
# Since the return value of table() is a named vector, where
|
|
# the name is the element that was counted in each slot,
|
|
# we can simply get the counts for hyphens from the
|
|
# return value of table(). We don't even need to assign
|
|
# the result to an intermediate variable, but we
|
|
# can attach the selection via square brackets,
|
|
# i.e.: ["-"], directly to the function call:
|
|
table(msaMatrix[ , 1])["-"]
|
|
|
|
# ... to get the number of hyphens. And we can compare
|
|
# whether it is eg. > 4.
|
|
table(msaMatrix[ , 1])["-"] > 4
|
|
|
|
# Thus filling our logical vector is really simple:
|
|
|
|
# initialize the mask
|
|
colMask <- logical(lenAli)
|
|
|
|
# define the threshold for rejecting a column
|
|
limit <- round(nrow(APSESMsaSet) * (2/3))
|
|
|
|
# iterate over all columns, and write TRUE if there are less-or-equal to "limit"
|
|
# hyphens, FALSE if there are more.
|
|
for (i in 1:lenAli) {
|
|
count <- table(msaMatrix[ , i])["-"]
|
|
if (is.na(count)) { # No hyphen
|
|
count <- 0
|
|
}
|
|
colMask[i] <- count <= limit
|
|
}
|
|
|
|
# inspect the mask
|
|
colMask
|
|
|
|
# How many positions were masked? R has a simple trick
|
|
# to count the number of TRUE and FALSE in a logical
|
|
# vector. If a logical TRUE or FALSE is converted into
|
|
# a number, it becomes 1 or 0 respectively. If we use
|
|
# the sum() function on the vector, the conversion is
|
|
# done implicitly. Thus ...
|
|
sum(colMask)
|
|
|
|
# ... gives the number of TRUE elements.
|
|
|
|
cat(sprintf("We are masking %4.2f %% of alignment columns.\n",
|
|
100 * (1 - (sum(colMask) / length(colMask)))))
|
|
|
|
|
|
# Next, we use colMask to remove the masked columns from the matrix
|
|
# in one step:
|
|
maskedMatrix <- msaMatrix[ , colMask]
|
|
|
|
# check:
|
|
ncol(maskedMatrix)
|
|
|
|
|
|
# ... then collapse each row back into a sequence ...
|
|
|
|
apsMaskedSeq <- character()
|
|
for (i in 1:nrow(maskedMatrix)) {
|
|
apsMaskedSeq[i] <- paste(maskedMatrix[i, ], collapse="")
|
|
}
|
|
names(apsMaskedSeq) <- rownames(maskedMatrix)
|
|
|
|
# ... and read it back into an AAStringSet object
|
|
|
|
apsMaskedSet <- AAStringSet(apsMaskedSeq)
|
|
|
|
# inspect ...
|
|
writeSeqSet(apsMaskedSet, format = "ali")
|
|
|
|
|
|
|
|
# Step 2. Turn this code into a function...
|
|
|
|
# Even though the procedure is simple, doing this more than once is tedious and
|
|
# prone to errors. I have assembled the steps we just went through into a
|
|
# function maskSet() and put it into the utilities.R file, from where it has
|
|
# been loaded when you started this sesssion.
|
|
|
|
maskSet
|
|
|
|
# Check that the function gives identical results
|
|
# to what we did before by hand:
|
|
identical(apsMaskedSet, maskSet(APSESMsaSet))
|
|
|
|
# The result must be TRUE. If it's not TRUE you have
|
|
# an error somewhere.
|
|
|
|
# We save the aligned, masked domains to a file in multi-FASTA format.
|
|
writeSeqSet(maskSet(APSESMsaSet), file = "APSES.mfa", format = "mfa")
|
|
|
|
|
|
|
|
# = 1 Tasks
|
|
|
|
|
|
|
|
|
|
# [END]
|