bch441-work-abc-units/BIN-PHYLO-Data_preparation.R

236 lines
7.4 KiB
R
Raw Normal View History

2017-09-12 20:09:20 +00:00
# BIN-PHYLO-Data_preparation.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-PHYLO-Data_preparation unit.
#
2017-11-01 13:44:24 +00:00
# Version: 1.0
2017-09-12 20:09:20 +00:00
#
2017-11-02 03:30:01 +00:00
# Date: 2017 10 31
2017-09-12 20:09:20 +00:00
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
2017-11-01 13:44:24 +00:00
# 1.0 First 2017 version
2017-09-12 20:09:20 +00:00
# 0.1 First code copied from 2016 material.
2017-11-01 13:44:24 +00:00
#
2017-09-12 20:09:20 +00:00
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
2017-11-01 13:44:24 +00:00
#
2017-09-12 20:09:20 +00:00
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
2017-11-01 13:44:24 +00:00
#
2017-09-12 20:09:20 +00:00
# ==============================================================================
2017-11-02 03:30:01 +00:00
#TOC> ==========================================================================
2017-11-12 09:37:53 +00:00
#TOC>
2017-11-02 03:30:01 +00:00
#TOC> Section Title Line
#TOC> ---------------------------------------------------
#TOC> 1 Preparations 41
#TOC> 2 Fetching sequences 78
#TOC> 3 Multiple Sequence Alignment 119
2017-11-20 23:59:54 +00:00
#TOC> 4 Reviewing and Editing Alignments 138
#TOC> 4.1 Masking workflow 154
2017-11-12 09:37:53 +00:00
#TOC>
2017-11-02 03:30:01 +00:00
#TOC> ==========================================================================
2017-11-01 13:44:24 +00:00
# = 1 Preparations ========================================================
2017-09-12 20:09:20 +00:00
2017-11-01 13:44:24 +00:00
# You need to reload your protein database, including changes that might have
# been made to the reference files. If you have worked with the prerequiste
# units, you should have a script named "makeProteinDB.R" that will create the
# myDB object with a protein and feature database. Ask for advice if not.
source("makeProteinDB.R")
2017-09-12 20:09:20 +00:00
2017-11-01 13:44:24 +00:00
# Load packages we need
2017-09-12 20:09:20 +00:00
2017-11-01 13:44:24 +00:00
if (! require(Biostrings, quietly=TRUE)) {
if (! exists("biocLite")) {
source("https://bioconductor.org/biocLite.R")
}
biocLite("Biostrings")
library(Biostrings)
}
# Package information:
# library(help = Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets
2017-09-12 20:09:20 +00:00
2017-11-01 13:44:24 +00:00
if (! require(msa, quietly=TRUE)) {
if (! exists("biocLite")) {
source("https://bioconductor.org/biocLite.R")
}
biocLite("msa")
library(msa)
}
# Package information:
2017-11-12 09:37:53 +00:00
# library(help = msa) # basic information
2017-11-01 13:44:24 +00:00
# browseVignettes("msa") # available vignettes
# data(package = "msa") # available datasets
2017-09-12 20:09:20 +00:00
2017-11-02 03:30:01 +00:00
# = 2 Fetching sequences ==================================================
2017-11-01 13:44:24 +00:00
2017-09-12 20:09:20 +00:00
2017-11-01 13:44:24 +00:00
# myDB contains the ten Mbp1 orthologues from the reference species and the Mbp1
# RBM for MYSPE. We will construct a phylogenetic tree from the proteins' APSES
# domains. You have annotated their ranges as a feature. The following code
# retrieves the sequences from myDB. You have seen similar code in other units.
2017-09-12 20:09:20 +00:00
2017-11-01 13:44:24 +00:00
sel <- grep("^MBP1_", myDB$protein$name)
(proNames <- myDB$protein$name[sel])
(proIDs <- myDB$protein$ID[sel])
(sel <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
(fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID %in% proIDs & # %in% !
myDB$annotation$featureID == sel]) # == !
# Why?
APSI <- character(length(fanIDs))
for (i in seq_along(fanIDs)) {
sel <- myDB$annotation$ID == fanIDs[i] # get the feature row index
proID <- myDB$annotation$proteinID[sel] # get its protein ID
start <- myDB$annotation$start[sel] # get start ...
end <- myDB$annotation$end[sel] # ... and end
sel <- myDB$protein$ID == proID # get the protein row index ...
# ... and the sequence
APSI[i] <- substring(myDB$protein$sequence[sel], start, end)
names(APSI)[i] <- (myDB$protein$name[sel])
}
head(APSI)
2017-09-12 20:09:20 +00:00
# Let's add the E.coli Kila-N domain sequence as an outgroup, for rooting our
2017-11-01 13:44:24 +00:00
# phylogenetic tree (see the unit's Wiki page for details on the sequence).
2017-09-12 20:09:20 +00:00
2017-11-01 13:44:24 +00:00
APSI <- c(APSI,
"IDGEIIHLRAKDGYINATSMCRTAGKLLSDYTRLKTTQEFFDELSRDMGIPISELIQSFKGGRPENQGTWVHPDIAINLAQ")
names(APSI)[length(APSI)] <- "KILA_ESCCO"
tail(APSI)
2017-09-12 20:09:20 +00:00
2017-11-02 03:30:01 +00:00
# = 3 Multiple Sequence Alignment =========================================
2017-09-12 20:09:20 +00:00
# This vector of sequences with named elements fulfills the requirements to be
# imported as a Biostrings object - an AAStringSet - which we need as input for
# the MSA algorithms in Biostrings.
#
2017-11-01 13:44:24 +00:00
APSESSet <- AAStringSet(APSI)
APSESMsa <- msaMuscle(APSESSet, order = "aligned")
2017-09-12 20:09:20 +00:00
2017-11-20 23:59:54 +00:00
# Nb. msaMuscle() sometimes fails - reproducibly, but I am not sure why. If
# that happens in your case, just use msaClustalOmega() instead.
2017-09-12 20:09:20 +00:00
# inspect the alignment.
2017-11-01 13:44:24 +00:00
writeALN(APSESMsa)
2017-09-12 20:09:20 +00:00
# What do you think? Is this a good alignment for phylogenetic inference?
2017-11-02 03:30:01 +00:00
# = 4 Reviewing and Editing Alignments ====================================
2017-09-12 20:09:20 +00:00
2017-11-01 13:44:24 +00:00
# Head back to the Wiki page for this unit and read up on the background
# first.
2017-09-12 20:09:20 +00:00
# Let's mask out all columns that have observations for
# less than 1/3 of the sequences in the dataset. This
# means they have more than round(nrow(msaSet) * (2/3))
# hyphens in a column.
#
# We take all sequences, split them into single
# characters, and put them into a matrix. Then we
# go through the matrix, column by column and decide
# whether we want to include that column.
2017-11-02 03:30:01 +00:00
# == 4.1 Masking workflow ==================================================
2017-09-12 20:09:20 +00:00
# get the length of the alignment
2017-11-01 13:44:24 +00:00
(lenAli <- APSESMsa@unmasked@ranges@width[1])
2017-09-12 20:09:20 +00:00
# initialize a matrix that can hold all characters
# individually
2017-11-01 13:44:24 +00:00
msaMatrix <- matrix(character(nrow(APSESMsa) * lenAli),
2017-09-12 20:09:20 +00:00
ncol = lenAli)
# assign the correct rownames
2017-11-01 13:44:24 +00:00
rownames(msaMatrix) <- APSESMsa@unmasked@ranges@NAMES
for (i in 1:nrow(APSESMsa)) {
msaMatrix[i, ] <- unlist(strsplit(as.character(APSESMsa@unmasked[i]), ""))
2017-09-12 20:09:20 +00:00
}
# inspect the result
2017-11-01 13:44:24 +00:00
msaMatrix[1:7, 1:14]
2017-09-12 20:09:20 +00:00
2017-11-01 13:44:24 +00:00
# Now let's make a logical vector with an element for each column that selects
# which columns should be masked out.
2017-09-12 20:09:20 +00:00
2017-11-01 13:44:24 +00:00
# The number of hyphens in a column is easy to count. Consider:
2017-09-12 20:09:20 +00:00
2017-11-01 13:44:24 +00:00
msaMatrix[ , 20]
msaMatrix[ , 20] == "-"
sum(msaMatrix[ , 20] == "-")
2017-09-12 20:09:20 +00:00
2017-11-01 13:44:24 +00:00
# Thus filling our logical vector is simple:
2017-09-12 20:09:20 +00:00
2017-11-01 13:44:24 +00:00
# initialize a mask
colMask <- logical(ncol(msaMatrix))
2017-09-12 20:09:20 +00:00
# define the threshold for rejecting a column
2017-11-01 13:44:24 +00:00
limit <- round(nrow(APSESMsa) * (2/3))
2017-09-12 20:09:20 +00:00
# iterate over all columns, and write TRUE if there are less-or-equal to "limit"
# hyphens, FALSE if there are more - i.e. TRUE columns will be used fr analysis
# and FALSE columns will be rejected.
2017-11-01 13:44:24 +00:00
for (i in 1:ncol(msaMatrix)) {
count <- sum(msaMatrix[ , i] == "-")
colMask[i] <- count <= limit # TRUE if less-or-equal to limit, FALSE if not
2017-09-12 20:09:20 +00:00
}
# Inspect the mask
2017-09-12 20:09:20 +00:00
colMask
# How many positions are being kept?
2017-09-12 20:09:20 +00:00
sum(colMask)
cat(sprintf("We are masking %4.2f %% of alignment columns.\n",
100 * (1 - (sum(colMask) / length(colMask)))))
# Next, we use colMask to remove the masked columns from the matrix
# in one step:
maskedMatrix <- msaMatrix[ , colMask]
# check:
ncol(maskedMatrix)
2017-11-01 13:44:24 +00:00
# ... then collapse each row of single characters back into a string ...
APSESphyloSet <- character()
2017-09-12 20:09:20 +00:00
for (i in 1:nrow(maskedMatrix)) {
2017-11-01 13:44:24 +00:00
APSESphyloSet[i] <- paste(maskedMatrix[i, ], collapse="")
2017-09-12 20:09:20 +00:00
}
2017-11-01 13:44:24 +00:00
names(APSESphyloSet) <- rownames(maskedMatrix)
2017-09-12 20:09:20 +00:00
# inspect ...
2017-11-01 13:44:24 +00:00
writeALN(APSESphyloSet)
2017-09-12 20:09:20 +00:00
2017-11-01 13:44:24 +00:00
# As you see, we have removed a three residue insertion from MBP1_NEUCR, and
# several indels from the KILA_ESCCO outgroup sequence.
2017-09-12 20:09:20 +00:00
# We save the aligned, masked domains to a file in multi-FASTA format.
2017-11-01 13:44:24 +00:00
writeMFA(APSESphyloSet, myCon = "APSESphyloSet.mfa")
2017-09-12 20:09:20 +00:00
# [END]