Updates, and add treedist()
This commit is contained in:
parent
0da30ec109
commit
1d6170f417
@ -5,7 +5,7 @@
|
||||
#
|
||||
# Version: 1.0
|
||||
#
|
||||
# Date: 2017 10. 31
|
||||
# Date: 2017 10 31
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
@ -24,6 +24,20 @@
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------------------
|
||||
#TOC> 1 Preparations 41
|
||||
#TOC> 2 Fetching sequences 78
|
||||
#TOC> 3 Multiple Sequence Alignment 119
|
||||
#TOC> 4 Reviewing and Editing Alignments 136
|
||||
#TOC> 4.1 Masking workflow 152
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Preparations ========================================================
|
||||
|
||||
|
||||
@ -61,18 +75,7 @@ if (! require(msa, quietly=TRUE)) {
|
||||
# data(package = "msa") # available datasets
|
||||
|
||||
|
||||
if (! require(stringr, quietly=TRUE)) {
|
||||
install.packages("stringr")
|
||||
library(stringr)
|
||||
}
|
||||
# Package information:
|
||||
# library(help=stringr) # basic information
|
||||
# browseVignettes("stringr") # available vignettes
|
||||
# data(package = "stringr") # available datasets
|
||||
|
||||
|
||||
|
||||
# = 1 Fetching sequences
|
||||
# = 2 Fetching sequences ==================================================
|
||||
|
||||
|
||||
# myDB contains the ten Mbp1 orthologues from the reference species and the Mbp1
|
||||
@ -113,7 +116,7 @@ names(APSI)[length(APSI)] <- "KILA_ESCCO"
|
||||
tail(APSI)
|
||||
|
||||
|
||||
# = 1 Multiple Sequence Alignment
|
||||
# = 3 Multiple Sequence Alignment =========================================
|
||||
|
||||
# This vector of sequences with named elements fulfills the requirements to be
|
||||
# imported as a Biostrings object - an AAStringSet - which we need as input for
|
||||
@ -130,7 +133,7 @@ writeALN(APSESMsa)
|
||||
# What do you think? Is this a good alignment for phylogenetic inference?
|
||||
|
||||
|
||||
# = 1 Reviewing and Editing Alignments
|
||||
# = 4 Reviewing and Editing Alignments ====================================
|
||||
|
||||
|
||||
# Head back to the Wiki page for this unit and read up on the background
|
||||
@ -146,7 +149,7 @@ writeALN(APSESMsa)
|
||||
# go through the matrix, column by column and decide
|
||||
# whether we want to include that column.
|
||||
|
||||
# = 1.1 Masking workflow
|
||||
# == 4.1 Masking workflow ==================================================
|
||||
|
||||
# get the length of the alignment
|
||||
(lenAli <- APSESMsa@unmasked@ranges@width[1])
|
||||
|
@ -24,7 +24,21 @@
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
# = 1 ___Section___
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -------------------------------------------
|
||||
#TOC> 1 ___Section___ 38
|
||||
#TOC> 2 Tree Analysis 77
|
||||
#TOC> 2.1 Rooting Trees 136
|
||||
#TOC> 2.2 Rotating Clades 182
|
||||
#TOC> 2.3 Computing tree distances 229
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 ___Section___ =======================================================
|
||||
|
||||
|
||||
if (!require(Rphylip, quietly=TRUE)) {
|
||||
@ -63,7 +77,7 @@ nodelabels(text=orgTree$node.label, cex=0.6, adj=0.2, bg="#D4F2DA")
|
||||
# species tree.
|
||||
|
||||
|
||||
# = 1 Tree Analysis
|
||||
# = 2 Tree Analysis =======================================================
|
||||
|
||||
|
||||
# 1.1 Visualizing your tree
|
||||
@ -122,7 +136,7 @@ Ntip(apsTree)
|
||||
# Finally, write the tree to console in Newick format
|
||||
write.tree(apsTree)
|
||||
|
||||
# = 1.1 Rooting Trees
|
||||
# == 2.1 Rooting Trees =====================================================
|
||||
|
||||
# In order to analyse the tree, it is helpful to root it first and reorder its
|
||||
# clades. Contrary to documentation, Rproml() returns an unrooted tree.
|
||||
@ -168,7 +182,7 @@ plot(apsTree, cex=0.7, root.edge=TRUE)
|
||||
nodelabels(text="MRCA", node=12, cex=0.5, adj=0.8, bg="#ff8866")
|
||||
|
||||
|
||||
# = 1.1 Rotating Clades
|
||||
# == 2.2 Rotating Clades ===================================================
|
||||
|
||||
# To interpret the tree, it is useful to rotate the clades so that they appear
|
||||
# in the order expected from the cladogram of species.
|
||||
@ -202,11 +216,104 @@ plot(rotateConstr(apsTree, apsTree$tip.label[nOrg:1]),
|
||||
add.scale.bar(length=0.5)
|
||||
layout(matrix(1), widths=1.0, heights=1.0)
|
||||
|
||||
# Study the two trees and consider their similarities and differences. What do
|
||||
# you expect? What do you find? Note that this is not a "mixed" gene tree yet,
|
||||
# since it contains only a single gene for the species we considered. All of the
|
||||
# branch points in this tree are speciation events.
|
||||
# Task: Study the two trees and consider their similarities and differences.
|
||||
# What do you expect? What do you find? Note that this is not a "mixed"
|
||||
# gene tree yet, since it contains only a single gene for the species
|
||||
# we considered. All of the branch points in this tree are speciation
|
||||
# events. Thus the gene tree should have the same topology as the
|
||||
# species tree. Does it? Are the differences important? How many
|
||||
# branches would you need to remove and reinsert elsewhere to get the
|
||||
# same topology as the species tree?
|
||||
|
||||
# In order to quantiofy how different these tow trees are, we need to compute
|
||||
# tree distances.
|
||||
|
||||
|
||||
# == 2.3 Computing tree distances ==========================================
|
||||
|
||||
|
||||
# Many superb phylogeny tools are contributed by the phangorn package.
|
||||
|
||||
if (!require(phangorn, quietly=TRUE)) {
|
||||
install.packages("phangorn")
|
||||
library(phangorn)
|
||||
}
|
||||
# Package information:
|
||||
# library(help = phangorn) # basic information
|
||||
# browseVignettes("phangorn") # available vignettes
|
||||
# data(package = "phangorn") # available datasets
|
||||
|
||||
# To compare two trees, they must have the same tip labels. We delete "MBP1_" or
|
||||
# "KILA_" from the existing tip labels in a copy of our APSES domain tree.
|
||||
apsTree2 <- apsTree
|
||||
apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label)
|
||||
|
||||
# phangorn provides several functions to compute tree-differences (and there
|
||||
# is a _whole_ lot of theory on how to compare trees). treedist() returns the
|
||||
# "symmetric difference"
|
||||
treedist(fungiTree, apsTree2, check.labels = TRUE)
|
||||
|
||||
# Numbers. What do they mean? How much more similar is our apsTree to the
|
||||
# (presumably) ground truth of fungiTree than a random tree would be?
|
||||
# The ape package (which was loaded with RPhylip) provides the function rtree()
|
||||
# to compute random trees.
|
||||
|
||||
rtree(n = length(apsTree2$tip.label), # number of tips
|
||||
rooted = TRUE, # we rooted the tree above,
|
||||
# and fungiTree is rooted anyway
|
||||
tip.label = apsTree2$tip.label, # use the apsTree2 labels
|
||||
br = NULL) # don't generate branch lengths since
|
||||
# fungiTree has none, so we can't
|
||||
# compare them anyway.
|
||||
|
||||
# Let's compute some random trees this way, calculate the distances to
|
||||
# fungiTree, and then compare the values we get for apsTree2:
|
||||
|
||||
set.seed(112358)
|
||||
N <- 10000 # takes about 15 seconds
|
||||
myTreeDistances <- matrix(numeric(N * 2), ncol = 2)
|
||||
colnames(myTreeDistances) <- c("symm", "path")
|
||||
|
||||
for (i in 1:N) {
|
||||
xTree <- rtree(n = length(apsTree2$tip.label),
|
||||
rooted = TRUE,
|
||||
tip.label = apsTree2$tip.label,
|
||||
br = NULL)
|
||||
myTreeDistances[i, ] <- treedist(fungiTree, xTree)
|
||||
}
|
||||
|
||||
table(myTreeDistances[, "symm"])
|
||||
|
||||
(symmObs <- treedist(fungiTree, apsTree2)[1])
|
||||
|
||||
# Random events less-or-equal to observation, divided by total number of
|
||||
# events gives us the empirical p-value.
|
||||
cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n",
|
||||
(sum(myTreeDistances[ , "symm"] <= symmObs) + 1) / (N + 1)))
|
||||
|
||||
hist(myTreeDistances[, "path"],
|
||||
col = "aliceblue",
|
||||
main = "Distances of random Trees to fungiTree")
|
||||
(pathObs <- treedist(fungiTree, apsTree2)[2])
|
||||
abline(v = pathObs, col = "chartreuse")
|
||||
|
||||
# Random events less-or-equal to observation, divided by total number of
|
||||
# events gives us the empirical p-value.
|
||||
cat(sprintf("\nEmpirical p-value for path diff. of observed tree is %1.4f\n",
|
||||
(sum(myTreeDistances[ , "path"] <= symmObs) + 1) / (N + 1)))
|
||||
|
||||
# Indeed, our apsTree is _very_ much more similar to the species tree than
|
||||
# we would expect by random chance.
|
||||
|
||||
# What do we gain from that analysis? Analyzing the tree we get from a single
|
||||
# gene of orthologous sequences is a positive control in our computational
|
||||
# experiment. If these genes are indeed orthologues, a correct tree-building
|
||||
# program ought to give us a tree that exactly matches the species tree.
|
||||
# Evaluating how far off we are from the known correct result gives us a way to
|
||||
# validate our workflow and our algorithm. If we can't get that right, we can't
|
||||
# expect to get "real" data right either. Employing such positive controls in
|
||||
# every computational experiment is essential for research. Not doing so is
|
||||
# Cargo Cult Bioinformatics.
|
||||
|
||||
|
||||
# [END]
|
||||
|
@ -14,7 +14,8 @@
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
# Add MrBayes
|
||||
# https://cran.r-project.org/web/packages/phangorn/vignettes/IntertwiningTreesAndNetworks.html
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
@ -25,8 +26,22 @@
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -------------------------------------------------------
|
||||
#TOC> 1 Calculating Trees 43
|
||||
#TOC> 1.1 PROMLPATH ... 64
|
||||
#TOC> 1.1.1 ... on the Mac 69
|
||||
#TOC> 1.1.2 ... on Windows 80
|
||||
#TOC> 1.1.3 ... on Linux 94
|
||||
#TOC> 1.1.4 Confirming PROMLPATH 99
|
||||
#TOC> 1.2 Building a maxiiimum likelihood tree 108
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
# = 1 Calculating Trees
|
||||
|
||||
# = 1 Calculating Trees ===================================================
|
||||
|
||||
|
||||
# Follow the instructions found at phylip's home on the Web to install. If you
|
||||
@ -47,11 +62,12 @@ if (!require(Rphylip, quietly=TRUE)) {
|
||||
# This will install RPhylip, as well as its dependency, the package "ape".
|
||||
|
||||
|
||||
|
||||
# == 1.1 PROMLPATH ... =====================================================
|
||||
# The next part may be tricky. You will need to figure out where
|
||||
# on your computer Phylip has been installed and define the path
|
||||
# to the proml program that calculates a maximum-likelihood tree.
|
||||
|
||||
# === 1.1.1 ... on the Mac
|
||||
# On the Mac, the standard installation places a phylip folder
|
||||
# in the /Applications directory. That folder contains all the
|
||||
# individual phylip programs as <name>.app files. These are not
|
||||
@ -62,6 +78,7 @@ if (!require(Rphylip, quietly=TRUE)) {
|
||||
# directly to that subdirectory to find the program it needs:
|
||||
# PROMLPATH <- "/Applications/phylip-3.695/exe/proml.app/Contents/MacOS"
|
||||
|
||||
# === 1.1.2 ... on Windows
|
||||
# On Windows you need to know where the rograms have been installed, and you
|
||||
# need to specify a path that is correct for the Windows OS. Find the folder
|
||||
# that is named "exe", and right-click to inspect its properties. The path
|
||||
@ -75,10 +92,12 @@ if (!require(Rphylip, quietly=TRUE)) {
|
||||
# I have heard that your path must not contain spaces, and it is prudent to
|
||||
# avoid other special characters as well.
|
||||
|
||||
# === 1.1.3 ... on Linux
|
||||
# If you are running Linux I trust you know what to do. It's probably
|
||||
# something like
|
||||
# PROMLPATH <- "/usr/local/phylip-3.695/bin"
|
||||
|
||||
# === 1.1.4 Confirming PROMLPATH
|
||||
# Confirm that the settings are right.
|
||||
PROMLPATH # returns the path
|
||||
list.dirs(PROMLPATH) # returns the directories in that path
|
||||
@ -87,6 +106,7 @@ list.files(PROMLPATH) # lists the files [1] "proml" "proml.command"
|
||||
# If "proml" is NOT among the files that the last command returns, you
|
||||
# can't continue. Ask on the mailing list for advice.
|
||||
|
||||
# == 1.2 Building a maxiiimum likelihood tree ==============================
|
||||
# Now read the mfa file you have saved in the BIB-PHYLO-Data_preparation unit,
|
||||
# as a "proseq" object with the read.protein() function of the RPhylip package:
|
||||
|
||||
@ -94,11 +114,16 @@ apsIn <- read.protein("APSESphyloSet.mfa")
|
||||
|
||||
# ... and you are ready to build a tree.
|
||||
|
||||
# There are many fast options in PHYLIP - we will use the most _accurate_ one
|
||||
# that it has: proml, a maximum-likelihood tree building program for protein
|
||||
# data.
|
||||
|
||||
# Building maximum-likelihood trees can eat as much computer time
|
||||
# as you can throw at it. Calculating a tree of 48 APSES domains
|
||||
# with default parameters of Rproml() runs for more than half a day
|
||||
# on my computer. But we have only twelve sequences here, so the
|
||||
# process will take us about 5 to 10 minutes.
|
||||
# process will take us about 5 to 10 minutes. Run this, and anjoy a good cup
|
||||
# of coffee while you are waiting.
|
||||
|
||||
apsTree <- Rproml(apsIn, path=PROMLPATH)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user