Updates, and add treedist()
This commit is contained in:
parent
0da30ec109
commit
1d6170f417
@ -5,7 +5,7 @@
|
|||||||
#
|
#
|
||||||
# Version: 1.0
|
# Version: 1.0
|
||||||
#
|
#
|
||||||
# Date: 2017 10. 31
|
# Date: 2017 10 31
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
@ -24,6 +24,20 @@
|
|||||||
#
|
#
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
#TOC> ==========================================================================
|
||||||
|
#TOC>
|
||||||
|
#TOC> Section Title Line
|
||||||
|
#TOC> ---------------------------------------------------
|
||||||
|
#TOC> 1 Preparations 41
|
||||||
|
#TOC> 2 Fetching sequences 78
|
||||||
|
#TOC> 3 Multiple Sequence Alignment 119
|
||||||
|
#TOC> 4 Reviewing and Editing Alignments 136
|
||||||
|
#TOC> 4.1 Masking workflow 152
|
||||||
|
#TOC>
|
||||||
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
# = 1 Preparations ========================================================
|
# = 1 Preparations ========================================================
|
||||||
|
|
||||||
|
|
||||||
@ -61,18 +75,7 @@ if (! require(msa, quietly=TRUE)) {
|
|||||||
# data(package = "msa") # available datasets
|
# data(package = "msa") # available datasets
|
||||||
|
|
||||||
|
|
||||||
if (! require(stringr, quietly=TRUE)) {
|
# = 2 Fetching sequences ==================================================
|
||||||
install.packages("stringr")
|
|
||||||
library(stringr)
|
|
||||||
}
|
|
||||||
# Package information:
|
|
||||||
# library(help=stringr) # basic information
|
|
||||||
# browseVignettes("stringr") # available vignettes
|
|
||||||
# data(package = "stringr") # available datasets
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# = 1 Fetching sequences
|
|
||||||
|
|
||||||
|
|
||||||
# myDB contains the ten Mbp1 orthologues from the reference species and the Mbp1
|
# myDB contains the ten Mbp1 orthologues from the reference species and the Mbp1
|
||||||
@ -113,7 +116,7 @@ names(APSI)[length(APSI)] <- "KILA_ESCCO"
|
|||||||
tail(APSI)
|
tail(APSI)
|
||||||
|
|
||||||
|
|
||||||
# = 1 Multiple Sequence Alignment
|
# = 3 Multiple Sequence Alignment =========================================
|
||||||
|
|
||||||
# This vector of sequences with named elements fulfills the requirements to be
|
# This vector of sequences with named elements fulfills the requirements to be
|
||||||
# imported as a Biostrings object - an AAStringSet - which we need as input for
|
# imported as a Biostrings object - an AAStringSet - which we need as input for
|
||||||
@ -130,7 +133,7 @@ writeALN(APSESMsa)
|
|||||||
# What do you think? Is this a good alignment for phylogenetic inference?
|
# What do you think? Is this a good alignment for phylogenetic inference?
|
||||||
|
|
||||||
|
|
||||||
# = 1 Reviewing and Editing Alignments
|
# = 4 Reviewing and Editing Alignments ====================================
|
||||||
|
|
||||||
|
|
||||||
# Head back to the Wiki page for this unit and read up on the background
|
# Head back to the Wiki page for this unit and read up on the background
|
||||||
@ -146,7 +149,7 @@ writeALN(APSESMsa)
|
|||||||
# go through the matrix, column by column and decide
|
# go through the matrix, column by column and decide
|
||||||
# whether we want to include that column.
|
# whether we want to include that column.
|
||||||
|
|
||||||
# = 1.1 Masking workflow
|
# == 4.1 Masking workflow ==================================================
|
||||||
|
|
||||||
# get the length of the alignment
|
# get the length of the alignment
|
||||||
(lenAli <- APSESMsa@unmasked@ranges@width[1])
|
(lenAli <- APSESMsa@unmasked@ranges@width[1])
|
||||||
|
@ -24,7 +24,21 @@
|
|||||||
#
|
#
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
|
|
||||||
# = 1 ___Section___
|
|
||||||
|
#TOC> ==========================================================================
|
||||||
|
#TOC>
|
||||||
|
#TOC> Section Title Line
|
||||||
|
#TOC> -------------------------------------------
|
||||||
|
#TOC> 1 ___Section___ 38
|
||||||
|
#TOC> 2 Tree Analysis 77
|
||||||
|
#TOC> 2.1 Rooting Trees 136
|
||||||
|
#TOC> 2.2 Rotating Clades 182
|
||||||
|
#TOC> 2.3 Computing tree distances 229
|
||||||
|
#TOC>
|
||||||
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
|
# = 1 ___Section___ =======================================================
|
||||||
|
|
||||||
|
|
||||||
if (!require(Rphylip, quietly=TRUE)) {
|
if (!require(Rphylip, quietly=TRUE)) {
|
||||||
@ -63,7 +77,7 @@ nodelabels(text=orgTree$node.label, cex=0.6, adj=0.2, bg="#D4F2DA")
|
|||||||
# species tree.
|
# species tree.
|
||||||
|
|
||||||
|
|
||||||
# = 1 Tree Analysis
|
# = 2 Tree Analysis =======================================================
|
||||||
|
|
||||||
|
|
||||||
# 1.1 Visualizing your tree
|
# 1.1 Visualizing your tree
|
||||||
@ -122,7 +136,7 @@ Ntip(apsTree)
|
|||||||
# Finally, write the tree to console in Newick format
|
# Finally, write the tree to console in Newick format
|
||||||
write.tree(apsTree)
|
write.tree(apsTree)
|
||||||
|
|
||||||
# = 1.1 Rooting Trees
|
# == 2.1 Rooting Trees =====================================================
|
||||||
|
|
||||||
# In order to analyse the tree, it is helpful to root it first and reorder its
|
# In order to analyse the tree, it is helpful to root it first and reorder its
|
||||||
# clades. Contrary to documentation, Rproml() returns an unrooted tree.
|
# clades. Contrary to documentation, Rproml() returns an unrooted tree.
|
||||||
@ -168,7 +182,7 @@ plot(apsTree, cex=0.7, root.edge=TRUE)
|
|||||||
nodelabels(text="MRCA", node=12, cex=0.5, adj=0.8, bg="#ff8866")
|
nodelabels(text="MRCA", node=12, cex=0.5, adj=0.8, bg="#ff8866")
|
||||||
|
|
||||||
|
|
||||||
# = 1.1 Rotating Clades
|
# == 2.2 Rotating Clades ===================================================
|
||||||
|
|
||||||
# To interpret the tree, it is useful to rotate the clades so that they appear
|
# To interpret the tree, it is useful to rotate the clades so that they appear
|
||||||
# in the order expected from the cladogram of species.
|
# in the order expected from the cladogram of species.
|
||||||
@ -202,11 +216,104 @@ plot(rotateConstr(apsTree, apsTree$tip.label[nOrg:1]),
|
|||||||
add.scale.bar(length=0.5)
|
add.scale.bar(length=0.5)
|
||||||
layout(matrix(1), widths=1.0, heights=1.0)
|
layout(matrix(1), widths=1.0, heights=1.0)
|
||||||
|
|
||||||
# Study the two trees and consider their similarities and differences. What do
|
# Task: Study the two trees and consider their similarities and differences.
|
||||||
# you expect? What do you find? Note that this is not a "mixed" gene tree yet,
|
# What do you expect? What do you find? Note that this is not a "mixed"
|
||||||
# since it contains only a single gene for the species we considered. All of the
|
# gene tree yet, since it contains only a single gene for the species
|
||||||
# branch points in this tree are speciation events.
|
# we considered. All of the branch points in this tree are speciation
|
||||||
|
# events. Thus the gene tree should have the same topology as the
|
||||||
|
# species tree. Does it? Are the differences important? How many
|
||||||
|
# branches would you need to remove and reinsert elsewhere to get the
|
||||||
|
# same topology as the species tree?
|
||||||
|
|
||||||
|
# In order to quantiofy how different these tow trees are, we need to compute
|
||||||
|
# tree distances.
|
||||||
|
|
||||||
|
|
||||||
|
# == 2.3 Computing tree distances ==========================================
|
||||||
|
|
||||||
|
|
||||||
|
# Many superb phylogeny tools are contributed by the phangorn package.
|
||||||
|
|
||||||
|
if (!require(phangorn, quietly=TRUE)) {
|
||||||
|
install.packages("phangorn")
|
||||||
|
library(phangorn)
|
||||||
|
}
|
||||||
|
# Package information:
|
||||||
|
# library(help = phangorn) # basic information
|
||||||
|
# browseVignettes("phangorn") # available vignettes
|
||||||
|
# data(package = "phangorn") # available datasets
|
||||||
|
|
||||||
|
# To compare two trees, they must have the same tip labels. We delete "MBP1_" or
|
||||||
|
# "KILA_" from the existing tip labels in a copy of our APSES domain tree.
|
||||||
|
apsTree2 <- apsTree
|
||||||
|
apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label)
|
||||||
|
|
||||||
|
# phangorn provides several functions to compute tree-differences (and there
|
||||||
|
# is a _whole_ lot of theory on how to compare trees). treedist() returns the
|
||||||
|
# "symmetric difference"
|
||||||
|
treedist(fungiTree, apsTree2, check.labels = TRUE)
|
||||||
|
|
||||||
|
# Numbers. What do they mean? How much more similar is our apsTree to the
|
||||||
|
# (presumably) ground truth of fungiTree than a random tree would be?
|
||||||
|
# The ape package (which was loaded with RPhylip) provides the function rtree()
|
||||||
|
# to compute random trees.
|
||||||
|
|
||||||
|
rtree(n = length(apsTree2$tip.label), # number of tips
|
||||||
|
rooted = TRUE, # we rooted the tree above,
|
||||||
|
# and fungiTree is rooted anyway
|
||||||
|
tip.label = apsTree2$tip.label, # use the apsTree2 labels
|
||||||
|
br = NULL) # don't generate branch lengths since
|
||||||
|
# fungiTree has none, so we can't
|
||||||
|
# compare them anyway.
|
||||||
|
|
||||||
|
# Let's compute some random trees this way, calculate the distances to
|
||||||
|
# fungiTree, and then compare the values we get for apsTree2:
|
||||||
|
|
||||||
|
set.seed(112358)
|
||||||
|
N <- 10000 # takes about 15 seconds
|
||||||
|
myTreeDistances <- matrix(numeric(N * 2), ncol = 2)
|
||||||
|
colnames(myTreeDistances) <- c("symm", "path")
|
||||||
|
|
||||||
|
for (i in 1:N) {
|
||||||
|
xTree <- rtree(n = length(apsTree2$tip.label),
|
||||||
|
rooted = TRUE,
|
||||||
|
tip.label = apsTree2$tip.label,
|
||||||
|
br = NULL)
|
||||||
|
myTreeDistances[i, ] <- treedist(fungiTree, xTree)
|
||||||
|
}
|
||||||
|
|
||||||
|
table(myTreeDistances[, "symm"])
|
||||||
|
|
||||||
|
(symmObs <- treedist(fungiTree, apsTree2)[1])
|
||||||
|
|
||||||
|
# Random events less-or-equal to observation, divided by total number of
|
||||||
|
# events gives us the empirical p-value.
|
||||||
|
cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n",
|
||||||
|
(sum(myTreeDistances[ , "symm"] <= symmObs) + 1) / (N + 1)))
|
||||||
|
|
||||||
|
hist(myTreeDistances[, "path"],
|
||||||
|
col = "aliceblue",
|
||||||
|
main = "Distances of random Trees to fungiTree")
|
||||||
|
(pathObs <- treedist(fungiTree, apsTree2)[2])
|
||||||
|
abline(v = pathObs, col = "chartreuse")
|
||||||
|
|
||||||
|
# Random events less-or-equal to observation, divided by total number of
|
||||||
|
# events gives us the empirical p-value.
|
||||||
|
cat(sprintf("\nEmpirical p-value for path diff. of observed tree is %1.4f\n",
|
||||||
|
(sum(myTreeDistances[ , "path"] <= symmObs) + 1) / (N + 1)))
|
||||||
|
|
||||||
|
# Indeed, our apsTree is _very_ much more similar to the species tree than
|
||||||
|
# we would expect by random chance.
|
||||||
|
|
||||||
|
# What do we gain from that analysis? Analyzing the tree we get from a single
|
||||||
|
# gene of orthologous sequences is a positive control in our computational
|
||||||
|
# experiment. If these genes are indeed orthologues, a correct tree-building
|
||||||
|
# program ought to give us a tree that exactly matches the species tree.
|
||||||
|
# Evaluating how far off we are from the known correct result gives us a way to
|
||||||
|
# validate our workflow and our algorithm. If we can't get that right, we can't
|
||||||
|
# expect to get "real" data right either. Employing such positive controls in
|
||||||
|
# every computational experiment is essential for research. Not doing so is
|
||||||
|
# Cargo Cult Bioinformatics.
|
||||||
|
|
||||||
|
|
||||||
# [END]
|
# [END]
|
||||||
|
@ -14,7 +14,8 @@
|
|||||||
#
|
#
|
||||||
#
|
#
|
||||||
# TODO:
|
# TODO:
|
||||||
#
|
# Add MrBayes
|
||||||
|
# https://cran.r-project.org/web/packages/phangorn/vignettes/IntertwiningTreesAndNetworks.html
|
||||||
#
|
#
|
||||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||||
#
|
#
|
||||||
@ -25,8 +26,22 @@
|
|||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
#TOC> ==========================================================================
|
||||||
|
#TOC>
|
||||||
|
#TOC> Section Title Line
|
||||||
|
#TOC> -------------------------------------------------------
|
||||||
|
#TOC> 1 Calculating Trees 43
|
||||||
|
#TOC> 1.1 PROMLPATH ... 64
|
||||||
|
#TOC> 1.1.1 ... on the Mac 69
|
||||||
|
#TOC> 1.1.2 ... on Windows 80
|
||||||
|
#TOC> 1.1.3 ... on Linux 94
|
||||||
|
#TOC> 1.1.4 Confirming PROMLPATH 99
|
||||||
|
#TOC> 1.2 Building a maxiiimum likelihood tree 108
|
||||||
|
#TOC>
|
||||||
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
# = 1 Calculating Trees
|
|
||||||
|
# = 1 Calculating Trees ===================================================
|
||||||
|
|
||||||
|
|
||||||
# Follow the instructions found at phylip's home on the Web to install. If you
|
# Follow the instructions found at phylip's home on the Web to install. If you
|
||||||
@ -47,11 +62,12 @@ if (!require(Rphylip, quietly=TRUE)) {
|
|||||||
# This will install RPhylip, as well as its dependency, the package "ape".
|
# This will install RPhylip, as well as its dependency, the package "ape".
|
||||||
|
|
||||||
|
|
||||||
|
# == 1.1 PROMLPATH ... =====================================================
|
||||||
# The next part may be tricky. You will need to figure out where
|
# The next part may be tricky. You will need to figure out where
|
||||||
# on your computer Phylip has been installed and define the path
|
# on your computer Phylip has been installed and define the path
|
||||||
# to the proml program that calculates a maximum-likelihood tree.
|
# to the proml program that calculates a maximum-likelihood tree.
|
||||||
|
|
||||||
|
# === 1.1.1 ... on the Mac
|
||||||
# On the Mac, the standard installation places a phylip folder
|
# On the Mac, the standard installation places a phylip folder
|
||||||
# in the /Applications directory. That folder contains all the
|
# in the /Applications directory. That folder contains all the
|
||||||
# individual phylip programs as <name>.app files. These are not
|
# individual phylip programs as <name>.app files. These are not
|
||||||
@ -62,6 +78,7 @@ if (!require(Rphylip, quietly=TRUE)) {
|
|||||||
# directly to that subdirectory to find the program it needs:
|
# directly to that subdirectory to find the program it needs:
|
||||||
# PROMLPATH <- "/Applications/phylip-3.695/exe/proml.app/Contents/MacOS"
|
# PROMLPATH <- "/Applications/phylip-3.695/exe/proml.app/Contents/MacOS"
|
||||||
|
|
||||||
|
# === 1.1.2 ... on Windows
|
||||||
# On Windows you need to know where the rograms have been installed, and you
|
# On Windows you need to know where the rograms have been installed, and you
|
||||||
# need to specify a path that is correct for the Windows OS. Find the folder
|
# need to specify a path that is correct for the Windows OS. Find the folder
|
||||||
# that is named "exe", and right-click to inspect its properties. The path
|
# that is named "exe", and right-click to inspect its properties. The path
|
||||||
@ -75,10 +92,12 @@ if (!require(Rphylip, quietly=TRUE)) {
|
|||||||
# I have heard that your path must not contain spaces, and it is prudent to
|
# I have heard that your path must not contain spaces, and it is prudent to
|
||||||
# avoid other special characters as well.
|
# avoid other special characters as well.
|
||||||
|
|
||||||
|
# === 1.1.3 ... on Linux
|
||||||
# If you are running Linux I trust you know what to do. It's probably
|
# If you are running Linux I trust you know what to do. It's probably
|
||||||
# something like
|
# something like
|
||||||
# PROMLPATH <- "/usr/local/phylip-3.695/bin"
|
# PROMLPATH <- "/usr/local/phylip-3.695/bin"
|
||||||
|
|
||||||
|
# === 1.1.4 Confirming PROMLPATH
|
||||||
# Confirm that the settings are right.
|
# Confirm that the settings are right.
|
||||||
PROMLPATH # returns the path
|
PROMLPATH # returns the path
|
||||||
list.dirs(PROMLPATH) # returns the directories in that path
|
list.dirs(PROMLPATH) # returns the directories in that path
|
||||||
@ -87,6 +106,7 @@ list.files(PROMLPATH) # lists the files [1] "proml" "proml.command"
|
|||||||
# If "proml" is NOT among the files that the last command returns, you
|
# If "proml" is NOT among the files that the last command returns, you
|
||||||
# can't continue. Ask on the mailing list for advice.
|
# can't continue. Ask on the mailing list for advice.
|
||||||
|
|
||||||
|
# == 1.2 Building a maxiiimum likelihood tree ==============================
|
||||||
# Now read the mfa file you have saved in the BIB-PHYLO-Data_preparation unit,
|
# Now read the mfa file you have saved in the BIB-PHYLO-Data_preparation unit,
|
||||||
# as a "proseq" object with the read.protein() function of the RPhylip package:
|
# as a "proseq" object with the read.protein() function of the RPhylip package:
|
||||||
|
|
||||||
@ -94,11 +114,16 @@ apsIn <- read.protein("APSESphyloSet.mfa")
|
|||||||
|
|
||||||
# ... and you are ready to build a tree.
|
# ... and you are ready to build a tree.
|
||||||
|
|
||||||
|
# There are many fast options in PHYLIP - we will use the most _accurate_ one
|
||||||
|
# that it has: proml, a maximum-likelihood tree building program for protein
|
||||||
|
# data.
|
||||||
|
|
||||||
# Building maximum-likelihood trees can eat as much computer time
|
# Building maximum-likelihood trees can eat as much computer time
|
||||||
# as you can throw at it. Calculating a tree of 48 APSES domains
|
# as you can throw at it. Calculating a tree of 48 APSES domains
|
||||||
# with default parameters of Rproml() runs for more than half a day
|
# with default parameters of Rproml() runs for more than half a day
|
||||||
# on my computer. But we have only twelve sequences here, so the
|
# on my computer. But we have only twelve sequences here, so the
|
||||||
# process will take us about 5 to 10 minutes.
|
# process will take us about 5 to 10 minutes. Run this, and anjoy a good cup
|
||||||
|
# of coffee while you are waiting.
|
||||||
|
|
||||||
apsTree <- Rproml(apsIn, path=PROMLPATH)
|
apsTree <- Rproml(apsIn, path=PROMLPATH)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user