Updates, and add treedist()

This commit is contained in:
hyginn 2017-11-01 23:30:01 -04:00
parent 0da30ec109
commit 1d6170f417
3 changed files with 163 additions and 28 deletions

View File

@ -5,7 +5,7 @@
#
# Version: 1.0
#
# Date: 2017 10. 31
# Date: 2017 10 31
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
@ -24,6 +24,20 @@
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------------
#TOC> 1 Preparations 41
#TOC> 2 Fetching sequences 78
#TOC> 3 Multiple Sequence Alignment 119
#TOC> 4 Reviewing and Editing Alignments 136
#TOC> 4.1 Masking workflow 152
#TOC>
#TOC> ==========================================================================
# = 1 Preparations ========================================================
@ -61,18 +75,7 @@ if (! require(msa, quietly=TRUE)) {
# data(package = "msa") # available datasets
if (! require(stringr, quietly=TRUE)) {
install.packages("stringr")
library(stringr)
}
# Package information:
# library(help=stringr) # basic information
# browseVignettes("stringr") # available vignettes
# data(package = "stringr") # available datasets
# = 1 Fetching sequences
# = 2 Fetching sequences ==================================================
# myDB contains the ten Mbp1 orthologues from the reference species and the Mbp1
@ -113,7 +116,7 @@ names(APSI)[length(APSI)] <- "KILA_ESCCO"
tail(APSI)
# = 1 Multiple Sequence Alignment
# = 3 Multiple Sequence Alignment =========================================
# This vector of sequences with named elements fulfills the requirements to be
# imported as a Biostrings object - an AAStringSet - which we need as input for
@ -130,7 +133,7 @@ writeALN(APSESMsa)
# What do you think? Is this a good alignment for phylogenetic inference?
# = 1 Reviewing and Editing Alignments
# = 4 Reviewing and Editing Alignments ====================================
# Head back to the Wiki page for this unit and read up on the background
@ -146,7 +149,7 @@ writeALN(APSESMsa)
# go through the matrix, column by column and decide
# whether we want to include that column.
# = 1.1 Masking workflow
# == 4.1 Masking workflow ==================================================
# get the length of the alignment
(lenAli <- APSESMsa@unmasked@ranges@width[1])

View File

@ -24,7 +24,21 @@
#
# ==============================================================================
# = 1 ___Section___
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -------------------------------------------
#TOC> 1 ___Section___ 38
#TOC> 2 Tree Analysis 77
#TOC> 2.1 Rooting Trees 136
#TOC> 2.2 Rotating Clades 182
#TOC> 2.3 Computing tree distances 229
#TOC>
#TOC> ==========================================================================
# = 1 ___Section___ =======================================================
if (!require(Rphylip, quietly=TRUE)) {
@ -63,7 +77,7 @@ nodelabels(text=orgTree$node.label, cex=0.6, adj=0.2, bg="#D4F2DA")
# species tree.
# = 1 Tree Analysis
# = 2 Tree Analysis =======================================================
# 1.1 Visualizing your tree
@ -122,7 +136,7 @@ Ntip(apsTree)
# Finally, write the tree to console in Newick format
write.tree(apsTree)
# = 1.1 Rooting Trees
# == 2.1 Rooting Trees =====================================================
# In order to analyse the tree, it is helpful to root it first and reorder its
# clades. Contrary to documentation, Rproml() returns an unrooted tree.
@ -168,7 +182,7 @@ plot(apsTree, cex=0.7, root.edge=TRUE)
nodelabels(text="MRCA", node=12, cex=0.5, adj=0.8, bg="#ff8866")
# = 1.1 Rotating Clades
# == 2.2 Rotating Clades ===================================================
# To interpret the tree, it is useful to rotate the clades so that they appear
# in the order expected from the cladogram of species.
@ -202,11 +216,104 @@ plot(rotateConstr(apsTree, apsTree$tip.label[nOrg:1]),
add.scale.bar(length=0.5)
layout(matrix(1), widths=1.0, heights=1.0)
# Study the two trees and consider their similarities and differences. What do
# you expect? What do you find? Note that this is not a "mixed" gene tree yet,
# since it contains only a single gene for the species we considered. All of the
# branch points in this tree are speciation events.
# Task: Study the two trees and consider their similarities and differences.
# What do you expect? What do you find? Note that this is not a "mixed"
# gene tree yet, since it contains only a single gene for the species
# we considered. All of the branch points in this tree are speciation
# events. Thus the gene tree should have the same topology as the
# species tree. Does it? Are the differences important? How many
# branches would you need to remove and reinsert elsewhere to get the
# same topology as the species tree?
# In order to quantiofy how different these tow trees are, we need to compute
# tree distances.
# == 2.3 Computing tree distances ==========================================
# Many superb phylogeny tools are contributed by the phangorn package.
if (!require(phangorn, quietly=TRUE)) {
install.packages("phangorn")
library(phangorn)
}
# Package information:
# library(help = phangorn) # basic information
# browseVignettes("phangorn") # available vignettes
# data(package = "phangorn") # available datasets
# To compare two trees, they must have the same tip labels. We delete "MBP1_" or
# "KILA_" from the existing tip labels in a copy of our APSES domain tree.
apsTree2 <- apsTree
apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label)
# phangorn provides several functions to compute tree-differences (and there
# is a _whole_ lot of theory on how to compare trees). treedist() returns the
# "symmetric difference"
treedist(fungiTree, apsTree2, check.labels = TRUE)
# Numbers. What do they mean? How much more similar is our apsTree to the
# (presumably) ground truth of fungiTree than a random tree would be?
# The ape package (which was loaded with RPhylip) provides the function rtree()
# to compute random trees.
rtree(n = length(apsTree2$tip.label), # number of tips
rooted = TRUE, # we rooted the tree above,
# and fungiTree is rooted anyway
tip.label = apsTree2$tip.label, # use the apsTree2 labels
br = NULL) # don't generate branch lengths since
# fungiTree has none, so we can't
# compare them anyway.
# Let's compute some random trees this way, calculate the distances to
# fungiTree, and then compare the values we get for apsTree2:
set.seed(112358)
N <- 10000 # takes about 15 seconds
myTreeDistances <- matrix(numeric(N * 2), ncol = 2)
colnames(myTreeDistances) <- c("symm", "path")
for (i in 1:N) {
xTree <- rtree(n = length(apsTree2$tip.label),
rooted = TRUE,
tip.label = apsTree2$tip.label,
br = NULL)
myTreeDistances[i, ] <- treedist(fungiTree, xTree)
}
table(myTreeDistances[, "symm"])
(symmObs <- treedist(fungiTree, apsTree2)[1])
# Random events less-or-equal to observation, divided by total number of
# events gives us the empirical p-value.
cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n",
(sum(myTreeDistances[ , "symm"] <= symmObs) + 1) / (N + 1)))
hist(myTreeDistances[, "path"],
col = "aliceblue",
main = "Distances of random Trees to fungiTree")
(pathObs <- treedist(fungiTree, apsTree2)[2])
abline(v = pathObs, col = "chartreuse")
# Random events less-or-equal to observation, divided by total number of
# events gives us the empirical p-value.
cat(sprintf("\nEmpirical p-value for path diff. of observed tree is %1.4f\n",
(sum(myTreeDistances[ , "path"] <= symmObs) + 1) / (N + 1)))
# Indeed, our apsTree is _very_ much more similar to the species tree than
# we would expect by random chance.
# What do we gain from that analysis? Analyzing the tree we get from a single
# gene of orthologous sequences is a positive control in our computational
# experiment. If these genes are indeed orthologues, a correct tree-building
# program ought to give us a tree that exactly matches the species tree.
# Evaluating how far off we are from the known correct result gives us a way to
# validate our workflow and our algorithm. If we can't get that right, we can't
# expect to get "real" data right either. Employing such positive controls in
# every computational experiment is essential for research. Not doing so is
# Cargo Cult Bioinformatics.
# [END]

View File

@ -14,7 +14,8 @@
#
#
# TODO:
#
# Add MrBayes
# https://cran.r-project.org/web/packages/phangorn/vignettes/IntertwiningTreesAndNetworks.html
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
@ -25,8 +26,22 @@
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -------------------------------------------------------
#TOC> 1 Calculating Trees 43
#TOC> 1.1 PROMLPATH ... 64
#TOC> 1.1.1 ... on the Mac 69
#TOC> 1.1.2 ... on Windows 80
#TOC> 1.1.3 ... on Linux 94
#TOC> 1.1.4 Confirming PROMLPATH 99
#TOC> 1.2 Building a maxiiimum likelihood tree 108
#TOC>
#TOC> ==========================================================================
# = 1 Calculating Trees
# = 1 Calculating Trees ===================================================
# Follow the instructions found at phylip's home on the Web to install. If you
@ -47,11 +62,12 @@ if (!require(Rphylip, quietly=TRUE)) {
# This will install RPhylip, as well as its dependency, the package "ape".
# == 1.1 PROMLPATH ... =====================================================
# The next part may be tricky. You will need to figure out where
# on your computer Phylip has been installed and define the path
# to the proml program that calculates a maximum-likelihood tree.
# === 1.1.1 ... on the Mac
# On the Mac, the standard installation places a phylip folder
# in the /Applications directory. That folder contains all the
# individual phylip programs as <name>.app files. These are not
@ -62,6 +78,7 @@ if (!require(Rphylip, quietly=TRUE)) {
# directly to that subdirectory to find the program it needs:
# PROMLPATH <- "/Applications/phylip-3.695/exe/proml.app/Contents/MacOS"
# === 1.1.2 ... on Windows
# On Windows you need to know where the rograms have been installed, and you
# need to specify a path that is correct for the Windows OS. Find the folder
# that is named "exe", and right-click to inspect its properties. The path
@ -75,10 +92,12 @@ if (!require(Rphylip, quietly=TRUE)) {
# I have heard that your path must not contain spaces, and it is prudent to
# avoid other special characters as well.
# === 1.1.3 ... on Linux
# If you are running Linux I trust you know what to do. It's probably
# something like
# PROMLPATH <- "/usr/local/phylip-3.695/bin"
# === 1.1.4 Confirming PROMLPATH
# Confirm that the settings are right.
PROMLPATH # returns the path
list.dirs(PROMLPATH) # returns the directories in that path
@ -87,6 +106,7 @@ list.files(PROMLPATH) # lists the files [1] "proml" "proml.command"
# If "proml" is NOT among the files that the last command returns, you
# can't continue. Ask on the mailing list for advice.
# == 1.2 Building a maxiiimum likelihood tree ==============================
# Now read the mfa file you have saved in the BIB-PHYLO-Data_preparation unit,
# as a "proseq" object with the read.protein() function of the RPhylip package:
@ -94,11 +114,16 @@ apsIn <- read.protein("APSESphyloSet.mfa")
# ... and you are ready to build a tree.
# There are many fast options in PHYLIP - we will use the most _accurate_ one
# that it has: proml, a maximum-likelihood tree building program for protein
# data.
# Building maximum-likelihood trees can eat as much computer time
# as you can throw at it. Calculating a tree of 48 APSES domains
# with default parameters of Rproml() runs for more than half a day
# on my computer. But we have only twelve sequences here, so the
# process will take us about 5 to 10 minutes.
# process will take us about 5 to 10 minutes. Run this, and anjoy a good cup
# of coffee while you are waiting.
apsTree <- Rproml(apsIn, path=PROMLPATH)