2020-09-18 11:56:30 +00:00
# tocID <- "BIN-PHYLO-Tree_analysis.R"
#
2017-09-12 20:09:20 +00:00
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-PHYLO-Tree_analysis unit.
#
2020-09-26 11:39:18 +00:00
# Version: 1.2
2017-09-12 20:09:20 +00:00
#
2020-09-26 11:39:18 +00:00
# Date: 2017-10 - 2020-09
2017-09-12 20:09:20 +00:00
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
2020-09-26 11:39:18 +00:00
# 1.2 2020 updates. Deprecate iTol and use taxize:: instead.
# Rewrite of tip re-ordering. Better handling of
# messages. pBar() for randomization.
2019-01-08 07:11:25 +00:00
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use Biocmanager:: not biocLite()
2017-11-23 05:32:13 +00:00
# 1.0.2 Typo in variable name, style changes
2017-11-10 04:43:48 +00:00
# 1.0.1 Wrong section heading
2017-11-01 13:44:24 +00:00
# 1.0 First 2017 version
2017-09-12 20:09:20 +00:00
# 0.1 First code copied from 2016 material.
2017-11-01 13:44:24 +00:00
#
2017-09-12 20:09:20 +00:00
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
2017-11-01 13:44:24 +00:00
#
2017-09-12 20:09:20 +00:00
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
2017-11-01 13:44:24 +00:00
#
2017-09-12 20:09:20 +00:00
# ==============================================================================
2017-11-02 03:30:01 +00:00
#TOC> ==========================================================================
2020-09-26 11:39:18 +00:00
#TOC>
2019-01-07 06:17:23 +00:00
#TOC> Section Title Line
#TOC> --------------------------------------------------
2020-09-26 11:39:18 +00:00
#TOC> 1 Preparation and Tree Plot 50
#TOC> 2 SPECIES REFERENCE TREE 66
#TOC> 3 Tree Analysis 117
#TOC> 3.1 Rooting Trees 177
#TOC> 3.2 Rotating Clades 222
#TOC> 3.3 Computing tree distances 309
#TOC>
2017-11-02 03:30:01 +00:00
#TOC> ==========================================================================
2017-11-23 05:32:13 +00:00
# = 1 Preparation and Tree Plot ===========================================
2017-09-12 20:09:20 +00:00
2019-01-08 07:11:25 +00:00
if ( ! requireNamespace ( " ape" , quietly = TRUE ) ) {
install.packages ( " ape" )
2017-11-01 13:44:24 +00:00
}
# Package information:
2019-01-08 07:11:25 +00:00
# library(help = ape) # basic information
# browseVignettes("ape") # available vignettes
# data(package = "ape") # available datasets
2017-11-01 13:44:24 +00:00
2020-09-26 11:39:18 +00:00
# We change the graphics parameters from time to time, let's define the
# default so we can recreate a sane state:
dev.off ( )
PAR <- par ( )
# = 2 SPECIES REFERENCE TREE ==============================================
# Before we do any kind of phylogenetic analysis of genes from several species,
# we MUST have a reference tree of the taxonomic relationships in hand. This
# context is absolutely required for the interpretation of our tree.
# We have the tax-ids in our database, and the NCBI has the species tree - we just need some way to extract the subtree that corresponds to our taxons of interest. Here's how to use the taxize:: package.
if ( ! requireNamespace ( " taxize" , quietly = TRUE ) ) {
install.packages ( " taxize" )
}
# Package information:
# library(help = taxize) # basic information
# browseVignettes("taxize") # available vignettes
# data(package = "taxize") # available datasets
( mySOI <- c ( myDB $ taxonomy $ ID , " 83333" ) )
myClass <- taxize :: classification ( mySOI , db = " ncbi" )
str ( myClass )
2017-11-01 13:44:24 +00:00
2020-09-26 11:39:18 +00:00
myClass [ [1 ] ]
2017-09-12 20:09:20 +00:00
2020-09-26 11:39:18 +00:00
fungiTree <- taxize :: class2tree ( myClass , check = TRUE )
2017-11-01 13:44:24 +00:00
plot ( fungiTree )
2020-09-26 11:39:18 +00:00
# The tree produced by taxize:: contains full length species names,
# but it would be more convenient if it had bicodes instead. Also, the actual
# tree is only part of the list(), which will cause problems later:
2017-11-01 13:44:24 +00:00
str ( fungiTree )
2020-09-26 11:39:18 +00:00
# we therefor simplify
fungiTree <- fungiTree $ phylo
str ( fungiTree )
2017-11-01 13:44:24 +00:00
2020-09-26 11:39:18 +00:00
# The species names are in a vector $phylo$tip.label of this list.
# We can use biCode() to shorten them.
fungiTree $ tip.label <- biCode ( fungiTree $ tip.label )
2017-09-12 20:09:20 +00:00
# Plot the tree
2020-09-26 11:39:18 +00:00
nSP <- length ( fungiTree $ tip.label )
plot ( fungiTree , cex = 0.8 , root.edge = TRUE , no.margin = TRUE )
text ( -1 , nSP - 0.5 , " Species Tree:\nFungi" , pos = 4 )
2019-01-08 07:11:25 +00:00
ape :: nodelabels ( text = fungiTree $ node.label ,
cex = 0.6 ,
adj = 0.2 ,
bg = " #D4F2DA" )
2020-09-26 11:39:18 +00:00
# Note that you can use the arrow buttons in the menu above the plot pane to
# scroll back to plots you have created earlier - so you can reference back to
# this species tree in your later analysis.
2017-09-12 20:09:20 +00:00
2020-09-26 11:39:18 +00:00
# = 3 Tree Analysis =======================================================
2017-09-12 20:09:20 +00:00
2017-11-01 13:44:24 +00:00
# 1.1 Visualizing your tree
2017-09-12 20:09:20 +00:00
# The trees that are produced by Rphylip are stored as an object of class
# "phylo". This is a class for phylogenetic trees that is widely used in the
# community, practically all R phylogenetics packages will options to read and
# manipulate such trees. Outside of R, a popular interchange format is the
# Newick_format that you have seen above. It's easy to output your calculated
# trees in Newick format and visualize them elsewhere.
# The "phylo" class object is one of R's "S3" objects and methods to plot and
2019-01-08 07:11:25 +00:00
# print it have been defined with the Rphylip package, and in ape. You can
# simply call plot(<your-tree>) and R knows what to do with <your-tree> and how
# to plot it. The underlying function is plot.phylo(), and documentation for its
# many options can by found by typing:
2017-09-12 20:09:20 +00:00
? plot.phylo
2017-11-01 13:44:24 +00:00
# We load the APSES sequence tree that you produced in the
# BIN-PHYLO-Tree_building unit:
2020-09-26 11:39:18 +00:00
apsTree <- readRDS ( file = " data/APSEStreeRproml.rds" )
2017-11-01 13:44:24 +00:00
2017-09-12 20:09:20 +00:00
plot ( apsTree ) # default type is "phylogram"
2017-11-23 05:32:13 +00:00
plot ( apsTree , type = " unrooted" )
plot ( apsTree , type = " fan" , no.margin = TRUE )
2017-09-12 20:09:20 +00:00
# rescale to show all of the labels:
2017-11-01 13:44:24 +00:00
# record the current plot parameters by assigning them to a variable ...
( tmp <- plot ( apsTree , type = " fan" , no.margin = TRUE , plot = FALSE ) )
# ... and adjust the plot limits for a new plot:
2017-09-12 20:09:20 +00:00
plot ( apsTree ,
2017-11-23 05:32:13 +00:00
type = " fan" ,
2017-09-12 20:09:20 +00:00
x.lim = tmp $ x.lim * 1.8 ,
y.lim = tmp $ y.lim * 1.8 ,
cex = 0.8 ,
no.margin = TRUE )
# Inspect the tree object
str ( apsTree )
apsTree $ tip.label
apsTree $ edge
apsTree $ edge.length
# show the node / edge and tip labels on a plot
plot ( apsTree )
2019-01-08 07:11:25 +00:00
ape :: nodelabels ( )
ape :: edgelabels ( )
ape :: tiplabels ( )
2017-09-12 20:09:20 +00:00
# show the number of nodes, edges and tips
2019-01-08 07:11:25 +00:00
ape :: Nnode ( apsTree )
ape :: Nedge ( apsTree )
ape :: Ntip ( apsTree )
2017-09-12 20:09:20 +00:00
2020-09-26 11:39:18 +00:00
par ( PAR ) # reset graphics state
2017-09-12 20:09:20 +00:00
# Finally, write the tree to console in Newick format
2019-01-08 07:11:25 +00:00
ape :: write.tree ( apsTree )
2017-09-12 20:09:20 +00:00
2020-09-26 11:39:18 +00:00
# == 3.1 Rooting Trees =====================================================
2017-09-12 20:09:20 +00:00
# In order to analyse the tree, it is helpful to root it first and reorder its
# clades. Contrary to documentation, Rproml() returns an unrooted tree.
2019-01-08 07:11:25 +00:00
ape :: is.rooted ( apsTree )
2017-09-12 20:09:20 +00:00
2019-01-08 07:11:25 +00:00
# You can root the tree with the command root() from the "ape" package.
2017-09-12 20:09:20 +00:00
plot ( apsTree )
# add labels for internal nodes and tips
2019-01-08 07:11:25 +00:00
ape :: nodelabels ( cex = 0.5 , frame = " circle" )
ape :: tiplabels ( cex = 0.5 , frame = " rect" )
2017-09-12 20:09:20 +00:00
2020-09-26 11:39:18 +00:00
# The outgroup of the tree (KILA ESCCO) is tip "11" in my sample tree, it may be a different
2017-09-12 20:09:20 +00:00
# number in yours. Substitute the correct node number below for "outgroup".
2019-01-08 07:11:25 +00:00
apsTree <- ape :: root ( apsTree , outgroup = 11 , resolve.root = TRUE )
2017-09-12 20:09:20 +00:00
plot ( apsTree )
2019-01-08 07:11:25 +00:00
ape :: is.rooted ( apsTree )
2017-09-12 20:09:20 +00:00
# This tree _looks_ unchanged, beacuse when the root trifurcation was resolved,
# an edge of length zero was added to connect the MRCA (Most Recent Common
# Ancestor) of the ingroup.
# The edge lengths are stored in the phylo object:
apsTree $ edge.length
# ... and you can assign a small arbitrary value to the edge
# to show how it connects to the tree without having an
# overlap.
apsTree $ edge.length [1 ] <- 0.1
2017-11-23 05:32:13 +00:00
plot ( apsTree , cex = 0.7 )
2019-01-08 07:11:25 +00:00
ape :: nodelabels ( text = " MRCA" , node = 12 , cex = 0.5 , adj = 0.1 , bg = " #ff8866" )
2017-09-12 20:09:20 +00:00
# This procedure does however not assign an actual length to a root edge, and
# therefore no root edge is visible on the plot. Why? , you might ask. I ask
# myself that too. We'll just add a length by hand.
apsTree $ root.edge <- mean ( apsTree $ edge.length ) * 1.5
2017-11-23 05:32:13 +00:00
plot ( apsTree , cex = 0.7 , root.edge = TRUE )
2019-01-08 07:11:25 +00:00
ape :: nodelabels ( text = " MRCA" , node = 12 , cex = 0.5 , adj = 0.8 , bg = " #ff8866" )
2017-09-12 20:09:20 +00:00
2020-09-26 11:39:18 +00:00
# == 3.2 Rotating Clades ===================================================
2017-09-12 20:09:20 +00:00
# To interpret the tree, it is useful to rotate the clades so that they appear
# in the order expected from the cladogram of species.
2017-11-01 13:44:24 +00:00
# We can either rotate around individual internal nodes ...
2017-09-12 20:09:20 +00:00
layout ( matrix ( 1 : 2 , 1 , 2 ) )
2017-11-23 05:32:13 +00:00
plot ( apsTree , no.margin = TRUE , root.edge = TRUE )
2019-01-08 07:11:25 +00:00
ape :: nodelabels ( node = 13 , cex = 0.7 , bg = " #ff8866" )
plot ( ape :: rotate ( apsTree , node = 13 ) , no.margin = TRUE , root.edge = TRUE )
ape :: nodelabels ( node = 13 , cex = 0.7 , bg = " #88ff66" )
2017-11-01 13:44:24 +00:00
# Note that the species at the bottom of the clade descending from node
# 17 is now plotted at the top.
2017-09-12 20:09:20 +00:00
2020-09-26 11:39:18 +00:00
par ( PAR ) # reset graphics state
2017-09-12 20:09:20 +00:00
2020-09-26 11:39:18 +00:00
# ... or we can rearrange the tree so it corresponds as well as possible to a
# predefined tip ordering. Here we use the ordering that taxize:: has inferred
# from the NCBI taxonomic classification.
2017-09-12 20:09:20 +00:00
nOrg <- length ( apsTree $ tip.label )
2020-09-26 11:39:18 +00:00
plot ( fungiTree ,
no.margin = FALSE , root.edge = TRUE )
ape :: nodelabels ( text = fungiTree $ node.label ,
cex = 0.5 ,
adj = 0.2 ,
bg = " #D4F2DA" )
# These are the fungi tree tips ...
fungiTree $ tip.label
# ... and their order is determined by the edge-list that is stored in
fungiTree $ edge
# which edges join the tips?
ape :: tiplabels ( cex = 0.5 , frame = " rect" )
# as you can see, the tips (range [1:nOrg] ) are in column 2 and they are
# ordered from bottom to top.
# And each tip number is the index of the species in the tip.label vector. So we can take column 2, subset it, and use it to get a list of species in the order of the tree ...
sel <- fungiTree $ edge [ , 2 ] <= nOrg
( oSp <- fungiTree $ tip.label [fungiTree $ edge [sel , 2 ] ] )
# Now, here are the genes of the apsTree tips ...
apsTree $ tip.label
# ... and the "constraint" we need for reordering, according to the help page
# of ape::rotateConstr(), is "a vector specifying the order of the tips as they
# should appear (from bottom to top)". Thus we need to add the "MBP1_" prefix to our vector
oSp <- gsub ( " ^" , " MBP1_" , oSp )
( oSp <- gsub ( " MBP1_ESSCO" , " KILA_ESCCO" , oSp ) )
# Then we can plot the two trees to compare: the fungi- tree
par ( PAR ) # reset graphics state
2017-09-12 20:09:20 +00:00
layout ( matrix ( 1 : 2 , 1 , 2 ) )
2017-11-01 13:44:24 +00:00
plot ( fungiTree ,
2020-09-26 11:39:18 +00:00
no.margin = TRUE ,
root.edge = TRUE )
2019-01-08 07:11:25 +00:00
ape :: nodelabels ( text = fungiTree $ node.label ,
cex = 0.5 ,
adj = 0.2 ,
bg = " #D4F2DA" )
2020-09-26 11:39:18 +00:00
# and the re-organized apsesTree ...
plot ( ape :: rotateConstr ( apsTree , constraint = oSp [ ] ) ,
2019-01-08 07:11:25 +00:00
no.margin = TRUE ,
root.edge = TRUE )
2020-09-26 11:39:18 +00:00
par ( PAR ) # reset graphics state
# As you can see, the reordering is not perfect, since the topologies are
# different, mostly due to the unresolved nodes in the reference tree. One
# could play with that ...
2017-09-12 20:09:20 +00:00
2017-11-02 03:30:01 +00:00
# Task: Study the two trees and consider their similarities and differences.
# What do you expect? What do you find? Note that this is not a "mixed"
# gene tree yet, since it contains only a single gene for the species
# we considered. All of the branch points in this tree are speciation
# events. Thus the gene tree should have the same topology as the
# species tree. Does it? Are the differences important? How many
# branches would you need to remove and reinsert elsewhere to get the
# same topology as the species tree?
2020-09-26 11:39:18 +00:00
# In order to quantify how different these two trees are, we need to compute
2017-11-02 03:30:01 +00:00
# tree distances.
2020-09-26 11:39:18 +00:00
# == 3.3 Computing tree distances ==========================================
2017-11-02 03:30:01 +00:00
# Many superb phylogeny tools are contributed by the phangorn package.
2019-01-08 07:11:25 +00:00
if ( ! requireNamespace ( " phangorn" , quietly = TRUE ) ) {
2017-11-02 03:30:01 +00:00
install.packages ( " phangorn" )
}
# Package information:
# library(help = phangorn) # basic information
# browseVignettes("phangorn") # available vignettes
# data(package = "phangorn") # available datasets
# To compare two trees, they must have the same tip labels. We delete "MBP1_" or
# "KILA_" from the existing tip labels in a copy of our APSES domain tree.
apsTree2 <- apsTree
apsTree2 $ tip.label <- gsub ( " (MBP1_)|(KILA_)" , " " , apsTree2 $ tip.label )
2020-09-26 11:39:18 +00:00
2017-11-02 03:30:01 +00:00
# phangorn provides several functions to compute tree-differences (and there
# is a _whole_ lot of theory on how to compare trees). treedist() returns the
# "symmetric difference"
2019-01-08 07:11:25 +00:00
phangorn :: treedist ( fungiTree , apsTree2 , check.labels = TRUE )
2017-11-02 03:30:01 +00:00
# Numbers. What do they mean? How much more similar is our apsTree to the
# (presumably) ground truth of fungiTree than a random tree would be?
2019-01-08 07:11:25 +00:00
# The ape package provides the function rtree()
2017-11-02 03:30:01 +00:00
# to compute random trees.
2019-01-08 07:11:25 +00:00
ape :: rtree ( n = length ( apsTree2 $ tip.label ) , # number of tips
rooted = TRUE , # we rooted the tree above,
# and fungiTree is rooted anyway
tip.label = apsTree2 $ tip.label , # use the apsTree2 labels
br = NULL ) # don't generate branch lengths since
# fungiTree has none, so we can't
# compare them anyway.
2017-11-02 03:30:01 +00:00
2020-09-26 11:39:18 +00:00
# (Note the warning message about non-binary trees; we'll suppress that later
# by wrapping the function call in supressMessages(); we don't want to
# print it 10,000 times :-)
2017-11-02 03:30:01 +00:00
# Let's compute some random trees this way, calculate the distances to
2019-01-07 06:17:23 +00:00
# fungiTree, and then compare the values we get for apsTree2. The random
# trees are provided by ape::rtree().
2017-11-02 03:30:01 +00:00
2020-09-26 11:39:18 +00:00
N <- 10000 # takes about 15 seconds, and we'll use the pBar function,
# defined in .utilities.R to keep track of where we are at:
2017-11-02 03:30:01 +00:00
myTreeDistances <- matrix ( numeric ( N * 2 ) , ncol = 2 )
colnames ( myTreeDistances ) <- c ( " symm" , " path" )
2019-01-07 06:17:23 +00:00
set.seed ( 112358 )
2017-11-02 03:30:01 +00:00
for ( i in 1 : N ) {
2020-09-26 11:39:18 +00:00
pBar ( i , N )
2019-01-08 07:11:25 +00:00
xTree <- ape :: rtree ( n = length ( apsTree2 $ tip.label ) ,
rooted = TRUE ,
tip.label = apsTree2 $ tip.label ,
br = NULL )
2020-09-26 11:39:18 +00:00
myTreeDistances [i , ] <- suppressMessages ( phangorn :: treedist ( fungiTree , xTree ) )
2017-11-02 03:30:01 +00:00
}
2019-01-07 06:17:23 +00:00
set.seed ( NULL ) # reset the random number generator
2017-09-12 20:09:20 +00:00
2017-11-02 03:30:01 +00:00
table ( myTreeDistances [ , " symm" ] )
2020-09-26 11:39:18 +00:00
( symmObs <- phangorn :: treedist ( fungiTree , apsTree2 ) [1 ] )
2017-11-02 03:30:01 +00:00
# Random events less-or-equal to observation, divided by total number of
# events gives us the empirical p-value.
cat ( sprintf ( " \nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n" ,
( sum ( myTreeDistances [ , " symm" ] <= symmObs ) + 1 ) / ( N + 1 ) ) )
2020-09-26 11:39:18 +00:00
par ( PAR ) # reset graphics state
2017-11-02 03:30:01 +00:00
hist ( myTreeDistances [ , " path" ] ,
col = " aliceblue" ,
main = " Distances of random Trees to fungiTree" )
2019-01-08 07:11:25 +00:00
( pathObs <- phangorn :: treedist ( fungiTree , apsTree2 ) [2 ] )
2017-11-02 03:30:01 +00:00
abline ( v = pathObs , col = " chartreuse" )
# Random events less-or-equal to observation, divided by total number of
# events gives us the empirical p-value.
cat ( sprintf ( " \nEmpirical p-value for path diff. of observed tree is %1.4f\n" ,
( sum ( myTreeDistances [ , " path" ] <= symmObs ) + 1 ) / ( N + 1 ) ) )
# Indeed, our apsTree is _very_ much more similar to the species tree than
# we would expect by random chance.
# What do we gain from that analysis? Analyzing the tree we get from a single
# gene of orthologous sequences is a positive control in our computational
# experiment. If these genes are indeed orthologues, a correct tree-building
# program ought to give us a tree that exactly matches the species tree.
# Evaluating how far off we are from the known correct result gives us a way to
# validate our workflow and our algorithm. If we can't get that right, we can't
# expect to get "real" data right either. Employing such positive controls in
# every computational experiment is essential for research. Not doing so is
# Cargo Cult Bioinformatics.
2017-09-12 20:09:20 +00:00
# [END]