Deprecate iTol and use taxize:: instead. Rewrite of tip re-ordering.
This commit is contained in:
parent
68b00c83e8
commit
777cabcb20
@ -1,20 +1,17 @@
|
|||||||
# tocID <- "BIN-PHYLO-Tree_analysis.R"
|
# tocID <- "BIN-PHYLO-Tree_analysis.R"
|
||||||
#
|
#
|
||||||
# ---------------------------------------------------------------------------- #
|
|
||||||
# PATIENCE ... #
|
|
||||||
# Do not yet work wih this code. Updates in progress. Thank you. #
|
|
||||||
# boris.steipe@utoronto.ca #
|
|
||||||
# ---------------------------------------------------------------------------- #
|
|
||||||
#
|
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-PHYLO-Tree_analysis unit.
|
# R code accompanying the BIN-PHYLO-Tree_analysis unit.
|
||||||
#
|
#
|
||||||
# Version: 1.1
|
# Version: 1.2
|
||||||
#
|
#
|
||||||
# Date: 2017 10 - 2019 01
|
# Date: 2017-10 - 2020-09
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.2 2020 updates. Deprecate iTol and use taxize:: instead.
|
||||||
|
# Rewrite of tip re-ordering. Better handling of
|
||||||
|
# messages. pBar() for randomization.
|
||||||
# 1.1 Change from require() to requireNamespace(),
|
# 1.1 Change from require() to requireNamespace(),
|
||||||
# use <package>::<function>() idiom throughout,
|
# use <package>::<function>() idiom throughout,
|
||||||
# use Biocmanager:: not biocLite()
|
# use Biocmanager:: not biocLite()
|
||||||
@ -40,11 +37,12 @@
|
|||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> --------------------------------------------------
|
#TOC> --------------------------------------------------
|
||||||
#TOC> 1 Preparation and Tree Plot 46
|
#TOC> 1 Preparation and Tree Plot 50
|
||||||
#TOC> 2 Tree Analysis 86
|
#TOC> 2 SPECIES REFERENCE TREE 66
|
||||||
#TOC> 2.1 Rooting Trees 145
|
#TOC> 3 Tree Analysis 117
|
||||||
#TOC> 2.2 Rotating Clades 190
|
#TOC> 3.1 Rooting Trees 177
|
||||||
#TOC> 2.3 Computing tree distances 241
|
#TOC> 3.2 Rotating Clades 222
|
||||||
|
#TOC> 3.3 Computing tree distances 309
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -60,36 +58,63 @@ if (! requireNamespace("ape", quietly = TRUE)) {
|
|||||||
# browseVignettes("ape") # available vignettes
|
# browseVignettes("ape") # available vignettes
|
||||||
# data(package = "ape") # available datasets
|
# data(package = "ape") # available datasets
|
||||||
|
|
||||||
|
# We change the graphics parameters from time to time, let's define the
|
||||||
|
# default so we can recreate a sane state:
|
||||||
|
dev.off()
|
||||||
|
PAR <- par()
|
||||||
|
|
||||||
# Read the species tree that you have created at the phyloT Website:
|
# = 2 SPECIES REFERENCE TREE ==============================================
|
||||||
fungiTree <- ape::read.tree("fungiTree.txt")
|
|
||||||
|
|
||||||
|
# Before we do any kind of phylogenetic analysis of genes from several species,
|
||||||
|
# we MUST have a reference tree of the taxonomic relationships in hand. This
|
||||||
|
# context is absolutely required for the interpretation of our tree.
|
||||||
|
|
||||||
|
# We have the tax-ids in our database, and the NCBI has the species tree - we just need some way to extract the subtree that corresponds to our taxons of interest. Here's how to use the taxize:: package.
|
||||||
|
|
||||||
|
if (! requireNamespace("taxize", quietly = TRUE)) {
|
||||||
|
install.packages("taxize")
|
||||||
|
}
|
||||||
|
# Package information:
|
||||||
|
# library(help = taxize) # basic information
|
||||||
|
# browseVignettes("taxize") # available vignettes
|
||||||
|
# data(package = "taxize") # available datasets
|
||||||
|
|
||||||
|
( mySOI <- c(myDB$taxonomy$ID, "83333") )
|
||||||
|
myClass <- taxize::classification(mySOI, db = "ncbi")
|
||||||
|
str(myClass)
|
||||||
|
|
||||||
|
myClass[[1]]
|
||||||
|
|
||||||
|
fungiTree <- taxize::class2tree(myClass, check = TRUE)
|
||||||
plot(fungiTree)
|
plot(fungiTree)
|
||||||
|
|
||||||
# The tree produced by phyloT contains full length species names, but it would
|
# The tree produced by taxize:: contains full length species names,
|
||||||
# be more convenient if it had bicodes instead.
|
# but it would be more convenient if it had bicodes instead. Also, the actual
|
||||||
|
# tree is only part of the list(), which will cause problems later:
|
||||||
str(fungiTree)
|
str(fungiTree)
|
||||||
|
|
||||||
# The species names are in a vector $tip.label of this list. We can use bicode()
|
# we therefor simplify
|
||||||
# to shorten them - but note that they have underscores as word separators. Thus
|
fungiTree <- fungiTree$phylo
|
||||||
# we will use gsub("-", " ", ...) to replace the underscores with spaces.
|
str(fungiTree)
|
||||||
|
|
||||||
for (i in seq_along(fungiTree$tip.label)) {
|
# The species names are in a vector $phylo$tip.label of this list.
|
||||||
fungiTree$tip.label[i] <- biCode(gsub("_", " ", fungiTree$tip.label[i]))
|
# We can use biCode() to shorten them.
|
||||||
}
|
fungiTree$tip.label <- biCode(fungiTree$tip.label)
|
||||||
|
|
||||||
# Plot the tree
|
# Plot the tree
|
||||||
plot(fungiTree, cex = 1.0, root.edge = TRUE, no.margin = TRUE)
|
nSP <- length(fungiTree$tip.label)
|
||||||
|
plot(fungiTree, cex = 0.8, root.edge = TRUE, no.margin = TRUE)
|
||||||
|
text(-1, nSP - 0.5, "Species Tree:\nFungi", pos = 4)
|
||||||
ape::nodelabels(text = fungiTree$node.label,
|
ape::nodelabels(text = fungiTree$node.label,
|
||||||
cex = 0.6,
|
cex = 0.6,
|
||||||
adj = 0.2,
|
adj = 0.2,
|
||||||
bg = "#D4F2DA")
|
bg = "#D4F2DA")
|
||||||
# Note that you can use the arrow buttons in the menu above the plot to scroll
|
# Note that you can use the arrow buttons in the menu above the plot pane to
|
||||||
# back to plots you have created earlier - so you can reference back to the
|
# scroll back to plots you have created earlier - so you can reference back to
|
||||||
# species tree.
|
# this species tree in your later analysis.
|
||||||
|
|
||||||
|
|
||||||
# = 2 Tree Analysis =======================================================
|
# = 3 Tree Analysis =======================================================
|
||||||
|
|
||||||
|
|
||||||
# 1.1 Visualizing your tree
|
# 1.1 Visualizing your tree
|
||||||
@ -110,7 +135,7 @@ ape::nodelabels(text = fungiTree$node.label,
|
|||||||
|
|
||||||
# We load the APSES sequence tree that you produced in the
|
# We load the APSES sequence tree that you produced in the
|
||||||
# BIN-PHYLO-Tree_building unit:
|
# BIN-PHYLO-Tree_building unit:
|
||||||
apsTree <- readRDS(file = "APSEStreeRproml.rds")
|
apsTree <- readRDS(file = "data/APSEStreeRproml.rds")
|
||||||
|
|
||||||
plot(apsTree) # default type is "phylogram"
|
plot(apsTree) # default type is "phylogram"
|
||||||
plot(apsTree, type = "unrooted")
|
plot(apsTree, type = "unrooted")
|
||||||
@ -144,11 +169,12 @@ ape::Nnode(apsTree)
|
|||||||
ape::Nedge(apsTree)
|
ape::Nedge(apsTree)
|
||||||
ape::Ntip(apsTree)
|
ape::Ntip(apsTree)
|
||||||
|
|
||||||
|
par(PAR) # reset graphics state
|
||||||
|
|
||||||
# Finally, write the tree to console in Newick format
|
# Finally, write the tree to console in Newick format
|
||||||
ape::write.tree(apsTree)
|
ape::write.tree(apsTree)
|
||||||
|
|
||||||
# == 2.1 Rooting Trees =====================================================
|
# == 3.1 Rooting Trees =====================================================
|
||||||
|
|
||||||
# In order to analyse the tree, it is helpful to root it first and reorder its
|
# In order to analyse the tree, it is helpful to root it first and reorder its
|
||||||
# clades. Contrary to documentation, Rproml() returns an unrooted tree.
|
# clades. Contrary to documentation, Rproml() returns an unrooted tree.
|
||||||
@ -163,7 +189,7 @@ plot(apsTree)
|
|||||||
ape::nodelabels(cex = 0.5, frame = "circle")
|
ape::nodelabels(cex = 0.5, frame = "circle")
|
||||||
ape::tiplabels(cex = 0.5, frame = "rect")
|
ape::tiplabels(cex = 0.5, frame = "rect")
|
||||||
|
|
||||||
# The outgroup of the tree is tip "11" in my sample tree, it may be a different
|
# The outgroup of the tree (KILA ESCCO) is tip "11" in my sample tree, it may be a different
|
||||||
# number in yours. Substitute the correct node number below for "outgroup".
|
# number in yours. Substitute the correct node number below for "outgroup".
|
||||||
apsTree <- ape::root(apsTree, outgroup = 11, resolve.root = TRUE)
|
apsTree <- ape::root(apsTree, outgroup = 11, resolve.root = TRUE)
|
||||||
plot(apsTree)
|
plot(apsTree)
|
||||||
@ -193,7 +219,7 @@ plot(apsTree, cex = 0.7, root.edge = TRUE)
|
|||||||
ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866")
|
ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866")
|
||||||
|
|
||||||
|
|
||||||
# == 2.2 Rotating Clades ===================================================
|
# == 3.2 Rotating Clades ===================================================
|
||||||
|
|
||||||
# To interpret the tree, it is useful to rotate the clades so that they appear
|
# To interpret the tree, it is useful to rotate the clades so that they appear
|
||||||
# in the order expected from the cladogram of species.
|
# in the order expected from the cladogram of species.
|
||||||
@ -206,30 +232,66 @@ plot(ape::rotate(apsTree, node = 13), no.margin = TRUE, root.edge = TRUE)
|
|||||||
ape::nodelabels(node = 13, cex = 0.7, bg = "#88ff66")
|
ape::nodelabels(node = 13, cex = 0.7, bg = "#88ff66")
|
||||||
# Note that the species at the bottom of the clade descending from node
|
# Note that the species at the bottom of the clade descending from node
|
||||||
# 17 is now plotted at the top.
|
# 17 is now plotted at the top.
|
||||||
layout(matrix(1), widths = 1.0, heights = 1.0)
|
|
||||||
|
|
||||||
# ... or we can plot the tree so it corresponds as well as possible to a
|
par(PAR) # reset graphics state
|
||||||
# predefined tip ordering. Here we use the ordering that phyloT has returned
|
|
||||||
# for the species tree.
|
|
||||||
|
|
||||||
# (Nb. we need to reverse the ordering for the plot. This is why we use the
|
# ... or we can rearrange the tree so it corresponds as well as possible to a
|
||||||
# expression [nOrg:1] below instead of using the vector directly.)
|
# predefined tip ordering. Here we use the ordering that taxize:: has inferred
|
||||||
|
# from the NCBI taxonomic classification.
|
||||||
|
|
||||||
nOrg <- length(apsTree$tip.label)
|
nOrg <- length(apsTree$tip.label)
|
||||||
|
|
||||||
layout(matrix(1:2, 1, 2))
|
|
||||||
plot(fungiTree,
|
plot(fungiTree,
|
||||||
no.margin = TRUE, root.edge = TRUE)
|
no.margin = FALSE, root.edge = TRUE)
|
||||||
ape::nodelabels(text = fungiTree$node.label,
|
ape::nodelabels(text = fungiTree$node.label,
|
||||||
cex = 0.5,
|
cex = 0.5,
|
||||||
adj = 0.2,
|
adj = 0.2,
|
||||||
bg = "#D4F2DA")
|
bg = "#D4F2DA")
|
||||||
|
|
||||||
plot(ape::rotateConstr(apsTree, apsTree$tip.label[nOrg:1]),
|
# These are the fungi tree tips ...
|
||||||
|
fungiTree$tip.label
|
||||||
|
# ... and their order is determined by the edge-list that is stored in
|
||||||
|
fungiTree$edge
|
||||||
|
# which edges join the tips?
|
||||||
|
ape::tiplabels(cex = 0.5, frame = "rect")
|
||||||
|
# as you can see, the tips (range [1:nOrg] ) are in column 2 and they are
|
||||||
|
# ordered from bottom to top.
|
||||||
|
# And each tip number is the index of the species in the tip.label vector. So we can take column 2, subset it, and use it to get a list of species in the order of the tree ...
|
||||||
|
|
||||||
|
sel <- fungiTree$edge[ , 2 ] <= nOrg
|
||||||
|
( oSp <- fungiTree$tip.label[fungiTree$edge[sel , 2 ]] )
|
||||||
|
|
||||||
|
# Now, here are the genes of the apsTree tips ...
|
||||||
|
apsTree$tip.label
|
||||||
|
|
||||||
|
# ... and the "constraint" we need for reordering, according to the help page
|
||||||
|
# of ape::rotateConstr(), is "a vector specifying the order of the tips as they
|
||||||
|
# should appear (from bottom to top)". Thus we need to add the "MBP1_" prefix to our vector
|
||||||
|
oSp <- gsub("^", "MBP1_", oSp)
|
||||||
|
( oSp <- gsub("MBP1_ESSCO", "KILA_ESCCO", oSp) )
|
||||||
|
|
||||||
|
# Then we can plot the two trees to compare: the fungi- tree
|
||||||
|
par(PAR) # reset graphics state
|
||||||
|
layout(matrix(1:2, 1, 2))
|
||||||
|
plot(fungiTree,
|
||||||
no.margin = TRUE,
|
no.margin = TRUE,
|
||||||
root.edge = TRUE)
|
root.edge = TRUE)
|
||||||
ape::add.scale.bar(length = 0.5)
|
ape::nodelabels(text = fungiTree$node.label,
|
||||||
layout(matrix(1), widths = 1.0, heights = 1.0)
|
cex = 0.5,
|
||||||
|
adj = 0.2,
|
||||||
|
bg = "#D4F2DA")
|
||||||
|
|
||||||
|
# and the re-organized apsesTree ...
|
||||||
|
plot(ape::rotateConstr(apsTree, constraint = oSp[]),
|
||||||
|
no.margin = TRUE,
|
||||||
|
root.edge = TRUE)
|
||||||
|
|
||||||
|
par(PAR) # reset graphics state
|
||||||
|
|
||||||
|
# As you can see, the reordering is not perfect, since the topologies are
|
||||||
|
# different, mostly due to the unresolved nodes in the reference tree. One
|
||||||
|
# could play with that ...
|
||||||
|
|
||||||
|
|
||||||
# Task: Study the two trees and consider their similarities and differences.
|
# Task: Study the two trees and consider their similarities and differences.
|
||||||
# What do you expect? What do you find? Note that this is not a "mixed"
|
# What do you expect? What do you find? Note that this is not a "mixed"
|
||||||
@ -240,11 +302,11 @@ layout(matrix(1), widths = 1.0, heights = 1.0)
|
|||||||
# branches would you need to remove and reinsert elsewhere to get the
|
# branches would you need to remove and reinsert elsewhere to get the
|
||||||
# same topology as the species tree?
|
# same topology as the species tree?
|
||||||
|
|
||||||
# In order to quantiofy how different these tow trees are, we need to compute
|
# In order to quantify how different these two trees are, we need to compute
|
||||||
# tree distances.
|
# tree distances.
|
||||||
|
|
||||||
|
|
||||||
# == 2.3 Computing tree distances ==========================================
|
# == 3.3 Computing tree distances ==========================================
|
||||||
|
|
||||||
|
|
||||||
# Many superb phylogeny tools are contributed by the phangorn package.
|
# Many superb phylogeny tools are contributed by the phangorn package.
|
||||||
@ -262,6 +324,7 @@ if (! requireNamespace("phangorn", quietly = TRUE)) {
|
|||||||
apsTree2 <- apsTree
|
apsTree2 <- apsTree
|
||||||
apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label)
|
apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label)
|
||||||
|
|
||||||
|
|
||||||
# phangorn provides several functions to compute tree-differences (and there
|
# phangorn provides several functions to compute tree-differences (and there
|
||||||
# is a _whole_ lot of theory on how to compare trees). treedist() returns the
|
# is a _whole_ lot of theory on how to compare trees). treedist() returns the
|
||||||
# "symmetric difference"
|
# "symmetric difference"
|
||||||
@ -280,21 +343,28 @@ ape::rtree(n = length(apsTree2$tip.label), # number of tips
|
|||||||
# fungiTree has none, so we can't
|
# fungiTree has none, so we can't
|
||||||
# compare them anyway.
|
# compare them anyway.
|
||||||
|
|
||||||
|
# (Note the warning message about non-binary trees; we'll suppress that later
|
||||||
|
# by wrapping the function call in supressMessages(); we don't want to
|
||||||
|
# print it 10,000 times :-)
|
||||||
|
|
||||||
|
|
||||||
# Let's compute some random trees this way, calculate the distances to
|
# Let's compute some random trees this way, calculate the distances to
|
||||||
# fungiTree, and then compare the values we get for apsTree2. The random
|
# fungiTree, and then compare the values we get for apsTree2. The random
|
||||||
# trees are provided by ape::rtree().
|
# trees are provided by ape::rtree().
|
||||||
|
|
||||||
N <- 10000 # takes about 15 seconds
|
N <- 10000 # takes about 15 seconds, and we'll use the pBar function,
|
||||||
|
# defined in .utilities.R to keep track of where we are at:
|
||||||
myTreeDistances <- matrix(numeric(N * 2), ncol = 2)
|
myTreeDistances <- matrix(numeric(N * 2), ncol = 2)
|
||||||
colnames(myTreeDistances) <- c("symm", "path")
|
colnames(myTreeDistances) <- c("symm", "path")
|
||||||
|
|
||||||
set.seed(112358)
|
set.seed(112358)
|
||||||
for (i in 1:N) {
|
for (i in 1:N) {
|
||||||
|
pBar(i, N)
|
||||||
xTree <- ape::rtree(n = length(apsTree2$tip.label),
|
xTree <- ape::rtree(n = length(apsTree2$tip.label),
|
||||||
rooted = TRUE,
|
rooted = TRUE,
|
||||||
tip.label = apsTree2$tip.label,
|
tip.label = apsTree2$tip.label,
|
||||||
br = NULL)
|
br = NULL)
|
||||||
myTreeDistances[i, ] <- phangorn::treedist(fungiTree, xTree)
|
myTreeDistances[i, ] <- suppressMessages(phangorn::treedist(fungiTree, xTree))
|
||||||
}
|
}
|
||||||
set.seed(NULL) # reset the random number generator
|
set.seed(NULL) # reset the random number generator
|
||||||
|
|
||||||
@ -307,6 +377,7 @@ table(myTreeDistances[, "symm"])
|
|||||||
cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n",
|
cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n",
|
||||||
(sum(myTreeDistances[ , "symm"] <= symmObs) + 1) / (N + 1)))
|
(sum(myTreeDistances[ , "symm"] <= symmObs) + 1) / (N + 1)))
|
||||||
|
|
||||||
|
par(PAR) # reset graphics state
|
||||||
hist(myTreeDistances[, "path"],
|
hist(myTreeDistances[, "path"],
|
||||||
col = "aliceblue",
|
col = "aliceblue",
|
||||||
main = "Distances of random Trees to fungiTree")
|
main = "Distances of random Trees to fungiTree")
|
||||||
|
Loading…
Reference in New Issue
Block a user