diff --git a/BIN-PHYLO-Tree_analysis.R b/BIN-PHYLO-Tree_analysis.R index 5919df5..3dabf5b 100644 --- a/BIN-PHYLO-Tree_analysis.R +++ b/BIN-PHYLO-Tree_analysis.R @@ -1,20 +1,17 @@ # tocID <- "BIN-PHYLO-Tree_analysis.R" # -# ---------------------------------------------------------------------------- # -# PATIENCE ... # -# Do not yet work wih this code. Updates in progress. Thank you. # -# boris.steipe@utoronto.ca # -# ---------------------------------------------------------------------------- # -# # Purpose: A Bioinformatics Course: # R code accompanying the BIN-PHYLO-Tree_analysis unit. # -# Version: 1.1 +# Version: 1.2 # -# Date: 2017 10 - 2019 01 +# Date: 2017-10 - 2020-09 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: +# 1.2 2020 updates. Deprecate iTol and use taxize:: instead. +# Rewrite of tip re-ordering. Better handling of +# messages. pBar() for randomization. # 1.1 Change from require() to requireNamespace(), # use ::() idiom throughout, # use Biocmanager:: not biocLite() @@ -37,15 +34,16 @@ #TOC> ========================================================================== -#TOC> +#TOC> #TOC> Section Title Line #TOC> -------------------------------------------------- -#TOC> 1 Preparation and Tree Plot 46 -#TOC> 2 Tree Analysis 86 -#TOC> 2.1 Rooting Trees 145 -#TOC> 2.2 Rotating Clades 190 -#TOC> 2.3 Computing tree distances 241 -#TOC> +#TOC> 1 Preparation and Tree Plot 50 +#TOC> 2 SPECIES REFERENCE TREE 66 +#TOC> 3 Tree Analysis 117 +#TOC> 3.1 Rooting Trees 177 +#TOC> 3.2 Rotating Clades 222 +#TOC> 3.3 Computing tree distances 309 +#TOC> #TOC> ========================================================================== @@ -60,36 +58,63 @@ if (! requireNamespace("ape", quietly = TRUE)) { # browseVignettes("ape") # available vignettes # data(package = "ape") # available datasets +# We change the graphics parameters from time to time, let's define the +# default so we can recreate a sane state: +dev.off() +PAR <- par() -# Read the species tree that you have created at the phyloT Website: -fungiTree <- ape::read.tree("fungiTree.txt") +# = 2 SPECIES REFERENCE TREE ============================================== +# Before we do any kind of phylogenetic analysis of genes from several species, +# we MUST have a reference tree of the taxonomic relationships in hand. This +# context is absolutely required for the interpretation of our tree. + +# We have the tax-ids in our database, and the NCBI has the species tree - we just need some way to extract the subtree that corresponds to our taxons of interest. Here's how to use the taxize:: package. + +if (! requireNamespace("taxize", quietly = TRUE)) { + install.packages("taxize") +} +# Package information: +# library(help = taxize) # basic information +# browseVignettes("taxize") # available vignettes +# data(package = "taxize") # available datasets + +( mySOI <- c(myDB$taxonomy$ID, "83333") ) +myClass <- taxize::classification(mySOI, db = "ncbi") +str(myClass) + +myClass[[1]] + +fungiTree <- taxize::class2tree(myClass, check = TRUE) plot(fungiTree) -# The tree produced by phyloT contains full length species names, but it would -# be more convenient if it had bicodes instead. +# The tree produced by taxize:: contains full length species names, +# but it would be more convenient if it had bicodes instead. Also, the actual +# tree is only part of the list(), which will cause problems later: str(fungiTree) -# The species names are in a vector $tip.label of this list. We can use bicode() -# to shorten them - but note that they have underscores as word separators. Thus -# we will use gsub("-", " ", ...) to replace the underscores with spaces. +# we therefor simplify +fungiTree <- fungiTree$phylo +str(fungiTree) -for (i in seq_along(fungiTree$tip.label)) { - fungiTree$tip.label[i] <- biCode(gsub("_", " ", fungiTree$tip.label[i])) -} +# The species names are in a vector $phylo$tip.label of this list. +# We can use biCode() to shorten them. +fungiTree$tip.label <- biCode(fungiTree$tip.label) # Plot the tree -plot(fungiTree, cex = 1.0, root.edge = TRUE, no.margin = TRUE) +nSP <- length(fungiTree$tip.label) +plot(fungiTree, cex = 0.8, root.edge = TRUE, no.margin = TRUE) +text(-1, nSP - 0.5, "Species Tree:\nFungi", pos = 4) ape::nodelabels(text = fungiTree$node.label, cex = 0.6, adj = 0.2, bg = "#D4F2DA") -# Note that you can use the arrow buttons in the menu above the plot to scroll -# back to plots you have created earlier - so you can reference back to the -# species tree. +# Note that you can use the arrow buttons in the menu above the plot pane to +# scroll back to plots you have created earlier - so you can reference back to +# this species tree in your later analysis. -# = 2 Tree Analysis ======================================================= +# = 3 Tree Analysis ======================================================= # 1.1 Visualizing your tree @@ -110,7 +135,7 @@ ape::nodelabels(text = fungiTree$node.label, # We load the APSES sequence tree that you produced in the # BIN-PHYLO-Tree_building unit: -apsTree <- readRDS(file = "APSEStreeRproml.rds") +apsTree <- readRDS(file = "data/APSEStreeRproml.rds") plot(apsTree) # default type is "phylogram" plot(apsTree, type = "unrooted") @@ -144,11 +169,12 @@ ape::Nnode(apsTree) ape::Nedge(apsTree) ape::Ntip(apsTree) +par(PAR) # reset graphics state # Finally, write the tree to console in Newick format ape::write.tree(apsTree) -# == 2.1 Rooting Trees ===================================================== +# == 3.1 Rooting Trees ===================================================== # In order to analyse the tree, it is helpful to root it first and reorder its # clades. Contrary to documentation, Rproml() returns an unrooted tree. @@ -163,7 +189,7 @@ plot(apsTree) ape::nodelabels(cex = 0.5, frame = "circle") ape::tiplabels(cex = 0.5, frame = "rect") -# The outgroup of the tree is tip "11" in my sample tree, it may be a different +# The outgroup of the tree (KILA ESCCO) is tip "11" in my sample tree, it may be a different # number in yours. Substitute the correct node number below for "outgroup". apsTree <- ape::root(apsTree, outgroup = 11, resolve.root = TRUE) plot(apsTree) @@ -193,7 +219,7 @@ plot(apsTree, cex = 0.7, root.edge = TRUE) ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866") -# == 2.2 Rotating Clades =================================================== +# == 3.2 Rotating Clades =================================================== # To interpret the tree, it is useful to rotate the clades so that they appear # in the order expected from the cladogram of species. @@ -206,30 +232,66 @@ plot(ape::rotate(apsTree, node = 13), no.margin = TRUE, root.edge = TRUE) ape::nodelabels(node = 13, cex = 0.7, bg = "#88ff66") # Note that the species at the bottom of the clade descending from node # 17 is now plotted at the top. -layout(matrix(1), widths = 1.0, heights = 1.0) -# ... or we can plot the tree so it corresponds as well as possible to a -# predefined tip ordering. Here we use the ordering that phyloT has returned -# for the species tree. +par(PAR) # reset graphics state -# (Nb. we need to reverse the ordering for the plot. This is why we use the -# expression [nOrg:1] below instead of using the vector directly.) +# ... or we can rearrange the tree so it corresponds as well as possible to a +# predefined tip ordering. Here we use the ordering that taxize:: has inferred +# from the NCBI taxonomic classification. nOrg <- length(apsTree$tip.label) -layout(matrix(1:2, 1, 2)) plot(fungiTree, - no.margin = TRUE, root.edge = TRUE) + no.margin = FALSE, root.edge = TRUE) ape::nodelabels(text = fungiTree$node.label, cex = 0.5, adj = 0.2, bg = "#D4F2DA") -plot(ape::rotateConstr(apsTree, apsTree$tip.label[nOrg:1]), +# These are the fungi tree tips ... +fungiTree$tip.label +# ... and their order is determined by the edge-list that is stored in +fungiTree$edge +# which edges join the tips? +ape::tiplabels(cex = 0.5, frame = "rect") +# as you can see, the tips (range [1:nOrg] ) are in column 2 and they are +# ordered from bottom to top. +# And each tip number is the index of the species in the tip.label vector. So we can take column 2, subset it, and use it to get a list of species in the order of the tree ... + +sel <- fungiTree$edge[ , 2 ] <= nOrg +( oSp <- fungiTree$tip.label[fungiTree$edge[sel , 2 ]] ) + +# Now, here are the genes of the apsTree tips ... +apsTree$tip.label + +# ... and the "constraint" we need for reordering, according to the help page +# of ape::rotateConstr(), is "a vector specifying the order of the tips as they +# should appear (from bottom to top)". Thus we need to add the "MBP1_" prefix to our vector +oSp <- gsub("^", "MBP1_", oSp) +( oSp <- gsub("MBP1_ESSCO", "KILA_ESCCO", oSp) ) + +# Then we can plot the two trees to compare: the fungi- tree +par(PAR) # reset graphics state +layout(matrix(1:2, 1, 2)) +plot(fungiTree, + no.margin = TRUE, + root.edge = TRUE) +ape::nodelabels(text = fungiTree$node.label, + cex = 0.5, + adj = 0.2, + bg = "#D4F2DA") + +# and the re-organized apsesTree ... +plot(ape::rotateConstr(apsTree, constraint = oSp[]), no.margin = TRUE, root.edge = TRUE) -ape::add.scale.bar(length = 0.5) -layout(matrix(1), widths = 1.0, heights = 1.0) + +par(PAR) # reset graphics state + +# As you can see, the reordering is not perfect, since the topologies are +# different, mostly due to the unresolved nodes in the reference tree. One +# could play with that ... + # Task: Study the two trees and consider their similarities and differences. # What do you expect? What do you find? Note that this is not a "mixed" @@ -240,11 +302,11 @@ layout(matrix(1), widths = 1.0, heights = 1.0) # branches would you need to remove and reinsert elsewhere to get the # same topology as the species tree? -# In order to quantiofy how different these tow trees are, we need to compute +# In order to quantify how different these two trees are, we need to compute # tree distances. -# == 2.3 Computing tree distances ========================================== +# == 3.3 Computing tree distances ========================================== # Many superb phylogeny tools are contributed by the phangorn package. @@ -262,6 +324,7 @@ if (! requireNamespace("phangorn", quietly = TRUE)) { apsTree2 <- apsTree apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label) + # phangorn provides several functions to compute tree-differences (and there # is a _whole_ lot of theory on how to compare trees). treedist() returns the # "symmetric difference" @@ -280,33 +343,41 @@ ape::rtree(n = length(apsTree2$tip.label), # number of tips # fungiTree has none, so we can't # compare them anyway. +# (Note the warning message about non-binary trees; we'll suppress that later +# by wrapping the function call in supressMessages(); we don't want to +# print it 10,000 times :-) + + # Let's compute some random trees this way, calculate the distances to # fungiTree, and then compare the values we get for apsTree2. The random # trees are provided by ape::rtree(). -N <- 10000 # takes about 15 seconds +N <- 10000 # takes about 15 seconds, and we'll use the pBar function, + # defined in .utilities.R to keep track of where we are at: myTreeDistances <- matrix(numeric(N * 2), ncol = 2) colnames(myTreeDistances) <- c("symm", "path") set.seed(112358) for (i in 1:N) { + pBar(i, N) xTree <- ape::rtree(n = length(apsTree2$tip.label), rooted = TRUE, tip.label = apsTree2$tip.label, br = NULL) - myTreeDistances[i, ] <- phangorn::treedist(fungiTree, xTree) + myTreeDistances[i, ] <- suppressMessages(phangorn::treedist(fungiTree, xTree)) } set.seed(NULL) # reset the random number generator table(myTreeDistances[, "symm"]) -(symmObs <- phangorn::treedist(fungiTree, apsTree2)[1]) +( symmObs <- phangorn::treedist(fungiTree, apsTree2)[1] ) # Random events less-or-equal to observation, divided by total number of # events gives us the empirical p-value. cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n", (sum(myTreeDistances[ , "symm"] <= symmObs) + 1) / (N + 1))) +par(PAR) # reset graphics state hist(myTreeDistances[, "path"], col = "aliceblue", main = "Distances of random Trees to fungiTree")