Updates, and add treedist()

2017-11-01 23:30:01 -04:00
parent 0da30ec109
commit 1d6170f417
3 changed files with 163 additions and 28 deletions
--- a/BIN-PHYLO-Data_preparation.R
+++ b/BIN-PHYLO-Data_preparation.R
@@ -5,7 +5,7 @@
 #
 # Version:  1.0
 #
-# Date:     2017  10.  31
+# Date:     2017  10  31
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
@@ -24,6 +24,20 @@
 #
 # ==============================================================================

+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                               Line
+#TOC> ---------------------------------------------------
+#TOC>   1        Preparations                          41
+#TOC>   2        Fetching sequences                    78
+#TOC>   3        Multiple Sequence Alignment          119
+#TOC>   4        Reviewing and Editing Alignments     136
+#TOC>   4.1      Masking workflow                     152
+#TOC> 
+#TOC> ==========================================================================
+
+
 # =    1  Preparations  ========================================================


@@ -61,18 +75,7 @@ if (! require(msa, quietly=TRUE)) {
 #  data(package = "msa")   # available datasets


-if (! require(stringr, quietly=TRUE)) {
-  install.packages("stringr")
-  library(stringr)
-}
-# Package information:
-#  library(help=stringr)       # basic information
-#  browseVignettes("stringr")  # available vignettes
-#  data(package = "stringr")   # available datasets
-
-
-
-# = 1      Fetching sequences
+# =    2  Fetching sequences  ==================================================


 # myDB contains the ten Mbp1 orthologues from the reference species and the Mbp1
@@ -113,7 +116,7 @@ names(APSI)[length(APSI)] <- "KILA_ESCCO"
 tail(APSI)


-# = 1 Multiple Sequence Alignment
+# =    3  Multiple Sequence Alignment  =========================================

 # This vector of sequences with named elements fulfills the requirements to be
 # imported as a Biostrings object - an AAStringSet - which we need as input for
@@ -130,7 +133,7 @@ writeALN(APSESMsa)
 # What do you think? Is this a good alignment for phylogenetic inference?


-# = 1 Reviewing and Editing Alignments
+# =    4  Reviewing and Editing Alignments  ====================================


 # Head back to the Wiki page for this unit and read up on the background
@@ -146,7 +149,7 @@ writeALN(APSESMsa)
 # go through the matrix, column by column and decide
 # whether we want to include that column.

-# = 1.1 Masking workflow
+# ==   4.1  Masking workflow  ==================================================

 # get the length of the alignment
 (lenAli <- APSESMsa@unmasked@ranges@width[1])
--- a/BIN-PHYLO-Tree_analysis.R
+++ b/BIN-PHYLO-Tree_analysis.R
@@ -24,7 +24,21 @@
 #
 # ==============================================================================

-# = 1 ___Section___
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                       Line
+#TOC> -------------------------------------------
+#TOC>   1        ___Section___                 38
+#TOC>   2        Tree Analysis                 77
+#TOC>   2.1      Rooting Trees                136
+#TOC>   2.2      Rotating Clades              182
+#TOC>   2.3      Computing tree distances     229
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  ___Section___  =======================================================


 if (!require(Rphylip, quietly=TRUE)) {
@@ -63,7 +77,7 @@ nodelabels(text=orgTree$node.label, cex=0.6, adj=0.2, bg="#D4F2DA")
 # species tree.


-# = 1 Tree Analysis
+# =    2  Tree Analysis  =======================================================


 # 1.1  Visualizing your tree
@@ -122,7 +136,7 @@ Ntip(apsTree)
 # Finally, write the tree to console in Newick format
 write.tree(apsTree)

-# = 1.1 Rooting Trees
+# ==   2.1  Rooting Trees  =====================================================

 # In order to analyse the tree, it is helpful to root it first and reorder its
 # clades. Contrary to documentation, Rproml() returns an unrooted tree.
@@ -168,7 +182,7 @@ plot(apsTree, cex=0.7, root.edge=TRUE)
 nodelabels(text="MRCA", node=12, cex=0.5, adj=0.8, bg="#ff8866")


-# = 1.1 Rotating Clades
+# ==   2.2  Rotating Clades  ===================================================

 # To interpret the tree, it is useful to rotate the clades so that they appear
 # in the order expected from the cladogram of species.
@@ -202,11 +216,104 @@ plot(rotateConstr(apsTree, apsTree$tip.label[nOrg:1]),
 add.scale.bar(length=0.5)
 layout(matrix(1), widths=1.0, heights=1.0)

-# Study the two trees and consider their similarities and differences. What do
-# you expect? What do you find? Note that this is not a "mixed" gene tree yet,
-# since it contains only a single gene for the species we considered. All of the
-# branch points in this tree are speciation events.
+# Task: Study the two trees and consider their similarities and differences.
+#         What do you expect? What do you find? Note that this is not a "mixed"
+#         gene tree yet, since it contains only a single gene for the species
+#         we considered. All of the branch points in this tree are speciation
+#         events. Thus the gene tree should have the same topology as the
+#         species tree. Does it? Are the differences important? How many
+#         branches would you need to remove and reinsert elsewhere to get the
+#         same topology as the species tree?

+# In order to quantiofy how different these tow trees are, we need to compute
+# tree distances.
+
+
+# ==   2.3  Computing tree distances  ==========================================
+
+
+# Many superb phylogeny tools are contributed by the phangorn package.
+
+if (!require(phangorn, quietly=TRUE)) {
+  install.packages("phangorn")
+  library(phangorn)
+}
+# Package information:
+#  library(help = phangorn)       # basic information
+#  browseVignettes("phangorn")    # available vignettes
+#  data(package = "phangorn")     # available datasets
+
+# To compare two trees, they must have the same tip labels. We delete "MBP1_" or
+# "KILA_" from the existing tip labels in a copy of our APSES domain tree.
+apsTree2 <- apsTree
+apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label)
+
+# phangorn provides several functions to compute tree-differences (and there
+# is a _whole_ lot of theory on how to compare trees). treedist() returns the
+# "symmetric difference"
+treedist(fungiTree, apsTree2, check.labels = TRUE)
+
+# Numbers. What do they mean? How much more similar is our apsTree to the
+# (presumably) ground truth of fungiTree than a random tree would be?
+# The ape package (which was loaded with RPhylip) provides the function rtree()
+# to compute random trees.
+
+rtree(n = length(apsTree2$tip.label),  # number of tips
+      rooted = TRUE,                   # we rooted the tree above,
+                                       #  and fungiTree is rooted anyway
+      tip.label = apsTree2$tip.label,  # use the apsTree2 labels
+      br = NULL)                       # don't generate branch lengths since
+                                       #   fungiTree has none, so we can't
+                                       #   compare them anyway.
+
+# Let's compute some random trees this way, calculate the distances to
+# fungiTree, and then compare the values we get for apsTree2:
+
+set.seed(112358)
+N <- 10000  # takes about 15 seconds
+myTreeDistances <- matrix(numeric(N * 2), ncol = 2)
+colnames(myTreeDistances) <- c("symm", "path")
+
+for (i in 1:N) {
+  xTree <- rtree(n = length(apsTree2$tip.label),
+                 rooted = TRUE,
+                 tip.label = apsTree2$tip.label,
+                 br = NULL)
+  myTreeDistances[i, ] <- treedist(fungiTree, xTree)
+}
+
+table(myTreeDistances[, "symm"])
+
+(symmObs <- treedist(fungiTree, apsTree2)[1])
+
+# Random events less-or-equal to observation, divided by total number of
+# events gives us the empirical p-value.
+cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n",
+            (sum(myTreeDistances[ , "symm"] <= symmObs) + 1) / (N + 1)))
+
+hist(myTreeDistances[, "path"],
+     col = "aliceblue",
+     main = "Distances of random Trees to fungiTree")
+(pathObs <- treedist(fungiTree, apsTree2)[2])
+abline(v = pathObs, col = "chartreuse")
+
+# Random events less-or-equal to observation, divided by total number of
+# events gives us the empirical p-value.
+cat(sprintf("\nEmpirical p-value for path diff. of observed tree is %1.4f\n",
+            (sum(myTreeDistances[ , "path"] <= symmObs) + 1) / (N + 1)))
+
+# Indeed, our apsTree is _very_ much more similar to the species tree than
+# we would expect by random chance.
+
+# What do we gain from that analysis? Analyzing the tree we get from a single
+# gene of orthologous sequences is a positive control in our computational
+# experiment. If these genes are indeed orthologues, a correct tree-building
+# program ought to give us a tree that exactly matches the species tree.
+# Evaluating how far off we are from the known correct result gives us a way to
+# validate our workflow and our algorithm. If we can't get that right, we can't
+# expect to get "real" data right either. Employing such positive controls in
+# every computational experiment is essential for research. Not doing so is
+# Cargo Cult Bioinformatics.


 # [END]
--- a/BIN-PHYLO-Tree_building.R
+++ b/BIN-PHYLO-Tree_building.R
@@ -14,7 +14,8 @@
 #
 #
 # TODO:
-#
+#           Add MrBayes
+# https://cran.r-project.org/web/packages/phangorn/vignettes/IntertwiningTreesAndNetworks.html
 #
 # == DO NOT SIMPLY  source()  THIS FILE! =======================================
 #
@@ -25,8 +26,22 @@
 # ==============================================================================


+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                                   Line
+#TOC> -------------------------------------------------------
+#TOC>   1        Calculating Trees                         43
+#TOC>   1.1      PROMLPATH ...                             64
+#TOC>   1.1.1    ... on the Mac                            69
+#TOC>   1.1.2    ... on Windows                            80
+#TOC>   1.1.3    ... on Linux                              94
+#TOC>   1.1.4    Confirming PROMLPATH                      99
+#TOC>   1.2      Building a maxiiimum likelihood tree     108
+#TOC> 
+#TOC> ==========================================================================

-# = 1  Calculating Trees
+
+# =    1  Calculating Trees  ===================================================


 # Follow the instructions found at phylip's home on the Web to install. If you
@@ -47,11 +62,12 @@ if (!require(Rphylip, quietly=TRUE)) {
 # This will install RPhylip, as well as its dependency, the package "ape".


-
+# ==   1.1  PROMLPATH ...  =====================================================
 # The next part may be tricky. You will need to figure out where
 # on your computer Phylip has been installed and define the path
 # to the proml program that calculates a maximum-likelihood tree.

+# ===  1.1.1  ... on the Mac                      
 # On the Mac, the standard installation places a phylip folder
 # in the /Applications directory. That folder contains all the
 # individual phylip programs as <name>.app files. These are not
@@ -62,6 +78,7 @@ if (!require(Rphylip, quietly=TRUE)) {
 # directly to that subdirectory to find the program it needs:
 # PROMLPATH <- "/Applications/phylip-3.695/exe/proml.app/Contents/MacOS"

+# ===  1.1.2  ... on Windows                      
 # On Windows you need to know where the rograms have been installed, and you
 # need to specify a path that is correct for the Windows OS. Find the folder
 # that is named "exe", and right-click to inspect its properties. The path
@@ -75,10 +92,12 @@ if (!require(Rphylip, quietly=TRUE)) {
 # I have heard that your path must not contain spaces, and it is prudent to
 # avoid other special characters as well.

+# ===  1.1.3  ... on Linux                        
 # If you are running Linux I trust you know what to do. It's probably
 # something like
 # PROMLPATH <- "/usr/local/phylip-3.695/bin"

+# ===  1.1.4  Confirming PROMLPATH                
 # Confirm that the settings are right.
 PROMLPATH                # returns the path
 list.dirs(PROMLPATH)     # returns the directories in that path
@@ -87,6 +106,7 @@ list.files(PROMLPATH)    # lists the files [1] "proml"   "proml.command"
 # If "proml" is NOT among the files that the last command returns, you
 # can't continue. Ask on the mailing list for advice.

+# ==   1.2  Building a maxiiimum likelihood tree  ==============================
 # Now read the mfa file you have saved in the BIB-PHYLO-Data_preparation unit,
 # as a "proseq" object with the read.protein() function of the RPhylip package:

@@ -94,11 +114,16 @@ apsIn <- read.protein("APSESphyloSet.mfa")

 # ... and you are ready to build a tree.

+# There are many fast options in PHYLIP - we will use the most _accurate_ one
+# that it has: proml, a maximum-likelihood tree building program for protein
+# data.
+
 # Building maximum-likelihood trees can eat as much computer time
 # as you can throw at it. Calculating a tree of 48 APSES domains
 # with default parameters of Rproml() runs for more than half a day
 # on my computer. But we have only twelve sequences here, so the
-# process will take us about 5 to 10 minutes.
+# process will take us about 5 to 10 minutes. Run this, and anjoy a good cup
+# of coffee while you are waiting.

 apsTree <- Rproml(apsIn, path=PROMLPATH)