diff --git a/BIN-PHYLO-Data_preparation.R b/BIN-PHYLO-Data_preparation.R index e95dc1e..697a2d5 100644 --- a/BIN-PHYLO-Data_preparation.R +++ b/BIN-PHYLO-Data_preparation.R @@ -5,7 +5,7 @@ # # Version: 1.0 # -# Date: 2017 10. 31 +# Date: 2017 10 31 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: @@ -24,6 +24,20 @@ # # ============================================================================== + +#TOC> ========================================================================== +#TOC> +#TOC> Section Title Line +#TOC> --------------------------------------------------- +#TOC> 1 Preparations 41 +#TOC> 2 Fetching sequences 78 +#TOC> 3 Multiple Sequence Alignment 119 +#TOC> 4 Reviewing and Editing Alignments 136 +#TOC> 4.1 Masking workflow 152 +#TOC> +#TOC> ========================================================================== + + # = 1 Preparations ======================================================== @@ -61,18 +75,7 @@ if (! require(msa, quietly=TRUE)) { # data(package = "msa") # available datasets -if (! require(stringr, quietly=TRUE)) { - install.packages("stringr") - library(stringr) -} -# Package information: -# library(help=stringr) # basic information -# browseVignettes("stringr") # available vignettes -# data(package = "stringr") # available datasets - - - -# = 1 Fetching sequences +# = 2 Fetching sequences ================================================== # myDB contains the ten Mbp1 orthologues from the reference species and the Mbp1 @@ -113,7 +116,7 @@ names(APSI)[length(APSI)] <- "KILA_ESCCO" tail(APSI) -# = 1 Multiple Sequence Alignment +# = 3 Multiple Sequence Alignment ========================================= # This vector of sequences with named elements fulfills the requirements to be # imported as a Biostrings object - an AAStringSet - which we need as input for @@ -130,7 +133,7 @@ writeALN(APSESMsa) # What do you think? Is this a good alignment for phylogenetic inference? -# = 1 Reviewing and Editing Alignments +# = 4 Reviewing and Editing Alignments ==================================== # Head back to the Wiki page for this unit and read up on the background @@ -146,7 +149,7 @@ writeALN(APSESMsa) # go through the matrix, column by column and decide # whether we want to include that column. -# = 1.1 Masking workflow +# == 4.1 Masking workflow ================================================== # get the length of the alignment (lenAli <- APSESMsa@unmasked@ranges@width[1]) diff --git a/BIN-PHYLO-Tree_analysis.R b/BIN-PHYLO-Tree_analysis.R index c0799a9..bcaab5d 100644 --- a/BIN-PHYLO-Tree_analysis.R +++ b/BIN-PHYLO-Tree_analysis.R @@ -24,7 +24,21 @@ # # ============================================================================== -# = 1 ___Section___ + +#TOC> ========================================================================== +#TOC> +#TOC> Section Title Line +#TOC> ------------------------------------------- +#TOC> 1 ___Section___ 38 +#TOC> 2 Tree Analysis 77 +#TOC> 2.1 Rooting Trees 136 +#TOC> 2.2 Rotating Clades 182 +#TOC> 2.3 Computing tree distances 229 +#TOC> +#TOC> ========================================================================== + + +# = 1 ___Section___ ======================================================= if (!require(Rphylip, quietly=TRUE)) { @@ -63,7 +77,7 @@ nodelabels(text=orgTree$node.label, cex=0.6, adj=0.2, bg="#D4F2DA") # species tree. -# = 1 Tree Analysis +# = 2 Tree Analysis ======================================================= # 1.1 Visualizing your tree @@ -122,7 +136,7 @@ Ntip(apsTree) # Finally, write the tree to console in Newick format write.tree(apsTree) -# = 1.1 Rooting Trees +# == 2.1 Rooting Trees ===================================================== # In order to analyse the tree, it is helpful to root it first and reorder its # clades. Contrary to documentation, Rproml() returns an unrooted tree. @@ -168,7 +182,7 @@ plot(apsTree, cex=0.7, root.edge=TRUE) nodelabels(text="MRCA", node=12, cex=0.5, adj=0.8, bg="#ff8866") -# = 1.1 Rotating Clades +# == 2.2 Rotating Clades =================================================== # To interpret the tree, it is useful to rotate the clades so that they appear # in the order expected from the cladogram of species. @@ -202,11 +216,104 @@ plot(rotateConstr(apsTree, apsTree$tip.label[nOrg:1]), add.scale.bar(length=0.5) layout(matrix(1), widths=1.0, heights=1.0) -# Study the two trees and consider their similarities and differences. What do -# you expect? What do you find? Note that this is not a "mixed" gene tree yet, -# since it contains only a single gene for the species we considered. All of the -# branch points in this tree are speciation events. +# Task: Study the two trees and consider their similarities and differences. +# What do you expect? What do you find? Note that this is not a "mixed" +# gene tree yet, since it contains only a single gene for the species +# we considered. All of the branch points in this tree are speciation +# events. Thus the gene tree should have the same topology as the +# species tree. Does it? Are the differences important? How many +# branches would you need to remove and reinsert elsewhere to get the +# same topology as the species tree? +# In order to quantiofy how different these tow trees are, we need to compute +# tree distances. + + +# == 2.3 Computing tree distances ========================================== + + +# Many superb phylogeny tools are contributed by the phangorn package. + +if (!require(phangorn, quietly=TRUE)) { + install.packages("phangorn") + library(phangorn) +} +# Package information: +# library(help = phangorn) # basic information +# browseVignettes("phangorn") # available vignettes +# data(package = "phangorn") # available datasets + +# To compare two trees, they must have the same tip labels. We delete "MBP1_" or +# "KILA_" from the existing tip labels in a copy of our APSES domain tree. +apsTree2 <- apsTree +apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label) + +# phangorn provides several functions to compute tree-differences (and there +# is a _whole_ lot of theory on how to compare trees). treedist() returns the +# "symmetric difference" +treedist(fungiTree, apsTree2, check.labels = TRUE) + +# Numbers. What do they mean? How much more similar is our apsTree to the +# (presumably) ground truth of fungiTree than a random tree would be? +# The ape package (which was loaded with RPhylip) provides the function rtree() +# to compute random trees. + +rtree(n = length(apsTree2$tip.label), # number of tips + rooted = TRUE, # we rooted the tree above, + # and fungiTree is rooted anyway + tip.label = apsTree2$tip.label, # use the apsTree2 labels + br = NULL) # don't generate branch lengths since + # fungiTree has none, so we can't + # compare them anyway. + +# Let's compute some random trees this way, calculate the distances to +# fungiTree, and then compare the values we get for apsTree2: + +set.seed(112358) +N <- 10000 # takes about 15 seconds +myTreeDistances <- matrix(numeric(N * 2), ncol = 2) +colnames(myTreeDistances) <- c("symm", "path") + +for (i in 1:N) { + xTree <- rtree(n = length(apsTree2$tip.label), + rooted = TRUE, + tip.label = apsTree2$tip.label, + br = NULL) + myTreeDistances[i, ] <- treedist(fungiTree, xTree) +} + +table(myTreeDistances[, "symm"]) + +(symmObs <- treedist(fungiTree, apsTree2)[1]) + +# Random events less-or-equal to observation, divided by total number of +# events gives us the empirical p-value. +cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n", + (sum(myTreeDistances[ , "symm"] <= symmObs) + 1) / (N + 1))) + +hist(myTreeDistances[, "path"], + col = "aliceblue", + main = "Distances of random Trees to fungiTree") +(pathObs <- treedist(fungiTree, apsTree2)[2]) +abline(v = pathObs, col = "chartreuse") + +# Random events less-or-equal to observation, divided by total number of +# events gives us the empirical p-value. +cat(sprintf("\nEmpirical p-value for path diff. of observed tree is %1.4f\n", + (sum(myTreeDistances[ , "path"] <= symmObs) + 1) / (N + 1))) + +# Indeed, our apsTree is _very_ much more similar to the species tree than +# we would expect by random chance. + +# What do we gain from that analysis? Analyzing the tree we get from a single +# gene of orthologous sequences is a positive control in our computational +# experiment. If these genes are indeed orthologues, a correct tree-building +# program ought to give us a tree that exactly matches the species tree. +# Evaluating how far off we are from the known correct result gives us a way to +# validate our workflow and our algorithm. If we can't get that right, we can't +# expect to get "real" data right either. Employing such positive controls in +# every computational experiment is essential for research. Not doing so is +# Cargo Cult Bioinformatics. # [END] diff --git a/BIN-PHYLO-Tree_building.R b/BIN-PHYLO-Tree_building.R index c916126..aea7482 100644 --- a/BIN-PHYLO-Tree_building.R +++ b/BIN-PHYLO-Tree_building.R @@ -14,7 +14,8 @@ # # # TODO: -# +# Add MrBayes +# https://cran.r-project.org/web/packages/phangorn/vignettes/IntertwiningTreesAndNetworks.html # # == DO NOT SIMPLY source() THIS FILE! ======================================= # @@ -25,8 +26,22 @@ # ============================================================================== +#TOC> ========================================================================== +#TOC> +#TOC> Section Title Line +#TOC> ------------------------------------------------------- +#TOC> 1 Calculating Trees 43 +#TOC> 1.1 PROMLPATH ... 64 +#TOC> 1.1.1 ... on the Mac 69 +#TOC> 1.1.2 ... on Windows 80 +#TOC> 1.1.3 ... on Linux 94 +#TOC> 1.1.4 Confirming PROMLPATH 99 +#TOC> 1.2 Building a maxiiimum likelihood tree 108 +#TOC> +#TOC> ========================================================================== -# = 1 Calculating Trees + +# = 1 Calculating Trees =================================================== # Follow the instructions found at phylip's home on the Web to install. If you @@ -47,11 +62,12 @@ if (!require(Rphylip, quietly=TRUE)) { # This will install RPhylip, as well as its dependency, the package "ape". - +# == 1.1 PROMLPATH ... ===================================================== # The next part may be tricky. You will need to figure out where # on your computer Phylip has been installed and define the path # to the proml program that calculates a maximum-likelihood tree. +# === 1.1.1 ... on the Mac # On the Mac, the standard installation places a phylip folder # in the /Applications directory. That folder contains all the # individual phylip programs as .app files. These are not @@ -62,6 +78,7 @@ if (!require(Rphylip, quietly=TRUE)) { # directly to that subdirectory to find the program it needs: # PROMLPATH <- "/Applications/phylip-3.695/exe/proml.app/Contents/MacOS" +# === 1.1.2 ... on Windows # On Windows you need to know where the rograms have been installed, and you # need to specify a path that is correct for the Windows OS. Find the folder # that is named "exe", and right-click to inspect its properties. The path @@ -75,10 +92,12 @@ if (!require(Rphylip, quietly=TRUE)) { # I have heard that your path must not contain spaces, and it is prudent to # avoid other special characters as well. +# === 1.1.3 ... on Linux # If you are running Linux I trust you know what to do. It's probably # something like # PROMLPATH <- "/usr/local/phylip-3.695/bin" +# === 1.1.4 Confirming PROMLPATH # Confirm that the settings are right. PROMLPATH # returns the path list.dirs(PROMLPATH) # returns the directories in that path @@ -87,6 +106,7 @@ list.files(PROMLPATH) # lists the files [1] "proml" "proml.command" # If "proml" is NOT among the files that the last command returns, you # can't continue. Ask on the mailing list for advice. +# == 1.2 Building a maxiiimum likelihood tree ============================== # Now read the mfa file you have saved in the BIB-PHYLO-Data_preparation unit, # as a "proseq" object with the read.protein() function of the RPhylip package: @@ -94,11 +114,16 @@ apsIn <- read.protein("APSESphyloSet.mfa") # ... and you are ready to build a tree. +# There are many fast options in PHYLIP - we will use the most _accurate_ one +# that it has: proml, a maximum-likelihood tree building program for protein +# data. + # Building maximum-likelihood trees can eat as much computer time # as you can throw at it. Calculating a tree of 48 APSES domains # with default parameters of Rproml() runs for more than half a day # on my computer. But we have only twelve sequences here, so the -# process will take us about 5 to 10 minutes. +# process will take us about 5 to 10 minutes. Run this, and anjoy a good cup +# of coffee while you are waiting. apsTree <- Rproml(apsIn, path=PROMLPATH)