add PHYLO units

2017-11-01 09:44:24 -04:00
parent 7cc2853d00
commit 8fe794cf33
3 changed files with 206 additions and 208 deletions
--- a/BIN-PHYLO-Data_preparation.R
+++ b/BIN-PHYLO-Data_preparation.R
@@ -3,108 +3,138 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-PHYLO-Data_preparation unit.
 #
-# Version:  0.1
+# Version:  1.0
 #
-# Date:     2017  08  28
+# Date:     2017  10.  31
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
+#           1.0    First 2017 version
 #           0.1    First code copied from 2016 material.
-
+#
 #
 # TODO:
 #
 #
 # == DO NOT SIMPLY  source()  THIS FILE! =======================================
-
+#
 # If there are portions you don't understand, use R's help system, Google for an
 # answer, or ask your instructor. Don't continue if you don't understand what's
 # going on. That's not how it works ...
-
-# ==============================================================================
-
-# = 1 ___Section___
-
-# ==============================================================================
-#        PART ONE: Choosing sequences
-# ==============================================================================
-
-# Start by loading libraries. You already have the packages installed.
-library(Biostrings)
-library(msa)
-library(stringr)
-
-# What is the latest version of myDB that you have saved?
-list.files(pattern = "myDB.*")
-
-# ... load it (probably myDB.05.RData - if not, change the code below).
-load("myDB.05.RData")
-
-# The database contains the ten Mbp1 orthologues from the reference species
-# and the Mbp1 RBM for MYSPE.
 #
-# We will construct a phylogenetic tree from the proteins' APSES domains.
-# You have annotated their ranges as a feature.
+# ==============================================================================

-# Collect APSES domain sequences from your database. The function
-# dbGetFeatureSequence() retrieves the sequence that is annotated for a feature
-# from its start and end coordinates. Try:
+# =    1  Preparations  ========================================================

-dbGetFeatureSequence(myDB, "MBP1_SACCE", "APSES fold")

-# Lets put all APSES sequences into a vector:
-APSESnames <- myDB$protein$name[grep("^MBP1_", myDB$protein$name)]
-APSES <- character(length(APSESnames))
+# You need to reload your protein database, including changes that might have
+# been made to the reference files. If you have worked with the prerequiste
+# units, you should have a script named "makeProteinDB.R" that will create the
+# myDB object with a protein and feature database. Ask for advice if not.
+source("makeProteinDB.R")

-for (i in 1:length(APSESnames)) {
-  APSES[i] <- dbGetFeatureSequence(myDB, APSESnames[i], "APSES fold")
+# Load packages we need
+
+if (! require(Biostrings, quietly=TRUE)) {
+  if (! exists("biocLite")) {
+    source("https://bioconductor.org/biocLite.R")
+  }
+  biocLite("Biostrings")
+  library(Biostrings)
+}
+# Package information:
+#  library(help = Biostrings)       # basic information
+#  browseVignettes("Biostrings")    # available vignettes
+#  data(package = "Biostrings")     # available datasets
+
+
+if (! require(msa, quietly=TRUE)) {
+  if (! exists("biocLite")) {
+    source("https://bioconductor.org/biocLite.R")
+  }
+  biocLite("msa")
+  library(msa)
+}
+# Package information:
+#  library(help=msa)       # basic information
+#  browseVignettes("msa")  # available vignettes
+#  data(package = "msa")   # available datasets
+
+
+if (! require(stringr, quietly=TRUE)) {
+  install.packages("stringr")
+  library(stringr)
+}
+# Package information:
+#  library(help=stringr)       # basic information
+#  browseVignettes("stringr")  # available vignettes
+#  data(package = "stringr")   # available datasets
+
+
+
+# = 1      Fetching sequences
+
+
+# myDB contains the ten Mbp1 orthologues from the reference species and the Mbp1
+# RBM for MYSPE. We will construct a phylogenetic tree from the proteins' APSES
+# domains. You have annotated their ranges as a feature. The following code
+# retrieves the sequences from myDB. You have seen similar code in other units.
+
+sel <- grep("^MBP1_", myDB$protein$name)
+(proNames <- myDB$protein$name[sel])
+(proIDs <- myDB$protein$ID[sel])
+
+(sel <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
+(fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID %in% proIDs & # %in% !
+                              myDB$annotation$featureID == sel])      #  ==  !
+                                                                      # Why?
+APSI <- character(length(fanIDs))
+
+for (i in seq_along(fanIDs)) {
+  sel   <- myDB$annotation$ID == fanIDs[i]  # get the feature row index
+  proID <- myDB$annotation$proteinID[sel]   # get its protein ID
+  start <- myDB$annotation$start[sel]       # get start ...
+  end   <- myDB$annotation$end[sel]         # ... and end
+
+  sel <- myDB$protein$ID == proID           # get the protein row index ...
+                                            # ... and the sequence
+  APSI[i] <- substring(myDB$protein$sequence[sel], start, end)
+  names(APSI)[i] <- (myDB$protein$name[sel])
 }

-# Let's name the rows of our vector with the BiCode part of the protein name.
-# This is important so we can keep track of which sequence is which. We use the
-# gsub() funcion to substitute "" for "MBP1_", thereby deleting this prefix.
-names(APSES) <- gsub("^MBP1_", "", APSESnames)
-
-# inspect the result: what do you expect? Is this what you expect?
-head(APSES)
+head(APSI)

 # Let's add the E.coli Kila-N domain sequence as an outgroup, for rooting our
-# phylogegetic tree (see the Assignment Course Wiki page for details on the
-# sequence).
+# phylogenetic tree (see the unit's Wiki page for details on the sequence).

-APSES[length(APSES) + 1] <-
-  "IDGEIIHLRAKDGYINATSMCRTAGKLLSDYTRLKTTQEFFDELSRDMGIPISELIQSFKGGRPENQGTWVHPDIAINLAQ"
-names(APSES)[length(APSES)] <- "ESCCO"
+APSI <- c(APSI,
+"IDGEIIHLRAKDGYINATSMCRTAGKLLSDYTRLKTTQEFFDELSRDMGIPISELIQSFKGGRPENQGTWVHPDIAINLAQ")
+names(APSI)[length(APSI)] <- "KILA_ESCCO"
+tail(APSI)


-# ==============================================================================
-#        PART TWO: Multiple sequence alignment
-# ==============================================================================
+# = 1 Multiple Sequence Alignment

 # This vector of sequences with named elements fulfills the requirements to be
 # imported as a Biostrings object - an AAStringSet - which we need as input for
 # the MSA algorithms in Biostrings.
 #

-APSESSeqSet <- AAStringSet(APSES)
-
-APSESMsaSet <- msaMuscle(APSESSeqSet, order = "aligned")
+APSESSet <- AAStringSet(APSI)
+APSESMsa <- msaMuscle(APSESSet, order = "aligned")

 # inspect the alignment.
-writeSeqSet(APSESMsaSet, format = "ali")
+writeALN(APSESMsa)


 # What do you think? Is this a good alignment for phylogenetic inference?

-# ==============================================================================
-#        PART THREE: reviewing and editing alignments
-# ==============================================================================

-# Head back to the assignment 7 course wiki page and read up on the background
+# = 1 Reviewing and Editing Alignments
+
+
+# Head back to the Wiki page for this unit and read up on the background
 # first.
-#
-
-

 # Let's mask out all columns that have observations for
 # less than 1/3 of the sequences in the dataset. This
@@ -116,82 +146,55 @@ writeSeqSet(APSESMsaSet, format = "ali")
 # go through the matrix, column by column and decide
 # whether we want to include that column.

-# Step 1. Go through this by hand...
+# = 1.1 Masking workflow

 # get the length of the alignment
-lenAli <- APSESMsaSet@unmasked@ranges@width[1]
+(lenAli <- APSESMsa@unmasked@ranges@width[1])

 # initialize a matrix that can hold all characters
 # individually
-msaMatrix <- matrix(character(nrow(APSESMsaSet) * lenAli),
+msaMatrix <- matrix(character(nrow(APSESMsa) * lenAli),
                    ncol = lenAli)

 # assign the correct rownames
-rownames(msaMatrix) <- APSESMsaSet@unmasked@ranges@NAMES
-for (i in 1:nrow(APSESMsaSet)) {
-  seq <- as.character(APSESMsaSet@unmasked[i])
-  msaMatrix[i, ] <- unlist(strsplit(seq, ""))
+rownames(msaMatrix) <- APSESMsa@unmasked@ranges@NAMES
+for (i in 1:nrow(APSESMsa)) {
+  msaMatrix[i, ] <- unlist(strsplit(as.character(APSESMsa@unmasked[i]), ""))
 }

 # inspect the result
-msaMatrix[1:5, ]
+msaMatrix[1:7, 1:14]

-# Now let's make a logical vector with an element
-# for each column that selects which columns should
-# be masked out.
+# Now let's make a logical vector with an element for each column that selects
+# which columns should be masked out.

-# To count the number of elements in a vector, R has
-# the table() function. For example ...
-table(msaMatrix[ , 1])
-table(msaMatrix[ , 10])
-table(msaMatrix[ , 20])
-table(msaMatrix[ , 30])
+# The number of hyphens in a column is easy to count. Consider:

+    msaMatrix[ , 20]
+    msaMatrix[ , 20] == "-"
+sum(msaMatrix[ , 20] == "-")

-# Since the return value of table() is a named vector, where
-# the name is the element that was counted in each slot,
-# we can simply get the counts for hyphens from the
-# return value of table(). We don't even need to assign
-# the result to an intermediate variable, but we
-# can attach the selection via square brackets,
-# i.e.: ["-"],  directly to the function call:
-table(msaMatrix[ , 1])["-"]
+# Thus filling our logical vector is simple:

-# ... to get the number of hyphens. And we can compare
-# whether it is eg. > 4.
-table(msaMatrix[ , 1])["-"] > 4
-
-# Thus filling our logical vector is really simple:
-
-# initialize the mask
-colMask <- logical(lenAli)
+# initialize a mask
+colMask <- logical(ncol(msaMatrix))

 # define the threshold for rejecting a column
-limit <- round(nrow(APSESMsaSet) * (2/3))
+limit <- round(nrow(APSESMsa) * (2/3))

 # iterate over all columns, and write TRUE if there are less-or-equal to "limit"
 # hyphens, FALSE if there are more.
-for (i in 1:lenAli) {
-  count <- table(msaMatrix[ , i])["-"]
-  if (is.na(count)) { # No hyphen
-    count <- 0
-  }
-  colMask[i] <- count <= limit
+for (i in 1:ncol(msaMatrix)) {
+  count <- sum(msaMatrix[ , i] == "-")
+  colMask[i] <- count <= limit # FALSE if less-or-equal to limit, TRUE if not
 }

 # inspect the mask
 colMask

-# How many positions were masked? R has a simple trick
-# to count the number of TRUE and FALSE in a logical
-# vector. If a logical TRUE or FALSE is converted into
-# a number, it becomes 1 or 0 respectively. If we use
-# the sum() function on the vector, the conversion is
-# done implicitly. Thus ...
+# How many positions were masked?
 sum(colMask)

-# ... gives the number of TRUE elements.
-
 cat(sprintf("We are masking %4.2f %% of alignment columns.\n",
            100 * (1 - (sum(colMask) / length(colMask)))))

@@ -203,46 +206,22 @@ maskedMatrix <- msaMatrix[ , colMask]
 # check:
 ncol(maskedMatrix)

-
-# ... then collapse each row back into a sequence ...
-
-apsMaskedSeq <- character()
+# ... then collapse each row of single characters back into a string ...
+APSESphyloSet <- character()
 for (i in 1:nrow(maskedMatrix)) {
-  apsMaskedSeq[i] <- paste(maskedMatrix[i, ], collapse="")
+  APSESphyloSet[i] <- paste(maskedMatrix[i, ], collapse="")
 }
-names(apsMaskedSeq) <- rownames(maskedMatrix)
-
-# ... and read it back into an AAStringSet object
-
-apsMaskedSet <- AAStringSet(apsMaskedSeq)
+names(APSESphyloSet) <- rownames(maskedMatrix)

 # inspect ...
-writeSeqSet(apsMaskedSet, format = "ali")
+writeALN(APSESphyloSet)

+# As you see, we have removed a three residue insertion from MBP1_NEUCR, and
+# several indels from the KILA_ESCCO outgroup sequence.


-# Step 2. Turn this code into a function...
-
-# Even though the procedure is simple, doing this more than once is tedious and
-# prone to errors. I have assembled the steps we just went through into a
-# function maskSet() and put it into the utilities.R file, from where it has
-# been loaded when you started this sesssion.
-
-maskSet
-
-# Check that the function gives identical results
-# to what we did before by hand:
-identical(apsMaskedSet, maskSet(APSESMsaSet))
-
-# The result must be TRUE. If it's not TRUE you have
-# an error somewhere.
-
 # We save the aligned, masked domains to a file in multi-FASTA format.
-writeSeqSet(maskSet(APSESMsaSet), file = "APSES.mfa",   format = "mfa")
-
-
-
-# = 1 Tasks
+writeMFA(APSESphyloSet, myCon = "APSESphyloSet.mfa")



--- a/BIN-PHYLO-Tree_analysis.R
+++ b/BIN-PHYLO-Tree_analysis.R
@@ -3,46 +3,70 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-PHYLO-Tree_analysis unit.
 #
-# Version:  0.1
+# Version:  1.0
 #
-# Date:     2017  08  28
+# Date:     2017  10.  31
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
+#           1.0    First 2017 version
 #           0.1    First code copied from 2016 material.
-
+#
 #
 # TODO:
 #
 #
 # == DO NOT SIMPLY  source()  THIS FILE! =======================================
-
+#
 # If there are portions you don't understand, use R's help system, Google for an
 # answer, or ask your instructor. Don't continue if you don't understand what's
 # going on. That's not how it works ...
-
+#
 # ==============================================================================

 # = 1 ___Section___

-# ==============================================================================
-#        PART FIVE: Tree analysis
-# ==============================================================================

-# A Entrez restriction command
-cat(paste(paste(c(myDB$taxonomy$ID, "83333"), "[taxid]", sep=""), collapse=" OR "))
+if (!require(Rphylip, quietly=TRUE)) {
+  install.packages("Rphylip")
+  library(Rphylip)
+}
+# Package information:
+#  library(help = Rphylip)       # basic information
+#  browseVignettes("Rphylip")    # available vignettes
+#  data(package = "Rphylip")     # available datasets

-# The Common Tree from NCBI
-# Download the EDITED phyliptree.phy
-commonTree <- read.tree("phyliptree.phy")
+
+
+# Read the species tree that you have created at the phyloT Website:
+fungiTree <- read.tree("fungiTree.txt")
+
+plot(fungiTree)
+
+# The tree produced by phyloT contains full length species names, but it would
+# be more convenient if it had bicodes instead.
+str(fungiTree)
+
+# The species names are in a vector $tip.label of this list. We can use bicode()
+# to shorten them - but note that they have underscores as word separators. Thus
+# we will use gsub("-", " ", ...) to replace the underscores with spaces.
+
+for (i in seq_along(fungiTree$tip.label)) {
+  fungiTree$tip.label[i] <- biCode(gsub("_", " ", fungiTree$tip.label[i]))
+}

 # Plot the tree
-plot(commonTree, cex=1.0, root.edge=TRUE, no.margin=TRUE)
-nodelabels(text=commonTree$node.label, cex=0.6, adj=0.2, bg="#D4F2DA")
+plot(fungiTree, cex=1.0, root.edge=TRUE, no.margin=TRUE)
+nodelabels(text=orgTree$node.label, cex=0.6, adj=0.2, bg="#D4F2DA")
+# Note that you can use the arrow buttons in the menu above the plot to scroll
+# back to plots you have created earlier - so you can reference back to the
+# species tree.


-# === Visualizing your tree ====================================================
+# = 1 Tree Analysis

+
+# 1.1  Visualizing your tree
 # The trees that are produced by Rphylip are stored as an object of class
 # "phylo". This is a class for phylogenetic trees that is widely used in the
 # community, practically all R phylogenetics packages will options to read and
@@ -51,21 +75,25 @@ nodelabels(text=commonTree$node.label, cex=0.6, adj=0.2, bg="#D4F2DA")
 # trees in Newick format and visualize them elsewhere.

 # The "phylo" class object is one of R's "S3" objects and methods to plot and
-# print it have been added to the system. You can simply call plot(<your-tree>)
-# and R knows what to do with <your-tree> and how to plot it. The underlying
-# function is plot.phylo(), and documentation for its many options can by found
-# by typing:
+# print it have been defined with the Rphylip package, and the package ape that
+# Rphylip has loaded. You can simply call plot(<your-tree>) and R knows what to
+# do with <your-tree> and how to plot it. The underlying function is
+# plot.phylo(), and documentation for its many options can by found by typing:

 ?plot.phylo

+# We load the APSES sequence tree that you produced in the
+# BIN-PHYLO-Tree_building unit:
+load(file = "APSEStreeRproml.RData")
+
 plot(apsTree) # default type is "phylogram"
 plot(apsTree, type="unrooted")
 plot(apsTree, type="fan", no.margin = TRUE)

 # rescale to show all of the labels:
-# record the current plot parameters ...
-tmp <- plot(apsTree, type="fan", no.margin = TRUE, plot=FALSE)
-# ... and adjust the plot limits for a new plot
+# record the current plot parameters by assigning them to a variable ...
+(tmp <- plot(apsTree, type="fan", no.margin = TRUE, plot=FALSE))
+# ... and adjust the plot limits for a new plot:
 plot(apsTree,
     type="fan",
     x.lim = tmp$x.lim * 1.8,
@@ -94,7 +122,7 @@ Ntip(apsTree)
 # Finally, write the tree to console in Newick format
 write.tree(apsTree)

-# === Rooting Trees ============================================================
+# = 1.1 Rooting Trees

 # In order to analyse the tree, it is helpful to root it first and reorder its
 # clades. Contrary to documentation, Rproml() returns an unrooted tree.
@@ -110,9 +138,9 @@ plot(apsTree)
 nodelabels(cex=0.5, frame="circle")
 tiplabels(cex=0.5, frame="rect")

-# The outgroup of the tree is tip "8" in my sample tree, it may be a different
+# The outgroup of the tree is tip "11" in my sample tree, it may be a different
 # number in yours. Substitute the correct node number below for "outgroup".
-apsTree <- root(apsTree, outgroup = 8, resolve.root = TRUE)
+apsTree <- root(apsTree, outgroup = 11, resolve.root = TRUE)
 plot(apsTree)
 is.rooted(apsTree)

@@ -140,24 +168,24 @@ plot(apsTree, cex=0.7, root.edge=TRUE)
 nodelabels(text="MRCA", node=12, cex=0.5, adj=0.8, bg="#ff8866")


-# === Rotating Clades ==========================================================
+# = 1.1 Rotating Clades

 # To interpret the tree, it is useful to rotate the clades so that they appear
 # in the order expected from the cladogram of species.

-# We can either rotate around individual internal nodes:
+# We can either rotate around individual internal nodes ...
 layout(matrix(1:2, 1, 2))
 plot(apsTree, no.margin=TRUE, root.edge=TRUE)
 nodelabels(node=17, cex=0.7, bg="#ff8866")
 plot(rotate(apsTree, node=17), no.margin=TRUE, root.edge=TRUE)
 nodelabels(node=17, cex=0.7, bg="#88ff66")
+# Note that the species at the bottom of the clade descending from node
+# 17 is now plotted at the top.
 layout(matrix(1), widths=1.0, heights=1.0)

 # ... or we can plot the tree so it corresponds as well as possible to a
-# predefined tip ordering. Here we use the ordering that NCBI Global Tree
-# returns for the reference species - we have used it above to make the vector
-# apsMbp1Names. You inserted your MYSPE name into that vector - but you should
-# move it to its correct position in the cladogram.
+# predefined tip ordering. Here we use the ordering that phyloT has returned
+# for the species tree.

 # (Nb. we need to reverse the ordering for the plot. This is why we use the
 # expression [nOrg:1] below instead of using the vector directly.)
@@ -165,9 +193,9 @@ layout(matrix(1), widths=1.0, heights=1.0)
 nOrg <- length(apsTree$tip.label)

 layout(matrix(1:2, 1, 2))
-plot(commonTree,
+plot(fungiTree,
     no.margin=TRUE, root.edge=TRUE)
-nodelabels(text=commonTree$node.label, cex=0.5, adj=0.2, bg="#D4F2DA")
+nodelabels(text=fungiTree$node.label, cex=0.5, adj=0.2, bg="#D4F2DA")

 plot(rotateConstr(apsTree, apsTree$tip.label[nOrg:1]),
     no.margin=TRUE, root.edge=TRUE)
@@ -175,16 +203,9 @@ add.scale.bar(length=0.5)
 layout(matrix(1), widths=1.0, heights=1.0)

 # Study the two trees and consider their similarities and differences. What do
-# you expect? What do you find?
-#
-
-# Print the two trees on one sheet of paper, write your name and student number,
-# and bring it to class as your deliverable for this assignment. Also write two
-# or three sentences about if/how the gene tree matches the species tree or not.
-
-
-# = 1 Tasks
-
+# you expect? What do you find? Note that this is not a "mixed" gene tree yet,
+# since it contains only a single gene for the species we considered. All of the
+# branch points in this tree are speciation events.



--- a/BIN-PHYLO-Tree_building.R
+++ b/BIN-PHYLO-Tree_building.R
@@ -1,33 +1,33 @@
-# ___ID___ .R
+# BIN-PHYLO-Tree_building.R
 #
 # Purpose:  A Bioinformatics Course:
-#              R code accompanying the ___ID___ unit.
+#              R code accompanying the BIN-PHYLO-Tree_building unit.
 #
-# Version:  0.1
+# Version:  1.0
 #
-# Date:     2017  08  28
+# Date:     2017  10.  31
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
+#           1.0    First 2017 version
 #           0.1    First code copied from 2016 material.
-
+#
 #
 # TODO:
 #
 #
 # == DO NOT SIMPLY  source()  THIS FILE! =======================================
-
+#
 # If there are portions you don't understand, use R's help system, Google for an
 # answer, or ask your instructor. Don't continue if you don't understand what's
 # going on. That's not how it works ...
-
+#
 # ==============================================================================

-# = 1 ___Section___

-# ==============================================================================
-#        PART FOUR: Calculating trees
-# ==============================================================================
+
+# = 1  Calculating Trees
+

 # Follow the instructions found at phylip's home on the Web to install. If you
 # are on a Windows computer, take note of the installation directory.
@@ -82,16 +82,15 @@ if (!require(Rphylip, quietly=TRUE)) {
 # Confirm that the settings are right.
 PROMLPATH                # returns the path
 list.dirs(PROMLPATH)     # returns the directories in that path
-list.files(PROMLPATH)    # lists the files
+list.files(PROMLPATH)    # lists the files [1] "proml"   "proml.command"

 # If "proml" is NOT among the files that the last command returns, you
-# can't continue.
+# can't continue. Ask on the mailing list for advice.

-# Now read the mfa file you have saved, as a "proseq" object with the
-# read.protein() function of the RPhylip package:
+# Now read the mfa file you have saved in the BIB-PHYLO-Data_preparation unit,
+# as a "proseq" object with the read.protein() function of the RPhylip package:

-apsIn <- read.protein("APSES.mfa")
-apsIn <- read.protein("~/Desktop/APSES_HISCA.mfa")
+apsIn <- read.protein("APSESphyloSet.mfa")

 # ... and you are ready to build a tree.

@@ -103,15 +102,14 @@ apsIn <- read.protein("~/Desktop/APSES_HISCA.mfa")

 apsTree <- Rproml(apsIn, path=PROMLPATH)

-
-
 # A quick first look:

 plot(apsTree)

+# save your tree:
+save(apsTree, file = "APSEStreeRproml.RData")

-
-# = 1 Tasks
+# If this did not work, ask for advice.