add PHYLO units
This commit is contained in:
parent
7cc2853d00
commit
8fe794cf33
@ -3,108 +3,138 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-PHYLO-Data_preparation unit.
|
# R code accompanying the BIN-PHYLO-Data_preparation unit.
|
||||||
#
|
#
|
||||||
# Version: 0.1
|
# Version: 1.0
|
||||||
#
|
#
|
||||||
# Date: 2017 08 28
|
# Date: 2017 10. 31
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.0 First 2017 version
|
||||||
# 0.1 First code copied from 2016 material.
|
# 0.1 First code copied from 2016 material.
|
||||||
|
#
|
||||||
#
|
#
|
||||||
# TODO:
|
# TODO:
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||||
|
#
|
||||||
# If there are portions you don't understand, use R's help system, Google for an
|
# If there are portions you don't understand, use R's help system, Google for an
|
||||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||||
# going on. That's not how it works ...
|
# going on. That's not how it works ...
|
||||||
|
|
||||||
# ==============================================================================
|
|
||||||
|
|
||||||
# = 1 ___Section___
|
|
||||||
|
|
||||||
# ==============================================================================
|
|
||||||
# PART ONE: Choosing sequences
|
|
||||||
# ==============================================================================
|
|
||||||
|
|
||||||
# Start by loading libraries. You already have the packages installed.
|
|
||||||
library(Biostrings)
|
|
||||||
library(msa)
|
|
||||||
library(stringr)
|
|
||||||
|
|
||||||
# What is the latest version of myDB that you have saved?
|
|
||||||
list.files(pattern = "myDB.*")
|
|
||||||
|
|
||||||
# ... load it (probably myDB.05.RData - if not, change the code below).
|
|
||||||
load("myDB.05.RData")
|
|
||||||
|
|
||||||
# The database contains the ten Mbp1 orthologues from the reference species
|
|
||||||
# and the Mbp1 RBM for MYSPE.
|
|
||||||
#
|
#
|
||||||
# We will construct a phylogenetic tree from the proteins' APSES domains.
|
# ==============================================================================
|
||||||
# You have annotated their ranges as a feature.
|
|
||||||
|
|
||||||
# Collect APSES domain sequences from your database. The function
|
# = 1 Preparations ========================================================
|
||||||
# dbGetFeatureSequence() retrieves the sequence that is annotated for a feature
|
|
||||||
# from its start and end coordinates. Try:
|
|
||||||
|
|
||||||
dbGetFeatureSequence(myDB, "MBP1_SACCE", "APSES fold")
|
|
||||||
|
|
||||||
# Lets put all APSES sequences into a vector:
|
# You need to reload your protein database, including changes that might have
|
||||||
APSESnames <- myDB$protein$name[grep("^MBP1_", myDB$protein$name)]
|
# been made to the reference files. If you have worked with the prerequiste
|
||||||
APSES <- character(length(APSESnames))
|
# units, you should have a script named "makeProteinDB.R" that will create the
|
||||||
|
# myDB object with a protein and feature database. Ask for advice if not.
|
||||||
|
source("makeProteinDB.R")
|
||||||
|
|
||||||
for (i in 1:length(APSESnames)) {
|
# Load packages we need
|
||||||
APSES[i] <- dbGetFeatureSequence(myDB, APSESnames[i], "APSES fold")
|
|
||||||
|
if (! require(Biostrings, quietly=TRUE)) {
|
||||||
|
if (! exists("biocLite")) {
|
||||||
|
source("https://bioconductor.org/biocLite.R")
|
||||||
|
}
|
||||||
|
biocLite("Biostrings")
|
||||||
|
library(Biostrings)
|
||||||
|
}
|
||||||
|
# Package information:
|
||||||
|
# library(help = Biostrings) # basic information
|
||||||
|
# browseVignettes("Biostrings") # available vignettes
|
||||||
|
# data(package = "Biostrings") # available datasets
|
||||||
|
|
||||||
|
|
||||||
|
if (! require(msa, quietly=TRUE)) {
|
||||||
|
if (! exists("biocLite")) {
|
||||||
|
source("https://bioconductor.org/biocLite.R")
|
||||||
|
}
|
||||||
|
biocLite("msa")
|
||||||
|
library(msa)
|
||||||
|
}
|
||||||
|
# Package information:
|
||||||
|
# library(help=msa) # basic information
|
||||||
|
# browseVignettes("msa") # available vignettes
|
||||||
|
# data(package = "msa") # available datasets
|
||||||
|
|
||||||
|
|
||||||
|
if (! require(stringr, quietly=TRUE)) {
|
||||||
|
install.packages("stringr")
|
||||||
|
library(stringr)
|
||||||
|
}
|
||||||
|
# Package information:
|
||||||
|
# library(help=stringr) # basic information
|
||||||
|
# browseVignettes("stringr") # available vignettes
|
||||||
|
# data(package = "stringr") # available datasets
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# = 1 Fetching sequences
|
||||||
|
|
||||||
|
|
||||||
|
# myDB contains the ten Mbp1 orthologues from the reference species and the Mbp1
|
||||||
|
# RBM for MYSPE. We will construct a phylogenetic tree from the proteins' APSES
|
||||||
|
# domains. You have annotated their ranges as a feature. The following code
|
||||||
|
# retrieves the sequences from myDB. You have seen similar code in other units.
|
||||||
|
|
||||||
|
sel <- grep("^MBP1_", myDB$protein$name)
|
||||||
|
(proNames <- myDB$protein$name[sel])
|
||||||
|
(proIDs <- myDB$protein$ID[sel])
|
||||||
|
|
||||||
|
(sel <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
|
||||||
|
(fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID %in% proIDs & # %in% !
|
||||||
|
myDB$annotation$featureID == sel]) # == !
|
||||||
|
# Why?
|
||||||
|
APSI <- character(length(fanIDs))
|
||||||
|
|
||||||
|
for (i in seq_along(fanIDs)) {
|
||||||
|
sel <- myDB$annotation$ID == fanIDs[i] # get the feature row index
|
||||||
|
proID <- myDB$annotation$proteinID[sel] # get its protein ID
|
||||||
|
start <- myDB$annotation$start[sel] # get start ...
|
||||||
|
end <- myDB$annotation$end[sel] # ... and end
|
||||||
|
|
||||||
|
sel <- myDB$protein$ID == proID # get the protein row index ...
|
||||||
|
# ... and the sequence
|
||||||
|
APSI[i] <- substring(myDB$protein$sequence[sel], start, end)
|
||||||
|
names(APSI)[i] <- (myDB$protein$name[sel])
|
||||||
}
|
}
|
||||||
|
|
||||||
# Let's name the rows of our vector with the BiCode part of the protein name.
|
head(APSI)
|
||||||
# This is important so we can keep track of which sequence is which. We use the
|
|
||||||
# gsub() funcion to substitute "" for "MBP1_", thereby deleting this prefix.
|
|
||||||
names(APSES) <- gsub("^MBP1_", "", APSESnames)
|
|
||||||
|
|
||||||
# inspect the result: what do you expect? Is this what you expect?
|
|
||||||
head(APSES)
|
|
||||||
|
|
||||||
# Let's add the E.coli Kila-N domain sequence as an outgroup, for rooting our
|
# Let's add the E.coli Kila-N domain sequence as an outgroup, for rooting our
|
||||||
# phylogegetic tree (see the Assignment Course Wiki page for details on the
|
# phylogenetic tree (see the unit's Wiki page for details on the sequence).
|
||||||
# sequence).
|
|
||||||
|
|
||||||
APSES[length(APSES) + 1] <-
|
APSI <- c(APSI,
|
||||||
"IDGEIIHLRAKDGYINATSMCRTAGKLLSDYTRLKTTQEFFDELSRDMGIPISELIQSFKGGRPENQGTWVHPDIAINLAQ"
|
"IDGEIIHLRAKDGYINATSMCRTAGKLLSDYTRLKTTQEFFDELSRDMGIPISELIQSFKGGRPENQGTWVHPDIAINLAQ")
|
||||||
names(APSES)[length(APSES)] <- "ESCCO"
|
names(APSI)[length(APSI)] <- "KILA_ESCCO"
|
||||||
|
tail(APSI)
|
||||||
|
|
||||||
|
|
||||||
# ==============================================================================
|
# = 1 Multiple Sequence Alignment
|
||||||
# PART TWO: Multiple sequence alignment
|
|
||||||
# ==============================================================================
|
|
||||||
|
|
||||||
# This vector of sequences with named elements fulfills the requirements to be
|
# This vector of sequences with named elements fulfills the requirements to be
|
||||||
# imported as a Biostrings object - an AAStringSet - which we need as input for
|
# imported as a Biostrings object - an AAStringSet - which we need as input for
|
||||||
# the MSA algorithms in Biostrings.
|
# the MSA algorithms in Biostrings.
|
||||||
#
|
#
|
||||||
|
|
||||||
APSESSeqSet <- AAStringSet(APSES)
|
APSESSet <- AAStringSet(APSI)
|
||||||
|
APSESMsa <- msaMuscle(APSESSet, order = "aligned")
|
||||||
APSESMsaSet <- msaMuscle(APSESSeqSet, order = "aligned")
|
|
||||||
|
|
||||||
# inspect the alignment.
|
# inspect the alignment.
|
||||||
writeSeqSet(APSESMsaSet, format = "ali")
|
writeALN(APSESMsa)
|
||||||
|
|
||||||
|
|
||||||
# What do you think? Is this a good alignment for phylogenetic inference?
|
# What do you think? Is this a good alignment for phylogenetic inference?
|
||||||
|
|
||||||
# ==============================================================================
|
|
||||||
# PART THREE: reviewing and editing alignments
|
|
||||||
# ==============================================================================
|
|
||||||
|
|
||||||
# Head back to the assignment 7 course wiki page and read up on the background
|
# = 1 Reviewing and Editing Alignments
|
||||||
|
|
||||||
|
|
||||||
|
# Head back to the Wiki page for this unit and read up on the background
|
||||||
# first.
|
# first.
|
||||||
#
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Let's mask out all columns that have observations for
|
# Let's mask out all columns that have observations for
|
||||||
# less than 1/3 of the sequences in the dataset. This
|
# less than 1/3 of the sequences in the dataset. This
|
||||||
@ -116,82 +146,55 @@ writeSeqSet(APSESMsaSet, format = "ali")
|
|||||||
# go through the matrix, column by column and decide
|
# go through the matrix, column by column and decide
|
||||||
# whether we want to include that column.
|
# whether we want to include that column.
|
||||||
|
|
||||||
# Step 1. Go through this by hand...
|
# = 1.1 Masking workflow
|
||||||
|
|
||||||
# get the length of the alignment
|
# get the length of the alignment
|
||||||
lenAli <- APSESMsaSet@unmasked@ranges@width[1]
|
(lenAli <- APSESMsa@unmasked@ranges@width[1])
|
||||||
|
|
||||||
# initialize a matrix that can hold all characters
|
# initialize a matrix that can hold all characters
|
||||||
# individually
|
# individually
|
||||||
msaMatrix <- matrix(character(nrow(APSESMsaSet) * lenAli),
|
msaMatrix <- matrix(character(nrow(APSESMsa) * lenAli),
|
||||||
ncol = lenAli)
|
ncol = lenAli)
|
||||||
|
|
||||||
# assign the correct rownames
|
# assign the correct rownames
|
||||||
rownames(msaMatrix) <- APSESMsaSet@unmasked@ranges@NAMES
|
rownames(msaMatrix) <- APSESMsa@unmasked@ranges@NAMES
|
||||||
for (i in 1:nrow(APSESMsaSet)) {
|
for (i in 1:nrow(APSESMsa)) {
|
||||||
seq <- as.character(APSESMsaSet@unmasked[i])
|
msaMatrix[i, ] <- unlist(strsplit(as.character(APSESMsa@unmasked[i]), ""))
|
||||||
msaMatrix[i, ] <- unlist(strsplit(seq, ""))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# inspect the result
|
# inspect the result
|
||||||
msaMatrix[1:5, ]
|
msaMatrix[1:7, 1:14]
|
||||||
|
|
||||||
# Now let's make a logical vector with an element
|
# Now let's make a logical vector with an element for each column that selects
|
||||||
# for each column that selects which columns should
|
# which columns should be masked out.
|
||||||
# be masked out.
|
|
||||||
|
|
||||||
# To count the number of elements in a vector, R has
|
# The number of hyphens in a column is easy to count. Consider:
|
||||||
# the table() function. For example ...
|
|
||||||
table(msaMatrix[ , 1])
|
|
||||||
table(msaMatrix[ , 10])
|
|
||||||
table(msaMatrix[ , 20])
|
|
||||||
table(msaMatrix[ , 30])
|
|
||||||
|
|
||||||
|
msaMatrix[ , 20]
|
||||||
|
msaMatrix[ , 20] == "-"
|
||||||
|
sum(msaMatrix[ , 20] == "-")
|
||||||
|
|
||||||
# Since the return value of table() is a named vector, where
|
# Thus filling our logical vector is simple:
|
||||||
# the name is the element that was counted in each slot,
|
|
||||||
# we can simply get the counts for hyphens from the
|
|
||||||
# return value of table(). We don't even need to assign
|
|
||||||
# the result to an intermediate variable, but we
|
|
||||||
# can attach the selection via square brackets,
|
|
||||||
# i.e.: ["-"], directly to the function call:
|
|
||||||
table(msaMatrix[ , 1])["-"]
|
|
||||||
|
|
||||||
# ... to get the number of hyphens. And we can compare
|
# initialize a mask
|
||||||
# whether it is eg. > 4.
|
colMask <- logical(ncol(msaMatrix))
|
||||||
table(msaMatrix[ , 1])["-"] > 4
|
|
||||||
|
|
||||||
# Thus filling our logical vector is really simple:
|
|
||||||
|
|
||||||
# initialize the mask
|
|
||||||
colMask <- logical(lenAli)
|
|
||||||
|
|
||||||
# define the threshold for rejecting a column
|
# define the threshold for rejecting a column
|
||||||
limit <- round(nrow(APSESMsaSet) * (2/3))
|
limit <- round(nrow(APSESMsa) * (2/3))
|
||||||
|
|
||||||
# iterate over all columns, and write TRUE if there are less-or-equal to "limit"
|
# iterate over all columns, and write TRUE if there are less-or-equal to "limit"
|
||||||
# hyphens, FALSE if there are more.
|
# hyphens, FALSE if there are more.
|
||||||
for (i in 1:lenAli) {
|
for (i in 1:ncol(msaMatrix)) {
|
||||||
count <- table(msaMatrix[ , i])["-"]
|
count <- sum(msaMatrix[ , i] == "-")
|
||||||
if (is.na(count)) { # No hyphen
|
colMask[i] <- count <= limit # FALSE if less-or-equal to limit, TRUE if not
|
||||||
count <- 0
|
|
||||||
}
|
|
||||||
colMask[i] <- count <= limit
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# inspect the mask
|
# inspect the mask
|
||||||
colMask
|
colMask
|
||||||
|
|
||||||
# How many positions were masked? R has a simple trick
|
# How many positions were masked?
|
||||||
# to count the number of TRUE and FALSE in a logical
|
|
||||||
# vector. If a logical TRUE or FALSE is converted into
|
|
||||||
# a number, it becomes 1 or 0 respectively. If we use
|
|
||||||
# the sum() function on the vector, the conversion is
|
|
||||||
# done implicitly. Thus ...
|
|
||||||
sum(colMask)
|
sum(colMask)
|
||||||
|
|
||||||
# ... gives the number of TRUE elements.
|
|
||||||
|
|
||||||
cat(sprintf("We are masking %4.2f %% of alignment columns.\n",
|
cat(sprintf("We are masking %4.2f %% of alignment columns.\n",
|
||||||
100 * (1 - (sum(colMask) / length(colMask)))))
|
100 * (1 - (sum(colMask) / length(colMask)))))
|
||||||
|
|
||||||
@ -203,46 +206,22 @@ maskedMatrix <- msaMatrix[ , colMask]
|
|||||||
# check:
|
# check:
|
||||||
ncol(maskedMatrix)
|
ncol(maskedMatrix)
|
||||||
|
|
||||||
|
# ... then collapse each row of single characters back into a string ...
|
||||||
# ... then collapse each row back into a sequence ...
|
APSESphyloSet <- character()
|
||||||
|
|
||||||
apsMaskedSeq <- character()
|
|
||||||
for (i in 1:nrow(maskedMatrix)) {
|
for (i in 1:nrow(maskedMatrix)) {
|
||||||
apsMaskedSeq[i] <- paste(maskedMatrix[i, ], collapse="")
|
APSESphyloSet[i] <- paste(maskedMatrix[i, ], collapse="")
|
||||||
}
|
}
|
||||||
names(apsMaskedSeq) <- rownames(maskedMatrix)
|
names(APSESphyloSet) <- rownames(maskedMatrix)
|
||||||
|
|
||||||
# ... and read it back into an AAStringSet object
|
|
||||||
|
|
||||||
apsMaskedSet <- AAStringSet(apsMaskedSeq)
|
|
||||||
|
|
||||||
# inspect ...
|
# inspect ...
|
||||||
writeSeqSet(apsMaskedSet, format = "ali")
|
writeALN(APSESphyloSet)
|
||||||
|
|
||||||
|
# As you see, we have removed a three residue insertion from MBP1_NEUCR, and
|
||||||
|
# several indels from the KILA_ESCCO outgroup sequence.
|
||||||
|
|
||||||
|
|
||||||
# Step 2. Turn this code into a function...
|
|
||||||
|
|
||||||
# Even though the procedure is simple, doing this more than once is tedious and
|
|
||||||
# prone to errors. I have assembled the steps we just went through into a
|
|
||||||
# function maskSet() and put it into the utilities.R file, from where it has
|
|
||||||
# been loaded when you started this sesssion.
|
|
||||||
|
|
||||||
maskSet
|
|
||||||
|
|
||||||
# Check that the function gives identical results
|
|
||||||
# to what we did before by hand:
|
|
||||||
identical(apsMaskedSet, maskSet(APSESMsaSet))
|
|
||||||
|
|
||||||
# The result must be TRUE. If it's not TRUE you have
|
|
||||||
# an error somewhere.
|
|
||||||
|
|
||||||
# We save the aligned, masked domains to a file in multi-FASTA format.
|
# We save the aligned, masked domains to a file in multi-FASTA format.
|
||||||
writeSeqSet(maskSet(APSESMsaSet), file = "APSES.mfa", format = "mfa")
|
writeMFA(APSESphyloSet, myCon = "APSESphyloSet.mfa")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# = 1 Tasks
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -3,46 +3,70 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-PHYLO-Tree_analysis unit.
|
# R code accompanying the BIN-PHYLO-Tree_analysis unit.
|
||||||
#
|
#
|
||||||
# Version: 0.1
|
# Version: 1.0
|
||||||
#
|
#
|
||||||
# Date: 2017 08 28
|
# Date: 2017 10. 31
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.0 First 2017 version
|
||||||
# 0.1 First code copied from 2016 material.
|
# 0.1 First code copied from 2016 material.
|
||||||
|
#
|
||||||
#
|
#
|
||||||
# TODO:
|
# TODO:
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||||
|
#
|
||||||
# If there are portions you don't understand, use R's help system, Google for an
|
# If there are portions you don't understand, use R's help system, Google for an
|
||||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||||
# going on. That's not how it works ...
|
# going on. That's not how it works ...
|
||||||
|
#
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
|
|
||||||
# = 1 ___Section___
|
# = 1 ___Section___
|
||||||
|
|
||||||
# ==============================================================================
|
|
||||||
# PART FIVE: Tree analysis
|
|
||||||
# ==============================================================================
|
|
||||||
|
|
||||||
# A Entrez restriction command
|
if (!require(Rphylip, quietly=TRUE)) {
|
||||||
cat(paste(paste(c(myDB$taxonomy$ID, "83333"), "[taxid]", sep=""), collapse=" OR "))
|
install.packages("Rphylip")
|
||||||
|
library(Rphylip)
|
||||||
|
}
|
||||||
|
# Package information:
|
||||||
|
# library(help = Rphylip) # basic information
|
||||||
|
# browseVignettes("Rphylip") # available vignettes
|
||||||
|
# data(package = "Rphylip") # available datasets
|
||||||
|
|
||||||
# The Common Tree from NCBI
|
|
||||||
# Download the EDITED phyliptree.phy
|
|
||||||
commonTree <- read.tree("phyliptree.phy")
|
# Read the species tree that you have created at the phyloT Website:
|
||||||
|
fungiTree <- read.tree("fungiTree.txt")
|
||||||
|
|
||||||
|
plot(fungiTree)
|
||||||
|
|
||||||
|
# The tree produced by phyloT contains full length species names, but it would
|
||||||
|
# be more convenient if it had bicodes instead.
|
||||||
|
str(fungiTree)
|
||||||
|
|
||||||
|
# The species names are in a vector $tip.label of this list. We can use bicode()
|
||||||
|
# to shorten them - but note that they have underscores as word separators. Thus
|
||||||
|
# we will use gsub("-", " ", ...) to replace the underscores with spaces.
|
||||||
|
|
||||||
|
for (i in seq_along(fungiTree$tip.label)) {
|
||||||
|
fungiTree$tip.label[i] <- biCode(gsub("_", " ", fungiTree$tip.label[i]))
|
||||||
|
}
|
||||||
|
|
||||||
# Plot the tree
|
# Plot the tree
|
||||||
plot(commonTree, cex=1.0, root.edge=TRUE, no.margin=TRUE)
|
plot(fungiTree, cex=1.0, root.edge=TRUE, no.margin=TRUE)
|
||||||
nodelabels(text=commonTree$node.label, cex=0.6, adj=0.2, bg="#D4F2DA")
|
nodelabels(text=orgTree$node.label, cex=0.6, adj=0.2, bg="#D4F2DA")
|
||||||
|
# Note that you can use the arrow buttons in the menu above the plot to scroll
|
||||||
|
# back to plots you have created earlier - so you can reference back to the
|
||||||
|
# species tree.
|
||||||
|
|
||||||
|
|
||||||
# === Visualizing your tree ====================================================
|
# = 1 Tree Analysis
|
||||||
|
|
||||||
|
|
||||||
|
# 1.1 Visualizing your tree
|
||||||
# The trees that are produced by Rphylip are stored as an object of class
|
# The trees that are produced by Rphylip are stored as an object of class
|
||||||
# "phylo". This is a class for phylogenetic trees that is widely used in the
|
# "phylo". This is a class for phylogenetic trees that is widely used in the
|
||||||
# community, practically all R phylogenetics packages will options to read and
|
# community, practically all R phylogenetics packages will options to read and
|
||||||
@ -51,21 +75,25 @@ nodelabels(text=commonTree$node.label, cex=0.6, adj=0.2, bg="#D4F2DA")
|
|||||||
# trees in Newick format and visualize them elsewhere.
|
# trees in Newick format and visualize them elsewhere.
|
||||||
|
|
||||||
# The "phylo" class object is one of R's "S3" objects and methods to plot and
|
# The "phylo" class object is one of R's "S3" objects and methods to plot and
|
||||||
# print it have been added to the system. You can simply call plot(<your-tree>)
|
# print it have been defined with the Rphylip package, and the package ape that
|
||||||
# and R knows what to do with <your-tree> and how to plot it. The underlying
|
# Rphylip has loaded. You can simply call plot(<your-tree>) and R knows what to
|
||||||
# function is plot.phylo(), and documentation for its many options can by found
|
# do with <your-tree> and how to plot it. The underlying function is
|
||||||
# by typing:
|
# plot.phylo(), and documentation for its many options can by found by typing:
|
||||||
|
|
||||||
?plot.phylo
|
?plot.phylo
|
||||||
|
|
||||||
|
# We load the APSES sequence tree that you produced in the
|
||||||
|
# BIN-PHYLO-Tree_building unit:
|
||||||
|
load(file = "APSEStreeRproml.RData")
|
||||||
|
|
||||||
plot(apsTree) # default type is "phylogram"
|
plot(apsTree) # default type is "phylogram"
|
||||||
plot(apsTree, type="unrooted")
|
plot(apsTree, type="unrooted")
|
||||||
plot(apsTree, type="fan", no.margin = TRUE)
|
plot(apsTree, type="fan", no.margin = TRUE)
|
||||||
|
|
||||||
# rescale to show all of the labels:
|
# rescale to show all of the labels:
|
||||||
# record the current plot parameters ...
|
# record the current plot parameters by assigning them to a variable ...
|
||||||
tmp <- plot(apsTree, type="fan", no.margin = TRUE, plot=FALSE)
|
(tmp <- plot(apsTree, type="fan", no.margin = TRUE, plot=FALSE))
|
||||||
# ... and adjust the plot limits for a new plot
|
# ... and adjust the plot limits for a new plot:
|
||||||
plot(apsTree,
|
plot(apsTree,
|
||||||
type="fan",
|
type="fan",
|
||||||
x.lim = tmp$x.lim * 1.8,
|
x.lim = tmp$x.lim * 1.8,
|
||||||
@ -94,7 +122,7 @@ Ntip(apsTree)
|
|||||||
# Finally, write the tree to console in Newick format
|
# Finally, write the tree to console in Newick format
|
||||||
write.tree(apsTree)
|
write.tree(apsTree)
|
||||||
|
|
||||||
# === Rooting Trees ============================================================
|
# = 1.1 Rooting Trees
|
||||||
|
|
||||||
# In order to analyse the tree, it is helpful to root it first and reorder its
|
# In order to analyse the tree, it is helpful to root it first and reorder its
|
||||||
# clades. Contrary to documentation, Rproml() returns an unrooted tree.
|
# clades. Contrary to documentation, Rproml() returns an unrooted tree.
|
||||||
@ -110,9 +138,9 @@ plot(apsTree)
|
|||||||
nodelabels(cex=0.5, frame="circle")
|
nodelabels(cex=0.5, frame="circle")
|
||||||
tiplabels(cex=0.5, frame="rect")
|
tiplabels(cex=0.5, frame="rect")
|
||||||
|
|
||||||
# The outgroup of the tree is tip "8" in my sample tree, it may be a different
|
# The outgroup of the tree is tip "11" in my sample tree, it may be a different
|
||||||
# number in yours. Substitute the correct node number below for "outgroup".
|
# number in yours. Substitute the correct node number below for "outgroup".
|
||||||
apsTree <- root(apsTree, outgroup = 8, resolve.root = TRUE)
|
apsTree <- root(apsTree, outgroup = 11, resolve.root = TRUE)
|
||||||
plot(apsTree)
|
plot(apsTree)
|
||||||
is.rooted(apsTree)
|
is.rooted(apsTree)
|
||||||
|
|
||||||
@ -140,24 +168,24 @@ plot(apsTree, cex=0.7, root.edge=TRUE)
|
|||||||
nodelabels(text="MRCA", node=12, cex=0.5, adj=0.8, bg="#ff8866")
|
nodelabels(text="MRCA", node=12, cex=0.5, adj=0.8, bg="#ff8866")
|
||||||
|
|
||||||
|
|
||||||
# === Rotating Clades ==========================================================
|
# = 1.1 Rotating Clades
|
||||||
|
|
||||||
# To interpret the tree, it is useful to rotate the clades so that they appear
|
# To interpret the tree, it is useful to rotate the clades so that they appear
|
||||||
# in the order expected from the cladogram of species.
|
# in the order expected from the cladogram of species.
|
||||||
|
|
||||||
# We can either rotate around individual internal nodes:
|
# We can either rotate around individual internal nodes ...
|
||||||
layout(matrix(1:2, 1, 2))
|
layout(matrix(1:2, 1, 2))
|
||||||
plot(apsTree, no.margin=TRUE, root.edge=TRUE)
|
plot(apsTree, no.margin=TRUE, root.edge=TRUE)
|
||||||
nodelabels(node=17, cex=0.7, bg="#ff8866")
|
nodelabels(node=17, cex=0.7, bg="#ff8866")
|
||||||
plot(rotate(apsTree, node=17), no.margin=TRUE, root.edge=TRUE)
|
plot(rotate(apsTree, node=17), no.margin=TRUE, root.edge=TRUE)
|
||||||
nodelabels(node=17, cex=0.7, bg="#88ff66")
|
nodelabels(node=17, cex=0.7, bg="#88ff66")
|
||||||
|
# Note that the species at the bottom of the clade descending from node
|
||||||
|
# 17 is now plotted at the top.
|
||||||
layout(matrix(1), widths=1.0, heights=1.0)
|
layout(matrix(1), widths=1.0, heights=1.0)
|
||||||
|
|
||||||
# ... or we can plot the tree so it corresponds as well as possible to a
|
# ... or we can plot the tree so it corresponds as well as possible to a
|
||||||
# predefined tip ordering. Here we use the ordering that NCBI Global Tree
|
# predefined tip ordering. Here we use the ordering that phyloT has returned
|
||||||
# returns for the reference species - we have used it above to make the vector
|
# for the species tree.
|
||||||
# apsMbp1Names. You inserted your MYSPE name into that vector - but you should
|
|
||||||
# move it to its correct position in the cladogram.
|
|
||||||
|
|
||||||
# (Nb. we need to reverse the ordering for the plot. This is why we use the
|
# (Nb. we need to reverse the ordering for the plot. This is why we use the
|
||||||
# expression [nOrg:1] below instead of using the vector directly.)
|
# expression [nOrg:1] below instead of using the vector directly.)
|
||||||
@ -165,9 +193,9 @@ layout(matrix(1), widths=1.0, heights=1.0)
|
|||||||
nOrg <- length(apsTree$tip.label)
|
nOrg <- length(apsTree$tip.label)
|
||||||
|
|
||||||
layout(matrix(1:2, 1, 2))
|
layout(matrix(1:2, 1, 2))
|
||||||
plot(commonTree,
|
plot(fungiTree,
|
||||||
no.margin=TRUE, root.edge=TRUE)
|
no.margin=TRUE, root.edge=TRUE)
|
||||||
nodelabels(text=commonTree$node.label, cex=0.5, adj=0.2, bg="#D4F2DA")
|
nodelabels(text=fungiTree$node.label, cex=0.5, adj=0.2, bg="#D4F2DA")
|
||||||
|
|
||||||
plot(rotateConstr(apsTree, apsTree$tip.label[nOrg:1]),
|
plot(rotateConstr(apsTree, apsTree$tip.label[nOrg:1]),
|
||||||
no.margin=TRUE, root.edge=TRUE)
|
no.margin=TRUE, root.edge=TRUE)
|
||||||
@ -175,16 +203,9 @@ add.scale.bar(length=0.5)
|
|||||||
layout(matrix(1), widths=1.0, heights=1.0)
|
layout(matrix(1), widths=1.0, heights=1.0)
|
||||||
|
|
||||||
# Study the two trees and consider their similarities and differences. What do
|
# Study the two trees and consider their similarities and differences. What do
|
||||||
# you expect? What do you find?
|
# you expect? What do you find? Note that this is not a "mixed" gene tree yet,
|
||||||
#
|
# since it contains only a single gene for the species we considered. All of the
|
||||||
|
# branch points in this tree are speciation events.
|
||||||
# Print the two trees on one sheet of paper, write your name and student number,
|
|
||||||
# and bring it to class as your deliverable for this assignment. Also write two
|
|
||||||
# or three sentences about if/how the gene tree matches the species tree or not.
|
|
||||||
|
|
||||||
|
|
||||||
# = 1 Tasks
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,33 +1,33 @@
|
|||||||
# ___ID___ .R
|
# BIN-PHYLO-Tree_building.R
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the ___ID___ unit.
|
# R code accompanying the BIN-PHYLO-Tree_building unit.
|
||||||
#
|
#
|
||||||
# Version: 0.1
|
# Version: 1.0
|
||||||
#
|
#
|
||||||
# Date: 2017 08 28
|
# Date: 2017 10. 31
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.0 First 2017 version
|
||||||
# 0.1 First code copied from 2016 material.
|
# 0.1 First code copied from 2016 material.
|
||||||
|
#
|
||||||
#
|
#
|
||||||
# TODO:
|
# TODO:
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||||
|
#
|
||||||
# If there are portions you don't understand, use R's help system, Google for an
|
# If there are portions you don't understand, use R's help system, Google for an
|
||||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||||
# going on. That's not how it works ...
|
# going on. That's not how it works ...
|
||||||
|
#
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
|
|
||||||
# = 1 ___Section___
|
|
||||||
|
|
||||||
# ==============================================================================
|
|
||||||
# PART FOUR: Calculating trees
|
# = 1 Calculating Trees
|
||||||
# ==============================================================================
|
|
||||||
|
|
||||||
# Follow the instructions found at phylip's home on the Web to install. If you
|
# Follow the instructions found at phylip's home on the Web to install. If you
|
||||||
# are on a Windows computer, take note of the installation directory.
|
# are on a Windows computer, take note of the installation directory.
|
||||||
@ -82,16 +82,15 @@ if (!require(Rphylip, quietly=TRUE)) {
|
|||||||
# Confirm that the settings are right.
|
# Confirm that the settings are right.
|
||||||
PROMLPATH # returns the path
|
PROMLPATH # returns the path
|
||||||
list.dirs(PROMLPATH) # returns the directories in that path
|
list.dirs(PROMLPATH) # returns the directories in that path
|
||||||
list.files(PROMLPATH) # lists the files
|
list.files(PROMLPATH) # lists the files [1] "proml" "proml.command"
|
||||||
|
|
||||||
# If "proml" is NOT among the files that the last command returns, you
|
# If "proml" is NOT among the files that the last command returns, you
|
||||||
# can't continue.
|
# can't continue. Ask on the mailing list for advice.
|
||||||
|
|
||||||
# Now read the mfa file you have saved, as a "proseq" object with the
|
# Now read the mfa file you have saved in the BIB-PHYLO-Data_preparation unit,
|
||||||
# read.protein() function of the RPhylip package:
|
# as a "proseq" object with the read.protein() function of the RPhylip package:
|
||||||
|
|
||||||
apsIn <- read.protein("APSES.mfa")
|
apsIn <- read.protein("APSESphyloSet.mfa")
|
||||||
apsIn <- read.protein("~/Desktop/APSES_HISCA.mfa")
|
|
||||||
|
|
||||||
# ... and you are ready to build a tree.
|
# ... and you are ready to build a tree.
|
||||||
|
|
||||||
@ -103,15 +102,14 @@ apsIn <- read.protein("~/Desktop/APSES_HISCA.mfa")
|
|||||||
|
|
||||||
apsTree <- Rproml(apsIn, path=PROMLPATH)
|
apsTree <- Rproml(apsIn, path=PROMLPATH)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# A quick first look:
|
# A quick first look:
|
||||||
|
|
||||||
plot(apsTree)
|
plot(apsTree)
|
||||||
|
|
||||||
|
# save your tree:
|
||||||
|
save(apsTree, file = "APSEStreeRproml.RData")
|
||||||
|
|
||||||
|
# If this did not work, ask for advice.
|
||||||
# = 1 Tasks
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user