bch441-work-abc-units/BIN-PHYLO-Data_preparation.R

# BIN-PHYLO-Data_preparation.R
#
# Purpose:  A Bioinformatics Course:
#              R code accompanying the BIN-PHYLO-Data_preparation unit.
#
# Version:  0.1
#
# Date:     2017  08  28
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
#           0.1    First code copied from 2016 material.

#
# TODO:
#
#
# == DO NOT SIMPLY  source()  THIS FILE! =======================================

# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...

# ==============================================================================

# = 1 ___Section___

# ==============================================================================
#        PART ONE: Choosing sequences
# ==============================================================================

# Start by loading libraries. You already have the packages installed.
library(Biostrings)
library(msa)
library(stringr)

# What is the latest version of myDB that you have saved?
# list.files() looks in the current working directory. Note that "pattern" is
# interpreted as a regular expression, not as a shell wildcard.
list.files(pattern = "myDB.*")

# ... load it (probably myDB.05.RData - if not, change the code below).
# load() restores the objects that were saved in the file into the workspace.
# The code below expects the database to be available as  myDB  afterwards.
load("myDB.05.RData")

# The database contains the ten Mbp1 orthologues from the reference species
# and the Mbp1 RBM for MYSPE.
#
# We will construct a phylogenetic tree from the proteins' APSES domains.
# You have annotated their ranges as a feature.

# Collect APSES domain sequences from your database. The function
# dbGetFeatureSequence() retrieves the sequence that is annotated for a feature
# from its start and end coordinates. Try:

dbGetFeatureSequence(myDB, "MBP1_SACCE", "APSES fold")

# Let's put all APSES sequences into a vector. First we select the names of
# all proteins in the database whose name begins with "MBP1_" ...
APSESnames <- myDB$protein$name[grep("^MBP1_", myDB$protein$name)]

# ... then we prepare a character vector of the right length to hold one
# sequence per protein ...
APSES <- character(length(APSESnames))

# ... and fetch the sequence annotated as "APSES fold" for each of them.
# We iterate with seq_along() rather than 1:length(): if no protein name
# matched, 1:length() would produce the indices 1 and 0 and the loop would
# run with nonsensical input, whereas seq_along() simply skips the loop.
for (i in seq_along(APSESnames)) {
  APSES[i] <- dbGetFeatureSequence(myDB, APSESnames[i], "APSES fold")
}

# Let's name the elements of our vector with the BiCode part of the protein
# name. This is important so we can keep track of which sequence is which. We
# use the gsub() function to substitute "" for "MBP1_", thereby deleting this
# prefix.
names(APSES) <- gsub("^MBP1_", "", APSESnames)

# inspect the result: what do you expect? Is this what you expect?
head(APSES)

# Let's add the E.coli Kila-N domain sequence as an outgroup, for rooting our
# phylogenetic tree (see the Assignment Course Wiki page for details on the
# sequence).

# Append the sequence as a new last element ...
APSES[length(APSES) + 1] <-
  "IDGEIIHLRAKDGYINATSMCRTAGKLLSDYTRLKTTQEFFDELSRDMGIPISELIQSFKGGRPENQGTWVHPDIAINLAQ"
# ... and give that (now last) element its name.
names(APSES)[length(APSES)] <- "ESCCO"


# ==============================================================================
#        PART TWO: Multiple sequence alignment
# ==============================================================================

# This vector of sequences with named elements fulfills the requirements to be
# imported as a Biostrings object - an AAStringSet - which we need as input for
# the MSA algorithms in the msa package.
#

# Convert the named character vector; the element names become the sequence
# names of the set.
APSESSeqSet <- AAStringSet(APSES)

# Align with MUSCLE. order = "aligned" returns the sequences in the order
# chosen by the alignment algorithm, rather than in input order.
APSESMsaSet <- msaMuscle(APSESSeqSet, order = "aligned")

# inspect the alignment.
# NOTE: writeSeqSet() is not defined in this script - presumably it is one of
# the course's utility functions, like maskSet() further below.
writeSeqSet(APSESMsaSet, format = "ali")


# What do you think? Is this a good alignment for phylogenetic inference?

# ==============================================================================
#        PART THREE: reviewing and editing alignments
# ==============================================================================

# Head back to the assignment 7 course wiki page and read up on the background
# first.
#


# Let's mask out all columns that have observations for
# less than 1/3 of the sequences in the dataset. This
# means they have more than round(nrow(APSESMsaSet) * (2/3))
# hyphens in a column.
#
# We take all sequences, split them into single
# characters, and put them into a matrix. Then we
# go through the matrix, column by column and decide
# whether we want to include that column.

# Step 1. Go through this by hand...

# get the length of the alignment: we read the width of the first aligned
# sequence from the alignment object
lenAli <- APSESMsaSet@unmasked@ranges@width[1]

# initialize a matrix that can hold all characters
# individually: one row per sequence, one column per alignment position
msaMatrix <- matrix(character(nrow(APSESMsaSet) * lenAli),
                    ncol = lenAli)

# assign the correct rownames
rownames(msaMatrix) <- APSESMsaSet@unmasked@ranges@NAMES

# Fill the matrix row by row: convert each aligned sequence to a string and
# split it into its single characters. seq_len() gives an empty sequence for
# zero rows (1:nrow() would give 1, 0). The intermediate variable is called
# alignedSeq so that it does not mask the base R function seq().
for (i in seq_len(nrow(APSESMsaSet))) {
  alignedSeq <- as.character(APSESMsaSet@unmasked[i])
  msaMatrix[i, ] <- unlist(strsplit(alignedSeq, ""))
}

# inspect the result: head() shows the first five rows - or all rows, if the
# alignment contains fewer than five sequences (indexing with 1:5 would fail
# with a "subscript out of bounds" error in that case).
head(msaMatrix, 5)

# Now let's make a logical vector with an element
# for each column that selects which columns should
# be masked out.

# To count the number of elements in a vector, R has
# the table() function: it reports how often each distinct
# value occurs. For example ...
# (The column indices are arbitrary examples; this assumes that the
# alignment has at least 30 columns.)
table(msaMatrix[ , 1])
table(msaMatrix[ , 10])
table(msaMatrix[ , 20])
table(msaMatrix[ , 30])


# Since the return value of table() is a named vector, where
# the name is the element that was counted in each slot,
# we can simply get the counts for hyphens from the
# return value of table(). We don't even need to assign
# the result to an intermediate variable, but we
# can attach the selection via square brackets,
# i.e.: ["-"],  directly to the function call:
table(msaMatrix[ , 1])["-"]

# ... to get the number of hyphens. And we can compare
# whether it is eg. > 4.
# Note: if a column contains no hyphen at all, the table has no slot
# named "-". The selection then yields NA, and so does the comparison.
table(msaMatrix[ , 1])["-"] > 4

# Thus filling our logical vector is really simple:

# initialize the mask: one element per alignment column, all FALSE to begin
# with
colMask <- logical(lenAli)

# define the threshold for rejecting a column: two thirds of the number of
# sequences, rounded to a whole number
limit <- round(nrow(APSESMsaSet) * (2/3))

# iterate over all columns, and write TRUE if there are less-or-equal to "limit"
# hyphens, FALSE if there are more. TRUE thus means: keep this column.
# seq_len() is used instead of 1:lenAli so that the loop is skipped, rather
# than run with the indices 1 and 0, if the alignment should have no columns.
for (i in seq_len(lenAli)) {
  count <- table(msaMatrix[ , i])["-"]
  if (is.na(count)) { # No hyphen: table() has no slot named "-"
    count <- 0
  }
  colMask[i] <- count <= limit
}

# inspect the mask
colMask

# How many positions were masked? R has a simple trick
# to count the number of TRUE and FALSE in a logical
# vector. If a logical TRUE or FALSE is converted into
# a number, it becomes 1 or 0 respectively. If we use
# the sum() function on the vector, the conversion is
# done implicitly. Thus ...
sum(colMask)

# ... gives the number of TRUE elements, i.e. the number of columns we keep.
# The fraction of masked columns is therefore one minus the fraction of TRUE
# elements:

cat(sprintf("We are masking %4.2f %% of alignment columns.\n",
            100 * (1 - (sum(colMask) / length(colMask)))))


# Next, we use colMask to remove the masked columns from the matrix
# in one step: only columns for which colMask is TRUE are retained.
# drop = FALSE makes sure that the result is still a matrix, even if only a
# single column is retained. Without it, R would simplify the result to a
# plain vector, and nrow(), ncol() and rownames() below would no longer work.
maskedMatrix <- msaMatrix[ , colMask, drop = FALSE]

# check:
ncol(maskedMatrix)


# ... then collapse each row back into a sequence ...

# The result vector is created with its final length right away, instead of
# growing it by one element in every iteration. seq_len() skips the loop if
# the matrix has no rows.
apsMaskedSeq <- character(nrow(maskedMatrix))
for (i in seq_len(nrow(maskedMatrix))) {
  apsMaskedSeq[i] <- paste(maskedMatrix[i, ], collapse = "")
}
# carry over the sequence names
names(apsMaskedSeq) <- rownames(maskedMatrix)

# ... and read it back into an AAStringSet object

apsMaskedSet <- AAStringSet(apsMaskedSeq)

# inspect ...
writeSeqSet(apsMaskedSet, format = "ali")


# Step 2. Turn this code into a function...

# Even though the procedure is simple, doing this more than once is tedious and
# prone to errors. I have assembled the steps we just went through into a
# function maskSet() and put it into the utilities.R file, from where it has
# been loaded when you started this session.

# Typing the name of a function without parentheses prints its source code:
maskSet

# Check that the function gives identical results
# to what we did before by hand:
identical(apsMaskedSet, maskSet(APSESMsaSet))

# The result must be TRUE. If it's not TRUE you have
# an error somewhere.

# We save the aligned, masked domains to a file in multi-FASTA format.
# "APSES.mfa" is a relative path - presumably the file is written to the
# current working directory (writeSeqSet() is not defined in this script).
writeSeqSet(maskSet(APSESMsaSet), file = "APSES.mfa",   format = "mfa")


# = 1 Tasks


# [END]
Added and updated learning units 2017-09-12 20:09:20 +00:00			`# BIN-PHYLO-Data_preparation.R`
			`#`
			`# Purpose: A Bioinformatics Course:`
			`# R code accompanying the BIN-PHYLO-Data_preparation unit.`
			`#`
			`# Version: 0.1`
			`#`
			`# Date: 2017 08 28`
			`# Author: Boris Steipe (boris.steipe@utoronto.ca)`
			`#`
			`# Versions:`
			`# 0.1 First code copied from 2016 material.`

			`#`
			`# TODO:`
			`#`
			`#`
			`# == DO NOT SIMPLY source() THIS FILE! =======================================`

			`# If there are portions you don't understand, use R's help system, Google for an`
			`# answer, or ask your instructor. Don't continue if you don't understand what's`
			`# going on. That's not how it works ...`

			`# ==============================================================================`

			`# = 1 ___Section___`

			`# ==============================================================================`
			`# PART ONE: Choosing sequences`
			`# ==============================================================================`

			`# Start by loading libraries. You already have the packages installed.`
			`library(Biostrings)`
			`library(msa)`
			`library(stringr)`

			`# What is the latest version of myDB that you have saved?`
			`list.files(pattern = "myDB.*")`

			`# ... load it (probably myDB.05.RData - if not, change the code below).`
			`load("myDB.05.RData")`

			`# The database contains the ten Mbp1 orthologues from the reference species`
Changing "YFO" to "MYSPE" 2017-10-04 03:38:48 +00:00			`# and the Mbp1 RBM for MYSPE.`
Added and updated learning units 2017-09-12 20:09:20 +00:00			`#`
			`# We will construct a phylogenetic tree from the proteins' APSES domains.`
			`# You have annotated their ranges as a feature.`

			`# Collect APSES domain sequences from your database. The function`
			`# dbGetFeatureSequence() retrieves the sequence that is annotated for a feature`
			`# from its start and end coordinates. Try:`

			`dbGetFeatureSequence(myDB, "MBP1_SACCE", "APSES fold")`

			`# Lets put all APSES sequences into a vector:`
			`APSESnames <- myDB$protein$name[grep("^MBP1_", myDB$protein$name)]`
			`APSES <- character(length(APSESnames))`

			`for (i in 1:length(APSESnames)) {`
			`APSES[i] <- dbGetFeatureSequence(myDB, APSESnames[i], "APSES fold")`
			`}`

			`# Let's name the rows of our vector with the BiCode part of the protein name.`
			`# This is important so we can keep track of which sequence is which. We use the`
			`# gsub() funcion to substitute "" for "MBP1_", thereby deleting this prefix.`
			`names(APSES) <- gsub("^MBP1_", "", APSESnames)`

			`# inspect the result: what do you expect? Is this what you expect?`
			`head(APSES)`

			`# Let's add the E.coli Kila-N domain sequence as an outgroup, for rooting our`
			`# phylogegetic tree (see the Assignment Course Wiki page for details on the`
			`# sequence).`

			`APSES[length(APSES) + 1] <-`
			`"IDGEIIHLRAKDGYINATSMCRTAGKLLSDYTRLKTTQEFFDELSRDMGIPISELIQSFKGGRPENQGTWVHPDIAINLAQ"`
			`names(APSES)[length(APSES)] <- "ESCCO"`


			`# ==============================================================================`
			`# PART TWO: Multiple sequence alignment`
			`# ==============================================================================`

			`# This vector of sequences with named elements fulfills the requirements to be`
			`# imported as a Biostrings object - an AAStringSet - which we need as input for`
			`# the MSA algorithms in Biostrings.`
			`#`

			`APSESSeqSet <- AAStringSet(APSES)`

			`APSESMsaSet <- msaMuscle(APSESSeqSet, order = "aligned")`

			`# inspect the alignment.`
			`writeSeqSet(APSESMsaSet, format = "ali")`


			`# What do you think? Is this a good alignment for phylogenetic inference?`

			`# ==============================================================================`
			`# PART THREE: reviewing and editing alignments`
			`# ==============================================================================`

			`# Head back to the assignment 7 course wiki page and read up on the background`
			`# first.`
			`#`



			`# Let's mask out all columns that have observations for`
			`# less than 1/3 of the sequences in the dataset. This`
			`# means they have more than round(nrow(msaSet) * (2/3))`
			`# hyphens in a column.`
			`#`
			`# We take all sequences, split them into single`
			`# characters, and put them into a matrix. Then we`
			`# go through the matrix, column by column and decide`
			`# whether we want to include that column.`

			`# Step 1. Go through this by hand...`

			`# get the length of the alignment`
			`lenAli <- APSESMsaSet@unmasked@ranges@width[1]`

			`# initialize a matrix that can hold all characters`
			`# individually`
			`msaMatrix <- matrix(character(nrow(APSESMsaSet) * lenAli),`
			`ncol = lenAli)`

			`# assign the correct rownames`
			`rownames(msaMatrix) <- APSESMsaSet@unmasked@ranges@NAMES`
			`for (i in 1:nrow(APSESMsaSet)) {`
			`seq <- as.character(APSESMsaSet@unmasked[i])`
			`msaMatrix[i, ] <- unlist(strsplit(seq, ""))`
			`}`

			`# inspect the result`
			`msaMatrix[1:5, ]`

			`# Now let's make a logical vector with an element`
			`# for each column that selects which columns should`
			`# be masked out.`

			`# To count the number of elements in a vector, R has`
			`# the table() function. For example ...`
			`table(msaMatrix[ , 1])`
			`table(msaMatrix[ , 10])`
			`table(msaMatrix[ , 20])`
			`table(msaMatrix[ , 30])`


			`# Since the return value of table() is a named vector, where`
			`# the name is the element that was counted in each slot,`
			`# we can simply get the counts for hyphens from the`
			`# return value of table(). We don't even need to assign`
			`# the result to an intermediate variable, but we`
			`# can attach the selection via square brackets,`
			`# i.e.: ["-"], directly to the function call:`
			`table(msaMatrix[ , 1])["-"]`

			`# ... to get the number of hyphens. And we can compare`
			`# whether it is eg. > 4.`
			`table(msaMatrix[ , 1])["-"] > 4`

			`# Thus filling our logical vector is really simple:`

			`# initialize the mask`
			`colMask <- logical(lenAli)`

			`# define the threshold for rejecting a column`
			`limit <- round(nrow(APSESMsaSet) * (2/3))`

			`# iterate over all columns, and write TRUE if there are less-or-equal to "limit"`
			`# hyphens, FALSE if there are more.`
			`for (i in 1:lenAli) {`
			`count <- table(msaMatrix[ , i])["-"]`
			`if (is.na(count)) { # No hyphen`
			`count <- 0`
			`}`
			`colMask[i] <- count <= limit`
			`}`

			`# inspect the mask`
			`colMask`

			`# How many positions were masked? R has a simple trick`
			`# to count the number of TRUE and FALSE in a logical`
			`# vector. If a logical TRUE or FALSE is converted into`
			`# a number, it becomes 1 or 0 respectively. If we use`
			`# the sum() function on the vector, the conversion is`
			`# done implicitly. Thus ...`
			`sum(colMask)`

			`# ... gives the number of TRUE elements.`

			`cat(sprintf("We are masking %4.2f %% of alignment columns.\n",`
			`100 * (1 - (sum(colMask) / length(colMask)))))`


			`# Next, we use colMask to remove the masked columns from the matrix`
			`# in one step:`
			`maskedMatrix <- msaMatrix[ , colMask]`

			`# check:`
			`ncol(maskedMatrix)`


			`# ... then collapse each row back into a sequence ...`

			`apsMaskedSeq <- character()`
			`for (i in 1:nrow(maskedMatrix)) {`
			`apsMaskedSeq[i] <- paste(maskedMatrix[i, ], collapse="")`
			`}`
			`names(apsMaskedSeq) <- rownames(maskedMatrix)`

			`# ... and read it back into an AAStringSet object`

			`apsMaskedSet <- AAStringSet(apsMaskedSeq)`

			`# inspect ...`
			`writeSeqSet(apsMaskedSet, format = "ali")`



			`# Step 2. Turn this code into a function...`

			`# Even though the procedure is simple, doing this more than once is tedious and`
			`# prone to errors. I have assembled the steps we just went through into a`
			`# function maskSet() and put it into the utilities.R file, from where it has`
			`# been loaded when you started this sesssion.`

			`maskSet`

			`# Check that the function gives identical results`
			`# to what we did before by hand:`
			`identical(apsMaskedSet, maskSet(APSESMsaSet))`

			`# The result must be TRUE. If it's not TRUE you have`
			`# an error somewhere.`

			`# We save the aligned, masked domains to a file in multi-FASTA format.`
			`writeSeqSet(maskSet(APSESMsaSet), file = "APSES.mfa", format = "mfa")`



			`# = 1 Tasks`




			`# [END]`