bch441-work-abc-units/RPR-Genetic_code_optimality.R

374 lines
13 KiB
R

# RPR-Genetic_code_optimality.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Genetic_code_optimality unit.
#
# Version: 1.1
#
# Date: 2017 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.1 Update set.seed() usage
# 1.0.1 Fixed two bugs discovered by Suan Chin Yeo.
# 1.0 New material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------------------------------
#TOC> 1 Designing a computational experiment 54
#TOC> 2 Setting up the tools 70
#TOC> 2.1 Natural and alternative genetic codes 73
#TOC> 2.2 Effect of mutations 132
#TOC> 2.2.1 reverse-translate 143
#TOC> 2.2.2 Randomly mutate 168
#TOC> 2.2.3 Forward- translate 193
#TOC> 2.2.4 measure effect 211
#TOC> 3 Run the experiment 258
#TOC> 4 Task solutions 351
#TOC>
#TOC> ==========================================================================
# This unit demonstrates R code to simulate alternate genetic codes and evaluate
# their robsustness to code changes. The approaches are quite simple and you
# will be able to come up with obvious refinements; the point of this code is to
# demonstrate some R programming techniques, in preparation for more
# sophisticated questions later.
# = 1 Designing a computational experiment ================================
# Computational experiments are conducted like wet-lab experiments. We begin
# with a hypothesis, then define the observables that relate to the hypothesis,
# then define the measures we apply to observations, and finally we interpret
# our observations. If we want to learn something about the evolution of the
# genetic code ...
# - we construct a hypothesis such as: the genetic code has evolved so as to
# minimize the effect of mutations;
# - we define the observables: the effect of mutations in
# sequences, given the natural and possible alternative codes;
# - we define the measures to quantify the effect of mutations;
# - then we compute alternatives and interpret the results.
# = 2 Setting up the tools ================================================
# == 2.1 Natural and alternative genetic codes =============================
# Load genetic code tables from the Biostrings package
if (! require(Biostrings, quietly=TRUE)) {
if (! exists("biocLite")) {
source("https://bioconductor.org/biocLite.R")
}
biocLite("Biostrings")
library(Biostrings)
}
# Package information:
# library(help = Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets
# There are many ways to generate alternative codes. The simplest way is to
# randomly assign amino acids to codons. A more sophisticated way is to keep the
# redundancy of codons intact, since it may reflect some form of symmetry
# breaking that ignores the third nucleotide of a codon for the most part;
# therefore we only replace the amino acids of the existing code with random
# others. Here are two functions that implement these two ideas about alternate
# codes.
randomGC <- function(GC) {
# Return a genetic code with randomly assigned amino acids.
# Parameters:
# GC named chr length-64 character vector of 20 amino acid one-letter
# codes plus "*" (stop), named with the codon triplet.
# Value: named chr same vector with random amino acid assignments in which
# every amino acid and "*" is encoded at least once.
aa <- unique(GC) # the amino acids in the input code
GC[1:64] <- sample(aa, 64, replace = TRUE) # random code
while(length(unique(GC)) < length(aa)) { # We could end up with a code that
# does not contain all amino acids,
# then we sample() again.
GC[1:64] <- sample(aa, 64, replace = TRUE)
}
return(GC)
}
swappedGC <- function(GC) {
# Return a genetic code with randomly swapped amino acids.
# Parameters:
# GC named chr length-64 character vector of 20 amino acid one-letter
# codes plus "*" (stop), named with the codon triplet.
# Value: named chr same vector with random amino acid assignments where the
# amino acids have been swapped.
aaOrig <- unique(GC) # the amino acids in the input code
aaSwap <- sample(aaOrig, length(aaOrig)) # shuffled
names(aaSwap) <- aaOrig # name them after the original
GC[1:64] <- aaSwap[GC] # replace original with shuffled
return(GC)
}
# == 2.2 Effect of mutations ===============================================
# To evaluate the effects of mutations we will do the following:
# - we take an amino acid sequence (Mbp1 will do just nicely);
# - we reverse-translate it into a nucleotide sequence;
# - we mutate it randomly;
# - we translate it back to amino acids;
# - we count the number of mutations and evaluate their severity.
# === 2.2.1 reverse-translate
# To reverse-translate an amino acid vector, we randomly pick one of its
# codons from a genetic code, and assemble all codons to a sequence.
traRev <- function(s, GC) {
# Parameters:
# s chr a sequence vector
# GC chr a genetic code
# Value:
# A reverse-translated vector of codons
vC <- character(length(s))
for (i in seq_along(s)) {
codon <- names(GC)[GC == s[i]] # get all codons for this AA
if (length(codon) > 1) { # if there's more than one ...
codon <- sample(codon, 1) # pick one at random ...
}
vC[i] <- codon # store it
}
return(vC)
}
# === 2.2.2 Randomly mutate
# To mutate, we split a codon into it's three nucleotides, then randomly replace
# one of the three with another nucleotide.
randMut <- function(vC) {
# Parameter:
# vC chr a vector of codons
# Value: chr a vector of codons with a single point mutation from vC
nuc <- c("A", "C", "G", "T")
for (i in seq_along(vC)) {
triplet <- unlist(strsplit(vC[i], "")) # split into three nucl.
iNuc <- sample(1:3, 1) # choose one of the three
mutNuc <- sample(nuc[nuc != triplet[iNuc]], 1) # chose a mutated nucleotide
triplet[iNuc] <- mutNuc # replace the original
vC[i] <- paste0(triplet, collapse = "") # collapse it to a codon
}
return(vC)
}
# === 2.2.3 Forward- translate
traFor <- function(vC, GC) {
# Parameters:
# vC chr a codon vector
# GC chr a genetic code
# Value:
# A vector of amino acids
vAA <- character(length(vC))
for (i in seq_along(vC)) {
vAA[i] <- GC[vC[i]] # translate and store
}
return(vAA)
}
# === 2.2.4 measure effect
# How do we evaluate the effect of the mutation? We'll take a simple ad hoc
# approach: we divide amino acids into hydrophobic, hydrophilic, and neutral
# categories, according to their free energy of transfer from water to octanol:
aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
aaNeutral <- c("A", "H", "T", "S", "V", "G")
# Then we will penalize as follows:
# Changes within one category: 0.1
# Changes from hydrophobic or hydrophilic to neutral or back: 0.3
# Changes from hydrophobic to hydrophilic or back: 1.0
# Changes to stop-codon: 3.0
evalMut <- function(nat, mut) {
# Evaluate severity of mutations between amino acid sequence vectors nat and
# mut in an ad hoc approach based on hydrophobicity changes.
aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
aaNeutral <- c("A", "H", "T", "S", "V", "G")
penalties <- numeric(length(nat))
lMut <- nat != mut # logical TRUE for all mutated positions
penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaHphobic)] <- 0.1
penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaHphilic)] <- 1.0
penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaNeutral)] <- 0.3
penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaHphobic)] <- 1.0
penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaHphilic)] <- 0.1
penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaNeutral)] <- 0.3
penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaHphobic)] <- 0.3
penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaHphilic)] <- 0.3
penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaNeutral)] <- 0.1
return(sum(penalties))
}
# A more sophisticated approach could take additional quantities into account,
# such as charge, size, or flexibility - and it could add heuristics, such as:
# proline is always bad in secondary structure, charged amino acids are terrible
# in the folded core of a protein, replacing a small by a large amino acid in
# the core is very disruptive ... etc.
# = 3 Run the experiment ==================================================
# Fetch the nucleotide sequence for MBP1:
myDNA <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")[-1]
myDNA <- paste0(myDNA, collapse = "")
myDNA <- as.character(codons(DNAString(myDNA)))
myDNA <- myDNA[-length(myDNA)] # drop the stop codon
myAA <- traFor(myDNA, GENETIC_CODE)
# Mutate and evaluate
set.seed(112358)
x <- randMut(myDNA)
set.seed(NULL)
x <- traFor(x, GENETIC_CODE)
evalMut(myAA, x) # 166.4
# Try this 200 times, and see how the values are distributed.
N <- 200
valUGC <- numeric(N)
set.seed(112358) # set RNG seed for repeatable randomness
for (i in 1:N) {
x <- randMut(myDNA) # mutate
x <- traFor(x, GENETIC_CODE) # translate
valUGC[i] <- evalMut(myAA, x) # evaluate
}
set.seed(NULL) # reset the RNG
hist(valUGC,
breaks = 15,
col = "palegoldenrod",
xlim = c(0, 400),
ylim = c(0, N/4),
main = "Universal vs. Synthetic Genetic Code",
xlab = "Mutation penalty")
# This looks like a normal distribution. Let's assume the effect of mutations
# under the universal genetic code is the mean of this distribution:
effectUGC <- mean(valUGC) # 178.1
# Now we can look at the effects of alternate genetic codes:
set.seed(112358)
# choose a new code
GC <- randomGC(GENETIC_CODE)
set.seed(NULL)
# reverse translate hypothetical sequence according to the new code
x <- traRev(myAA, GC)
x <- randMut(x) # randomly mutate hypothetical nucleotide sequence
x <- traFor(x, GC) # translate back, with the new code
evalMut(myAA, x) # evaluate mutation effects: 298.5
# That seems a fair bit higher than what we saw as "effectUGC"
# Let's try with different genetic codes. 200 trials - but this time every trial
# is with a different, synthetic genetic code.
N <- 200
valXGC <- numeric(N)
set.seed(1414214) # set RNG seed for repeatable randomness
for (i in 1:N) {
GC <- randomGC(GENETIC_CODE) # Choose code
x <- traRev(myAA, GC) # reverse translate
x <- randMut(x) # mutate
x <- traFor(x, GC) # translate
valXGC[i] <- evalMut(myAA, x) # evaluate
}
set.seed(NULL) # reset the RNG
hist(valXGC,
col = "plum",
breaks = 15,
add = TRUE)
# These two distributions are very widely separated!
# Task: Perform the same experiment with the swapped genetic code.
# Compare the distributions. Interpret the result.
# These are simple experiments, under assumptions that can be refined in
# meaningful ways. Yet, even those simple computational experiments show
# that the Universal Genetic Code has features that one would predict if
# it has evolved under selective pressure to minimize the effects of mutations.
# Gradual change under mutation is benificial to evolution, disruptive
# change is not.
# = 4 Task solutions ======================================================
N <- 200
valSGC <- numeric(N)
set.seed(2718282) # set RNG seed for repeatable randomness
for (i in 1:N) {
GC <- swappedGC(GENETIC_CODE) # Choose code
x <- traRev(myAA, GC) # reverse translate
x <- randMut(x) # mutate
x <- traFor(x, GC) # translate
valSGC[i] <- evalMut(myAA, x) # evaluate
}
set.seed(NULL) # reset the RNG
hist(valSGC,
col = "#6688FF88",
breaks = 15,
add = TRUE)
# [END]