365 lines
13 KiB
R
365 lines
13 KiB
R
# RPR-Genetic_code_optimality.R
|
|
#
|
|
# Purpose: A Bioinformatics Course:
|
|
# R code accompanying the RPR-Genetic_code_optimality unit.
|
|
#
|
|
# Version: 1.0.1
|
|
#
|
|
# Date: 2017 10 16
|
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
|
#
|
|
# Versions:
|
|
# 1.0.1 Fixed two bugs discovered by Suan Chin Yeo.
|
|
# 1.0 New material.
|
|
#
|
|
#
|
|
# TODO:
|
|
#
|
|
#
|
|
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
|
#
|
|
# If there are portions you don't understand, use R's help system, Google for an
|
|
# answer, or ask your instructor. Don't continue if you don't understand what's
|
|
# going on. That's not how it works ...
|
|
#
|
|
# ==============================================================================
|
|
|
|
|
|
#TOC> ==========================================================================
|
|
#TOC>
|
|
#TOC> Section Title Line
|
|
#TOC> --------------------------------------------------------
|
|
#TOC> 1 Designing a computational experiment 57
|
|
#TOC> 2 Setting up the tools 73
|
|
#TOC> 2.1 Natural and alternative genetic codes 76
|
|
#TOC> 2.2 Effect of mutations 135
|
|
#TOC> 2.2.1 reverse-translate 146
|
|
#TOC> 2.2.2 Randomly mutate 171
|
|
#TOC> 2.2.3 Forward- translate 196
|
|
#TOC> 2.2.4 measure effect 214
|
|
#TOC> 3 Run the experiment 261
|
|
#TOC> 4 Task solutions 348
|
|
#TOC>
|
|
#TOC> ==========================================================================
|
|
|
|
|
|
# This unit demonstrates R code to simulate alternate genetic codes and evaluate
|
|
# their robsustness to code changes. The approaches are quite simple and you
|
|
# will be able to come up with obvious refinements; the point of this code is to
|
|
# demonstrate some R programming techniques, in preparation for more
|
|
# sophisticated questions later.
|
|
|
|
|
|
# = 1 Designing a computational experiment ================================
|
|
|
|
# Computational experiments are conducted like wet-lab experiments. We begin
|
|
# with a hypothesis, then define the observables that relate to the hypothesis,
|
|
# then define the measures we apply to observations, and finally we interpret
|
|
# our observations. If we want to learn something about the evolution of the
|
|
# genetic code ...
|
|
|
|
# - we construct a hypothesis such as: the genetic code has evolved so as to
|
|
# minimize the effect of mutations;
|
|
# - we define the observables: the effect of mutations in
|
|
# sequences, given the natural and possible alternative codes;
|
|
# - we define the measures to quantify the effect of mutations;
|
|
# - then we compute alternatives and interpret the results.
|
|
|
|
|
|
# = 2 Setting up the tools ================================================
|
|
|
|
|
|
# == 2.1 Natural and alternative genetic codes =============================
|
|
|
|
# Load genetic code tables from the Biostrings package
|
|
if (! require(Biostrings, quietly=TRUE)) {
|
|
if (! exists("biocLite")) {
|
|
source("https://bioconductor.org/biocLite.R")
|
|
}
|
|
biocLite("Biostrings")
|
|
library(Biostrings)
|
|
}
|
|
# Package information:
|
|
# library(help = Biostrings) # basic information
|
|
# browseVignettes("Biostrings") # available vignettes
|
|
# data(package = "Biostrings") # available datasets
|
|
|
|
|
|
# There are many ways to generate alternative codes. The simplest way is to
|
|
# randomly assign amino acids to codons. A more sophisticated way is to keep the
|
|
# redundancy of codons intact, since it may reflect some form of symmetry
|
|
# breaking that ignores the third nucleotide of a codon for the most part;
|
|
# therefore we only replace the amino acids of the existing code with random
|
|
# others. Here are two functions that implement these two ideas about alternate
|
|
# codes.
|
|
|
|
randomGC <- function(GC) {
|
|
# Return a genetic code with randomly assigned amino acids.
|
|
# Parameters:
|
|
# GC named chr length-64 character vector of 20 amino acid one-letter
|
|
# codes plus "*" (stop), named with the codon triplet.
|
|
# Value: named chr same vector with random amino acid assignments in which
|
|
# every amino acid and "*" is encoded at least once.
|
|
|
|
aa <- unique(GC) # the amino acids in the input code
|
|
GC[1:64] <- sample(aa, 64, replace = TRUE) # random code
|
|
while(length(unique(GC)) < length(aa)) { # We could end up with a code that
|
|
# does not contain all amino acids,
|
|
# then we sample() again.
|
|
GC[1:64] <- sample(aa, 64, replace = TRUE)
|
|
}
|
|
return(GC)
|
|
}
|
|
|
|
swappedGC <- function(GC) {
|
|
# Return a genetic code with randomly swapped amino acids.
|
|
# Parameters:
|
|
# GC named chr length-64 character vector of 20 amino acid one-letter
|
|
# codes plus "*" (stop), named with the codon triplet.
|
|
# Value: named chr same vector with random amino acid assignments where the
|
|
# amino acids have been swapped.
|
|
|
|
aaOrig <- unique(GC) # the amino acids in the input code
|
|
aaSwap <- sample(aaOrig, length(aaOrig)) # shuffled
|
|
names(aaSwap) <- aaOrig # name them after the original
|
|
GC[1:64] <- aaSwap[GC] # replace original with shuffled
|
|
|
|
return(GC)
|
|
}
|
|
|
|
|
|
# == 2.2 Effect of mutations ===============================================
|
|
|
|
|
|
# To evaluate the effects of mutations we will do the following:
|
|
# - we take an amino acid sequence (Mbp1 will do just nicely);
|
|
# - we reverse-translate it into a nucleotide sequence;
|
|
# - we mutate it randomly;
|
|
# - we translate it back to amino acids;
|
|
# - we count the number of mutations and evaluate their severity.
|
|
|
|
|
|
# === 2.2.1 reverse-translate
|
|
|
|
# To reverse-translate an amino acid vector, we randomly pick one of its
|
|
# codons from a genetic code, and assemble all codons to a sequence.
|
|
|
|
traRev <- function(s, GC) {
|
|
# Parameters:
|
|
# s chr a sequence vector
|
|
# GC chr a genetic code
|
|
# Value:
|
|
# A reverse-translated vector of codons
|
|
vC <- character(length(s))
|
|
|
|
for (i in seq_along(s)) {
|
|
codon <- names(GC)[GC == s[i]] # get all codons for this AA
|
|
if (length(codon) > 1) { # if there's more than one ...
|
|
codon <- sample(codon, 1) # pick one at random ...
|
|
}
|
|
vC[i] <- codon # store it
|
|
}
|
|
|
|
return(vC)
|
|
}
|
|
|
|
|
|
# === 2.2.2 Randomly mutate
|
|
|
|
# To mutate, we split a codon into it's three nucleotides, then randomly replace
|
|
# one of the three with another nucleotide.
|
|
|
|
randMut <- function(vC) {
|
|
# Parameter:
|
|
# vC chr a vector of codons
|
|
# Value: chr a vector of codons with a single point mutation from vC
|
|
|
|
nuc <- c("A", "C", "G", "T")
|
|
|
|
for (i in seq_along(vC)) {
|
|
triplet <- unlist(strsplit(vC[i], "")) # split into three nucl.
|
|
iNuc <- sample(1:3, 1) # choose one of the three
|
|
mutNuc <- sample(nuc[nuc != triplet[iNuc]], 1) # chose a mutated nucleotide
|
|
triplet[iNuc] <- mutNuc # replace the original
|
|
vC[i] <- paste0(triplet, collapse = "") # collapse it to a codon
|
|
}
|
|
return(vC)
|
|
|
|
}
|
|
|
|
|
|
|
|
# === 2.2.3 Forward- translate
|
|
|
|
traFor <- function(vC, GC) {
|
|
# Parameters:
|
|
# vC chr a codon vector
|
|
# GC chr a genetic code
|
|
# Value:
|
|
# A vector of amino acids
|
|
vAA <- character(length(vC))
|
|
|
|
for (i in seq_along(vC)) {
|
|
vAA[i] <- GC[vC[i]] # translate and store
|
|
}
|
|
return(vAA)
|
|
|
|
}
|
|
|
|
|
|
# === 2.2.4 measure effect
|
|
|
|
# How do we evaluate the effect of the mutation? We'll take a simple ad hoc
|
|
# approach: we divide amino acids into hydrophobic, hydrophilic, and neutral
|
|
# categories, according to their free energy of transfer from water to octanol:
|
|
aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
|
|
aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
|
|
aaNeutral <- c("A", "H", "T", "S", "V", "G")
|
|
|
|
# Then we will penalize as follows:
|
|
# Changes within one category: 0.1
|
|
# Changes from hydrophobic or hydrophilic to neutral or back: 0.3
|
|
# Changes from hydrophobic to hydrophilic or back: 1.0
|
|
# Changes to stop-codon: 3.0
|
|
|
|
evalMut <- function(nat, mut) {
|
|
# Evaluate severity of mutations between amino acid sequence vectors nat and
|
|
# mut in an ad hoc approach based on hydrophobicity changes.
|
|
aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
|
|
aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
|
|
aaNeutral <- c("A", "H", "T", "S", "V", "G")
|
|
|
|
penalties <- numeric(length(nat))
|
|
lMut <- nat != mut # logical TRUE for all mutated positions
|
|
|
|
penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaHphobic)] <- 0.1
|
|
penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaHphilic)] <- 1.0
|
|
penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaNeutral)] <- 0.3
|
|
|
|
penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaHphobic)] <- 1.0
|
|
penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaHphilic)] <- 0.1
|
|
penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaNeutral)] <- 0.3
|
|
|
|
penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaHphobic)] <- 0.3
|
|
penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaHphilic)] <- 0.3
|
|
penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaNeutral)] <- 0.1
|
|
|
|
return(sum(penalties))
|
|
}
|
|
|
|
# A more sophisticated approach could take additional quantities into account,
|
|
# such as charge, size, or flexibility - and it could add heuristics, such as:
|
|
# proline is always bad in secondary structure, charged amino acids are terrible
|
|
# in the folded core of a protein, replacing a small by a large amino acid in
|
|
# the core is very disruptive ... etc.
|
|
|
|
|
|
# = 3 Run the experiment ==================================================
|
|
|
|
|
|
# Fetch the nucleotide sequence for MBP1:
|
|
|
|
myDNA <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")[-1]
|
|
myDNA <- paste0(myDNA, collapse = "")
|
|
myDNA <- as.character(codons(DNAString(myDNA)))
|
|
myDNA <- myDNA[-length(myDNA)] # drop the stop codon
|
|
|
|
myAA <- traFor(myDNA, GENETIC_CODE)
|
|
|
|
# Mutate and evaluate
|
|
set.seed(112358)
|
|
x <- randMut(myDNA)
|
|
x <- traFor(x, GENETIC_CODE)
|
|
evalMut(myAA, x) # 166.4
|
|
|
|
# Try this 200 times, and see how the values are distributed.
|
|
set.seed(112358)
|
|
N <- 200
|
|
valUGC <- numeric(N)
|
|
for (i in 1:N) {
|
|
x <- randMut(myDNA) # mutate
|
|
x <- traFor(x, GENETIC_CODE) # translate
|
|
valUGC[i] <- evalMut(myAA, x) # evaluate
|
|
}
|
|
|
|
hist(valUGC,
|
|
breaks = 15,
|
|
col = "palegoldenrod",
|
|
xlim = c(0, 400),
|
|
ylim = c(0, N/4),
|
|
main = "Universal vs. Synthetic Genetic Code",
|
|
xlab = "Mutation penalty")
|
|
|
|
# This looks like a normal distribution. Let's assume the effect of mutations
|
|
# under the universal genetic code is the mean of this distribution:
|
|
effectUGC <- mean(valUGC) # 178.1
|
|
|
|
# Now we can look at the effects of alternate genetic codes:
|
|
|
|
set.seed(112358)
|
|
# choose a new code
|
|
GC <- randomGC(GENETIC_CODE)
|
|
|
|
# reverse translate hypothetical sequence according to the new code
|
|
x <- traRev(myAA, GC)
|
|
|
|
x <- randMut(x) # randomly mutate hypothetical nucleotide sequence
|
|
x <- traFor(x, GC) # translate back, with the new code
|
|
evalMut(myAA, x) # evaluate mutation effects: 298.5
|
|
|
|
# That seems a fair bit higher than what we saw as "effectUGC"
|
|
# Let's try with different genetic codes. 200 trials - but this time every trial
|
|
# is with a different, synthetic genetic code.
|
|
|
|
set.seed(1414214)
|
|
N <- 200
|
|
valXGC <- numeric(N)
|
|
for (i in 1:N) {
|
|
GC <- randomGC(GENETIC_CODE) # Choose code
|
|
x <- traRev(myAA, GC) # reverse translate
|
|
x <- randMut(x) # mutate
|
|
x <- traFor(x, GC) # translate
|
|
valXGC[i] <- evalMut(myAA, x) # evaluate
|
|
}
|
|
|
|
hist(valXGC,
|
|
col = "plum",
|
|
breaks = 15,
|
|
add = TRUE)
|
|
|
|
# These two distributions are very widely separated!
|
|
|
|
# Task: Perform the same experiment with the swapped genetic code.
|
|
# Compare the distributions. Interpret the result.
|
|
|
|
|
|
# These are simple experiments, under assumptions that can be refined in
|
|
# meaningful ways. Yet, even those simple computational experiments show
|
|
# that the Universal Genetic Code has features that one would predict if
|
|
# it has evolved under selective pressure to minimize the effects of mutations.
|
|
# Gradual change under mutation is benificial to evolution, disruptive
|
|
# change is not.
|
|
|
|
|
|
# = 4 Task solutions ======================================================
|
|
|
|
set.seed(2718282)
|
|
N <- 200
|
|
valSGC <- numeric(N)
|
|
for (i in 1:N) {
|
|
GC <- swappedGC(GENETIC_CODE) # Choose code
|
|
x <- traRev(myAA, GC) # reverse translate
|
|
x <- randMut(x) # mutate
|
|
x <- traFor(x, GC) # translate
|
|
valSGC[i] <- evalMut(myAA, x) # evaluate
|
|
}
|
|
|
|
hist(valSGC,
|
|
col = "#6688FF88",
|
|
breaks = 15,
|
|
add = TRUE)
|
|
|
|
|
|
|
|
# [END]
|