New unit

2017-10-15 23:40:13 -04:00 · 2017-10-15 23:40:13 -04:00 · a942bef91e
commit a942bef91e
parent 05958349cf
1 changed files with 359 additions and 0 deletions
--- a/RPR-Genetic_code_optimality.R
+++ b/RPR-Genetic_code_optimality.R
@ -0,0 +1,359 @@
+# RPR-Genetic_code_optimality.R
+#
+# Purpose:  A Bioinformatics Course:
+#              R code accompanying the RPR-Genetic_code_optimality unit.
+#
+# Version:  1.0
+#
+# Date:     2017  10  16
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.0    New material.
+#
+#
+# TODO:
+#
+#
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+# going on. That's not how it works ...
+#
+# ==============================================================================
+ 
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                                    Line
+#TOC> --------------------------------------------------------
+#TOC>   1        Designing a computational experiment       53
+#TOC>   2        Setting up the tools                       69
+#TOC>   2.1      Natural and alternative genetic codes      72
+#TOC>   2.2      Effect of mutations                       126
+#TOC>   2.2.1    reverse-translate                         137
+#TOC>   2.2.2    Randomly mutate                           162
+#TOC>   2.2.3    Forward- translate                        187
+#TOC>   2.2.4    measure effect                            205
+#TOC>   3        Run the experiment                        252
+#TOC>   4        Task solutions                            339
+#TOC> 
+#TOC> ==========================================================================
+ 
+
+
+
+# This unit demonstrates R code to simulate alternate genetic codes and evaluate
+# their robsustness to code changes. The approaches are quite simple and you
+# will be able to come up with obvious refinements; the point of this code is to
+# demonstrate some R programming techniques, in preparation for more
+# sophisticated questions later.
+
+
+# =    1  Designing a computational experiment  ================================
+
+# Computational experiments are conducted like wet-lab experiments. We begin
+# with a hypothesis, then define the observables that relate to the hypothesis,
+# then define the measures we apply to observations, and finally we interpret
+# our observations. If we want to learn something about the evolution of the
+# genetic code ...
+
+#  - we construct a hypothesis such as: the genetic code has evolved so as to
+#      minimize the effect of mutations;
+#  - we define the observables: the effect of mutations in
+#      sequences, given the natural and possible alternative codes;
+#  - we define the measures to quantify the effect of mutations;
+#  - then we compute alternatives and interpret the results.
+
+
+# =    2  Setting up the tools  ================================================
+
+
+# ==   2.1  Natural and alternative genetic codes  =============================
+
+# Load the code from the Biostrings package
+if (! require(Biostrings)) {
+  if (! exists("biocLite")) {
+    source("https://bioconductor.org/biocLite.R")
+  }
+  biocLite("Biostrings")
+  library(Biostrings)
+}
+
+# There are many ways to generate alternative codes. The simplest way is to
+# randomly assign amino acids to codons. A more sophisticated way is to keep the
+# redundancy of codons intact, since it may reflect some form of symmetry
+# breaking that ignores the third nucleotide of a codon for the most part;
+# therefore we only replace the amino acids of the existing code with random
+# others. Here are two functions that implement these two ideas about alternate
+# codes.
+
+randomGC <- function(GC) {
+  # Return a genetic code with randomly assigned amino acids.
+  # Parameters:
+  #    GC   named chr  length-64 character vector of 20 amino acid one-letter
+  #                       codes plus "*" (stop), named with the codon triplet.
+  # Value:  named chr  same vector with random amino acid assignments in which
+  #                       every amino acid and "*" is encoded at least once.
+
+  aa <- unique(GC)                           # the amino acids in the input code
+  GC[1:64] <- sample(aa, 64, replace = TRUE) # random code
+  while(length(unique(GC)) < length(aa)) {   # We could end up with a code that
+                                             # does not contain all amino acids,
+                                             # then we sample() again.
+    GC[1:64] <- sample(aa, 64, replace = TRUE)
+  }
+  return(GC)
+}
+
+swappedGC <- function(GC) {
+  # Return a genetic code with randomly swapped amino acids.
+  # Parameters:
+  #    GC   named chr  length-64 character vector of 20 amino acid one-letter
+  #                       codes plus "*" (stop), named with the codon triplet.
+  # Value:  named chr  same vector with random amino acid assignments where the
+  #                       amino acids have been swapped.
+
+  aaOrig <- unique(GC)                # the amino acids in the input code
+  aaSwap <- sample(aa, length(aa))    # shuffled
+  names(aaSwap) <- aaOrig             # name them after the original
+  GC[1:64] <- aaSwap[GC]              # replace original with shuffled
+
+  return(GC)
+}
+
+
+# ==   2.2  Effect of mutations  ===============================================
+
+
+# To evaluate the effects of mutations we will do the following:
+#   - we take an amino acid sequence (Mbp1 will do just nicely);
+#   - we reverse-translate it into a nucleotide sequence;
+#   - we mutate it randomly;
+#   - we translate it back to amino acids;
+#   - we count the number of mutations and evaluate their severity.
+
+
+# ===  2.2.1  reverse-translate                    
+
+# To reverse-translate an amino acid vector, we randomly pick one of its
+# codons from a genetic code, and assemble all codons to a sequence.
+
+traRev <- function(s, GC) {
+  # Parameters:
+  #      s   chr   a sequence vector
+  #      GC  chr   a genetic code
+  # Value:
+  #      A reverse-translated vector of codons
+  vC <- character(length(s))
+
+  for (i in seq_along(s)) {
+    codon <- names(GC)[GC == s[i]]   # get all codons for this AA
+    if (length(codon) > 1) {         # if there's more than one ...
+      codon <- sample(codon, 1)      # pick one at random ...
+    }
+    vC[i] <- codon                   # store it
+  }
+
+  return(vC)
+}
+
+
+# ===  2.2.2  Randomly mutate                      
+
+# To mutate, we split a codon into it's three nucleotides, then randomly replace
+# one of the three with another nucleotide.
+
+randMut <- function(vC) {
+  # Parameter:
+  #    vC   chr     a vector of codons
+  # Value:  chr     a vector of codons with a single point mutation from vC
+
+  nuc <- c("A", "C", "G", "T")
+
+  for (i in seq_along(vC)) {
+    triplet <- unlist(strsplit(vC[i], ""))         # split into three nucl.
+    iNuc <- sample(1:3, 1)                         # choose one of the three
+    mutNuc <- sample(nuc[nuc != triplet[iNuc]], 1) # chose a mutated nucleotide
+    triplet[iNuc] <- mutNuc                        # replace the original
+    vC[i] <- paste0(triplet, collapse = "")        # collapse it to a codon
+  }
+  return(vC)
+
+}
+
+
+
+# ===  2.2.3  Forward- translate                   
+
+traFor <- function(vC, GC) {
+  # Parameters:
+  #      vC   chr   a codon vector
+  #      GC   chr   a genetic code
+  # Value:
+  #      A vector of amino acids
+  vAA <- character(length(vC))
+
+  for (i in seq_along(vC)) {
+    vAA[i] <- GC[vC[i]]         # translate and store
+  }
+  return(vAA)
+
+  }
+
+
+# ===  2.2.4  measure effect                       
+
+# How do we evaluate the effect of the mutation? We'll take a simple ad hoc
+# approach: we divide amino acids into hydrophobic, hydrophilic, and neutral
+# categories, according to their free energy of transfer from water to octanol:
+aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
+aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
+aaNeutral <- c("A", "H", "T", "S", "V", "G")
+
+# Then we will penalize as follows:
+# Changes within one category: 0.1
+# Changes from hydrophobic or hydrophilic to neutral or back: 0.3
+# Changes from hydrophobic to hydrophilic or back: 1.0
+# Changes to stop-codon: 3.0
+
+evalMut <- function(nat, mut) {
+  # Evaluate severity of mutations between amino acid sequence vectors nat and
+  # mut in an ad hoc approach based on hydrophobicity changes.
+  aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
+  aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
+  aaNeutral <- c("A", "H", "T", "S", "V", "G")
+
+  penalties <- numeric(length(nat))
+  lMut <- nat != mut    # logical TRUE for all mutated positions
+
+  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaHphobic)] <- 0.1
+  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaHphilic)] <- 1.0
+  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaNeutral)] <- 0.3
+
+  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaHphobic)] <- 1.0
+  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaHphilic)] <- 0.1
+  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaNeutral)] <- 0.3
+
+  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaHphobic)] <- 0.3
+  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaHphilic)] <- 0.3
+  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaNeutral)] <- 0.1
+
+  return(sum(penalties))
+}
+
+# A more sophisticated approach could take additional quantities into account,
+# such as charge, size, or flexibility - and it could add heuristics, such as:
+# proline is always bad in secondary structure, charged amino acids are terrible
+# in the folded core of a protein, replacing a small by a large amino acid in
+# the core is very disruptive ... etc.
+
+
+# =    3  Run the experiment  ==================================================
+
+
+# Fetch the nucleotide sequence for MBP1:
+
+myDNA <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")[-1]
+myDNA <- paste0(myDNA, collapse = "")
+myDNA <- as.character(codons(DNAString(myDNA)))
+myDNA <- myDNA[-length(myDNA)]  # drop the stop codon
+
+myAA <- traFor(myDNA, GENETIC_CODE)
+
+# Mutate and evaluate
+set.seed(112358)
+x <- randMut(myDNA)
+x <- traFor(x, GENETIC_CODE)
+evalMut(myAA, x)  # 166.4
+
+# Try this 200 times, and see how the values are distributed.
+set.seed(112358)
+N <- 200
+valUGC <- numeric(N)
+for (i in 1:N) {
+  x <- randMut(myDNA)            # mutate
+  x <- traFor(x, GENETIC_CODE)   # translate
+  valUGC[i] <- evalMut(myAA, x)     # evaluate
+}
+
+hist(valUGC,
+     breaks = 15,
+     col = "palegoldenrod",
+     xlim = c(0, 400),
+     ylim = c(0, N/4),
+     main = "Universal vs. Synthetic Genetic Code",
+     xlab = "Mutation penalty")
+
+# This looks like a normal distribution. Let's assume the effect of mutations
+# under the universal genetic code is the mean of this distribution:
+effectUGC <- mean(val)  # 178.1
+
+# Now we can look at the effects of alternate genetic codes:
+
+set.seed(112358)
+# choose a new code
+GC <- randomGC(GENETIC_CODE)
+
+# reverse translate hypothetical sequence according to the new code
+x <- traRev(myAA, GC)
+
+x <- randMut(x)        # randomly mutate hypothetical nucleotide sequence
+x <- traFor(x, GC)     # translate back, with the new code
+evalMut(myAA, x)       # evaluate mutation effects: 298.5
+
+# That seems a fair bit higher than what we saw as "effectUGC"
+# Let's try with different genetic codes. 200 trials - but this time every trial
+# is with a different, synthetic genetic code.
+
+set.seed(1414214)
+N <- 200
+valXGC <- numeric(N)
+for (i in 1:N) {
+  GC <- randomGC(GENETIC_CODE)   # Choose code
+  x <- traRev(myAA, GC)          # reverse translate
+  x <- randMut(x)                # mutate
+  x <- traFor(x, GC)             # translate
+  valXGC[i] <- evalMut(myAA, x)  # evaluate
+}
+
+hist(valXGC,
+     col = "plum",
+     breaks = 15,
+     add = TRUE)
+
+# These two distributions are very widely separated!
+
+# Task: Perform the same experiment with the swapped genetic code.
+#       Compare the distributions. Interpret the result.
+
+
+# These are simple experiments, under assumptions that can be refined in
+# meaningful ways. Yet, even those simple computational experiments show
+# that the Universal Genetic Code has features that one would predict if
+# it has evolved under selective pressure to minimize the effects of mutations.
+# Gradual change under mutation is benificial to evolution, disruptive
+# change is not.
+
+
+# =    4  Task solutions  ======================================================
+
+set.seed(2718282)
+N <- 200
+valSGC <- numeric(N)
+for (i in 1:N) {
+  GC <- swappedGC(GENETIC_CODE)  # Choose code
+  x <- traRev(myAA, GC)          # reverse translate
+  x <- randMut(x)                # mutate
+  x <- traFor(x, GC)             # translate
+  valSGC[i] <- evalMut(myAA, x)  # evaluate
+}
+
+hist(valSGC,
+     col = "#6688FF88",
+     breaks = 15,
+     add = TRUE)
+
+
+
+# [END]