New unit
This commit is contained in:
		
							
								
								
									
										359
									
								
								RPR-Genetic_code_optimality.R
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										359
									
								
								RPR-Genetic_code_optimality.R
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,359 @@
 | 
			
		||||
# RPR-Genetic_code_optimality.R
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the RPR-Genetic_code_optimality unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.0
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017  10  16
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.0    New material.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                    Line
 | 
			
		||||
#TOC> --------------------------------------------------------
 | 
			
		||||
#TOC>   1        Designing a computational experiment       53
 | 
			
		||||
#TOC>   2        Setting up the tools                       69
 | 
			
		||||
#TOC>   2.1      Natural and alternative genetic codes      72
 | 
			
		||||
#TOC>   2.2      Effect of mutations                       126
 | 
			
		||||
#TOC>   2.2.1    reverse-translate                         137
 | 
			
		||||
#TOC>   2.2.2    Randomly mutate                           162
 | 
			
		||||
#TOC>   2.2.3    Forward- translate                        187
 | 
			
		||||
#TOC>   2.2.4    measure effect                            205
 | 
			
		||||
#TOC>   3        Run the experiment                        252
 | 
			
		||||
#TOC>   4        Task solutions                            339
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# This unit demonstrates R code to simulate alternate genetic codes and evaluate
 | 
			
		||||
# their robsustness to code changes. The approaches are quite simple and you
 | 
			
		||||
# will be able to come up with obvious refinements; the point of this code is to
 | 
			
		||||
# demonstrate some R programming techniques, in preparation for more
 | 
			
		||||
# sophisticated questions later.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Designing a computational experiment  ================================
 | 
			
		||||
 | 
			
		||||
# Computational experiments are conducted like wet-lab experiments. We begin
 | 
			
		||||
# with a hypothesis, then define the observables that relate to the hypothesis,
 | 
			
		||||
# then define the measures we apply to observations, and finally we interpret
 | 
			
		||||
# our observations. If we want to learn something about the evolution of the
 | 
			
		||||
# genetic code ...
 | 
			
		||||
 | 
			
		||||
#  - we construct a hypothesis such as: the genetic code has evolved so as to
 | 
			
		||||
#      minimize the effect of mutations;
 | 
			
		||||
#  - we define the observables: the effect of mutations in
 | 
			
		||||
#      sequences, given the natural and possible alternative codes;
 | 
			
		||||
#  - we define the measures to quantify the effect of mutations;
 | 
			
		||||
#  - then we compute alternatives and interpret the results.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Setting up the tools  ================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   2.1  Natural and alternative genetic codes  =============================
 | 
			
		||||
 | 
			
		||||
# Load the code from the Biostrings package
 | 
			
		||||
if (! require(Biostrings)) {
 | 
			
		||||
  if (! exists("biocLite")) {
 | 
			
		||||
    source("https://bioconductor.org/biocLite.R")
 | 
			
		||||
  }
 | 
			
		||||
  biocLite("Biostrings")
 | 
			
		||||
  library(Biostrings)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# There are many ways to generate alternative codes. The simplest way is to
 | 
			
		||||
# randomly assign amino acids to codons. A more sophisticated way is to keep the
 | 
			
		||||
# redundancy of codons intact, since it may reflect some form of symmetry
 | 
			
		||||
# breaking that ignores the third nucleotide of a codon for the most part;
 | 
			
		||||
# therefore we only replace the amino acids of the existing code with random
 | 
			
		||||
# others. Here are two functions that implement these two ideas about alternate
 | 
			
		||||
# codes.
 | 
			
		||||
 | 
			
		||||
randomGC <- function(GC) {
 | 
			
		||||
  # Return a genetic code with randomly assigned amino acids.
 | 
			
		||||
  # Parameters:
 | 
			
		||||
  #    GC   named chr  length-64 character vector of 20 amino acid one-letter
 | 
			
		||||
  #                       codes plus "*" (stop), named with the codon triplet.
 | 
			
		||||
  # Value:  named chr  same vector with random amino acid assignments in which
 | 
			
		||||
  #                       every amino acid and "*" is encoded at least once.
 | 
			
		||||
 | 
			
		||||
  aa <- unique(GC)                           # the amino acids in the input code
 | 
			
		||||
  GC[1:64] <- sample(aa, 64, replace = TRUE) # random code
 | 
			
		||||
  while(length(unique(GC)) < length(aa)) {   # We could end up with a code that
 | 
			
		||||
                                             # does not contain all amino acids,
 | 
			
		||||
                                             # then we sample() again.
 | 
			
		||||
    GC[1:64] <- sample(aa, 64, replace = TRUE)
 | 
			
		||||
  }
 | 
			
		||||
  return(GC)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
swappedGC <- function(GC) {
 | 
			
		||||
  # Return a genetic code with randomly swapped amino acids.
 | 
			
		||||
  # Parameters:
 | 
			
		||||
  #    GC   named chr  length-64 character vector of 20 amino acid one-letter
 | 
			
		||||
  #                       codes plus "*" (stop), named with the codon triplet.
 | 
			
		||||
  # Value:  named chr  same vector with random amino acid assignments where the
 | 
			
		||||
  #                       amino acids have been swapped.
 | 
			
		||||
 | 
			
		||||
  aaOrig <- unique(GC)                # the amino acids in the input code
 | 
			
		||||
  aaSwap <- sample(aa, length(aa))    # shuffled
 | 
			
		||||
  names(aaSwap) <- aaOrig             # name them after the original
 | 
			
		||||
  GC[1:64] <- aaSwap[GC]              # replace original with shuffled
 | 
			
		||||
 | 
			
		||||
  return(GC)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   2.2  Effect of mutations  ===============================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# To evaluate the effects of mutations we will do the following:
 | 
			
		||||
#   - we take an amino acid sequence (Mbp1 will do just nicely);
 | 
			
		||||
#   - we reverse-translate it into a nucleotide sequence;
 | 
			
		||||
#   - we mutate it randomly;
 | 
			
		||||
#   - we translate it back to amino acids;
 | 
			
		||||
#   - we count the number of mutations and evaluate their severity.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ===  2.2.1  reverse-translate                    
 | 
			
		||||
 | 
			
		||||
# To reverse-translate an amino acid vector, we randomly pick one of its
 | 
			
		||||
# codons from a genetic code, and assemble all codons to a sequence.
 | 
			
		||||
 | 
			
		||||
traRev <- function(s, GC) {
 | 
			
		||||
  # Parameters:
 | 
			
		||||
  #      s   chr   a sequence vector
 | 
			
		||||
  #      GC  chr   a genetic code
 | 
			
		||||
  # Value:
 | 
			
		||||
  #      A reverse-translated vector of codons
 | 
			
		||||
  vC <- character(length(s))
 | 
			
		||||
 | 
			
		||||
  for (i in seq_along(s)) {
 | 
			
		||||
    codon <- names(GC)[GC == s[i]]   # get all codons for this AA
 | 
			
		||||
    if (length(codon) > 1) {         # if there's more than one ...
 | 
			
		||||
      codon <- sample(codon, 1)      # pick one at random ...
 | 
			
		||||
    }
 | 
			
		||||
    vC[i] <- codon                   # store it
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  return(vC)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ===  2.2.2  Randomly mutate                      
 | 
			
		||||
 | 
			
		||||
# To mutate, we split a codon into it's three nucleotides, then randomly replace
 | 
			
		||||
# one of the three with another nucleotide.
 | 
			
		||||
 | 
			
		||||
randMut <- function(vC) {
 | 
			
		||||
  # Parameter:
 | 
			
		||||
  #    vC   chr     a vector of codons
 | 
			
		||||
  # Value:  chr     a vector of codons with a single point mutation from vC
 | 
			
		||||
 | 
			
		||||
  nuc <- c("A", "C", "G", "T")
 | 
			
		||||
 | 
			
		||||
  for (i in seq_along(vC)) {
 | 
			
		||||
    triplet <- unlist(strsplit(vC[i], ""))         # split into three nucl.
 | 
			
		||||
    iNuc <- sample(1:3, 1)                         # choose one of the three
 | 
			
		||||
    mutNuc <- sample(nuc[nuc != triplet[iNuc]], 1) # chose a mutated nucleotide
 | 
			
		||||
    triplet[iNuc] <- mutNuc                        # replace the original
 | 
			
		||||
    vC[i] <- paste0(triplet, collapse = "")        # collapse it to a codon
 | 
			
		||||
  }
 | 
			
		||||
  return(vC)
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ===  2.2.3  Forward- translate                   
 | 
			
		||||
 | 
			
		||||
traFor <- function(vC, GC) {
 | 
			
		||||
  # Parameters:
 | 
			
		||||
  #      vC   chr   a codon vector
 | 
			
		||||
  #      GC   chr   a genetic code
 | 
			
		||||
  # Value:
 | 
			
		||||
  #      A vector of amino acids
 | 
			
		||||
  vAA <- character(length(vC))
 | 
			
		||||
 | 
			
		||||
  for (i in seq_along(vC)) {
 | 
			
		||||
    vAA[i] <- GC[vC[i]]         # translate and store
 | 
			
		||||
  }
 | 
			
		||||
  return(vAA)
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ===  2.2.4  measure effect                       
 | 
			
		||||
 | 
			
		||||
# How do we evaluate the effect of the mutation? We'll take a simple ad hoc
 | 
			
		||||
# approach: we divide amino acids into hydrophobic, hydrophilic, and neutral
 | 
			
		||||
# categories, according to their free energy of transfer from water to octanol:
 | 
			
		||||
aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
 | 
			
		||||
aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
 | 
			
		||||
aaNeutral <- c("A", "H", "T", "S", "V", "G")
 | 
			
		||||
 | 
			
		||||
# Then we will penalize as follows:
 | 
			
		||||
# Changes within one category: 0.1
 | 
			
		||||
# Changes from hydrophobic or hydrophilic to neutral or back: 0.3
 | 
			
		||||
# Changes from hydrophobic to hydrophilic or back: 1.0
 | 
			
		||||
# Changes to stop-codon: 3.0
 | 
			
		||||
 | 
			
		||||
evalMut <- function(nat, mut) {
 | 
			
		||||
  # Evaluate severity of mutations between amino acid sequence vectors nat and
 | 
			
		||||
  # mut in an ad hoc approach based on hydrophobicity changes.
 | 
			
		||||
  aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
 | 
			
		||||
  aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
 | 
			
		||||
  aaNeutral <- c("A", "H", "T", "S", "V", "G")
 | 
			
		||||
 | 
			
		||||
  penalties <- numeric(length(nat))
 | 
			
		||||
  lMut <- nat != mut    # logical TRUE for all mutated positions
 | 
			
		||||
 | 
			
		||||
  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaHphobic)] <- 0.1
 | 
			
		||||
  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaHphilic)] <- 1.0
 | 
			
		||||
  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaNeutral)] <- 0.3
 | 
			
		||||
 | 
			
		||||
  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaHphobic)] <- 1.0
 | 
			
		||||
  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaHphilic)] <- 0.1
 | 
			
		||||
  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaNeutral)] <- 0.3
 | 
			
		||||
 | 
			
		||||
  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaHphobic)] <- 0.3
 | 
			
		||||
  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaHphilic)] <- 0.3
 | 
			
		||||
  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaNeutral)] <- 0.1
 | 
			
		||||
 | 
			
		||||
  return(sum(penalties))
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# A more sophisticated approach could take additional quantities into account,
 | 
			
		||||
# such as charge, size, or flexibility - and it could add heuristics, such as:
 | 
			
		||||
# proline is always bad in secondary structure, charged amino acids are terrible
 | 
			
		||||
# in the folded core of a protein, replacing a small by a large amino acid in
 | 
			
		||||
# the core is very disruptive ... etc.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  Run the experiment  ==================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Fetch the nucleotide sequence for MBP1:
 | 
			
		||||
 | 
			
		||||
myDNA <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")[-1]
 | 
			
		||||
myDNA <- paste0(myDNA, collapse = "")
 | 
			
		||||
myDNA <- as.character(codons(DNAString(myDNA)))
 | 
			
		||||
myDNA <- myDNA[-length(myDNA)]  # drop the stop codon
 | 
			
		||||
 | 
			
		||||
myAA <- traFor(myDNA, GENETIC_CODE)
 | 
			
		||||
 | 
			
		||||
# Mutate and evaluate
 | 
			
		||||
set.seed(112358)
 | 
			
		||||
x <- randMut(myDNA)
 | 
			
		||||
x <- traFor(x, GENETIC_CODE)
 | 
			
		||||
evalMut(myAA, x)  # 166.4
 | 
			
		||||
 | 
			
		||||
# Try this 200 times, and see how the values are distributed.
 | 
			
		||||
set.seed(112358)
 | 
			
		||||
N <- 200
 | 
			
		||||
valUGC <- numeric(N)
 | 
			
		||||
for (i in 1:N) {
 | 
			
		||||
  x <- randMut(myDNA)            # mutate
 | 
			
		||||
  x <- traFor(x, GENETIC_CODE)   # translate
 | 
			
		||||
  valUGC[i] <- evalMut(myAA, x)     # evaluate
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
hist(valUGC,
 | 
			
		||||
     breaks = 15,
 | 
			
		||||
     col = "palegoldenrod",
 | 
			
		||||
     xlim = c(0, 400),
 | 
			
		||||
     ylim = c(0, N/4),
 | 
			
		||||
     main = "Universal vs. Synthetic Genetic Code",
 | 
			
		||||
     xlab = "Mutation penalty")
 | 
			
		||||
 | 
			
		||||
# This looks like a normal distribution. Let's assume the effect of mutations
 | 
			
		||||
# under the universal genetic code is the mean of this distribution:
 | 
			
		||||
effectUGC <- mean(val)  # 178.1
 | 
			
		||||
 | 
			
		||||
# Now we can look at the effects of alternate genetic codes:
 | 
			
		||||
 | 
			
		||||
set.seed(112358)
 | 
			
		||||
# choose a new code
 | 
			
		||||
GC <- randomGC(GENETIC_CODE)
 | 
			
		||||
 | 
			
		||||
# reverse translate hypothetical sequence according to the new code
 | 
			
		||||
x <- traRev(myAA, GC)
 | 
			
		||||
 | 
			
		||||
x <- randMut(x)        # randomly mutate hypothetical nucleotide sequence
 | 
			
		||||
x <- traFor(x, GC)     # translate back, with the new code
 | 
			
		||||
evalMut(myAA, x)       # evaluate mutation effects: 298.5
 | 
			
		||||
 | 
			
		||||
# That seems a fair bit higher than what we saw as "effectUGC"
 | 
			
		||||
# Let's try with different genetic codes. 200 trials - but this time every trial
 | 
			
		||||
# is with a different, synthetic genetic code.
 | 
			
		||||
 | 
			
		||||
set.seed(1414214)
 | 
			
		||||
N <- 200
 | 
			
		||||
valXGC <- numeric(N)
 | 
			
		||||
for (i in 1:N) {
 | 
			
		||||
  GC <- randomGC(GENETIC_CODE)   # Choose code
 | 
			
		||||
  x <- traRev(myAA, GC)          # reverse translate
 | 
			
		||||
  x <- randMut(x)                # mutate
 | 
			
		||||
  x <- traFor(x, GC)             # translate
 | 
			
		||||
  valXGC[i] <- evalMut(myAA, x)  # evaluate
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
hist(valXGC,
 | 
			
		||||
     col = "plum",
 | 
			
		||||
     breaks = 15,
 | 
			
		||||
     add = TRUE)
 | 
			
		||||
 | 
			
		||||
# These two distributions are very widely separated!
 | 
			
		||||
 | 
			
		||||
# Task: Perform the same experiment with the swapped genetic code.
 | 
			
		||||
#       Compare the distributions. Interpret the result.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# These are simple experiments, under assumptions that can be refined in
 | 
			
		||||
# meaningful ways. Yet, even those simple computational experiments show
 | 
			
		||||
# that the Universal Genetic Code has features that one would predict if
 | 
			
		||||
# it has evolved under selective pressure to minimize the effects of mutations.
 | 
			
		||||
# Gradual change under mutation is benificial to evolution, disruptive
 | 
			
		||||
# change is not.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  Task solutions  ======================================================
 | 
			
		||||
 | 
			
		||||
set.seed(2718282)
 | 
			
		||||
N <- 200
 | 
			
		||||
valSGC <- numeric(N)
 | 
			
		||||
for (i in 1:N) {
 | 
			
		||||
  GC <- swappedGC(GENETIC_CODE)  # Choose code
 | 
			
		||||
  x <- traRev(myAA, GC)          # reverse translate
 | 
			
		||||
  x <- randMut(x)                # mutate
 | 
			
		||||
  x <- traFor(x, GC)             # translate
 | 
			
		||||
  valSGC[i] <- evalMut(myAA, x)  # evaluate
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
hist(valSGC,
 | 
			
		||||
     col = "#6688FF88",
 | 
			
		||||
     breaks = 15,
 | 
			
		||||
     add = TRUE)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
		Reference in New Issue
	
	Block a user