Genetic code unit and data

2017-09-28 09:19:59 -04:00
parent 131ec842e4
commit 8462b7d336
2 changed files with 341 additions and 0 deletions
--- a/FND-Genetic_code.R
+++ b/FND-Genetic_code.R
@@ -0,0 +1,298 @@
 # FND-Genetic_code.R
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the FND-Genetic_code unit.
 #
 # Version:  1.0
 #
 # Date:     2017  09  28
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
 #           1.0    First live version
 #
 #
 # TODO:
 #
 #
 # == DO NOT SIMPLY  source()  THIS FILE! =======================================
 #
 # If there are portions you don't understand, use R's help system, Google for an
 # answer, or ask your instructor. Don't continue if you don't understand what's
 # going on. That's not how it works ...
 #
 # ==============================================================================
 #TOC> ==========================================================================
 #TOC>
 #TOC>   Section  Title                                      Line
 #TOC> ----------------------------------------------------------
 #TOC>   1        Storing the genetic code                     41
 #TOC>   1.1      Genetic code in Biostrings                   59
 #TOC>   2        Working with the genetic code                86
 #TOC>   2.1      Translate a sequence.                       115
 #TOC>   3        An alternative representation: 3D array     176
 #TOC>   3.1      Print a Genetic code table                  209
 #TOC>   4        Tasks                                       235
 #TOC>
 #TOC> ==========================================================================
 # =    1  Storing the genetic code  ============================================
 # The genetic code maps trinucleotide codons to amino acids. To store it, we
 # need some mechanism to associate these two information items. The most
 # convenient way to do that is a "named vector" which holds the amino acid
 # code and assigns the codons as names to its elements.
 # Build a small example in one step: the values are the amino acid codes and
 # the names are the codons that encode them.
 x <- setNames(c("M", "*"), c("ATG", "TAA"))
 x
 # Subsetting by name looks up a codon and retrieves its amino acid (the
 # result keeps the codon as its name).
 x["ATG"]
 x["TAA"]
 # ==   1.1  Genetic code in Biostrings  ========================================
 # Conveniently, the standard genetic code as well as its alternatives are
 # available in the Bioconductor "Biostrings" package:
 # Install Biostrings on demand, then attach it.
 # biocLite() and the biocLite.R installer script have been retired by
 # Bioconductor; BiocManager::install() is the supported installer.
 # requireNamespace() only tests whether a package is available without
 # attaching it. library() then stops with an error if Biostrings still can't
 # be loaded, whereas require() would merely return FALSE and let the script
 # fail later at the first use of GENETIC_CODE.
 if (! requireNamespace("Biostrings", quietly = TRUE)) {
  if (! requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
  }
  BiocManager::install("Biostrings")
 }
 library(Biostrings)
 # The standard genetic code vector. (Typing the name of an object at the
 # console prints its contents.)
 GENETIC_CODE
 # The table of genetic codes. This information corresponds to this page
 # at the NCBI:
 # https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes
 GENETIC_CODE_TABLE
 # Most of the alternative codes are mitochondrial codes. The id of the
 # Alternative Yeast Nuclear code is "12". Note that the id is passed to
 # getGeneticCode() as a character string, not as a number.
 getGeneticCode("12")  # Alternative Yeast Nuclear
 # =    2  Working with the genetic code  =======================================
 # GENETIC_CODE is a "named vector" - inspect its structure:
 str(GENETIC_CODE)
 # ... it also stores the alternative initiation codons TTG and CTG in
 # an attribute of the vector. (Alternative initiation codons are sometimes
 # used instead of ATG to initiate translation; even when the codon is not ATG,
 # it is translated with fMet.)
 attr(GENETIC_CODE, which = "alt_init_codons")
 # The key to using this vector lies in its "names": we use them to subset
 # the amino acids in whatever way we need.
 names(GENETIC_CODE)
 # The translation of "TGG" ...
 GENETIC_CODE["TGG"]
 # All stop codons: which() keeps the names of the elements it selects
 names(which(GENETIC_CODE == "*"))
 # All start codons
 names(which(GENETIC_CODE == "M")) # ... or
 c(names(which(GENETIC_CODE == "M")),
  attr(GENETIC_CODE, which = "alt_init_codons"))
 # ==   2.1  Translate a sequence.  =============================================
 # I have provided a gene sequence in the data directory:
 # S288C_YDL056W_MBP1_coding.fsa is the yeast Mbp1 FASTA sequence.
 # Read the file: readLines() returns one vector element per line
 mbp1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
 head(mbp1)
 # The first line is the FASTA header: keep everything but that line
 mbp1 <- tail(mbp1, -1)
 head(mbp1)
 # Collapse the remaining lines into one single string
 mbp1 <- paste0(mbp1, collapse = "")
 # How long is it?
 nchar(mbp1)
 # How many codons is that?
 nchar(mbp1) / 3
 # That looks correct for the 833 aa sequence plus 1 stop codon.
 # Extract the codons. There are many ways to split a long string into chunks
 # of three characters. Here we use the Biostrings codons() function. codons()
 # requires an object of type DNAString - a special kind of string with
 # attributes that are useful for Biostrings. Thus we convert the sequence first
 # with DNAString(), then split it up, then convert it into a plain
 # character vector.
 # Split the sequence into codons and convert them to a plain character vector
 mbp1Codons <- as.character(codons(DNAString(mbp1)))
 head(mbp1Codons)
 # Now translate each codon. Subsetting a named vector is vectorized: indexing
 # GENETIC_CODE with the whole vector of codon names looks up all amino acids
 # at once. This works for a sequence of any length - no need to hard-code the
 # number of codons or to loop. unname() drops the codon names, so that mbp1AA
 # is a plain character vector of one-letter amino acid codes.
 mbp1AA <- unname(GENETIC_CODE[mbp1Codons])
 head(mbp1AA)
 tail(mbp1AA) # Note the stop!
 # The translated vector is easy to analyze, for example to tabulate the
 # amino acid frequencies:
 table(mbp1AA)
 sort(table(mbp1AA), decreasing = TRUE)
 # We can also paste all elements together into a single string. But first we
 # remove the stop: it's not actually a part of the sequence. head() with a
 # negative n returns everything except the last n elements, so re-assigning
 # head(x, -1) drops the last element of a vector:
 mbp1AA <- head(mbp1AA, -1)
 tail(mbp1AA) # Note the stop is gone!
 # Collapse the elements into one string without a separation-character. The
 # enclosing parentheses print the result of the assignment.
 (Mbp1 <- paste0(mbp1AA, collapse = ""))
 # =    3  An alternative representation: 3D array  =============================
 # We don't use 3D arrays often - usually just 2D tables and data frames, so
 # here is a good opportunity to review the syntax with a genetic code cube:
 # Set up an empty 4 x 4 x 4 character array. All three dimensions get the
 # same element names: A C G T
 cCube <- array(data     = character(64),
                dim      = c(4, 4, 4),
                dimnames = rep(list(c("A", "C", "G", "T")), 3))
 # Fill it with amino acid codes: three nested loops iterate over the
 # nucleotide names of the first, second, and third codon position. The
 # names are used both to index the cube and to build the codon string.
 for (n1 in dimnames(cCube)[[1]]) {
  for (n2 in dimnames(cCube)[[2]]) {
    for (n3 in dimnames(cCube)[[3]]) {
      cCube[n1, n2, n3] <- GENETIC_CODE[paste0(n1, n2, n3)]
    }
  }
 }
 # Spot-check a few well-known codons
 cCube["A", "T", "G"] # methionine
 cCube["T", "T", "T"] # phenylalanine
 cCube["T", "A", "G"] # stop (amber)
 # ==   3.1  Print a Genetic code table  ========================================
 # The data structure of our cCube is well suited to print a table. In the
 # "standard" way to print the genetic code, we write codons with the same
 # second nucleotide in columns, and arrange rows in blocks of same
 # first nucleotide, varying the third nucleotide fastest. This maximizes the
 # similarity of adjacent amino acids in the table if we print the
 # nucleotides in the order T C A G. It's immediately obvious that the code
 # is not random: the universal genetic code is exceptionally error tolerant in
 # the sense that mutations (or single-nucleotide translation errors) are likely
 # to result in an amino acid with similar biophysical properties as the
 # original.
 # Order of nucleotides in the printed table
 nuc <- c("T", "C", "A", "G")
 # Each printed row has a fixed first and third nucleotide. The second
 # nucleotide varies across the four columns, so one row can be formatted with
 # a single vectorized sprintf() call and written with a single cat().
 for (first in nuc) {
  for (third in nuc) {
    cells <- sprintf("%s%s%s: %s   ",
                     first, nuc, third,
                     cCube[first, nuc, third])
    cat(cells, "\n", sep = "")
  }
 }
 # =    4  Tasks  ===============================================================
 # Task: What do you need to change to print the table with U instead
 #         of T? Try it.
 # Task: Point mutations are more often transitions (purine -> purine;
 #         pyrimidine -> pyrimidine) than transversions (purine -> pyrimidine;
 #         pyrimidine -> purine), even though twice as many transversions
 #         are possible in the code. This is most likely due to a deamination /
 #         tautomerization process that favours C -> T changes. If the code
 #         indeed minimizes the effect of mutations, you would expect that
 #         codons that differ by a transition code for more similar amino acids
 #         than codons that differ by a transversion. Is that true? List the set
 #         of all amino acid pairs that are encoded by codons with a C -> T
 #         transition. Then list the set of amino acid pairs with a C -> A
 #         transversion. Which set of pairs is more similar?
 # Task: How many stop codons do the mbp1-gene derived amino acid sequences
 #         have if you translate the gene in the 2nd or the 3rd frame?
 # Task: How does the amino acid composition change if you translate the mbp1
 #         gene with the Alternative Yeast Nuclear code that is used by the
 #         "GTC clade" of fungi?
 #         (cf. https://en.wikipedia.org/wiki/Alternative_yeast_nuclear_code )
 # Solution:
          # Fetch the code: look up its name in the table of genetic codes,
          # then retrieve the code vector by its id
          GENETIC_CODE_TABLE
          GENETIC_CODE_TABLE$name[GENETIC_CODE_TABLE$id == "12"]
          altYcode <- getGeneticCode("12")
          # what's the difference? delta holds the positions at which the
          # two code vectors disagree
          (delta <- which(GENETIC_CODE != altYcode))
          GENETIC_CODE[delta]
          altYcode[delta]
          # translate: index the alternative code with all codons at once.
          # This is vectorized and does not depend on the sequence length.
          altYAA <- unname(altYcode[mbp1Codons])
          # The stop was removed from mbp1AA above; remove the last element
          # here too, so that both tables describe the same 833 positions.
          altYAA <- altYAA[-length(altYAA)]
          table(mbp1AA)
          table(altYAA)
 # Task: The genetic code has significant redundancy, i.e. there are up to six
 #         codons that code for the same amino acid. Write code that lists how
 #         many amino acids are encoded by how many codons, i.e. it should tell
 #         you that two amino acids are encoded only with a single codon, three
 #         amino acids have six codons, etc. Solution below, but don't peek.
 #         There are many possible ways to do this.
 #
 #
 # Solution:
 # The inner table() counts how many codons map to each distinct value of
 # GENETIC_CODE; the outer table() then counts how often each of those codon
 # counts occurs. Note that "*" (stop) is a value of the vector too, so it is
 # tabulated along with the amino acids.
 table(table(GENETIC_CODE))
 # [END]
--- a/data/S288C_YDL056W_MBP1_coding.fsa
+++ b/data/S288C_YDL056W_MBP1_coding.fsa
@@ -0,0 +1,43 @@
 >MBP1 YDL056W SGDID:S000002214
 ATGTCTAACCAAATATACTCAGCGAGATATTCGGGGGTTGATGTTTATGAATTCATTCAT
 TCTACAGGATCTATCATGAAAAGGAAAAAGGATGATTGGGTCAATGCTACACATATTTTA
 AAGGCCGCCAATTTTGCCAAGGCTAAAAGAACAAGGATTCTAGAGAAGGAAGTACTTAAG
 GAAACTCATGAAAAAGTTCAGGGTGGATTTGGTAAATATCAGGGTACATGGGTCCCACTG
 AACATAGCGAAACAACTGGCAGAAAAATTTAGTGTCTACGATCAGCTGAAACCGTTGTTC
 GACTTTACGCAAACAGATGGGTCTGCTTCTCCACCTCCTGCTCCAAAACATCACCATGCC
 TCGAAGGTGGATAGGAAAAAGGCTATTAGAAGTGCAAGTACTTCCGCAATTATGGAAACA
 AAAAGAAACAACAAGAAAGCCGAGGAAAATCAATTTCAAAGCAGCAAAATATTGGGAAAT
 CCCACGGCTGCACCAAGGAAAAGAGGTAGACCGGTAGGATCTACGAGGGGAAGTAGGCGG
 AAGTTAGGTGTCAATTTACAACGTTCTCAAAGTGATATGGGATTTCCTAGACCGGCGATA
 CCGAATTCTTCAATATCGACAACGCAACTTCCCTCTATTAGATCCACCATGGGACCACAA
 TCCCCTACATTGGGTATTCTGGAAGAAGAAAGGCACGATTCTCGACAGCAGCAGCCGCAA
 CAAAATAATTCTGCACAGTTCAAAGAAATTGATCTTGAGGACGGCTTATCAAGCGATGTG
 GAACCTTCACAACAATTACAACAAGTTTTTAATCAAAATACTGGATTTGTACCCCAACAA
 CAATCTTCCTTGATACAGACACAGCAAACAGAATCAATGGCCACGTCCGTATCTTCCTCT
 CCTTCATTACCTACGTCACCGGGCGATTTTGCCGATAGTAATCCATTTGAAGAGCGATTT
 CCCGGTGGTGGAACATCTCCTATTATTTCCATGATCCCGCGTTATCCTGTAACTTCAAGG
 CCTCAAACATCGGATATTAATGATAAAGTTAACAAATACCTTTCAAAATTGGTTGATTAT
 TTTATTTCCAATGAAATGAAGTCAAATAAGTCCCTACCACAAGTGTTATTGCACCCACCT
 CCACACAGCGCTCCCTATATAGATGCTCCAATCGATCCAGAATTACATACTGCCTTCCAT
 TGGGCTTGTTCTATGGGTAATTTACCAATTGCTGAGGCGTTGTACGAAGCCGGAACAAGT
 ATCAGATCGACAAATTCTCAAGGCCAAACTCCATTGATGAGAAGTTCCTTATTCCACAAT
 TCATACACTAGAAGAACTTTCCCTAGAATTTTCCAGCTACTGCACGAGACCGTATTTGAT
 ATCGATTCGCAATCACAAACAGTAATTCACCATATTGTGAAACGAAAATCAACAACACCT
 TCTGCAGTTTATTATCTTGATGTTGTGCTATCTAAGATCAAGGATTTTTCCCCACAGTAT
 AGAATTGAATTACTTTTAAACACACAAGACAAAAATGGCGATACCGCACTTCATATTGCT
 TCTAAAAATGGAGATGTTGTTTTTTTTAATACACTGGTCAAAATGGGTGCATTAACTACT
 ATTTCCAATAAGGAAGGATTAACCGCCAATGAAATAATGAATCAACAATATGAGCAAATG
 ATGATACAAAATGGTACAAATCAACATGTCAATTCTTCAAACACGGACTTGAATATCCAC
 GTTAATACAAACAACATTGAAACGAAAAATGATGTTAATTCAATGGTAATCATGTCGCCT
 GTTTCTCCTTCGGATTACATAACCTATCCATCTCAAATTGCCACCAATATATCAAGAAAT
 ATTCCAAATGTAGTGAATTCTATGAAGCAAATGGCTAGCATATACAACGATCTTCATGAA
 CAGCATGACAACGAAATAAAAAGTTTGCAAAAAACTTTAAAAAGCATTTCTAAGACGAAA
 ATACAGGTAAGCCTAAAAACTTTAGAGGTATTGAAAGAGAGCAGTAAAGATGAAAACGGC
 GAAGCTCAGACTAATGATGACTTCGAAATTTTATCTCGTCTACAAGAACAAAATACTAAG
 AAATTGAGAAAAAGGCTCATACGATACAAACGGTTGATAAAACAAAAGCTGGAATACAGG
 CAAACGGTTTTATTGAACAAATTAATAGAAGATGAAACTCAGGCTACCACCAATAACACA
 GTTGAGAAAGATAATAATACGCTGGAAAGGTTGGAATTGGCTCAAGAACTAACGATGTTG
 CAATTACAAAGGAAAAACAAATTGAGTTCCTTGGTGAAGAAATTTGAAGACAATGCCAAG
 ATTCATAAATATAGACGGATTATCAGGGAAGGTACGGAAATGAATATTGAAGAAGTAGAT
 AGTTCGCTGGATGTAATACTACAGACATTGATAGCCAACAATAATAAAAATAAGGGCGCA
 GAACAGATCATCACAATCTCAAACGCGAATAGTCATGCATAA