Genetic code unit and data
This commit is contained in:
parent
131ec842e4
commit
8462b7d336
298
FND-Genetic_code.R
Normal file
298
FND-Genetic_code.R
Normal file
@ -0,0 +1,298 @@
|
|||||||
|
# FND-Genetic_code.R
|
||||||
|
#
|
||||||
|
# Purpose: A Bioinformatics Course:
|
||||||
|
# R code accompanying the FND-Genetic_code unit.
|
||||||
|
#
|
||||||
|
# Version: 1.0
|
||||||
|
#
|
||||||
|
# Date: 2017 09 28
|
||||||
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
|
#
|
||||||
|
# Versions:
|
||||||
|
# 1.0 First live version
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# TODO:
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||||
|
#
|
||||||
|
# If there are portions you don't understand, use R's help system, Google for an
|
||||||
|
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||||
|
# going on. That's not how it works ...
|
||||||
|
#
|
||||||
|
# ==============================================================================
|
||||||
|
|
||||||
|
#TOC> ==========================================================================
|
||||||
|
#TOC>
|
||||||
|
#TOC> Section Title Line
|
||||||
|
#TOC> ----------------------------------------------------------
|
||||||
|
#TOC> 1 Storing the genetic code 41
|
||||||
|
#TOC> 1.1 Genetic code in Biostrings 59
|
||||||
|
#TOC> 2 Working with the genetic code 86
|
||||||
|
#TOC> 2.1 Translate a sequence. 115
|
||||||
|
#TOC> 3 An alternative representation: 3D array 176
|
||||||
|
#TOC> 3.1 Print a Genetic code table 209
|
||||||
|
#TOC> 4 Tasks 235
|
||||||
|
#TOC>
|
||||||
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
|
# = 1 Storing the genetic code ============================================
|
||||||
|
|
||||||
|
# The genetic code maps trinucleotide codons to amino acids. To store it, we
|
||||||
|
# need some mechanism to associate these two informattion items. The most
|
||||||
|
# convenient way to do that is a "named vector" which holds the maino acid
|
||||||
|
# code and assigns the codons as names to its elements.
|
||||||
|
|
||||||
|
x <- c("M", "*")
|
||||||
|
names(x) <- c("ATG", "TAA")
|
||||||
|
x
|
||||||
|
|
||||||
|
# Then we can access the vector by the codon as name, and retrieve the
|
||||||
|
# amino acid.
|
||||||
|
|
||||||
|
x["ATG"]
|
||||||
|
x["TAA"]
|
||||||
|
|
||||||
|
|
||||||
|
# == 1.1 Genetic code in Biostrings ========================================
|
||||||
|
|
||||||
|
# Coveniently, the standard genetic code as well as its alternatives are
|
||||||
|
# available in the Bioconductor "Biostrings" package:
|
||||||
|
|
||||||
|
|
||||||
|
if (! require(Biostrings)) {
|
||||||
|
if (! exists("biocLite")) {
|
||||||
|
source("https://bioconductor.org/biocLite.R")
|
||||||
|
}
|
||||||
|
biocLite("Biostrings")
|
||||||
|
library(Biostrings)
|
||||||
|
}
|
||||||
|
|
||||||
|
# The standard genetic code vector
|
||||||
|
GENETIC_CODE
|
||||||
|
|
||||||
|
# The table of genetic codes. This information corresponds to this page
|
||||||
|
# at the NCBI:
|
||||||
|
# https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes
|
||||||
|
GENETIC_CODE_TABLE
|
||||||
|
|
||||||
|
# Most of the alternative codes are mitochondrial codes. The id of the
|
||||||
|
# Alternative Yeast Nuclear code is "12"
|
||||||
|
getGeneticCode("12") # Alternative Yeast Nuclear
|
||||||
|
|
||||||
|
|
||||||
|
# = 2 Working with the genetic code =======================================
|
||||||
|
|
||||||
|
# GENETIC_CODE is a "named vector"
|
||||||
|
|
||||||
|
str(GENETIC_CODE)
|
||||||
|
|
||||||
|
# ... which also stores the alternative initiation codons TTG and CTG in
|
||||||
|
# an attribute of the vector. (Alternative initiation codons sometimes are
|
||||||
|
# used instead of ATG to intiate translation, if if not ATG they are translated
|
||||||
|
# with fMet.)
|
||||||
|
|
||||||
|
attr(GENETIC_CODE, "alt_init_codons")
|
||||||
|
|
||||||
|
# But the key to use this vector is in the "names" which we use for subsetting
|
||||||
|
# the list of amino acids in whatever way we need.
|
||||||
|
names(GENETIC_CODE)
|
||||||
|
|
||||||
|
# The translation of "TGG" ...
|
||||||
|
GENETIC_CODE["TGG"]
|
||||||
|
|
||||||
|
# All stop codons
|
||||||
|
names(GENETIC_CODE)[GENETIC_CODE == "*"]
|
||||||
|
|
||||||
|
# All start codons
|
||||||
|
names(GENETIC_CODE)[GENETIC_CODE == "M"] # ... or
|
||||||
|
c(names(GENETIC_CODE)[GENETIC_CODE == "M"],
|
||||||
|
attr(GENETIC_CODE, "alt_init_codons"))
|
||||||
|
|
||||||
|
|
||||||
|
# == 2.1 Translate a sequence. =============================================
|
||||||
|
|
||||||
|
|
||||||
|
# I have provided a gene sequence in the data directory:
|
||||||
|
# S288C_YDL056W_MBP1_coding.fsa is the yeast Mbp1 FASTA sequence.
|
||||||
|
|
||||||
|
# read it
|
||||||
|
mbp1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
|
||||||
|
|
||||||
|
head(mbp1)
|
||||||
|
|
||||||
|
# drop the first line (header)
|
||||||
|
mbp1 <- mbp1[-1]
|
||||||
|
head(mbp1)
|
||||||
|
|
||||||
|
# concatenate it all to a single string
|
||||||
|
mbp1 <- paste(mbp1, sep = "", collapse = "")
|
||||||
|
|
||||||
|
# how long ist it?
|
||||||
|
nchar(mbp1)
|
||||||
|
|
||||||
|
# how many codons?
|
||||||
|
nchar(mbp1)/3
|
||||||
|
|
||||||
|
# That looks correct for the 833 aa sequence plus 1 stop codon.
|
||||||
|
|
||||||
|
# Extract the codons. There are many ways to split a long string into chunks
|
||||||
|
# of three characters. Here we use Biostrings codons() function. codons()
|
||||||
|
# requires an object of type DNAstring - a special kind of string with
|
||||||
|
# attributes that are useful for Biostrings. Thus we convert the sequence first
|
||||||
|
# with DNAstring(), then split it up, then convert it into a plain
|
||||||
|
# character vector.
|
||||||
|
mbp1Codons <- as.character(codons(DNAString(mbp1)))
|
||||||
|
|
||||||
|
head(mbp1Codons)
|
||||||
|
|
||||||
|
# now translate each codon
|
||||||
|
|
||||||
|
mbp1AA <- character(834)
|
||||||
|
for (i in seq_along(mbp1Codons)) {
|
||||||
|
mbp1AA[i] <- GENETIC_CODE[mbp1Codons[i]]
|
||||||
|
}
|
||||||
|
|
||||||
|
head(mbp1AA)
|
||||||
|
tail(mbp1AA) # Note the stop!
|
||||||
|
|
||||||
|
# We can work with this vector, for example if we want to tabulate the amino
|
||||||
|
# acid frequencies:
|
||||||
|
table(mbp1AA)
|
||||||
|
sort(table(mbp1AA), decreasing = TRUE)
|
||||||
|
|
||||||
|
# Or we can paste all elements together into a single string. But let's remove
|
||||||
|
# the stop, it's not actually a part of the sequence. To remove the last element
|
||||||
|
# of a vector, re-assign it with a vector minus the index of the last element:
|
||||||
|
mbp1AA <- mbp1AA[-(length(mbp1AA))]
|
||||||
|
tail(mbp1AA) # Note the stop is gone!
|
||||||
|
|
||||||
|
# paste it together, collapsing the elements without separation-character
|
||||||
|
(Mbp1 <- paste(mbp1AA, sep = "", collapse = ""))
|
||||||
|
|
||||||
|
|
||||||
|
# = 3 An alternative representation: 3D array =============================
|
||||||
|
|
||||||
|
|
||||||
|
# We don't use 3D arrays often - usually just 2D tables and data frames, so
|
||||||
|
# here is a good opportunity to review the syntax with a genetic code cube:
|
||||||
|
|
||||||
|
# Initialize, using A C G T as the names of the elements in each dimension
|
||||||
|
cCube <- array(data = character(64),
|
||||||
|
dim = c(4, 4, 4),
|
||||||
|
dimnames = list(c("A", "C", "G", "T"),
|
||||||
|
c("A", "C", "G", "T"),
|
||||||
|
c("A", "C", "G", "T")))
|
||||||
|
|
||||||
|
# fill it with amino acid codes using three nested loops
|
||||||
|
for (i in 1:4) {
|
||||||
|
for (j in 1:4) {
|
||||||
|
for (k in 1:4) {
|
||||||
|
myCodon <- paste(dimnames(cCube)[[1]][i],
|
||||||
|
dimnames(cCube)[[2]][j],
|
||||||
|
dimnames(cCube)[[3]][k],
|
||||||
|
sep = "",
|
||||||
|
collapse = "")
|
||||||
|
cCube[i, j, k] <- GENETIC_CODE[myCodon]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# confirm
|
||||||
|
cCube["A", "T", "G"] # methionine
|
||||||
|
cCube["T", "T", "T"] # phenylalanine
|
||||||
|
cCube["T", "A", "G"] # stop (amber)
|
||||||
|
|
||||||
|
|
||||||
|
# == 3.1 Print a Genetic code table ========================================
|
||||||
|
|
||||||
|
|
||||||
|
# The data structure of our cCube is well suited to print a table. In the
|
||||||
|
# "standard" way to print the genetic code, we write codons with the same
|
||||||
|
# second nucleotide in columns, and arrange rows in blocks of same
|
||||||
|
# first nucleotide, varying the third nucleotide fastest. This maximizes the
|
||||||
|
# similarity of adjacent amino acids in the table if we print the
|
||||||
|
# nucleotides in the order T C A G. It's immidiately obvious that the code
|
||||||
|
# is not random: the universal genetic code is exceptionally error tolerant in
|
||||||
|
# the sense that mutations (or single-nucleotide translation errors) are likely
|
||||||
|
# to result in an amino acid with similar biophysical properties as the
|
||||||
|
# original.
|
||||||
|
|
||||||
|
nuc <- c("T", "C", "A", "G")
|
||||||
|
|
||||||
|
for (i in nuc) {
|
||||||
|
for (k in nuc) {
|
||||||
|
for (j in nuc) {
|
||||||
|
cat(sprintf("%s%s%s: %s ", i, j, k, cCube[i, j, k]))
|
||||||
|
}
|
||||||
|
cat("\n")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# = 4 Tasks ===============================================================
|
||||||
|
|
||||||
|
|
||||||
|
# Task: What do you need to change to print the table with U instead
|
||||||
|
# of T? Try it.
|
||||||
|
|
||||||
|
|
||||||
|
# Task: Point mutations are more often transitions (purine -> purine;
|
||||||
|
# pyrimidine -> pyrimidine) than transversions (purine -> pyrimidine;
|
||||||
|
# pyrimidine -> purine), even though twice as many transversions
|
||||||
|
# are possible in the code. This is most likely due a deamination /
|
||||||
|
# tautomerization process that favours C -> T changes. If the code
|
||||||
|
# indeed minimizes the effect of mutations, you would expect that
|
||||||
|
# codons that differ by a transition code for more similar amino acids
|
||||||
|
# than codons that differ by a transversion. Is that true? List the set
|
||||||
|
# of all amino acid pairs that are encoded by codons with a C -> T
|
||||||
|
# transition. Then list the set of amino acid pairs with a C -> A
|
||||||
|
# transversion. Which set of pairs is more similar?
|
||||||
|
|
||||||
|
|
||||||
|
# Task: How many stop codons do the two mbp1-gene derived amino acid sequences
|
||||||
|
# have if you translate them in the 2. or the 3. frame?
|
||||||
|
|
||||||
|
|
||||||
|
# Task: How does the amino acid composition change if you translate the mbp1
|
||||||
|
# gene with the Alternative Yeast Nuclear code that is used by the
|
||||||
|
# "GTC clade" of fungi?
|
||||||
|
# (cf. https://en.wikipedia.org/wiki/Alternative_yeast_nuclear_code )
|
||||||
|
|
||||||
|
# Solution:
|
||||||
|
|
||||||
|
# Fetch the code
|
||||||
|
GENETIC_CODE_TABLE
|
||||||
|
GENETIC_CODE_TABLE$name[GENETIC_CODE_TABLE$id == "12"]
|
||||||
|
altYcode <- getGeneticCode("12")
|
||||||
|
|
||||||
|
# what's the difference?
|
||||||
|
(delta <- which(GENETIC_CODE != altYcode))
|
||||||
|
|
||||||
|
GENETIC_CODE[delta]
|
||||||
|
altYcode[delta]
|
||||||
|
|
||||||
|
# translate
|
||||||
|
altYAA <- character(834)
|
||||||
|
for (i in seq_along(mbp1Codons)) {
|
||||||
|
altYAA[i] <- altYcode[mbp1Codons[i]]
|
||||||
|
}
|
||||||
|
|
||||||
|
table(mbp1AA)
|
||||||
|
table(altYAA)
|
||||||
|
|
||||||
|
# Task: The genetic code has significant redundacy, i.e. there are up to six
|
||||||
|
# codons that code for the same amino acid. Write code that lists how
|
||||||
|
# many amino acids are present how often i.e. it should tell you that
|
||||||
|
# two amino acids are encoded only with a single codon, three amino
|
||||||
|
# acids have six codons, etc. Solution below, but don't peek. There
|
||||||
|
# are many possible ways to do this.
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# Solution:
|
||||||
|
table(table(GENETIC_CODE))
|
||||||
|
|
||||||
|
|
||||||
|
# [END]
|
43
data/S288C_YDL056W_MBP1_coding.fsa
Normal file
43
data/S288C_YDL056W_MBP1_coding.fsa
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
>MBP1 YDL056W SGDID:S000002214
|
||||||
|
ATGTCTAACCAAATATACTCAGCGAGATATTCGGGGGTTGATGTTTATGAATTCATTCAT
|
||||||
|
TCTACAGGATCTATCATGAAAAGGAAAAAGGATGATTGGGTCAATGCTACACATATTTTA
|
||||||
|
AAGGCCGCCAATTTTGCCAAGGCTAAAAGAACAAGGATTCTAGAGAAGGAAGTACTTAAG
|
||||||
|
GAAACTCATGAAAAAGTTCAGGGTGGATTTGGTAAATATCAGGGTACATGGGTCCCACTG
|
||||||
|
AACATAGCGAAACAACTGGCAGAAAAATTTAGTGTCTACGATCAGCTGAAACCGTTGTTC
|
||||||
|
GACTTTACGCAAACAGATGGGTCTGCTTCTCCACCTCCTGCTCCAAAACATCACCATGCC
|
||||||
|
TCGAAGGTGGATAGGAAAAAGGCTATTAGAAGTGCAAGTACTTCCGCAATTATGGAAACA
|
||||||
|
AAAAGAAACAACAAGAAAGCCGAGGAAAATCAATTTCAAAGCAGCAAAATATTGGGAAAT
|
||||||
|
CCCACGGCTGCACCAAGGAAAAGAGGTAGACCGGTAGGATCTACGAGGGGAAGTAGGCGG
|
||||||
|
AAGTTAGGTGTCAATTTACAACGTTCTCAAAGTGATATGGGATTTCCTAGACCGGCGATA
|
||||||
|
CCGAATTCTTCAATATCGACAACGCAACTTCCCTCTATTAGATCCACCATGGGACCACAA
|
||||||
|
TCCCCTACATTGGGTATTCTGGAAGAAGAAAGGCACGATTCTCGACAGCAGCAGCCGCAA
|
||||||
|
CAAAATAATTCTGCACAGTTCAAAGAAATTGATCTTGAGGACGGCTTATCAAGCGATGTG
|
||||||
|
GAACCTTCACAACAATTACAACAAGTTTTTAATCAAAATACTGGATTTGTACCCCAACAA
|
||||||
|
CAATCTTCCTTGATACAGACACAGCAAACAGAATCAATGGCCACGTCCGTATCTTCCTCT
|
||||||
|
CCTTCATTACCTACGTCACCGGGCGATTTTGCCGATAGTAATCCATTTGAAGAGCGATTT
|
||||||
|
CCCGGTGGTGGAACATCTCCTATTATTTCCATGATCCCGCGTTATCCTGTAACTTCAAGG
|
||||||
|
CCTCAAACATCGGATATTAATGATAAAGTTAACAAATACCTTTCAAAATTGGTTGATTAT
|
||||||
|
TTTATTTCCAATGAAATGAAGTCAAATAAGTCCCTACCACAAGTGTTATTGCACCCACCT
|
||||||
|
CCACACAGCGCTCCCTATATAGATGCTCCAATCGATCCAGAATTACATACTGCCTTCCAT
|
||||||
|
TGGGCTTGTTCTATGGGTAATTTACCAATTGCTGAGGCGTTGTACGAAGCCGGAACAAGT
|
||||||
|
ATCAGATCGACAAATTCTCAAGGCCAAACTCCATTGATGAGAAGTTCCTTATTCCACAAT
|
||||||
|
TCATACACTAGAAGAACTTTCCCTAGAATTTTCCAGCTACTGCACGAGACCGTATTTGAT
|
||||||
|
ATCGATTCGCAATCACAAACAGTAATTCACCATATTGTGAAACGAAAATCAACAACACCT
|
||||||
|
TCTGCAGTTTATTATCTTGATGTTGTGCTATCTAAGATCAAGGATTTTTCCCCACAGTAT
|
||||||
|
AGAATTGAATTACTTTTAAACACACAAGACAAAAATGGCGATACCGCACTTCATATTGCT
|
||||||
|
TCTAAAAATGGAGATGTTGTTTTTTTTAATACACTGGTCAAAATGGGTGCATTAACTACT
|
||||||
|
ATTTCCAATAAGGAAGGATTAACCGCCAATGAAATAATGAATCAACAATATGAGCAAATG
|
||||||
|
ATGATACAAAATGGTACAAATCAACATGTCAATTCTTCAAACACGGACTTGAATATCCAC
|
||||||
|
GTTAATACAAACAACATTGAAACGAAAAATGATGTTAATTCAATGGTAATCATGTCGCCT
|
||||||
|
GTTTCTCCTTCGGATTACATAACCTATCCATCTCAAATTGCCACCAATATATCAAGAAAT
|
||||||
|
ATTCCAAATGTAGTGAATTCTATGAAGCAAATGGCTAGCATATACAACGATCTTCATGAA
|
||||||
|
CAGCATGACAACGAAATAAAAAGTTTGCAAAAAACTTTAAAAAGCATTTCTAAGACGAAA
|
||||||
|
ATACAGGTAAGCCTAAAAACTTTAGAGGTATTGAAAGAGAGCAGTAAAGATGAAAACGGC
|
||||||
|
GAAGCTCAGACTAATGATGACTTCGAAATTTTATCTCGTCTACAAGAACAAAATACTAAG
|
||||||
|
AAATTGAGAAAAAGGCTCATACGATACAAACGGTTGATAAAACAAAAGCTGGAATACAGG
|
||||||
|
CAAACGGTTTTATTGAACAAATTAATAGAAGATGAAACTCAGGCTACCACCAATAACACA
|
||||||
|
GTTGAGAAAGATAATAATACGCTGGAAAGGTTGGAATTGGCTCAAGAACTAACGATGTTG
|
||||||
|
CAATTACAAAGGAAAAACAAATTGAGTTCCTTGGTGAAGAAATTTGAAGACAATGCCAAG
|
||||||
|
ATTCATAAATATAGACGGATTATCAGGGAAGGTACGGAAATGAATATTGAAGAAGTAGAT
|
||||||
|
AGTTCGCTGGATGTAATACTACAGACATTGATAGCCAACAATAATAAAAATAAGGGCGCA
|
||||||
|
GAACAGATCATCACAATCTCAAACGCGAATAGTCATGCATAA
|
Loading…
Reference in New Issue
Block a user