bch441-work-abc-units/FND-Genetic_code.R

342 lines
12 KiB
R
Raw Normal View History

# tocID <- "FND-Genetic_code.R"
#
# ---------------------------------------------------------------------------- #
# PATIENCE ... #
# Do not yet work wih this code. Updates in progress. Thank you. #
# boris.steipe@utoronto.ca #
# ---------------------------------------------------------------------------- #
2017-09-28 13:19:59 +00:00
#
# Purpose: A Bioinformatics Course:
# R code accompanying the FND-Genetic_code unit.
#
# Version: 1.1
2017-09-28 13:19:59 +00:00
#
# Date: 2017 10 - 2019 01
2017-09-28 13:19:59 +00:00
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use Biocmanager:: not biocLite()
# 1.0.1 Comment on "incomplete final line" warning in FASTA
2017-09-28 13:19:59 +00:00
# 1.0 First live version
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
2017-09-28 13:19:59 +00:00
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ----------------------------------------------------------------
#TOC> 1 Storing the genetic code 45
#TOC> 1.1 Genetic code in Biostrings 63
#TOC> 2 Working with the genetic code 94
#TOC> 2.1 Translate a sequence. 129
#TOC> 3 An alternative representation: 3D array 212
#TOC> 3.1 Print a Genetic code table 246
#TOC> 4 Tasks 272
#TOC>
2017-09-28 13:19:59 +00:00
#TOC> ==========================================================================
# = 1 Storing the genetic code ============================================
# The genetic code maps trinucleotide codons to amino acids. To store it, we
# need some mechanism to associate these two informattion items. The most
2017-11-18 04:44:05 +00:00
# convenient way to do that is a "named vector" which holds the amino acid
2017-09-28 13:19:59 +00:00
# code and assigns the codons as names to its elements.
x <- c("M", "*")
names(x) <- c("ATG", "TAA")
x
# Then we can access the vector by the codon as name, and retrieve the
# amino acid.
x["ATG"]
x["TAA"]
# == 1.1 Genetic code in Biostrings ========================================
# Coveniently, the standard genetic code as well as its alternatives are
# available in the Bioconductor "Biostrings" package:
if (! requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
if (! requireNamespace("Biostrings", quietly = TRUE)) {
BiocManager::install("Biostrings")
2017-09-28 13:19:59 +00:00
}
# Package information:
# library(help = Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets
2017-09-28 13:19:59 +00:00
# The standard genetic code vector
Biostrings::GENETIC_CODE
2017-09-28 13:19:59 +00:00
# The table of genetic codes. This information corresponds to this page
# at the NCBI:
# https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes
Biostrings::GENETIC_CODE_TABLE
2017-09-28 13:19:59 +00:00
# Most of the alternative codes are mitochondrial codes. The id of the
# Alternative Yeast Nuclear code is "12"
Biostrings::getGeneticCode("12") # Alternative Yeast Nuclear
2017-09-28 13:19:59 +00:00
# = 2 Working with the genetic code =======================================
# We'll use Biostrings::GENETIC_CODE a lot in this script, so we'll assign it
# to a "local" variable, rather than retrieving it from the package all the
# time.
genCode <- Biostrings::GENETIC_CODE
2017-09-28 13:19:59 +00:00
# This is a named vector of characters ...
str(genCode)
2017-09-28 13:19:59 +00:00
# ... which also stores the alternative initiation codons TTG and CTG in
# an attribute of the vector. (Alternative initiation codons sometimes are
# used instead of ATG to intiate translation, if if not ATG they are translated
# with fMet.)
attr(genCode, "alt_init_codons")
2017-09-28 13:19:59 +00:00
# But the key to use this vector is in the "names" which we use for subsetting
# the list of amino acids in whatever way we need.
names(genCode)
2017-09-28 13:19:59 +00:00
# The translation of "TGG" ...
genCode["TGG"]
2017-09-28 13:19:59 +00:00
# All stop codons
names(genCode)[genCode == "*"]
2017-09-28 13:19:59 +00:00
# All start codons
names(genCode)[genCode == "M"] # ... or
c(names(genCode)[genCode == "M"],
attr(genCode, "alt_init_codons"))
2017-09-28 13:19:59 +00:00
# == 2.1 Translate a sequence. =============================================
# I have provided a gene sequence in the data directory:
# S288C_YDL056W_MBP1_coding.fsa is the yeast Mbp1 FASTA sequence.
# read it
mbp1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
# You will notice that this generates a Warning message:
# Warning message:
# In readLines("./data/S288C_YDL056W_MBP1_coding.fsa") :
# incomplete final line found on './data/S288C_YDL056W_MBP1_coding.fsa'
# The reason for this is that the last character of the file is the letter "A"
# and not a "\n" line break. This file is exactly how it was sent from the
# server; I think good, defensive programming practice would have been to
# include some kind of an end-marker in the file, like a final "\n". This helps
# us recognize an incomplete transmission. Let's parse the actual sequence from
# the file, and then check for completeness.
2017-09-28 13:19:59 +00:00
head(mbp1)
# drop the first line (header)
mbp1 <- mbp1[-1]
head(mbp1)
# concatenate it all to a single string
mbp1 <- paste(mbp1, sep = "", collapse = "")
# how long is it?
2017-09-28 13:19:59 +00:00
nchar(mbp1)
# how many codons?
nchar(mbp1)/3
# That looks correct for the 833 aa sequence plus 1 stop codon. This gives us a
# first verification that the file we read is complete, the nucleotides of a
# complete ORF should be divisible by 3.
2017-09-28 13:19:59 +00:00
# Extract the codons. There are many ways to split a long string into chunks
# of three characters. Here we use the Biostrings codons() function. codons()
2017-09-28 13:19:59 +00:00
# requires an object of type DNAstring - a special kind of string with
# attributes that are useful for Biostrings. Thus we convert the sequence first
# with DNAstring(), then split it up, then convert it into a plain
# character vector.
mbp1Codons <- as.character(Biostrings::codons(Biostrings::DNAString(mbp1)))
2017-09-28 13:19:59 +00:00
head(mbp1Codons)
# now translate each codon
mbp1AA <- character(834)
for (i in seq_along(mbp1Codons)) {
mbp1AA[i] <- genCode[mbp1Codons[i]]
2017-09-28 13:19:59 +00:00
}
head(mbp1Codons)
2017-09-28 13:19:59 +00:00
head(mbp1AA)
tail(mbp1Codons)
2017-09-28 13:19:59 +00:00
tail(mbp1AA) # Note the stop!
# The TAA "ochre" stop codon is our second verification that the nucleotide
# sequence is complete: a stop codon can't appear internally in an ORF.
# We can work with the mbp1AA vector, for example to tabulate the
# amino acid frequencies:
2017-09-28 13:19:59 +00:00
table(mbp1AA)
sort(table(mbp1AA), decreasing = TRUE)
# Or we can paste all elements together into a single string. But let's remove
# the stop, it's not actually a part of the sequence. To remove the last element
# of a vector, re-assign it with a vector minus the index of the last element:
mbp1AA <- mbp1AA[-(length(mbp1AA))]
tail(mbp1AA) # Note the stop is gone!
# paste it together, collapsing the elements using an empty string as the
# separation-character (i.e.: nothing)
2017-09-28 13:19:59 +00:00
(Mbp1 <- paste(mbp1AA, sep = "", collapse = ""))
# = 3 An alternative representation: 3D array =============================
# We don't use 3D arrays often - usually just 2D tables and data frames, so
# here is a good opportunity to review the syntax of 3D arrays with a
# genetic code cube:
2017-09-28 13:19:59 +00:00
# Initialize, using A G C T as the names of the elements in each dimension
2017-09-28 13:19:59 +00:00
cCube <- array(data = character(64),
dim = c(4, 4, 4),
dimnames = list(c("A", "G", "C", "T"),
c("A", "G", "C", "T"),
c("A", "G", "C", "T")))
2017-09-28 13:19:59 +00:00
# fill it with amino acid codes using three nested loops
for (i in 1:4) {
for (j in 1:4) {
for (k in 1:4) {
myCodon <- paste(dimnames(cCube)[[1]][i],
dimnames(cCube)[[2]][j],
dimnames(cCube)[[3]][k],
sep = "",
collapse = "")
cCube[i, j, k] <- genCode[myCodon]
2017-09-28 13:19:59 +00:00
}
}
}
# confirm
cCube["A", "T", "G"] # methionine
cCube["T", "T", "T"] # phenylalanine
cCube["T", "A", "G"] # stop (amber)
# == 3.1 Print a Genetic code table ========================================
# The data structure of our cCube is well suited to print a table. In the
# "standard" way to print the genetic code, we write codons with the same
# second nucleotide in columns, and arrange rows in blocks of same
# first nucleotide, varying the third nucleotide fastest. This maximizes the
# similarity of adjacent amino acids in the table if we print the
# nucleotides in the order T C A G. It's immidiately obvious that the code
# is not random: the universal genetic code is exceptionally error tolerant in
# the sense that mutations (or single-nucleotide translation errors) are likely
# to result in an amino acid with similar biophysical properties as the
# original.
nuc <- c("T", "C", "A", "G")
for (i in nuc) {
for (k in nuc) {
for (j in nuc) {
cat(sprintf("%s%s%s: %s ", i, j, k, cCube[i, j, k]))
}
cat("\n")
}
}
# = 4 Tasks ===============================================================
# Task: What do you need to change to print the table with U instead
# of T? Try it.
# Task: Point mutations are more often transitions (purine -> purine;
# pyrimidine -> pyrimidine) than transversions (purine -> pyrimidine;
# pyrimidine -> purine), even though twice as many transversions
# are possible in the code. This is most likely due a deamination /
# tautomerization process that favours C -> T changes. If the code
# indeed minimizes the effect of mutations, you would expect that
# codons that differ by a transition code for more similar amino acids
# than codons that differ by a transversion. Is that true? List the set
# of all amino acid pairs that are encoded by codons with a C -> T
# transition. Then list the set of amino acid pairs with a C -> A
# transversion. Which set of pairs is more similar?
# Task: How many stop codons do the two mbp1-gene derived amino acid sequences
# have if you translate them in the 2. or the 3. frame?
# Task: How does the amino acid composition change if you translate the mbp1
# gene with the Alternative Yeast Nuclear code that is used by the
# "GTC clade" of fungi?
# (cf. https://en.wikipedia.org/wiki/Alternative_yeast_nuclear_code )
# Solution:
# Fetch the code
Biostrings::GENETIC_CODE_TABLE
Biostrings::GENETIC_CODE_TABLE$name[Biostrings::GENETIC_CODE_TABLE$id=="12"]
altYcode <- Biostrings::getGeneticCode("12")
2017-09-28 13:19:59 +00:00
# what's the difference?
(delta <- which(Biostrings::GENETIC_CODE != altYcode))
2017-09-28 13:19:59 +00:00
Biostrings::GENETIC_CODE[delta]
altYcode[delta]
2017-09-28 13:19:59 +00:00
# translate
altYAA <- character(834)
for (i in seq_along(mbp1Codons)) {
altYAA[i] <- altYcode[mbp1Codons[i]]
}
2017-09-28 13:19:59 +00:00
table(mbp1AA)
table(altYAA)
2017-09-28 13:19:59 +00:00
# Task: The genetic code has significant redundacy, i.e. there are up to six
# codons that code for the same amino acid. Write code that lists how
# many amino acids are present how often i.e. it should tell you that
# two amino acids are encoded only with a single codon, three amino
# acids have six codons, etc. Solution below, but don't peek. There
# are many possible ways to do this.
#
#
# Solution:
table(table(Biostrings::GENETIC_CODE))
2017-09-28 13:19:59 +00:00
# [END]