New AA colouring scheme - with better separation and transparent colours for overlapping plots. prcomp() analysis of amino acid similarity.

This commit is contained in:
hyginn 2020-09-25 21:06:24 +10:00
parent 069d8136e3
commit 85d53ae2ee
2 changed files with 61 additions and 35 deletions

View File

@ -71,30 +71,31 @@ AAVALID <- "acdefghiklmnpqrstvwyACDEFGHIKLMNPQRSTVWY*-"
NUCVALID <- "acgtuACGTU-"
NUCAMBIG <- "acgtACGTryswkmbdhvnRYSWKMBDHVN-"

# A colour palette for amino acid properties.
#
# AACOLS is a named character vector: names are upper-case one-letter amino
# acid codes, values are hex colours. Residues are grouped by property
# (positive, negative, hydrophilic, hydrophobic) plus three special cases
# (Gly, Cys, Pro). Index it by one-letter code, e.g. AACOLS[c("A", "R")].
AACOLS <- c(
  R = "#5770ff",  # Positive
  K = "#4785EE",  #
  H = "#37a1de",  #
  E = "#ff6f59",  # Negative
  D = "#ff7391",  #
  N = "#C9D4FF",  # Hydrophilic
  Q = "#CADFFC",  #
  S = "#CBEAF9",  #
  T = "#CDF5F7",  #
  Y = "#FBFFC9",  # Hydrophobic
  W = "#EDFDC8",  #
  F = "#DFFCC8",  #
  I = "#D2FBC8",  #
  L = "#C4FAC7",  #
  M = "#B7F9C7",  #
  V = "#A9F8C7",  #
  A = "#9CF7C7",  #
  G = "#d2d2d2",  # Glycine
  C = "#fff963",  # Cysteine
  P = "#edc06d"   # Proline
)

# Make the colours 50% transparent: append the alpha channel "80" (128 of 255)
# to every "#RRGGBB" value. Assigning into AACOLS[] replaces the values but
# keeps the names, so lookups by one-letter code continue to work.
AACOLS[] <- paste0(AACOLS, "80")

# Preview the palette:
# barplot(rep(1, 20), col = AACOLS)
# Endpoints of the eight-step hydrophobic ramp (Y ... A) - presumably how
# those eight values were generated:
# colorRampPalette(c("#fbffc9","#9cf7c7"))(8)
# 10 species of fungi for reference analysis.
# http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi

View File

@ -1,20 +1,15 @@
# tocID <- "BIN-ALI-Similarity.R"
#
# ---------------------------------------------------------------------------- #
# PATIENCE ... #
# Do not yet work with this code. Updates in progress. Thank you. #
# boris.steipe@utoronto.ca #
# ---------------------------------------------------------------------------- #
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-Similarity unit.
#
# Version: 1.1
# Version: 1.2
#
# Date: 2017 10 - 2019 01
# Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 2020 Updates
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 1.0 Refactored for 2017; add aaindex, ternary plot.
@ -22,6 +17,7 @@
#
#
# TODO:
# Update ggtern:: ternary plot to use aacol dots under text
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
@ -61,7 +57,7 @@ if (! requireNamespace("seqinr", quietly=TRUE)) {
# data:
?aaindex
# Load the aaindex list from the package. The package is named explicitly
# because seqinr is only loaded via requireNamespace() and is not attached;
# a bare data(aaindex) looks only in attached packages and would not find it.
data(aaindex, package = "seqinr")
length(aaindex)
@ -124,11 +120,26 @@ names(K$I) <- c("Ala","Arg","Asn","Asp","Cys","Gln","Glu","Gly","His","Ile",
# Given these biophysical indices, how similar are the amino acids? We have
# three dimensions of measures here. Scatterplots can only display two
# dimensions ...
# Pull the names from Y$I, convert them to single-letter code, and reorder the
# AACOLS palette accordingly, so that aac[i] is the colour for the i-th amino
# acid of Y$I.
# NOTE(review): assumes V$I and K$I list the amino acids in the same order as
# Y$I - confirm where the three indices are selected.
aac <- AACOLS[toupper(seqinr::a(names(Y$I)))]

# Hydrophobicity vs. volume. Each amino acid is drawn as a large disc in its
# palette colour, with its name printed on top.
plot(Y$I, V$I,
     xlab = "hydrophobicity", ylab = "volume",
     pch = 21,    # filled circle: "col" is the border, "bg" is the fill
     cex = 6,
     col = aac,
     bg = aac)
text(Y$I, V$I, names(Y$I), cex = 0.8)

# Hydrophobicity vs. pK, drawn the same way.
plot(Y$I, K$I,
     xlab = "hydrophobicity", ylab = "pK",
     pch = 21,
     cex = 6,
     col = aac,
     bg = aac)
text(Y$I, K$I, names(Y$I), cex = 0.8)
# ... but how do we plot 3D data? Plotting into a 3D cube is possible, but such
# plots are in general unintuitive and hard to interpret. One alternative is a
@ -160,6 +171,20 @@ ggtern::ggtern(data = myDat,
# This results in a mapping of amino acids relative to each other that is
# similar to the Venn diagram you have seen in the notes.
# ... or we could use principal components analysis, to pull out the
# best projection of the three feature dimensions into two. (Done here
# without delving into the theory ...)
prc <- prcomp(myDat)

# Plot the amino acids on the first two principal components (the first two
# columns of prc$x). Axes are suppressed, and each point is a large solid disc
# in the per-amino-acid colour from aac, labelled with its name in
# semi-transparent black.
# NOTE(review): assumes the rows of myDat are in the same order as Y$I, so
# that aac and names(Y$I) line up with the rows of prc$x - confirm where
# myDat is built.
plot(prc$x[, 1], prc$x[, 2],
     xlab = "", ylab = "", xaxt = "n", yaxt = "n",
     pch = 19, cex = 6, col = aac, cex.main = 0.7,
     main = "Principal Component Analysis of Amino Acid Features")
text(prc$x[, 1], prc$x[, 2], names(Y$I), cex = 0.8, col = "#00000088")
# This matches the intuition rather well in that "similar" amino acids are close
# on the plot. But we can't interpret the distances in terms of just one of the
# parameters. Whatever - nature has a different way to define similarity:
# mutations to similar amino acids are less likely to break the protein.
# = 2 Mutation Data matrix ================================================