New AA colouring scheme - with better separation and transparent colours for overlapping plots. prcomp() analysis of amino acid similarity.

2020-09-25 21:06:24 +10:00
parent 069d8136e3
commit 85d53ae2ee
2 changed files with 61 additions and 35 deletions
--- a/.utilities.R
+++ b/.utilities.R
@@ -71,30 +71,31 @@ AAVALID  <- "acdefghiklmnpqrstvwyACDEFGHIKLMNPQRSTVWY*-"
 NUCVALID <- "acgtuACGTU-"
 NUCAMBIG <- "acgtACGTryswkmbdhvnRYSWKMBDHVN-"
-# A colorpallette for amino acid properties
+# A colour palette for amino acid properties
 AACOLS <- character()
-AACOLS["R"] <- "#577EFF" # Positive
+AACOLS["R"] <- "#5770ff" # Positive
-AACOLS["K"] <- "#479EEE" #
+AACOLS["K"] <- "#4785EE" #
-AACOLS["H"] <- "#37BFDE" #
+AACOLS["H"] <- "#37a1de" #
-AACOLS["E"] <- "#ffa587" # Negative
+AACOLS["E"] <- "#ff6f59" # Negative
-AACOLS["D"] <- "#ff87ad" #
+AACOLS["D"] <- "#ff7391" #
-AACOLS["N"] <- "#9FC6FC" # Hydrophilic
+AACOLS["N"] <- "#C9D4FF" # Hydrophilic
-AACOLS["Q"] <- "#A7CFF5" #
+AACOLS["Q"] <- "#CADFFC" #
-AACOLS["S"] <- "#AFD8EE" #
+AACOLS["S"] <- "#CBEAF9" #
-AACOLS["T"] <- "#B7E2E8" #
+AACOLS["T"] <- "#CDF5F7" #
-AACOLS["Y"] <- "#F5FFD9" # Hydrophobic
+AACOLS["Y"] <- "#FBFFC9" # Hydrophobic
-AACOLS["W"] <- "#F1FFDB" #
+AACOLS["W"] <- "#EDFDC8" #
-AACOLS["F"] <- "#EDFFDD" #
+AACOLS["F"] <- "#DFFCC8" #
-AACOLS["I"] <- "#E9FFDF" #
+AACOLS["I"] <- "#D2FBC8" #
-AACOLS["L"] <- "#E5FFE2" #
+AACOLS["L"] <- "#C4FAC7" #
-AACOLS["M"] <- "#E1FFE4" #
+AACOLS["M"] <- "#B7F9C7" #
-AACOLS["V"] <- "#DDFFE6" #
+AACOLS["V"] <- "#A9F8C7" #
-AACOLS["A"] <- "#D9FFE9" #
+AACOLS["A"] <- "#9CF7C7" #
-AACOLS["G"] <- "#e0e0e0" # Glycine
+AACOLS["G"] <- "#d2d2d2" # Glycine
-AACOLS["C"] <- "#fffb91" # Cysteine
+AACOLS["C"] <- "#fff963" # Cysteine
-AACOLS["P"] <- "#e8f7e1" # Proline
+AACOLS["P"] <- "#edc06d" # Proline
 AACOLS <- gsub("$", "80", AACOLS)  # Make the colors 50% transparent
 # barplot(rep(1, 20), col = AACOLS)
-
+# colorRampPalette(c("#fbffc9","#9cf7c7"))(8)
 # 10 species of fungi for reference analysis.
 # http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
--- a/BIN-ALI-Similarity.R
+++ b/BIN-ALI-Similarity.R
@@ -1,20 +1,15 @@
 # tocID <- "BIN-ALI-Similarity.R"
 #
 # ---------------------------------------------------------------------------- #
 #  PATIENCE  ...                                                               #
 #    Do not yet work with this code. Updates in progress. Thank you.           #
 #    boris.steipe@utoronto.ca                                                  #
 # ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-ALI-Similarity unit.
 #
-# Version:  1.1
+# Version:  1.2
 #
-# Date:     2017  10  -  2019  01
+# Date:     2017-10  -  2020-09
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
 #           1.2    2020 Updates
 #           1.1    Change from require() to requireNamespace(),
 #                      use <package>::<function>() idiom throughout
 #           1.0    Refactored for 2017; add aaindex, ternary plot.
@@ -22,6 +17,7 @@
 #
 #
 # TODO:
 #   Update ggtern:: ternary plot to use aacol dots under text
 #
 #
 # == DO NOT SIMPLY  source()  THIS FILE! =======================================
@@ -61,7 +57,7 @@ if (! requireNamespace("seqinr", quietly=TRUE)) {
 #  data:
 # Open the help page for the aaindex data set. The package is named
 # explicitly because seqinr is only loaded via requireNamespace() and is
 # not attached, so a bare ?aaindex would not find the topic.
 ?seqinr::aaindex
-data(aaindex)  # load the aaindex list from the package
+data(aaindex, package = "seqinr")  # load the aaindex list from the package
 length(aaindex)
@@ -124,11 +120,26 @@ names(K$I) <- c("Ala","Arg","Asn","Asp","Cys","Gln","Glu","Gly","His","Ile",
 # Given these biophysical indices, how similar are the amino acids? We have
 # three dimensions of measures here. Scatterplots can only display two
 # dimensions ...
 plot(Y$I, V$I, col="white", xlab = "hydrophobicity", ylab = "volume")
 text(Y$I, V$I, names(Y$I))
-plot(Y$I, K$I, col="white", xlab = "hydrophobicity", ylab = "pK")
+# pull the names from Y$I, convert them to single letter code, and reorder the
-text(Y$I, K$I, names(Y$I))
+# AACOLS palette accordingly ...
 # Per-amino-acid colours: translate the names of Y$I to upper-case
 # one-letter codes and use them to index the AACOLS palette, so that aac is
 # in the same order as Y$I.
 aac <- AACOLS[toupper(seqinr::a(names(Y$I)))]

 # Hydrophobicity vs. volume: one large filled disc per amino acid ...
 plot(x = Y$I, y = V$I,
      pch = 21, cex = 6, col = aac, bg = aac,
      xlab = "hydrophobicity", ylab = "volume")
 # ... with the amino acid name written on top of the disc.
 text(x = Y$I, y = V$I, labels = names(Y$I), cex = 0.8)

 # Hydrophobicity vs. pK: same layout, different y-axis.
 plot(x = Y$I, y = K$I,
      pch = 21, cex = 6, col = aac, bg = aac,
      xlab = "hydrophobicity", ylab = "pK")
 text(x = Y$I, y = K$I, labels = names(Y$I), cex = 0.8)
 # ... but how do we plot 3D data? Plotting into a 3D cube is possible, but such
 # plots are in general unintuitive and hard to interpret. One alternative is a
@@ -160,6 +171,20 @@ ggtern::ggtern(data = myDat,
 # This results in a mapping of amino acids relative to each other that is
 # similar to the Venn diagram you have seen in the notes.
 # ... or we could use principal components analysis, to pull out the
 # best projection of the three feature dimensions into two. (Done here
 # without delving into the theory ...)
 # Principal component analysis of myDat: plot the first two principal
 # components as one large coloured disc per row, then label each disc.
 # NOTE(review): assumes the rows of myDat are in the same order as
 # names(Y$I), since those names are used as labels - confirm.
 prc <- prcomp(myDat)
 # Colour the discs with aac, the AACOLS palette reordered to match
 # names(Y$I). (The previous col=aad referred to a name that is not defined
 # anywhere in the visible script.)
 plot(prc$x[,1], prc$x[,2], xlab="", ylab="", xaxt="n", yaxt="n",
      pch=19, cex=6, col=aac, cex.main=0.7,
      main="Principal Component Analysis of Amino Acid Features")
 text(prc$x[,1], prc$x[,2], names(Y$I), cex = 0.8, col="#00000088")
 # This matches the intuition rather well in that "similar" amino acids are close
 # on the plot. But we can't interpret the distances in terms of just one of the
 # parameters. Whatever - nature has a different way to define similarity:
 # mutations to similar amino acids are less likely to break the protein.
 # =    2  Mutation Data matrix  ================================================