New AA colouring scheme - with better separation and transparent colours for overlapping plots. prcomp() analysis of amino acid similarity.
This commit is contained in:
parent
069d8136e3
commit
85d53ae2ee
45
.utilities.R
45
.utilities.R
@ -71,30 +71,31 @@ AAVALID <- "acdefghiklmnpqrstvwyACDEFGHIKLMNPQRSTVWY*-"
|
|||||||
NUCVALID <- "acgtuACGTU-"
|
NUCVALID <- "acgtuACGTU-"
|
||||||
NUCAMBIG <- "acgtACGTryswkmbdhvnRYSWKMBDHVN-"
|
NUCAMBIG <- "acgtACGTryswkmbdhvnRYSWKMBDHVN-"
|
||||||
|
|
||||||
# A colorpallette for amino acid properties
|
# A colour palette for amino acid properties
|
||||||
AACOLS <- character()
|
AACOLS <- character()
|
||||||
AACOLS["R"] <- "#577EFF" # Positive
|
AACOLS["R"] <- "#5770ff" # Positive
|
||||||
AACOLS["K"] <- "#479EEE" #
|
AACOLS["K"] <- "#4785EE" #
|
||||||
AACOLS["H"] <- "#37BFDE" #
|
AACOLS["H"] <- "#37a1de" #
|
||||||
AACOLS["E"] <- "#ffa587" # Negative
|
AACOLS["E"] <- "#ff6f59" # Negative
|
||||||
AACOLS["D"] <- "#ff87ad" #
|
AACOLS["D"] <- "#ff7391" #
|
||||||
AACOLS["N"] <- "#9FC6FC" # Hydrophilic
|
AACOLS["N"] <- "#C9D4FF" # Hydrophilic
|
||||||
AACOLS["Q"] <- "#A7CFF5" #
|
AACOLS["Q"] <- "#CADFFC" #
|
||||||
AACOLS["S"] <- "#AFD8EE" #
|
AACOLS["S"] <- "#CBEAF9" #
|
||||||
AACOLS["T"] <- "#B7E2E8" #
|
AACOLS["T"] <- "#CDF5F7" #
|
||||||
AACOLS["Y"] <- "#F5FFD9" # Hydrophobic
|
AACOLS["Y"] <- "#FBFFC9" # Hydrophobic
|
||||||
AACOLS["W"] <- "#F1FFDB" #
|
AACOLS["W"] <- "#EDFDC8" #
|
||||||
AACOLS["F"] <- "#EDFFDD" #
|
AACOLS["F"] <- "#DFFCC8" #
|
||||||
AACOLS["I"] <- "#E9FFDF" #
|
AACOLS["I"] <- "#D2FBC8" #
|
||||||
AACOLS["L"] <- "#E5FFE2" #
|
AACOLS["L"] <- "#C4FAC7" #
|
||||||
AACOLS["M"] <- "#E1FFE4" #
|
AACOLS["M"] <- "#B7F9C7" #
|
||||||
AACOLS["V"] <- "#DDFFE6" #
|
AACOLS["V"] <- "#A9F8C7" #
|
||||||
AACOLS["A"] <- "#D9FFE9" #
|
AACOLS["A"] <- "#9CF7C7" #
|
||||||
AACOLS["G"] <- "#e0e0e0" # Glycine
|
AACOLS["G"] <- "#d2d2d2" # Glycine
|
||||||
AACOLS["C"] <- "#fffb91" # Cysteine
|
AACOLS["C"] <- "#fff963" # Cysteine
|
||||||
AACOLS["P"] <- "#e8f7e1" # Proline
|
AACOLS["P"] <- "#edc06d" # Proline
|
||||||
|
AACOLS <- gsub("$", "80", AACOLS) # Make the colors 50% transparent
|
||||||
# barplot(rep(1, 20), col = AACOLS)
|
# barplot(rep(1, 20), col = AACOLS)
|
||||||
|
# colorRampPalette(c("#fbffc9","#9cf7c7"))(8)
|
||||||
|
|
||||||
# 10 species of fungi for reference analysis.
|
# 10 species of fungi for reference analysis.
|
||||||
# http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
|
# http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
|
||||||
|
@ -1,20 +1,15 @@
|
|||||||
# tocID <- "BIN-ALI-Similarity.R"
|
# tocID <- "BIN-ALI-Similarity.R"
|
||||||
#
|
#
|
||||||
# ---------------------------------------------------------------------------- #
|
|
||||||
# PATIENCE ... #
|
|
||||||
# Do not yet work with this code. Updates in progress. Thank you. #
|
|
||||||
# boris.steipe@utoronto.ca #
|
|
||||||
# ---------------------------------------------------------------------------- #
|
|
||||||
#
|
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-ALI-Similarity unit.
|
# R code accompanying the BIN-ALI-Similarity unit.
|
||||||
#
|
#
|
||||||
# Version: 1.1
|
# Version: 1.2
|
||||||
#
|
#
|
||||||
# Date: 2017 10 - 2019 01
|
# Date: 2017-10 - 2020-09
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.2 2020 Updates
|
||||||
# 1.1 Change from require() to requireNamespace(),
|
# 1.1 Change from require() to requireNamespace(),
|
||||||
# use <package>::<function>() idiom throughout
|
# use <package>::<function>() idiom throughout
|
||||||
# 1.0 Refactored for 2017; add aaindex, ternary plot.
|
# 1.0 Refactored for 2017; add aaindex, ternary plot.
|
||||||
@ -22,6 +17,7 @@
|
|||||||
#
|
#
|
||||||
#
|
#
|
||||||
# TODO:
|
# TODO:
|
||||||
|
# Update ggtern:: ternary plot to use aacol dots under text
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||||
@ -61,7 +57,7 @@ if (! requireNamespace("seqinr", quietly=TRUE)) {
|
|||||||
# data:
|
# data:
|
||||||
|
|
||||||
?aaindex
|
?aaindex
|
||||||
data(aaindex) # load the aaindex list from the package
|
data(aaindex, package = "seqinr") # load the aaindex list from the package
|
||||||
|
|
||||||
length(aaindex)
|
length(aaindex)
|
||||||
|
|
||||||
@ -124,11 +120,26 @@ names(K$I) <- c("Ala","Arg","Asn","Asp","Cys","Gln","Glu","Gly","His","Ile",
|
|||||||
|
|
||||||
|
|
||||||
# Given these biophysical indices, how similar are the amino acids? We have three dimensions of measurements here. Scatterplots can only display two dimensions ...
|
# Given these biophysical indices, how similar are the amino acids? We have three dimensions of measurements here. Scatterplots can only display two dimensions ...
|
||||||
plot(Y$I, V$I, col="white", xlab = "hydrophobicity", ylab = "volume")
|
|
||||||
text(Y$I, V$I, names(Y$I))
|
|
||||||
|
|
||||||
plot(Y$I, K$I, col="white", xlab = "hydrophobicity", ylab = "pK")
|
# pull the names from Y$I, convert them to single letter code, and reorder the
|
||||||
text(Y$I, K$I, names(Y$I))
|
# AACOLS palette accordingly ...
|
||||||
|
aac <- AACOLS[toupper(seqinr::a(names(Y$I)))]
|
||||||
|
|
||||||
|
plot(Y$I, V$I,
|
||||||
|
xlab = "hydrophobicity", ylab = "volume",
|
||||||
|
pch = 21,
|
||||||
|
cex = 6,
|
||||||
|
col = aac,
|
||||||
|
bg = aac)
|
||||||
|
text(Y$I, V$I, names(Y$I), cex = 0.8)
|
||||||
|
|
||||||
|
plot(Y$I, K$I,
|
||||||
|
xlab = "hydrophobicity", ylab = "pK",
|
||||||
|
pch = 21,
|
||||||
|
cex = 6,
|
||||||
|
col = aac,
|
||||||
|
bg = aac)
|
||||||
|
text(Y$I, K$I, names(Y$I), cex = 0.8)
|
||||||
|
|
||||||
# ... but how do we plot 3D data? Plotting into a 3D cube is possible, but such
|
# ... but how do we plot 3D data? Plotting into a 3D cube is possible, but such
|
||||||
# plots are in general unintuitive and hard to interpret. One alternative is a
|
# plots are in general unintuitive and hard to interpret. One alternative is a
|
||||||
@ -160,6 +171,20 @@ ggtern::ggtern(data = myDat,
|
|||||||
# This results in a mapping of amino acids relative to each other that is
|
# This results in a mapping of amino acids relative to each other that is
|
||||||
# similar to the Venn diagram you have seen in the notes.
|
# similar to the Venn diagram you have seen in the notes.
|
||||||
|
|
||||||
|
# ... or we could use principal components analysis, to pull out the
|
||||||
|
# best projection of the three feature dimensions into two. (Done here without delving
|
||||||
|
# into the theory ...)
|
||||||
|
# Principal components analysis of the amino acid feature table: compute the
# components with prcomp(), then plot every amino acid at its coordinates on
# the first two components (PC1 on x, PC2 on y). Axes are left unlabelled and
# without ticks.
prc <- prcomp(myDat)

# NOTE(review): the original call passed `col=aad`, but no object named `aad`
# is defined in the visible code. `aac` - the AACOLS palette reordered to
# follow names(Y$I), as used for the two scatterplots earlier in this script -
# is assumed to be the intended colour vector. Confirm that no `aad` exists
# elsewhere in the file.
# NOTE(review): colours and labels are matched to points by position, which
# presumes the rows of myDat are in the same order as names(Y$I) - verify.
plot(prc$x[,1], prc$x[,2], xlab="", ylab="", xaxt="n", yaxt="n",
     pch=19, cex=6, col=aac, cex.main=0.7,
     main="Principal Component Analysis of Amino Acid Features")

# Label each disc with the amino acid name, in semi-transparent black.
text(prc$x[,1], prc$x[,2], names(Y$I), cex = 0.8, col="#00000088")
|
||||||
|
|
||||||
|
# This matches the intuition rather well in that "similar" amino acids are close
|
||||||
|
# on the plot. But we can't interpret the distances in terms of just one of the
|
||||||
|
# parameters. Whatever - nature has a different way to define similarity:
|
||||||
|
# mutations to similar amino acids are less likely to break the protein.
|
||||||
|
|
||||||
|
|
||||||
# = 2 Mutation Data matrix ================================================
|
# = 2 Mutation Data matrix ================================================
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user