258 lines
8.0 KiB
R
258 lines
8.0 KiB
R
# 2021-10-12_In-Class_exploration.R
|
|
#
|
|
# ===== T H E E V E N B E T T E R A M I N O A C I D =====
|
|
#
|
|
# Code and comments for BCH441 in-class exploration, Tuesday, 2021-10-12
|
|
# Explorers: Jocelyn Nurtanto, Yuzi Li, and Jerry Gu
|
|
# Scribe: boris.steipe@utoronto.ca
|
|
#
|
|
# ==============================================================================
|
|
#
|
|
# In our last session we explored some properties of amino acids and noted that
|
|
# we can arrange them in a scatter-plot according to some properties. But can
|
|
# we also arrange them according to generic properties, i.e. taking all
|
|
# published property scales into account? We will try to use all tables from
|
|
# the seqinr package.
|
|
|
|
# First we load the package - this makes all datasets immediately available and
|
|
# we don't have to load them one by one.
|
|
|
|
library(seqinr)
|
|
|
|
# Determine what datasets are available
|
|
#
|
|
# Using "find in topic" ... "amino acid"
|
|
data(aacost)
|
|
data(aaindex)
|
|
data(pK)
|
|
|
|
# We note that datasets may be sorted in different ways: for example
|
|
# alphabetically by one letter code (A, C, D, E, ...) or three-letter code (Ala,
|
|
# Arg, Asn, Asp, ...) - this means we need to ensure and validate that amino
|
|
# acids are sorted in the same way.
|
|
|
|
# Build a datastructure ...
|
|
# rows: amino acids
|
|
# columns: properties
|
|
|
|
# Are all lists in aaindex organized in the same way?
|
|
|
|
refNames <- names(aaindex[[1]]$I) # Take the rownames of the first list item
|
|
# index as a reference list
|
|
|
|
# Loop over each list in aaindex
|
|
for (i in 1:length(aaindex)) {
|
|
# get the I-vector
|
|
x <- aaindex[[i]]$I
|
|
# get the names
|
|
x <- names(x)
|
|
# compare with the names of our reference list
|
|
# the == and != operators are vectorized. Applying them to two vectors
|
|
# gives TRUE or FALSE for each pair of elements. any() or all() can be
|
|
# applied to logical vectors to anylise them and return a soingle result.
|
|
# if (...) conditions evaluate only a single value and will throw a warning if
|
|
# there is more than one.
|
|
|
|
if (any(x != refNames)) {
|
|
# There was at least one not-equal pair - so: complain
|
|
print(sprintf("Problem in list %d: names don't match", i))
|
|
}
|
|
}
|
|
|
|
# If we get here without identifying problems, it means all pairs of
|
|
# rownames match throughout the aainfex list.
|
|
|
|
|
|
# Next: what is the cvorrect syntax to add one vector (the "I" vector of
|
|
# one of the list elements) to our dataframe?
|
|
aaData <- as.data.frame(aaindex[[1]]$I) # Make a dataframe from the first index
|
|
aaData[,2] <- aaindex[[2]]$I # ... add the secondf index
|
|
|
|
str(aaData) # Confirm: we now have a two-column dataframe
|
|
|
|
# Next: add the rest ...
|
|
for (i in 3:length(aaindex)) {
|
|
# get the I-vector and write it into our dataframe
|
|
aaData[,i] <- aaindex[[i]]$I
|
|
}
|
|
|
|
# Sanity check
|
|
plot(aaData[,37], aaData[,544]) # plot two arbitray inices against each other
|
|
|
|
# Looks good.
|
|
|
|
# We finished building our data structure ... but let's add the aacost table
|
|
# aacost is ordered differently:
|
|
rownames(aaData)
|
|
aacost[ , 1]
|
|
|
|
# using order(), applied to aacost - ordering the column with column-name
|
|
# "aaa"
|
|
sel <- order(aacost[ , "aaa"]) # alphebetic ordering of three-letter codes
|
|
aacost[sel, "aaa"] # applying the order vector sorts the column
|
|
|
|
# Is this the same order as refNames?
|
|
refNames == aacost[sel, "aaa"] # Yes!
|
|
|
|
# add the data from column "tot" (i.e. total metabolic cost) after the
|
|
# last column of aaData
|
|
aaData[ , length(aaindex) + 1] <- aacost[sel, "tot"]
|
|
|
|
# Done.
|
|
str(aaData) # A dataframe with 20 rows and 545 columns
|
|
|
|
# To answer the question "Which amino acids are similar to each other?" we
|
|
# need to reduce this 545-dimensional dataset to fewer dimensions, otherwise
|
|
# we will succumb to the "Curse of Dimensionality":
|
|
#
|
|
# "in high dimensional data, however, all objects appear
|
|
# to be sparse and dissimilar in many ways..."
|
|
# https://en.wikipedia.org/wiki/Curse_of_dimensionality
|
|
#
|
|
# A classic way to do this is Principal Component Analysis (PCA) ...
|
|
# (Principal components analysis)
|
|
#
|
|
# PCA expects objects in columns, properties in rows. Therefore we need to
|
|
# transpose our dataset:
|
|
|
|
aaPCA <- prcomp(t(aaData))
|
|
|
|
# This creates an error, because some of our indicews contain NA values!
|
|
# Which indices are this?
|
|
|
|
# We create a vector "sel" for which we check whether any element in each
|
|
# column is NA, and write FALSE if we encounter an NA, TRUE otherwise. We can
|
|
# then use this vector to subset ourt dataframe.
|
|
|
|
sel <- logical()
|
|
|
|
for (i in 1:ncol(aaData)) { # for each index
|
|
if (any(is.na(aaData[,i]))) { # if there is any NA value ...
|
|
sel <- c(sel, FALSE) # add a FALSE element to the vector
|
|
} else { # else
|
|
sel <- c(sel, TRUE) # add a TRUE element
|
|
}
|
|
}
|
|
|
|
# Done. sel now subsets only the NA-free columns
|
|
545 - sum(sel) # 13 columns excluded
|
|
|
|
# Do the PCA ... use the prcomp() function
|
|
aaPCA <- prcomp(t(aaData[ ,sel])) # PCA of the transposed, selected data set
|
|
|
|
str(aaPCA) # structure of the result
|
|
|
|
plot(aaPCA) # plot the contributions of the
|
|
# components to the variance
|
|
|
|
plot(aaPCA$rotation[ , 1], # plot the first PC against the second PC
|
|
aaPCA$rotation[ , 2], # in a scatterplot, in an empty frame
|
|
type ="n") # just to set up the coordinate system
|
|
|
|
text(aaPCA$rotation[ , 1], # plot the names of the amino acids into
|
|
aaPCA$rotation[ , 2], # their respective (PC1, PC2) positions
|
|
labels = rownames(aaPCA$rotation))
|
|
|
|
# PCA results are sensitive to the absolute numeric value of the features that
|
|
# we are comparing. The prcomp() function has an option scale. = TRUE that
|
|
# scales each row of features so that the variance of the value is 1.0 This
|
|
# ensures that each feature is given approximately equal weight
|
|
|
|
aaPCA <- prcomp(t(aaData[ ,sel]), scale. = TRUE)
|
|
|
|
plot(aaPCA)
|
|
|
|
plot(aaPCA$rotation[ , 1],
|
|
aaPCA$rotation[ , 2],
|
|
type ="n")
|
|
text(aaPCA$rotation[ , 1],
|
|
aaPCA$rotation[ , 2],
|
|
labels = rownames(aaPCA$rotation))
|
|
|
|
|
|
# Next we try to identify what the PCs correspond to. We see whether there are
|
|
# specific features that are highly correlated with the PCs
|
|
|
|
# ==== Rotation 1 ===================
|
|
#
|
|
|
|
(PC1 <- aaPCA$rotation[ , 1]) # Assign PC1
|
|
|
|
# The function cor() calculates Pearson coefficients of correlation
|
|
cor(PC1, aaData[ , 37]) # e.g. correlate PC1 against index 37
|
|
|
|
|
|
# Iterate over all columns and calculate correlations
|
|
cors <- numeric()
|
|
|
|
for (i in 1:ncol(aaData)) {
|
|
cors[i] <- cor(PC1, aaData[ , i])
|
|
}
|
|
|
|
summary(cors)
|
|
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
|
|
# -0.54072 -0.13703 0.05654 0.03729 0.21349 0.59589 13
|
|
#
|
|
# The max correlation is ~0.6. That is not very high. Which ijndex is it?
|
|
|
|
which(cors == max(cors, na.rm = TRUE))
|
|
|
|
aaindex[[504]] # Linker propensity ???
|
|
|
|
cor(PC1, aaindex[[504]]$I) # Did we get the right index?
|
|
|
|
# Plot this ...
|
|
plot(aaPCA$rotation[ , 1],
|
|
aaindex[[504]]$I,
|
|
type ="n")
|
|
text(aaPCA$rotation[ , 1],
|
|
aaindex[[504]]$I,
|
|
labels = rownames(aaPCA$rotation))
|
|
|
|
# This is essentially a random correlation but for Cysteine ...
|
|
|
|
|
|
# ==== Rotation 2 ===================
|
|
#
|
|
# same process
|
|
PC2 <- aaPCA$rotation[ , 2]
|
|
|
|
cors2 <- numeric()
|
|
|
|
for (i in 1:ncol(aaData)) {
|
|
cors2[i] <- cor(PC2, aaData[ , i])
|
|
}
|
|
|
|
summary(cors2)
|
|
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
|
|
# -0.95214 -0.56067 -0.12817 -0.05787 0.43046 0.94346 13
|
|
|
|
# Here we have quite strong correlations
|
|
|
|
which(cors2 == max(cors2, na.rm = TRUE))
|
|
|
|
aaindex[[148]]
|
|
|
|
# this index itself is correlated with many other indices
|
|
|
|
cor(PC2, aaindex[[148]]$I) # confirmn that we have the right index
|
|
|
|
# Plot this too...
|
|
plot(aaPCA$rotation[ , 2],
|
|
aaindex[[148]]$I,
|
|
type ="n")
|
|
text(aaPCA$rotation[ , 2],
|
|
aaindex[[148]]$I,
|
|
labels = rownames(aaPCA$rotation))
|
|
|
|
# This correlates well with hydrophobicity measures. In this case the
|
|
# PC is to a certain degree interpretable - but this is not always the case
|
|
# with PCA (see the example of the first PC).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# [END]
|