# bch441-work-abc-units/2021-10-12_In-Class_exploration.R

# 258 lines
# 8.0 KiB
# R
# Raw Normal View History

# 2021-10-19 20:44:34 +00:00
# 2021-10-12_In-Class_exploration.R
#
# ===== T H E E V E N B E T T E R A M I N O A C I D =====
#
# Code and comments for BCH441 in-class exploration, Tuesday, 2021-10-12
# Explorers: Jocelyn Nurtanto, Yuzi Li, and Jerry Gu
# Scribe: boris.steipe@utoronto.ca
#
# ==============================================================================
#
# In our last session we explored some properties of amino acids and noted that
# we can arrange them in a scatter-plot according to some properties. But can
# we also arrange them according to generic properties, i.e. taking all
# published property scales into account? We will try to use all tables from
# the seqinr package.
# First we load the package - this makes all datasets immediately available and
# we don't have to load them one by one.
library(seqinr)
# Determine what datasets are available
#
# Using "find in topic" ... "amino acid"
#
# Load the three amino acid datasets we found with a single data() call.
data(list = c("aacost", "aaindex", "pK"))
# We note that datasets may be sorted in different ways: for example
# alphabetically by one letter code (A, C, D, E, ...) or three-letter code (Ala,
# Arg, Asn, Asp, ...) - this means we need to ensure and validate that amino
# acids are sorted in the same way.
# Build a datastructure ...
# rows: amino acids
# columns: properties
# Are all lists in aaindex organized in the same way?
# Take the names of the I-vector of the first list item as the reference order.
refNames <- names(aaindex[[1]]$I)

# Loop over each list in aaindex. seq_along() is used instead of 1:length()
# because 1:length(x) counts backwards (1, 0) if x is empty.
for (i in seq_along(aaindex)) {
  # get the names of the I-vector
  x <- names(aaindex[[i]]$I)
  # Compare with the names of our reference list.
  # The == and != operators are vectorized: applying them to two vectors
  # gives TRUE or FALSE for each pair of elements, and any() or all() can be
  # applied to such logical vectors to analyse them and return a single
  # result. However any(x != refNames) would silently pass if x had no names
  # at all (any() of an empty vector is FALSE), and would recycle if the
  # lengths differed. identical() returns exactly one TRUE or FALSE and is
  # only TRUE if both vectors have the same length and the same elements in
  # the same order - which is what an if (...) condition needs.
  if (!identical(x, refNames)) {
    # The names differ from the reference - so: complain
    print(sprintf("Problem in list %d: names don't match", i))
  }
}
# If we get here without identifying problems, it means all
# rownames match throughout the aaindex list.
# Next: collect the "I" vector of every list element of aaindex as one column
# of a dataframe (rows: amino acids, columns: properties).
#
# vapply() calls the function once per list element and binds the results
# into a matrix with one column per element. The third argument is a template
# for a single result: here a numeric vector with as many elements as the
# first I-vector, so an index of the wrong length or type raises an error
# instead of being silently recycled. This replaces adding the columns one by
# one in a loop (aaData[ , i] <- aaindex[[i]]$I), and the columns are named
# after the list element names of aaindex rather than V2, V3, ...
aaData <- as.data.frame(
  vapply(aaindex,
         function(idx) idx$I,
         numeric(length(aaindex[[1]]$I)))
)
# Make the amino acid names the rownames explicitly
rownames(aaData) <- names(aaindex[[1]]$I)

str(aaData[ , 1:2])   # Confirm: the first two columns of our dataframe
dim(aaData)           # rows (amino acids) and columns (indices)

# Sanity check
plot(aaData[ , 37], aaData[ , 544])  # plot two arbitrary indices against
                                     # each other
# Looks good.
# We finished building our data structure ... but let's add the aacost table
# aacost is ordered differently:
rownames(aaData)
aacost[ , 1]

# We need the rows of aacost in the same order as the rows of aaData.
# match() returns, for each rowname of aaData, the position of the identical
# three-letter code in column "aaa" of aacost. This aligns the two tables by
# name, rather than relying on both being sorted alphabetically.
sel <- match(rownames(aaData), as.character(aacost[ , "aaa"]))

# Validate: every amino acid must have been found, and none twice. stopifnot()
# halts with an error if that is not the case, so we can not add misaligned
# data by accident.
stopifnot(!anyNA(sel), anyDuplicated(sel) == 0)

aacost[sel, "aaa"]   # applying the index vector reorders the column: this
                     # is now in the same order as rownames(aaData)

# add the data from column "tot" (i.e. total metabolic cost) after the
# last aaindex column of aaData
aaData[ , length(aaindex) + 1] <- aacost[sel, "tot"]

# Done.
str(aaData)  # A dataframe with 20 rows and 545 columns
# To answer the question "Which amino acids are similar to each other?" we
# need to reduce this 545-dimensional dataset to fewer dimensions, otherwise
# we will succumb to the "Curse of Dimensionality":
#
# "in high dimensional data, however, all objects appear
# to be sparse and dissimilar in many ways..."
# https://en.wikipedia.org/wiki/Curse_of_dimensionality
#
# A classic way to do this is Principal Component Analysis (PCA) ...
# (Principal components analysis)
#
# prcomp() treats the rows of its input as observations and the columns as
# variables. We transpose our dataset, so that the indices become the rows and
# the amino acids become the columns. The coordinates of the amino acids are
# then found in the $rotation matrix of the result, which has one row per
# column of the input.
try(aaPCA <- prcomp(t(aaData)))
# This creates an error, because some of our indices contain NA values!
# (Wrapping the call in try() still reports the error, but allows the script
# to continue if it is run with source().)
#
# Which indices are these?
# We create a logical vector "sel" that is TRUE for every column that contains
# no NA, and FALSE otherwise. We can then use this vector to subset our
# dataframe. is.na() gives a TRUE/FALSE matrix, colSums() counts the TRUE
# values per column, so a count of 0 means: this column is NA-free.
sel <- unname(colSums(is.na(aaData)) == 0)

# Done. sel now subsets only the NA-free columns
sum(!sel)   # 13 columns excluded
# Do the PCA ... use the prcomp() function
aaPCA <- prcomp(t(aaData[ , sel]))  # PCA of the transposed, selected data set
str(aaPCA)                          # structure of the result
plot(aaPCA)                         # plot the contributions of the
                                    # components to the variance

# Plot the amino acids by their first two columns of the rotation matrix.
# Axis labels are given explicitly; otherwise plot() labels the axes with the
# R expressions that were passed to it.
plot(aaPCA$rotation[ , 1],          # plot the first PC against the second PC
     aaPCA$rotation[ , 2],          # in a scatterplot, in an empty frame
     type = "n",                    # just to set up the coordinate system
     xlab = "PC1",
     ylab = "PC2",
     main = "Amino acids: PCA (unscaled)")
text(aaPCA$rotation[ , 1],          # plot the names of the amino acids into
     aaPCA$rotation[ , 2],          # their respective (PC1, PC2) positions
     labels = rownames(aaPCA$rotation))
# PCA results are sensitive to the absolute numeric value of the data that
# we are comparing. The prcomp() function has an option scale. = TRUE that
# scales each variable - i.e. each column of its input - so that its variance
# is 1.0 before the analysis.
#
# NOTE(review): because we pass the transposed data, the columns seen by
# prcomp() are the amino acids, not the indices. scale. = TRUE therefore
# standardizes each amino acid's values across all indices; it does not give
# each index equal weight. If equal weight per index is what is intended, the
# columns of aaData[ , sel] would need to be scaled before transposing -
# confirm which is wanted. The code below is unchanged from the session.
aaPCA <- prcomp(t(aaData[ , sel]), scale. = TRUE)
plot(aaPCA)

# Same scatterplot as before, with explicit axis labels.
plot(aaPCA$rotation[ , 1],
     aaPCA$rotation[ , 2],
     type = "n",
     xlab = "PC1",
     ylab = "PC2",
     main = "Amino acids: PCA (scale. = TRUE)")
text(aaPCA$rotation[ , 1],
     aaPCA$rotation[ , 2],
     labels = rownames(aaPCA$rotation))
# Next we try to identify what the PCs correspond to. We see whether there are
# specific features that are highly correlated with the PCs
# ==== Rotation 1 ===================
#
(PC1 <- aaPCA$rotation[ , 1])         # Assign PC1
# The function cor() calculates Pearson coefficients of correlation
cor(PC1, aaData[ , 37])               # e.g. correlate PC1 against index 37

# Calculate the correlation of PC1 with every column of aaData. vapply()
# applies the function to each column and returns one number per column;
# columns that contain NA values give NA.
cors <- unname(vapply(aaData, function(v) cor(PC1, v), numeric(1)))
summary(cors)
#     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's
# -0.54072 -0.13703  0.05654  0.03729  0.21349  0.59589       13
#
# The max correlation is ~0.6. That is not very high. Which index is it?
# which.max() returns the position of the (first) maximum and ignores NA
# values, so we do not need to compare floating point numbers with ==.
(iMax1 <- which.max(cors))            # in the 2021-10-12 session: 504

# Print the description of that index. The last column of aaData is the
# aacost data, which has no entry in aaindex - so we check first.
if (iMax1 <= length(aaindex)) {
  print(aaindex[[iMax1]])             # in the session: Linker propensity ???
}
cor(PC1, aaData[ , iMax1])            # Did we get the right index?

# Plot this ...
plot(PC1,
     aaData[ , iMax1],
     type = "n",
     xlab = "PC1",
     ylab = sprintf("aaData column %d", iMax1))
text(PC1,
     aaData[ , iMax1],
     labels = rownames(aaPCA$rotation))
# This is essentially a random correlation but for Cysteine ...
# ==== Rotation 2 ===================
#
# same process
PC2 <- aaPCA$rotation[ , 2]

# Correlation of PC2 with every column of aaData; columns that contain NA
# values give NA.
cors2 <- unname(vapply(aaData, function(v) cor(PC2, v), numeric(1)))
summary(cors2)
#     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's
# -0.95214 -0.56067 -0.12817 -0.05787  0.43046  0.94346       13
# Here we have quite strong correlations

# Position of the highest correlation. which.max() ignores NA values.
(iMax2 <- which.max(cors2))           # in the 2021-10-12 session: 148

# Print the description of that index, unless the maximum is the aacost
# column, which has no entry in aaindex.
if (iMax2 <= length(aaindex)) {
  print(aaindex[[iMax2]])
}
# this index itself is correlated with many other indices
cor(PC2, aaData[ , iMax2])            # confirm that we have the right index

# Plot this too...
plot(PC2,
     aaData[ , iMax2],
     type = "n",
     xlab = "PC2",
     ylab = sprintf("aaData column %d", iMax2))
text(PC2,
     aaData[ , iMax2],
     labels = rownames(aaPCA$rotation))
# This correlates well with hydrophobicity measures. In this case the
# PC is to a certain degree interpretable - but this is not always the case
# with PCA (see the example of the first PC).
# [END]