Class session notes for 2021-10-12
This commit is contained in:
parent
bb2e2a4c85
commit
4b2bab0f12
257
2021-10-12_In-Class_exploration.R
Normal file
257
2021-10-12_In-Class_exploration.R
Normal file
@ -0,0 +1,257 @@
|
||||
# 2021-10-12_In-Class_exploration.R
|
||||
#
|
||||
# ===== T H E E V E N B E T T E R A M I N O A C I D =====
|
||||
#
|
||||
# Code and comments for BCH441 in-class exploration, Tuesday, 2021-10-12
|
||||
# Explorers: Jocelyn Nurtanto, Yuzi Li, and Jerry Gu
|
||||
# Scribe: boris.steipe@utoronto.ca
|
||||
#
|
||||
# ==============================================================================
|
||||
#
|
||||
# In our last session we explored some properties of amino acids and noted that
|
||||
# we can arrange them in a scatter-plot according to some properties. But can
|
||||
# we also arrange them according to generic properties, i.e. taking all
|
||||
# published property scales into account? We will try to use all tables from
|
||||
# the seqinr package.
|
||||
|
||||
# First we load the package - this makes all datasets immediately available and
|
||||
# we don't have to load them one by one.
|
||||
|
||||
library(seqinr)
|
||||
|
||||
# Determine what datasets are available
|
||||
#
|
||||
# Using "find in topic" ... "amino acid"
|
||||
data(aacost)
|
||||
data(aaindex)
|
||||
data(pK)
|
||||
|
||||
# We note that datasets may be sorted in different ways: for example
|
||||
# alphabetically by one letter code (A, C, D, E, ...) or three-letter code (Ala,
|
||||
# Arg, Asn, Asp, ...) - this means we need to ensure and validate that amino
|
||||
# acids are sorted in the same way.
|
||||
|
||||
# Build a datastructure ...
|
||||
# rows: amino acids
|
||||
# columns: properties
|
||||
|
||||
# Are all lists in aaindex organized in the same way?
|
||||
|
||||
refNames <- names(aaindex[[1]]$I) # Take the rownames of the first list item
|
||||
# index as a reference list
|
||||
|
||||
# Loop over each list in aaindex
|
||||
for (i in 1:length(aaindex)) {
|
||||
# get the I-vector
|
||||
x <- aaindex[[i]]$I
|
||||
# get the names
|
||||
x <- names(x)
|
||||
# compare with the names of our reference list
|
||||
# the == and != operators are vectorized. Applying them to two vectors
|
||||
# gives TRUE or FALSE for each pair of elements. any() or all() can be
|
||||
# applied to logical vectors to anylise them and return a soingle result.
|
||||
# if (...) conditions evaluate only a single value and will throw a warning if
|
||||
# there is more than one.
|
||||
|
||||
if (any(x != refNames)) {
|
||||
# There was at least one not-equal pair - so: complain
|
||||
print(sprintf("Problem in list %d: names don't match", i))
|
||||
}
|
||||
}
|
||||
|
||||
# If we get here without identifying problems, it means all pairs of
|
||||
# rownames match throughout the aainfex list.
|
||||
|
||||
|
||||
# Next: what is the cvorrect syntax to add one vector (the "I" vector of
|
||||
# one of the list elements) to our dataframe?
|
||||
aaData <- as.data.frame(aaindex[[1]]$I) # Make a dataframe from the first index
|
||||
aaData[,2] <- aaindex[[2]]$I # ... add the secondf index
|
||||
|
||||
str(aaData) # Confirm: we now have a two-column dataframe
|
||||
|
||||
# Next: add the rest ...
|
||||
for (i in 3:length(aaindex)) {
|
||||
# get the I-vector and write it into our dataframe
|
||||
aaData[,i] <- aaindex[[i]]$I
|
||||
}
|
||||
|
||||
# Sanity check
|
||||
plot(aaData[,37], aaData[,544]) # plot two arbitray inices against each other
|
||||
|
||||
# Looks good.
|
||||
|
||||
# We finished building our data structure ... but let's add the aacost table
|
||||
# aacost is ordered differently:
|
||||
rownames(aaData)
|
||||
aacost[ , 1]
|
||||
|
||||
# using order(), applied to aacost - ordering the column with column-name
|
||||
# "aaa"
|
||||
sel <- order(aacost[ , "aaa"]) # alphebetic ordering of three-letter codes
|
||||
aacost[sel, "aaa"] # applying the order vector sorts the column
|
||||
|
||||
# Is this the same order as refNames?
|
||||
refNames == aacost[sel, "aaa"] # Yes!
|
||||
|
||||
# add the data from column "tot" (i.e. total metabolic cost) after the
|
||||
# last column of aaData
|
||||
aaData[ , length(aaindex) + 1] <- aacost[sel, "tot"]
|
||||
|
||||
# Done.
|
||||
str(aaData) # A dataframe with 20 rows and 545 columns
|
||||
|
||||
# To answer the question "Which amino acids are similar to each other?" we
|
||||
# need to reduce this 545-dimensional dataset to fewer dimensions, otherwise
|
||||
# we will succumb to the "Curse of Dimensionality":
|
||||
#
|
||||
# "in high dimensional data, however, all objects appear
|
||||
# to be sparse and dissimilar in many ways..."
|
||||
# https://en.wikipedia.org/wiki/Curse_of_dimensionality
|
||||
#
|
||||
# A classic way to do this is Principal Component Analysis (PCA) ...
|
||||
# (Principal components analysis)
|
||||
#
|
||||
# PCA expects objects in columns, properties in rows. Therefore we need to
|
||||
# transpose our dataset:
|
||||
|
||||
aaPCA <- prcomp(t(aaData))
|
||||
|
||||
# This creates an error, because some of our indicews contain NA values!
|
||||
# Which indices are this?
|
||||
|
||||
# We create a vector "sel" for which we check whether any element in each
|
||||
# column is NA, and write FALSE if we encounter an NA, TRUE otherwise. We can
|
||||
# then use this vector to subset ourt dataframe.
|
||||
|
||||
sel <- logical()
|
||||
|
||||
for (i in 1:ncol(aaData)) { # for each index
|
||||
if (any(is.na(aaData[,i]))) { # if there is any NA value ...
|
||||
sel <- c(sel, FALSE) # add a FALSE element to the vector
|
||||
} else { # else
|
||||
sel <- c(sel, TRUE) # add a TRUE element
|
||||
}
|
||||
}
|
||||
|
||||
# Done. sel now subsets only the NA-free columns
|
||||
545 - sum(sel) # 13 columns excluded
|
||||
|
||||
# Do the PCA ... use the prcomp() function
|
||||
aaPCA <- prcomp(t(aaData[ ,sel])) # PCA of the transposed, selected data set
|
||||
|
||||
str(aaPCA) # structure of the result
|
||||
|
||||
plot(aaPCA) # plot the contributions of the
|
||||
# components to the variance
|
||||
|
||||
plot(aaPCA$rotation[ , 1], # plot the first PC against the second PC
|
||||
aaPCA$rotation[ , 2], # in a scatterplot, in an empty frame
|
||||
type ="n") # just to set up the coordinate system
|
||||
|
||||
text(aaPCA$rotation[ , 1], # plot the names of the amino acids into
|
||||
aaPCA$rotation[ , 2], # their respective (PC1, PC2) positions
|
||||
labels = rownames(aaPCA$rotation))
|
||||
|
||||
# PCA results are sensitive to the absolute numeric value of the features that
|
||||
# we are comparing. The prcomp() function has an option scale. = TRUE that
|
||||
# scales each row of features so that the variance of the value is 1.0 This
|
||||
# ensures that each feature is given approximately equal weight
|
||||
|
||||
aaPCA <- prcomp(t(aaData[ ,sel]), scale. = TRUE)
|
||||
|
||||
plot(aaPCA)
|
||||
|
||||
plot(aaPCA$rotation[ , 1],
|
||||
aaPCA$rotation[ , 2],
|
||||
type ="n")
|
||||
text(aaPCA$rotation[ , 1],
|
||||
aaPCA$rotation[ , 2],
|
||||
labels = rownames(aaPCA$rotation))
|
||||
|
||||
|
||||
# Next we try to identify what the PCs correspond to. We see whether there are
|
||||
# specific features that are highly correlated with the PCs
|
||||
|
||||
# ==== Rotation 1 ===================
|
||||
#
|
||||
|
||||
(PC1 <- aaPCA$rotation[ , 1]) # Assign PC1
|
||||
|
||||
# The function cor() calculates Pearson coefficients of correlation
|
||||
cor(PC1, aaData[ , 37]) # e.g. correlate PC1 against index 37
|
||||
|
||||
|
||||
# Iterate over all columns and calculate correlations
|
||||
cors <- numeric()
|
||||
|
||||
for (i in 1:ncol(aaData)) {
|
||||
cors[i] <- cor(PC1, aaData[ , i])
|
||||
}
|
||||
|
||||
summary(cors)
|
||||
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
|
||||
# -0.54072 -0.13703 0.05654 0.03729 0.21349 0.59589 13
|
||||
#
|
||||
# The max correlation is ~0.6. That is not very high. Which ijndex is it?
|
||||
|
||||
which(cors == max(cors, na.rm = TRUE))
|
||||
|
||||
aaindex[[504]] # Linker propensity ???
|
||||
|
||||
cor(PC1, aaindex[[504]]$I) # Did we get the right index?
|
||||
|
||||
# Plot this ...
|
||||
plot(aaPCA$rotation[ , 1],
|
||||
aaindex[[504]]$I,
|
||||
type ="n")
|
||||
text(aaPCA$rotation[ , 1],
|
||||
aaindex[[504]]$I,
|
||||
labels = rownames(aaPCA$rotation))
|
||||
|
||||
# This is essentially a random correlation but for Cysteine ...
|
||||
|
||||
|
||||
# ==== Rotation 2 ===================
|
||||
#
|
||||
# same process
|
||||
PC2 <- aaPCA$rotation[ , 2]
|
||||
|
||||
cors2 <- numeric()
|
||||
|
||||
for (i in 1:ncol(aaData)) {
|
||||
cors2[i] <- cor(PC2, aaData[ , i])
|
||||
}
|
||||
|
||||
summary(cors2)
|
||||
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
|
||||
# -0.95214 -0.56067 -0.12817 -0.05787 0.43046 0.94346 13
|
||||
|
||||
# Here we have quite strong correlations
|
||||
|
||||
which(cors2 == max(cors2, na.rm = TRUE))
|
||||
|
||||
aaindex[[148]]
|
||||
|
||||
# this index itself is correlated with many other indices
|
||||
|
||||
cor(PC2, aaindex[[148]]$I) # confirmn that we have the right index
|
||||
|
||||
# Plot this too...
|
||||
plot(aaPCA$rotation[ , 2],
|
||||
aaindex[[148]]$I,
|
||||
type ="n")
|
||||
text(aaPCA$rotation[ , 2],
|
||||
aaindex[[148]]$I,
|
||||
labels = rownames(aaPCA$rotation))
|
||||
|
||||
# This correlates well with hydrophobicity measures. In this case the
|
||||
# PC is to a certain degree interpretable - but this is not always the case
|
||||
# with PCA (see the example of the first PC).
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
Loading…
Reference in New Issue
Block a user