diff --git a/2021-10-12_In-Class_exploration.R b/2021-10-12_In-Class_exploration.R new file mode 100644 index 0000000..0b168d1 --- /dev/null +++ b/2021-10-12_In-Class_exploration.R @@ -0,0 +1,257 @@ +# 2021-10-12_In-Class_exploration.R +# +# ===== T H E E V E N B E T T E R A M I N O A C I D ===== +# +# Code and comments for BCH441 in-class exploration, Tuesday, 2021-10-12 +# Explorers: Jocelyn Nurtanto, Yuzi Li, and Jerry Gu +# Scribe: boris.steipe@utoronto.ca +# +# ============================================================================== +# +# In our last session we explored some properties of amino acids and noted that +# we can arrange them in a scatter-plot according to some properties. But can +# we also arrange them according to generic properties, i.e. taking all +# published property scales into account? We will try to use all tables from +# the seqinr package. + +# First we load the package - this makes all datasets immediately available and +# we don't have to load them one by one. + +library(seqinr) + +# Determine what datasets are available +# +# Using "find in topic" ... "amino acid" +data(aacost) +data(aaindex) +data(pK) + +# We note that datasets may be sorted in different ways: for example +# alphabetically by one letter code (A, C, D, E, ...) or three-letter code (Ala, +# Arg, Asn, Asp, ...) - this means we need to ensure and validate that amino +# acids are sorted in the same way. + +# Build a datastructure ... +# rows: amino acids +# columns: properties + +# Are all lists in aaindex organized in the same way? + +refNames <- names(aaindex[[1]]$I) # Take the rownames of the first list item + # index as a reference list + +# Loop over each list in aaindex +for (i in 1:length(aaindex)) { +# get the I-vector + x <- aaindex[[i]]$I +# get the names + x <- names(x) +# compare with the names of our reference list +# the == and != operators are vectorized. Applying them to two vectors +# gives TRUE or FALSE for each pair of elements. any() or all() can be +# applied to logical vectors to anylise them and return a soingle result. +# if (...) conditions evaluate only a single value and will throw a warning if +# there is more than one. + + if (any(x != refNames)) { + # There was at least one not-equal pair - so: complain + print(sprintf("Problem in list %d: names don't match", i)) + } +} + +# If we get here without identifying problems, it means all pairs of +# rownames match throughout the aainfex list. + + +# Next: what is the cvorrect syntax to add one vector (the "I" vector of +# one of the list elements) to our dataframe? +aaData <- as.data.frame(aaindex[[1]]$I) # Make a dataframe from the first index +aaData[,2] <- aaindex[[2]]$I # ... add the secondf index + +str(aaData) # Confirm: we now have a two-column dataframe + +# Next: add the rest ... +for (i in 3:length(aaindex)) { + # get the I-vector and write it into our dataframe + aaData[,i] <- aaindex[[i]]$I +} + +# Sanity check +plot(aaData[,37], aaData[,544]) # plot two arbitray inices against each other + +# Looks good. + +# We finished building our data structure ... but let's add the aacost table +# aacost is ordered differently: +rownames(aaData) +aacost[ , 1] + +# using order(), applied to aacost - ordering the column with column-name +# "aaa" +sel <- order(aacost[ , "aaa"]) # alphebetic ordering of three-letter codes +aacost[sel, "aaa"] # applying the order vector sorts the column + +# Is this the same order as refNames? +refNames == aacost[sel, "aaa"] # Yes! + +# add the data from column "tot" (i.e. total metabolic cost) after the +# last column of aaData +aaData[ , length(aaindex) + 1] <- aacost[sel, "tot"] + +# Done. +str(aaData) # A dataframe with 20 rows and 545 columns + +# To answer the question "Which amino acids are similar to each other?" we +# need to reduce this 545-dimensional dataset to fewer dimensions, otherwise +# we will succumb to the "Curse of Dimensionality": +# +# "in high dimensional data, however, all objects appear +# to be sparse and dissimilar in many ways..." +# https://en.wikipedia.org/wiki/Curse_of_dimensionality +# +# A classic way to do this is Principal Component Analysis (PCA) ... +# (Principal components analysis) +# +# PCA expects objects in columns, properties in rows. Therefore we need to +# transpose our dataset: + +aaPCA <- prcomp(t(aaData)) + +# This creates an error, because some of our indicews contain NA values! +# Which indices are this? + +# We create a vector "sel" for which we check whether any element in each +# column is NA, and write FALSE if we encounter an NA, TRUE otherwise. We can +# then use this vector to subset ourt dataframe. + +sel <- logical() + +for (i in 1:ncol(aaData)) { # for each index + if (any(is.na(aaData[,i]))) { # if there is any NA value ... + sel <- c(sel, FALSE) # add a FALSE element to the vector + } else { # else + sel <- c(sel, TRUE) # add a TRUE element + } +} + +# Done. sel now subsets only the NA-free columns +545 - sum(sel) # 13 columns excluded + +# Do the PCA ... use the prcomp() function +aaPCA <- prcomp(t(aaData[ ,sel])) # PCA of the transposed, selected data set + +str(aaPCA) # structure of the result + +plot(aaPCA) # plot the contributions of the + # components to the variance + +plot(aaPCA$rotation[ , 1], # plot the first PC against the second PC + aaPCA$rotation[ , 2], # in a scatterplot, in an empty frame + type ="n") # just to set up the coordinate system + +text(aaPCA$rotation[ , 1], # plot the names of the amino acids into + aaPCA$rotation[ , 2], # their respective (PC1, PC2) positions + labels = rownames(aaPCA$rotation)) + +# PCA results are sensitive to the absolute numeric value of the features that +# we are comparing. The prcomp() function has an option scale. = TRUE that +# scales each row of features so that the variance of the value is 1.0 This +# ensures that each feature is given approximately equal weight + +aaPCA <- prcomp(t(aaData[ ,sel]), scale. = TRUE) + +plot(aaPCA) + +plot(aaPCA$rotation[ , 1], + aaPCA$rotation[ , 2], + type ="n") +text(aaPCA$rotation[ , 1], + aaPCA$rotation[ , 2], + labels = rownames(aaPCA$rotation)) + + +# Next we try to identify what the PCs correspond to. We see whether there are +# specific features that are highly correlated with the PCs + +# ==== Rotation 1 =================== +# + +(PC1 <- aaPCA$rotation[ , 1]) # Assign PC1 + +# The function cor() calculates Pearson coefficients of correlation +cor(PC1, aaData[ , 37]) # e.g. correlate PC1 against index 37 + + +# Iterate over all columns and calculate correlations +cors <- numeric() + +for (i in 1:ncol(aaData)) { + cors[i] <- cor(PC1, aaData[ , i]) +} + +summary(cors) +# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's +# -0.54072 -0.13703 0.05654 0.03729 0.21349 0.59589 13 +# +# The max correlation is ~0.6. That is not very high. Which ijndex is it? + +which(cors == max(cors, na.rm = TRUE)) + +aaindex[[504]] # Linker propensity ??? + +cor(PC1, aaindex[[504]]$I) # Did we get the right index? + +# Plot this ... +plot(aaPCA$rotation[ , 1], + aaindex[[504]]$I, + type ="n") +text(aaPCA$rotation[ , 1], + aaindex[[504]]$I, + labels = rownames(aaPCA$rotation)) + +# This is essentially a random correlation but for Cysteine ... + + +# ==== Rotation 2 =================== +# +# same process +PC2 <- aaPCA$rotation[ , 2] + +cors2 <- numeric() + +for (i in 1:ncol(aaData)) { + cors2[i] <- cor(PC2, aaData[ , i]) +} + +summary(cors2) +# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's +# -0.95214 -0.56067 -0.12817 -0.05787 0.43046 0.94346 13 + +# Here we have quite strong correlations + +which(cors2 == max(cors2, na.rm = TRUE)) + +aaindex[[148]] + +# this index itself is correlated with many other indices + +cor(PC2, aaindex[[148]]$I) # confirmn that we have the right index + +# Plot this too... +plot(aaPCA$rotation[ , 2], + aaindex[[148]]$I, + type ="n") +text(aaPCA$rotation[ , 2], + aaindex[[148]]$I, + labels = rownames(aaPCA$rotation)) + +# This correlates well with hydrophobicity measures. In this case the +# PC is to a certain degree interpretable - but this is not always the case +# with PCA (see the example of the first PC). + + + + + + +# [END]