Class session notes for 2021-10-12

2021-10-19 22:44:34 +02:00 · 2021-10-19 22:44:34 +02:00 · 4b2bab0f12
commit 4b2bab0f12
parent bb2e2a4c85
1 changed files with 257 additions and 0 deletions
--- a/2021-10-12_In-Class_exploration.R
+++ b/2021-10-12_In-Class_exploration.R
@ -0,0 +1,257 @@
+# 2021-10-12_In-Class_exploration.R
+#
+#         =====  T H E   E V E N   B E T T E R   A M I N O   A C I D =====
+#
+# Code and comments for BCH441 in-class exploration, Tuesday, 2021-10-12
+# Explorers:  Jocelyn Nurtanto, Yuzi Li, and  Jerry Gu
+# Scribe:     boris.steipe@utoronto.ca
+#
+# ==============================================================================
+#
+# In our last session we explored some properties of amino acids and noted that
+# we can arrange them in a scatter-plot according to some properties. But can
+# we also arrange them according to generic properties, i.e. taking all
+# published property scales into account? We will try to use all tables from
+# the seqinr package.
+
+# First we load the package - this makes all datasets immediately available and
+# we don't have to load them one by one.
+
+library(seqinr)
+
+# Determine what datasets are available
+#
+# Using "find in topic" ... "amino acid"
+data(aacost)
+data(aaindex)
+data(pK)
+
+# We note that datasets may be sorted in different ways: for example
+# alphabetically by one letter code (A, C, D, E, ...) or three-letter code (Ala,
+# Arg, Asn, Asp, ...) - this means we need to ensure and validate that amino
+# acids are sorted in the same way.
+
+# Build a datastructure ...
+# rows: amino acids
+# columns: properties
+
+# Are all lists in aaindex organized in the same way?
+
+refNames <- names(aaindex[[1]]$I) # Take the rownames of the first list item
+                                  # index as a reference list
+
+# Loop over each list in aaindex
+for (i in 1:length(aaindex)) {
+#   get the I-vector
+  x <- aaindex[[i]]$I
+#   get the names
+  x <- names(x)
+#   compare with the names of our reference list
+#   the == and != operators are vectorized. Applying them to two vectors
+#   gives TRUE or FALSE for each pair of elements. any() or all() can be
+#   applied to logical vectors to anylise them and return a soingle result.
+#   if (...) conditions evaluate only a single value and will throw a warning if
+#   there is more than one.
+
+  if (any(x != refNames)) {
+    # There was at least one not-equal pair - so: complain
+    print(sprintf("Problem in list %d: names don't match", i))
+  }
+}
+
+# If we get here without identifying problems, it means all pairs of
+# rownames match throughout the aainfex list.
+
+
+# Next: what is the cvorrect syntax to add one vector (the "I" vector of
+# one of the list elements) to our dataframe?
+aaData <- as.data.frame(aaindex[[1]]$I) # Make a dataframe from the first index
+aaData[,2] <- aaindex[[2]]$I            # ... add the secondf index
+
+str(aaData)  # Confirm: we now have a two-column dataframe
+
+# Next: add the rest ...
+for (i in 3:length(aaindex)) {
+  #   get the I-vector and write it into our dataframe
+  aaData[,i] <- aaindex[[i]]$I
+}
+
+# Sanity check
+plot(aaData[,37], aaData[,544])  # plot two arbitray inices against each other
+
+# Looks good.
+
+# We finished building our data structure ... but let's add the aacost table
+# aacost is ordered differently:
+rownames(aaData)
+aacost[ , 1]
+
+# using order(), applied to aacost - ordering the column with column-name
+# "aaa"
+sel <- order(aacost[ , "aaa"])  # alphebetic ordering of three-letter codes
+aacost[sel, "aaa"] # applying the order vector sorts the column
+
+# Is this the same order as refNames?
+refNames == aacost[sel, "aaa"]  # Yes!
+
+# add the data from column "tot" (i.e. total metabolic cost) after the
+# last column of aaData
+aaData[ , length(aaindex) + 1] <- aacost[sel, "tot"]
+
+# Done.
+str(aaData)  # A dataframe with 20 rows and 545 columns
+
+# To answer the question "Which amino acids are similar to each other?" we
+# need to reduce this 545-dimensional dataset to fewer dimensions, otherwise
+# we will succumb to the "Curse of Dimensionality":
+#
+#    "in high dimensional data, however, all objects appear
+#     to be sparse and dissimilar in many ways..."
+#                   https://en.wikipedia.org/wiki/Curse_of_dimensionality
+#
+# A classic way to do this is Principal Component Analysis (PCA) ...
+# (Principal components analysis)
+#
+# PCA expects objects in columns, properties in rows. Therefore we need to
+# transpose our dataset:
+
+aaPCA <- prcomp(t(aaData))
+
+# This creates an error, because some of our indicews contain NA values!
+# Which indices are this?
+
+# We create a vector "sel" for which we check whether any element in each
+# column is NA, and write FALSE if we encounter an NA, TRUE otherwise. We can
+# then use this vector to subset ourt dataframe.
+
+sel <- logical()
+
+for (i in 1:ncol(aaData)) {         # for each index
+  if (any(is.na(aaData[,i]))) {     #   if there is any NA value ...
+    sel <- c(sel, FALSE)            #     add a FALSE element to the vector
+  } else {                          #   else
+    sel <- c(sel, TRUE)             #     add a TRUE element
+  }
+}
+
+# Done. sel now subsets only the NA-free columns
+545 - sum(sel)                      # 13 columns excluded
+
+# Do the PCA ... use the prcomp() function
+aaPCA <- prcomp(t(aaData[ ,sel]))   # PCA of the transposed, selected data set
+
+str(aaPCA)   # structure of the result
+
+plot(aaPCA)                         # plot the contributions of the
+                                    # components to the variance
+
+plot(aaPCA$rotation[ , 1],          # plot the first PC against the second PC
+     aaPCA$rotation[ , 2],          # in a scatterplot, in an empty frame
+     type ="n")                     # just to set up the coordinate system
+
+text(aaPCA$rotation[ , 1],          # plot the names of the amino acids into
+     aaPCA$rotation[ , 2],          # their respective (PC1, PC2) positions
+     labels = rownames(aaPCA$rotation))
+
+# PCA results are sensitive to the absolute numeric value of the features that
+# we are comparing. The prcomp() function has an option scale. = TRUE that
+# scales each row of features so that the variance of the value is 1.0  This
+# ensures that each feature is given approximately equal weight
+
+aaPCA <- prcomp(t(aaData[ ,sel]), scale. = TRUE)
+
+plot(aaPCA)
+
+plot(aaPCA$rotation[ , 1],
+     aaPCA$rotation[ , 2],
+     type ="n")
+text(aaPCA$rotation[ , 1],
+     aaPCA$rotation[ , 2],
+     labels = rownames(aaPCA$rotation))
+
+
+# Next we try to identify what the PCs correspond to. We see whether there are
+# specific features that are highly correlated with the PCs
+
+# ==== Rotation 1 ===================
+#
+
+(PC1 <- aaPCA$rotation[ , 1])  # Assign PC1
+
+# The function cor() calculates Pearson coefficients of correlation
+cor(PC1, aaData[ , 37]) # e.g. correlate PC1 against index 37
+
+
+# Iterate over all columns and calculate correlations
+cors <- numeric()
+
+for (i in 1:ncol(aaData)) {
+  cors[i] <- cor(PC1, aaData[ , i])
+}
+
+summary(cors)
+#    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's
+# -0.54072 -0.13703  0.05654  0.03729  0.21349  0.59589       13
+#
+#  The max correlation is ~0.6. That is not very high. Which ijndex is it?
+
+which(cors == max(cors, na.rm = TRUE))
+
+aaindex[[504]]   # Linker propensity ???
+
+cor(PC1, aaindex[[504]]$I) # Did we get the right index?
+
+# Plot this ...
+plot(aaPCA$rotation[ , 1],
+     aaindex[[504]]$I,
+     type ="n")
+text(aaPCA$rotation[ , 1],
+     aaindex[[504]]$I,
+     labels = rownames(aaPCA$rotation))
+
+# This is essentially a random correlation but for Cysteine ...
+
+
+# ==== Rotation 2 ===================
+#
+# same process
+PC2 <- aaPCA$rotation[ , 2]
+
+cors2 <- numeric()
+
+for (i in 1:ncol(aaData)) {
+  cors2[i] <- cor(PC2, aaData[ , i])
+}
+
+summary(cors2)
+#     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's
+# -0.95214 -0.56067 -0.12817 -0.05787  0.43046  0.94346       13
+
+# Here we have quite strong correlations
+
+which(cors2 == max(cors2, na.rm = TRUE))
+
+aaindex[[148]]
+
+# this index itself is correlated with many other indices
+
+cor(PC2, aaindex[[148]]$I)   # confirmn that we have the right index
+
+# Plot this too...
+plot(aaPCA$rotation[ , 2],
+     aaindex[[148]]$I,
+     type ="n")
+text(aaPCA$rotation[ , 2],
+     aaindex[[148]]$I,
+     labels = rownames(aaPCA$rotation))
+
+# This correlates well with hydrophobicity measures. In this case the
+# PC is to a certain degree interpretable - but this is not always the case
+# with PCA (see the example of the first PC).
+
+
+
+
+
+
+# [END]