# 2021-10-12_In-Class_exploration.R
#
#         =====  T H E   E V E N   B E T T E R   A M I N O   A C I D =====
#
# Code and comments for BCH441 in-class exploration, Tuesday, 2021-10-12
# Explorers:  Jocelyn Nurtanto, Yuzi Li, and  Jerry Gu
# Scribe:     boris.steipe@utoronto.ca
#
# ==============================================================================
#
# In our last session we explored some properties of amino acids and noted that
# we can arrange them in a scatter-plot according to some properties. But can
# we also arrange them according to generic properties, i.e. taking all
# published property scales into account? We will try to use all tables from
# the seqinr package.

# First we load the package - this makes all datasets immediately available and
# we don't have to load them one by one.

library(seqinr)

# Determine what datasets are available
#
# Using "find in topic" ... "amino acid"
data(aacost)
data(aaindex)
data(pK)

# We note that datasets may be sorted in different ways: for example
# alphabetically by one letter code (A, C, D, E, ...) or three-letter code (Ala,
# Arg, Asn, Asp, ...) - this means we need to ensure and validate that amino
# acids are sorted in the same way.

# Build a datastructure ...
# rows: amino acids
# columns: properties

# Are all lists in aaindex organized in the same way?

refNames <- names(aaindex[[1]]$I) # Take the names of the first index's
                                  # I-vector as the reference order

# Loop over each list in aaindex. seq_along() is used instead of
# 1:length(aaindex) because the latter counts down (1, 0) for an empty list.
for (i in seq_along(aaindex)) {
  #   get the names of the I-vector
  x <- names(aaindex[[i]]$I)
  #   compare with the names of our reference list.
  #   The == and != operators are vectorized: applying them to two vectors
  #   gives TRUE or FALSE for each pair of elements, and any() or all() can
  #   be applied to such a logical vector to analyse it and return a single
  #   result. But an element-wise comparison recycles the shorter vector, and
  #   comparing against NULL (no names at all) gives a zero-length result for
  #   which any() is FALSE - such a problem would go unnoticed. identical()
  #   returns exactly one TRUE or FALSE, as if (...) requires, and is TRUE
  #   only if both vectors have the same length and the same elements in the
  #   same order.

  if (!identical(x, refNames)) {
    # The names differ from the reference in some way - so: complain
    print(sprintf("Problem in list %d: names don't match", i))
  }
}

# If we get here without identifying problems, it means all pairs of
# rownames match throughout the aaindex list.


# Next: work out the correct syntax for adding one vector (the "I" vector of
# one of the list elements) as a new column of our dataframe.
aaData <- as.data.frame(aaindex[[1]]$I) # column 1: the first index
aaData[ , 2] <- aaindex[[2]]$I          # column 2: the second index

str(aaData)  # Confirm: we now have a two-column dataframe

# That works - so append the remaining indices the same way: the I-vector of
# list element k is written into column k.
for (k in seq(3, length(aaindex))) {
  aaData[ , k] <- aaindex[[k]]$I
}

# Sanity check: plot two arbitrary indices against each other
plot(x = aaData[ , 37], y = aaData[ , 544])

# Looks good.

# We finished building our data structure ... but let's add the aacost table
# aacost is ordered differently:
rownames(aaData)
aacost[ , 1]

# using order(), applied to aacost - ordering the column with column-name
# "aaa"
sel <- order(aacost[ , "aaa"])  # alphabetic ordering of three-letter codes
aacost[sel, "aaa"] # applying the order vector sorts the column

# Is this the same order as refNames?
refNames == aacost[sel, "aaa"]  # Yes!

# Looking at the printed comparison is not enough to protect the data: if even
# one amino acid were out of place, the cost values below would be attached to
# the wrong rows without any complaint. Stop here unless every position
# matches (as.character() makes the comparison independent of whether the
# "aaa" column is stored as a factor or as character).
stopifnot(length(sel) == length(refNames),
          all(refNames == as.character(aacost[sel, "aaa"])))

# add the data from column "tot" (i.e. total metabolic cost) after the
# last column of aaData. The target column is addressed by a fixed position,
# so re-running this line overwrites the column instead of adding another.
aaData[ , length(aaindex) + 1] <- aacost[sel, "tot"]

# Done.
str(aaData)  # A dataframe with 20 rows and 545 columns

# To answer the question "Which amino acids are similar to each other?" we
# need to reduce this 545-dimensional dataset to fewer dimensions, otherwise
# we will succumb to the "Curse of Dimensionality":
#
#    "in high dimensional data, however, all objects appear
#     to be sparse and dissimilar in many ways..."
#                   https://en.wikipedia.org/wiki/Curse_of_dimensionality
#
# A classic way to do this is Principal Component Analysis (PCA).
#
# prcomp() treats the rows of its input as observations and the columns as
# variables. We transpose our dataset so that the property indices become the
# rows and the amino acids become the columns; the amino acids then appear as
# the rows of the $rotation matrix of the result, which is what we plot
# further below.
#
# This first attempt is expected to fail with an error. It is wrapped in try()
# so that the error message is still shown, but the script continues when it
# is run as a whole with source().

try(aaPCA <- prcomp(t(aaData)))

# The prcomp() call on the full dataset creates an error, because some of our
# indices contain NA values! Which indices are these?

# We create a logical vector "sel" with one element per column of aaData:
# FALSE if the column contains any NA, TRUE otherwise. We can then use this
# vector to subset our dataframe.
#
# A dataframe is a list of columns, so vapply() can apply the test to each
# column in turn and collect the results into a logical vector of the right
# length in one step - this avoids growing the vector element by element
# with c() inside a loop. USE.NAMES = FALSE gives a plain, unnamed vector.

sel <- vapply(aaData,
              function(column) !anyNA(column),   # TRUE: column is NA-free
              logical(1),
              USE.NAMES = FALSE)

# Done. sel now subsets only the NA-free columns
ncol(aaData) - sum(sel)             # number of excluded columns (13 here)

# Run the PCA with prcomp() - this time only on the columns selected by sel,
# i.e. the indices without NA values. The data is transposed as before.
aaPCA <- prcomp(t(aaData[ , sel]))

str(aaPCA)   # inspect the structure of the result

# The plot method for the result shows the contributions of the components
# to the variance.
plot(aaPCA)

# Scatterplot of the first against the second column of the rotation matrix
# (one row per amino acid). First draw an empty frame, just to set up the
# coordinate system ...
plot(x = aaPCA$rotation[ , 1],
     y = aaPCA$rotation[ , 2],
     type = "n")

# ... then write the name of each amino acid at its (PC1, PC2) position.
text(x = aaPCA$rotation[ , 1],
     y = aaPCA$rotation[ , 2],
     labels = rownames(aaPCA$rotation))

# PCA results are sensitive to the absolute numeric values that are being
# compared. The prcomp() function has an option scale. = TRUE that rescales
# the variables to a variance of 1.0 before the analysis, so that they are
# given approximately equal weight.
#
# NOTE(review): prcomp() applies this scaling to the columns of the matrix it
# receives. Because we pass the transposed data, those columns are the amino
# acids, not the individual property indices - so it is the amino acid
# columns that are standardised here. Giving every index equal weight would
# require scaling the columns of aaData before transposing. The call is left
# as it was run in class, since the results recorded below depend on it.

aaPCA <- prcomp(t(aaData[ , sel]), scale. = TRUE)

plot(aaPCA)   # contributions of the components to the variance

# Same scatterplot as for the unscaled analysis: empty frame first, then the
# amino acid names at their (PC1, PC2) positions.
plot(x = aaPCA$rotation[ , 1],
     y = aaPCA$rotation[ , 2],
     type = "n")
text(x = aaPCA$rotation[ , 1],
     y = aaPCA$rotation[ , 2],
     labels = rownames(aaPCA$rotation))


# Next we try to identify what the PCs correspond to. We see whether there are
# specific features that are highly correlated with the PCs

# ==== Rotation 1 ===================
#

(PC1 <- aaPCA$rotation[ , 1])  # Assign PC1 (the outer parentheses print it)

# The function cor() calculates Pearson coefficients of correlation
cor(PC1, aaData[ , 37]) # e.g. correlate PC1 against index 37


# Calculate the correlation of PC1 with every column of aaData. vapply()
# visits each column of the dataframe and returns a numeric vector with one
# value per column, without growing a result vector inside a loop. Columns
# that contain NA values give NA.
cors <- vapply(aaData,
               function(index) cor(PC1, index),
               numeric(1),
               USE.NAMES = FALSE)

summary(cors)
#    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's
# -0.54072 -0.13703  0.05654  0.03729  0.21349  0.59589       13
#
#  The max correlation is ~0.6. That is not very high. Which index is it?

# which.max() returns the position of the largest value and skips NAs; this
# avoids testing floating point numbers for equality with ==.
which.max(cors)

aaindex[[504]]   # Linker propensity ???

cor(PC1, aaindex[[504]]$I) # Did we get the right index?

# Plot this ...
plot(aaPCA$rotation[ , 1],
     aaindex[[504]]$I,
     type ="n")
text(aaPCA$rotation[ , 1],
     aaindex[[504]]$I,
     labels = rownames(aaPCA$rotation))

# This is essentially a random correlation but for Cysteine ...


# ==== Rotation 2 ===================
#
# same process
PC2 <- aaPCA$rotation[ , 2]

# Correlation of PC2 with every column of aaData: one value per column,
# NA for columns that contain NA values. vapply() builds the whole vector
# at once instead of growing it inside a loop.
cors2 <- vapply(aaData,
                function(index) cor(PC2, index),
                numeric(1),
                USE.NAMES = FALSE)

summary(cors2)
#     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's
# -0.95214 -0.56067 -0.12817 -0.05787  0.43046  0.94346       13

# Here we have quite strong correlations

# Position of the largest correlation; which.max() skips NAs and does not
# rely on comparing floating point numbers with ==.
which.max(cors2)

aaindex[[148]]

# this index itself is correlated with many other indices

cor(PC2, aaindex[[148]]$I)   # confirm that we have the right index

# Plot this too...
plot(aaPCA$rotation[ , 2],
     aaindex[[148]]$I,
     type ="n")
text(aaPCA$rotation[ , 2],
     aaindex[[148]]$I,
     labels = rownames(aaPCA$rotation))

# This correlates well with hydrophobicity measures. In this case the
# PC is to a certain degree interpretable - but this is not always the case
# with PCA (see the example of the first PC).






# [END]