bch441-work-abc-units/2021-10-12_In-Class_exploration.R

# 2021-10-12_In-Class_exploration.R
#
#         =====  T H E   E V E N   B E T T E R   A M I N O   A C I D =====
#
# Code and comments for BCH441 in-class exploration, Tuesday, 2021-10-12
# Explorers:  Jocelyn Nurtanto, Yuzi Li, and  Jerry Gu
# Scribe:     boris.steipe@utoronto.ca
#
# ==============================================================================
#
# In our last session we explored some properties of amino acids and noted that
# we can arrange them in a scatter-plot according to some properties. But can
# we also arrange them according to generic properties, i.e. taking all
# published property scales into account? We will try to use all tables from
# the seqinr package.

# First we load the package - this makes all datasets immediately available and
# we don't have to load them one by one.

library(seqinr)

# Determine what datasets are available
#
# Using "find in topic" ... "amino acid"
data(aacost)
data(aaindex)
data(pK)

# We note that datasets may be sorted in different ways: for example
# alphabetically by one letter code (A, C, D, E, ...) or three-letter code (Ala,
# Arg, Asn, Asp, ...) - this means we need to ensure and validate that amino
# acids are sorted in the same way.

# Build a datastructure ...
# rows: amino acids
# columns: properties

# Are all lists in aaindex organized in the same way?

refNames <- names(aaindex[[1]]$I) # Take the names of the first list item's
                                  # index vector as the reference

# Loop over each list in aaindex. seq_along() is preferred over 1:length():
# for an empty list it yields an empty sequence instead of c(1, 0).
for (i in seq_along(aaindex)) {
  # get the names of the I-vector
  x <- names(aaindex[[i]]$I)

  # Compare with the names of our reference list.
  # The == and != operators are vectorized. Applying them to two vectors
  # gives TRUE or FALSE for each pair of elements. any() or all() can be
  # applied to logical vectors to analyse them and return a single result.
  # if (...) conditions evaluate only a single value and will throw a warning
  # (an error since R 4.2.0) if there is more than one.
  #
  # We compare the lengths first: a vector without names (NULL) or one of a
  # different length would otherwise be recycled or compared as an empty
  # vector, and could pass unnoticed. isTRUE() treats an NA result (a missing
  # name) as a mismatch instead of letting if () fail.
  if (length(x) != length(refNames) || !isTRUE(all(x == refNames))) {
    # There was at least one not-equal pair - so: complain
    print(sprintf("Problem in list %d: names don't match", i))
  }
}

# If we get here without identifying problems, it means all pairs of
# rownames match throughout the aaindex list.


# Next: what is the correct syntax to add one vector (the "I" vector of
# one of the list elements) to our dataframe?
aaData <- as.data.frame(aaindex[[1]]$I) # Make a dataframe from the first index
aaData[ , 2] <- aaindex[[2]]$I          # ... add the second index

str(aaData)  # Confirm: we now have a two-column dataframe

# Next: add the rest ...
# seq_along(aaindex)[-(1:2)] is every position after the second one. Unlike
# 3:length(aaindex) it is simply empty - rather than counting down 3, 2, ... -
# if the list has fewer than three elements.
for (i in seq_along(aaindex)[-(1:2)]) {
  #   get the I-vector and write it into column i of our dataframe
  aaData[ , i] <- aaindex[[i]]$I
}

# Sanity check
plot(aaData[ , 37], aaData[ , 544])  # plot two arbitrary indices against
                                     # each other

# Looks good.

# We finished building our data structure ... but let's add the aacost table
# aacost is ordered differently:
rownames(aaData)
aacost[ , 1]

# using order(), applied to aacost - ordering the column with column-name
# "aaa"
sel <- order(aacost[ , "aaa"])  # alphabetic ordering of three-letter codes
aacost[sel, "aaa"] # applying the order vector sorts the column

# Is this the same order as refNames?
refNames == aacost[sel, "aaa"]  # Yes!

# Don't rely on eyeballing the output above: stop right here if the reordered
# aacost rows do not line up one-to-one with the amino acids in refNames.
# Otherwise the cost values would silently be attached to the wrong rows.
stopifnot(length(sel) == length(refNames),
          all(refNames == as.character(aacost[sel, "aaa"])))

# add the data from column "tot" (i.e. total metabolic cost) after the
# last column of aaData
aaData[ , ncol(aaData) + 1] <- aacost[sel, "tot"]

# Done.
str(aaData)  # A dataframe with 20 rows and 545 columns

# To answer the question "Which amino acids are similar to each other?" we
# need to reduce this 545-dimensional dataset to fewer dimensions, otherwise
# we will succumb to the "Curse of Dimensionality":
#
#    "in high dimensional data, however, all objects appear
#     to be sparse and dissimilar in many ways..."
#                   https://en.wikipedia.org/wiki/Curse_of_dimensionality
#
# A classic way to do this is Principal Component Analysis (PCA) ...
# (Principal components analysis)
#
# prcomp() treats the rows of its input as observations and the columns as
# variables. We transpose our dataset so that the amino acids become the
# columns: each amino acid then gets one value per principal component in
# the $rotation matrix of the result, and that is what we plot below.

# We expect this call to fail. Wrapping it in try() still reports the error
# but lets the script continue when it is run with source().
try(aaPCA <- prcomp(t(aaData)))

# This creates an error, because some of our indices contain NA values!
# Which indices are these?

# We create a logical vector "sel" with one element per column of aaData:
# FALSE if the column contains any NA, TRUE otherwise. We can then use this
# vector to subset our dataframe.
#
# vapply() applies anyNA() to every column and guarantees one logical value
# per column; this replaces growing the vector element by element in a loop.
sel <- !vapply(aaData, anyNA, logical(1), USE.NAMES = FALSE)

# Done. sel now subsets only the NA-free columns
sum(!sel)                           # 13 columns excluded

# Do the PCA ... use the prcomp() function.
# Input: the NA-free columns of aaData, transposed, so that the amino acids
# are the columns (variables) of the matrix passed to prcomp().
aaPCA <- prcomp(t(aaData[ ,sel]))   # PCA of the transposed, selected data set

str(aaPCA)   # structure of the result

plot(aaPCA)                         # plot the contributions of the
                                    # components to the variance

# aaPCA$rotation has one row per amino acid and one column per PC. We use its
# first two columns as the x- and y-coordinates of each amino acid.
plot(aaPCA$rotation[ , 1],          # plot the first PC against the second PC
     aaPCA$rotation[ , 2],          # in a scatterplot, in an empty frame
     type ="n")                     # just to set up the coordinate system

text(aaPCA$rotation[ , 1],          # plot the names of the amino acids into
     aaPCA$rotation[ , 2],          # their respective (PC1, PC2) positions
     labels = rownames(aaPCA$rotation))

# PCA results are sensitive to the absolute numeric values of the data that
# we are comparing. The prcomp() function has an option scale. = TRUE that
# scales each column (variable) of its input to a variance of 1.0 before the
# analysis.
# NOTE(review): since we pass the transposed data, the columns that get scaled
# here are the amino acids, not the individual property indices. To give each
# index approximately equal weight, the columns of aaData would have to be
# scaled before transposing - confirm which of the two is intended.

aaPCA <- prcomp(t(aaData[ ,sel]), scale. = TRUE)

plot(aaPCA)

# Same plot as above, now for the scaled PCA: empty frame first, then the
# amino acid names at their (PC1, PC2) positions.
plot(aaPCA$rotation[ , 1],
     aaPCA$rotation[ , 2],
     type ="n")
text(aaPCA$rotation[ , 1],
     aaPCA$rotation[ , 2],
     labels = rownames(aaPCA$rotation))


# Next we try to identify what the PCs correspond to. We see whether there are
# specific features that are highly correlated with the PCs

# ==== Rotation 1 ===================
#

(PC1 <- aaPCA$rotation[ , 1])  # Assign PC1 (the parentheses also print it)

# The function cor() calculates Pearson coefficients of correlation
cor(PC1, aaData[ , 37]) # e.g. correlate PC1 against index 37


# Calculate the correlation of PC1 with every column of aaData. vapply()
# visits each column and guarantees exactly one number per column. Columns
# that contain NA values give NA.
cors <- vapply(aaData, function(idx) cor(PC1, idx), numeric(1),
               USE.NAMES = FALSE)

summary(cors)
#    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's
# -0.54072 -0.13703  0.05654  0.03729  0.21349  0.59589       13
#
#  The max correlation is ~0.6. That is not very high. Which index is it?

# which.max() skips NA values and returns the position of the maximum.
# We use the computed position instead of typing the number in by hand, so
# the following lines stay correct if the data changes.
iBest1 <- which.max(cors)
iBest1                         # this was 504 in our class session

# The last column of aaData may hold the aacost values, which have no entry
# in aaindex - only look up the description if the position is inside aaindex.
if (iBest1 <= length(aaindex)) {
  print(aaindex[[iBest1]])               # Linker propensity ???
  print(cor(PC1, aaindex[[iBest1]]$I))   # Did we get the right index?
}

# Plot this ...
bestPC1 <- aaData[ , iBest1]   # the values of the best-correlated column
plot(PC1,
     bestPC1,
     type = "n",
     xlab = "PC1",
     ylab = sprintf("aaData column %d", iBest1))
text(PC1,
     bestPC1,
     labels = names(PC1))

# This is essentially a random correlation but for Cysteine ...


# ==== Rotation 2 ===================
#
# same process
PC2 <- aaPCA$rotation[ , 2]

# Correlation of PC2 with every column of aaData (NA for columns with NAs).
cors2 <- vapply(aaData, function(idx) cor(PC2, idx), numeric(1),
                USE.NAMES = FALSE)

summary(cors2)
#     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's
# -0.95214 -0.56067 -0.12817 -0.05787  0.43046  0.94346       13

# Here we have quite strong correlations
#
# NOTE(review): the sign of a principal component is arbitrary, so the most
# negative correlation (-0.95 above) is just as strong as the maximum.
# which.max(abs(cors2)) would find the strongest correlation of either sign.

# which.max() skips NA values and returns the position of the maximum.
iBest2 <- which.max(cors2)
iBest2                         # this was 148 in our class session

# Only look up the description if the position is inside aaindex (the last
# column of aaData may hold the aacost values instead).
if (iBest2 <= length(aaindex)) {
  print(aaindex[[iBest2]])
  # this index itself is correlated with many other indices

  print(cor(PC2, aaindex[[iBest2]]$I))   # confirm that we have the right index
}

# Plot this too...
bestPC2 <- aaData[ , iBest2]   # the values of the best-correlated column
plot(PC2,
     bestPC2,
     type = "n",
     xlab = "PC2",
     ylab = sprintf("aaData column %d", iBest2))
text(PC2,
     bestPC2,
     labels = names(PC2))

# This correlates well with hydrophobicity measures. In this case the
# PC is to a certain degree interpretable - but this is not always the case
# with PCA (see the example of the first PC).


# [END]
Class session notes for 2021-10-12 2021-10-19 20:44:34 +00:00			`# 2021-10-12_In-Class_exploration.R`
			`#`
			`# ===== T H E E V E N B E T T E R A M I N O A C I D =====`
			`#`
			`# Code and comments for BCH441 in-class exploration, Tuesday, 2021-10-12`
			`# Explorers: Jocelyn Nurtanto, Yuzi Li, and Jerry Gu`
			`# Scribe: boris.steipe@utoronto.ca`
			`#`
			`# ==============================================================================`
			`#`
			`# In our last session we explored some properties of amino acids and noted that`
			`# we can arrange them in a scatter-plot according to some properties. But can`
			`# we also arrange them according to generic properties, i.e. taking all`
			`# published property scales into account? We will try to use all tables from`
			`# the seqinr package.`

			`# First we load the package - this makes all datasets immediately available and`
			`# we don't have to load them one by one.`

			`library(seqinr)`

			`# Determine what datasets are available`
			`#`
			`# Using "find in topic" ... "amino acid"`
			`data(aacost)`
			`data(aaindex)`
			`data(pK)`

			`# We note that datasets may be sorted in different ways: for example`
			`# alphabetically by one letter code (A, C, D, E, ...) or three-letter code (Ala,`
			`# Arg, Asn, Asp, ...) - this means we need to ensure and validate that amino`
			`# acids are sorted in the same way.`

			`# Build a datastructure ...`
			`# rows: amino acids`
			`# columns: properties`

			`# Are all lists in aaindex organized in the same way?`

			`refNames <- names(aaindex[[1]]$I) # Take the rownames of the first list item`
			`# index as a reference list`

			`# Loop over each list in aaindex`
			`for (i in 1:length(aaindex)) {`
			`# get the I-vector`
			`x <- aaindex[[i]]$I`
			`# get the names`
			`x <- names(x)`
			`# compare with the names of our reference list`
			`# the == and != operators are vectorized. Applying them to two vectors`
			`# gives TRUE or FALSE for each pair of elements. any() or all() can be`
			`# applied to logical vectors to anylise them and return a soingle result.`
			`# if (...) conditions evaluate only a single value and will throw a warning if`
			`# there is more than one.`

			`if (any(x != refNames)) {`
			`# There was at least one not-equal pair - so: complain`
			`print(sprintf("Problem in list %d: names don't match", i))`
			`}`
			`}`

			`# If we get here without identifying problems, it means all pairs of`
			`# rownames match throughout the aainfex list.`


			`# Next: what is the cvorrect syntax to add one vector (the "I" vector of`
			`# one of the list elements) to our dataframe?`
			`aaData <- as.data.frame(aaindex[[1]]$I) # Make a dataframe from the first index`
			`aaData[,2] <- aaindex[[2]]$I # ... add the secondf index`

			`str(aaData) # Confirm: we now have a two-column dataframe`

			`# Next: add the rest ...`
			`for (i in 3:length(aaindex)) {`
			`# get the I-vector and write it into our dataframe`
			`aaData[,i] <- aaindex[[i]]$I`
			`}`

			`# Sanity check`
			`plot(aaData[,37], aaData[,544]) # plot two arbitray inices against each other`

			`# Looks good.`

			`# We finished building our data structure ... but let's add the aacost table`
			`# aacost is ordered differently:`
			`rownames(aaData)`
			`aacost[ , 1]`

			`# using order(), applied to aacost - ordering the column with column-name`
			`# "aaa"`
			`sel <- order(aacost[ , "aaa"]) # alphebetic ordering of three-letter codes`
			`aacost[sel, "aaa"] # applying the order vector sorts the column`

			`# Is this the same order as refNames?`
			`refNames == aacost[sel, "aaa"] # Yes!`

			`# add the data from column "tot" (i.e. total metabolic cost) after the`
			`# last column of aaData`
			`aaData[ , length(aaindex) + 1] <- aacost[sel, "tot"]`

			`# Done.`
			`str(aaData) # A dataframe with 20 rows and 545 columns`

			`# To answer the question "Which amino acids are similar to each other?" we`
			`# need to reduce this 545-dimensional dataset to fewer dimensions, otherwise`
			`# we will succumb to the "Curse of Dimensionality":`
			`#`
			`# "in high dimensional data, however, all objects appear`
			`# to be sparse and dissimilar in many ways..."`
			`# https://en.wikipedia.org/wiki/Curse_of_dimensionality`
			`#`
			`# A classic way to do this is Principal Component Analysis (PCA) ...`
			`# (Principal components analysis)`
			`#`
			`# PCA expects objects in columns, properties in rows. Therefore we need to`
			`# transpose our dataset:`

			`aaPCA <- prcomp(t(aaData))`

			`# This creates an error, because some of our indicews contain NA values!`
			`# Which indices are this?`

			`# We create a vector "sel" for which we check whether any element in each`
			`# column is NA, and write FALSE if we encounter an NA, TRUE otherwise. We can`
			`# then use this vector to subset ourt dataframe.`

			`sel <- logical()`

			`for (i in 1:ncol(aaData)) { # for each index`
			`if (any(is.na(aaData[,i]))) { # if there is any NA value ...`
			`sel <- c(sel, FALSE) # add a FALSE element to the vector`
			`} else { # else`
			`sel <- c(sel, TRUE) # add a TRUE element`
			`}`
			`}`

			`# Done. sel now subsets only the NA-free columns`
			`545 - sum(sel) # 13 columns excluded`

			`# Do the PCA ... use the prcomp() function`
			`aaPCA <- prcomp(t(aaData[ ,sel])) # PCA of the transposed, selected data set`

			`str(aaPCA) # structure of the result`

			`plot(aaPCA) # plot the contributions of the`
			`# components to the variance`

			`plot(aaPCA$rotation[ , 1], # plot the first PC against the second PC`
			`aaPCA$rotation[ , 2], # in a scatterplot, in an empty frame`
			`type ="n") # just to set up the coordinate system`

			`text(aaPCA$rotation[ , 1], # plot the names of the amino acids into`
			`aaPCA$rotation[ , 2], # their respective (PC1, PC2) positions`
			`labels = rownames(aaPCA$rotation))`

			`# PCA results are sensitive to the absolute numeric value of the features that`
			`# we are comparing. The prcomp() function has an option scale. = TRUE that`
			`# scales each row of features so that the variance of the value is 1.0 This`
			`# ensures that each feature is given approximately equal weight`

			`aaPCA <- prcomp(t(aaData[ ,sel]), scale. = TRUE)`

			`plot(aaPCA)`

			`plot(aaPCA$rotation[ , 1],`
			`aaPCA$rotation[ , 2],`
			`type ="n")`
			`text(aaPCA$rotation[ , 1],`
			`aaPCA$rotation[ , 2],`
			`labels = rownames(aaPCA$rotation))`


			`# Next we try to identify what the PCs correspond to. We see whether there are`
			`# specific features that are highly correlated with the PCs`

			`# ==== Rotation 1 ===================`
			`#`

			`(PC1 <- aaPCA$rotation[ , 1]) # Assign PC1`

			`# The function cor() calculates Pearson coefficients of correlation`
			`cor(PC1, aaData[ , 37]) # e.g. correlate PC1 against index 37`


			`# Iterate over all columns and calculate correlations`
			`cors <- numeric()`

			`for (i in 1:ncol(aaData)) {`
			`cors[i] <- cor(PC1, aaData[ , i])`
			`}`

			`summary(cors)`
			`# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's`
			`# -0.54072 -0.13703 0.05654 0.03729 0.21349 0.59589 13`
			`#`
			`# The max correlation is ~0.6. That is not very high. Which ijndex is it?`

			`which(cors == max(cors, na.rm = TRUE))`

			`aaindex[[504]] # Linker propensity ???`

			`cor(PC1, aaindex[[504]]$I) # Did we get the right index?`

			`# Plot this ...`
			`plot(aaPCA$rotation[ , 1],`
			`aaindex[[504]]$I,`
			`type ="n")`
			`text(aaPCA$rotation[ , 1],`
			`aaindex[[504]]$I,`
			`labels = rownames(aaPCA$rotation))`

			`# This is essentially a random correlation but for Cysteine ...`


			`# ==== Rotation 2 ===================`
			`#`
			`# same process`
			`PC2 <- aaPCA$rotation[ , 2]`

			`cors2 <- numeric()`

			`for (i in 1:ncol(aaData)) {`
			`cors2[i] <- cor(PC2, aaData[ , i])`
			`}`

			`summary(cors2)`
			`# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's`
			`# -0.95214 -0.56067 -0.12817 -0.05787 0.43046 0.94346 13`

			`# Here we have quite strong correlations`

			`which(cors2 == max(cors2, na.rm = TRUE))`

			`aaindex[[148]]`

			`# this index itself is correlated with many other indices`

			`cor(PC2, aaindex[[148]]$I) # confirmn that we have the right index`

			`# Plot this too...`
			`plot(aaPCA$rotation[ , 2],`
			`aaindex[[148]]$I,`
			`type ="n")`
			`text(aaPCA$rotation[ , 2],`
			`aaindex[[148]]$I,`
			`labels = rownames(aaPCA$rotation))`

			`# This correlates well with hydrophobicity measures. In this case the`
			`# PC is to a certain degree interpretable - but this is not always the case`
			`# with PCA (see the example of the first PC).`






			`# [END]`