# bch441-work-abc-units/FND-STA-Information_theory.R
# 2021-10-26 17:04:31 +02:00
#
# 225 lines
# 7.9 KiB
# R
#
# tocID <- "FND-STA-Information_theory.R"
#
# ==============================================================================
#
# Purpose: A Bioinformatics Course:
# R code accompanying the FND-STA-Information_theory unit.
#
# Version: 0.2.1
#
# Date: 2017 - 2021
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 0.2.1 Maintenance
# 0.2 Under development
# 0.1 First code copied from 2016 material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------
#TOC> 1 ___Section___ 39
#TOC>
#TOC> ==========================================================================
# = 1 ___Section___ =======================================================
# What level of information is "significant"
# Assume the background distribution is the database frequencies of
# amino acids:
# Reference background distribution: Uniprot database amino acid
# frequencies, October 2017, slightly adjusted to sum to exactly 1.0 .
AAref <- c(A = 0.0904, C = 0.0123, D = 0.0545, E = 0.0617, F = 0.0394,
           G = 0.0724, H = 0.0221, I = 0.0573, K = 0.0504, L = 0.0986,
           M = 0.0240, N = 0.0392, P = 0.0486, Q = 0.0381, R = 0.0570,
           S = 0.0673, T = 0.0558, V = 0.0686, W = 0.0129, Y = 0.0294)
sum(AAref)  # confirm: the frequencies sum to 1
# Function to calculate Shannon entropy
H <- function(pmf) {
  # Calculate the Shannon entropy of a discrete probability distribution.
  # Parameters:
  #   pmf  (numeric) probability mass function: a vector of states and
  #                  associated probabilities. Each element of
  #                  pmf must be in (0, 1] and sum(pmf) must be 1.
  # Value:
  #   Shannon entropy in bits.
  # Details:
  #   The sum is validated with all.equal(), not "==", because floating
  #   point addition rarely yields exactly 1.0 (tolerance ~ 1.5e-8).
  # Examples:
  #   H(c(A=0.25, C=0.25, G=0.25, T=0.25)) # 2 bits entropy in a random
  #                                        # nucleotide sequence
  #   H(1)  # If all elements are the same, entropy is zero
  #
  if (any(pmf <= 0 | pmf > 1) || isFALSE(all.equal(1.0, sum(pmf)))) {
    stop("Input is not a discrete probability distribution.")
  }
  # log2() directly gives the base-2 logarithm, i.e. entropy in bits
  -sum(pmf * log2(pmf))
}
# Why use all.equal()? Exact comparisons with floating point numbers are
# brittle. Consider for example: the sum below is mathematically 1, but
# the comparison evaluates to FALSE ...
1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6 == 1
# ... because the binary representation of 1/6 is not exact, and the
# accumulated rounding error is visible at full precision:
print(1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6, digits = 22) # 0.9999999999999998889777
# all.equal() tests for _near_ equality with tolerance of ~ 1.5e-8
# Entropy of the database frequencies (in bits). The enclosing parentheses
# make the assignment also print its value:
(Href <- H(AAref))
# for comparison: entropy if all amino acids are equiprobable
# (the uniform distribution maximizes entropy: log2(20) ~ 4.32 bits)
H(rep(0.05, 20))
# Set up a simulation to estimate the distribution of Information values
# from random sequences drawn from AAref. This is the distribution for the
# statistical null hypothesis:
nObs <- 15                       # number of observations (e.g. aligned sequences)
# nObs <- 80
nTrials <- 10000                 # number of trials
IObs <- numeric(nTrials)         # preallocated: one Information value per trial
simCounts <- numeric(20)         # tabulation vector for one trial ...
names(simCounts) <- names(AAref) # ... named like AAref

for (iTrial in seq_len(nTrials)) { # simulate ...
  # draw nObs amino acid letters with the probabilities given by AAref:
  trialAA <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE)
  trialCounts <- table(trialAA)             # tabulate the simulated draw
  simCounts[] <- 0                          # reset all counts to zero
  simCounts[names(trialCounts)] <- trialCounts # fill in observed counts
  simCounts <- simCounts + 0.5              # Jeffreys' pseudocounts: no zeroes
  trialH <- H(simCounts / sum(simCounts))   # counts to frequencies, then entropy
  IObs[iTrial] <- Href - trialH             # Information = entropy decrease
}
# evaluate: plot the null distribution and mark its tails
hist(IObs, col = "#C9F4E3", xlim = c(-0.2, 1.0), breaks = 25)
abline(v = quantile(IObs, c(0.05, 0.95)), col = "#AA00CC")
# The purple lines are drawn at the 5% quantiles of the IObs distribution -
# i.e. an actual observation that lies outside the purple lines is deemed
# "significant"(1)(2). Of course, this is only true to the degree that the
# database frequencies are a valid model for the null-hypothesis on the
# sequence position we are considering here.
#   (1)  If we use 5% quantiles, this means a value is significantly larger
#        than expected, and we ignore cases when the value is < 0; if we
#        consider both smaller and larger values, we need to use 2.5%
#        quantiles, since 5% of all observations lie outside the 0.025 and
#        0.975 quantiles.
#
#   (2)  For an actual observation of counts, we calculate its observed
#        _empirical_p_Value_ as (nCounts + 1)/(nTotal + 1).
# You can probably now appreciate that information is a bit of a shortcut for
# biological sequences, and does not really take the different inherent
# frequencies based on the character of the amino acids into account. For
# example, L is the most frequent and C is the least frequent, but if we have an
# alignment of 1000 sequences and we see that the frequencies for L and C are
# swapped, that would be _very_ surprising - nevertheless, the information would
# be 0. In order to take that into account, we should actually compute
# Kullback-Leibler divergences.

# Build two distributions: p is the reference itself, q is a copy in
# which the C and L frequencies have been exchanged:
p <- AAref
q <- AAref
q[c("C", "L")] <- AAref[c("L", "C")]  # swap most / least frequent aa
# Both have exactly the same entropy - only the labels differ:
H(p)
H(q)
KLdiv <- function(p, q) {
  # Kullback-Leibler divergence between two pmfs of discrete probability
  # distributions over the same outcomes (in the same order).
  # Parameters:
  #   p, q  (numeric) probability vectors of equal length; no element
  #                   may be 0, since log(p / q) must be finite.
  # Value:
  #   sum(p * log(p / q)) - the divergence of q from p, in nats.
  if (length(p) != length(q)) {
    stop("PANIC: input vector lengths differ!")
  }
  if (any(p == 0) || any(q == 0)) {
    stop("PANIC: 0's found in input vectors!")
  }
  sum(p * log(p / q))
}
KLdiv(p, p)  # zero: a distribution has no divergence from itself
KLdiv(p, q)  # nonzero: KL divergence detects the C/L swap, entropy did not

# Repeat the null-hypothesis simulation, this time recording the KL
# divergence of each random sample from the reference distribution:
nObs <- 15                       # number of observations (e.g. aligned sequences)
# nObs <- 80
nTrials <- 10000                 # number of trials
KLdivObs <- numeric(nTrials)     # preallocated: one KL divergence per trial
simCounts <- numeric(20)         # tabulation vector for one trial ...
names(simCounts) <- names(AAref) # ... named like AAref

for (iTrial in seq_len(nTrials)) { # simulate ...
  # draw nObs amino acid letters with the probabilities given by AAref:
  trialAA <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE)
  trialCounts <- table(trialAA)             # tabulate the simulated draw
  simCounts[] <- 0                          # reset all counts to zero
  simCounts[names(trialCounts)] <- trialCounts # fill in observed counts
  simCounts <- simCounts + 0.5              # Jeffreys' pseudocounts: no zeroes
  simCounts <- simCounts / sum(simCounts)   # counts to frequencies
  KLdivObs[iTrial] <- sum(simCounts * log(simCounts / AAref)) # store KLdiv
}
# evaluate: plot the null distribution of KL divergences and mark its tails
hist(KLdivObs, col = "#C9F4E3", breaks = 25)
abline(v = quantile(KLdivObs, c(0.05, 0.95)), col = "#AA00CC")
# The 0.992 quantile corresponds to the p = 0.008 significance claim
# discussed below:
quantile(KLdivObs, 0.992)

# Running the simulation with KL does not give a fundamentally
# different behaviour - since we are just randomly sampling. But KL would be
# more sensitive in case there is biological selection, where the sampling is no
# longer random. If I run the same simulation, with nObs <- 80 but calculating
# KLdiv instead of information, I get a 5% quantile at 0.15 - but the C/L
# frequency swap gives me a KL divergence of 0.18 - this is significant at p =
# 0.008 - (remember, Information is 0 in this case). So that's actually quite a
# nice addition to the toolbox.

# [END]