bch441-work-abc-units/FND-STA-Information_theory.R

# FND-STA-Information_theory.R
#
# Purpose:  A Bioinformatics Course:
#              R code accompanying the FND-STA-Information_theory unit.
#
# Version:  0.2
#
# Date:     2017  MM  DD
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
#           0.2    Under development
#           0.1    First code copied from 2016 material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================


# = 1 ___Section___

# What level of information is "significant"

# Assume the background distribution is the database frequencies of
# amino acids:

AAref <- numeric()  # Uniprot frequencies October 2017, slightly adjusted to
# sum to 1.0
AAref["A"] <- 0.0904
AAref["C"] <- 0.0123
AAref["D"] <- 0.0545
AAref["E"] <- 0.0617
AAref["F"] <- 0.0394
AAref["G"] <- 0.0724
AAref["H"] <- 0.0221
AAref["I"] <- 0.0573
AAref["K"] <- 0.0504
AAref["L"] <- 0.0986
AAref["M"] <- 0.0240
AAref["N"] <- 0.0392
AAref["P"] <- 0.0486
AAref["Q"] <- 0.0381
AAref["R"] <- 0.0570
AAref["S"] <- 0.0673
AAref["T"] <- 0.0558
AAref["V"] <- 0.0686
AAref["W"] <- 0.0129
AAref["Y"] <- 0.0294
sum(AAref)

# Function to calculate Shannon entropy
H <- function(v) {
  # Shannon entropy (bits)
  return(-sum(v * (log(v) / log(2))))
}


# Entropy of the database frequencies (in bits):
(Href <- H(AAref))

# for comparison: entropy if all amino acids are equiprobable
H(rep(0.05, 20))


# Set up a simulation to estimate the distribution of Information values
# from random sequences drawn from AAref. This is the distribution for the
# statistical null hypothesis:
nObs <- 15                      # number of observations (e.g aligned sequences)
# nObs <- 80
nTrials <- 10000                # number of trials
IObs <- numeric(nTrials)        # vector to store Information in each trial
simCounts <- numeric(20)        # vector to tabulate our information ...
names(simCounts) <- names(AAref)# ... with the names of AAref


for (i in 1:nTrials) {  # simulate ...

  # sample AAref letters, nObs times, with the probabilities of AAref:
  AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE)

  x <- table(AAobs)                            # table simulated observations
  simCounts[1:20] <- rep(0, length(simCounts)) # initialize simCounts to 0
  simCounts[names(x)] <- x                     # overwrite with observed counts
  simCounts <- simCounts + 0.5                 # add Jeffreys' pseudocounts
  Hobs <- H(simCounts/sum(simCounts))          # counts to frequency, calc. H
  IObs[i] <- Href - Hobs                       # store information
}

# evaluate
hist(IObs, col = "#C9F4E3", xlim = c(-0.2, 1.0), breaks = 25)
abline(v = quantile(IObs, c(0.05, 0.95)), col = "#AA00CC")

# The purple lines are drawn at the 5% quantiles of the Iobs distributions -
# i.e. an actual observation that lies outside the purple lines is deemed
# "significant"(1)(2). Of course, this is only true to the degree that the
# database frequencies are a valid model for the null-hypothesis on the
# sequence position we are considering here.

#  (1) If we use 5% quantiles, this means a value is significantly larger
#      than expected, and we ignore cases when the value is < 0; if we
#      consider both smaller and larger values, we need to use 2.5% quantiles,
#      since 5% of all observations lie outside the 0.025 and 0.975
#      quantiles.
#
#  (2) For an actual observation of counts, we calculate its observed
#      _empirical_p_Value_ as (nCounts + 1)/(nTotal + 1).


# You can probably now appreciate that information is a bit of a shortcut for
# biological sequences, and does not really take the different inherent
# frequencies based on the character of the amino acids into account. For
# example, L is the most frequent and C is the least frequent, but if we have an
# alignment of 1000 sequences and we see that the frequencies for L and C are
# swapped, that would be _very_ surprising - nevertheless, the information would
# be 0. In order to take that into account, we should actually compute
# Kullback-Leibler divergences.


# Swap C and L frequencies
p <- AAref
q <- AAref
q["L"] <- AAref["C"]
q["C"] <- AAref["L"]
H(p)
H(q)

KLdiv <- function(p, q) {
  # p and q are two pmfs of discrete probability distributions
  # with the same outcomes, which are nowhere 0.
  # Value:  Kullback-Leibler divergence  sum(p * log( p / q))).

  if (length(p) != length(q)) {
    stop("PANIC: input vector lengths differ!")
  }
  if (any(c((p == 0), (q == 0)))) {
    stop("PANIC: 0's found in input vectors!")
  }

  return(sum(p * log( p / q )))
}

KLdiv(p, p)
KLdiv(p, q)


nObs <- 15                      # number of observations (e.g aligned sequences)
# nObs <- 80
nTrials <- 10000                # number of trials
KLdivObs <- numeric(nTrials)        # vector to store Information in each trial
simCounts <- numeric(20)        # vector to tabulate our information ...
names(simCounts) <- names(AAref)# ... with the names of AAref


for (i in 1:nTrials) {  # simulate ...

  # sample AAref letters, nObs times, with the probabilities of AAref:
  AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE)

  x <- table(AAobs)                            # table simulated observations
  simCounts[1:20] <- rep(0, length(simCounts)) # initialize simCounts to 0
  simCounts[names(x)] <- x                     # overwrite with observed counts
  simCounts <- simCounts + 0.5                 # add Jeffreys' pseudocounts
  simCounts <- simCounts/sum(simCounts)        # counts to frequency
  KLdivObs[i] <- sum(simCounts * log( simCounts / AAref )) # store KLdiv
}

# evaluate
hist(KLdivObs, col = "#C9F4E3", breaks = 25)
abline(v = quantile(KLdivObs, c(0.05, 0.95)), col = "#AA00CC")
quantile(KLdivObs, 0.992)

# Running the simulation with KL does not give a fundamentally
# different behaviour - since we are just randomly sampling. But KL would be
# more sensitive in case there is biological selection, where the sampling is no
# longer random. If I run the same simulation, with nObs <- 80 but calculating
# KLdiv instead of information, I get a 5% quantile at 0.15 - but the C/L
# frequency swap gives me a KL divergence of 0.18 - this is significant at p =
# 0.008 - (remember, Information is 0 in this case). So that's actually quite a
# nice addition to the toolbox.


# [END]
New code - under development 2017-11-21 00:01:20 +00:00			`# FND-STA-Information_theory.R`
			`#`
			`# Purpose: A Bioinformatics Course:`
			`# R code accompanying the FND-STA-Information_theory unit.`
			`#`
			`# Version: 0.2`
			`#`
			`# Date: 2017 MM DD`
			`# Author: Boris Steipe (boris.steipe@utoronto.ca)`
			`#`
			`# Versions:`
			`# 0.2 Under development`
			`# 0.1 First code copied from 2016 material.`
			`#`
			`#`
			`# TODO:`
			`#`
			`#`
			`# == DO NOT SIMPLY source() THIS FILE! =======================================`
			`#`
			`# If there are portions you don't understand, use R's help system, Google for an`
			`# answer, or ask your instructor. Don't continue if you don't understand what's`
			`# going on. That's not how it works ...`
			`#`
			`# ==============================================================================`



			`# = 1 ___Section___`

			`# What level of information is "significant"`

			`# Assume the background distribution is the database frequencies of`
			`# amino acids:`

			`AAref <- numeric() # Uniprot frequencies October 2017, slightly adjusted to`
			`# sum to 1.0`
			`AAref["A"] <- 0.0904`
			`AAref["C"] <- 0.0123`
			`AAref["D"] <- 0.0545`
			`AAref["E"] <- 0.0617`
			`AAref["F"] <- 0.0394`
			`AAref["G"] <- 0.0724`
			`AAref["H"] <- 0.0221`
			`AAref["I"] <- 0.0573`
			`AAref["K"] <- 0.0504`
			`AAref["L"] <- 0.0986`
			`AAref["M"] <- 0.0240`
			`AAref["N"] <- 0.0392`
			`AAref["P"] <- 0.0486`
			`AAref["Q"] <- 0.0381`
			`AAref["R"] <- 0.0570`
			`AAref["S"] <- 0.0673`
			`AAref["T"] <- 0.0558`
			`AAref["V"] <- 0.0686`
			`AAref["W"] <- 0.0129`
			`AAref["Y"] <- 0.0294`
			`sum(AAref)`

			`# Function to calculate Shannon entropy`
			`H <- function(v) {`
			`# Shannon entropy (bits)`
			`return(-sum(v * (log(v) / log(2))))`
			`}`


			`# Entropy of the database frequencies (in bits):`
			`(Href <- H(AAref))`

			`# for comparison: entropy if all amino acids are equiprobable`
			`H(rep(0.05, 20))`


			`# Set up a simulation to estimate the distribution of Information values`
			`# from random sequences drawn from AAref. This is the distribution for the`
			`# statistical null hypothesis:`
			`nObs <- 15 # number of observations (e.g aligned sequences)`
			`# nObs <- 80`
			`nTrials <- 10000 # number of trials`
			`IObs <- numeric(nTrials) # vector to store Information in each trial`
			`simCounts <- numeric(20) # vector to tabulate our information ...`
			`names(simCounts) <- names(AAref)# ... with the names of AAref`


			`for (i in 1:nTrials) { # simulate ...`

			`# sample AAref letters, nObs times, with the probabilities of AAref:`
			`AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE)`

			`x <- table(AAobs) # table simulated observations`
			`simCounts[1:20] <- rep(0, length(simCounts)) # initialize simCounts to 0`
			`simCounts[names(x)] <- x # overwrite with observed counts`
			`simCounts <- simCounts + 0.5 # add Jeffreys' pseudocounts`
			`Hobs <- H(simCounts/sum(simCounts)) # counts to frequency, calc. H`
			`IObs[i] <- Href - Hobs # store information`
			`}`

			`# evaluate`
			`hist(IObs, col = "#C9F4E3", xlim = c(-0.2, 1.0), breaks = 25)`
			`abline(v = quantile(IObs, c(0.05, 0.95)), col = "#AA00CC")`

			`# The purple lines are drawn at the 5% quantiles of the Iobs distributions -`
			`# i.e. an actual observation that lies outside the purple lines is deemed`
			`# "significant"(1)(2). Of course, this is only true to the degree that the`
			`# database frequencies are a valid model for the null-hypothesis on the`
			`# sequence position we are considering here.`

			`# (1) If we use 5% quantiles, this means a value is significantly larger`
			`# than expected, and we ignore cases when the value is < 0; if we`
			`# consider both smaller and larger values, we need to use 2.5% quantiles,`
			`# since 5% of all observations lie outside the 0.025 and 0.975`
			`# quantiles.`
			`#`
			`# (2) For an actual observation of counts, we calculate its observed`
			`# _empirical_p_Value_ as (nCounts + 1)/(nTotal + 1).`


			`# You can probably now appreciate that information is a bit of a shortcut for`
			`# biological sequences, and does not really take the different inherent`
			`# frequencies based on the character of the amino acids into account. For`
			`# example, L is the most frequent and C is the least frequent, but if we have an`
			`# alignment of 1000 sequences and we see that the frequencies for L and C are`
			`# swapped, that would be _very_ surprising - nevertheless, the information would`
			`# be 0. In order to take that into account, we should actually compute`
			`# Kullback-Leibler divergences.`


			`# Swap C and L frequencies`
			`p <- AAref`
			`q <- AAref`
			`q["L"] <- AAref["C"]`
			`q["C"] <- AAref["L"]`
			`H(p)`
			`H(q)`

			`KLdiv <- function(p, q) {`
			`# p and q are two pmfs of discrete probability distributions`
			`# with the same outcomes, which are nowhere 0.`
			`# Value: Kullback-Leibler divergence sum(p * log( p / q))).`

			`if (length(p) != length(q)) {`
			`stop("PANIC: input vector lengths differ!")`
			`}`
			`if (any(c((p == 0), (q == 0)))) {`
			`stop("PANIC: 0's found in input vectors!")`
			`}`

			`return(sum(p * log( p / q )))`
			`}`

			`KLdiv(p, p)`
			`KLdiv(p, q)`


			`nObs <- 15 # number of observations (e.g aligned sequences)`
			`# nObs <- 80`
			`nTrials <- 10000 # number of trials`
			`KLdivObs <- numeric(nTrials) # vector to store Information in each trial`
			`simCounts <- numeric(20) # vector to tabulate our information ...`
			`names(simCounts) <- names(AAref)# ... with the names of AAref`


			`for (i in 1:nTrials) { # simulate ...`

			`# sample AAref letters, nObs times, with the probabilities of AAref:`
			`AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE)`

			`x <- table(AAobs) # table simulated observations`
			`simCounts[1:20] <- rep(0, length(simCounts)) # initialize simCounts to 0`
			`simCounts[names(x)] <- x # overwrite with observed counts`
			`simCounts <- simCounts + 0.5 # add Jeffreys' pseudocounts`
			`simCounts <- simCounts/sum(simCounts) # counts to frequency`
			`KLdivObs[i] <- sum(simCounts * log( simCounts / AAref )) # store KLdiv`
			`}`

			`# evaluate`
			`hist(KLdivObs, col = "#C9F4E3", breaks = 25)`
			`abline(v = quantile(KLdivObs, c(0.05, 0.95)), col = "#AA00CC")`
			`quantile(KLdivObs, 0.992)`

			`# Running the simulation with KL does not give a fundamentally`
			`# different behaviour - since we are just randomly sampling. But KL would be`
			`# more sensitive in case there is biological selection, where the sampling is no`
			`# longer random. If I run the same simulation, with nObs <- 80 but calculating`
			`# KLdiv instead of information, I get a 5% quantile at 0.15 - but the C/L`
			`# frequency swap gives me a KL divergence of 0.18 - this is significant at p =`
			`# 0.008 - (remember, Information is 0 in this case). So that's actually quite a`
			`# nice addition to the toolbox.`


			`# [END]`