updates to use of "data" in nls()/nlrob()
This commit is contained in:
parent 1f2ec21546
commit 39b1da5897
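The gist of the change, shown outside the diff as a minimal sketch (toy data and made-up names, not from the course script): nls() and nlrob() will resolve model variables from the calling environment, but passing them explicitly through the data argument makes each fit self-contained.

# Sketch only - toy data, hypothetical names; illustrates the pattern this
# commit applies throughout the script.
set.seed(42)
x <- 1:20
y <- 3 * exp(-0.2 * x) + rnorm(20, sd = 0.05)

# Variables found in the calling environment: works, but the fit silently
# depends on whatever x and y happen to be bound to at call time.
fit1 <- nls(y ~ a * exp(b * x), start = c(a = 1, b = -0.1))

# Variables passed explicitly - the fit is self-contained and reproducible:
fit2 <- nls(y ~ a * exp(b * x),
            data  = data.frame(x = x, y = y),
            start = c(a = 1, b = -0.1))

coef(fit1)
coef(fit2)  # same coefficients; only where nls() finds the data differs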
@@ -4,12 +4,13 @@
 # Purpose:  A Bioinformatics Course:
 # R code accompanying the FND-STA-Probability_distribution unit.
 #
-# Version:  1.4
+# Version:  1.6
 #
 # Date:     2017-10 - 2020-11
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
+#           1.6    Revise use of data argument to nls()/nlrob()
 #           1.5    Extensive additions on Poisson and binomial distributions
 #                  and regression fits
 #           1.4    2020 Maintenance
@@ -35,31 +36,31 @@
 #TOC>
 #TOC>   Section  Title                                                   Line
 #TOC> --------------------------------------------------------------------------
-#TOC>   1        Introduction                                              67
-#TOC>   2        Three fundamental distributions                          130
-#TOC>   2.1      The Poisson Distribution                                 133
-#TOC>   2.2      The hypergeometric distribution                          227
-#TOC>   2.2.1    Digression: qqplot() and residuals                       375
-#TOC>   2.2.2    Fitting functions                                        433
-#TOC>   2.2.2.1  Fit a normal distribution (using nls() )                 503
-#TOC>   2.2.2.2  Fit a normal distribution (using nlrob()) )              520
-#TOC>   2.2.2.3  Extreme Value distributions: Gumbel                      547
-#TOC>   2.2.2.4  Extreme Value distributions: Weibull                     570
-#TOC>   2.2.2.5  Logistic distribution                                    613
-#TOC>   2.2.2.6  Log-Logistic distribution                                643
-#TOC>   2.2.2.7  Fitting a negative binomial distribution                 673
-#TOC>   2.2.2.8  Fitting a binomial distribution                          726
-#TOC>   2.3      The uniform distribution                                 774
-#TOC>   2.4      The Normal Distribution                                  794
-#TOC>   3        quantile-quantile comparison                             835
-#TOC>   3.1      qqnorm()                                                 845
-#TOC>   3.2      qqplot()                                                 911
-#TOC>   4        Quantifying the difference                               928
-#TOC>   4.1      Chi2 test for discrete distributions                     963
-#TOC>   4.2      Kullback-Leibler divergence                             1054
-#TOC>   4.2.1    An example from tossing dice                            1065
-#TOC>   4.2.2    An example from lognormal distributions                 1188
-#TOC>   4.3      Continuous distributions: Kolmogorov-Smirnov test       1231
+#TOC>   1        Introduction                                              68
+#TOC>   2        Three fundamental distributions                          131
+#TOC>   2.1      The Poisson Distribution                                 134
+#TOC>   2.2      The hypergeometric distribution                          228
+#TOC>   2.2.1    Digression: qqplot() and residuals                       376
+#TOC>   2.2.2    Fitting functions                                        436
+#TOC>   2.2.2.1  Fit a normal distribution (using nls() )                 506
+#TOC>   2.2.2.2  Fit a normal distribution (using nlrob()) )              525
+#TOC>   2.2.2.3  Extreme Value distributions: Gumbel                      551
+#TOC>   2.2.2.4  Extreme Value distributions: Weibull                     576
+#TOC>   2.2.2.5  Logistic distribution                                    618
+#TOC>   2.2.2.6  Log-Logistic distribution                                647
+#TOC>   2.2.2.7  Fitting a negative binomial distribution                 676
+#TOC>   2.2.2.8  Fitting a binomial distribution                          729
+#TOC>   2.3      The uniform distribution                                 841
+#TOC>   2.4      The Normal Distribution                                  861
+#TOC>   3        quantile-quantile comparison                             902
+#TOC>   3.1      qqnorm()                                                 912
+#TOC>   3.2      qqplot()                                                 978
+#TOC>   4        Quantifying the difference                               995
+#TOC>   4.1      Chi2 test for discrete distributions                    1029
+#TOC>   4.2      Kullback-Leibler divergence                             1120
+#TOC>   4.2.1    An example from tossing dice                            1131
+#TOC>   4.2.2    An example from lognormal distributions                 1253
+#TOC>   4.3      Continuous distributions: Kolmogorov-Smirnov test       1296
 #TOC>
 #TOC> ==========================================================================

@@ -384,7 +385,9 @@ legend("topright",
 # misunderstanding about the requirements for the particular theory to apply. R
 # has a general way to examine such differences: qqplot() plots the deviations
 # between theory and observation ordered as ranks, i.e. not affected by absolute
-# scale.
+# scale. (See here for an accessible introduction to qqplot() and the
+# "quantiles" that it uses:
+# https://data.library.virginia.edu/understanding-q-q-plots/)

 oPar <- par(mfrow = c(1, 2))
 qqplot( dpois(0:nMax, (250 * 256) / 6091),
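As an aside, the rank-based idea behind qqplot() is easy to try in isolation; a minimal sketch with arbitrary sample sizes and parameters (not part of the commit):

set.seed(112358)
a <- rnorm(1000, mean = 0,  sd = 1)
b <- rnorm(1000, mean = 10, sd = 3)   # same family, different location/scale
qqplot(a, b)                          # points fall on a line: same shape
qqplot(a, rexp(1000))                 # curved pattern: the shapes differ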
@@ -503,10 +506,12 @@ legend("topright",
 # ====  2.2.2.1  Fit a normal distribution (using nls() )

 x <- 0:28
-plot(0:28, tu, type="s")
-fit <- nls(tu ~ ( a / (sig*sqrt(2*pi)) ) * exp( (-1/2)*((x-mu)/sig)^2 ), start = c(a = 1, mu = 10, sig = 3)) # starting values
+plot(x, tu, type="s")
+fit <- nls(tu ~ ( a / (sig*sqrt(2*pi)) ) * exp( (-1/2)*((x-mu)/sig)^2 ),
+           data = data.frame(x = 0:28, tu = tu),
+           start = c(a = 1, mu = 10, sig = 3)) # starting values

-points(x, predict(fit, list(x = x)), col = "#CC00CC55", lwd = 2,
+points(x, predict(fit), col = "#CC00CC55", lwd = 2,
        type = "s")

 coef(fit)
@@ -526,15 +531,14 @@ pkg <- "robustbase"
 if (! requireNamespace(pkg, quietly = TRUE)) { install.packages(pkg) }

 x <- 0:28
-plot(0:28, tu, type="s")
+plot(x, tu, type="s")
 fit <- robustbase::nlrob(tu ~ ( a / (sig*sqrt(6.2831853072)) ) *
                               exp( (-1/2)*((x-mu)/sig)^2 ),
-                         data = data.frame(tu = tu,
-                                           x = 0:28),
+                         data = data.frame(x = 0:28,
+                                           tu = tu),
                          start = c(a = 1, mu = 10, sig = 3)) # starting values

-points(x, predict(fit, list(x = x)), col = "#CC00CC55", lwd = 2,
-       type = "s")
+points(x, predict(fit), col = "#CC00CC55", lwd = 2, type = "s")

 coef(fit)
 # a mu sig
@@ -546,18 +550,20 @@ sum(resid(fit)^2)

 # ====  2.2.2.3  Extreme Value distributions: Gumbel

-# Many processes that involve best-of choices are better modelled with so called extreme-value distributions: here is the Gumbel distribution from the evd package.
+# Many processes that involve "best-of" choices are better modelled with
+# so-called extreme-value distributions: here is the Gumbel distribution
+# from the evd package.
 pkg <- "evd"
 if (! requireNamespace(pkg, quietly = TRUE)) { install.packages(pkg) }

 x <- 0:28
-plot(0:28, tu, type="s")
+plot(x, tu, type="s")

 fit <- robustbase::nlrob(tu ~ evd::dgumbel(x, loc = L, scale = S),
                          data = data.frame(tu = tu, x = 0:28),
                          start = c(L = 7.3, S = 2.82))

-points(x, predict(fit, list(x = x)), type = "s", col = "#55DD88")
+points(x, predict(fit), type = "s", col = "#55DD88")

 coef(fit)
 # L S
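For reference (not part of the commit): evd::dgumbel() evaluates the Gumbel density f(x) = (1/scale) * exp(-(z + exp(-z))) with z = (x - loc)/scale. A quick sketch checking the closed form against the package, with arbitrary parameter values:

dMyGumbel <- function(x, loc, scale) {
  z <- (x - loc) / scale
  (1 / scale) * exp(-(z + exp(-z)))
}
x <- seq(0, 28, by = 0.5)
max(abs(dMyGumbel(x, 7.3, 2.82) - evd::dgumbel(x, loc = 7.3, scale = 2.82)))
# expected: ~ 0 (numerical noise only)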
@@ -575,9 +581,9 @@ sum(resid(fit)^2)

 # NOTE: the parameter TH is > X
 idx <- 4:28
-x <- idx
-y <- as.numeric(tu)[idx]
-plot(x, y, type="s")
+myX <- idx
+myY <- as.numeric(tu)[idx]
+plot(myX, myY, type="s")

 dWei <- function(x, th, l, k) {
   a <- k/l
@@ -589,8 +595,8 @@ dWei <- function(x, th, l, k) {

 set.seed(112358)
 fit <- robustbase::nlrob(y ~ dWei(x, th = TH, l = L, k = K),
-                         data = data.frame(y = y,
-                                           x = x),
+                         data = data.frame(y = myY,
+                                           x = myX),
                          lower = c(TH = 3.5,
                                    L = 8,
                                    K = 2.5),
@@ -599,8 +605,7 @@ fit <- robustbase::nlrob(y ~ dWei(x, th = TH, l = L, k = K),
                                   K = 3),
                          method = "mtl")

-points(x, predict(fit, list(x = x)),
-       col = "#CC00CC55", lwd = 2, type = "s")
+points(x, predict(fit), col = "#CC00CC55", lwd = 2, type = "s")

 coef(fit)
 # TH L K
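The body of dWei() is truncated in this view; a plausible reconstruction, assuming the usual three-parameter (shifted) Weibull density with threshold th, scale l and shape k (an assumption for illustration, not the script's verbatim code):

dWeiSketch <- function(x, th, l, k) {
  a <- k / l                        # matches the first line visible above
  b <- ((x - th) / l)^(k - 1)
  y <- a * b * exp(-((x - th) / l)^k)
  y[x <= th] <- 0                   # zero density at or below the threshold
  y
}
# e.g. dWeiSketch(4:28, th = 3.5, l = 8, k = 2.5)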
@@ -615,9 +620,9 @@ sum(resid(fit)^2)
 # Similar to normal distribution, but with heavier tails
 # https://en.wikipedia.org/wiki/Logistic_distribution

-x <- 0:28
-y <- as.numeric(tu)
-plot(0:28, y, type="s")
+myX <- 0:28
+myY <- as.numeric(tu)
+plot(myX, myY, type="s")

 dLogi <- function(x, mu, s) {
   y <- (exp(-(x-mu)/s)) / (s * (1+exp(-(x-mu)/s))^2)
@@ -625,12 +630,11 @@ dLogi <- function(x, mu, s) {
 }

 fit <- robustbase::nlrob(y ~ dLogi(x, mu = M, s = S),
-                         data = data.frame(y = y,
-                                           x = 0:28),
+                         data = data.frame(y = myY,
+                                           x = myX),
                          start = c(M = 10, S = 3))

-points(x, predict(fit, list(x = x)),
-       col = "#CC00CC55", lwd = 2, type = "s")
+points(x, predict(fit), col = "#CC00CC55", lwd = 2, type = "s")

 coef(fit)
 # M S
@@ -642,12 +646,12 @@ sum(resid(fit)^2)

 # ====  2.2.2.6  Log-Logistic distribution

-# Spercial case of logistic, often used in survival analysis
+# Special case of logistic, often used in survival analysis
 # https://en.wikipedia.org/wiki/Log-logistic_distribution

-x <- 0:28
-y <- as.numeric(tu)
-plot(0:28, y, type="s")
+myX <- 0:28
+myY <- as.numeric(tu)
+plot(myX, myY, type="s")

 dLogLogi <- function(x, a, b) {
   y <- ((b/a)*(x/a)^(b-1)) / (1+((x/a)^b))^2
@@ -655,12 +659,11 @@ dLogLogi <- function(x, a, b) {
 }

 fit <- robustbase::nlrob(y ~ dLogLogi(x, a = A, b = B),
-                         data = data.frame(y = y,
-                                           x = 0:28),
+                         data = data.frame(y = myY,
+                                           x = myX),
                          start = c(A = 9.5, B = 4.8))

-points(x, predict(fit, list(x = x)),
-       col = "#CC00CC55", lwd = 2, type = "s")
+points(x, predict(fit), col = "#CC00CC55", lwd = 2, type = "s")

 coef(fit)
 # A B
@@ -696,8 +699,8 @@ fit <- robustbase::nlrob(tu ~ dNB(x, r = RR, p = P),
                          upper = c(P = 0.99),
                          method = "mtl")

-points(x, predict(fit, list(x = x)),
-       col = "#CC00CC55", lwd = 2, type = "s")
+points(x, predict(fit), col = "#CC00CC55", lwd = 2, type = "s")

 coef(fit)
 # P
 # 0.9100729
@@ -728,9 +731,9 @@ which(R == min(R))
 # cf. https://en.wikipedia.org/wiki/Binomial_distribution


-x <- 0:28
-y <- as.numeric(tu)
-plot(x, y, type="s")
+myX <- 0:28
+myY <- as.numeric(tu)
+plot(myX, myY, type="s")

 dBinom <- function(x, s, p) {
   y <- choose(s, x) * (p^x) * ((1-p)^(s-x))
@@ -738,12 +741,11 @@ dBinom <- function(x, s, p) {
 }

 fit <- robustbase::nlrob(y ~ dBinom(x, s = S, p = P),
-                         data = data.frame(y = y,
-                                           x = 0:28),
+                         data = data.frame(y = myY,
+                                           x = myX),
                          start = c(S = 240, P = 256/6091))

-points(x, predict(fit, list(x = x)),
-       col = "#CC00CC55", lwd = 2, type = "s")
+points(x, predict(fit), col = "#CC00CC55", lwd = 2, type = "s")

 coef(fit)
 # S P
@@ -752,7 +754,72 @@ coef(fit)
 sum(resid(fit)^2)
 # [1] 1.114272e-06

-# Here we go: near perfect fit. But note the parameters! Can you identify the relationship of S and P to our model of choosing unique transcription factors in a random sampling process. Because, honestly: I can't.
+# Here we go: near perfect fit. But note the parameters! Can you identify the
+# relationship of S and P to our model of choosing unique transcription factors
+# in a random sampling process?
+
+# Remember what we started out with:
+points(x, dBinom(x, 240, 256/6091),
+       col = "#00AAFF55", lwd = 2, type = "s")
+
+# The model has improved a lot from our starting-value guess. But how does an
+# effective sample size of 122.78 relate to the actual value of 250 samples? And
+# how does a probability for choosing a transcription factor of 0.0838 relate to
+# having 256 transcription factors among 6091 genes (a fraction of 0.0420)? Well
+# - if you stare at the numbers for a bit you might notice that the fitted
+# sample size is about half of what we had, while the fitted probability is
+# about twice that. Let's correct for the factor of two, and look at the fit ...
+
+points(x, dBinom(x, coef(fit)["S"] * 2 , coef(fit)["P"] / 2),
+       col = "#00FF8899", lwd = 2, type = "s")
+
+# Interesting ... that's not so bad. Essentially what this tells us is: our
+# model is very similar to one in which our effective sample size would have
+# been about 246 genes, not 250, and our number of transcription factors would
+# have been close to 0.04188461 * 6091 ~= 255, not 256. Close -
+# but not quite perfect.
+
+# Incidentally: would you have thought that we can detect the number of
+# transcription factors in our sample +- 1 if we just sample frequently
+# enough?
+
+# Let's examine the effect that applying various factors to size and probability
+# has on our distribution:
+
+
+z <- c(1/8:2 , 1, 2:8, 10, 20, 50)
+myCols <- colorRampPalette(c("#00000022",
+                             "#00000088",
+                             "#DD0000DD",
+                             "#0088FF88",
+                             "#0088FF55",
+                             "#0088FF22"),
+                           alpha=TRUE)(length(z))
+
+myLwd <- rep(1.25, length(z))
+myLwd[z == 1] <- 2.5   # make the actual fitted value stand out
+
+plot(x, y, type="s", ylim = c(0, 0.22), lwd = 5, col="#FF990077")
+
+for (i in seq_along(z)) {
+  points(x, dBinom(x, coef(fit)["S"] * z[i] , coef(fit)["P"] / z[i]),
+         col = myCols[i], type = "s")
+}
+
+legend("topright",
+       legend = sprintf("%3.2f", z),
+       cex = 0.5,
+       col = myCols,
+       lwd = myLwd,
+       bty = "n")
+
+# We see how factors less than 1 applied to our fitted sample size and
+# probability increasingly narrow the distribution; factors larger than one
+# broaden it, but much more slowly. A factor of one (i.e. the fitted values:
+# half the actual size, double the probability) hits the sweet spot and
+# recaptures the essence of our observation. Clearly, this is a case of
+# fortuitously cancelling errors.


 # What is the take-home message here?
 # - The standard Poisson and hypergeometric distributions apply to
@@ -927,7 +994,6 @@ qqplot(dl, dg) # clearly not equal

 # =  4  Quantifying the difference ==========================================

-
 # Quantile-quantile plots give us a visual estimate, but how do we quantify the
 # difference between distributions? Let's compare two types of extreme-value
 # distributions, a lognormal distribution with the same distribution that is
@@ -964,7 +1030,7 @@ legend("topright",

 # The chi2 test can be used to compare discrete distributions - even though
 # it is not ideal for this purpose.
-# (https://stats.stackexchange.com/questions/113692/test-identicality-of-discrete-distributions)
+# (https://stats.stackexchange.com/questions/113692)
 #
 # Let's draw 100 random variables according to our target distributions
 N <- 100
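A minimal sketch of the two-sample use of chisq.test() (made-up counts, not the script's data): tabulate both samples over a common range and test the resulting 2 x k contingency table.

set.seed(11)
obs1 <- table(factor(rpois(100, 4), levels = 0:15))
obs2 <- table(factor(rpois(100, 6), levels = 0:15))
chisq.test(rbind(obs1, obs2))
# Small p-value: the samples are unlikely to come from the same distribution.
# Sparse cells trigger a warning that the approximation may be inaccurate -
# one reason the test is "not ideal" for this purpose.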
@@ -989,9 +1055,9 @@ hist(rG1.5, breaks = myBreaks, col = myCols[4])
 # package information - plotrix has _many_ useful utilities to enhance
 # plots or produce informative visualizations.

-if (! requireNamespace("plotrix", quietly = TRUE)) {
-  install.packages("plotrix")
-}
+pkg <- "plotrix"
+if (! requireNamespace(pkg, quietly = TRUE)) { install.packages(pkg) }

 # Package information:
 #  library(help = plotrix)       # basic information
 #  browseVignettes("plotrix")    # available vignettes
@@ -1139,7 +1205,6 @@ pmfPC <- function(cts, nam) {
 }


-
 # The definition of the Kullback-Leibler divergence itself is quite simple
 # actually: for two distributions, p and q it is sum(p * log( p / q))

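The one-line definition above translates directly into R; a minimal sketch with toy pmfs (the binomial parameters are arbitrary; q must be nonzero wherever p is nonzero):

KLdiv <- function(p, q) { sum(p * log(p / q)) }

p <- dbinom(0:10, size = 10, prob = 0.5)
q <- dbinom(0:10, size = 10, prob = 0.6)
KLdiv(p, q)  # > 0; note KLdiv(p, q) != KLdiv(q, p) - it is not symmetric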