updates to use of "data" in nls()/nlrob()

This commit is contained in:
hyginn 2020-11-05 20:34:21 +10:00
parent 1f2ec21546
commit 39b1da5897


@ -4,12 +4,13 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the FND-STA-Probability_distribution unit.
#
# Version: 1.4
# Version: 1.6
#
# Date: 2017-10 - 2020-11
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.6 Revise use of data argument to nls()/nlrob()
# 1.5 Extensive additions on Poisson and binomial distributions
# and regression fits
# 1.4 2020 Maintenance
@ -35,31 +36,31 @@
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------------------------------------------
#TOC> 1 Introduction 67
#TOC> 2 Three fundamental distributions 130
#TOC> 2.1 The Poisson Distribution 133
#TOC> 2.2 The hypergeometric distribution 227
#TOC> 2.2.1 Digression: qqplot() and residuals 375
#TOC> 2.2.2 Fitting functions 433
#TOC> 2.2.2.1 Fit a normal distribution (using nls() ) 503
#TOC> 2.2.2.2 Fit a normal distribution (using nlrob() ) 520
#TOC> 2.2.2.3 Extreme Value distributions: Gumbel 547
#TOC> 2.2.2.4 Extreme Value distributions: Weibull 570
#TOC> 2.2.2.5 Logistic distribution 613
#TOC> 2.2.2.6 Log-Logistic distribution 643
#TOC> 2.2.2.7 Fitting a negative binomial distribution 673
#TOC> 2.2.2.8 Fitting a binomial distribution 726
#TOC> 2.3 The uniform distribution 774
#TOC> 2.4 The Normal Distribution 794
#TOC> 3 quantile-quantile comparison 835
#TOC> 3.1 qqnorm() 845
#TOC> 3.2 qqplot() 911
#TOC> 4 Quantifying the difference 928
#TOC> 4.1 Chi2 test for discrete distributions 963
#TOC> 4.2 Kullback-Leibler divergence 1054
#TOC> 4.2.1 An example from tossing dice 1065
#TOC> 4.2.2 An example from lognormal distributions 1188
#TOC> 4.3 Continuous distributions: Kolmogorov-Smirnov test 1231
#TOC> 1 Introduction 68
#TOC> 2 Three fundamental distributions 131
#TOC> 2.1 The Poisson Distribution 134
#TOC> 2.2 The hypergeometric distribution 228
#TOC> 2.2.1 Digression: qqplot() and residuals 376
#TOC> 2.2.2 Fitting functions 436
#TOC> 2.2.2.1 Fit a normal distribution (using nls() ) 506
#TOC> 2.2.2.2 Fit a normal distribution (using nlrob() ) 525
#TOC> 2.2.2.3 Extreme Value distributions: Gumbel 551
#TOC> 2.2.2.4 Extreme Value distributions: Weibull 576
#TOC> 2.2.2.5 Logistic distribution 618
#TOC> 2.2.2.6 Log-Logistic distribution 647
#TOC> 2.2.2.7 Fitting a negative binomial distribution 676
#TOC> 2.2.2.8 Fitting a binomial distribution 729
#TOC> 2.3 The uniform distribution 841
#TOC> 2.4 The Normal Distribution 861
#TOC> 3 quantile-quantile comparison 902
#TOC> 3.1 qqnorm() 912
#TOC> 3.2 qqplot() 978
#TOC> 4 Quantifying the difference 995
#TOC> 4.1 Chi2 test for discrete distributions 1029
#TOC> 4.2 Kullback-Leibler divergence 1120
#TOC> 4.2.1 An example from tossing dice 1131
#TOC> 4.2.2 An example from lognormal distributions 1253
#TOC> 4.3 Continuous distributions: Kolmogorov-Smirnov test 1296
#TOC>
#TOC> ==========================================================================
@ -384,7 +385,9 @@ legend("topright",
# misunderstanding about the requirements for the particular theory to apply. R
# has a general way to examine such differences: qqplot() plots the deviations
# between theory and observation ordered as ranks, i.e. not affected by absolute
# scale.
# scale. (See here for an accessible introduction to qqplot() and the
# "quantiles" that it uses:
# https://data.library.virginia.edu/understanding-q-q-plots/)
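# In essence, a q-q plot is just two samples, sorted and plotted against each
# other. Here is a minimal hand-made sketch with simulated data - the names
# and numbers are only for illustration, they are not part of this unit's data:
set.seed(112358)
zA <- rnorm(100)                      # a standard normal sample
zB <- rnorm(100, mean = 5, sd = 3)    # same shape, different location and scale
plot(sort(zA), sort(zB))              # approximately a straight line, because
                                      #   the two samples differ only in
                                      #   location and scale, not in shape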
oPar <- par(mfrow = c(1, 2))
qqplot( dpois(0:nMax, (250 * 256) / 6091),
@ -503,10 +506,12 @@ legend("topright",
# ==== 2.2.2.1 Fit a normal distribution (using nls() )
x <- 0:28
plot(0:28, tu, type="s")
fit <- nls(tu ~ ( a / (sig*sqrt(2*pi)) ) * exp( (-1/2)*((x-mu)/sig)^2 ), start = c(a = 1, mu = 10, sig = 3)) # starting values
plot(x, tu, type="s")
fit <- nls(tu ~ ( a / (sig*sqrt(2*pi)) ) * exp( (-1/2)*((x-mu)/sig)^2 ),
data = data.frame(x = 0:28, tu = tu),
start = c(a = 1, mu = 10, sig = 3)) # starting values
points(x, predict(fit, list(x = x)), col = "#CC00CC55", lwd = 2,
points(x, predict(fit), col = "#CC00CC55", lwd = 2,
type = "s")
coef(fit)
@ -526,15 +531,14 @@ pkg <- "robustbase"
if (! requireNamespace(pkg, quietly = TRUE)) { install.packages(pkg) }
x <- 0:28
plot(0:28, tu, type="s")
plot(x, tu, type="s")
fit <- robustbase::nlrob(tu ~ ( a / (sig*sqrt(6.2831853072)) ) *
exp( (-1/2)*((x-mu)/sig)^2 ),
data = data.frame(tu = tu,
x = 0:28),
data = data.frame(x = 0:28,
tu = tu),
start = c(a = 1, mu = 10, sig = 3)) # starting values
points(x, predict(fit, list(x = x)), col = "#CC00CC55", lwd = 2,
type = "s")
points(x, predict(fit), col = "#CC00CC55", lwd = 2, type = "s")
coef(fit)
# a mu sig
@ -546,18 +550,20 @@ sum(resid(fit)^2)
# ==== 2.2.2.3 Extreme Value distributions: Gumbel
# Many processes that involve best-of choices are better modelled with so called extreme-value distributions: here is the Gumbel distribution from the evd package.
# Many processes that involve "best-of" choices are better modelled with
# so-called extreme-value distributions: here is the Gumbel distribution
# from the evd package.
pkg <- "evd"
if (! requireNamespace(pkg, quietly = TRUE)) { install.packages(pkg) }
x <- 0:28
plot(0:28, tu, type="s")
plot(x, tu, type="s")
fit <- robustbase::nlrob(tu ~ evd::dgumbel(x, loc = L, scale = S),
data = data.frame(tu = tu, x = 0:28),
start = c(L = 7.3, S = 2.82))
points(x, predict(fit, list(x = x)), type = "s", col = "#55DD88")
points(x, predict(fit), type = "s", col = "#55DD88")
coef(fit)
# L S
@ -575,9 +581,9 @@ sum(resid(fit)^2)
# NOTE: the density is only defined for x > TH (the location parameter)
idx <- 4:28
x <- idx
y <- as.numeric(tu)[idx]
plot(x, y, type="s")
myX <- idx
myY <- as.numeric(tu)[idx]
plot(myX, myY, type="s")
dWei <- function(x, th, l, k) {
a <- k/l
@ -589,8 +595,8 @@ dWei <- function(x, th, l, k) {
set.seed(112358)
fit <- robustbase::nlrob(y ~ dWei(x, th = TH, l = L, k = K),
data = data.frame(y = y,
x = x),
data = data.frame(y = myY,
x = myX),
lower = c(TH = 3.5,
L = 8,
K = 2.5),
@ -599,8 +605,7 @@ fit <- robustbase::nlrob(y ~ dWei(x, th = TH, l = L, k = K),
K = 3),
method = "mtl")
points(x, predict(fit, list(x = x)),
col = "#CC00CC55", lwd = 2, type = "s")
points(myX, predict(fit), col = "#CC00CC55", lwd = 2, type = "s")
coef(fit)
# TH L K
@ -615,9 +620,9 @@ sum(resid(fit)^2)
# Similar to normal distribution, but with heavier tails
# https://en.wikipedia.org/wiki/Logistic_distribution
x <- 0:28
y <- as.numeric(tu)
plot(0:28, y, type="s")
myX <- 0:28
myY <- as.numeric(tu)
plot(myX, myY, type="s")
dLogi <- function(x, mu, s) {
y <- (exp(-(x-mu)/s)) / (s * (1+exp(-(x-mu)/s))^2)
@ -625,12 +630,11 @@ dLogi <- function(x, mu, s) {
}
fit <- robustbase::nlrob(y ~ dLogi(x, mu = M, s = S),
data = data.frame(y = y,
x = 0:28),
data = data.frame(y = myY,
x = myX),
start = c(M = 10, S = 3))
points(x, predict(fit, list(x = x)),
col = "#CC00CC55", lwd = 2, type = "s")
points(x, predict(fit), col = "#CC00CC55", lwd = 2, type = "s")
coef(fit)
# M S
@ -642,12 +646,12 @@ sum(resid(fit)^2)
# ==== 2.2.2.6 Log-Logistic distribution
# Spercial case of logistic, often used in survival analysis
# Special case of logistic, often used in survival analysis
# https://en.wikipedia.org/wiki/Log-logistic_distribution
x <- 0:28
y <- as.numeric(tu)
plot(0:28, y, type="s")
myX <- 0:28
myY <- as.numeric(tu)
plot(myX, myY, type="s")
dLogLogi <- function(x, a, b) {
y <- ((b/a)*(x/a)^(b-1)) / (1+((x/a)^b))^2
@ -655,12 +659,11 @@ dLogLogi <- function(x, a, b) {
}
fit <- robustbase::nlrob(y ~ dLogLogi(x, a = A, b = B),
data = data.frame(y = y,
x = 0:28),
data = data.frame(y = myY,
x = myX),
start = c(A = 9.5, B = 4.8))
points(x, predict(fit, list(x = x)),
col = "#CC00CC55", lwd = 2, type = "s")
points(x, predict(fit), col = "#CC00CC55", lwd = 2, type = "s")
coef(fit)
# A B
@ -696,8 +699,8 @@ fit <- robustbase::nlrob(tu ~ dNB(x, r = RR, p = P),
upper = c(P = 0.99),
method = "mtl")
points(x, predict(fit, list(x = x)),
col = "#CC00CC55", lwd = 2, type = "s")
points(x, predict(fit), col = "#CC00CC55", lwd = 2, type = "s")
coef(fit)
# P
# 0.9100729
@ -728,9 +731,9 @@ which(R == min(R))
# cf. https://en.wikipedia.org/wiki/Binomial_distribution
x <- 0:28
y <- as.numeric(tu)
plot(x, y, type="s")
myX <- 0:28
myY <- as.numeric(tu)
plot(myX, myY, type="s")
dBinom <- function(x, s, p) {
y <- choose(s, x) * (p^x) * ((1-p)^(s-x))
@ -738,12 +741,11 @@ dBinom <- function(x, s, p) {
}
fit <- robustbase::nlrob(y ~ dBinom(x, s = S, p = P),
data = data.frame(y = y,
x = 0:28),
data = data.frame(y = myY,
x = myX),
start = c(S = 240, P = 256/6091))
points(x, predict(fit, list(x = x)),
col = "#CC00CC55", lwd = 2, type = "s")
points(x, predict(fit), col = "#CC00CC55", lwd = 2, type = "s")
coef(fit)
# S P
@ -752,7 +754,72 @@ coef(fit)
sum(resid(fit)^2)
# [1] 1.114272e-06
# Here we go: near perfect fit. But note the parameters! Can you identify the relationship of S and P to our model of choosing unique transcription factors in a random sampling process. Because, honestly: I can't.
# Here we go: near perfect fit. But note the parameters! Can you identify the
# relationship of S and P to our model of choosing unique transcription factors
# in a random sampling process?
# Remember what we started out with:
points(x, dBinom(x, 240, 256/6091),
col = "#00AAFF55", lwd = 2, type = "s")
# The model has improved a lot from our starting-value guess. But how does an
# effective sample size of 122.78 relate to the actual value of 250 samples? And
# how does a probability for choosing a transcription factor of 0.0838 relate to
# having 256 transcription factors among 6091 genes (a fraction of 0.0420)? Well
# - if you stare at the numbers for a bit, you might notice that the fitted
# sample size is about half of the actual sample size, while the fitted
# probability is about twice the actual fraction. Let's correct for the factor
# of two, and look at the fit ...
points(x, dBinom(x, coef(fit)["S"] * 2 , coef(fit)["P"] / 2),
col = "#00FF8899", lwd = 2, type = "s")
# Interesting ... that's not so bad. Essentially, this tells us: our model is
# very similar to one in which the effective sample size would have been about
# 246 genes, not 250, and the number of transcription factors would have been
# close to 0.04188461 * 6091 ~= 255, not 256. Close to - but not quite -
# perfect.
# Incidentally: would you have thought that we could determine the number of
# transcription factors in our sample to within +- 1, just by sampling often
# enough?
# Let's examine the effect that applying various factors to size and probability
# has on our distribution:
z <- c(1/8:2 , 1, 2:8, 10, 20, 50)  # note: ":" binds before "/", so this is
                                    #   1/8, 1/7, ..., 1/2, then 1, 2, ..., 50
myCols <- colorRampPalette(c("#00000022",
"#00000088",
"#DD0000DD",
"#0088FF88",
"#0088FF55",
"#0088FF22"),
alpha=TRUE)(length(z))
myLwd <- rep(1.25, length(z))
myLwd[z == 1] <- 2.5 # make the actual fitted value stand out
plot(myX, myY, type="s", ylim = c(0, 0.22), lwd = 5, col="#FF990077")
for (i in seq_along(z)) {
points(x, dBinom(x, coef(fit)["S"] * z[i] , coef(fit)["P"] / z[i]),
col = myCols[i], type = "s")
}
legend("topright",
legend = sprintf("%3.2f", z),
cex = 0.5,
col = myCols,
lwd = myLwd,
bty = "n")
# We see how factors less than 1 (smaller size, larger probability) increasingly
# narrow the distribution; factors larger than one broaden it, but much more
# slowly. A factor of one - i.e. the fitted values, about half the actual sample
# size at double the actual probability - hits the sweet spot and recaptures the
# essence of our observation. Clearly, this is a case of fortuitously cancelling
# errors.
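# A quick numerical check of why that is (a sketch, reusing the fitted
# coefficients in "fit" from above): scaling S by z and P by 1/z leaves the
# mean S*P unchanged, while the variance S*P*(1 - P/z) shrinks for small z and
# grows - slowly, towards the Poisson limit S*P - for large z.
S0 <- coef(fit)["S"]                   # fitted size        (~ 122.8)
P0 <- coef(fit)["P"]                   # fitted probability (~ 0.084)
for (zz in c(1/8, 1/2, 1, 2, 8)) {
  cat(sprintf("z = %5.3f   mean = %6.3f   variance = %6.3f\n",
              zz,
              (S0 * zz) * (P0 / zz),                       # mean
              (S0 * zz) * (P0 / zz) * (1 - (P0 / zz))))    # variance
}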
# What is the take-home message here?
# - The standard Poisson and hypergeometric distributions apply to
@ -927,7 +994,6 @@ qqplot(dl, dg) # clearly not equal
# = 4 Quantifying the difference ==========================================
# Quantile-quantile plots give us a visual estimate, but how do we quantify the
# difference between distributions? Let's compare two types of extreme-value
# distributions, a lognormal distribution with the same distribution that is
@ -964,7 +1030,7 @@ legend("topright",
# The chi2 test can be used to compare discrete distributions - even though
# it is not ideal for this purpose.
# (https://stats.stackexchange.com/questions/113692/test-identicality-of-discrete-distributions)
# (https://stats.stackexchange.com/questions/113692)
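# As a bare-bones illustration of the mechanics (a sketch on made-up counts,
# not on our actual samples): two vectors of observed counts go into one table,
# and chisq.test() performs a chi2 test of homogeneity on that table.
toyA <- c(12, 30, 35, 23)              # hypothetical counts, sample A
toyB <- c(18, 28, 31, 23)              # hypothetical counts, sample B
chisq.test(rbind(toyA, toyB))          # H0: both rows are drawn from the same
                                       #   underlying distribution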
#
# Let's draw 100 random variables according to our target distributions
N <- 100
@ -989,9 +1055,9 @@ hist(rG1.5, breaks = myBreaks, col = myCols[4])
# package information - plotrix has _many_ useful utilities to enhance
# plots or produce informative visualizations.
if (! requireNamespace("plotrix", quietly = TRUE)) {
install.packages("plotrix")
}
pkg <- "plotrix"
if (! requireNamespace(pkg, quietly = TRUE)) { install.packages(pkg) }
# Package information:
# library(help = plotrix) # basic information
# browseVignettes("plotrix") # available vignettes
@ -1139,7 +1205,6 @@ pmfPC <- function(cts, nam) {
}
# The definition of the Kullback-Leibler divergence itself is actually quite
# simple: for two distributions p and q, it is sum(p * log( p / q))
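# A minimal sketch of that formula, on two made-up discrete distributions p
# and q (both strictly positive, both summing to one):
p <- c(0.2, 0.5, 0.3)
q <- c(0.3, 0.4, 0.3)
sum(p * log(p / q))   # KL divergence of p from q: zero only if p and q are
                      #   identical, positive otherwise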