Correct calculation of empirical p-vale

2017-11-01 23:29:18 -04:00 · 2017-11-01 23:29:18 -04:00 · 6121323df3
commit 6121323df3
parent f6983b142a
2 changed files with 50 additions and 44 deletions
--- a/FND-STA-Probability_distribution.R
+++ b/FND-STA-Probability_distribution.R
@ -3,12 +3,13 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the FND-STA-Probability_distribution unit.
 #
-# Version:  1.0
+# Version:  1.1
 #
-# Date:     2017  10  10
+# Date:     2017  10
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
+#           1.1    Corrected empirical p-value
 #           1.0    First code live version
 #
 # TODO:
@ -27,20 +28,20 @@
 #TOC> 
 #TOC>   Section  Title                                                   Line
 #TOC> -----------------------------------------------------------------------
-#TOC>   1        Introduction                                              54
-#TOC>   2        Three fundamental distributions                          117
-#TOC>   2.1      The Poisson Distribution                                 120
-#TOC>   2.2      The uniform distribution                                 173
-#TOC>   2.3      The Normal Distribution                                  193
-#TOC>   3        quantile-quantile comparison                             234
-#TOC>   3.1      qqnorm()                                                 244
-#TOC>   3.2      qqplot()                                                 304
-#TOC>   4        Quantifying the difference                               321
-#TOC>   4.1      Chi2 test for discrete distributions                     355
-#TOC>   4.2      Kullback-Leibler divergence                              446
-#TOC>   4.2.1    An example from tossing dice                             457
-#TOC>   4.2.2    An example from lognormal distributions                  579
-#TOC>   4.3      Kolmogorov-Smirnov test for continuous distributions     620
+#TOC>   1        Introduction                                              49
+#TOC>   2        Three fundamental distributions                          112
+#TOC>   2.1      The Poisson Distribution                                 115
+#TOC>   2.2      The uniform distribution                                 168
+#TOC>   2.3      The Normal Distribution                                  188
+#TOC>   3        quantile-quantile comparison                             229
+#TOC>   3.1      qqnorm()                                                 239
+#TOC>   3.2      qqplot()                                                 299
+#TOC>   4        Quantifying the difference                               316
+#TOC>   4.1      Chi2 test for discrete distributions                     350
+#TOC>   4.2      Kullback-Leibler divergence                              441
+#TOC>   4.2.1    An example from tossing dice                             452
+#TOC>   4.2.2    An example from lognormal distributions                  574
+#TOC>   4.3      Kolmogorov-Smirnov test for continuous distributions     616
 #TOC> 
 #TOC> ==========================================================================

@ -606,9 +607,10 @@ abline(v = KLdiv(pmfL1, pmfL2), col="firebrick")
 # How many KL-divergences were less than the difference we observed?
 sum(divs < KLdiv(pmfL1, pmfL2)) #933

-# Therefore the probability that the samples came from the same distribution
-# is only 100 * (N - 933) / N (%) ... 6.7%. You see that this gives a much more
-# powerful statistical approach than the chi2 test we used above.
+# Therefore the empirical p-value that the samples came from the same
+# distribution is only 100 * ((N - 933) + 1) / (N + 1) (%) ... 6.8%. You see
+# that this gives a much more powerful statistical approach than the chi2 test
+# we used above.


 # ==   4.3  Kolmogorov-Smirnov test for continuous distributions  ==============
@ -628,7 +630,7 @@ ks.test(dl1, dg1.5)
 ks.test(dl1, dg1.9)
 ks.test(dg1.5, dg1.9) # the two last gammas against each other

-# (The warnings about the presence of ties comes from the 0s in our function
+# (The warnings about the presence of ties comes from the 0's in our function
 # values). The p-value that these distributions are samples of the same
 # probability distribution gets progressively smaller.

--- a/FND-STA-Significance.R
+++ b/FND-STA-Significance.R
@ -3,12 +3,13 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the FND-STA-Significance unit.
 #
-# Version:  1.0
+# Version:  1.1
 #
-# Date:     2017  09  05
+# Date:     2017  09  - 2017  10
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
+#           1.1    Corrected treatment of empirical p-value
 #           1.0    First contents
 #
 # TODO:
@ -20,27 +21,22 @@
 # answer, or ask your instructor. Don't continue if you don't understand what's
 # going on. That's not how it works ...
 # ==============================================================================
- 
+
+
 #TOC> ==========================================================================
 #TOC> 
 #TOC>   Section  Title                                        Line
 #TOC> ------------------------------------------------------------
-#TOC>   1        Significance and p-value                       46
-#TOC>   1.1      Significance levels                            57
-#TOC>   1.2      probability and p-value                        74
+#TOC>   1        Significance and p-value                       42
+#TOC>   1.1      Significance levels                            53
+#TOC>   1.2      probability and p-value                        70
 #TOC>   1.2.1    p-value illustrated                           100
-#TOC>   2        One- or two-sided                             152
-#TOC>   3        Significance by integration                   192
-#TOC>   4        Significance by simulation or permutation     198
-#TOC>   5        Final tasks                                   298
+#TOC>   2        One- or two-sided                             153
+#TOC>   3        Significance by integration                   193
+#TOC>   4        Significance by simulation or permutation     199
+#TOC>   5        Final tasks                                   302
 #TOC> 
 #TOC> ==========================================================================
- 
-
-#TOC>
-#TOC>
-
-


 # =    1  Significance and p-value  ============================================
@ -56,7 +52,7 @@

 # ==   1.1  Significance levels  ===============================================

-# A "very small" probability is purely a matter of convention, a cultural
+# A "very small" probability is purely a matter of convention - a cultural
 # convention. In the biomedical field we usually call probabilities of less then
 # 0.05 (5%) small enough to reject the null-hypothesis. Thus we call
 # observations with a probability of less than 0.05 "significant" and if we want
@ -67,7 +63,7 @@
 # also be 0.0498631, or 0.03, or 1/(pi^3). 0.05 is just the value that the
 # British statistician Ronald Fisher happened to propose for this purpose in
 # 1925. Incidentally, Fisher later recommended to use different cutoffs for
-# diffrent purposes (cf.
+# different purposes (cf.
 # https://en.wikipedia.org/wiki/Statistical_significance).


@ -93,8 +89,12 @@ print(x, digits = 22)
 # 2: that distribution can be integrated between any specific value
 #      and its upper and lower bounds (or +- infinity).

-# Then what we really mean by the probability of an observation in the context of that distribution is: the
-# probability of observing that value, or a value more extreme than the one we have. We call this the p-value. Note that we are not talking about an individual number anymore, we are talking about the area under the curve between our observation and the upper (or lower) bound of the curve, as a fraction of the whole.
+# Then what we really mean by the probability of an observation in the context
+# of that distribution is: the probability of observing that value, or a value
+# more extreme than the one we have. We call this the p-value. Note that we are
+# not talking about an individual number anymore, we are talking about the area
+# under the curve between our observation and the upper (or lower) bound of the
+# curve, as a fraction of the whole.


 # ===  1.2.1  p-value illustrated                      
@ -102,6 +102,7 @@ print(x, digits = 22)
 # Let's illustrate. First we draw a million random values from our
 # standard, normal distribution:

+set.seed(112358)
 r <- rnorm(1000000)

 # Let's see what the distribution looks like:
@ -201,8 +202,11 @@ sum(r > quantile(r, probs = 0.95))
 # be warranted for biological data, can be a highly technical question.
 # Fortunately, we can often simply run a simulation, a random resampling, or a
 # permutation and then count the number of outcomes, just as we did with our
-# rnorm() samples. Here is an example. Assume you have a protein sequence and
-# you speculate that postively charged residues are close to negatively charged
+# rnorm() samples. We call this an empirical p-value. (Actually, the "empirical
+# p-value" is defined as (Nobs + 1) / (N + 1).  )
+
+# Here is an example. Assume you have a protein sequence and
+# you speculate that positively charged residues are close to negatively charged
 # residues to balance charge locally. A statistic that would capture this is the
 # mean minimum distance between all D,E residues and the closest R,K,H
 # residue. Let's compute this for the sequence of yeast Mbp1.
@ -297,8 +301,8 @@ abline(v = chSep(v), col = "#EE0000")

 # =    5  Final tasks  =========================================================

-# From chs, compute the probability of a mean minimum charge separation to be
-#   larger or equal to the value observed for the yeast MBP1 sequence. Note
+# From chs, compute the empirical p-value of a mean minimum charge separation to
+#   be larger or equal to the value observed for the yeast MBP1 sequence. Note
 #   the result in your journal. Is it significant? Also note the result of
 #   the following expression for validation:
 writeBin(sum(chs), raw(8))