Maintenance updates, and revised set.seed() usage

2019-01-07 16:17:23 +10:00
parent 2ab162e375
commit 6f54293592
9 changed files with 275 additions and 180 deletions
--- a/BIN-MYSPE.R
+++ b/BIN-MYSPE.R
@@ -25,10 +25,11 @@
 #
 # ==============================================================================
 #TOC> ==========================================================================
 #TOC>
 #TOC>   Section  Title                           Line
-#TOC> ---------------------------------------
+#TOC> -----------------------------------------------
 #TOC>   1        Preparations                      39
 #TOC>   2        Suitable MYSPE Species            51
 #TOC>   3        Adopt "MYSPE"                     65
@@ -71,6 +72,7 @@ if (! exists("myStudentNumber")) {
 load("data/MYSPEspecies.RData")     # load the species names
 set.seed(myStudentNumber)           # seed the random number generator
 MYSPE <- sample(MYSPEspecies, 1)    # pick a species at random
 set.seed(NULL)                      # reset the random number generator
 # write the result to your personalized profile data so we can use the result in
 # other functions
 cat(sprintf("MYSPE <- \"%s\"\n", MYSPE), file = ".myProfile.R", append = TRUE)
@@ -80,7 +82,7 @@ biCode(MYSPE) # and what is it's "BiCode" ... ?
 # Task: Note down the species name and its five letter label on your Student
 # Wiki user page. Use this species whenever this or future assignments refer
-# to MYSPE. In code, we will automatically load it from your.myProfile.R file.
+# to MYSPE. In code, we will automatically load it from your .myProfile.R file.
 # [END]
--- a/BIN-PHYLO-Tree_analysis.R
+++ b/BIN-PHYLO-Tree_analysis.R
@@ -30,7 +30,7 @@
 #TOC> ==========================================================================
 #TOC> 
 #TOC>   Section  Title                              Line
-#TOC> --------------------------------------------
+#TOC> --------------------------------------------------
 #TOC>   1        Preparation and Tree Plot            43
 #TOC>   2        Tree Analysis                        82
 #TOC>   2.1        Rooting Trees                     141
@@ -269,13 +269,14 @@ rtree(n = length(apsTree2$tip.label),  # number of tips
                                       #   compare them anyway.
 # Let's compute some random trees this way, calculate the distances to
-# fungiTree, and then compare the values we get for apsTree2:
+# fungiTree, and then compare the values we get for apsTree2. The random
 # trees are provided by ape::rtree().
 set.seed(112358)
 N <- 10000  # takes about 15 seconds
 myTreeDistances <- matrix(numeric(N * 2), ncol = 2)
 colnames(myTreeDistances) <- c("symm", "path")
 set.seed(112358)
 for (i in 1:N) {
  xTree <- rtree(n = length(apsTree2$tip.label),
                 rooted = TRUE,
@@ -283,6 +284,7 @@ for (i in 1:N) {
                 br = NULL)
  myTreeDistances[i, ] <- treedist(fungiTree, xTree)
 }
 set.seed(NULL)                      # reset the random number generator
 table(myTreeDistances[, "symm"])
--- a/BIN-PPI-Analysis.R
+++ b/BIN-PPI-Analysis.R
@@ -28,14 +28,14 @@
 #TOC> ==========================================================================
 #TOC> 
 #TOC>   Section  Title                                           Line
-#TOC> ---------------------------------------------------------
+#TOC> ---------------------------------------------------------------
 #TOC>   1        Setup and data                                    43
 #TOC>   2        Functional Edges in the Human Proteome            80
 #TOC>   2.1        Cliques                                        123
 #TOC>   2.2        Communities                                    164
-#TOC>   2.3      Betweenness Centrality                     176
+#TOC>   2.3        Betweenness Centrality                         178
-#TOC>   3        biomaRt                                    220
+#TOC>   3        biomaRt                                          224
-#TOC>   4        Task for submission                        291
+#TOC>   4        Task for submission                              295
 #TOC> 
 #TOC> ==========================================================================
@@ -163,8 +163,10 @@ par(oPar)
 # ==   2.2  Communities  =======================================================
-set.seed(112358)
+set.seed(112358)                       # set RNG seed for repeatable randomness
 gSTRclusters <- cluster_infomap(gSTR)
 set.seed(NULL)                         # reset the RNG
 modularity(gSTRclusters) # ... measures how separated the different membership
                         # types are from each other
 tMem <- table(membership(gSTRclusters))
@@ -205,9 +207,11 @@ head(sBC)
 # We are going to use these IDs to produce some output for a submitted task:
 # so I need you to personalize ENSPsel with the following
-# two lines of code:
+# three lines of code:
 set.seed(<myStudentNumber>)         # enter your student number here
 (ENSPsel <- sample(ENSPsel))
 set.seed(NULL)                      # reset the random number generator
 #  Next, to find what these proteins are...
--- a/BIN-Sequence.R
+++ b/BIN-Sequence.R
@@ -3,12 +3,13 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-Sequence unit.
 #
-# Version:  1.2
+# Version:  1.3
 #
-# Date:     2017  09  - 2017  10
+# Date:     2017  09  - 2019  01
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
 #           1.3    Update set.seed() usage
 #           1.2    Removed irrelevant task. How did that even get in there? smh
 #           1.1    Add chartr()
 #           1.0    First live version 2017.
@@ -28,20 +29,24 @@
 #TOC> ==========================================================================
 #TOC> 
 #TOC>   Section  Title                                Line
-#TOC> ----------------------------------------------
+#TOC> ----------------------------------------------------
-#TOC>   1        Prepare                          55
+#TOC>   1        Prepare                                60
-#TOC>   2        Storing Sequence                 73
+#TOC>   2        Storing Sequence                       78
-#TOC>   3        String properties               102
+#TOC>   3        String properties                     107
-#TOC>   4        Substrings                      109
+#TOC>   4        Substrings                            114
-#TOC>   5        Creating strings: sprintf()     115
+#TOC>   5        Creating strings: sprintf()           135
-#TOC>   6        Changing strings                146
+#TOC>   6        Changing strings                      170
-#TOC>   6.1      stringi and stringr             198
+#TOC>   6.1.1          Changing case                   172
-#TOC>   6.2      dbSanitizeSequence()            208
+#TOC>   6.1.2          Reverse                         177
-#TOC>   7        Permuting and sampling          220
+#TOC>   6.1.3          Change characters               181
-#TOC>   7.1      Permutations                    227
+#TOC>   6.1.4          Substitute characters           209
-#TOC>   7.2      Sampling                        270
+#TOC>   6.2        stringi and stringr                 229
-#TOC>   7.2.1    Equiprobable characters         272
+#TOC>   6.3        dbSanitizeSequence()                239
-#TOC>   7.2.2    Defined probability vector      312
+#TOC>   7        Permuting and sampling                251
 #TOC>   7.1        Permutations                        258
 #TOC>   7.2        Sampling                            304
 #TOC>   7.2.1          Equiprobable characters         306
 #TOC>   7.2.2          Defined probability vector      348
 #TOC> 
 #TOC> ==========================================================================
@@ -111,16 +116,31 @@ nchar(s)  # aha
 # Use the substr() function
 substr(s, 2, 4)
 # or the similar substring()
 substring(s, 2, 4)
 # Note: both functions are vectorized (i.e. they operate on vectors
 # of arguments, you don't need to loop over input)...
 myBiCodes <- c("HOMSA", "MUSMU", "FUGRU", "XENLA")
 substr(   myBiCodes, 1, 3)
 substring(myBiCodes, 1, 3)
 # ... however only substring() will also use vectors for start and stop
 s <- "gatattgtgatgacccagtaa"     # a DNA sequence
 (i <- seq(1, nchar(s), by = 3))  # an index vector
 substr(   s, i, i+2)             # ... returns only the first nucleotide triplet
 substring(s, i, i+2)             # ... returns all triplets
 # =    5  Creating strings: sprintf()  =========================================
 # Sprintf is a very smart, very powerful function and has cognates in all
-# other programming languages. It has a small learning curve, but it's
+# other programming languages. It has a bit of a learning curve, but this is
 # totally worth it:
 # the function takes a format string, and a list of other arguments. It returns
 # a formatted string. Here are some examples - watch carefully for sprintf()
-# calls in other code.
+# calls elsewhere in the code.
 sprintf("Just a string.")
 sprintf("A string and the number %d.", 5)
@@ -128,32 +148,37 @@ sprintf("More numbers: %d ate %d.", 7, 9) # Sorry
 sprintf("Pi is ~ %1.2f ...", pi)
 sprintf("or more accurately ~ %1.11f.", pi)
 x <- "bottles of beer"
-n <- 99
+N <- 99
 sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.",
-        n, x, n, x, "one down, and pass it around", n-1, x)
+        N, x, N, x, "one down, and pass it around", N - 1, x)
 # Note that in the last example, the value of the string was displayed with
 # R's usual print-formatting function and therefore the line-break "\n" did
 # not actually break the line. To have line breaks, tabs etc, you need to use
 # cat() to display the string:
-for (i in 99:95) {
+for (i in N:(N-4)) {
  cat(sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.\n\n",
-              i, x, i, x, "one down, and pass it around", i-1, x))
+              i, x, i, x, "one down, and pass it around", i - 1, x))
 }
 # sprintf() is vectorized: if one of its parameters is a vector, it
 # will generate one output string for each of the vector's elements:
 cat(sprintf("\n%s fish", c("one", "two", "red", "blue")))
 # =    6  Changing strings  ====================================================
-# Changing case
+# ===   6.1.1  Changing case              
 tolower(s)
 toupper(tolower(s))
-#reverse
+# ===   6.1.2  Reverse                    
 reverse(s)
 # ===   6.1.3  Change characters          
 # chartr(old, new, x) maps all characters in x that appear in "old" to the
 # corresponding character in "new."
@@ -167,15 +192,21 @@ chartr(paste0(letters, collapse = ""),
 # One amusing way to use the function  is for a reversible substitution
 # cypher.
-set.seed(112358)
+set.seed(112358)                       # set RNG seed for repeatable randomness
-myCypher <- paste0(sample(letters), collapse = "")
+(myCypher <- paste0(sample(letters), collapse = ""))
-lett <- paste0(letters, collapse = "")
+set.seed(NULL)                         # reset the RNG
 (lett <- paste0(letters, collapse = ""))
 # encode ...
 (x <- chartr(lett, myCypher, "... seven for a secret, never to be told."))
 # decode ...
 chartr(myCypher, lett, x)
 # (Nb. substitution cyphers are easy to crack!)
-# substituing characters
+# ===   6.1.4  Substitute characters      
 (s <- gsub("IV", "i-v", s))  # gsub can change length, first argument is
                             # a "regular expression"!
@@ -195,7 +226,7 @@ MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")
 # remove "whitespace" (spaces, tabs, line breaks)...
 (s <- gsub("\\s", "", s))
-# ==   6.1  stringi and stringr  ===============================================
+# ==   6.2  stringi and stringr  ===============================================
 # But there are also specialized functions eg. to remove leading/trailing
 # whitespace which may be important to sanitize user input etc. Have a look at
@@ -205,7 +236,7 @@ MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")
-# ==   6.2  dbSanitizeSequence()  ==============================================
+# ==   6.3  dbSanitizeSequence()  ==============================================
 # In our learning units, we use a function dbSanitizeSequence() to clean up
 # sequences that may be copy/pasted from Web-sources
@@ -254,10 +285,13 @@ mean(which(x == "K"))  # ... gives us the average of the permuted sequence.
 (s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
 N <- 10000
 d <- numeric(N)
-set.seed(112358)
+
 set.seed(112358)                       # set RNG seed for repeatable randomness
 for (i in 1:N) {
  d[i] <- mean(which(sample(s, length(s)) == "K"))
 }
 set.seed(NULL)                         # reset the RNG
 hist(d, breaks = 20)
 abline(v = 2.5, lwd = 2, col = "firebrick")
 sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the
@@ -276,8 +310,10 @@ sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the
 nuc <- c("A", "C", "G", "T")
 N <- 100
-set.seed(16818)
+
 set.seed(16818)                        # set RNG seed for repeatable randomness
 v <- sample(nuc, N, replace = TRUE)
 set.seed(NULL)                         # reset the RNG
 (mySeq <- paste(v, collapse = ""))
 # What's the GC content?
@@ -297,7 +333,7 @@ if (! require(stringi, quietly=TRUE)) {
 #  data(package = "stringi")     # available datasets
-(x <- stri_match_all(mySeq, regex = "CG"))
+(x <- stringi::stri_match_all(mySeq, regex = "CG"))  # all "CG" matches in mySeq
 length(unlist(x))
 # Now you could compare that number with yeast DNA sequences, and determine
@@ -323,9 +359,12 @@ c(rep("C", 19), rep("G", 19), rep(c("A"), 31), rep(c("T"), 31))
 nuc <- c("A", "C", "G", "T")
 N <- 100
 set.seed(16818)
 myProb <- c(0.31, 0.19, 0.19, 0.31)    # sampling probabilities
 set.seed(16818)                       # set RNG seed for repeatable randomness
 v <- sample(nuc, N, prob = myProb, replace = TRUE)
 set.seed(NULL)                         # reset the RNG
 (mySeq <- paste(v, collapse = ""))
 # What's the GC content?
@@ -333,7 +372,7 @@ table(v)
 sum(table(v)[c("G", "C")]) # Close to expected
 # What's the number of CpG motifs?
-(x <- stri_match_all(mySeq, regex = "CG"))
+(x <- stringi::stri_match_all(mySeq, regex = "CG"))
 # ... not a single one in this case.
--- a/FND-MAT-Graphs_and_networks.R
+++ b/FND-MAT-Graphs_and_networks.R
@@ -3,12 +3,13 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the FND-MAT-Graphs_and_networks unit.
 #
-# Version:  1.0
+# Version:  1.1
 #
-# Date:     2017  10  06
+# Date:     2017  10  -  2019  01
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
 #           1.1    Update set.seed() usage
 #           1.0    First final version for learning units.
 #           0.1    First code copied from 2016 material.
 #
@@ -28,18 +29,18 @@
 #TOC> ==========================================================================
 #TOC> 
 #TOC>   Section  Title                                        Line
-#TOC> ------------------------------------------------------
+#TOC> ------------------------------------------------------------
-#TOC>   1        Review                                   52
+#TOC>   1        Review                                         48
 #TOC>   2        DEGREE DISTRIBUTIONS                          201
 #TOC>   2.1        Random graph                                207
-#TOC>   2.2      scale-free graph (Barabasi-Albert)      251
+#TOC>   2.2        scale-free graph (Barabasi-Albert)          255
-#TOC>   2.3      Random geometric graph                  313
+#TOC>   2.3        Random geometric graph                      320
-#TOC>   3        A CLOSER LOOK AT THE igraph PACKAGE     433
+#TOC>   3        A CLOSER LOOK AT THE igraph PACKAGE           442
-#TOC>   3.1      Basics                                  436
+#TOC>   3.1        Basics                                      445
-#TOC>   3.2      Components                              508
+#TOC>   3.2        Components                                  517
-#TOC>   4        RANDOM GRAPHS AND GRAPH METRICS         527
+#TOC>   4        RANDOM GRAPHS AND GRAPH METRICS               536
-#TOC>   4.1      Diameter                                562
+#TOC>   4.1        Diameter                                    573
-#TOC>   5        GRAPH CLUSTERING                        630
+#TOC>   5        GRAPH CLUSTERING                              641
 #TOC> 
 #TOC> ==========================================================================
@@ -57,7 +58,7 @@
 # To begin let's write a little function that will create random "gene" names;
 # there's no particular purpose to this other than to make our graphs look a
-# little more "biological ...
+# little more "biological" ...
 makeRandomGenenames <- function(N) {
  nam <- character()
  while (length(nam) < N) {
@@ -72,8 +73,9 @@ makeRandomGenenames <- function(N) {
 N <- 20
-set.seed(112358)
+set.seed(112358)                       # set RNG seed for repeatable randomness
 (Nnames <- makeRandomGenenames(N))
 set.seed(NULL)                         # reset the RNG
 # One way to represent graphs in a computer is as an "adjacency matrix". In this
 # matrix, each row and each column represents a node, and the cell at the
@@ -112,8 +114,9 @@ makeRandomAM <- function(nam, p = 0.1) {
  return(AM)
 }
-set.seed(112358)
+set.seed(112358)                       # set RNG seed for repeatable randomness
 (myRandAM <- makeRandomAM(Nnames, p = 0.09))
 set.seed(NULL)                         # reset the RNG
 # Listing the matrix is not very informative - we should plot this graph. The
@@ -131,8 +134,10 @@ if (! require(igraph, quietly=TRUE)) {
 myG <- graph_from_adjacency_matrix(myRandAM, mode = "undirected")
-set.seed(112358)
+
 set.seed(112358)                       # set RNG seed for repeatable randomness
 myGxy <- layout_with_graphopt(myG, charge=0.0012) # calculate layout coordinates
 set.seed(NULL)                         # reset the RNG
 # The igraph package adds its own function to the collection of plot()
@@ -201,13 +206,17 @@ axis(side = 1, at = 0:7)
 # ==   2.1  Random graph  ======================================================
 N <- 200
 set.seed(31415927)                     # set RNG seed for repeatable randomness
 my200AM <- makeRandomAM(as.character(1:N), p = 0.015)
 set.seed(NULL)                         # reset the RNG
 set.seed(31415927)
 my200AM <- makeRandomAM(as.character(1:200), p = 0.015)
 myG200 <- graph_from_adjacency_matrix(my200AM, mode = "undirected")
-myGxy <- layout_with_graphopt(myG200, charge=0.0001) # calculate layout coordinates
+myGxy <- layout_with_graphopt(myG200, charge=0.0001) # calculate layout
                                                     # coordinates
-oPar <- par(mar= rep(0,4)) # Turn margins off
+oPar <- par(mar= rep(0,4))             # Turn margins off, save graphics state
 plot(myG200,
     layout = myGxy,
     rescale = FALSE,
@@ -216,7 +225,7 @@ plot(myG200,
     vertex.color=heat.colors(max(degree(myG200)+1))[degree(myG200)+1],
     vertex.size = 150 + (60 * degree(myG200)),
     vertex.label = NA)
-par(oPar)
+par(oPar)                              # restore graphics state
 # This graph has thirteen singletons and one large, connected component. Many
 # biological graphs look approximately like this.
@@ -251,12 +260,15 @@ plot(log10(as.numeric(names(freqRank)) + 1),
 # stands for "preferential attachment". Preferential attachment is one type of
 # process that will yield scale-free distributions.
-set.seed(31415927)
+N <- 200
-GBA <- sample_pa(200, power = 0.8, directed = FALSE)
+
 set.seed(31415927)                     # set RNG seed for repeatable randomness
 GBA <- sample_pa(N, power = 0.8, directed = FALSE)
 set.seed(NULL)                         # reset the RNG
 GBAxy <- layout_with_graphopt(GBA, charge=0.0001) # calculate layout coordinates
-oPar <- par(mar= rep(0,4)) # Turn margins off
+oPar <- par(mar= rep(0,4))             # Turn margins off, save graphics state
 plot(GBA,
     layout = GBAxy,
     rescale = FALSE,
@@ -265,7 +277,7 @@ plot(GBA,
     vertex.color=heat.colors(max(degree(GBA)+1))[degree(GBA)+1],
     vertex.size = 200 + (30 * degree(GBA)),
     vertex.label = NA)
-par(oPar)
+par(oPar)                              # restore graphics state
 # This is a very obviously different graph! Some biological networks have
 # features that look like that - but in my experience the hub nodes are usually
@@ -386,8 +398,10 @@ makeRandomGeometricAM <- function(nam, B = 25, Q = 0.001, t = 0.6) {
 #      xlab = "d", ylab = "p(edge)")
 # 200 node random geometric graph
-set.seed(112358)
+N <- 200
-rGAM <- makeRandomGeometricAM(as.character(1:200), t=0.4)
+set.seed(112358)                       # set RNG seed for repeatable randomness
 rGAM <- makeRandomGeometricAM(as.character(1:N), t = 0.4)
 set.seed(NULL)                         # reset the RNG
 myGRG <- graph_from_adjacency_matrix(rGAM$mat, mode = "undirected")
@@ -539,20 +553,22 @@ names(c1)
 # considered to be more central. And that's also the way the force-directed
 # layout draws them, obviously.
-set.seed(112358)
+set.seed(112358)                       # set RNG seed for repeatable randomness
 myGxy <- layout_with_fr(myG)           # calculate layout coordinates
-oPar <- par(mar= rep(0,4)) # Turn margins off
+set.seed(NULL)                         # reset the RNG
 oPar <- par(mar = rep(0, 4))           # turn margins off, save graphics state
 plot(myG,
     layout = myGxy,
     rescale = FALSE,
     xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01),
     ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01),
-     vertex.color=heat.colors(max(degree(myG)+1))[degree(myG)+1],
+     vertex.color=heat.colors(max(degree(myG) + 1))[degree(myG) + 1],
     vertex.size = 20 + (10 * degree(myG)),
     vertex.label = V(myG)$name,
     vertex.label.family = "sans",
     vertex.label.cex = 0.8)
-par(oPar)
+par(oPar)                              # restore graphics state
 # ==   4.1  Diameter  ==========================================================
--- a/FND-STA-Probability_distribution.R
+++ b/FND-STA-Probability_distribution.R
@@ -3,12 +3,13 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the FND-STA-Probability_distribution unit.
 #
-# Version:  1.1
+# Version:  1.2
 #
-# Date:     2017  10
+# Date:     2017  10  -  2019  01
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
 #           1.2    Update set.seed() usage
 #           1.1    Corrected empirical p-value
 #           1.0    First code live version
 #
@@ -27,21 +28,21 @@
 #TOC> ==========================================================================
 #TOC>
 #TOC>   Section  Title                                                     Line
-#TOC> -----------------------------------------------------------------------
+#TOC> -------------------------------------------------------------------------
-#TOC>   1        Introduction                                              49
+#TOC>   1        Introduction                                                50
-#TOC>   2        Three fundamental distributions                          112
+#TOC>   2        Three fundamental distributions                            113
-#TOC>   2.1      The Poisson Distribution                                 115
+#TOC>   2.1        The Poisson Distribution                                 116
-#TOC>   2.2      The uniform distribution                                 168
+#TOC>   2.2        The uniform distribution                                 170
-#TOC>   2.3      The Normal Distribution                                  188
+#TOC>   2.3        The Normal Distribution                                  190
-#TOC>   3        quantile-quantile comparison                             229
+#TOC>   3        quantile-quantile comparison                               231
-#TOC>   3.1      qqnorm()                                                 239
+#TOC>   3.1        qqnorm()                                                 241
-#TOC>   3.2      qqplot()                                                 299
+#TOC>   3.2        qqplot()                                                 307
-#TOC>   4        Quantifying the difference                               316
+#TOC>   4        Quantifying the difference                                 324
-#TOC>   4.1      Chi2 test for discrete distributions                     350
+#TOC>   4.1        Chi2 test for discrete distributions                     359
-#TOC>   4.2      Kullback-Leibler divergence                              441
+#TOC>   4.2        Kullback-Leibler divergence                              451
-#TOC>   4.2.1    An example from tossing dice                             452
+#TOC>   4.2.1          An example from tossing dice                         462
-#TOC>   4.2.2    An example from lognormal distributions                  574
+#TOC>   4.2.2          An example from lognormal distributions              585
-#TOC>   4.3      Kolmogorov-Smirnov test for continuous distributions     616
+#TOC>   4.3        Kolmogorov-Smirnov test for continuous distributions     628
 #TOC>
 #TOC> ==========================================================================
@@ -151,6 +152,7 @@ set.seed(112358)
 for (i in 1:N) {
  x[i] <- sum(sample(genes, 250)) # sum of TFs in our sample in this trial
 }
 set.seed(NULL)
 (t <- table(x)/N)
@@ -241,8 +243,10 @@ hist(v, breaks = 20, col = "#F8DDFF")
 # The functions qqnorm() and qqline() perform this
 # comparison with the normal distribution.
-set.seed(1112358)
+set.seed(112358)
-x <- rnorm(100, mean=0, sd=1) # 100 normally distributed balues
+x <- rnorm(100, mean=0, sd=1) # 100 normally distributed values
 set.seed(NULL)
 qqnorm(x)
 qqline(x, col = "seagreen")
@@ -253,12 +257,15 @@ qqline(x, col = "seagreen")
 # Create a vector of sample means from the exponential distribution; use
 # only a few samples for the mean
 set.seed(112358)
 x <- rexp(12345)
 v <- numeric(999)
 set.seed(112358)
 for (i in 1:length(v)) {
  v[i] <- mean(sample(x, 12))
 }
 set.seed(NULL)
 qqnorm(v)
 qqline(v, col = "turquoise") # normal
@@ -288,13 +295,14 @@ rEVD <- numeric(9999)
 for (i in seq_along(rEVD)) {
  rEVD[i] <- max(rnorm(100))
 }
 set.seed(NULL)
 hist(rEVD, breaks = 20, col = "orchid")
 # Note the long tail on the right!
 qqnorm(rEVD)
-qqline(rEVD, col = "orchid") # normal
+qqline(rEVD, col = "orchid") # Definitely not "normal"!
 # Definitely not "normal"!
 # ==   3.2  qqplot()  ==========================================================
@@ -331,6 +339,7 @@ dl2 <- dlnorm(x - 0.25) # log-normal distribution, shifted right (a bit)
 dg1.2 <- dgamma(x, shape=1.2)   # three gamma distributions with...
 dg1.5 <- dgamma(x, shape=1.5)   # ...wider, and wider...
 dg1.9 <- dgamma(x, shape=1.9)   # ...peak
 set.seed(NULL)
 myCols <- c("black", "grey", "maroon", "turquoise", "steelblue")
@@ -361,6 +370,7 @@ rL2   <- rlnorm(N, meanlog = 0.25) # log-normal distribution, shifted right
 rG1.2 <- rgamma(N, shape=1.2)   # three gamma distributions with...
 rG1.5 <- rgamma(N, shape=1.5)   # ...wider, and wider...
 rG1.9 <- rgamma(N, shape=1.9)   # ...peak
 set.seed(NULL)
 maxX <- max(c(rL1, rL2, rG1.2, rG1.5, rG1.9))
@@ -459,6 +469,7 @@ chisq.test(countsL1, countsG1.9, simulate.p.value = TRUE, B = 10000)
 set.seed(47)
 N <- 20
 (counts <- table(sample(1:6, N, replace = TRUE)))
 set.seed(NULL)
 # We have not observed a "2"!
 #
@@ -597,6 +608,7 @@ for (i in 1:N) {
  q <- pmfPC(y, nam = 1:10)  # convert to p.m.f. with pseudocounts
  divs[i] <- KLdiv(pmfL1, q)     # calculate Kullback-Leibler divergence
 }
 set.seed(NULL)
 hist(divs,
     col = "thistle",
@@ -605,7 +617,7 @@ hist(divs,
 abline(v = KLdiv(pmfL1, pmfL2), col="firebrick")
 # How many KL-divergences were less than the difference we observed?
-sum(divs < KLdiv(pmfL1, pmfL2)) #933
+sum(divs < KLdiv(pmfL1, pmfL2)) # 933
 # Therefore the empirical p-value that the samples came from the same
 # distribution is only 100 * ((N - 933) + 1) / (N + 1) (%) ... 6.8%. You see
--- a/FND-STA-Significance.R
+++ b/FND-STA-Significance.R
@@ -3,12 +3,13 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the FND-STA-Significance unit.
 #
-# Version:  1.1
+# Version:  1.2
 #
-# Date:     2017  09  - 2017  10
+# Date:     2017  09  - 2019  01
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
 #           1.2    Update set.seed() usage
 #           1.1    Corrected treatment of empirical p-value
 #           1.0    First contents
 #
@@ -26,15 +27,15 @@
 #TOC> ==========================================================================
 #TOC> 
 #TOC>   Section  Title                                              Line
-#TOC> ------------------------------------------------------------
+#TOC> ------------------------------------------------------------------
-#TOC>   1        Significance and p-value                       42
+#TOC>   1        Significance and p-value                             43
-#TOC>   1.1      Significance levels                            53
+#TOC>   1.1        Significance levels                                54
-#TOC>   1.2      probability and p-value                        70
+#TOC>   1.2        probability and p-value                            71
-#TOC>   1.2.1    p-value illustrated                           100
+#TOC>   1.2.1          p-value illustrated                           103
-#TOC>   2        One- or two-sided                             153
+#TOC>   2        One- or two-sided                                   158
-#TOC>   3        Significance by integration                   193
+#TOC>   3        Significance by integration                         198
-#TOC>   4        Significance by simulation or permutation     199
+#TOC>   4        Significance by simulation or permutation           204
-#TOC>   5        Final tasks                                   302
+#TOC>   5        Final tasks                                         312
 #TOC> 
 #TOC> ==========================================================================
@@ -75,6 +76,8 @@
 set.seed(sqrt(5))
 x <- rnorm(1)
 set.seed(NULL)
 print(x, digits = 22)
 # [1] -0.8969145466249813791748
@@ -102,8 +105,10 @@ print(x, digits = 22)
 # Let's illustrate. First we draw a million random values from our
 # standard, normal distribution:
-set.seed(112358)
+N <- 1e6                             # one million
-r <- rnorm(1000000)
+set.seed(112358)                     # set RNG seed for repeatable randomness
 r <- rnorm(N)                        # N values from a normal distribution
 set.seed(NULL)                       # reset the RNG
 # Let's see what the distribution looks like:
@@ -277,9 +282,14 @@ chSep <- function(v) {
 chSep(v)
 # Now we can produce a random permutation of v, and recalculate
-set.seed(pi)
+
 set.seed(pi)                       # set RNG seed for repeatable randomness
 w <- sample(v, length(v))          # This shuffles the vector v. Memorize this
                                   # code paradigm. It is very useful.
 set.seed(NULL)                     # reset the RNG
 chSep(w)
 # 3.273 ... that's actually less than what we had before.
--- a/RPR-GEO2R.R
+++ b/RPR-GEO2R.R
@@ -489,7 +489,8 @@ for (name in toupper(myControls)) {
 # ==   5.1  Final task: Gene descriptions  =====================================
-#    Print the descriptions of the top ten differentially expressed genes.
+#    Print the descriptions of the top ten differentially expressed genes
 #    and comment on what they have in common (or not).
 # =    6  Improving on Discovery by Differential Expression  ===================
@@ -617,9 +618,9 @@ GPL1914 <- getGEO("GPL1914")
 str(GPL1914)
 # ... from which we can get the data - which is however NOT necessarily
-# matched to the rows of our expression dataset. Note that here to: the majority
+# matched to the rows of our expression dataset. Note that here too: the
-# of data elements are factors and will likely have to be converted before
+# majority of data elements are factors and will likely have to be converted
-# use.
+# before use.
 # [END]
--- a/RPR-Genetic_code_optimality.R
+++ b/RPR-Genetic_code_optimality.R
@@ -3,12 +3,13 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the RPR-Genetic_code_optimality unit.
 #
-# Version:  1.0.1
+# Version:  1.1
 #
-# Date:     2017  10  16
+# Date:     2017  10  -  2019  01
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
 #           1.1      Update set.seed() usage
 #           1.0.1    Fixed two bugs discovered by Suan Chin Yeo.
 #           1.0      New material.
 #
@@ -28,17 +29,17 @@
 #TOC> ==========================================================================
 #TOC> 
 #TOC>   Section  Title                                          Line
-#TOC> --------------------------------------------------------
+#TOC> --------------------------------------------------------------
-#TOC>   1        Designing a computational experiment       57
+#TOC>   1        Designing a computational experiment             54
-#TOC>   2        Setting up the tools                       73
+#TOC>   2        Setting up the tools                             70
-#TOC>   2.1      Natural and alternative genetic codes      76
+#TOC>   2.1        Natural and alternative genetic codes          73
-#TOC>   2.2      Effect of mutations                       135
+#TOC>   2.2        Effect of mutations                           132
-#TOC>   2.2.1    reverse-translate                         146
+#TOC>   2.2.1          reverse-translate                         143
-#TOC>   2.2.2    Randomly mutate                           171
+#TOC>   2.2.2          Randomly mutate                           168
-#TOC>   2.2.3    Forward- translate                        196
+#TOC>   2.2.3          Forward- translate                        193
-#TOC>   2.2.4    measure effect                            214
+#TOC>   2.2.4          measure effect                            211
-#TOC>   3        Run the experiment                        261
+#TOC>   3        Run the experiment                              258
-#TOC>   4        Task solutions                            348
+#TOC>   4        Task solutions                                  351
 #TOC> 
 #TOC> ==========================================================================
@@ -269,18 +270,21 @@ myAA <- traFor(myDNA, GENETIC_CODE)
 # Mutate and evaluate
 set.seed(112358)                   # set RNG seed for repeatable randomness
 x <- randMut(myDNA)                # mutate
 set.seed(NULL)                     # reset the RNG
 x <- traFor(x, GENETIC_CODE)       # translate
 evalMut(myAA, x)  # 166.4
 # Try this 200 times, and see how the values are distributed.
 N <- 200                           # number of trials
 valUGC <- numeric(N)               # preallocate one result slot per trial
 # Seed once, directly before the first random draw: the two assignments above
 # do not use the RNG, so an additional set.seed() ahead of them is redundant.
 set.seed(112358)                   # set RNG seed for repeatable randomness
 for (i in 1:N) {
  x <- randMut(myDNA)              # mutate
  x <- traFor(x, GENETIC_CODE)     # translate
  valUGC[i] <- evalMut(myAA, x)    # evaluate
 }
 set.seed(NULL)                     # reset the RNG
 hist(valUGC,
     breaks = 15,
@@ -299,6 +303,7 @@ effectUGC <- mean(valUGC)  # 178.1
 set.seed(112358)                   # set RNG seed for repeatable randomness
 # choose a new code
 GC <- randomGC(GENETIC_CODE)
 set.seed(NULL)                     # reset the RNG
 # reverse translate hypothetical sequence according to the new code
 x <- traRev(myAA, GC)
@@ -311,9 +316,10 @@ evalMut(myAA, x)       # evaluate mutation effects: 298.5
 # Let's try with different genetic codes. 200 trials - but this time every trial
 # is with a different, synthetic genetic code.
 N <- 200                         # number of trials
 valXGC <- numeric(N)             # preallocate one result slot per trial
 # Seed once, directly before the first random draw: the two assignments above
 # do not use the RNG, so an additional set.seed() ahead of them is redundant.
 set.seed(1414214)                # set RNG seed for repeatable randomness
 for (i in 1:N) {
  GC <- randomGC(GENETIC_CODE)   # Choose code
  x <- traRev(myAA, GC)          # reverse translate
@@ -321,6 +327,7 @@ for (i in 1:N) {
  x <- traFor(x, GC)             # translate
  valXGC[i] <- evalMut(myAA, x)  # evaluate
 }
 set.seed(NULL)                   # reset the RNG
 hist(valXGC,
     col = "plum",
@@ -343,9 +350,10 @@ hist(valXGC,
 # =    4  Task solutions  ======================================================
 N <- 200                         # number of trials
 valSGC <- numeric(N)             # preallocate one result slot per trial
 # Seed once, directly before the first random draw: the two assignments above
 # do not use the RNG, so an additional set.seed() ahead of them is redundant.
 set.seed(2718282)                # set RNG seed for repeatable randomness
 for (i in 1:N) {
  GC <- swappedGC(GENETIC_CODE)  # Choose code
  x <- traRev(myAA, GC)          # reverse translate
@@ -353,6 +361,7 @@ for (i in 1:N) {
  x <- traFor(x, GC)             # translate
  valSGC[i] <- evalMut(myAA, x)  # evaluate
 }
 set.seed(NULL)                   # reset the RNG
 hist(valSGC,
     col = "#6688FF88",