Maintenance updates, and revised set.seed() usage

2019-01-07 16:17:23 +10:00 · 2019-01-07 16:17:23 +10:00 · 6f54293592
commit 6f54293592
parent 2ab162e375
9 changed files with 275 additions and 180 deletions
--- a/BIN-MYSPE.R
+++ b/BIN-MYSPE.R
@ -25,13 +25,14 @@
 #
 # ==============================================================================

+
 #TOC> ==========================================================================
 #TOC>
-#TOC>   Section  Title                   Line
-#TOC> ---------------------------------------
-#TOC>   1        Preparations              39
-#TOC>   2        Suitable MYSPE Species    51
-#TOC>   3        Adopt "MYSPE"             65
+#TOC>   Section  Title                           Line
+#TOC> -----------------------------------------------
+#TOC>   1        Preparations                      39
+#TOC>   2        Suitable MYSPE Species            51
+#TOC>   3        Adopt "MYSPE"                     65
 #TOC>
 #TOC> ==========================================================================

@ -71,6 +72,7 @@ if (! exists("myStudentNumber")) {
 load("data/MYSPEspecies.RData")     # load the species names
 set.seed(myStudentNumber)           # seed the random number generator
 MYSPE <- sample(MYSPEspecies, 1)    # pick a species at random
+set.seed(NULL)                      # reset the random number generator
 # write the result to your personalized profile data so we can use the result in
 # other functions
 cat(sprintf("MYSPE <- \"%s\"\n", MYSPE), file = ".myProfile.R", append = TRUE)
@ -80,7 +82,7 @@ biCode(MYSPE) # and what is it's "BiCode" ... ?

 # Task: Note down the species name and its five letter label on your Student
 # Wiki user page. Use this species whenever this or future assignments refer
-# to MYSPE. In code, we will automatically load it from your.myProfile.R file.
+# to MYSPE. In code, we will automatically load it from your .myProfile.R file.


 # [END]
--- a/BIN-PHYLO-Tree_analysis.R
+++ b/BIN-PHYLO-Tree_analysis.R
@ -28,15 +28,15 @@


 #TOC> ==========================================================================
-#TOC>
-#TOC>   Section  Title                        Line
-#TOC> --------------------------------------------
-#TOC>   1        Preparation and Tree Plot      43
-#TOC>   2        Tree Analysis                  82
-#TOC>   2.1      Rooting Trees                 141
-#TOC>   2.2      Rotating Clades               187
-#TOC>   2.3      Computing tree distances      234
-#TOC>
+#TOC> 
+#TOC>   Section  Title                              Line
+#TOC> --------------------------------------------------
+#TOC>   1        Preparation and Tree Plot            43
+#TOC>   2        Tree Analysis                        82
+#TOC>   2.1        Rooting Trees                     141
+#TOC>   2.2        Rotating Clades                   187
+#TOC>   2.3        Computing tree distances          234
+#TOC> 
 #TOC> ==========================================================================


@ -269,13 +269,14 @@ rtree(n = length(apsTree2$tip.label),  # number of tips
                                       #   compare them anyway.

 # Let's compute some random trees this way, calculate the distances to
-# fungiTree, and then compare the values we get for apsTree2:
+# fungiTree, and then compare the values we get for apsTree2. The random
+# trees are provided by ape::rtree().

-set.seed(112358)
 N <- 10000  # takes about 15 seconds
 myTreeDistances <- matrix(numeric(N * 2), ncol = 2)
 colnames(myTreeDistances) <- c("symm", "path")

+set.seed(112358)
 for (i in 1:N) {
  xTree <- rtree(n = length(apsTree2$tip.label),
                 rooted = TRUE,
@ -283,6 +284,7 @@ for (i in 1:N) {
                 br = NULL)
  myTreeDistances[i, ] <- treedist(fungiTree, xTree)
 }
+set.seed(NULL)                      # reset the random number generator

 table(myTreeDistances[, "symm"])

--- a/BIN-PPI-Analysis.R
+++ b/BIN-PPI-Analysis.R
@ -27,15 +27,15 @@

 #TOC> ==========================================================================
 #TOC> 
-#TOC>   Section  Title                                     Line
-#TOC> ---------------------------------------------------------
-#TOC>   1        Setup and data                              43
-#TOC>   2        Functional Edges in the Human Proteome      80
-#TOC>   2.1      Cliques                                    123
-#TOC>   2.2      Communities                                164
-#TOC>   2.3      Betweenness Centrality                     176
-#TOC>   3        biomaRt                                    220
-#TOC>   4        Task for submission                        291
+#TOC>   Section  Title                                           Line
+#TOC> ---------------------------------------------------------------
+#TOC>   1        Setup and data                                    43
+#TOC>   2        Functional Edges in the Human Proteome            80
+#TOC>   2.1        Cliques                                        123
+#TOC>   2.2        Communities                                    164
+#TOC>   2.3        Betweenness Centrality                         178
+#TOC>   3        biomaRt                                          224
+#TOC>   4        Task for submission                              295
 #TOC> 
 #TOC> ==========================================================================

@ -163,8 +163,10 @@ par(oPar)

 # ==   2.2  Communities  =======================================================

-set.seed(112358)
+set.seed(112358)                       # set RNG seed for repeatable randomness
 gSTRclusters <- cluster_infomap(gSTR)
+set.seed(NULL)                         # reset the RNG
+
 modularity(gSTRclusters) # ... measures how separated the different membership
                         # types are from each other
 tMem <- table(membership(gSTRclusters))
@ -205,9 +207,11 @@ head(sBC)

 # We are going to use these IDs to produce some output for a submitted task:
 # so I need you to personalize ENSPsel with the following
-# two lines of code:
-set.seed(<myStudentNumber>) # enter your student number here
+# three lines of code:
+
+set.seed(<myStudentNumber>)         # enter your student number here
 (ENSPsel <- sample(ENSPsel))
+set.seed(NULL)                      # reset the random number generator

 #  Next, to find what these proteins are...

--- a/BIN-Sequence.R
+++ b/BIN-Sequence.R
@ -3,12 +3,13 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-Sequence unit.
 #
-# Version:  1.2
+# Version:  1.3
 #
-# Date:     2017  09  - 2017  10
+# Date:     2017  09  - 2019  01
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
+#           1.3    Update set.seed() usage
 #           1.2    Removed irrelevant task. How did that even get in there? smh
 #           1.1    Add chartr()
 #           1.0    First live version 2017.
@ -27,21 +28,25 @@

 #TOC> ==========================================================================
 #TOC> 
-#TOC>   Section  Title                          Line
-#TOC> ----------------------------------------------
-#TOC>   1        Prepare                          55
-#TOC>   2        Storing Sequence                 73
-#TOC>   3        String properties               102
-#TOC>   4        Substrings                      109
-#TOC>   5        Creating strings: sprintf()     115
-#TOC>   6        Changing strings                146
-#TOC>   6.1      stringi and stringr             198
-#TOC>   6.2      dbSanitizeSequence()            208
-#TOC>   7        Permuting and sampling          220
-#TOC>   7.1      Permutations                    227
-#TOC>   7.2      Sampling                        270
-#TOC>   7.2.1    Equiprobable characters         272
-#TOC>   7.2.2    Defined probability vector      312
+#TOC>   Section  Title                                Line
+#TOC> ----------------------------------------------------
+#TOC>   1        Prepare                                60
+#TOC>   2        Storing Sequence                       78
+#TOC>   3        String properties                     107
+#TOC>   4        Substrings                            114
+#TOC>   5        Creating strings: sprintf()           135
+#TOC>   6        Changing strings                      170
+#TOC>   6.1.1          Changing case                   172
+#TOC>   6.1.2          Reverse                         177
+#TOC>   6.1.3          Change characters               181
+#TOC>   6.1.4          Substitute characters           209
+#TOC>   6.2        stringi and stringr                 229
+#TOC>   6.3        dbSanitizeSequence()                239
+#TOC>   7        Permuting and sampling                251
+#TOC>   7.1        Permutations                        258
+#TOC>   7.2        Sampling                            304
+#TOC>   7.2.1          Equiprobable characters         306
+#TOC>   7.2.2          Defined probability vector      348
 #TOC> 
 #TOC> ==========================================================================

@ -111,16 +116,31 @@ nchar(s)  # aha
 # Use the substr() function
 substr(s, 2, 4)

+# or the similar substring()
+substring(s, 2, 4)
+
+# Note: both functions are vectorized (i.e. they operate on vectors
+# of arguments, you don't need to loop over input)...
+myBiCodes <- c("HOMSA", "MUSMU", "FUGRU", "XENLA")
+substr(   myBiCodes, 1, 3)
+substring(myBiCodes, 1, 3)
+
+# ... however only substring() will also use vectors for start and stop
+s <- "gatattgtgatgacccagtaa"     # a DNA sequence
+(i <- seq(1, nchar(s), by = 3))  # an index vector
+substr(   s, i, i+2)             # ... returns only the first nucleotide triplet
+substring(s, i, i+2)             # ... returns all triplets
+

 # =    5  Creating strings: sprintf()  =========================================


 # Sprintf is a very smart, very powerful function and has cognates in all
-# other programming languages. It has a small learning curve, but it's
+# other programming languages. It has a bit of a learning curve, but this is
 # totally worth it:
 # the function takes a format string, and a list of other arguments. It returns
 # a formatted string. Here are some examples - watch carefully for sprintf()
-# calls in other code.
+# calls elsewhere in the code.

 sprintf("Just a string.")
 sprintf("A string and the number %d.", 5)
@ -128,32 +148,37 @@ sprintf("More numbers: %d ate %d.", 7, 9) # Sorry
 sprintf("Pi is ~ %1.2f ...", pi)
 sprintf("or more accurately ~ %1.11f.", pi)
 x <- "bottles of beer"
-n <- 99
+N <- 99
 sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.",
-        n, x, n, x, "one down, and pass it around", n-1, x)
+        N, x, N, x, "one down, and pass it around", N - 1, x)

 # Note that in the last example, the value of the string was displayed with
 # R's usual print-formatting function and therefore the line-break "\n" did
 # not actually break the line. To have line breaks, tabs etc, you need to use
 # cat() to display the string:

-for (i in 99:95) {
+for (i in N:(N-4)) {
  cat(sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.\n\n",
-              i, x, i, x, "one down, and pass it around", i-1, x))
+              i, x, i, x, "one down, and pass it around", i - 1, x))
 }

+# sprintf() is vectorized: if one of its parameters is a vector, it
+# will generate one output string for each of the vector's elements:
+cat(sprintf("\n%s fish", c("one", "two", "red", "blue")))
+

 # =    6  Changing strings  ====================================================

-# Changing case
+# ===   6.1.1  Changing case              
 tolower(s)
 toupper(tolower(s))


-#reverse
+# ===   6.1.2  Reverse                    
 reverse(s)


+# ===   6.1.3  Change characters          
 # chartr(old, new, x) maps all characters in x that appear in "old" to the
 # correpsonding character in "new."

@ -167,15 +192,21 @@ chartr(paste0(letters, collapse = ""),

 # One amusing way to use the function  is for a reversible substitution
 # cypher.
-set.seed(112358)
-myCypher <- paste0(sample(letters), collapse = "")
-lett <- paste0(letters, collapse = "")
+set.seed(112358)                       # set RNG seed for repeatable randomness
+(myCypher <- paste0(sample(letters), collapse = ""))
+set.seed(NULL)                         # reset the RNG
+
+(lett <- paste0(letters, collapse = ""))
+
+# encode ...
 (x <- chartr(lett, myCypher, "... seven for a secret, never to be told."))
+
+# decode ...
 chartr(myCypher, lett, x)
 # (Nb. substitution cyphers are easy to crack!)


-# substituing characters
+# ===   6.1.4  Substitute characters      
 (s <- gsub("IV", "i-v", s))  # gsub can change length, first argument is
                             # a "regular expression"!

@ -195,7 +226,7 @@ MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")
 # remove "whitespace" (spaces, tabs, line breaks)...
 (s <- gsub("\\s", "", s))

-# ==   6.1  stringi and stringr  ===============================================
+# ==   6.2  stringi and stringr  ===============================================

 # But there are also specialized functions eg. to remove leading/trailing
 # whitespace which may be important to sanitize user input etc. Have a look at
@ -205,7 +236,7 @@ MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")



-# ==   6.2  dbSanitizeSequence()  ==============================================
+# ==   6.3  dbSanitizeSequence()  ==============================================

 # In our learning units, we use a function dbSanitizeSequence() to clean up
 # sequences that may be copy/pasted from Web-sources
@ -254,10 +285,13 @@ mean(which(x == "K"))  # ... gives us the average of the permuted sequence.
 (s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
 N <- 10000
 d <- numeric(N)
-set.seed(112358)
+
+set.seed(112358)                       # set RNG seed for repeatable randomness
 for (i in 1:N) {
  d[i] <- mean(which(sample(s, length(s)) == "K"))
 }
+set.seed(NULL)                         # reset the RNG
+
 hist(d, breaks = 20)
 abline(v = 2.5, lwd = 2, col = "firebrick")
 sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the
@ -269,15 +303,17 @@ sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the

 # ==   7.2  Sampling  ==========================================================

-# ===  7.2.1  Equiprobable characters    
+# ===   7.2.1  Equiprobable characters    

 # Assume you need a large random-nucleotide string for some statistical model.
 # How to create such a string? sample() can easily create it:

 nuc <- c("A", "C", "G", "T")
 N <- 100
-set.seed(16818)
+
+set.seed(16818)                        # set RNG seed for repeatable randomness
 v <- sample(nuc, N, replace = TRUE)
+set.seed(NULL)                         # reset the RNG
 (mySeq <- paste(v, collapse = ""))

 # What's the GC content?
@ -297,7 +333,7 @@ if (! require(stringi, quietly=TRUE)) {
 #  data(package = "stringi")     # available datasets


-(x <- stri_match_all(mySeq, regex = "CG"))
+(x <- stringi::stri_match_all(mySeq, regex = "CG"))
 length(unlist(x))

 # Now you could compare that number with yeast DNA sequences, and determine
@ -309,7 +345,7 @@ length(unlist(x))
 # of the smaller number of Cs and Gs - before biology even comes into play. How
 # do we account for that?

-# ===  7.2.2  Defined probability vector 
+# ===   7.2.2  Defined probability vector 

 # This is where we need to know how to create samples with specific probability
 # distributions. A crude hack would be to create a sampling source vector with
@ -323,9 +359,12 @@ c(rep("C", 19), rep("G", 19), rep(c("A"), 31), rep(c("T"), 31))

 nuc <- c("A", "C", "G", "T")
 N <- 100
-set.seed(16818)
-myProb <- c(0.31, 0.19, 0.19, 0.31)  # sampling probabilities
+myProb <- c(0.31, 0.19, 0.19, 0.31)    # sampling probabilities
+
+set.seed(16818)                        # set RNG seed for repeatable randomness
 v <- sample(nuc, N, prob = myProb, replace = TRUE)
+set.seed(NULL)                         # reset the RNG
+
 (mySeq <- paste(v, collapse = ""))

 # What's the GC content?
@ -333,7 +372,7 @@ table(v)
 sum(table(v)[c("G", "C")]) # Close to expected

 # What's the number of CpG motifs?
-(x <- stri_match_all(mySeq, regex = "CG"))
+(x <- stringi::stri_match_all(mySeq, regex = "CG"))
 # ... not a single one in this case.


--- a/FND-MAT-Graphs_and_networks.R
+++ b/FND-MAT-Graphs_and_networks.R
@ -3,12 +3,13 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the FND-MAT-Graphs_and_networks unit.
 #
-# Version:  1.0
+# Version:  1.1
 #
-# Date:     2017  10  06
+# Date:     2017  10  -  2019  01
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
+#           1.1    Update set.seed() usage
 #           1.0    First final version for learning units.
 #           0.1    First code copied from 2016 material.
 #
@ -27,19 +28,19 @@

 #TOC> ==========================================================================
 #TOC> 
-#TOC>   Section  Title                                  Line
-#TOC> ------------------------------------------------------
-#TOC>   1        Review                                   52
-#TOC>   2        DEGREE DISTRIBUTIONS                    201
-#TOC>   2.1      Random graph                            207
-#TOC>   2.2      scale-free graph (Barabasi-Albert)      251
-#TOC>   2.3      Random geometric graph                  313
-#TOC>   3        A CLOSER LOOK AT THE igraph PACKAGE     433
-#TOC>   3.1      Basics                                  436
-#TOC>   3.2      Components                              508
-#TOC>   4        RANDOM GRAPHS AND GRAPH METRICS         527
-#TOC>   4.1      Diameter                                562
-#TOC>   5        GRAPH CLUSTERING                        630
+#TOC>   Section  Title                                        Line
+#TOC> ------------------------------------------------------------
+#TOC>   1        Review                                         48
+#TOC>   2        DEGREE DISTRIBUTIONS                          201
+#TOC>   2.1        Random graph                                207
+#TOC>   2.2        scale-free graph (Barabasi-Albert)          255
+#TOC>   2.3        Random geometric graph                      320
+#TOC>   3        A CLOSER LOOK AT THE igraph PACKAGE           442
+#TOC>   3.1        Basics                                      445
+#TOC>   3.2        Components                                  517
+#TOC>   4        RANDOM GRAPHS AND GRAPH METRICS               536
+#TOC>   4.1        Diameter                                    573
+#TOC>   5        GRAPH CLUSTERING                              641
 #TOC> 
 #TOC> ==========================================================================

@ -57,7 +58,7 @@

 # To begin let's write a little function that will create random "gene" names;
 # there's no particular purpose to this other than to make our graphs look a
-# little more "biological ...
+# little more "biological" ...
 makeRandomGenenames <- function(N) {
  nam <- character()
  while (length(nam) < N) {
@ -72,8 +73,9 @@ makeRandomGenenames <- function(N) {

 N <- 20

-set.seed(112358)
+set.seed(112358)                       # set RNG seed for repeatable randomness
 (Nnames <- makeRandomGenenames(N))
+set.seed(NULL)                         # reset the RNG

 # One way to represent graphs in a computer is as an "adjacency matrix". In this
 # matrix, each row and each column represents a node, and the cell at the
@ -112,8 +114,9 @@ makeRandomAM <- function(nam, p = 0.1) {
  return(AM)
 }

-set.seed(112358)
+set.seed(112358)                       # set RNG seed for repeatable randomness
 (myRandAM <- makeRandomAM(Nnames, p = 0.09))
+set.seed(NULL)                         # reset the RNG


 # Listing the matrix is not very informative - we should plot this graph. The
@ -131,8 +134,10 @@ if (! require(igraph, quietly=TRUE)) {


 myG <- graph_from_adjacency_matrix(myRandAM, mode = "undirected")
-set.seed(112358)
-myGxy <- layout_with_graphopt(myG, charge=0.0012)   # calculate layout coordinates
+
+set.seed(112358)                       # set RNG seed for repeatable randomness
+myGxy <- layout_with_graphopt(myG, charge=0.0012) # calculate layout coordinates
+set.seed(NULL)                         # reset the RNG


 # The igraph package adds its own function to the collection of plot()
@ -201,13 +206,17 @@ axis(side = 1, at = 0:7)

 # ==   2.1  Random graph  ======================================================

+N <- 200
+
+set.seed(31415927)                     # set RNG seed for repeatable randomness
+my200AM <- makeRandomAM(as.character(1:N), p = 0.015)
+set.seed(NULL)                         # reset the RNG

-set.seed(31415927)
-my200AM <- makeRandomAM(as.character(1:200), p = 0.015)
 myG200 <- graph_from_adjacency_matrix(my200AM, mode = "undirected")
-myGxy <- layout_with_graphopt(myG200, charge=0.0001) # calculate layout coordinates
+myGxy <- layout_with_graphopt(myG200, charge=0.0001) # calculate layout
+                                                     # coordinates

-oPar <- par(mar= rep(0,4)) # Turn margins off
+oPar <- par(mar= rep(0,4))             # Turn margins off, save graphics state
 plot(myG200,
     layout = myGxy,
     rescale = FALSE,
@ -216,7 +225,7 @@ plot(myG200,
     vertex.color=heat.colors(max(degree(myG200)+1))[degree(myG200)+1],
     vertex.size = 150 + (60 * degree(myG200)),
     vertex.label = NA)
-par(oPar)
+par(oPar)                              # restore graphics state

 # This graph has thirteen singletons and one large, connected component. Many
 # biological graphs look approximately like this.
@ -251,12 +260,15 @@ plot(log10(as.numeric(names(freqRank)) + 1),
 # stands for "preferential attachment". Preferential attachment is one type of
 # process that will yield scale-free distributions.

-set.seed(31415927)
-GBA <- sample_pa(200, power = 0.8, directed = FALSE)
+N <- 200
+
+set.seed(31415927)                     # set RNG seed for repeatable randomness
+GBA <- sample_pa(N, power = 0.8, directed = FALSE)
+set.seed(NULL)                         # reset the RNG

 GBAxy <- layout_with_graphopt(GBA, charge=0.0001) # calculate layout coordinates

-oPar <- par(mar= rep(0,4)) # Turn margins off
+oPar <- par(mar= rep(0,4))             # Turn margins off, save graphics state
 plot(GBA,
     layout = GBAxy,
     rescale = FALSE,
@ -265,7 +277,7 @@ plot(GBA,
     vertex.color=heat.colors(max(degree(GBA)+1))[degree(GBA)+1],
     vertex.size = 200 + (30 * degree(GBA)),
     vertex.label = NA)
-par(oPar)
+par(oPar)                              # restore graphics state

 # This is a very obviously different graph! Some biological networks have
 # features that look like that - but in my experience the hub nodes are usually
@ -386,8 +398,10 @@ makeRandomGeometricAM <- function(nam, B = 25, Q = 0.001, t = 0.6) {
 #      xlab = "d", ylab = "p(edge)")

 # 200 node random geomteric graph
-set.seed(112358)
-rGAM <- makeRandomGeometricAM(as.character(1:200), t=0.4)
+N <- 200
+set.seed(112358)                       # set RNG seed for repeatable randomness
+rGAM <- makeRandomGeometricAM(as.character(1:N), t = 0.4)
+set.seed(NULL)                         # reset the RNG


 myGRG <- graph_from_adjacency_matrix(rGAM$mat, mode = "undirected")
@ -539,20 +553,22 @@ names(c1)
 # considered to be more central. And that's also the way the force-directed
 # layout drawas them, obviously.

-set.seed(112358)
-myGxy <- layout_with_fr(myG)   # calculate layout coordinates
-oPar <- par(mar= rep(0,4)) # Turn margins off
+set.seed(112358)                       # set RNG seed for repeatable randomness
+myGxy <- layout_with_fr(myG)           # calculate layout coordinates
+set.seed(NULL)                         # reset the RNG
+
+oPar <- par(mar = rep(0, 4))           # turn margins off, save graphics state
 plot(myG,
     layout = myGxy,
     rescale = FALSE,
     xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01),
     ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01),
-     vertex.color=heat.colors(max(degree(myG)+1))[degree(myG)+1],
+     vertex.color=heat.colors(max(degree(myG) + 1))[degree(myG) + 1],
     vertex.size = 20 + (10 * degree(myG)),
     vertex.label = V(myG)$name,
     vertex.label.family = "sans",
     vertex.label.cex = 0.8)
-par(oPar)
+par(oPar)                              # restore graphics state

 # ==   4.1  Diameter  ==========================================================

--- a/FND-STA-Probability_distribution.R
+++ b/FND-STA-Probability_distribution.R
@ -3,12 +3,13 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the FND-STA-Probability_distribution unit.
 #
-# Version:  1.1
+# Version:  1.2
 #
-# Date:     2017  10
+# Date:     2017  10  -  2019  01
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
+#           1.2    Update set.seed() usage
 #           1.1    Corrected empirical p-value
 #           1.0    First code live version
 #
@ -26,22 +27,22 @@

 #TOC> ==========================================================================
 #TOC>
-#TOC>   Section  Title                                                   Line
-#TOC> -----------------------------------------------------------------------
-#TOC>   1        Introduction                                              49
-#TOC>   2        Three fundamental distributions                          112
-#TOC>   2.1      The Poisson Distribution                                 115
-#TOC>   2.2      The uniform distribution                                 168
-#TOC>   2.3      The Normal Distribution                                  188
-#TOC>   3        quantile-quantile comparison                             229
-#TOC>   3.1      qqnorm()                                                 239
-#TOC>   3.2      qqplot()                                                 299
-#TOC>   4        Quantifying the difference                               316
-#TOC>   4.1      Chi2 test for discrete distributions                     350
-#TOC>   4.2      Kullback-Leibler divergence                              441
-#TOC>   4.2.1    An example from tossing dice                             452
-#TOC>   4.2.2    An example from lognormal distributions                  574
-#TOC>   4.3      Kolmogorov-Smirnov test for continuous distributions     616
+#TOC>   Section  Title                                                     Line
+#TOC> -------------------------------------------------------------------------
+#TOC>   1        Introduction                                                50
+#TOC>   2        Three fundamental distributions                            113
+#TOC>   2.1        The Poisson Distribution                                 116
+#TOC>   2.2        The uniform distribution                                 170
+#TOC>   2.3        The Normal Distribution                                  190
+#TOC>   3        quantile-quantile comparison                               231
+#TOC>   3.1        qqnorm()                                                 241
+#TOC>   3.2        qqplot()                                                 307
+#TOC>   4        Quantifying the difference                                 324
+#TOC>   4.1        Chi2 test for discrete distributions                     359
+#TOC>   4.2        Kullback-Leibler divergence                              451
+#TOC>   4.2.1          An example from tossing dice                         462
+#TOC>   4.2.2          An example from lognormal distributions              585
+#TOC>   4.3        Kolmogorov-Smirnov test for continuous distributions     628
 #TOC>
 #TOC> ==========================================================================

@ -151,6 +152,7 @@ set.seed(112358)
 for (i in 1:N) {
  x[i] <- sum(sample(genes, 250)) # sum of TFs in our sample in this trial
 }
+set.seed(NULL)

 (t <- table(x)/N)

@ -241,8 +243,10 @@ hist(v, breaks = 20, col = "#F8DDFF")
 # The functions qqnorm() and qqline() perform this
 # comparison with the normal distribution.

-set.seed(1112358)
-x <- rnorm(100, mean=0, sd=1) # 100 normally distributed balues
+set.seed(112358)
+x <- rnorm(100, mean=0, sd=1) # 100 normally distributed values
+set.seed(NULL)
+
 qqnorm(x)
 qqline(x, col = "seagreen")

@ -253,12 +257,15 @@ qqline(x, col = "seagreen")

 # Create a vector of sample means from the exponential distribution; use
 # only a few samples for the mean
-set.seed(112358)
 x <- rexp(12345)
 v <- numeric(999)
+
+set.seed(112358)
 for (i in 1:length(v)) {
  v[i] <- mean(sample(x, 12))
 }
+set.seed(NULL)
+
 qqnorm(v)
 qqline(v, col = "turquoise") # normal

@ -288,13 +295,14 @@ rEVD <- numeric(9999)
 for (i in seq_along(rEVD)) {
  rEVD[i] <- max(rnorm(100))
 }
+set.seed(NULL)
+
 hist(rEVD, breaks = 20, col = "orchid")
 # Note the long tail on the right!

 qqnorm(rEVD)
-qqline(rEVD, col = "orchid") # normal
+qqline(rEVD, col = "orchid") # Definitely not "normal"!

-# Definitely not "normal"!

 # ==   3.2  qqplot()  ==========================================================

@ -331,6 +339,7 @@ dl2 <- dlnorm(x - 0.25) # log-normal distribution, shifted right (a bit)
 dg1.2 <- dgamma(x, shape=1.2)   # three gamma distributions with...
 dg1.5 <- dgamma(x, shape=1.5)   # ...wider, and wider...
 dg1.9 <- dgamma(x, shape=1.9)   # ...peak
+set.seed(NULL)

 myCols <- c("black", "grey", "maroon", "turquoise", "steelblue")

@ -361,6 +370,7 @@ rL2   <- rlnorm(N, meanlog = 0.25) # log-normal distribution, shifted right
 rG1.2 <- rgamma(N, shape=1.2)   # three gamma distributions with...
 rG1.5 <- rgamma(N, shape=1.5)   # ...wider, and wider...
 rG1.9 <- rgamma(N, shape=1.9)   # ...peak
+set.seed(NULL)

 maxX <- max(c(rL1, rL2, rG1.2, rG1.5, rG1.9))

@ -449,7 +459,7 @@ chisq.test(countsL1, countsG1.9, simulate.p.value = TRUE, B = 10000)
 # be applied to discrete distributions. But we need to talk a bit about
 # converting counts to p.m.f.'s.

-# ===  4.2.1  An example from tossing dice
+# ===   4.2.1  An example from tossing dice

 #  The p.m.f of an honest die is (1:1/6, 2:1/6, 3:1/6, 4:1/6, 5:1/6, 6:1/6). But
 #  there is an issue when we convert sampled counts to frequencies, and estimate
@ -459,6 +469,7 @@ chisq.test(countsL1, countsG1.9, simulate.p.value = TRUE, B = 10000)
 set.seed(47)
 N <- 20
 (counts <- table(sample(1:6, N, replace = TRUE)))
+set.seed(NULL)

 # We have not observed a "2"!
 #
@ -571,7 +582,7 @@ abline(v = KLdiv(rep(1/6, 6), pmfPC(counts, 1:6)), col="firebrick")
 # somewhat but not drastically atypical.


-# ===  4.2.2  An example from lognormal distributions
+# ===   4.2.2  An example from lognormal distributions

 # We had compared a set of lognormal and gamma distributions above, now we
 # can use KL-divergence to quantify their similarity:
@ -597,6 +608,7 @@ for (i in 1:N) {
  q <- pmfPC(y, nam = 1:10)  # convert to p.m.f. with pseudocounts
  divs[i] <- KLdiv(pmfL1, q)     # calculate Kullback-Leibler divergence
 }
+set.seed(NULL)

 hist(divs,
     col = "thistle",
@ -605,7 +617,7 @@ hist(divs,
 abline(v = KLdiv(pmfL1, pmfL2), col="firebrick")

 # How many KL-divergences were less than the difference we observed?
-sum(divs < KLdiv(pmfL1, pmfL2)) #933
+sum(divs < KLdiv(pmfL1, pmfL2)) # 933

 # Therefore the empirical p-value that the samples came from the same
 # distribution is only 100 * ((N - 933) + 1) / (N + 1) (%) ... 6.8%. You see
--- a/FND-STA-Significance.R
+++ b/FND-STA-Significance.R
@ -3,12 +3,13 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the FND-STA-Significance unit.
 #
-# Version:  1.1
+# Version:  1.2
 #
-# Date:     2017  09  - 2017  10
+# Date:     2017  09  - 2019  01
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
+#           1.2    Update set.seed() usage
 #           1.1    Corrected treatment of empirical p-value
 #           1.0    First contents
 #
@ -25,16 +26,16 @@

 #TOC> ==========================================================================
 #TOC> 
-#TOC>   Section  Title                                        Line
-#TOC> ------------------------------------------------------------
-#TOC>   1        Significance and p-value                       42
-#TOC>   1.1      Significance levels                            53
-#TOC>   1.2      probability and p-value                        70
-#TOC>   1.2.1    p-value illustrated                           100
-#TOC>   2        One- or two-sided                             153
-#TOC>   3        Significance by integration                   193
-#TOC>   4        Significance by simulation or permutation     199
-#TOC>   5        Final tasks                                   302
+#TOC>   Section  Title                                              Line
+#TOC> ------------------------------------------------------------------
+#TOC>   1        Significance and p-value                             43
+#TOC>   1.1        Significance levels                                54
+#TOC>   1.2        probability and p-value                            71
+#TOC>   1.2.1          p-value illustrated                           103
+#TOC>   2        One- or two-sided                                   158
+#TOC>   3        Significance by integration                         198
+#TOC>   4        Significance by simulation or permutation           204
+#TOC>   5        Final tasks                                         312
 #TOC> 
 #TOC> ==========================================================================

@ -75,6 +76,8 @@

 set.seed(sqrt(5))
 x <- rnorm(1)
+set.seed(NULL)
+
 print(x, digits = 22)
 # [1] -0.8969145466249813791748

@ -97,13 +100,15 @@ print(x, digits = 22)
 # curve, as a fraction of the whole.


-# ===  1.2.1  p-value illustrated                      
+# ===   1.2.1  p-value illustrated                      

 # Let's illustrate. First we draw a million random values from our
 # standard, normal distribution:

-set.seed(112358)
-r <- rnorm(1000000)
+N <- 1e6                             # one million
+set.seed(112358)                     # set RNG seed for repeatable randomness
+r <- rnorm(N)                        # N values from a normal distribution
+set.seed(NULL)                       # reset the RNG

 # Let's see what the distribution looks like:

@ -277,9 +282,14 @@ chSep <- function(v) {
 chSep(v)

 # Now we can produce a random permutation of v, and recalculate
-set.seed(pi)
-w <- sample(v, length(v)) # This shuffles the vector v. Memorize this
-                          # code paradigm. It is very useful.
+
+set.seed(pi)                       # set RNG seed for repeatable randomness
+w <- sample(v, length(v))          # This shuffles the vector v. Memorize this
+                                   # code paradigm. It is very useful.
+set.seed(NULL)                     # reset the RNG
+
+
+
 chSep(w)
 # 3.273 ... that's actually less than what we had before.

--- a/RPR-GEO2R.R
+++ b/RPR-GEO2R.R
@ -31,7 +31,7 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                                                Line
 #TOC> --------------------------------------------------------------------
 #TOC>   1        Preparations                                           53
@ -46,7 +46,7 @@
 #TOC>   5.1      Final task: Gene descriptions                         490
 #TOC>   6        Improving on Discovery by Differential Expression     495
 #TOC>   7        Annotation data                                       577
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


@ -489,7 +489,8 @@ for (name in toupper(myControls)) {

 # ==   5.1  Final task: Gene descriptions  =====================================

-#    Print the descriptions of the top ten differentially expressed genes.
+#    Print the descriptions of the top ten differentially expressed genes
+#    and comment on what they have in common (or not).


 # =    6  Improving on Discovery by Differential Expression  ===================
@ -617,9 +618,9 @@ GPL1914 <- getGEO("GPL1914")
 str(GPL1914)

 # ... from which we can get the data - which is however NOT necessarily
-# matched to the rows of our expression dataset. Note that here to: the majority
-# of data elements are factors and will likely have to be converted before
-# use.
+# matched to the rows of our expression dataset. Note that here too, the
+# majority of data elements are factors and will likely have to be converted
+# before use.


 # [END]
--- a/RPR-Genetic_code_optimality.R
+++ b/RPR-Genetic_code_optimality.R
@ -3,12 +3,13 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the RPR-Genetic_code_optimality unit.
 #
-# Version:  1.0.1
+# Version:  1.1
 #
-# Date:     2017  10  16
+# Date:     2017  10  -  2019  01
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
+#           1.1      Update set.seed() usage
 #           1.0.1    Fixed two bugs discovered by Suan Chin Yeo.
 #           1.0      New material.
 #
@ -26,20 +27,20 @@


 #TOC> ==========================================================================
-#TOC>
-#TOC>   Section  Title                                    Line
-#TOC> --------------------------------------------------------
-#TOC>   1        Designing a computational experiment       57
-#TOC>   2        Setting up the tools                       73
-#TOC>   2.1      Natural and alternative genetic codes      76
-#TOC>   2.2      Effect of mutations                       135
-#TOC>   2.2.1    reverse-translate                         146
-#TOC>   2.2.2    Randomly mutate                           171
-#TOC>   2.2.3    Forward- translate                        196
-#TOC>   2.2.4    measure effect                            214
-#TOC>   3        Run the experiment                        261
-#TOC>   4        Task solutions                            348
-#TOC>
+#TOC> 
+#TOC>   Section  Title                                          Line
+#TOC> --------------------------------------------------------------
+#TOC>   1        Designing a computational experiment             54
+#TOC>   2        Setting up the tools                             70
+#TOC>   2.1        Natural and alternative genetic codes          73
+#TOC>   2.2        Effect of mutations                           132
+#TOC>   2.2.1          reverse-translate                         143
+#TOC>   2.2.2          Randomly mutate                           168
+#TOC>   2.2.3          Forward- translate                        193
+#TOC>   2.2.4          measure effect                            211
+#TOC>   3        Run the experiment                              258
+#TOC>   4        Task solutions                                  351
+#TOC> 
 #TOC> ==========================================================================


@ -139,7 +140,7 @@ swappedGC <- function(GC) {
 #   - we count the number of mutations and evaluate their severity.


-# ===  2.2.1  reverse-translate
+# ===   2.2.1  reverse-translate                    

 # To reverse-translate an amino acid vector, we randomly pick one of its
 # codons from a genetic code, and assemble all codons to a sequence.
@ -164,7 +165,7 @@ traRev <- function(s, GC) {
 }


-# ===  2.2.2  Randomly mutate
+# ===   2.2.2  Randomly mutate                      

 # To mutate, we split a codon into its three nucleotides, then randomly replace
 # one of the three with another nucleotide.
@ -189,7 +190,7 @@ randMut <- function(vC) {



-# ===  2.2.3  Forward- translate
+# ===   2.2.3  Forward- translate                   

 traFor <- function(vC, GC) {
  # Parameters:
@ -207,7 +208,7 @@ traFor <- function(vC, GC) {
  }


-# ===  2.2.4  measure effect
+# ===   2.2.4  measure effect                       

 # How do we evaluate the effect of the mutation? We'll take a simple ad hoc
 # approach: we divide amino acids into hydrophobic, hydrophilic, and neutral
@ -269,18 +270,21 @@ myAA <- traFor(myDNA, GENETIC_CODE)
 # Mutate and evaluate
 set.seed(112358)
 x <- randMut(myDNA)
+set.seed(NULL)
 x <- traFor(x, GENETIC_CODE)
 evalMut(myAA, x)  # 166.4

 # Try this 200 times, and see how the values are distributed.
-set.seed(112358)
 N <- 200
 valUGC <- numeric(N)
+
+set.seed(112358)                   # set RNG seed for repeatable randomness
 for (i in 1:N) {
-  x <- randMut(myDNA)            # mutate
-  x <- traFor(x, GENETIC_CODE)   # translate
-  valUGC[i] <- evalMut(myAA, x)     # evaluate
+  x <- randMut(myDNA)              # mutate
+  x <- traFor(x, GENETIC_CODE)     # translate
+  valUGC[i] <- evalMut(myAA, x)    # evaluate
 }
+set.seed(NULL)                     # reset the RNG

 hist(valUGC,
     breaks = 15,
@ -299,6 +303,7 @@ effectUGC <- mean(valUGC)  # 178.1
 set.seed(112358)
 # choose a new code
 GC <- randomGC(GENETIC_CODE)
+set.seed(NULL)

 # reverse translate hypothetical sequence according to the new code
 x <- traRev(myAA, GC)
@ -311,9 +316,10 @@ evalMut(myAA, x)       # evaluate mutation effects: 298.5
 # Let's try with different genetic codes. 200 trials - but this time every trial
 # is with a different, synthetic genetic code.

-set.seed(1414214)
 N <- 200
 valXGC <- numeric(N)
+
+set.seed(1414214)                # set RNG seed for repeatable randomness
 for (i in 1:N) {
  GC <- randomGC(GENETIC_CODE)   # Choose code
  x <- traRev(myAA, GC)          # reverse translate
@ -321,6 +327,7 @@ for (i in 1:N) {
  x <- traFor(x, GC)             # translate
  valXGC[i] <- evalMut(myAA, x)  # evaluate
 }
+set.seed(NULL)                   # reset the RNG

 hist(valXGC,
     col = "plum",
@ -343,9 +350,10 @@ hist(valXGC,

 # =    4  Task solutions  ======================================================

-set.seed(2718282)
 N <- 200
 valSGC <- numeric(N)
+
+set.seed(2718282)                # set RNG seed for repeatable randomness
 for (i in 1:N) {
  GC <- swappedGC(GENETIC_CODE)  # Choose code
  x <- traRev(myAA, GC)          # reverse translate
@ -353,6 +361,7 @@ for (i in 1:N) {
  x <- traFor(x, GC)             # translate
  valSGC[i] <- evalMut(myAA, x)  # evaluate
 }
+set.seed(NULL)                   # reset the RNG

 hist(valSGC,
     col = "#6688FF88",