Maintenance updates, and revised set.seed() usage

This commit is contained in:
hyginn 2019-01-07 16:17:23 +10:00
parent 2ab162e375
commit 6f54293592
9 changed files with 275 additions and 180 deletions

View File

@ -25,10 +25,11 @@
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> --------------------------------------- #TOC> -----------------------------------------------
#TOC> 1 Preparations 39 #TOC> 1 Preparations 39
#TOC> 2 Suitable MYSPE Species 51 #TOC> 2 Suitable MYSPE Species 51
#TOC> 3 Adopt "MYSPE" 65 #TOC> 3 Adopt "MYSPE" 65
@ -71,6 +72,7 @@ if (! exists("myStudentNumber")) {
load("data/MYSPEspecies.RData") # load the species names load("data/MYSPEspecies.RData") # load the species names
set.seed(myStudentNumber) # seed the random number generator set.seed(myStudentNumber) # seed the random number generator
MYSPE <- sample(MYSPEspecies, 1) # pick a species at random MYSPE <- sample(MYSPEspecies, 1) # pick a species at random
set.seed(NULL) # reset the random number generator
# write the result to your personalized profile data so we can use the result in # write the result to your personalized profile data so we can use the result in
# other functions # other functions
cat(sprintf("MYSPE <- \"%s\"\n", MYSPE), file = ".myProfile.R", append = TRUE) cat(sprintf("MYSPE <- \"%s\"\n", MYSPE), file = ".myProfile.R", append = TRUE)
@ -80,7 +82,7 @@ biCode(MYSPE) # and what is its "BiCode" ... ?
# Task: Note down the species name and its five letter label on your Student # Task: Note down the species name and its five letter label on your Student
# Wiki user page. Use this species whenever this or future assignments refer # Wiki user page. Use this species whenever this or future assignments refer
# to MYSPE. In code, we will automatically load it from your.myProfile.R file. # to MYSPE. In code, we will automatically load it from your .myProfile.R file.
# [END] # [END]

View File

@ -30,7 +30,7 @@
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> -------------------------------------------- #TOC> --------------------------------------------------
#TOC> 1 Preparation and Tree Plot 43 #TOC> 1 Preparation and Tree Plot 43
#TOC> 2 Tree Analysis 82 #TOC> 2 Tree Analysis 82
#TOC> 2.1 Rooting Trees 141 #TOC> 2.1 Rooting Trees 141
@ -269,13 +269,14 @@ rtree(n = length(apsTree2$tip.label), # number of tips
# compare them anyway. # compare them anyway.
# Let's compute some random trees this way, calculate the distances to # Let's compute some random trees this way, calculate the distances to
# fungiTree, and then compare the values we get for apsTree2: # fungiTree, and then compare the values we get for apsTree2. The random
# trees are provided by ape::rtree().
set.seed(112358)
N <- 10000 # takes about 15 seconds N <- 10000 # takes about 15 seconds
myTreeDistances <- matrix(numeric(N * 2), ncol = 2) myTreeDistances <- matrix(numeric(N * 2), ncol = 2)
colnames(myTreeDistances) <- c("symm", "path") colnames(myTreeDistances) <- c("symm", "path")
set.seed(112358)
for (i in 1:N) { for (i in 1:N) {
xTree <- rtree(n = length(apsTree2$tip.label), xTree <- rtree(n = length(apsTree2$tip.label),
rooted = TRUE, rooted = TRUE,
@ -283,6 +284,7 @@ for (i in 1:N) {
br = NULL) br = NULL)
myTreeDistances[i, ] <- treedist(fungiTree, xTree) myTreeDistances[i, ] <- treedist(fungiTree, xTree)
} }
set.seed(NULL) # reset the random number generator
table(myTreeDistances[, "symm"]) table(myTreeDistances[, "symm"])

View File

@ -28,14 +28,14 @@
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> --------------------------------------------------------- #TOC> ---------------------------------------------------------------
#TOC> 1 Setup and data 43 #TOC> 1 Setup and data 43
#TOC> 2 Functional Edges in the Human Proteome 80 #TOC> 2 Functional Edges in the Human Proteome 80
#TOC> 2.1 Cliques 123 #TOC> 2.1 Cliques 123
#TOC> 2.2 Communities 164 #TOC> 2.2 Communities 164
#TOC> 2.3 Betweenness Centrality 176 #TOC> 2.3 Betweenness Centrality 178
#TOC> 3 biomaRt 220 #TOC> 3 biomaRt 224
#TOC> 4 Task for submission 291 #TOC> 4 Task for submission 295
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
@ -163,8 +163,10 @@ par(oPar)
# == 2.2 Communities ======================================================= # == 2.2 Communities =======================================================
set.seed(112358) set.seed(112358) # set RNG seed for repeatable randomness
gSTRclusters <- cluster_infomap(gSTR) gSTRclusters <- cluster_infomap(gSTR)
set.seed(NULL) # reset the RNG
modularity(gSTRclusters) # ... measures how separated the different membership modularity(gSTRclusters) # ... measures how separated the different membership
# types are from each other # types are from each other
tMem <- table(membership(gSTRclusters)) tMem <- table(membership(gSTRclusters))
@ -205,9 +207,11 @@ head(sBC)
# We are going to use these IDs to produce some output for a submitted task: # We are going to use these IDs to produce some output for a submitted task:
# so I need you to personalize ENSPsel with the following # so I need you to personalize ENSPsel with the following
# two lines of code: # three lines of code:
set.seed(<myStudentNumber>) # enter your student number here set.seed(<myStudentNumber>) # enter your student number here
(ENSPsel <- sample(ENSPsel)) (ENSPsel <- sample(ENSPsel))
set.seed(NULL) # reset the random number generator
# Next, to find what these proteins are... # Next, to find what these proteins are...

View File

@ -3,12 +3,13 @@
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the BIN-Sequence unit. # R code accompanying the BIN-Sequence unit.
# #
# Version: 1.2 # Version: 1.3
# #
# Date: 2017 09 - 2017 10 # Date: 2017 09 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.3 Update set.seed() usage
# 1.2 Removed irrelevant task. How did that even get in there? smh # 1.2 Removed irrelevant task. How did that even get in there? smh
# 1.1 Add chartr() # 1.1 Add chartr()
# 1.0 First live version 2017. # 1.0 First live version 2017.
@ -28,20 +29,24 @@
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> ---------------------------------------------- #TOC> ----------------------------------------------------
#TOC> 1 Prepare 55 #TOC> 1 Prepare 60
#TOC> 2 Storing Sequence 73 #TOC> 2 Storing Sequence 78
#TOC> 3 String properties 102 #TOC> 3 String properties 107
#TOC> 4 Substrings 109 #TOC> 4 Substrings 114
#TOC> 5 Creating strings: sprintf() 115 #TOC> 5 Creating strings: sprintf() 135
#TOC> 6 Changing strings 146 #TOC> 6 Changing strings 170
#TOC> 6.1 stringi and stringr 198 #TOC> 6.1.1 Changing case 172
#TOC> 6.2 dbSanitizeSequence() 208 #TOC> 6.1.2 Reverse 177
#TOC> 7 Permuting and sampling 220 #TOC> 6.1.3 Change characters 181
#TOC> 7.1 Permutations 227 #TOC> 6.1.4 Substitute characters 209
#TOC> 7.2 Sampling 270 #TOC> 6.2 stringi and stringr 229
#TOC> 7.2.1 Equiprobable characters 272 #TOC> 6.3 dbSanitizeSequence() 239
#TOC> 7.2.2 Defined probability vector 312 #TOC> 7 Permuting and sampling 251
#TOC> 7.1 Permutations 258
#TOC> 7.2 Sampling 304
#TOC> 7.2.1 Equiprobable characters 306
#TOC> 7.2.2 Defined probability vector 348
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
@ -111,16 +116,31 @@ nchar(s) # aha
# Use the substr() function # Use the substr() function
substr(s, 2, 4) substr(s, 2, 4)
# or the similar substring()
substring(s, 2, 4)
# Note: both functions are vectorized (i.e. they operate on vectors
# of arguments, you don't need to loop over input)...
myBiCodes <- c("HOMSA", "MUSMU", "FUGRU", "XENLA")
substr( myBiCodes, 1, 3)
substring(myBiCodes, 1, 3)
# ... however only substring() will also use vectors for start and stop
s <- "gatattgtgatgacccagtaa" # a DNA sequence
(i <- seq(1, nchar(s), by = 3)) # an index vector
substr( s, i, i+2) # ... returns only the first nucleotide triplet
substring(s, i, i+2) # ... returns all triplets
# = 5 Creating strings: sprintf() ========================================= # = 5 Creating strings: sprintf() =========================================
# Sprintf is a very smart, very powerful function and has cognates in all # Sprintf is a very smart, very powerful function and has cognates in all
# other programming languages. It has a small learning curve, but it's # other programming languages. It has a bit of a learning curve, but this is
# totally worth it: # totally worth it:
# the function takes a format string, and a list of other arguments. It returns # the function takes a format string, and a list of other arguments. It returns
# a formatted string. Here are some examples - watch carefully for sprintf() # a formatted string. Here are some examples - watch carefully for sprintf()
# calls in other code. # calls elsewhere in the code.
sprintf("Just a string.") sprintf("Just a string.")
sprintf("A string and the number %d.", 5) sprintf("A string and the number %d.", 5)
@ -128,32 +148,37 @@ sprintf("More numbers: %d ate %d.", 7, 9) # Sorry
sprintf("Pi is ~ %1.2f ...", pi) sprintf("Pi is ~ %1.2f ...", pi)
sprintf("or more accurately ~ %1.11f.", pi) sprintf("or more accurately ~ %1.11f.", pi)
x <- "bottles of beer" x <- "bottles of beer"
n <- 99 N <- 99
sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.", sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.",
n, x, n, x, "one down, and pass it around", n-1, x) N, x, N, x, "one down, and pass it around", N - 1, x)
# Note that in the last example, the value of the string was displayed with # Note that in the last example, the value of the string was displayed with
# R's usual print-formatting function and therefore the line-break "\n" did # R's usual print-formatting function and therefore the line-break "\n" did
# not actually break the line. To have line breaks, tabs etc, you need to use # not actually break the line. To have line breaks, tabs etc, you need to use
# cat() to display the string: # cat() to display the string:
for (i in 99:95) { for (i in N:(N-4)) {
cat(sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.\n\n", cat(sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.\n\n",
i, x, i, x, "one down, and pass it around", i-1, x)) i, x, i, x, "one down, and pass it around", i - 1, x))
} }
# sprintf() is vectorized: if one of its parameters is a vector, it
# will generate one output string for each of the vector's elements:
cat(sprintf("\n%s fish", c("one", "two", "red", "blue")))
# = 6 Changing strings ==================================================== # = 6 Changing strings ====================================================
# Changing case # === 6.1.1 Changing case
tolower(s) tolower(s)
toupper(tolower(s)) toupper(tolower(s))
#reverse # === 6.1.2 Reverse
reverse(s) reverse(s)
# === 6.1.3 Change characters
# chartr(old, new, x) maps all characters in x that appear in "old" to the # chartr(old, new, x) maps all characters in x that appear in "old" to the
# corresponding character in "new." # corresponding character in "new."
@ -167,15 +192,21 @@ chartr(paste0(letters, collapse = ""),
# One amusing way to use the function is for a reversible substitution # One amusing way to use the function is for a reversible substitution
# cypher. # cypher.
set.seed(112358) set.seed(112358) # set RNG seed for repeatable randomness
myCypher <- paste0(sample(letters), collapse = "") (myCypher <- paste0(sample(letters), collapse = ""))
lett <- paste0(letters, collapse = "") set.seed(NULL) # reset the RNG
(lett <- paste0(letters, collapse = ""))
# encode ...
(x <- chartr(lett, myCypher, "... seven for a secret, never to be told.")) (x <- chartr(lett, myCypher, "... seven for a secret, never to be told."))
# decode ...
chartr(myCypher, lett, x) chartr(myCypher, lett, x)
# (Nb. substitution cyphers are easy to crack!) # (Nb. substitution cyphers are easy to crack!)
# substituing characters # === 6.1.4 Substitute characters
(s <- gsub("IV", "i-v", s)) # gsub can change length, first argument is (s <- gsub("IV", "i-v", s)) # gsub can change length, first argument is
# a "regular expression"! # a "regular expression"!
@ -195,7 +226,7 @@ MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")
# remove "whitespace" (spaces, tabs, line breaks)... # remove "whitespace" (spaces, tabs, line breaks)...
(s <- gsub("\\s", "", s)) (s <- gsub("\\s", "", s))
# == 6.1 stringi and stringr =============================================== # == 6.2 stringi and stringr ===============================================
# But there are also specialized functions eg. to remove leading/trailing # But there are also specialized functions eg. to remove leading/trailing
# whitespace which may be important to sanitize user input etc. Have a look at # whitespace which may be important to sanitize user input etc. Have a look at
@ -205,7 +236,7 @@ MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")
# == 6.2 dbSanitizeSequence() ============================================== # == 6.3 dbSanitizeSequence() ==============================================
# In our learning units, we use a function dbSanitizeSequence() to clean up # In our learning units, we use a function dbSanitizeSequence() to clean up
# sequences that may be copy/pasted from Web-sources # sequences that may be copy/pasted from Web-sources
@ -254,10 +285,13 @@ mean(which(x == "K")) # ... gives us the average of the permuted sequence.
(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", ""))) (s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
N <- 10000 N <- 10000
d <- numeric(N) d <- numeric(N)
set.seed(112358)
set.seed(112358) # set RNG seed for repeatable randomness
for (i in 1:N) { for (i in 1:N) {
d[i] <- mean(which(sample(s, length(s)) == "K")) d[i] <- mean(which(sample(s, length(s)) == "K"))
} }
set.seed(NULL) # reset the RNG
hist(d, breaks = 20) hist(d, breaks = 20)
abline(v = 2.5, lwd = 2, col = "firebrick") abline(v = 2.5, lwd = 2, col = "firebrick")
sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the
@ -276,8 +310,10 @@ sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the
nuc <- c("A", "C", "G", "T") nuc <- c("A", "C", "G", "T")
N <- 100 N <- 100
set.seed(16818)
set.seed(16818) # set RNG seed for repeatable randomness
v <- sample(nuc, N, replace = TRUE) v <- sample(nuc, N, replace = TRUE)
set.seed(NULL) # reset the RNG
(mySeq <- paste(v, collapse = "")) (mySeq <- paste(v, collapse = ""))
# What's the GC content? # What's the GC content?
@ -297,7 +333,7 @@ if (! require(stringi, quietly=TRUE)) {
# data(package = "stringi") # available datasets # data(package = "stringi") # available datasets
(x <- stri_match_all(mySeq, regex = "CG")) (x <- stringi::stri_match_all(mySeq, regex = "CG"))
length(unlist(x)) length(unlist(x))
# Now you could compare that number with yeast DNA sequences, and determine # Now you could compare that number with yeast DNA sequences, and determine
@ -323,9 +359,12 @@ c(rep("C", 19), rep("G", 19), rep(c("A"), 31), rep(c("T"), 31))
nuc <- c("A", "C", "G", "T") nuc <- c("A", "C", "G", "T")
N <- 100 N <- 100
set.seed(16818)
myProb <- c(0.31, 0.19, 0.19, 0.31) # sampling probabilities myProb <- c(0.31, 0.19, 0.19, 0.31) # sampling probabilities
set.seed(16818) # set RNG seed for repeatable randomness
v <- sample(nuc, N, prob = myProb, replace = TRUE) v <- sample(nuc, N, prob = myProb, replace = TRUE)
set.seed(NULL) # reset the RNG
(mySeq <- paste(v, collapse = "")) (mySeq <- paste(v, collapse = ""))
# What's the GC content? # What's the GC content?
@ -333,7 +372,7 @@ table(v)
sum(table(v)[c("G", "C")]) # Close to expected sum(table(v)[c("G", "C")]) # Close to expected
# What's the number of CpG motifs? # What's the number of CpG motifs?
(x <- stri_match_all(mySeq, regex = "CG")) (x <- stringi::stri_match_all(mySeq, regex = "CG"))
# ... not a single one in this case. # ... not a single one in this case.

View File

@ -3,12 +3,13 @@
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the FND-MAT-Graphs_and_networks unit. # R code accompanying the FND-MAT-Graphs_and_networks unit.
# #
# Version: 1.0 # Version: 1.1
# #
# Date: 2017 10 06 # Date: 2017 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.1 Update set.seed() usage
# 1.0 First final version for learning units. # 1.0 First final version for learning units.
# 0.1 First code copied from 2016 material. # 0.1 First code copied from 2016 material.
# #
@ -28,18 +29,18 @@
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> ------------------------------------------------------ #TOC> ------------------------------------------------------------
#TOC> 1 Review 52 #TOC> 1 Review 48
#TOC> 2 DEGREE DISTRIBUTIONS 201 #TOC> 2 DEGREE DISTRIBUTIONS 201
#TOC> 2.1 Random graph 207 #TOC> 2.1 Random graph 207
#TOC> 2.2 scale-free graph (Barabasi-Albert) 251 #TOC> 2.2 scale-free graph (Barabasi-Albert) 255
#TOC> 2.3 Random geometric graph 313 #TOC> 2.3 Random geometric graph 320
#TOC> 3 A CLOSER LOOK AT THE igraph PACKAGE 433 #TOC> 3 A CLOSER LOOK AT THE igraph PACKAGE 442
#TOC> 3.1 Basics 436 #TOC> 3.1 Basics 445
#TOC> 3.2 Components 508 #TOC> 3.2 Components 517
#TOC> 4 RANDOM GRAPHS AND GRAPH METRICS 527 #TOC> 4 RANDOM GRAPHS AND GRAPH METRICS 536
#TOC> 4.1 Diameter 562 #TOC> 4.1 Diameter 573
#TOC> 5 GRAPH CLUSTERING 630 #TOC> 5 GRAPH CLUSTERING 641
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
@ -57,7 +58,7 @@
# To begin let's write a little function that will create random "gene" names; # To begin let's write a little function that will create random "gene" names;
# there's no particular purpose to this other than to make our graphs look a # there's no particular purpose to this other than to make our graphs look a
# little more "biological ... # little more "biological" ...
makeRandomGenenames <- function(N) { makeRandomGenenames <- function(N) {
nam <- character() nam <- character()
while (length(nam) < N) { while (length(nam) < N) {
@ -72,8 +73,9 @@ makeRandomGenenames <- function(N) {
N <- 20 N <- 20
set.seed(112358) set.seed(112358) # set RNG seed for repeatable randomness
(Nnames <- makeRandomGenenames(N)) (Nnames <- makeRandomGenenames(N))
set.seed(NULL) # reset the RNG
# One way to represent graphs in a computer is as an "adjacency matrix". In this # One way to represent graphs in a computer is as an "adjacency matrix". In this
# matrix, each row and each column represents a node, and the cell at the # matrix, each row and each column represents a node, and the cell at the
@ -112,8 +114,9 @@ makeRandomAM <- function(nam, p = 0.1) {
return(AM) return(AM)
} }
set.seed(112358) set.seed(112358) # set RNG seed for repeatable randomness
(myRandAM <- makeRandomAM(Nnames, p = 0.09)) (myRandAM <- makeRandomAM(Nnames, p = 0.09))
set.seed(NULL) # reset the RNG
# Listing the matrix is not very informative - we should plot this graph. The # Listing the matrix is not very informative - we should plot this graph. The
@ -131,8 +134,10 @@ if (! require(igraph, quietly=TRUE)) {
myG <- graph_from_adjacency_matrix(myRandAM, mode = "undirected") myG <- graph_from_adjacency_matrix(myRandAM, mode = "undirected")
set.seed(112358)
set.seed(112358) # set RNG seed for repeatable randomness
myGxy <- layout_with_graphopt(myG, charge=0.0012) # calculate layout coordinates myGxy <- layout_with_graphopt(myG, charge=0.0012) # calculate layout coordinates
set.seed(NULL) # reset the RNG
# The igraph package adds its own function to the collection of plot() # The igraph package adds its own function to the collection of plot()
@ -201,13 +206,17 @@ axis(side = 1, at = 0:7)
# == 2.1 Random graph ====================================================== # == 2.1 Random graph ======================================================
N <- 200
set.seed(31415927) # set RNG seed for repeatable randomness
my200AM <- makeRandomAM(as.character(1:N), p = 0.015)
set.seed(NULL) # reset the RNG
set.seed(31415927)
my200AM <- makeRandomAM(as.character(1:200), p = 0.015)
myG200 <- graph_from_adjacency_matrix(my200AM, mode = "undirected") myG200 <- graph_from_adjacency_matrix(my200AM, mode = "undirected")
myGxy <- layout_with_graphopt(myG200, charge=0.0001) # calculate layout coordinates myGxy <- layout_with_graphopt(myG200, charge=0.0001) # calculate layout
# coordinates
oPar <- par(mar= rep(0,4)) # Turn margins off oPar <- par(mar= rep(0,4)) # Turn margins off, save graphics state
plot(myG200, plot(myG200,
layout = myGxy, layout = myGxy,
rescale = FALSE, rescale = FALSE,
@ -216,7 +225,7 @@ plot(myG200,
vertex.color=heat.colors(max(degree(myG200)+1))[degree(myG200)+1], vertex.color=heat.colors(max(degree(myG200)+1))[degree(myG200)+1],
vertex.size = 150 + (60 * degree(myG200)), vertex.size = 150 + (60 * degree(myG200)),
vertex.label = NA) vertex.label = NA)
par(oPar) par(oPar) # restore graphics state
# This graph has thirteen singletons and one large, connected component. Many # This graph has thirteen singletons and one large, connected component. Many
# biological graphs look approximately like this. # biological graphs look approximately like this.
@ -251,12 +260,15 @@ plot(log10(as.numeric(names(freqRank)) + 1),
# stands for "preferential attachment". Preferential attachment is one type of # stands for "preferential attachment". Preferential attachment is one type of
# process that will yield scale-free distributions. # process that will yield scale-free distributions.
set.seed(31415927) N <- 200
GBA <- sample_pa(200, power = 0.8, directed = FALSE)
set.seed(31415927) # set RNG seed for repeatable randomness
GBA <- sample_pa(N, power = 0.8, directed = FALSE)
set.seed(NULL) # reset the RNG
GBAxy <- layout_with_graphopt(GBA, charge=0.0001) # calculate layout coordinates GBAxy <- layout_with_graphopt(GBA, charge=0.0001) # calculate layout coordinates
oPar <- par(mar= rep(0,4)) # Turn margins off oPar <- par(mar= rep(0,4)) # Turn margins off, save graphics state
plot(GBA, plot(GBA,
layout = GBAxy, layout = GBAxy,
rescale = FALSE, rescale = FALSE,
@ -265,7 +277,7 @@ plot(GBA,
vertex.color=heat.colors(max(degree(GBA)+1))[degree(GBA)+1], vertex.color=heat.colors(max(degree(GBA)+1))[degree(GBA)+1],
vertex.size = 200 + (30 * degree(GBA)), vertex.size = 200 + (30 * degree(GBA)),
vertex.label = NA) vertex.label = NA)
par(oPar) par(oPar) # restore graphics state
# This is a very obviously different graph! Some biological networks have # This is a very obviously different graph! Some biological networks have
# features that look like that - but in my experience the hub nodes are usually # features that look like that - but in my experience the hub nodes are usually
@ -386,8 +398,10 @@ makeRandomGeometricAM <- function(nam, B = 25, Q = 0.001, t = 0.6) {
# xlab = "d", ylab = "p(edge)") # xlab = "d", ylab = "p(edge)")
# 200 node random geometric graph # 200 node random geometric graph
set.seed(112358) N <- 200
rGAM <- makeRandomGeometricAM(as.character(1:200), t=0.4) set.seed(112358) # set RNG seed for repeatable randomness
rGAM <- makeRandomGeometricAM(as.character(1:N), t = 0.4)
set.seed(NULL) # reset the RNG
myGRG <- graph_from_adjacency_matrix(rGAM$mat, mode = "undirected") myGRG <- graph_from_adjacency_matrix(rGAM$mat, mode = "undirected")
@ -539,20 +553,22 @@ names(c1)
# considered to be more central. And that's also the way the force-directed # considered to be more central. And that's also the way the force-directed
# layout draws them, obviously. # layout draws them, obviously.
set.seed(112358) set.seed(112358) # set RNG seed for repeatable randomness
myGxy <- layout_with_fr(myG) # calculate layout coordinates myGxy <- layout_with_fr(myG) # calculate layout coordinates
oPar <- par(mar= rep(0,4)) # Turn margins off set.seed(NULL) # reset the RNG
oPar <- par(mar = rep(0, 4)) # turn margins off, save graphics state
plot(myG, plot(myG,
layout = myGxy, layout = myGxy,
rescale = FALSE, rescale = FALSE,
xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01), xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01),
ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01), ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01),
vertex.color=heat.colors(max(degree(myG)+1))[degree(myG)+1], vertex.color=heat.colors(max(degree(myG) + 1))[degree(myG) + 1],
vertex.size = 20 + (10 * degree(myG)), vertex.size = 20 + (10 * degree(myG)),
vertex.label = V(myG)$name, vertex.label = V(myG)$name,
vertex.label.family = "sans", vertex.label.family = "sans",
vertex.label.cex = 0.8) vertex.label.cex = 0.8)
par(oPar) par(oPar) # restore graphics state
# == 4.1 Diameter ========================================================== # == 4.1 Diameter ==========================================================

View File

@ -3,12 +3,13 @@
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the FND-STA-Probability_distribution unit. # R code accompanying the FND-STA-Probability_distribution unit.
# #
# Version: 1.1 # Version: 1.2
# #
# Date: 2017 10 # Date: 2017 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.2 Update set.seed() usage
# 1.1 Corrected empirical p-value # 1.1 Corrected empirical p-value
# 1.0 First code live version # 1.0 First code live version
# #
@ -27,21 +28,21 @@
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> ----------------------------------------------------------------------- #TOC> -------------------------------------------------------------------------
#TOC> 1 Introduction 49 #TOC> 1 Introduction 50
#TOC> 2 Three fundamental distributions 112 #TOC> 2 Three fundamental distributions 113
#TOC> 2.1 The Poisson Distribution 115 #TOC> 2.1 The Poisson Distribution 116
#TOC> 2.2 The uniform distribution 168 #TOC> 2.2 The uniform distribution 170
#TOC> 2.3 The Normal Distribution 188 #TOC> 2.3 The Normal Distribution 190
#TOC> 3 quantile-quantile comparison 229 #TOC> 3 quantile-quantile comparison 231
#TOC> 3.1 qqnorm() 239 #TOC> 3.1 qqnorm() 241
#TOC> 3.2 qqplot() 299 #TOC> 3.2 qqplot() 307
#TOC> 4 Quantifying the difference 316 #TOC> 4 Quantifying the difference 324
#TOC> 4.1 Chi2 test for discrete distributions 350 #TOC> 4.1 Chi2 test for discrete distributions 359
#TOC> 4.2 Kullback-Leibler divergence 441 #TOC> 4.2 Kullback-Leibler divergence 451
#TOC> 4.2.1 An example from tossing dice 452 #TOC> 4.2.1 An example from tossing dice 462
#TOC> 4.2.2 An example from lognormal distributions 574 #TOC> 4.2.2 An example from lognormal distributions 585
#TOC> 4.3 Kolmogorov-Smirnov test for continuous distributions 616 #TOC> 4.3 Kolmogorov-Smirnov test for continuous distributions 628
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
@ -151,6 +152,7 @@ set.seed(112358)
for (i in 1:N) { for (i in 1:N) {
x[i] <- sum(sample(genes, 250)) # sum of TFs in our sample in this trial x[i] <- sum(sample(genes, 250)) # sum of TFs in our sample in this trial
} }
set.seed(NULL)
(t <- table(x)/N) (t <- table(x)/N)
@ -241,8 +243,10 @@ hist(v, breaks = 20, col = "#F8DDFF")
# The functions qqnorm() and qqline() perform this # The functions qqnorm() and qqline() perform this
# comparison with the normal distribution. # comparison with the normal distribution.
set.seed(1112358) set.seed(112358)
x <- rnorm(100, mean=0, sd=1) # 100 normally distributed balues x <- rnorm(100, mean=0, sd=1) # 100 normally distributed values
set.seed(NULL)
qqnorm(x) qqnorm(x)
qqline(x, col = "seagreen") qqline(x, col = "seagreen")
@ -253,12 +257,15 @@ qqline(x, col = "seagreen")
# Create a vector of sample means from the exponential distribution; use # Create a vector of sample means from the exponential distribution; use
# only a few samples for the mean # only a few samples for the mean
set.seed(112358)
x <- rexp(12345) x <- rexp(12345)
v <- numeric(999) v <- numeric(999)
set.seed(112358)
for (i in 1:length(v)) { for (i in 1:length(v)) {
v[i] <- mean(sample(x, 12)) v[i] <- mean(sample(x, 12))
} }
set.seed(NULL)
qqnorm(v) qqnorm(v)
qqline(v, col = "turquoise") # normal qqline(v, col = "turquoise") # normal
@ -288,13 +295,14 @@ rEVD <- numeric(9999)
for (i in seq_along(rEVD)) { for (i in seq_along(rEVD)) {
rEVD[i] <- max(rnorm(100)) rEVD[i] <- max(rnorm(100))
} }
set.seed(NULL)
hist(rEVD, breaks = 20, col = "orchid") hist(rEVD, breaks = 20, col = "orchid")
# Note the long tail on the right! # Note the long tail on the right!
qqnorm(rEVD) qqnorm(rEVD)
qqline(rEVD, col = "orchid") # normal qqline(rEVD, col = "orchid") # Definitely not "normal"!
# Definitely not "normal"!
# == 3.2 qqplot() ========================================================== # == 3.2 qqplot() ==========================================================
@ -331,6 +339,7 @@ dl2 <- dlnorm(x - 0.25) # log-normal distribution, shifted right (a bit)
dg1.2 <- dgamma(x, shape=1.2) # three gamma distributions with... dg1.2 <- dgamma(x, shape=1.2) # three gamma distributions with...
dg1.5 <- dgamma(x, shape=1.5) # ...wider, and wider... dg1.5 <- dgamma(x, shape=1.5) # ...wider, and wider...
dg1.9 <- dgamma(x, shape=1.9) # ...peak dg1.9 <- dgamma(x, shape=1.9) # ...peak
set.seed(NULL)
myCols <- c("black", "grey", "maroon", "turquoise", "steelblue") myCols <- c("black", "grey", "maroon", "turquoise", "steelblue")
@ -361,6 +370,7 @@ rL2 <- rlnorm(N, meanlog = 0.25) # log-normal distribution, shifted right
rG1.2 <- rgamma(N, shape=1.2) # three gamma distributions with... rG1.2 <- rgamma(N, shape=1.2) # three gamma distributions with...
rG1.5 <- rgamma(N, shape=1.5) # ...wider, and wider... rG1.5 <- rgamma(N, shape=1.5) # ...wider, and wider...
rG1.9 <- rgamma(N, shape=1.9) # ...peak rG1.9 <- rgamma(N, shape=1.9) # ...peak
set.seed(NULL)
maxX <- max(c(rL1, rL2, rG1.2, rG1.5, rG1.9)) maxX <- max(c(rL1, rL2, rG1.2, rG1.5, rG1.9))
@ -459,6 +469,7 @@ chisq.test(countsL1, countsG1.9, simulate.p.value = TRUE, B = 10000)
set.seed(47) set.seed(47)
N <- 20 N <- 20
(counts <- table(sample(1:6, N, replace = TRUE))) (counts <- table(sample(1:6, N, replace = TRUE)))
set.seed(NULL)
# We have not observed a "2"! # We have not observed a "2"!
# #
@ -597,6 +608,7 @@ for (i in 1:N) {
q <- pmfPC(y, nam = 1:10) # convert to p.m.f. with pseudocounts q <- pmfPC(y, nam = 1:10) # convert to p.m.f. with pseudocounts
divs[i] <- KLdiv(pmfL1, q) # calculate Kullback-Leibler divergence divs[i] <- KLdiv(pmfL1, q) # calculate Kullback-Leibler divergence
} }
set.seed(NULL)
hist(divs, hist(divs,
col = "thistle", col = "thistle",
@ -605,7 +617,7 @@ hist(divs,
abline(v = KLdiv(pmfL1, pmfL2), col="firebrick") abline(v = KLdiv(pmfL1, pmfL2), col="firebrick")
# How many KL-divergences were less than the difference we observed? # How many KL-divergences were less than the difference we observed?
sum(divs < KLdiv(pmfL1, pmfL2)) #933 sum(divs < KLdiv(pmfL1, pmfL2)) # 933
# Therefore the empirical p-value that the samples came from the same # Therefore the empirical p-value that the samples came from the same
# distribution is only 100 * ((N - 933) + 1) / (N + 1) (%) ... 6.8%. You see # distribution is only 100 * ((N - 933) + 1) / (N + 1) (%) ... 6.8%. You see

View File

@ -3,12 +3,13 @@
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the FND-STA-Significance unit. # R code accompanying the FND-STA-Significance unit.
# #
# Version: 1.1 # Version: 1.2
# #
# Date: 2017 09 - 2017 10 # Date: 2017 09 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.2 Update set.seed() usage
# 1.1 Corrected treatment of empirical p-value # 1.1 Corrected treatment of empirical p-value
# 1.0 First contents # 1.0 First contents
# #
@ -26,15 +27,15 @@
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> ------------------------------------------------------------ #TOC> ------------------------------------------------------------------
#TOC> 1 Significance and p-value 42 #TOC> 1 Significance and p-value 43
#TOC> 1.1 Significance levels 53 #TOC> 1.1 Significance levels 54
#TOC> 1.2 probability and p-value 70 #TOC> 1.2 probability and p-value 71
#TOC> 1.2.1 p-value illustrated 100 #TOC> 1.2.1 p-value illustrated 103
#TOC> 2 One- or two-sided 153 #TOC> 2 One- or two-sided 158
#TOC> 3 Significance by integration 193 #TOC> 3 Significance by integration 198
#TOC> 4 Significance by simulation or permutation 199 #TOC> 4 Significance by simulation or permutation 204
#TOC> 5 Final tasks 302 #TOC> 5 Final tasks 312
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
@ -75,6 +76,8 @@
set.seed(sqrt(5)) set.seed(sqrt(5))
x <- rnorm(1) x <- rnorm(1)
set.seed(NULL)
print(x, digits = 22) print(x, digits = 22)
# [1] -0.8969145466249813791748 # [1] -0.8969145466249813791748
@ -102,8 +105,10 @@ print(x, digits = 22)
# Let's illustrate. First we draw a million random values from our # Let's illustrate. First we draw a million random values from our
# standard, normal distribution: # standard, normal distribution:
set.seed(112358) N <- 1e6 # one million
r <- rnorm(1000000) set.seed(112358) # set RNG seed for repeatable randomness
r <- rnorm(N) # N values from a normal distribution
set.seed(NULL) # reset the RNG
# Let's see what the distribution looks like: # Let's see what the distribution looks like:
@ -277,9 +282,14 @@ chSep <- function(v) {
chSep(v) chSep(v)
# Now we can produce a random permutation of v, and recalculate # Now we can produce a random permutation of v, and recalculate
set.seed(pi)
set.seed(pi) # set RNG seed for repeatable randomness
w <- sample(v, length(v)) # This shuffles the vector v. Memorize this w <- sample(v, length(v)) # This shuffles the vector v. Memorize this
# code paradigm. It is very useful. # code paradigm. It is very useful.
set.seed(NULL) # reset the RNG
chSep(w) chSep(w)
# 3.273 ... that's actually less than what we had before. # 3.273 ... that's actually less than what we had before.

View File

@ -489,7 +489,8 @@ for (name in toupper(myControls)) {
# == 5.1 Final task: Gene descriptions ===================================== # == 5.1 Final task: Gene descriptions =====================================
# Print the descriptions of the top ten differentially expressed genes. # Print the descriptions of the top ten differentially expressed genes
# and comment on what they have in common (or not).
# = 6 Improving on Discovery by Differential Expression =================== # = 6 Improving on Discovery by Differential Expression ===================
@ -617,9 +618,9 @@ GPL1914 <- getGEO("GPL1914")
str(GPL1914) str(GPL1914)
# ... from which we can get the data - which is however NOT necessarily # ... from which we can get the data - which is however NOT necessarily
# matched to the rows of our expression dataset. Note that here to: the majority # matched to the rows of our expression dataset. Note that here too: the
# of data elements are factors and will likely have to be converted before # majority of data elements are factors and will likely have to be converted
# use. # before use.
# [END] # [END]

View File

@ -3,12 +3,13 @@
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Genetic_code_optimality unit. # R code accompanying the RPR-Genetic_code_optimality unit.
# #
# Version: 1.0.1 # Version: 1.1
# #
# Date: 2017 10 16 # Date: 2017 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.1 Update set.seed() usage
# 1.0.1 Fixed two bugs discovered by Suan Chin Yeo. # 1.0.1 Fixed two bugs discovered by Suan Chin Yeo.
# 1.0 New material. # 1.0 New material.
# #
@ -28,17 +29,17 @@
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> -------------------------------------------------------- #TOC> --------------------------------------------------------------
#TOC> 1 Designing a computational experiment 57 #TOC> 1 Designing a computational experiment 54
#TOC> 2 Setting up the tools 73 #TOC> 2 Setting up the tools 70
#TOC> 2.1 Natural and alternative genetic codes 76 #TOC> 2.1 Natural and alternative genetic codes 73
#TOC> 2.2 Effect of mutations 135 #TOC> 2.2 Effect of mutations 132
#TOC> 2.2.1 reverse-translate 146 #TOC> 2.2.1 reverse-translate 143
#TOC> 2.2.2 Randomly mutate 171 #TOC> 2.2.2 Randomly mutate 168
#TOC> 2.2.3 Forward- translate 196 #TOC> 2.2.3 Forward- translate 193
#TOC> 2.2.4 measure effect 214 #TOC> 2.2.4 measure effect 211
#TOC> 3 Run the experiment 261 #TOC> 3 Run the experiment 258
#TOC> 4 Task solutions 348 #TOC> 4 Task solutions 351
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
@ -269,18 +270,21 @@ myAA <- traFor(myDNA, GENETIC_CODE)
# Mutate and evaluate # Mutate and evaluate
set.seed(112358) set.seed(112358)
x <- randMut(myDNA) x <- randMut(myDNA)
set.seed(NULL)
x <- traFor(x, GENETIC_CODE) x <- traFor(x, GENETIC_CODE)
evalMut(myAA, x) # 166.4 evalMut(myAA, x) # 166.4
# Try this 200 times, and see how the values are distributed. # Try this 200 times, and see how the values are distributed.
set.seed(112358)
N <- 200 N <- 200
valUGC <- numeric(N) valUGC <- numeric(N)
set.seed(112358) # set RNG seed for repeatable randomness
for (i in 1:N) { for (i in 1:N) {
x <- randMut(myDNA) # mutate x <- randMut(myDNA) # mutate
x <- traFor(x, GENETIC_CODE) # translate x <- traFor(x, GENETIC_CODE) # translate
valUGC[i] <- evalMut(myAA, x) # evaluate valUGC[i] <- evalMut(myAA, x) # evaluate
} }
set.seed(NULL) # reset the RNG
hist(valUGC, hist(valUGC,
breaks = 15, breaks = 15,
@ -299,6 +303,7 @@ effectUGC <- mean(valUGC) # 178.1
set.seed(112358) set.seed(112358)
# choose a new code # choose a new code
GC <- randomGC(GENETIC_CODE) GC <- randomGC(GENETIC_CODE)
set.seed(NULL)
# reverse translate hypothetical sequence according to the new code # reverse translate hypothetical sequence according to the new code
x <- traRev(myAA, GC) x <- traRev(myAA, GC)
@ -311,9 +316,10 @@ evalMut(myAA, x) # evaluate mutation effects: 298.5
# Let's try with different genetic codes. 200 trials - but this time every trial # Let's try with different genetic codes. 200 trials - but this time every trial
# is with a different, synthetic genetic code. # is with a different, synthetic genetic code.
set.seed(1414214)
N <- 200 N <- 200
valXGC <- numeric(N) valXGC <- numeric(N)
set.seed(1414214) # set RNG seed for repeatable randomness
for (i in 1:N) { for (i in 1:N) {
GC <- randomGC(GENETIC_CODE) # Choose code GC <- randomGC(GENETIC_CODE) # Choose code
x <- traRev(myAA, GC) # reverse translate x <- traRev(myAA, GC) # reverse translate
@ -321,6 +327,7 @@ for (i in 1:N) {
x <- traFor(x, GC) # translate x <- traFor(x, GC) # translate
valXGC[i] <- evalMut(myAA, x) # evaluate valXGC[i] <- evalMut(myAA, x) # evaluate
} }
set.seed(NULL) # reset the RNG
hist(valXGC, hist(valXGC,
col = "plum", col = "plum",
@ -343,9 +350,10 @@ hist(valXGC,
# = 4 Task solutions ====================================================== # = 4 Task solutions ======================================================
set.seed(2718282)
N <- 200 N <- 200
valSGC <- numeric(N) valSGC <- numeric(N)
set.seed(2718282) # set RNG seed for repeatable randomness
for (i in 1:N) { for (i in 1:N) {
GC <- swappedGC(GENETIC_CODE) # Choose code GC <- swappedGC(GENETIC_CODE) # Choose code
x <- traRev(myAA, GC) # reverse translate x <- traRev(myAA, GC) # reverse translate
@ -353,6 +361,7 @@ for (i in 1:N) {
x <- traFor(x, GC) # translate x <- traFor(x, GC) # translate
valSGC[i] <- evalMut(myAA, x) # evaluate valSGC[i] <- evalMut(myAA, x) # evaluate
} }
set.seed(NULL) # reset the RNG
hist(valSGC, hist(valSGC,
col = "#6688FF88", col = "#6688FF88",