Maintenance updates, and revised set.seed() usage

This commit is contained in:
hyginn 2019-01-07 16:17:23 +10:00
parent 2ab162e375
commit 6f54293592
9 changed files with 275 additions and 180 deletions

View File

@ -25,13 +25,14 @@
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------
#TOC> 1 Preparations 39
#TOC> 2 Suitable MYSPE Species 51
#TOC> 3 Adopt "MYSPE" 65
#TOC> Section Title Line
#TOC> -----------------------------------------------
#TOC> 1 Preparations 39
#TOC> 2 Suitable MYSPE Species 51
#TOC> 3 Adopt "MYSPE" 65
#TOC>
#TOC> ==========================================================================
@ -71,6 +72,7 @@ if (! exists("myStudentNumber")) {
load("data/MYSPEspecies.RData") # load the species names
set.seed(myStudentNumber) # seed the random number generator
MYSPE <- sample(MYSPEspecies, 1) # pick a species at random
set.seed(NULL) # reset the random number generator
# write the result to your personalized profile data so we can use the result in
# other functions
cat(sprintf("MYSPE <- \"%s\"\n", MYSPE), file = ".myProfile.R", append = TRUE)
@ -80,7 +82,7 @@ biCode(MYSPE) # and what is it's "BiCode" ... ?
# Task: Note down the species name and its five letter label on your Student
# Wiki user page. Use this species whenever this or future assignments refer
# to MYSPE. In code, we will automatically load it from your.myProfile.R file.
# to MYSPE. In code, we will automatically load it from your .myProfile.R file.
# [END]

View File

@ -28,15 +28,15 @@
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------------
#TOC> 1 Preparation and Tree Plot 43
#TOC> 2 Tree Analysis 82
#TOC> 2.1 Rooting Trees 141
#TOC> 2.2 Rotating Clades 187
#TOC> 2.3 Computing tree distances 234
#TOC>
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------------------
#TOC> 1 Preparation and Tree Plot 43
#TOC> 2 Tree Analysis 82
#TOC> 2.1 Rooting Trees 141
#TOC> 2.2 Rotating Clades 187
#TOC> 2.3 Computing tree distances 234
#TOC>
#TOC> ==========================================================================
@ -269,13 +269,14 @@ rtree(n = length(apsTree2$tip.label), # number of tips
# compare them anyway.
# Let's compute some random trees this way, calculate the distances to
# fungiTree, and then compare the values we get for apsTree2:
# fungiTree, and then compare the values we get for apsTree2. The random
# trees are provided by ape::rtree().
set.seed(112358)
N <- 10000 # takes about 15 seconds
myTreeDistances <- matrix(numeric(N * 2), ncol = 2)
colnames(myTreeDistances) <- c("symm", "path")
set.seed(112358)
for (i in 1:N) {
xTree <- rtree(n = length(apsTree2$tip.label),
rooted = TRUE,
@ -283,6 +284,7 @@ for (i in 1:N) {
br = NULL)
myTreeDistances[i, ] <- treedist(fungiTree, xTree)
}
set.seed(NULL) # reset the random number generator
table(myTreeDistances[, "symm"])

View File

@ -27,15 +27,15 @@
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------------------
#TOC> 1 Setup and data 43
#TOC> 2 Functional Edges in the Human Proteome 80
#TOC> 2.1 Cliques 123
#TOC> 2.2 Communities 164
#TOC> 2.3 Betweenness Centrality 176
#TOC> 3 biomaRt 220
#TOC> 4 Task for submission 291
#TOC> Section Title Line
#TOC> ---------------------------------------------------------------
#TOC> 1 Setup and data 43
#TOC> 2 Functional Edges in the Human Proteome 80
#TOC> 2.1 Cliques 123
#TOC> 2.2 Communities 164
#TOC> 2.3 Betweenness Centrality 178
#TOC> 3 biomaRt 224
#TOC> 4 Task for submission 295
#TOC>
#TOC> ==========================================================================
@ -163,8 +163,10 @@ par(oPar)
# == 2.2 Communities =======================================================
set.seed(112358)
set.seed(112358) # set RNG seed for repeatable randomness
gSTRclusters <- cluster_infomap(gSTR)
set.seed(NULL) # reset the RNG
modularity(gSTRclusters) # ... measures how separated the different membership
# types are from each other
tMem <- table(membership(gSTRclusters))
@ -205,9 +207,11 @@ head(sBC)
# We are going to use these IDs to produce some output for a submitted task:
# so I need you to personalize ENSPsel with the following
# two lines of code:
set.seed(<myStudentNumber>) # enter your student number here
# three lines of code:
set.seed(<myStudentNumber>) # enter your student number here
(ENSPsel <- sample(ENSPsel))
set.seed(NULL) # reset the random number generator
# Next, to find what these proteins are...

View File

@ -3,12 +3,13 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-Sequence unit.
#
# Version: 1.2
# Version: 1.3
#
# Date: 2017 09 - 2017 10
# Date: 2017 09 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.3 Update set.seed() usage
# 1.2 Removed irrelevant task. How did that even get in there? smh
# 1.1 Add chartr()
# 1.0 First live version 2017.
@ -27,21 +28,25 @@
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ----------------------------------------------
#TOC> 1 Prepare 55
#TOC> 2 Storing Sequence 73
#TOC> 3 String properties 102
#TOC> 4 Substrings 109
#TOC> 5 Creating strings: sprintf() 115
#TOC> 6 Changing strings 146
#TOC> 6.1 stringi and stringr 198
#TOC> 6.2 dbSanitizeSequence() 208
#TOC> 7 Permuting and sampling 220
#TOC> 7.1 Permutations 227
#TOC> 7.2 Sampling 270
#TOC> 7.2.1 Equiprobable characters 272
#TOC> 7.2.2 Defined probability vector 312
#TOC> Section Title Line
#TOC> ----------------------------------------------------
#TOC> 1 Prepare 60
#TOC> 2 Storing Sequence 78
#TOC> 3 String properties 107
#TOC> 4 Substrings 114
#TOC> 5 Creating strings: sprintf() 135
#TOC> 6 Changing strings 170
#TOC> 6.1.1 Changing case 172
#TOC> 6.1.2 Reverse 177
#TOC> 6.1.3 Change characters 181
#TOC> 6.1.4 Substitute characters 209
#TOC> 6.2 stringi and stringr 229
#TOC> 6.3 dbSanitizeSequence() 239
#TOC> 7 Permuting and sampling 251
#TOC> 7.1 Permutations 258
#TOC> 7.2 Sampling 304
#TOC> 7.2.1 Equiprobable characters 306
#TOC> 7.2.2 Defined probability vector 348
#TOC>
#TOC> ==========================================================================
@ -111,16 +116,31 @@ nchar(s) # aha
# Use the substr() function
substr(s, 2, 4)
# or the similar substring()
substring(s, 2, 4)
# Note: both functions are vectorized (i.e. they operate on vectors
# of arguments, you don't need to loop over input)...
myBiCodes <- c("HOMSA", "MUSMU", "FUGRU", "XENLA")
substr( myBiCodes, 1, 3)
substring(myBiCodes, 1, 3)
# ... however only substring() will also use vectors for start and stop
s <- "gatattgtgatgacccagtaa" # a DNA sequence
(i <- seq(1, nchar(s), by = 3)) # an index vector
substr( s, i, i+2) # ... returns only the first nucleotide triplet
substring(s, i, i+2) # ... returns all triplets
# = 5 Creating strings: sprintf() =========================================
# Sprintf is a very smart, very powerful function and has cognates in all
# other programming languages. It has a small learning curve, but it's
# other programming languages. It has a bit of a learning curve, but this is
# totally worth it:
# the function takes a format string, and a list of other arguments. It returns
# a formatted string. Here are some examples - watch carefully for sprintf()
# calls in other code.
# calls elsewhere in the code.
sprintf("Just a string.")
sprintf("A string and the number %d.", 5)
@ -128,32 +148,37 @@ sprintf("More numbers: %d ate %d.", 7, 9) # Sorry
sprintf("Pi is ~ %1.2f ...", pi)
sprintf("or more accurately ~ %1.11f.", pi)
x <- "bottles of beer"
n <- 99
N <- 99
sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.",
n, x, n, x, "one down, and pass it around", n-1, x)
N, x, N, x, "one down, and pass it around", N - 1, x)
# Note that in the last example, the value of the string was displayed with
# R's usual print-formatting function and therefore the line-break "\n" did
# not actually break the line. To have line breaks, tabs etc, you need to use
# cat() to display the string:
for (i in 99:95) {
for (i in N:(N-4)) {
cat(sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.\n\n",
i, x, i, x, "one down, and pass it around", i-1, x))
i, x, i, x, "one down, and pass it around", i - 1, x))
}
# sprintf() is vectorized: if one of its parameters is a vector, it
# will generate one output string for each of the vector's elements:
cat(sprintf("\n%s fish", c("one", "two", "red", "blue")))
# = 6 Changing strings ====================================================
# Changing case
# === 6.1.1 Changing case
tolower(s)
toupper(tolower(s))
#reverse
# === 6.1.2 Reverse
reverse(s)
# === 6.1.3 Change characters
# chartr(old, new, x) maps all characters in x that appear in "old" to the
# corresponding character in "new."
@ -167,15 +192,21 @@ chartr(paste0(letters, collapse = ""),
# One amusing way to use the function is for a reversible substitution
# cypher.
set.seed(112358)
myCypher <- paste0(sample(letters), collapse = "")
lett <- paste0(letters, collapse = "")
set.seed(112358) # set RNG seed for repeatable randomness
(myCypher <- paste0(sample(letters), collapse = ""))
set.seed(NULL) # reset the RNG
(lett <- paste0(letters, collapse = ""))
# encode ...
(x <- chartr(lett, myCypher, "... seven for a secret, never to be told."))
# decode ...
chartr(myCypher, lett, x)
# (Nb. substitution cyphers are easy to crack!)
# substituing characters
# === 6.1.4 Substitute characters
(s <- gsub("IV", "i-v", s)) # gsub can change length, first argument is
# a "regular expression"!
@ -195,7 +226,7 @@ MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")
# remove "whitespace" (spaces, tabs, line breaks)...
(s <- gsub("\\s", "", s))
# == 6.1 stringi and stringr ===============================================
# == 6.2 stringi and stringr ===============================================
# But there are also specialized functions eg. to remove leading/trailing
# whitespace which may be important to sanitize user input etc. Have a look at
@ -205,7 +236,7 @@ MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")
# == 6.2 dbSanitizeSequence() ==============================================
# == 6.3 dbSanitizeSequence() ==============================================
# In our learning units, we use a function dbSanitizeSequence() to clean up
# sequences that may be copy/pasted from Web-sources
@ -254,10 +285,13 @@ mean(which(x == "K")) # ... gives us the average of the permuted sequence.
(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
N <- 10000
d <- numeric(N)
set.seed(112358)
set.seed(112358) # set RNG seed for repeatable randomness
for (i in 1:N) {
d[i] <- mean(which(sample(s, length(s)) == "K"))
}
set.seed(NULL) # reset the RNG
hist(d, breaks = 20)
abline(v = 2.5, lwd = 2, col = "firebrick")
sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the
@ -269,15 +303,17 @@ sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the
# == 7.2 Sampling ==========================================================
# === 7.2.1 Equiprobable characters
# === 7.2.1 Equiprobable characters
# Assume you need a large random-nucleotide string for some statistical model.
# How to create such a string? sample() can easily create it:
nuc <- c("A", "C", "G", "T")
N <- 100
set.seed(16818)
set.seed(16818) # set RNG seed for repeatable randomness
v <- sample(nuc, N, replace = TRUE)
set.seed(NULL) # reset the RNG
(mySeq <- paste(v, collapse = ""))
# What's the GC content?
@ -297,7 +333,7 @@ if (! require(stringi, quietly=TRUE)) {
# data(package = "stringi") # available datasets
(x <- stri_match_all(mySeq, regex = "CG"))
(x <- stringi::stri_match_all(mySeq, regex = "CG"))  # all "CG" matches in mySeq
length(unlist(x))
# Now you could compare that number with yeast DNA sequences, and determine
@ -309,7 +345,7 @@ length(unlist(x))
# of the smaller number of Cs and Gs - before biology even comes into play. How
# do we account for that?
# === 7.2.2 Defined probability vector
# === 7.2.2 Defined probability vector
# This is where we need to know how to create samples with specific probability
# distributions. A crude hack would be to create a sampling source vector with
@ -323,9 +359,12 @@ c(rep("C", 19), rep("G", 19), rep(c("A"), 31), rep(c("T"), 31))
nuc <- c("A", "C", "G", "T")
N <- 100
set.seed(16818)
myProb <- c(0.31, 0.19, 0.19, 0.31) # sampling probabilities
myProb <- c(0.31, 0.19, 0.19, 0.31) # sampling probabilities
set.seed(16818) # set RNG seed for repeatable randomness
v <- sample(nuc, N, prob = myProb, replace = TRUE)
set.seed(NULL) # reset the RNG
(mySeq <- paste(v, collapse = ""))
# What's the GC content?
@ -333,7 +372,7 @@ table(v)
sum(table(v)[c("G", "C")]) # Close to expected
# What's the number of CpG motifs?
(x <- stri_match_all(mySeq, regex = "CG"))
(x <- stringi::stri_match_all(mySeq, regex = "CG"))
# ... not a single one in this case.

View File

@ -3,12 +3,13 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the FND-MAT-Graphs_and_networks unit.
#
# Version: 1.0
# Version: 1.1
#
# Date: 2017 10 06
# Date: 2017 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.1 Update set.seed() usage
# 1.0 First final version for learning units.
# 0.1 First code copied from 2016 material.
#
@ -27,19 +28,19 @@
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ------------------------------------------------------
#TOC> 1 Review 52
#TOC> 2 DEGREE DISTRIBUTIONS 201
#TOC> 2.1 Random graph 207
#TOC> 2.2 scale-free graph (Barabasi-Albert) 251
#TOC> 2.3 Random geometric graph 313
#TOC> 3 A CLOSER LOOK AT THE igraph PACKAGE 433
#TOC> 3.1 Basics 436
#TOC> 3.2 Components 508
#TOC> 4 RANDOM GRAPHS AND GRAPH METRICS 527
#TOC> 4.1 Diameter 562
#TOC> 5 GRAPH CLUSTERING 630
#TOC> Section Title Line
#TOC> ------------------------------------------------------------
#TOC> 1 Review 48
#TOC> 2 DEGREE DISTRIBUTIONS 201
#TOC> 2.1 Random graph 207
#TOC> 2.2 scale-free graph (Barabasi-Albert) 255
#TOC> 2.3 Random geometric graph 320
#TOC> 3 A CLOSER LOOK AT THE igraph PACKAGE 442
#TOC> 3.1 Basics 445
#TOC> 3.2 Components 517
#TOC> 4 RANDOM GRAPHS AND GRAPH METRICS 536
#TOC> 4.1 Diameter 573
#TOC> 5 GRAPH CLUSTERING 641
#TOC>
#TOC> ==========================================================================
@ -57,7 +58,7 @@
# To begin let's write a little function that will create random "gene" names;
# there's no particular purpose to this other than to make our graphs look a
# little more "biological ...
# little more "biological" ...
makeRandomGenenames <- function(N) {
nam <- character()
while (length(nam) < N) {
@ -72,8 +73,9 @@ makeRandomGenenames <- function(N) {
N <- 20
set.seed(112358)
set.seed(112358) # set RNG seed for repeatable randomness
(Nnames <- makeRandomGenenames(N))
set.seed(NULL) # reset the RNG
# One way to represent graphs in a computer is as an "adjacency matrix". In this
# matrix, each row and each column represents a node, and the cell at the
@ -112,8 +114,9 @@ makeRandomAM <- function(nam, p = 0.1) {
return(AM)
}
set.seed(112358)
set.seed(112358) # set RNG seed for repeatable randomness
(myRandAM <- makeRandomAM(Nnames, p = 0.09))
set.seed(NULL) # reset the RNG
# Listing the matrix is not very informative - we should plot this graph. The
@ -131,8 +134,10 @@ if (! require(igraph, quietly=TRUE)) {
myG <- graph_from_adjacency_matrix(myRandAM, mode = "undirected")
set.seed(112358)
myGxy <- layout_with_graphopt(myG, charge=0.0012) # calculate layout coordinates
set.seed(112358) # set RNG seed for repeatable randomness
myGxy <- layout_with_graphopt(myG, charge=0.0012) # calculate layout coordinates
set.seed(NULL) # reset the RNG
# The igraph package adds its own function to the collection of plot()
@ -201,13 +206,17 @@ axis(side = 1, at = 0:7)
# == 2.1 Random graph ======================================================
N <- 200
set.seed(31415927) # set RNG seed for repeatable randomness
my200AM <- makeRandomAM(as.character(1:N), p = 0.015)
set.seed(NULL) # reset the RNG
set.seed(31415927)
my200AM <- makeRandomAM(as.character(1:200), p = 0.015)
myG200 <- graph_from_adjacency_matrix(my200AM, mode = "undirected")
myGxy <- layout_with_graphopt(myG200, charge=0.0001) # calculate layout coordinates
myGxy <- layout_with_graphopt(myG200, charge=0.0001) # calculate layout
# coordinates
oPar <- par(mar= rep(0,4)) # Turn margins off
oPar <- par(mar= rep(0,4)) # Turn margins off, save graphics state
plot(myG200,
layout = myGxy,
rescale = FALSE,
@ -216,7 +225,7 @@ plot(myG200,
vertex.color=heat.colors(max(degree(myG200)+1))[degree(myG200)+1],
vertex.size = 150 + (60 * degree(myG200)),
vertex.label = NA)
par(oPar)
par(oPar) # restore graphics state
# This graph has thirteen singletons and one large, connected component. Many
# biological graphs look approximately like this.
@ -251,12 +260,15 @@ plot(log10(as.numeric(names(freqRank)) + 1),
# stands for "preferential attachment". Preferential attachment is one type of
# process that will yield scale-free distributions.
set.seed(31415927)
GBA <- sample_pa(200, power = 0.8, directed = FALSE)
N <- 200
set.seed(31415927) # set RNG seed for repeatable randomness
GBA <- sample_pa(N, power = 0.8, directed = FALSE)
set.seed(NULL) # reset the RNG
GBAxy <- layout_with_graphopt(GBA, charge=0.0001) # calculate layout coordinates
oPar <- par(mar= rep(0,4)) # Turn margins off
oPar <- par(mar= rep(0,4)) # Turn margins off, save graphics state
plot(GBA,
layout = GBAxy,
rescale = FALSE,
@ -265,7 +277,7 @@ plot(GBA,
vertex.color=heat.colors(max(degree(GBA)+1))[degree(GBA)+1],
vertex.size = 200 + (30 * degree(GBA)),
vertex.label = NA)
par(oPar)
par(oPar) # restore graphics state
# This is a very obviously different graph! Some biological networks have
# features that look like that - but in my experience the hub nodes are usually
@ -386,8 +398,10 @@ makeRandomGeometricAM <- function(nam, B = 25, Q = 0.001, t = 0.6) {
# xlab = "d", ylab = "p(edge)")
# 200 node random geometric graph
set.seed(112358)
rGAM <- makeRandomGeometricAM(as.character(1:200), t=0.4)
N <- 200
set.seed(112358) # set RNG seed for repeatable randomness
rGAM <- makeRandomGeometricAM(as.character(1:N), t = 0.4)
set.seed(NULL) # reset the RNG
myGRG <- graph_from_adjacency_matrix(rGAM$mat, mode = "undirected")
@ -539,20 +553,22 @@ names(c1)
# considered to be more central. And that's also the way the force-directed
# layout draws them, obviously.
set.seed(112358)
myGxy <- layout_with_fr(myG) # calculate layout coordinates
oPar <- par(mar= rep(0,4)) # Turn margins off
set.seed(112358) # set RNG seed for repeatable randomness
myGxy <- layout_with_fr(myG) # calculate layout coordinates
set.seed(NULL) # reset the RNG
oPar <- par(mar = rep(0, 4)) # turn margins off, save graphics state
plot(myG,
layout = myGxy,
rescale = FALSE,
xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01),
ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01),
vertex.color=heat.colors(max(degree(myG)+1))[degree(myG)+1],
vertex.color=heat.colors(max(degree(myG) + 1))[degree(myG) + 1],
vertex.size = 20 + (10 * degree(myG)),
vertex.label = V(myG)$name,
vertex.label.family = "sans",
vertex.label.cex = 0.8)
par(oPar)
par(oPar) # restore graphics state
# == 4.1 Diameter ==========================================================

View File

@ -3,12 +3,13 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the FND-STA-Probability_distribution unit.
#
# Version: 1.1
# Version: 1.2
#
# Date: 2017 10
# Date: 2017 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 Update set.seed() usage
# 1.1 Corrected empirical p-value
# 1.0 First code live version
#
@ -26,22 +27,22 @@
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -----------------------------------------------------------------------
#TOC> 1 Introduction 49
#TOC> 2 Three fundamental distributions 112
#TOC> 2.1 The Poisson Distribution 115
#TOC> 2.2 The uniform distribution 168
#TOC> 2.3 The Normal Distribution 188
#TOC> 3 quantile-quantile comparison 229
#TOC> 3.1 qqnorm() 239
#TOC> 3.2 qqplot() 299
#TOC> 4 Quantifying the difference 316
#TOC> 4.1 Chi2 test for discrete distributions 350
#TOC> 4.2 Kullback-Leibler divergence 441
#TOC> 4.2.1 An example from tossing dice 452
#TOC> 4.2.2 An example from lognormal distributions 574
#TOC> 4.3 Kolmogorov-Smirnov test for continuous distributions 616
#TOC> Section Title Line
#TOC> -------------------------------------------------------------------------
#TOC> 1 Introduction 50
#TOC> 2 Three fundamental distributions 113
#TOC> 2.1 The Poisson Distribution 116
#TOC> 2.2 The uniform distribution 170
#TOC> 2.3 The Normal Distribution 190
#TOC> 3 quantile-quantile comparison 231
#TOC> 3.1 qqnorm() 241
#TOC> 3.2 qqplot() 307
#TOC> 4 Quantifying the difference 324
#TOC> 4.1 Chi2 test for discrete distributions 359
#TOC> 4.2 Kullback-Leibler divergence 451
#TOC> 4.2.1 An example from tossing dice 462
#TOC> 4.2.2 An example from lognormal distributions 585
#TOC> 4.3 Kolmogorov-Smirnov test for continuous distributions 628
#TOC>
#TOC> ==========================================================================
@ -151,6 +152,7 @@ set.seed(112358)
for (i in 1:N) {
x[i] <- sum(sample(genes, 250)) # sum of TFs in our sample in this trial
}
set.seed(NULL)
(t <- table(x)/N)
@ -241,8 +243,10 @@ hist(v, breaks = 20, col = "#F8DDFF")
# The functions qqnorm() and qqline() perform this
# comparison with the normal distribution.
set.seed(1112358)
x <- rnorm(100, mean=0, sd=1) # 100 normally distributed balues
set.seed(112358)
x <- rnorm(100, mean=0, sd=1) # 100 normally distributed values
set.seed(NULL)
qqnorm(x)
qqline(x, col = "seagreen")
@ -253,12 +257,15 @@ qqline(x, col = "seagreen")
# Create a vector of sample means from the exponential distribution; use
# only a few samples for the mean
set.seed(112358)
x <- rexp(12345)
v <- numeric(999)
set.seed(112358)
for (i in 1:length(v)) {
v[i] <- mean(sample(x, 12))
}
set.seed(NULL)
qqnorm(v)
qqline(v, col = "turquoise") # normal
@ -288,13 +295,14 @@ rEVD <- numeric(9999)
for (i in seq_along(rEVD)) {
rEVD[i] <- max(rnorm(100))
}
set.seed(NULL)
hist(rEVD, breaks = 20, col = "orchid")
# Note the long tail on the right!
qqnorm(rEVD)
qqline(rEVD, col = "orchid") # normal
qqline(rEVD, col = "orchid") # Definitely not "normal"!
# Definitely not "normal"!
# == 3.2 qqplot() ==========================================================
@ -331,6 +339,7 @@ dl2 <- dlnorm(x - 0.25) # log-normal distribution, shifted right (a bit)
dg1.2 <- dgamma(x, shape=1.2) # three gamma distributions with...
dg1.5 <- dgamma(x, shape=1.5) # ...wider, and wider...
dg1.9 <- dgamma(x, shape=1.9) # ...peak
set.seed(NULL)
myCols <- c("black", "grey", "maroon", "turquoise", "steelblue")
@ -361,6 +370,7 @@ rL2 <- rlnorm(N, meanlog = 0.25) # log-normal distribution, shifted right
rG1.2 <- rgamma(N, shape=1.2) # three gamma distributions with...
rG1.5 <- rgamma(N, shape=1.5) # ...wider, and wider...
rG1.9 <- rgamma(N, shape=1.9) # ...peak
set.seed(NULL)
maxX <- max(c(rL1, rL2, rG1.2, rG1.5, rG1.9))
@ -449,7 +459,7 @@ chisq.test(countsL1, countsG1.9, simulate.p.value = TRUE, B = 10000)
# be applied to discrete distributions. But we need to talk a bit about
# converting counts to p.m.f.'s.
# === 4.2.1 An example from tossing dice
# === 4.2.1 An example from tossing dice
# The p.m.f of an honest die is (1:1/6, 2:1/6, 3:1/6, 4:1/6, 5:1/6, 6:1/6). But
# there is an issue when we convert sampled counts to frequencies, and estimate
@ -459,6 +469,7 @@ chisq.test(countsL1, countsG1.9, simulate.p.value = TRUE, B = 10000)
set.seed(47)
N <- 20
(counts <- table(sample(1:6, N, replace = TRUE)))
set.seed(NULL)
# We have not observed a "2"!
#
@ -571,7 +582,7 @@ abline(v = KLdiv(rep(1/6, 6), pmfPC(counts, 1:6)), col="firebrick")
# somewhat but not drastically atypical.
# === 4.2.2 An example from lognormal distributions
# === 4.2.2 An example from lognormal distributions
# We had compared a set of lognormal and gamma distributions above, now we
# can use KL-divergence to quantify their similarity:
@ -597,6 +608,7 @@ for (i in 1:N) {
q <- pmfPC(y, nam = 1:10) # convert to p.m.f. with pseudocounts
divs[i] <- KLdiv(pmfL1, q) # calculate Kullback-Leibler divergence
}
set.seed(NULL)
hist(divs,
col = "thistle",
@ -605,7 +617,7 @@ hist(divs,
abline(v = KLdiv(pmfL1, pmfL2), col="firebrick")
# How many KL-divergences were less than the difference we observed?
sum(divs < KLdiv(pmfL1, pmfL2)) #933
sum(divs < KLdiv(pmfL1, pmfL2)) # 933
# Therefore the empirical p-value that the samples came from the same
# distribution is only 100 * ((N - 933) + 1) / (N + 1) (%) ... 6.8%. You see

View File

@ -3,12 +3,13 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the FND-STA-Significance unit.
#
# Version: 1.1
# Version: 1.2
#
# Date: 2017 09 - 2017 10
# Date: 2017 09 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 Update set.seed() usage
# 1.1 Corrected treatment of empirical p-value
# 1.0 First contents
#
@ -25,16 +26,16 @@
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ------------------------------------------------------------
#TOC> 1 Significance and p-value 42
#TOC> 1.1 Significance levels 53
#TOC> 1.2 probability and p-value 70
#TOC> 1.2.1 p-value illustrated 100
#TOC> 2 One- or two-sided 153
#TOC> 3 Significance by integration 193
#TOC> 4 Significance by simulation or permutation 199
#TOC> 5 Final tasks 302
#TOC> Section Title Line
#TOC> ------------------------------------------------------------------
#TOC> 1 Significance and p-value 43
#TOC> 1.1 Significance levels 54
#TOC> 1.2 probability and p-value 71
#TOC> 1.2.1 p-value illustrated 103
#TOC> 2 One- or two-sided 158
#TOC> 3 Significance by integration 198
#TOC> 4 Significance by simulation or permutation 204
#TOC> 5 Final tasks 312
#TOC>
#TOC> ==========================================================================
@ -75,6 +76,8 @@
set.seed(sqrt(5))
x <- rnorm(1)
set.seed(NULL)
print(x, digits = 22)
# [1] -0.8969145466249813791748
@ -97,13 +100,15 @@ print(x, digits = 22)
# curve, as a fraction of the whole.
# === 1.2.1 p-value illustrated
# === 1.2.1 p-value illustrated
# Let's illustrate. First we draw a million random values from our
# standard, normal distribution:
set.seed(112358)
r <- rnorm(1000000)
N <- 1e6 # one million
set.seed(112358) # set RNG seed for repeatable randomness
r <- rnorm(N) # N values from a normal distribution
set.seed(NULL) # reset the RNG
# Let's see what the distribution looks like:
@ -277,9 +282,14 @@ chSep <- function(v) {
chSep(v)
# Now we can produce a random permutation of v, and recalculate
set.seed(pi)
w <- sample(v, length(v)) # This shuffles the vector v. Memorize this
# code paradigm. It is very useful.
set.seed(pi) # set RNG seed for repeatable randomness
w <- sample(v, length(v)) # This shuffles the vector v. Memorize this
# code paradigm. It is very useful.
set.seed(NULL) # reset the RNG
chSep(w)
# 3.273 ... that's actually less than what we had before.

View File

@ -31,7 +31,7 @@
#TOC> ==========================================================================
#TOC>
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------------------------------------
#TOC> 1 Preparations 53
@ -46,7 +46,7 @@
#TOC> 5.1 Final task: Gene descriptions 490
#TOC> 6 Improving on Discovery by Differential Expression 495
#TOC> 7 Annotation data 577
#TOC>
#TOC>
#TOC> ==========================================================================
@ -489,7 +489,8 @@ for (name in toupper(myControls)) {
# == 5.1 Final task: Gene descriptions =====================================
# Print the descriptions of the top ten differentially expressed genes.
# Print the descriptions of the top ten differentially expressed genes
# and comment on what they have in common (or not).
# = 6 Improving on Discovery by Differential Expression ===================
@ -617,9 +618,9 @@ GPL1914 <- getGEO("GPL1914")
str(GPL1914)
# ... from which we can get the data - which is however NOT necessarily
# matched to the rows of our expression dataset. Note that here to: the majority
# of data elements are factors and will likely have to be converted before
# use.
# matched to the rows of our expression dataset. Note that here too: the
# majority of data elements are factors and will likely have to be converted
# before use.
# [END]

View File

@ -3,12 +3,13 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Genetic_code_optimality unit.
#
# Version: 1.0.1
# Version: 1.1
#
# Date: 2017 10 16
# Date: 2017 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.1 Update set.seed() usage
# 1.0.1 Fixed two bugs discovered by Suan Chin Yeo.
# 1.0 New material.
#
@ -26,20 +27,20 @@
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------------------------
#TOC> 1 Designing a computational experiment 57
#TOC> 2 Setting up the tools 73
#TOC> 2.1 Natural and alternative genetic codes 76
#TOC> 2.2 Effect of mutations 135
#TOC> 2.2.1 reverse-translate 146
#TOC> 2.2.2 Randomly mutate 171
#TOC> 2.2.3 Forward- translate 196
#TOC> 2.2.4 measure effect 214
#TOC> 3 Run the experiment 261
#TOC> 4 Task solutions 348
#TOC>
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------------------------------
#TOC> 1 Designing a computational experiment 54
#TOC> 2 Setting up the tools 70
#TOC> 2.1 Natural and alternative genetic codes 73
#TOC> 2.2 Effect of mutations 132
#TOC> 2.2.1 reverse-translate 143
#TOC> 2.2.2 Randomly mutate 168
#TOC> 2.2.3 Forward- translate 193
#TOC> 2.2.4 measure effect 211
#TOC> 3 Run the experiment 258
#TOC> 4 Task solutions 351
#TOC>
#TOC> ==========================================================================
@ -139,7 +140,7 @@ swappedGC <- function(GC) {
# - we count the number of mutations and evaluate their severity.
# === 2.2.1 reverse-translate
# === 2.2.1 reverse-translate
# To reverse-translate an amino acid vector, we randomly pick one of its
# codons from a genetic code, and assemble all codons to a sequence.
@ -164,7 +165,7 @@ traRev <- function(s, GC) {
}
# === 2.2.2 Randomly mutate
# === 2.2.2 Randomly mutate
# To mutate, we split a codon into its three nucleotides, then randomly replace
# one of the three with another nucleotide.
@ -189,7 +190,7 @@ randMut <- function(vC) {
# === 2.2.3 Forward- translate
# === 2.2.3 Forward- translate
traFor <- function(vC, GC) {
# Parameters:
@ -207,7 +208,7 @@ traFor <- function(vC, GC) {
}
# === 2.2.4 measure effect
# === 2.2.4 measure effect
# How do we evaluate the effect of the mutation? We'll take a simple ad hoc
# approach: we divide amino acids into hydrophobic, hydrophilic, and neutral
@ -269,18 +270,21 @@ myAA <- traFor(myDNA, GENETIC_CODE)
# Mutate and evaluate
set.seed(112358)
x <- randMut(myDNA)
set.seed(NULL)
x <- traFor(x, GENETIC_CODE)
evalMut(myAA, x) # 166.4
# Try this 200 times, and see how the values are distributed.
set.seed(112358)
N <- 200
valUGC <- numeric(N)
set.seed(112358) # set RNG seed for repeatable randomness
for (i in 1:N) {
x <- randMut(myDNA) # mutate
x <- traFor(x, GENETIC_CODE) # translate
valUGC[i] <- evalMut(myAA, x) # evaluate
x <- randMut(myDNA) # mutate
x <- traFor(x, GENETIC_CODE) # translate
valUGC[i] <- evalMut(myAA, x) # evaluate
}
set.seed(NULL) # reset the RNG
hist(valUGC,
breaks = 15,
@ -299,6 +303,7 @@ effectUGC <- mean(valUGC) # 178.1
set.seed(112358)
# choose a new code
GC <- randomGC(GENETIC_CODE)
set.seed(NULL)
# reverse translate hypothetical sequence according to the new code
x <- traRev(myAA, GC)
@ -311,9 +316,10 @@ evalMut(myAA, x) # evaluate mutation effects: 298.5
# Let's try with different genetic codes. 200 trials - but this time every trial
# is with a different, synthetic genetic code.
set.seed(1414214)
N <- 200
valXGC <- numeric(N)
set.seed(1414214) # set RNG seed for repeatable randomness
for (i in 1:N) {
GC <- randomGC(GENETIC_CODE) # Choose code
x <- traRev(myAA, GC) # reverse translate
@ -321,6 +327,7 @@ for (i in 1:N) {
x <- traFor(x, GC) # translate
valXGC[i] <- evalMut(myAA, x) # evaluate
}
set.seed(NULL) # reset the RNG
hist(valXGC,
col = "plum",
@ -343,9 +350,10 @@ hist(valXGC,
# = 4 Task solutions ======================================================
set.seed(2718282)
N <- 200
valSGC <- numeric(N)
set.seed(2718282) # set RNG seed for repeatable randomness
for (i in 1:N) {
GC <- swappedGC(GENETIC_CODE) # Choose code
x <- traRev(myAA, GC) # reverse translate
@ -353,6 +361,7 @@ for (i in 1:N) {
x <- traFor(x, GC) # translate
valSGC[i] <- evalMut(myAA, x) # evaluate
}
set.seed(NULL) # reset the RNG
hist(valSGC,
col = "#6688FF88",