Maintenance updates, and revised set.seed() usage
This commit is contained in:
parent
2ab162e375
commit
6f54293592
@ -25,10 +25,11 @@
|
|||||||
#
|
#
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ---------------------------------------
|
#TOC> -----------------------------------------------
|
||||||
#TOC> 1 Preparations 39
|
#TOC> 1 Preparations 39
|
||||||
#TOC> 2 Suitable MYSPE Species 51
|
#TOC> 2 Suitable MYSPE Species 51
|
||||||
#TOC> 3 Adopt "MYSPE" 65
|
#TOC> 3 Adopt "MYSPE" 65
|
||||||
@ -71,6 +72,7 @@ if (! exists("myStudentNumber")) {
|
|||||||
load("data/MYSPEspecies.RData") # load the species names
|
load("data/MYSPEspecies.RData") # load the species names
|
||||||
set.seed(myStudentNumber) # seed the random number generator
|
set.seed(myStudentNumber) # seed the random number generator
|
||||||
MYSPE <- sample(MYSPEspecies, 1) # pick a species at random
|
MYSPE <- sample(MYSPEspecies, 1) # pick a species at random
|
||||||
|
set.seed(NULL) # reset the random number generator
|
||||||
# write the result to your personalized profile data so we can use the result in
|
# write the result to your personalized profile data so we can use the result in
|
||||||
# other functions
|
# other functions
|
||||||
cat(sprintf("MYSPE <- \"%s\"\n", MYSPE), file = ".myProfile.R", append = TRUE)
|
cat(sprintf("MYSPE <- \"%s\"\n", MYSPE), file = ".myProfile.R", append = TRUE)
|
||||||
@ -80,7 +82,7 @@ biCode(MYSPE) # and what is it's "BiCode" ... ?
|
|||||||
|
|
||||||
# Task: Note down the species name and its five letter label on your Student
|
# Task: Note down the species name and its five letter label on your Student
|
||||||
# Wiki user page. Use this species whenever this or future assignments refer
|
# Wiki user page. Use this species whenever this or future assignments refer
|
||||||
# to MYSPE. In code, we will automatically load it from your.myProfile.R file.
|
# to MYSPE. In code, we will automatically load it from your .myProfile.R file.
|
||||||
|
|
||||||
|
|
||||||
# [END]
|
# [END]
|
||||||
|
@ -30,7 +30,7 @@
|
|||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> --------------------------------------------
|
#TOC> --------------------------------------------------
|
||||||
#TOC> 1 Preparation and Tree Plot 43
|
#TOC> 1 Preparation and Tree Plot 43
|
||||||
#TOC> 2 Tree Analysis 82
|
#TOC> 2 Tree Analysis 82
|
||||||
#TOC> 2.1 Rooting Trees 141
|
#TOC> 2.1 Rooting Trees 141
|
||||||
@ -269,13 +269,14 @@ rtree(n = length(apsTree2$tip.label), # number of tips
|
|||||||
# compare them anyway.
|
# compare them anyway.
|
||||||
|
|
||||||
# Let's compute some random trees this way, calculate the distances to
|
# Let's compute some random trees this way, calculate the distances to
|
||||||
# fungiTree, and then compare the values we get for apsTree2:
|
# fungiTree, and then compare the values we get for apsTree2. The random
|
||||||
|
# trees are provided by ape::rtree().
|
||||||
|
|
||||||
set.seed(112358)
|
|
||||||
N <- 10000 # takes about 15 seconds
|
N <- 10000 # takes about 15 seconds
|
||||||
myTreeDistances <- matrix(numeric(N * 2), ncol = 2)
|
myTreeDistances <- matrix(numeric(N * 2), ncol = 2)
|
||||||
colnames(myTreeDistances) <- c("symm", "path")
|
colnames(myTreeDistances) <- c("symm", "path")
|
||||||
|
|
||||||
|
set.seed(112358)
|
||||||
for (i in 1:N) {
|
for (i in 1:N) {
|
||||||
xTree <- rtree(n = length(apsTree2$tip.label),
|
xTree <- rtree(n = length(apsTree2$tip.label),
|
||||||
rooted = TRUE,
|
rooted = TRUE,
|
||||||
@ -283,6 +284,7 @@ for (i in 1:N) {
|
|||||||
br = NULL)
|
br = NULL)
|
||||||
myTreeDistances[i, ] <- treedist(fungiTree, xTree)
|
myTreeDistances[i, ] <- treedist(fungiTree, xTree)
|
||||||
}
|
}
|
||||||
|
set.seed(NULL) # reset the random number generator
|
||||||
|
|
||||||
table(myTreeDistances[, "symm"])
|
table(myTreeDistances[, "symm"])
|
||||||
|
|
||||||
|
@ -28,14 +28,14 @@
|
|||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ---------------------------------------------------------
|
#TOC> ---------------------------------------------------------------
|
||||||
#TOC> 1 Setup and data 43
|
#TOC> 1 Setup and data 43
|
||||||
#TOC> 2 Functional Edges in the Human Proteome 80
|
#TOC> 2 Functional Edges in the Human Proteome 80
|
||||||
#TOC> 2.1 Cliques 123
|
#TOC> 2.1 Cliques 123
|
||||||
#TOC> 2.2 Communities 164
|
#TOC> 2.2 Communities 164
|
||||||
#TOC> 2.3 Betweenness Centrality 176
|
#TOC> 2.3 Betweenness Centrality 178
|
||||||
#TOC> 3 biomaRt 220
|
#TOC> 3 biomaRt 224
|
||||||
#TOC> 4 Task for submission 291
|
#TOC> 4 Task for submission 295
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -163,8 +163,10 @@ par(oPar)
|
|||||||
|
|
||||||
# == 2.2 Communities =======================================================
|
# == 2.2 Communities =======================================================
|
||||||
|
|
||||||
set.seed(112358)
|
set.seed(112358) # set RNG seed for repeatable randomness
|
||||||
gSTRclusters <- cluster_infomap(gSTR)
|
gSTRclusters <- cluster_infomap(gSTR)
|
||||||
|
set.seed(NULL) # reset the RNG
|
||||||
|
|
||||||
modularity(gSTRclusters) # ... measures how separated the different membership
|
modularity(gSTRclusters) # ... measures how separated the different membership
|
||||||
# types are from each other
|
# types are from each other
|
||||||
tMem <- table(membership(gSTRclusters))
|
tMem <- table(membership(gSTRclusters))
|
||||||
@ -205,9 +207,11 @@ head(sBC)
|
|||||||
|
|
||||||
# We are going to use these IDs to produce some output for a submitted task:
|
# We are going to use these IDs to produce some output for a submitted task:
|
||||||
# so I need you to personalize ENSPsel with the following
|
# so I need you to personalize ENSPsel with the following
|
||||||
# two lines of code:
|
# three lines of code:
|
||||||
|
|
||||||
set.seed(<myStudentNumber>) # enter your student number here
|
set.seed(<myStudentNumber>) # enter your student number here
|
||||||
(ENSPsel <- sample(ENSPsel))
|
(ENSPsel <- sample(ENSPsel))
|
||||||
|
set.seed(NULL) # reset the random number generator
|
||||||
|
|
||||||
# Next, to find what these proteins are...
|
# Next, to find what these proteins are...
|
||||||
|
|
||||||
|
109
BIN-Sequence.R
109
BIN-Sequence.R
@ -3,12 +3,13 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-Sequence unit.
|
# R code accompanying the BIN-Sequence unit.
|
||||||
#
|
#
|
||||||
# Version: 1.2
|
# Version: 1.3
|
||||||
#
|
#
|
||||||
# Date: 2017 09 - 2017 10
|
# Date: 2017 09 - 2019 01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.3 Update set.seed() usage
|
||||||
# 1.2 Removed irrelevant task. How did that even get in there? smh
|
# 1.2 Removed irrelevant task. How did that even get in there? smh
|
||||||
# 1.1 Add chartr()
|
# 1.1 Add chartr()
|
||||||
# 1.0 First live version 2017.
|
# 1.0 First live version 2017.
|
||||||
@ -28,20 +29,24 @@
|
|||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ----------------------------------------------
|
#TOC> ----------------------------------------------------
|
||||||
#TOC> 1 Prepare 55
|
#TOC> 1 Prepare 60
|
||||||
#TOC> 2 Storing Sequence 73
|
#TOC> 2 Storing Sequence 78
|
||||||
#TOC> 3 String properties 102
|
#TOC> 3 String properties 107
|
||||||
#TOC> 4 Substrings 109
|
#TOC> 4 Substrings 114
|
||||||
#TOC> 5 Creating strings: sprintf() 115
|
#TOC> 5 Creating strings: sprintf() 135
|
||||||
#TOC> 6 Changing strings 146
|
#TOC> 6 Changing strings 170
|
||||||
#TOC> 6.1 stringi and stringr 198
|
#TOC> 6.1.1 Changing case 172
|
||||||
#TOC> 6.2 dbSanitizeSequence() 208
|
#TOC> 6.1.2 Reverse 177
|
||||||
#TOC> 7 Permuting and sampling 220
|
#TOC> 6.1.3 Change characters 181
|
||||||
#TOC> 7.1 Permutations 227
|
#TOC> 6.1.4 Substitute characters 209
|
||||||
#TOC> 7.2 Sampling 270
|
#TOC> 6.2 stringi and stringr 229
|
||||||
#TOC> 7.2.1 Equiprobable characters 272
|
#TOC> 6.3 dbSanitizeSequence() 239
|
||||||
#TOC> 7.2.2 Defined probability vector 312
|
#TOC> 7 Permuting and sampling 251
|
||||||
|
#TOC> 7.1 Permutations 258
|
||||||
|
#TOC> 7.2 Sampling 304
|
||||||
|
#TOC> 7.2.1 Equiprobable characters 306
|
||||||
|
#TOC> 7.2.2 Defined probability vector 348
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -111,16 +116,31 @@ nchar(s) # aha
|
|||||||
# Use the substr() function
|
# Use the substr() function
|
||||||
substr(s, 2, 4)
|
substr(s, 2, 4)
|
||||||
|
|
||||||
|
# or the similar substring()
|
||||||
|
substring(s, 2, 4)
|
||||||
|
|
||||||
|
# Note: both functions are vectorized (i.e. they operate on vectors
|
||||||
|
# of arguments, you don't need to loop over input)...
|
||||||
|
myBiCodes <- c("HOMSA", "MUSMU", "FUGRU", "XENLA")
|
||||||
|
substr( myBiCodes, 1, 3)
|
||||||
|
substring(myBiCodes, 1, 3)
|
||||||
|
|
||||||
|
# ... however only substring() will also use vectors for start and stop
|
||||||
|
s <- "gatattgtgatgacccagtaa" # a DNA sequence
|
||||||
|
(i <- seq(1, nchar(s), by = 3)) # an index vector
|
||||||
|
substr( s, i, i+2) # ... returns only the first nucleotide triplet
|
||||||
|
substring(s, i, i+2) # ... returns all triplets
|
||||||
|
|
||||||
|
|
||||||
# = 5 Creating strings: sprintf() =========================================
|
# = 5 Creating strings: sprintf() =========================================
|
||||||
|
|
||||||
|
|
||||||
# Sprintf is a very smart, very powerful function and has cognates in all
|
# Sprintf is a very smart, very powerful function and has cognates in all
|
||||||
# other programming languages. It has a small learning curve, but it's
|
# other programming languages. It has a bit of a learning curve, but this is
|
||||||
# totally worth it:
|
# totally worth it:
|
||||||
# the function takes a format string, and a list of other arguments. It returns
|
# the function takes a format string, and a list of other arguments. It returns
|
||||||
# a formatted string. Here are some examples - watch carefully for sprintf()
|
# a formatted string. Here are some examples - watch carefully for sprintf()
|
||||||
# calls in other code.
|
# calls elsewhere in the code.
|
||||||
|
|
||||||
sprintf("Just a string.")
|
sprintf("Just a string.")
|
||||||
sprintf("A string and the number %d.", 5)
|
sprintf("A string and the number %d.", 5)
|
||||||
@ -128,32 +148,37 @@ sprintf("More numbers: %d ate %d.", 7, 9) # Sorry
|
|||||||
sprintf("Pi is ~ %1.2f ...", pi)
|
sprintf("Pi is ~ %1.2f ...", pi)
|
||||||
sprintf("or more accurately ~ %1.11f.", pi)
|
sprintf("or more accurately ~ %1.11f.", pi)
|
||||||
x <- "bottles of beer"
|
x <- "bottles of beer"
|
||||||
n <- 99
|
N <- 99
|
||||||
sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.",
|
sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.",
|
||||||
n, x, n, x, "one down, and pass it around", n-1, x)
|
N, x, N, x, "one down, and pass it around", N - 1, x)
|
||||||
|
|
||||||
# Note that in the last example, the value of the string was displayed with
|
# Note that in the last example, the value of the string was displayed with
|
||||||
# R's usual print-formatting function and therefore the line-break "\n" did
|
# R's usual print-formatting function and therefore the line-break "\n" did
|
||||||
# not actually break the line. To have line breaks, tabs etc, you need to use
|
# not actually break the line. To have line breaks, tabs etc, you need to use
|
||||||
# cat() to display the string:
|
# cat() to display the string:
|
||||||
|
|
||||||
for (i in 99:95) {
|
for (i in N:(N-4)) {
|
||||||
cat(sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.\n\n",
|
cat(sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.\n\n",
|
||||||
i, x, i, x, "one down, and pass it around", i-1, x))
|
i, x, i, x, "one down, and pass it around", i - 1, x))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# sprintf() is vectorized: if one of its parameters is a vector, it
|
||||||
|
# will generate one output string for each of the vector's elements:
|
||||||
|
cat(sprintf("\n%s fish", c("one", "two", "red", "blue")))
|
||||||
|
|
||||||
|
|
||||||
# = 6 Changing strings ====================================================
|
# = 6 Changing strings ====================================================
|
||||||
|
|
||||||
# Changing case
|
# === 6.1.1 Changing case
|
||||||
tolower(s)
|
tolower(s)
|
||||||
toupper(tolower(s))
|
toupper(tolower(s))
|
||||||
|
|
||||||
|
|
||||||
#reverse
|
# === 6.1.2 Reverse
|
||||||
reverse(s)
|
reverse(s)
|
||||||
|
|
||||||
|
|
||||||
|
# === 6.1.3 Change characters
|
||||||
# chartr(old, new, x) maps all characters in x that appear in "old" to the
|
# chartr(old, new, x) maps all characters in x that appear in "old" to the
|
||||||
# corresponding character in "new."
|
# corresponding character in "new."
|
||||||
|
|
||||||
@ -167,15 +192,21 @@ chartr(paste0(letters, collapse = ""),
|
|||||||
|
|
||||||
# One amusing way to use the function is for a reversible substitution
|
# One amusing way to use the function is for a reversible substitution
|
||||||
# cypher.
|
# cypher.
|
||||||
set.seed(112358)
|
set.seed(112358) # set RNG seed for repeatable randomness
|
||||||
myCypher <- paste0(sample(letters), collapse = "")
|
(myCypher <- paste0(sample(letters), collapse = ""))
|
||||||
lett <- paste0(letters, collapse = "")
|
set.seed(NULL) # reset the RNG
|
||||||
|
|
||||||
|
(lett <- paste0(letters, collapse = ""))
|
||||||
|
|
||||||
|
# encode ...
|
||||||
(x <- chartr(lett, myCypher, "... seven for a secret, never to be told."))
|
(x <- chartr(lett, myCypher, "... seven for a secret, never to be told."))
|
||||||
|
|
||||||
|
# decode ...
|
||||||
chartr(myCypher, lett, x)
|
chartr(myCypher, lett, x)
|
||||||
# (Nb. substitution cyphers are easy to crack!)
|
# (Nb. substitution cyphers are easy to crack!)
|
||||||
|
|
||||||
|
|
||||||
# substituing characters
|
# === 6.1.4 Substitute characters
|
||||||
(s <- gsub("IV", "i-v", s)) # gsub can change length, first argument is
|
(s <- gsub("IV", "i-v", s)) # gsub can change length, first argument is
|
||||||
# a "regular expression"!
|
# a "regular expression"!
|
||||||
|
|
||||||
@ -195,7 +226,7 @@ MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")
|
|||||||
# remove "whitespace" (spaces, tabs, line breaks)...
|
# remove "whitespace" (spaces, tabs, line breaks)...
|
||||||
(s <- gsub("\\s", "", s))
|
(s <- gsub("\\s", "", s))
|
||||||
|
|
||||||
# == 6.1 stringi and stringr ===============================================
|
# == 6.2 stringi and stringr ===============================================
|
||||||
|
|
||||||
# But there are also specialized functions eg. to remove leading/trailing
|
# But there are also specialized functions eg. to remove leading/trailing
|
||||||
# whitespace which may be important to sanitize user input etc. Have a look at
|
# whitespace which may be important to sanitize user input etc. Have a look at
|
||||||
@ -205,7 +236,7 @@ MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
# == 6.2 dbSanitizeSequence() ==============================================
|
# == 6.3 dbSanitizeSequence() ==============================================
|
||||||
|
|
||||||
# In our learning units, we use a function dbSanitizeSequence() to clean up
|
# In our learning units, we use a function dbSanitizeSequence() to clean up
|
||||||
# sequences that may be copy/pasted from Web-sources
|
# sequences that may be copy/pasted from Web-sources
|
||||||
@ -254,10 +285,13 @@ mean(which(x == "K")) # ... gives us the average of the permuted sequence.
|
|||||||
(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
|
(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
|
||||||
N <- 10000
|
N <- 10000
|
||||||
d <- numeric(N)
|
d <- numeric(N)
|
||||||
set.seed(112358)
|
|
||||||
|
set.seed(112358) # set RNG seed for repeatable randomness
|
||||||
for (i in 1:N) {
|
for (i in 1:N) {
|
||||||
d[i] <- mean(which(sample(s, length(s)) == "K"))
|
d[i] <- mean(which(sample(s, length(s)) == "K"))
|
||||||
}
|
}
|
||||||
|
set.seed(NULL) # reset the RNG
|
||||||
|
|
||||||
hist(d, breaks = 20)
|
hist(d, breaks = 20)
|
||||||
abline(v = 2.5, lwd = 2, col = "firebrick")
|
abline(v = 2.5, lwd = 2, col = "firebrick")
|
||||||
sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the
|
sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the
|
||||||
@ -276,8 +310,10 @@ sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the
|
|||||||
|
|
||||||
nuc <- c("A", "C", "G", "T")
|
nuc <- c("A", "C", "G", "T")
|
||||||
N <- 100
|
N <- 100
|
||||||
set.seed(16818)
|
|
||||||
|
set.seed(16818) # set RNG seed for repeatable randomness
|
||||||
v <- sample(nuc, N, replace = TRUE)
|
v <- sample(nuc, N, replace = TRUE)
|
||||||
|
set.seed(NULL) # reset the RNG
|
||||||
(mySeq <- paste(v, collapse = ""))
|
(mySeq <- paste(v, collapse = ""))
|
||||||
|
|
||||||
# What's the GC content?
|
# What's the GC content?
|
||||||
@ -297,7 +333,7 @@ if (! require(stringi, quietly=TRUE)) {
|
|||||||
# data(package = "stringi") # available datasets
|
# data(package = "stringi") # available datasets
|
||||||
|
|
||||||
|
|
||||||
(x <- stri_match_all(mySeq, regex = "CG"))
|
# Find every "CG" match in mySeq. stri_match_all() is provided by the stringi
# package (the package this script requires), so the prefix must be stringi::
(x <- stringi::stri_match_all(mySeq, regex = "CG"))
|
||||||
length(unlist(x))
|
length(unlist(x))
|
||||||
|
|
||||||
# Now you could compare that number with yeast DNA sequences, and determine
|
# Now you could compare that number with yeast DNA sequences, and determine
|
||||||
@ -323,9 +359,12 @@ c(rep("C", 19), rep("G", 19), rep(c("A"), 31), rep(c("T"), 31))
|
|||||||
|
|
||||||
nuc <- c("A", "C", "G", "T")
|
nuc <- c("A", "C", "G", "T")
|
||||||
N <- 100
|
N <- 100
|
||||||
set.seed(16818)
|
|
||||||
myProb <- c(0.31, 0.19, 0.19, 0.31) # sampling probabilities
|
myProb <- c(0.31, 0.19, 0.19, 0.31) # sampling probabilities
|
||||||
|
|
||||||
|
set.seed(16818) # set RNG seed for repeatable randomness
|
||||||
v <- sample(nuc, N, prob = myProb, replace = TRUE)
|
v <- sample(nuc, N, prob = myProb, replace = TRUE)
|
||||||
|
set.seed(NULL) # reset the RNG
|
||||||
|
|
||||||
(mySeq <- paste(v, collapse = ""))
|
(mySeq <- paste(v, collapse = ""))
|
||||||
|
|
||||||
# What's the GC content?
|
# What's the GC content?
|
||||||
@ -333,7 +372,7 @@ table(v)
|
|||||||
sum(table(v)[c("G", "C")]) # Close to expected
|
sum(table(v)[c("G", "C")]) # Close to expected
|
||||||
|
|
||||||
# What's the number of CpG motifs?
|
# What's the number of CpG motifs?
|
||||||
(x <- stri_match_all(mySeq, regex = "CG"))
|
(x <- stringi::stri_match_all(mySeq, regex = "CG"))
|
||||||
# ... not a single one in this case.
|
# ... not a single one in this case.
|
||||||
|
|
||||||
|
|
||||||
|
@ -3,12 +3,13 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the FND-MAT-Graphs_and_networks unit.
|
# R code accompanying the FND-MAT-Graphs_and_networks unit.
|
||||||
#
|
#
|
||||||
# Version: 1.0
|
# Version: 1.1
|
||||||
#
|
#
|
||||||
# Date: 2017 10 06
|
# Date: 2017 10 - 2019 01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.1 Update set.seed() usage
|
||||||
# 1.0 First final version for learning units.
|
# 1.0 First final version for learning units.
|
||||||
# 0.1 First code copied from 2016 material.
|
# 0.1 First code copied from 2016 material.
|
||||||
#
|
#
|
||||||
@ -28,18 +29,18 @@
|
|||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ------------------------------------------------------
|
#TOC> ------------------------------------------------------------
|
||||||
#TOC> 1 Review 52
|
#TOC> 1 Review 48
|
||||||
#TOC> 2 DEGREE DISTRIBUTIONS 201
|
#TOC> 2 DEGREE DISTRIBUTIONS 201
|
||||||
#TOC> 2.1 Random graph 207
|
#TOC> 2.1 Random graph 207
|
||||||
#TOC> 2.2 scale-free graph (Barabasi-Albert) 251
|
#TOC> 2.2 scale-free graph (Barabasi-Albert) 255
|
||||||
#TOC> 2.3 Random geometric graph 313
|
#TOC> 2.3 Random geometric graph 320
|
||||||
#TOC> 3 A CLOSER LOOK AT THE igraph PACKAGE 433
|
#TOC> 3 A CLOSER LOOK AT THE igraph PACKAGE 442
|
||||||
#TOC> 3.1 Basics 436
|
#TOC> 3.1 Basics 445
|
||||||
#TOC> 3.2 Components 508
|
#TOC> 3.2 Components 517
|
||||||
#TOC> 4 RANDOM GRAPHS AND GRAPH METRICS 527
|
#TOC> 4 RANDOM GRAPHS AND GRAPH METRICS 536
|
||||||
#TOC> 4.1 Diameter 562
|
#TOC> 4.1 Diameter 573
|
||||||
#TOC> 5 GRAPH CLUSTERING 630
|
#TOC> 5 GRAPH CLUSTERING 641
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -57,7 +58,7 @@
|
|||||||
|
|
||||||
# To begin let's write a little function that will create random "gene" names;
|
# To begin let's write a little function that will create random "gene" names;
|
||||||
# there's no particular purpose to this other than to make our graphs look a
|
# there's no particular purpose to this other than to make our graphs look a
|
||||||
# little more "biological ...
|
# little more "biological" ...
|
||||||
makeRandomGenenames <- function(N) {
|
makeRandomGenenames <- function(N) {
|
||||||
nam <- character()
|
nam <- character()
|
||||||
while (length(nam) < N) {
|
while (length(nam) < N) {
|
||||||
@ -72,8 +73,9 @@ makeRandomGenenames <- function(N) {
|
|||||||
|
|
||||||
N <- 20
|
N <- 20
|
||||||
|
|
||||||
set.seed(112358)
|
set.seed(112358) # set RNG seed for repeatable randomness
|
||||||
(Nnames <- makeRandomGenenames(N))
|
(Nnames <- makeRandomGenenames(N))
|
||||||
|
set.seed(NULL) # reset the RNG
|
||||||
|
|
||||||
# One way to represent graphs in a computer is as an "adjacency matrix". In this
|
# One way to represent graphs in a computer is as an "adjacency matrix". In this
|
||||||
# matrix, each row and each column represents a node, and the cell at the
|
# matrix, each row and each column represents a node, and the cell at the
|
||||||
@ -112,8 +114,9 @@ makeRandomAM <- function(nam, p = 0.1) {
|
|||||||
return(AM)
|
return(AM)
|
||||||
}
|
}
|
||||||
|
|
||||||
set.seed(112358)
|
set.seed(112358) # set RNG seed for repeatable randomness
|
||||||
(myRandAM <- makeRandomAM(Nnames, p = 0.09))
|
(myRandAM <- makeRandomAM(Nnames, p = 0.09))
|
||||||
|
set.seed(NULL) # reset the RNG
|
||||||
|
|
||||||
|
|
||||||
# Listing the matrix is not very informative - we should plot this graph. The
|
# Listing the matrix is not very informative - we should plot this graph. The
|
||||||
@ -131,8 +134,10 @@ if (! require(igraph, quietly=TRUE)) {
|
|||||||
|
|
||||||
|
|
||||||
myG <- graph_from_adjacency_matrix(myRandAM, mode = "undirected")
|
myG <- graph_from_adjacency_matrix(myRandAM, mode = "undirected")
|
||||||
set.seed(112358)
|
|
||||||
|
set.seed(112358) # set RNG seed for repeatable randomness
|
||||||
myGxy <- layout_with_graphopt(myG, charge=0.0012) # calculate layout coordinates
|
myGxy <- layout_with_graphopt(myG, charge=0.0012) # calculate layout coordinates
|
||||||
|
set.seed(NULL) # reset the RNG
|
||||||
|
|
||||||
|
|
||||||
# The igraph package adds its own function to the collection of plot()
|
# The igraph package adds its own function to the collection of plot()
|
||||||
@ -201,13 +206,17 @@ axis(side = 1, at = 0:7)
|
|||||||
|
|
||||||
# == 2.1 Random graph ======================================================
|
# == 2.1 Random graph ======================================================
|
||||||
|
|
||||||
|
N <- 200
|
||||||
|
|
||||||
|
set.seed(31415927) # set RNG seed for repeatable randomness
|
||||||
|
my200AM <- makeRandomAM(as.character(1:N), p = 0.015)
|
||||||
|
set.seed(NULL) # reset the RNG
|
||||||
|
|
||||||
set.seed(31415927)
|
|
||||||
my200AM <- makeRandomAM(as.character(1:200), p = 0.015)
|
|
||||||
myG200 <- graph_from_adjacency_matrix(my200AM, mode = "undirected")
|
myG200 <- graph_from_adjacency_matrix(my200AM, mode = "undirected")
|
||||||
myGxy <- layout_with_graphopt(myG200, charge=0.0001) # calculate layout coordinates
|
myGxy <- layout_with_graphopt(myG200, charge=0.0001) # calculate layout
|
||||||
|
# coordinates
|
||||||
|
|
||||||
oPar <- par(mar= rep(0,4)) # Turn margins off
|
oPar <- par(mar= rep(0,4)) # Turn margins off, save graphics state
|
||||||
plot(myG200,
|
plot(myG200,
|
||||||
layout = myGxy,
|
layout = myGxy,
|
||||||
rescale = FALSE,
|
rescale = FALSE,
|
||||||
@ -216,7 +225,7 @@ plot(myG200,
|
|||||||
vertex.color=heat.colors(max(degree(myG200)+1))[degree(myG200)+1],
|
vertex.color=heat.colors(max(degree(myG200)+1))[degree(myG200)+1],
|
||||||
vertex.size = 150 + (60 * degree(myG200)),
|
vertex.size = 150 + (60 * degree(myG200)),
|
||||||
vertex.label = NA)
|
vertex.label = NA)
|
||||||
par(oPar)
|
par(oPar) # restore graphics state
|
||||||
|
|
||||||
# This graph has thirteen singletons and one large, connected component. Many
|
# This graph has thirteen singletons and one large, connected component. Many
|
||||||
# biological graphs look approximately like this.
|
# biological graphs look approximately like this.
|
||||||
@ -251,12 +260,15 @@ plot(log10(as.numeric(names(freqRank)) + 1),
|
|||||||
# stands for "preferential attachment". Preferential attachment is one type of
|
# stands for "preferential attachment". Preferential attachment is one type of
|
||||||
# process that will yield scale-free distributions.
|
# process that will yield scale-free distributions.
|
||||||
|
|
||||||
set.seed(31415927)
|
N <- 200
|
||||||
GBA <- sample_pa(200, power = 0.8, directed = FALSE)
|
|
||||||
|
set.seed(31415927) # set RNG seed for repeatable randomness
|
||||||
|
GBA <- sample_pa(N, power = 0.8, directed = FALSE)
|
||||||
|
set.seed(NULL) # reset the RNG
|
||||||
|
|
||||||
GBAxy <- layout_with_graphopt(GBA, charge=0.0001) # calculate layout coordinates
|
GBAxy <- layout_with_graphopt(GBA, charge=0.0001) # calculate layout coordinates
|
||||||
|
|
||||||
oPar <- par(mar= rep(0,4)) # Turn margins off
|
oPar <- par(mar= rep(0,4)) # Turn margins off, save graphics state
|
||||||
plot(GBA,
|
plot(GBA,
|
||||||
layout = GBAxy,
|
layout = GBAxy,
|
||||||
rescale = FALSE,
|
rescale = FALSE,
|
||||||
@ -265,7 +277,7 @@ plot(GBA,
|
|||||||
vertex.color=heat.colors(max(degree(GBA)+1))[degree(GBA)+1],
|
vertex.color=heat.colors(max(degree(GBA)+1))[degree(GBA)+1],
|
||||||
vertex.size = 200 + (30 * degree(GBA)),
|
vertex.size = 200 + (30 * degree(GBA)),
|
||||||
vertex.label = NA)
|
vertex.label = NA)
|
||||||
par(oPar)
|
par(oPar) # restore graphics state
|
||||||
|
|
||||||
# This is a very obviously different graph! Some biological networks have
|
# This is a very obviously different graph! Some biological networks have
|
||||||
# features that look like that - but in my experience the hub nodes are usually
|
# features that look like that - but in my experience the hub nodes are usually
|
||||||
@ -386,8 +398,10 @@ makeRandomGeometricAM <- function(nam, B = 25, Q = 0.001, t = 0.6) {
|
|||||||
# xlab = "d", ylab = "p(edge)")
|
# xlab = "d", ylab = "p(edge)")
|
||||||
|
|
||||||
# 200 node random geometric graph
|
# 200 node random geometric graph
|
||||||
set.seed(112358)
|
N <- 200
|
||||||
rGAM <- makeRandomGeometricAM(as.character(1:200), t=0.4)
|
set.seed(112358) # set RNG seed for repeatable randomness
|
||||||
|
rGAM <- makeRandomGeometricAM(as.character(1:N), t = 0.4)
|
||||||
|
set.seed(NULL) # reset the RNG
|
||||||
|
|
||||||
|
|
||||||
myGRG <- graph_from_adjacency_matrix(rGAM$mat, mode = "undirected")
|
myGRG <- graph_from_adjacency_matrix(rGAM$mat, mode = "undirected")
|
||||||
@ -539,20 +553,22 @@ names(c1)
|
|||||||
# considered to be more central. And that's also the way the force-directed
|
# considered to be more central. And that's also the way the force-directed
|
||||||
# layout draws them, obviously.
|
# layout draws them, obviously.
|
||||||
|
|
||||||
set.seed(112358)
|
set.seed(112358) # set RNG seed for repeatable randomness
|
||||||
myGxy <- layout_with_fr(myG) # calculate layout coordinates
|
myGxy <- layout_with_fr(myG) # calculate layout coordinates
|
||||||
oPar <- par(mar= rep(0,4)) # Turn margins off
|
set.seed(NULL) # reset the RNG
|
||||||
|
|
||||||
|
oPar <- par(mar = rep(0, 4)) # turn margins off, save graphics state
|
||||||
plot(myG,
|
plot(myG,
|
||||||
layout = myGxy,
|
layout = myGxy,
|
||||||
rescale = FALSE,
|
rescale = FALSE,
|
||||||
xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01),
|
xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01),
|
||||||
ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01),
|
ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01),
|
||||||
vertex.color=heat.colors(max(degree(myG)+1))[degree(myG)+1],
|
vertex.color=heat.colors(max(degree(myG) + 1))[degree(myG) + 1],
|
||||||
vertex.size = 20 + (10 * degree(myG)),
|
vertex.size = 20 + (10 * degree(myG)),
|
||||||
vertex.label = V(myG)$name,
|
vertex.label = V(myG)$name,
|
||||||
vertex.label.family = "sans",
|
vertex.label.family = "sans",
|
||||||
vertex.label.cex = 0.8)
|
vertex.label.cex = 0.8)
|
||||||
par(oPar)
|
par(oPar) # restore graphics state
|
||||||
|
|
||||||
# == 4.1 Diameter ==========================================================
|
# == 4.1 Diameter ==========================================================
|
||||||
|
|
||||||
|
@ -3,12 +3,13 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the FND-STA-Probability_distribution unit.
|
# R code accompanying the FND-STA-Probability_distribution unit.
|
||||||
#
|
#
|
||||||
# Version: 1.1
|
# Version: 1.2
|
||||||
#
|
#
|
||||||
# Date: 2017 10
|
# Date: 2017 10 - 2019 01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.2 Update set.seed() usage
|
||||||
# 1.1 Corrected empirical p-value
|
# 1.1 Corrected empirical p-value
|
||||||
# 1.0 First code live version
|
# 1.0 First code live version
|
||||||
#
|
#
|
||||||
@ -27,21 +28,21 @@
|
|||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> -----------------------------------------------------------------------
|
#TOC> -------------------------------------------------------------------------
|
||||||
#TOC> 1 Introduction 49
|
#TOC> 1 Introduction 50
|
||||||
#TOC> 2 Three fundamental distributions 112
|
#TOC> 2 Three fundamental distributions 113
|
||||||
#TOC> 2.1 The Poisson Distribution 115
|
#TOC> 2.1 The Poisson Distribution 116
|
||||||
#TOC> 2.2 The uniform distribution 168
|
#TOC> 2.2 The uniform distribution 170
|
||||||
#TOC> 2.3 The Normal Distribution 188
|
#TOC> 2.3 The Normal Distribution 190
|
||||||
#TOC> 3 quantile-quantile comparison 229
|
#TOC> 3 quantile-quantile comparison 231
|
||||||
#TOC> 3.1 qqnorm() 239
|
#TOC> 3.1 qqnorm() 241
|
||||||
#TOC> 3.2 qqplot() 299
|
#TOC> 3.2 qqplot() 307
|
||||||
#TOC> 4 Quantifying the difference 316
|
#TOC> 4 Quantifying the difference 324
|
||||||
#TOC> 4.1 Chi2 test for discrete distributions 350
|
#TOC> 4.1 Chi2 test for discrete distributions 359
|
||||||
#TOC> 4.2 Kullback-Leibler divergence 441
|
#TOC> 4.2 Kullback-Leibler divergence 451
|
||||||
#TOC> 4.2.1 An example from tossing dice 452
|
#TOC> 4.2.1 An example from tossing dice 462
|
||||||
#TOC> 4.2.2 An example from lognormal distributions 574
|
#TOC> 4.2.2 An example from lognormal distributions 585
|
||||||
#TOC> 4.3 Kolmogorov-Smirnov test for continuous distributions 616
|
#TOC> 4.3 Kolmogorov-Smirnov test for continuous distributions 628
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -151,6 +152,7 @@ set.seed(112358)
|
|||||||
for (i in 1:N) {
|
for (i in 1:N) {
|
||||||
x[i] <- sum(sample(genes, 250)) # sum of TFs in our sample in this trial
|
x[i] <- sum(sample(genes, 250)) # sum of TFs in our sample in this trial
|
||||||
}
|
}
|
||||||
|
set.seed(NULL)
|
||||||
|
|
||||||
(t <- table(x)/N)
|
(t <- table(x)/N)
|
||||||
|
|
||||||
@ -241,8 +243,10 @@ hist(v, breaks = 20, col = "#F8DDFF")
|
|||||||
# The functions qqnorm() and qqline() perform this
|
# The functions qqnorm() and qqline() perform this
|
||||||
# comparison with the normal distribution.
|
# comparison with the normal distribution.
|
||||||
|
|
||||||
set.seed(1112358)
|
set.seed(112358)
|
||||||
x <- rnorm(100, mean=0, sd=1) # 100 normally distributed balues
|
x <- rnorm(100, mean=0, sd=1) # 100 normally distributed values
|
||||||
|
set.seed(NULL)
|
||||||
|
|
||||||
qqnorm(x)
|
qqnorm(x)
|
||||||
qqline(x, col = "seagreen")
|
qqline(x, col = "seagreen")
|
||||||
|
|
||||||
@ -253,12 +257,15 @@ qqline(x, col = "seagreen")
|
|||||||
|
|
||||||
# Create a vector of sample means from the exponential distribution; use
|
# Create a vector of sample means from the exponential distribution; use
|
||||||
# only a few samples for the mean
|
# only a few samples for the mean
|
||||||
set.seed(112358)
|
|
||||||
x <- rexp(12345)
|
x <- rexp(12345)
|
||||||
v <- numeric(999)
|
v <- numeric(999)
|
||||||
|
|
||||||
|
set.seed(112358)
|
||||||
for (i in 1:length(v)) {
|
for (i in 1:length(v)) {
|
||||||
v[i] <- mean(sample(x, 12))
|
v[i] <- mean(sample(x, 12))
|
||||||
}
|
}
|
||||||
|
set.seed(NULL)
|
||||||
|
|
||||||
qqnorm(v)
|
qqnorm(v)
|
||||||
qqline(v, col = "turquoise") # normal
|
qqline(v, col = "turquoise") # normal
|
||||||
|
|
||||||
@ -288,13 +295,14 @@ rEVD <- numeric(9999)
|
|||||||
for (i in seq_along(rEVD)) {
|
for (i in seq_along(rEVD)) {
|
||||||
rEVD[i] <- max(rnorm(100))
|
rEVD[i] <- max(rnorm(100))
|
||||||
}
|
}
|
||||||
|
set.seed(NULL)
|
||||||
|
|
||||||
hist(rEVD, breaks = 20, col = "orchid")
|
hist(rEVD, breaks = 20, col = "orchid")
|
||||||
# Note the long tail on the right!
|
# Note the long tail on the right!
|
||||||
|
|
||||||
qqnorm(rEVD)
|
qqnorm(rEVD)
|
||||||
qqline(rEVD, col = "orchid") # normal
|
qqline(rEVD, col = "orchid") # Definitely not "normal"!
|
||||||
|
|
||||||
# Definitely not "normal"!
|
|
||||||
|
|
||||||
# == 3.2 qqplot() ==========================================================
|
# == 3.2 qqplot() ==========================================================
|
||||||
|
|
||||||
@ -331,6 +339,7 @@ dl2 <- dlnorm(x - 0.25) # log-normal distribution, shifted right (a bit)
|
|||||||
dg1.2 <- dgamma(x, shape=1.2) # three gamma distributions with...
|
dg1.2 <- dgamma(x, shape=1.2) # three gamma distributions with...
|
||||||
dg1.5 <- dgamma(x, shape=1.5) # ...wider, and wider...
|
dg1.5 <- dgamma(x, shape=1.5) # ...wider, and wider...
|
||||||
dg1.9 <- dgamma(x, shape=1.9) # ...peak
|
dg1.9 <- dgamma(x, shape=1.9) # ...peak
|
||||||
|
set.seed(NULL)
|
||||||
|
|
||||||
myCols <- c("black", "grey", "maroon", "turquoise", "steelblue")
|
myCols <- c("black", "grey", "maroon", "turquoise", "steelblue")
|
||||||
|
|
||||||
@ -361,6 +370,7 @@ rL2 <- rlnorm(N, meanlog = 0.25) # log-normal distribution, shifted right
|
|||||||
rG1.2 <- rgamma(N, shape=1.2) # three gamma distributions with...
|
rG1.2 <- rgamma(N, shape=1.2) # three gamma distributions with...
|
||||||
rG1.5 <- rgamma(N, shape=1.5) # ...wider, and wider...
|
rG1.5 <- rgamma(N, shape=1.5) # ...wider, and wider...
|
||||||
rG1.9 <- rgamma(N, shape=1.9) # ...peak
|
rG1.9 <- rgamma(N, shape=1.9) # ...peak
|
||||||
|
set.seed(NULL)
|
||||||
|
|
||||||
maxX <- max(c(rL1, rL2, rG1.2, rG1.5, rG1.9))
|
maxX <- max(c(rL1, rL2, rG1.2, rG1.5, rG1.9))
|
||||||
|
|
||||||
@ -459,6 +469,7 @@ chisq.test(countsL1, countsG1.9, simulate.p.value = TRUE, B = 10000)
|
|||||||
set.seed(47)
|
set.seed(47)
|
||||||
N <- 20
|
N <- 20
|
||||||
(counts <- table(sample(1:6, N, replace = TRUE)))
|
(counts <- table(sample(1:6, N, replace = TRUE)))
|
||||||
|
set.seed(NULL)
|
||||||
|
|
||||||
# We have not observed a "2"!
|
# We have not observed a "2"!
|
||||||
#
|
#
|
||||||
@ -597,6 +608,7 @@ for (i in 1:N) {
|
|||||||
q <- pmfPC(y, nam = 1:10) # convert to p.m.f. with pseudocounts
|
q <- pmfPC(y, nam = 1:10) # convert to p.m.f. with pseudocounts
|
||||||
divs[i] <- KLdiv(pmfL1, q) # calculate Kullback-Leibler divergence
|
divs[i] <- KLdiv(pmfL1, q) # calculate Kullback-Leibler divergence
|
||||||
}
|
}
|
||||||
|
set.seed(NULL)
|
||||||
|
|
||||||
hist(divs,
|
hist(divs,
|
||||||
col = "thistle",
|
col = "thistle",
|
||||||
@ -605,7 +617,7 @@ hist(divs,
|
|||||||
abline(v = KLdiv(pmfL1, pmfL2), col="firebrick")
|
abline(v = KLdiv(pmfL1, pmfL2), col="firebrick")
|
||||||
|
|
||||||
# How many KL-divergences were less than the difference we observed?
|
# How many KL-divergences were less than the difference we observed?
|
||||||
sum(divs < KLdiv(pmfL1, pmfL2)) #933
|
sum(divs < KLdiv(pmfL1, pmfL2)) # 933
|
||||||
|
|
||||||
# Therefore the empirical p-value that the samples came from the same
|
# Therefore the empirical p-value that the samples came from the same
|
||||||
# distribution is only 100 * ((N - 933) + 1) / (N + 1) (%) ... 6.8%. You see
|
# distribution is only 100 * ((N - 933) + 1) / (N + 1) (%) ... 6.8%. You see
|
||||||
|
@ -3,12 +3,13 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the FND-STA-Significance unit.
|
# R code accompanying the FND-STA-Significance unit.
|
||||||
#
|
#
|
||||||
# Version: 1.1
|
# Version: 1.2
|
||||||
#
|
#
|
||||||
# Date: 2017 09 - 2017 10
|
# Date: 2017 09 - 2019 01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.2 Update set.seed() usage
|
||||||
# 1.1 Corrected treatment of empirical p-value
|
# 1.1 Corrected treatment of empirical p-value
|
||||||
# 1.0 First contents
|
# 1.0 First contents
|
||||||
#
|
#
|
||||||
@ -26,15 +27,15 @@
|
|||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ------------------------------------------------------------
|
#TOC> ------------------------------------------------------------------
|
||||||
#TOC> 1 Significance and p-value 42
|
#TOC> 1 Significance and p-value 43
|
||||||
#TOC> 1.1 Significance levels 53
|
#TOC> 1.1 Significance levels 54
|
||||||
#TOC> 1.2 probability and p-value 70
|
#TOC> 1.2 probability and p-value 71
|
||||||
#TOC> 1.2.1 p-value illustrated 100
|
#TOC> 1.2.1 p-value illustrated 103
|
||||||
#TOC> 2 One- or two-sided 153
|
#TOC> 2 One- or two-sided 158
|
||||||
#TOC> 3 Significance by integration 193
|
#TOC> 3 Significance by integration 198
|
||||||
#TOC> 4 Significance by simulation or permutation 199
|
#TOC> 4 Significance by simulation or permutation 204
|
||||||
#TOC> 5 Final tasks 302
|
#TOC> 5 Final tasks 312
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -75,6 +76,8 @@
|
|||||||
|
|
||||||
set.seed(sqrt(5))
|
set.seed(sqrt(5))
|
||||||
x <- rnorm(1)
|
x <- rnorm(1)
|
||||||
|
set.seed(NULL)
|
||||||
|
|
||||||
print(x, digits = 22)
|
print(x, digits = 22)
|
||||||
# [1] -0.8969145466249813791748
|
# [1] -0.8969145466249813791748
|
||||||
|
|
||||||
@ -102,8 +105,10 @@ print(x, digits = 22)
|
|||||||
# Let's illustrate. First we draw a million random values from our
|
# Let's illustrate. First we draw a million random values from our
|
||||||
# standard, normal distribution:
|
# standard, normal distribution:
|
||||||
|
|
||||||
set.seed(112358)
|
N <- 1e6 # one million
|
||||||
r <- rnorm(1000000)
|
set.seed(112358) # set RNG seed for repeatable randomness
|
||||||
|
r <- rnorm(N) # N values from a normal distribution
|
||||||
|
set.seed(NULL) # reset the RNG
|
||||||
|
|
||||||
# Let's see what the distribution looks like:
|
# Let's see what the distribution looks like:
|
||||||
|
|
||||||
@ -277,9 +282,14 @@ chSep <- function(v) {
|
|||||||
chSep(v)
|
chSep(v)
|
||||||
|
|
||||||
# Now we can produce a random permutation of v, and recalculate
|
# Now we can produce a random permutation of v, and recalculate
|
||||||
set.seed(pi)
|
|
||||||
|
set.seed(pi) # set RNG seed for repeatable randomness
|
||||||
w <- sample(v, length(v)) # This shuffles the vector v. Memorize this
|
w <- sample(v, length(v)) # This shuffles the vector v. Memorize this
|
||||||
# code paradigm. It is very useful.
|
# code paradigm. It is very useful.
|
||||||
|
set.seed(NULL) # reset the RNG
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
chSep(w)
|
chSep(w)
|
||||||
# 3.273 ... that's actually less than what we had before.
|
# 3.273 ... that's actually less than what we had before.
|
||||||
|
|
||||||
|
@ -489,7 +489,8 @@ for (name in toupper(myControls)) {
|
|||||||
|
|
||||||
# == 5.1 Final task: Gene descriptions =====================================
|
# == 5.1 Final task: Gene descriptions =====================================
|
||||||
|
|
||||||
# Print the descriptions of the top ten differentially expressed genes.
|
# Print the descriptions of the top ten differentially expressed genes
|
||||||
|
# and comment on what they have in common (or not).
|
||||||
|
|
||||||
|
|
||||||
# = 6 Improving on Discovery by Differential Expression ===================
|
# = 6 Improving on Discovery by Differential Expression ===================
|
||||||
@ -617,9 +618,9 @@ GPL1914 <- getGEO("GPL1914")
|
|||||||
str(GPL1914)
|
str(GPL1914)
|
||||||
|
|
||||||
# ... from which we can get the data - which is however NOT necessarily
|
# ... from which we can get the data - which is however NOT necessarily
|
||||||
# matched to the rows of our expression dataset. Note that here to: the majority
|
# matched to the rows of our expression dataset. Note that here too: the
|
||||||
# of data elements are factors and will likely have to be converted before
|
# majority of data elements are factors and will likely have to be converted
|
||||||
# use.
|
# before use.
|
||||||
|
|
||||||
|
|
||||||
# [END]
|
# [END]
|
||||||
|
@ -3,12 +3,13 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the RPR-Genetic_code_optimality unit.
|
# R code accompanying the RPR-Genetic_code_optimality unit.
|
||||||
#
|
#
|
||||||
# Version: 1.0.1
|
# Version: 1.1
|
||||||
#
|
#
|
||||||
# Date: 2017 10 16
|
# Date: 2017 10 - 2019 01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.1 Update set.seed() usage
|
||||||
# 1.0.1 Fixed two bugs discovered by Suan Chin Yeo.
|
# 1.0.1 Fixed two bugs discovered by Suan Chin Yeo.
|
||||||
# 1.0 New material.
|
# 1.0 New material.
|
||||||
#
|
#
|
||||||
@ -28,17 +29,17 @@
|
|||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> --------------------------------------------------------
|
#TOC> --------------------------------------------------------------
|
||||||
#TOC> 1 Designing a computational experiment 57
|
#TOC> 1 Designing a computational experiment 54
|
||||||
#TOC> 2 Setting up the tools 73
|
#TOC> 2 Setting up the tools 70
|
||||||
#TOC> 2.1 Natural and alternative genetic codes 76
|
#TOC> 2.1 Natural and alternative genetic codes 73
|
||||||
#TOC> 2.2 Effect of mutations 135
|
#TOC> 2.2 Effect of mutations 132
|
||||||
#TOC> 2.2.1 reverse-translate 146
|
#TOC> 2.2.1 reverse-translate 143
|
||||||
#TOC> 2.2.2 Randomly mutate 171
|
#TOC> 2.2.2 Randomly mutate 168
|
||||||
#TOC> 2.2.3 Forward- translate 196
|
#TOC> 2.2.3 Forward- translate 193
|
||||||
#TOC> 2.2.4 measure effect 214
|
#TOC> 2.2.4 measure effect 211
|
||||||
#TOC> 3 Run the experiment 261
|
#TOC> 3 Run the experiment 258
|
||||||
#TOC> 4 Task solutions 348
|
#TOC> 4 Task solutions 351
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -269,18 +270,21 @@ myAA <- traFor(myDNA, GENETIC_CODE)
|
|||||||
# Mutate and evaluate
|
# Mutate and evaluate
|
||||||
set.seed(112358)
|
set.seed(112358)
|
||||||
x <- randMut(myDNA)
|
x <- randMut(myDNA)
|
||||||
|
set.seed(NULL)
|
||||||
x <- traFor(x, GENETIC_CODE)
|
x <- traFor(x, GENETIC_CODE)
|
||||||
evalMut(myAA, x) # 166.4
|
evalMut(myAA, x) # 166.4
|
||||||
|
|
||||||
# Try this 200 times, and see how the values are distributed.
|
# Try this 200 times, and see how the values are distributed.
|
||||||
set.seed(112358)
|
|
||||||
N <- 200
|
N <- 200
|
||||||
valUGC <- numeric(N)
|
valUGC <- numeric(N)
|
||||||
|
|
||||||
|
set.seed(112358) # set RNG seed for repeatable randomness
|
||||||
for (i in 1:N) {
|
for (i in 1:N) {
|
||||||
x <- randMut(myDNA) # mutate
|
x <- randMut(myDNA) # mutate
|
||||||
x <- traFor(x, GENETIC_CODE) # translate
|
x <- traFor(x, GENETIC_CODE) # translate
|
||||||
valUGC[i] <- evalMut(myAA, x) # evaluate
|
valUGC[i] <- evalMut(myAA, x) # evaluate
|
||||||
}
|
}
|
||||||
|
set.seed(NULL) # reset the RNG
|
||||||
|
|
||||||
hist(valUGC,
|
hist(valUGC,
|
||||||
breaks = 15,
|
breaks = 15,
|
||||||
@ -299,6 +303,7 @@ effectUGC <- mean(valUGC) # 178.1
|
|||||||
set.seed(112358)
|
set.seed(112358)
|
||||||
# choose a new code
|
# choose a new code
|
||||||
GC <- randomGC(GENETIC_CODE)
|
GC <- randomGC(GENETIC_CODE)
|
||||||
|
set.seed(NULL)
|
||||||
|
|
||||||
# reverse translate hypothetical sequence according to the new code
|
# reverse translate hypothetical sequence according to the new code
|
||||||
x <- traRev(myAA, GC)
|
x <- traRev(myAA, GC)
|
||||||
@ -311,9 +316,10 @@ evalMut(myAA, x) # evaluate mutation effects: 298.5
|
|||||||
# Let's try with different genetic codes. 200 trials - but this time every trial
|
# Let's try with different genetic codes. 200 trials - but this time every trial
|
||||||
# is with a different, synthetic genetic code.
|
# is with a different, synthetic genetic code.
|
||||||
|
|
||||||
set.seed(1414214)
|
|
||||||
N <- 200
|
N <- 200
|
||||||
valXGC <- numeric(N)
|
valXGC <- numeric(N)
|
||||||
|
|
||||||
|
set.seed(1414214) # set RNG seed for repeatable randomness
|
||||||
for (i in 1:N) {
|
for (i in 1:N) {
|
||||||
GC <- randomGC(GENETIC_CODE) # Choose code
|
GC <- randomGC(GENETIC_CODE) # Choose code
|
||||||
x <- traRev(myAA, GC) # reverse translate
|
x <- traRev(myAA, GC) # reverse translate
|
||||||
@ -321,6 +327,7 @@ for (i in 1:N) {
|
|||||||
x <- traFor(x, GC) # translate
|
x <- traFor(x, GC) # translate
|
||||||
valXGC[i] <- evalMut(myAA, x) # evaluate
|
valXGC[i] <- evalMut(myAA, x) # evaluate
|
||||||
}
|
}
|
||||||
|
set.seed(NULL) # reset the RNG
|
||||||
|
|
||||||
hist(valXGC,
|
hist(valXGC,
|
||||||
col = "plum",
|
col = "plum",
|
||||||
@ -343,9 +350,10 @@ hist(valXGC,
|
|||||||
|
|
||||||
# = 4 Task solutions ======================================================
|
# = 4 Task solutions ======================================================
|
||||||
|
|
||||||
set.seed(2718282)
|
|
||||||
N <- 200
|
N <- 200
|
||||||
valSGC <- numeric(N)
|
valSGC <- numeric(N)
|
||||||
|
|
||||||
|
set.seed(2718282) # set RNG seed for repeatable randomness
|
||||||
for (i in 1:N) {
|
for (i in 1:N) {
|
||||||
GC <- swappedGC(GENETIC_CODE) # Choose code
|
GC <- swappedGC(GENETIC_CODE) # Choose code
|
||||||
x <- traRev(myAA, GC) # reverse translate
|
x <- traRev(myAA, GC) # reverse translate
|
||||||
@ -353,6 +361,7 @@ for (i in 1:N) {
|
|||||||
x <- traFor(x, GC) # translate
|
x <- traFor(x, GC) # translate
|
||||||
valSGC[i] <- evalMut(myAA, x) # evaluate
|
valSGC[i] <- evalMut(myAA, x) # evaluate
|
||||||
}
|
}
|
||||||
|
set.seed(NULL) # reset the RNG
|
||||||
|
|
||||||
hist(valSGC,
|
hist(valSGC,
|
||||||
col = "#6688FF88",
|
col = "#6688FF88",
|
||||||
|
Loading…
Reference in New Issue
Block a user