Maintenance updates, and revised set.seed() usage
This commit is contained in:
parent
2ab162e375
commit
6f54293592
@ -25,10 +25,11 @@
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------
|
||||
#TOC> -----------------------------------------------
|
||||
#TOC> 1 Preparations 39
|
||||
#TOC> 2 Suitable MYSPE Species 51
|
||||
#TOC> 3 Adopt "MYSPE" 65
|
||||
@ -71,6 +72,7 @@ if (! exists("myStudentNumber")) {
|
||||
load("data/MYSPEspecies.RData") # load the species names
|
||||
set.seed(myStudentNumber) # seed the random number generator
|
||||
MYSPE <- sample(MYSPEspecies, 1) # pick a species at random
|
||||
set.seed(NULL) # reset the random number generator
|
||||
# write the result to your personalized profile data so we can use the result in
|
||||
# other functions
|
||||
cat(sprintf("MYSPE <- \"%s\"\n", MYSPE), file = ".myProfile.R", append = TRUE)
|
||||
@ -80,7 +82,7 @@ biCode(MYSPE) # and what is it's "BiCode" ... ?
|
||||
|
||||
# Task: Note down the species name and its five letter label on your Student
|
||||
# Wiki user page. Use this species whenever this or future assignments refer
|
||||
# to MYSPE. In code, we will automatically load it from your.myProfile.R file.
|
||||
# to MYSPE. In code, we will automatically load it from your .myProfile.R file.
|
||||
|
||||
|
||||
# [END]
|
||||
|
@ -30,7 +30,7 @@
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> --------------------------------------------
|
||||
#TOC> --------------------------------------------------
|
||||
#TOC> 1 Preparation and Tree Plot 43
|
||||
#TOC> 2 Tree Analysis 82
|
||||
#TOC> 2.1 Rooting Trees 141
|
||||
@ -269,13 +269,14 @@ rtree(n = length(apsTree2$tip.label), # number of tips
|
||||
# compare them anyway.
|
||||
|
||||
# Let's compute some random trees this way, calculate the distances to
|
||||
# fungiTree, and then compare the values we get for apsTree2:
|
||||
# fungiTree, and then compare the values we get for apsTree2. The random
|
||||
# trees are provided by ape::rtree().
|
||||
|
||||
set.seed(112358)
|
||||
N <- 10000 # takes about 15 seconds
|
||||
myTreeDistances <- matrix(numeric(N * 2), ncol = 2)
|
||||
colnames(myTreeDistances) <- c("symm", "path")
|
||||
|
||||
set.seed(112358)
|
||||
for (i in 1:N) {
|
||||
xTree <- rtree(n = length(apsTree2$tip.label),
|
||||
rooted = TRUE,
|
||||
@ -283,6 +284,7 @@ for (i in 1:N) {
|
||||
br = NULL)
|
||||
myTreeDistances[i, ] <- treedist(fungiTree, xTree)
|
||||
}
|
||||
set.seed(NULL) # reset the random number generator
|
||||
|
||||
table(myTreeDistances[, "symm"])
|
||||
|
||||
|
@ -28,14 +28,14 @@
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------------------------
|
||||
#TOC> ---------------------------------------------------------------
|
||||
#TOC> 1 Setup and data 43
|
||||
#TOC> 2 Functional Edges in the Human Proteome 80
|
||||
#TOC> 2.1 Cliques 123
|
||||
#TOC> 2.2 Communities 164
|
||||
#TOC> 2.3 Betweenness Centrality 176
|
||||
#TOC> 3 biomaRt 220
|
||||
#TOC> 4 Task for submission 291
|
||||
#TOC> 2.3 Betweenness Centrality 178
|
||||
#TOC> 3 biomaRt 224
|
||||
#TOC> 4 Task for submission 295
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
@ -163,8 +163,10 @@ par(oPar)
|
||||
|
||||
# == 2.2 Communities =======================================================
|
||||
|
||||
set.seed(112358)
|
||||
set.seed(112358) # set RNG seed for repeatable randomness
|
||||
gSTRclusters <- cluster_infomap(gSTR)
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
modularity(gSTRclusters) # ... measures how separated the different membership
|
||||
# types are from each other
|
||||
tMem <- table(membership(gSTRclusters))
|
||||
@ -205,9 +207,11 @@ head(sBC)
|
||||
|
||||
# We are going to use these IDs to produce some output for a submitted task:
|
||||
# so I need you to personalize ENSPsel with the following
|
||||
# two lines of code:
|
||||
# three lines of code:
|
||||
|
||||
set.seed(<myStudentNumber>) # enter your student number here
|
||||
(ENSPsel <- sample(ENSPsel))
|
||||
set.seed(NULL) # reset the random number generator
|
||||
|
||||
# Next, to find what these proteins are...
|
||||
|
||||
|
109
BIN-Sequence.R
109
BIN-Sequence.R
@ -3,12 +3,13 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-Sequence unit.
|
||||
#
|
||||
# Version: 1.2
|
||||
# Version: 1.3
|
||||
#
|
||||
# Date: 2017 09 - 2017 10
|
||||
# Date: 2017 09 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.3 Update set.seed() usage
|
||||
# 1.2 Removed irrelevant task. How did that even get in there? smh
|
||||
# 1.1 Add chartr()
|
||||
# 1.0 First live version 2017.
|
||||
@ -28,20 +29,24 @@
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ----------------------------------------------
|
||||
#TOC> 1 Prepare 55
|
||||
#TOC> 2 Storing Sequence 73
|
||||
#TOC> 3 String properties 102
|
||||
#TOC> 4 Substrings 109
|
||||
#TOC> 5 Creating strings: sprintf() 115
|
||||
#TOC> 6 Changing strings 146
|
||||
#TOC> 6.1 stringi and stringr 198
|
||||
#TOC> 6.2 dbSanitizeSequence() 208
|
||||
#TOC> 7 Permuting and sampling 220
|
||||
#TOC> 7.1 Permutations 227
|
||||
#TOC> 7.2 Sampling 270
|
||||
#TOC> 7.2.1 Equiprobable characters 272
|
||||
#TOC> 7.2.2 Defined probability vector 312
|
||||
#TOC> ----------------------------------------------------
|
||||
#TOC> 1 Prepare 60
|
||||
#TOC> 2 Storing Sequence 78
|
||||
#TOC> 3 String properties 107
|
||||
#TOC> 4 Substrings 114
|
||||
#TOC> 5 Creating strings: sprintf() 135
|
||||
#TOC> 6 Changing strings 170
|
||||
#TOC> 6.1.1 Changing case 172
|
||||
#TOC> 6.1.2 Reverse 177
|
||||
#TOC> 6.1.3 Change characters 181
|
||||
#TOC> 6.1.4 Substitute characters 209
|
||||
#TOC> 6.2 stringi and stringr 229
|
||||
#TOC> 6.3 dbSanitizeSequence() 239
|
||||
#TOC> 7 Permuting and sampling 251
|
||||
#TOC> 7.1 Permutations 258
|
||||
#TOC> 7.2 Sampling 304
|
||||
#TOC> 7.2.1 Equiprobable characters 306
|
||||
#TOC> 7.2.2 Defined probability vector 348
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
@ -111,16 +116,31 @@ nchar(s) # aha
|
||||
# Use the substr() function
|
||||
substr(s, 2, 4)
|
||||
|
||||
# or the similar substring()
|
||||
substring(s, 2, 4)
|
||||
|
||||
# Note: both functions are vectorized (i.e. they operate on vectors
|
||||
# of arguments, you don't need to loop over input)...
|
||||
myBiCodes <- c("HOMSA", "MUSMU", "FUGRU", "XENLA")
|
||||
substr( myBiCodes, 1, 3)
|
||||
substring(myBiCodes, 1, 3)
|
||||
|
||||
# ... however only substring() will also use vectors for start and stop
|
||||
s <- "gatattgtgatgacccagtaa" # a DNA sequence
|
||||
(i <- seq(1, nchar(s), by = 3)) # an index vector
|
||||
substr( s, i, i+2) # ... returns only the first nucleotide triplet
|
||||
substring(s, i, i+2) # ... returns all triplets
|
||||
|
||||
|
||||
# = 5 Creating strings: sprintf() =========================================
|
||||
|
||||
|
||||
# Sprintf is a very smart, very powerful function and has cognates in all
|
||||
# other programming languages. It has a small learning curve, but it's
|
||||
# other programming languages. It has a bit of a learning curve, but this is
|
||||
# totally worth it:
|
||||
# the function takes a format string, and a list of other arguments. It returns
|
||||
# a formatted string. Here are some examples - watch carefully for sprintf()
|
||||
# calls in other code.
|
||||
# calls elsewhere in the code.
|
||||
|
||||
sprintf("Just a string.")
|
||||
sprintf("A string and the number %d.", 5)
|
||||
@ -128,32 +148,37 @@ sprintf("More numbers: %d ate %d.", 7, 9) # Sorry
|
||||
sprintf("Pi is ~ %1.2f ...", pi)
|
||||
sprintf("or more accurately ~ %1.11f.", pi)
|
||||
x <- "bottles of beer"
|
||||
n <- 99
|
||||
N <- 99
|
||||
sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.",
|
||||
n, x, n, x, "one down, and pass it around", n-1, x)
|
||||
N, x, N, x, "one down, and pass it around", N - 1, x)
|
||||
|
||||
# Note that in the last example, the value of the string was displayed with
|
||||
# R's usual print-formatting function and therefore the line-break "\n" did
|
||||
# not actually break the line. To have line breaks, tabs etc, you need to use
|
||||
# cat() to display the string:
|
||||
|
||||
for (i in 99:95) {
|
||||
for (i in N:(N-4)) {
|
||||
cat(sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.\n\n",
|
||||
i, x, i, x, "one down, and pass it around", i-1, x))
|
||||
i, x, i, x, "one down, and pass it around", i - 1, x))
|
||||
}
|
||||
|
||||
# sprintf() is vectorized: if one of its parameters is a vector, it
|
||||
# will generate one output string for each of the vector's elements:
|
||||
cat(sprintf("\n%s fish", c("one", "two", "red", "blue")))
|
||||
|
||||
|
||||
# = 6 Changing strings ====================================================
|
||||
|
||||
# Changing case
|
||||
# === 6.1.1 Changing case
|
||||
tolower(s)
|
||||
toupper(tolower(s))
|
||||
|
||||
|
||||
#reverse
|
||||
# === 6.1.2 Reverse
|
||||
reverse(s)
|
||||
|
||||
|
||||
# === 6.1.3 Change characters
|
||||
# chartr(old, new, x) maps all characters in x that appear in "old" to the
|
||||
# corresponding character in "new."
|
||||
|
||||
@ -167,15 +192,21 @@ chartr(paste0(letters, collapse = ""),
|
||||
|
||||
# One amusing way to use the function is for a reversible substitution
|
||||
# cypher.
|
||||
set.seed(112358)
|
||||
myCypher <- paste0(sample(letters), collapse = "")
|
||||
lett <- paste0(letters, collapse = "")
|
||||
set.seed(112358) # set RNG seed for repeatable randomness
|
||||
(myCypher <- paste0(sample(letters), collapse = ""))
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
(lett <- paste0(letters, collapse = ""))
|
||||
|
||||
# encode ...
|
||||
(x <- chartr(lett, myCypher, "... seven for a secret, never to be told."))
|
||||
|
||||
# decode ...
|
||||
chartr(myCypher, lett, x)
|
||||
# (Nb. substitution cyphers are easy to crack!)
|
||||
|
||||
|
||||
# substituing characters
|
||||
# === 6.1.4 Substitute characters
|
||||
(s <- gsub("IV", "i-v", s)) # gsub can change length, first argument is
|
||||
# a "regular expression"!
|
||||
|
||||
@ -195,7 +226,7 @@ MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")
|
||||
# remove "whitespace" (spaces, tabs, line breaks)...
|
||||
(s <- gsub("\\s", "", s))
|
||||
|
||||
# == 6.1 stringi and stringr ===============================================
|
||||
# == 6.2 stringi and stringr ===============================================
|
||||
|
||||
# But there are also specialized functions eg. to remove leading/trailing
|
||||
# whitespace which may be important to sanitize user input etc. Have a look at
|
||||
@ -205,7 +236,7 @@ MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")
|
||||
|
||||
|
||||
|
||||
# == 6.2 dbSanitizeSequence() ==============================================
|
||||
# == 6.3 dbSanitizeSequence() ==============================================
|
||||
|
||||
# In our learning units, we use a function dbSanitizeSequence() to clean up
|
||||
# sequences that may be copy/pasted from Web-sources
|
||||
@ -254,10 +285,13 @@ mean(which(x == "K")) # ... gives us the average of the permuted sequence.
|
||||
(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
|
||||
N <- 10000
|
||||
d <- numeric(N)
|
||||
set.seed(112358)
|
||||
|
||||
set.seed(112358) # set RNG seed for repeatable randomness
|
||||
for (i in 1:N) {
|
||||
d[i] <- mean(which(sample(s, length(s)) == "K"))
|
||||
}
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
hist(d, breaks = 20)
|
||||
abline(v = 2.5, lwd = 2, col = "firebrick")
|
||||
sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the
|
||||
@ -276,8 +310,10 @@ sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the
|
||||
|
||||
nuc <- c("A", "C", "G", "T")
|
||||
N <- 100
|
||||
set.seed(16818)
|
||||
|
||||
set.seed(16818) # set RNG seed for repeatable randomness
|
||||
v <- sample(nuc, N, replace = TRUE)
|
||||
set.seed(NULL) # reset the RNG
|
||||
(mySeq <- paste(v, collapse = ""))
|
||||
|
||||
# What's the GC content?
|
||||
@ -297,7 +333,7 @@ if (! require(stringi, quietly=TRUE)) {
|
||||
# data(package = "stringi") # available datasets
|
||||
|
||||
|
||||
(x <- stri_match_all(mySeq, regex = "CG"))
|
||||
# Find all "CG" matches in mySeq. The namespace is "stringi" (the package
# loaded via require() above), not "stri".
(x <- stringi::stri_match_all(mySeq, regex = "CG"))
|
||||
length(unlist(x))
|
||||
|
||||
# Now you could compare that number with yeast DNA sequences, and determine
|
||||
@ -323,9 +359,12 @@ c(rep("C", 19), rep("G", 19), rep(c("A"), 31), rep(c("T"), 31))
|
||||
|
||||
nuc <- c("A", "C", "G", "T")
|
||||
N <- 100
|
||||
set.seed(16818)
|
||||
myProb <- c(0.31, 0.19, 0.19, 0.31) # sampling probabilities
|
||||
|
||||
set.seed(16818) # set RNG seed for repeatable randomness
|
||||
v <- sample(nuc, N, prob = myProb, replace = TRUE)
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
(mySeq <- paste(v, collapse = ""))
|
||||
|
||||
# What's the GC content?
|
||||
@ -333,7 +372,7 @@ table(v)
|
||||
sum(table(v)[c("G", "C")]) # Close to expected
|
||||
|
||||
# What's the number of CpG motifs?
|
||||
(x <- stri_match_all(mySeq, regex = "CG"))
|
||||
(x <- stringi::stri_match_all(mySeq, regex = "CG"))
|
||||
# ... not a single one in this case.
|
||||
|
||||
|
||||
|
@ -3,12 +3,13 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the FND-MAT-Graphs_and_networks unit.
|
||||
#
|
||||
# Version: 1.0
|
||||
# Version: 1.1
|
||||
#
|
||||
# Date: 2017 10 06
|
||||
# Date: 2017 10 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.1 Update set.seed() usage
|
||||
# 1.0 First final version for learning units.
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
@ -28,18 +29,18 @@
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ------------------------------------------------------
|
||||
#TOC> 1 Review 52
|
||||
#TOC> ------------------------------------------------------------
|
||||
#TOC> 1 Review 48
|
||||
#TOC> 2 DEGREE DISTRIBUTIONS 201
|
||||
#TOC> 2.1 Random graph 207
|
||||
#TOC> 2.2 scale-free graph (Barabasi-Albert) 251
|
||||
#TOC> 2.3 Random geometric graph 313
|
||||
#TOC> 3 A CLOSER LOOK AT THE igraph PACKAGE 433
|
||||
#TOC> 3.1 Basics 436
|
||||
#TOC> 3.2 Components 508
|
||||
#TOC> 4 RANDOM GRAPHS AND GRAPH METRICS 527
|
||||
#TOC> 4.1 Diameter 562
|
||||
#TOC> 5 GRAPH CLUSTERING 630
|
||||
#TOC> 2.2 scale-free graph (Barabasi-Albert) 255
|
||||
#TOC> 2.3 Random geometric graph 320
|
||||
#TOC> 3 A CLOSER LOOK AT THE igraph PACKAGE 442
|
||||
#TOC> 3.1 Basics 445
|
||||
#TOC> 3.2 Components 517
|
||||
#TOC> 4 RANDOM GRAPHS AND GRAPH METRICS 536
|
||||
#TOC> 4.1 Diameter 573
|
||||
#TOC> 5 GRAPH CLUSTERING 641
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
@ -57,7 +58,7 @@
|
||||
|
||||
# To begin let's write a little function that will create random "gene" names;
|
||||
# there's no particular purpose to this other than to make our graphs look a
|
||||
# little more "biological ...
|
||||
# little more "biological" ...
|
||||
makeRandomGenenames <- function(N) {
|
||||
nam <- character()
|
||||
while (length(nam) < N) {
|
||||
@ -72,8 +73,9 @@ makeRandomGenenames <- function(N) {
|
||||
|
||||
N <- 20
|
||||
|
||||
set.seed(112358)
|
||||
set.seed(112358) # set RNG seed for repeatable randomness
|
||||
(Nnames <- makeRandomGenenames(N))
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
# One way to represent graphs in a computer is as an "adjacency matrix". In this
|
||||
# matrix, each row and each column represents a node, and the cell at the
|
||||
@ -112,8 +114,9 @@ makeRandomAM <- function(nam, p = 0.1) {
|
||||
return(AM)
|
||||
}
|
||||
|
||||
set.seed(112358)
|
||||
set.seed(112358) # set RNG seed for repeatable randomness
|
||||
(myRandAM <- makeRandomAM(Nnames, p = 0.09))
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
|
||||
# Listing the matrix is not very informative - we should plot this graph. The
|
||||
@ -131,8 +134,10 @@ if (! require(igraph, quietly=TRUE)) {
|
||||
|
||||
|
||||
myG <- graph_from_adjacency_matrix(myRandAM, mode = "undirected")
|
||||
set.seed(112358)
|
||||
|
||||
set.seed(112358) # set RNG seed for repeatable randomness
|
||||
myGxy <- layout_with_graphopt(myG, charge=0.0012) # calculate layout coordinates
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
|
||||
# The igraph package adds its own function to the collection of plot()
|
||||
@ -201,13 +206,17 @@ axis(side = 1, at = 0:7)
|
||||
|
||||
# == 2.1 Random graph ======================================================
|
||||
|
||||
N <- 200
|
||||
|
||||
set.seed(31415927) # set RNG seed for repeatable randomness
|
||||
my200AM <- makeRandomAM(as.character(1:N), p = 0.015)
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
set.seed(31415927)
|
||||
my200AM <- makeRandomAM(as.character(1:200), p = 0.015)
|
||||
myG200 <- graph_from_adjacency_matrix(my200AM, mode = "undirected")
|
||||
myGxy <- layout_with_graphopt(myG200, charge=0.0001) # calculate layout coordinates
|
||||
myGxy <- layout_with_graphopt(myG200, charge=0.0001) # calculate layout
|
||||
# coordinates
|
||||
|
||||
oPar <- par(mar= rep(0,4)) # Turn margins off
|
||||
oPar <- par(mar= rep(0,4)) # Turn margins off, save graphics state
|
||||
plot(myG200,
|
||||
layout = myGxy,
|
||||
rescale = FALSE,
|
||||
@ -216,7 +225,7 @@ plot(myG200,
|
||||
vertex.color=heat.colors(max(degree(myG200)+1))[degree(myG200)+1],
|
||||
vertex.size = 150 + (60 * degree(myG200)),
|
||||
vertex.label = NA)
|
||||
par(oPar)
|
||||
par(oPar) # restore graphics state
|
||||
|
||||
# This graph has thirteen singletons and one large, connected component. Many
|
||||
# biological graphs look approximately like this.
|
||||
@ -251,12 +260,15 @@ plot(log10(as.numeric(names(freqRank)) + 1),
|
||||
# stands for "preferential attachment". Preferential attachment is one type of
|
||||
# process that will yield scale-free distributions.
|
||||
|
||||
set.seed(31415927)
|
||||
GBA <- sample_pa(200, power = 0.8, directed = FALSE)
|
||||
N <- 200
|
||||
|
||||
set.seed(31415927) # set RNG seed for repeatable randomness
|
||||
GBA <- sample_pa(N, power = 0.8, directed = FALSE)
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
GBAxy <- layout_with_graphopt(GBA, charge=0.0001) # calculate layout coordinates
|
||||
|
||||
oPar <- par(mar= rep(0,4)) # Turn margins off
|
||||
oPar <- par(mar= rep(0,4)) # Turn margins off, save graphics state
|
||||
plot(GBA,
|
||||
layout = GBAxy,
|
||||
rescale = FALSE,
|
||||
@ -265,7 +277,7 @@ plot(GBA,
|
||||
vertex.color=heat.colors(max(degree(GBA)+1))[degree(GBA)+1],
|
||||
vertex.size = 200 + (30 * degree(GBA)),
|
||||
vertex.label = NA)
|
||||
par(oPar)
|
||||
par(oPar) # restore graphics state
|
||||
|
||||
# This is a very obviously different graph! Some biological networks have
|
||||
# features that look like that - but in my experience the hub nodes are usually
|
||||
@ -386,8 +398,10 @@ makeRandomGeometricAM <- function(nam, B = 25, Q = 0.001, t = 0.6) {
|
||||
# xlab = "d", ylab = "p(edge)")
|
||||
|
||||
# 200 node random geometric graph
|
||||
set.seed(112358)
|
||||
rGAM <- makeRandomGeometricAM(as.character(1:200), t=0.4)
|
||||
N <- 200
|
||||
set.seed(112358) # set RNG seed for repeatable randomness
|
||||
rGAM <- makeRandomGeometricAM(as.character(1:N), t = 0.4)
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
|
||||
myGRG <- graph_from_adjacency_matrix(rGAM$mat, mode = "undirected")
|
||||
@ -539,20 +553,22 @@ names(c1)
|
||||
# considered to be more central. And that's also the way the force-directed
|
||||
# layout draws them, obviously.
|
||||
|
||||
set.seed(112358)
|
||||
set.seed(112358) # set RNG seed for repeatable randomness
|
||||
myGxy <- layout_with_fr(myG) # calculate layout coordinates
|
||||
oPar <- par(mar= rep(0,4)) # Turn margins off
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
oPar <- par(mar = rep(0, 4)) # turn margins off, save graphics state
|
||||
plot(myG,
|
||||
layout = myGxy,
|
||||
rescale = FALSE,
|
||||
xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01),
|
||||
ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01),
|
||||
vertex.color=heat.colors(max(degree(myG)+1))[degree(myG)+1],
|
||||
vertex.color=heat.colors(max(degree(myG) + 1))[degree(myG) + 1],
|
||||
vertex.size = 20 + (10 * degree(myG)),
|
||||
vertex.label = V(myG)$name,
|
||||
vertex.label.family = "sans",
|
||||
vertex.label.cex = 0.8)
|
||||
par(oPar)
|
||||
par(oPar) # restore graphics state
|
||||
|
||||
# == 4.1 Diameter ==========================================================
|
||||
|
||||
|
@ -3,12 +3,13 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the FND-STA-Probability_distribution unit.
|
||||
#
|
||||
# Version: 1.1
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017 10
|
||||
# Date: 2017 10 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 Update set.seed() usage
|
||||
# 1.1 Corrected empirical p-value
|
||||
# 1.0 First code live version
|
||||
#
|
||||
@ -27,21 +28,21 @@
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -----------------------------------------------------------------------
|
||||
#TOC> 1 Introduction 49
|
||||
#TOC> 2 Three fundamental distributions 112
|
||||
#TOC> 2.1 The Poisson Distribution 115
|
||||
#TOC> 2.2 The uniform distribution 168
|
||||
#TOC> 2.3 The Normal Distribution 188
|
||||
#TOC> 3 quantile-quantile comparison 229
|
||||
#TOC> 3.1 qqnorm() 239
|
||||
#TOC> 3.2 qqplot() 299
|
||||
#TOC> 4 Quantifying the difference 316
|
||||
#TOC> 4.1 Chi2 test for discrete distributions 350
|
||||
#TOC> 4.2 Kullback-Leibler divergence 441
|
||||
#TOC> 4.2.1 An example from tossing dice 452
|
||||
#TOC> 4.2.2 An example from lognormal distributions 574
|
||||
#TOC> 4.3 Kolmogorov-Smirnov test for continuous distributions 616
|
||||
#TOC> -------------------------------------------------------------------------
|
||||
#TOC> 1 Introduction 50
|
||||
#TOC> 2 Three fundamental distributions 113
|
||||
#TOC> 2.1 The Poisson Distribution 116
|
||||
#TOC> 2.2 The uniform distribution 170
|
||||
#TOC> 2.3 The Normal Distribution 190
|
||||
#TOC> 3 quantile-quantile comparison 231
|
||||
#TOC> 3.1 qqnorm() 241
|
||||
#TOC> 3.2 qqplot() 307
|
||||
#TOC> 4 Quantifying the difference 324
|
||||
#TOC> 4.1 Chi2 test for discrete distributions 359
|
||||
#TOC> 4.2 Kullback-Leibler divergence 451
|
||||
#TOC> 4.2.1 An example from tossing dice 462
|
||||
#TOC> 4.2.2 An example from lognormal distributions 585
|
||||
#TOC> 4.3 Kolmogorov-Smirnov test for continuous distributions 628
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
@ -151,6 +152,7 @@ set.seed(112358)
|
||||
for (i in 1:N) {
|
||||
x[i] <- sum(sample(genes, 250)) # sum of TFs in our sample in this trial
|
||||
}
|
||||
set.seed(NULL)
|
||||
|
||||
(t <- table(x)/N)
|
||||
|
||||
@ -241,8 +243,10 @@ hist(v, breaks = 20, col = "#F8DDFF")
|
||||
# The functions qqnorm() and qqline() perform this
|
||||
# comparison with the normal distribution.
|
||||
|
||||
set.seed(1112358)
|
||||
x <- rnorm(100, mean=0, sd=1) # 100 normally distributed balues
|
||||
set.seed(112358)
|
||||
x <- rnorm(100, mean=0, sd=1) # 100 normally distributed values
|
||||
set.seed(NULL)
|
||||
|
||||
qqnorm(x)
|
||||
qqline(x, col = "seagreen")
|
||||
|
||||
@ -253,12 +257,15 @@ qqline(x, col = "seagreen")
|
||||
|
||||
# Create a vector of sample means from the exponential distribution; use
|
||||
# only a few samples for the mean
|
||||
set.seed(112358)
|
||||
x <- rexp(12345)
|
||||
v <- numeric(999)
|
||||
|
||||
set.seed(112358)
|
||||
for (i in 1:length(v)) {
|
||||
v[i] <- mean(sample(x, 12))
|
||||
}
|
||||
set.seed(NULL)
|
||||
|
||||
qqnorm(v)
|
||||
qqline(v, col = "turquoise") # normal
|
||||
|
||||
@ -288,13 +295,14 @@ rEVD <- numeric(9999)
|
||||
for (i in seq_along(rEVD)) {
|
||||
rEVD[i] <- max(rnorm(100))
|
||||
}
|
||||
set.seed(NULL)
|
||||
|
||||
hist(rEVD, breaks = 20, col = "orchid")
|
||||
# Note the long tail on the right!
|
||||
|
||||
qqnorm(rEVD)
|
||||
qqline(rEVD, col = "orchid") # normal
|
||||
qqline(rEVD, col = "orchid") # Definitely not "normal"!
|
||||
|
||||
# Definitely not "normal"!
|
||||
|
||||
# == 3.2 qqplot() ==========================================================
|
||||
|
||||
@ -331,6 +339,7 @@ dl2 <- dlnorm(x - 0.25) # log-normal distribution, shifted right (a bit)
|
||||
dg1.2 <- dgamma(x, shape=1.2) # three gamma distributions with...
|
||||
dg1.5 <- dgamma(x, shape=1.5) # ...wider, and wider...
|
||||
dg1.9 <- dgamma(x, shape=1.9) # ...peak
|
||||
set.seed(NULL)
|
||||
|
||||
myCols <- c("black", "grey", "maroon", "turquoise", "steelblue")
|
||||
|
||||
@ -361,6 +370,7 @@ rL2 <- rlnorm(N, meanlog = 0.25) # log-normal distribution, shifted right
|
||||
rG1.2 <- rgamma(N, shape=1.2) # three gamma distributions with...
|
||||
rG1.5 <- rgamma(N, shape=1.5) # ...wider, and wider...
|
||||
rG1.9 <- rgamma(N, shape=1.9) # ...peak
|
||||
set.seed(NULL)
|
||||
|
||||
maxX <- max(c(rL1, rL2, rG1.2, rG1.5, rG1.9))
|
||||
|
||||
@ -459,6 +469,7 @@ chisq.test(countsL1, countsG1.9, simulate.p.value = TRUE, B = 10000)
|
||||
set.seed(47)
|
||||
N <- 20
|
||||
(counts <- table(sample(1:6, N, replace = TRUE)))
|
||||
set.seed(NULL)
|
||||
|
||||
# We have not observed a "2"!
|
||||
#
|
||||
@ -597,6 +608,7 @@ for (i in 1:N) {
|
||||
q <- pmfPC(y, nam = 1:10) # convert to p.m.f. with pseudocounts
|
||||
divs[i] <- KLdiv(pmfL1, q) # calculate Kullback-Leibler divergence
|
||||
}
|
||||
set.seed(NULL)
|
||||
|
||||
hist(divs,
|
||||
col = "thistle",
|
||||
@ -605,7 +617,7 @@ hist(divs,
|
||||
abline(v = KLdiv(pmfL1, pmfL2), col="firebrick")
|
||||
|
||||
# How many KL-divergences were less than the difference we observed?
|
||||
sum(divs < KLdiv(pmfL1, pmfL2)) #933
|
||||
sum(divs < KLdiv(pmfL1, pmfL2)) # 933
|
||||
|
||||
# Therefore the empirical p-value that the samples came from the same
|
||||
# distribution is only 100 * ((N - 933) + 1) / (N + 1) (%) ... 6.8%. You see
|
||||
|
@ -3,12 +3,13 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the FND-STA-Significance unit.
|
||||
#
|
||||
# Version: 1.1
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017 09 - 2017 10
|
||||
# Date: 2017 09 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 Update set.seed() usage
|
||||
# 1.1 Corrected treatment of empirical p-value
|
||||
# 1.0 First contents
|
||||
#
|
||||
@ -26,15 +27,15 @@
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ------------------------------------------------------------
|
||||
#TOC> 1 Significance and p-value 42
|
||||
#TOC> 1.1 Significance levels 53
|
||||
#TOC> 1.2 probability and p-value 70
|
||||
#TOC> 1.2.1 p-value illustrated 100
|
||||
#TOC> 2 One- or two-sided 153
|
||||
#TOC> 3 Significance by integration 193
|
||||
#TOC> 4 Significance by simulation or permutation 199
|
||||
#TOC> 5 Final tasks 302
|
||||
#TOC> ------------------------------------------------------------------
|
||||
#TOC> 1 Significance and p-value 43
|
||||
#TOC> 1.1 Significance levels 54
|
||||
#TOC> 1.2 probability and p-value 71
|
||||
#TOC> 1.2.1 p-value illustrated 103
|
||||
#TOC> 2 One- or two-sided 158
|
||||
#TOC> 3 Significance by integration 198
|
||||
#TOC> 4 Significance by simulation or permutation 204
|
||||
#TOC> 5 Final tasks 312
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
@ -75,6 +76,8 @@
|
||||
|
||||
set.seed(sqrt(5))
|
||||
x <- rnorm(1)
|
||||
set.seed(NULL)
|
||||
|
||||
print(x, digits = 22)
|
||||
# [1] -0.8969145466249813791748
|
||||
|
||||
@ -102,8 +105,10 @@ print(x, digits = 22)
|
||||
# Let's illustrate. First we draw a million random values from our
|
||||
# standard, normal distribution:
|
||||
|
||||
set.seed(112358)
|
||||
r <- rnorm(1000000)
|
||||
N <- 1e6 # one million
|
||||
set.seed(112358) # set RNG seed for repeatable randomness
|
||||
r <- rnorm(N) # N values from a normal distribution
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
# Let's see what the distribution looks like:
|
||||
|
||||
@ -277,9 +282,14 @@ chSep <- function(v) {
|
||||
chSep(v)
|
||||
|
||||
# Now we can produce a random permutation of v, and recalculate
|
||||
set.seed(pi)
|
||||
|
||||
set.seed(pi) # set RNG seed for repeatable randomness
|
||||
w <- sample(v, length(v)) # This shuffles the vector v. Memorize this
|
||||
# code paradigm. It is very useful.
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
|
||||
|
||||
chSep(w)
|
||||
# 3.273 ... that's actually less than what we had before.
|
||||
|
||||
|
@ -489,7 +489,8 @@ for (name in toupper(myControls)) {
|
||||
|
||||
# == 5.1 Final task: Gene descriptions =====================================
|
||||
|
||||
# Print the descriptions of the top ten differentially expressed genes.
|
||||
# Print the descriptions of the top ten differentially expressed genes
|
||||
# and comment on what they have in common (or not).
|
||||
|
||||
|
||||
# = 6 Improving on Discovery by Differential Expression ===================
|
||||
@ -617,9 +618,9 @@ GPL1914 <- getGEO("GPL1914")
|
||||
str(GPL1914)
|
||||
|
||||
# ... from which we can get the data - which is however NOT necessarily
|
||||
# matched to the rows of our expression dataset. Note that here to: the majority
|
||||
# of data elements are factors and will likely have to be converted before
|
||||
# use.
|
||||
# matched to the rows of our expression dataset. Note that here too: the
|
||||
# majority of data elements are factors and will likely have to be converted
|
||||
# before use.
|
||||
|
||||
|
||||
# [END]
|
||||
|
@ -3,12 +3,13 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-Genetic_code_optimality unit.
|
||||
#
|
||||
# Version: 1.0.1
|
||||
# Version: 1.1
|
||||
#
|
||||
# Date: 2017 10 16
|
||||
# Date: 2017 10 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.1 Update set.seed() usage
|
||||
# 1.0.1 Fixed two bugs discovered by Suan Chin Yeo.
|
||||
# 1.0 New material.
|
||||
#
|
||||
@ -28,17 +29,17 @@
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> --------------------------------------------------------
|
||||
#TOC> 1 Designing a computational experiment 57
|
||||
#TOC> 2 Setting up the tools 73
|
||||
#TOC> 2.1 Natural and alternative genetic codes 76
|
||||
#TOC> 2.2 Effect of mutations 135
|
||||
#TOC> 2.2.1 reverse-translate 146
|
||||
#TOC> 2.2.2 Randomly mutate 171
|
||||
#TOC> 2.2.3 Forward- translate 196
|
||||
#TOC> 2.2.4 measure effect 214
|
||||
#TOC> 3 Run the experiment 261
|
||||
#TOC> 4 Task solutions 348
|
||||
#TOC> --------------------------------------------------------------
|
||||
#TOC> 1 Designing a computational experiment 54
|
||||
#TOC> 2 Setting up the tools 70
|
||||
#TOC> 2.1 Natural and alternative genetic codes 73
|
||||
#TOC> 2.2 Effect of mutations 132
|
||||
#TOC> 2.2.1 reverse-translate 143
|
||||
#TOC> 2.2.2 Randomly mutate 168
|
||||
#TOC> 2.2.3 Forward- translate 193
|
||||
#TOC> 2.2.4 measure effect 211
|
||||
#TOC> 3 Run the experiment 258
|
||||
#TOC> 4 Task solutions 351
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
@ -269,18 +270,21 @@ myAA <- traFor(myDNA, GENETIC_CODE)
|
||||
# Mutate and evaluate
|
||||
set.seed(112358)
|
||||
x <- randMut(myDNA)
|
||||
set.seed(NULL)
|
||||
x <- traFor(x, GENETIC_CODE)
|
||||
evalMut(myAA, x) # 166.4
|
||||
|
||||
# Try this 200 times, and see how the values are distributed.
|
||||
set.seed(112358)
|
||||
N <- 200
|
||||
valUGC <- numeric(N)
|
||||
|
||||
set.seed(112358) # set RNG seed for repeatable randomness
|
||||
for (i in 1:N) {
|
||||
x <- randMut(myDNA) # mutate
|
||||
x <- traFor(x, GENETIC_CODE) # translate
|
||||
valUGC[i] <- evalMut(myAA, x) # evaluate
|
||||
}
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
hist(valUGC,
|
||||
breaks = 15,
|
||||
@ -299,6 +303,7 @@ effectUGC <- mean(valUGC) # 178.1
|
||||
set.seed(112358)
|
||||
# choose a new code
|
||||
GC <- randomGC(GENETIC_CODE)
|
||||
set.seed(NULL)
|
||||
|
||||
# reverse translate hypothetical sequence according to the new code
|
||||
x <- traRev(myAA, GC)
|
||||
@ -311,9 +316,10 @@ evalMut(myAA, x) # evaluate mutation effects: 298.5
|
||||
# Let's try with different genetic codes. 200 trials - but this time every trial
|
||||
# is with a different, synthetic genetic code.
|
||||
|
||||
set.seed(1414214)
|
||||
N <- 200
|
||||
valXGC <- numeric(N)
|
||||
|
||||
set.seed(1414214) # set RNG seed for repeatable randomness
|
||||
for (i in 1:N) {
|
||||
GC <- randomGC(GENETIC_CODE) # Choose code
|
||||
x <- traRev(myAA, GC) # reverse translate
|
||||
@ -321,6 +327,7 @@ for (i in 1:N) {
|
||||
x <- traFor(x, GC) # translate
|
||||
valXGC[i] <- evalMut(myAA, x) # evaluate
|
||||
}
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
hist(valXGC,
|
||||
col = "plum",
|
||||
@ -343,9 +350,10 @@ hist(valXGC,
|
||||
|
||||
# = 4 Task solutions ======================================================
|
||||
|
||||
set.seed(2718282)
|
||||
N <- 200
|
||||
valSGC <- numeric(N)
|
||||
|
||||
set.seed(2718282) # set RNG seed for repeatable randomness
|
||||
for (i in 1:N) {
|
||||
GC <- swappedGC(GENETIC_CODE) # Choose code
|
||||
x <- traRev(myAA, GC) # reverse translate
|
||||
@ -353,6 +361,7 @@ for (i in 1:N) {
|
||||
x <- traFor(x, GC) # translate
|
||||
valSGC[i] <- evalMut(myAA, x) # evaluate
|
||||
}
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
hist(valSGC,
|
||||
col = "#6688FF88",
|
||||
|
Loading…
Reference in New Issue
Block a user