Clarifications, and better color palettes for correlations
This commit is contained in:
parent
b42adac3f3
commit
069d8136e3
114
RPR-GEO2R.R
114
RPR-GEO2R.R
@ -1,11 +1,5 @@
|
|||||||
# tocID <- "RPR_GEO2R.R"
|
# tocID <- "RPR_GEO2R.R"
|
||||||
#
|
#
|
||||||
# ---------------------------------------------------------------------------- #
|
|
||||||
# PATIENCE ... #
|
|
||||||
# Do not yet work wih this code. Updates in progress. Thank you. #
|
|
||||||
# boris.steipe@utoronto.ca #
|
|
||||||
# ---------------------------------------------------------------------------- #
|
|
||||||
#
|
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the RPR_GEO2R unit.
|
# R code accompanying the RPR_GEO2R unit.
|
||||||
#
|
#
|
||||||
@ -33,6 +27,8 @@
|
|||||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||||
# going on. That's not how it works ...
|
# going on. That's not how it works ...
|
||||||
#
|
#
|
||||||
|
# TO SUBMIT FOR CREDIT....
|
||||||
|
#
|
||||||
# Note: to submit tasks for credit for this unit, report on the sections
|
# Note: to submit tasks for credit for this unit, report on the sections
|
||||||
# that have "Task ..." section headers, and report on the lines that are
|
# that have "Task ..." section headers, and report on the lines that are
|
||||||
# identified with #TASK> comments.
|
# identified with #TASK> comments.
|
||||||
@ -68,6 +64,7 @@
|
|||||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||||
install.packages("BiocManager")
|
install.packages("BiocManager")
|
||||||
}
|
}
|
||||||
|
|
||||||
if (! requireNamespace("Biobase", quietly = TRUE)) {
|
if (! requireNamespace("Biobase", quietly = TRUE)) {
|
||||||
BiocManager::install("Biobase")
|
BiocManager::install("Biobase")
|
||||||
}
|
}
|
||||||
@ -152,47 +149,59 @@ Biobase::sampleNames(GSE3635)[1:10] # Columns. What are these columns?
|
|||||||
# Access data
|
# Access data
|
||||||
Biobase::exprs(tmp) # exprs() gives us the actual expression values.
|
Biobase::exprs(tmp) # exprs() gives us the actual expression values.
|
||||||
|
|
||||||
|
# == 3.1 Task - understanding the data ==================================
|
||||||
|
|
||||||
#TASK> What are the data:
|
#TASK> What are the data values:
|
||||||
#TASK> ... in each cell?
|
#TASK> ... in each cell?
|
||||||
#TASK> ... in each column?
|
#TASK> ... in each column?
|
||||||
#TASK> ... in each row?
|
#TASK> ... in each row?
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# = 3 Column wise analysis - time points ==================================
|
# = 3 Column wise analysis - time points ==================================
|
||||||
|
|
||||||
# Each column represents one experiment.
|
# Get an overview of the distribution of values in individual columns
|
||||||
#TASK> What are these experiments?
|
|
||||||
|
|
||||||
|
|
||||||
# == 3.1 Task - Comparison of experiments ==================================
|
|
||||||
|
|
||||||
# Get an overview of the distribution of data values in individual columns
|
|
||||||
summary(Biobase::exprs(GSE3635)[ , 1])
|
summary(Biobase::exprs(GSE3635)[ , 1])
|
||||||
summary(Biobase::exprs(GSE3635)[ , 4])
|
summary(Biobase::exprs(GSE3635)[ , 4])
|
||||||
summary(Biobase::exprs(GSE3635)[ , 7])
|
summary(Biobase::exprs(GSE3635)[ , 7])
|
||||||
|
|
||||||
# as a boxplot
|
# This allows us to compare the columns, comment on the quality of the data,
|
||||||
cyclicPalette <- colorRampPalette(c("#00AAFF",
|
# and get a sense for the distribution. We need to know how exactly these
|
||||||
"#DDDD00",
|
# numbers were produced: obviously, if we don't know how those numbers were
|
||||||
"#FFAA00",
|
# created in the first place, we would commit a major sin of Cargo Cult
|
||||||
"#00AAFF",
|
# bioinformatics if we would analyze them.
|
||||||
"#DDDD00",
|
|
||||||
"#FFAA00",
|
|
||||||
"#00AAFF"))
|
|
||||||
|
# compare them in a boxplot
|
||||||
|
cyclicPalette <- colorRampPalette(c("#14b4c9",
|
||||||
|
"#d2d1e6",
|
||||||
|
"#e66594",
|
||||||
|
"#d2d1e6",
|
||||||
|
"#14b4c9",
|
||||||
|
"#d2d1e6",
|
||||||
|
"#e66594",
|
||||||
|
"#d2d1e6",
|
||||||
|
"#14b4c9"))
|
||||||
tCols <- cyclicPalette(13)
|
tCols <- cyclicPalette(13)
|
||||||
boxplot(Biobase::exprs(GSE3635), col = tCols)
|
boxplot(Biobase::exprs(GSE3635), col = tCols)
|
||||||
|
|
||||||
|
|
||||||
|
# == 3.1 Task - Comparison of experiments ==================================
|
||||||
#TASK> Study this boxplot. What's going on? Are these expression values?
|
#TASK> Study this boxplot. What's going on? Are these expression values?
|
||||||
#TASK> What do the numbers that exprs() returns from the dataset mean?
|
#TASK> What do the numbers mean? (Summarize the process and computation
|
||||||
|
#TASK> that has gone into the preprocessing. You need to understand why
|
||||||
|
#TASK> these columns all have the same mean and range.) Given what common
|
||||||
|
#TASK> sense tells you about the variability of experiments, do you
|
||||||
|
#TASK> believe your understanding is complete?
|
||||||
|
|
||||||
|
|
||||||
# Let's plot the distributions of values in a more fine-grained manner:
|
# Let's plot the distributions of values in a more fine-grained manner:
|
||||||
hT0 <- hist(Biobase::exprs(GSE3635)[ , 1], breaks = 100)
|
hT0 <- hist(Biobase::exprs(GSE3635)[ , 1], breaks = 100, col = tCols[1])
|
||||||
hT3 <- hist(Biobase::exprs(GSE3635)[ , 4], breaks = 100)
|
hT3 <- hist(Biobase::exprs(GSE3635)[ , 4], breaks = 100, col = tCols[4])
|
||||||
hT6 <- hist(Biobase::exprs(GSE3635)[ , 7], breaks = 100)
|
hT6 <- hist(Biobase::exprs(GSE3635)[ , 7], breaks = 100, col = tCols[7])
|
||||||
hT9 <- hist(Biobase::exprs(GSE3635)[ , 10], breaks = 100)
|
hT9 <- hist(Biobase::exprs(GSE3635)[ , 10], breaks = 100, col = tCols[10])
|
||||||
hT12 <- hist(Biobase::exprs(GSE3635)[ , 13], breaks = 100)
|
hT12 <- hist(Biobase::exprs(GSE3635)[ , 13], breaks = 100, col = tCols[13])
|
||||||
|
|
||||||
|
|
||||||
plot( hT0$mids, hT0$counts, type = "l", col = tCols[1], xlim = c(-0.5, 0.5))
|
plot( hT0$mids, hT0$counts, type = "l", col = tCols[1], xlim = c(-0.5, 0.5))
|
||||||
@ -289,7 +298,7 @@ readLines("./data/SGD_features.tab", n = 5)
|
|||||||
#
|
#
|
||||||
# - read "./data/SGD_features.tab" into a data frame
|
# - read "./data/SGD_features.tab" into a data frame
|
||||||
# called "SGD_features"
|
# called "SGD_features"
|
||||||
# - remove unneeded columns - keep the following information:
|
# - remove unneeded columns - keep the following data columns:
|
||||||
# - Primary SGDID
|
# - Primary SGDID
|
||||||
# - Feature type
|
# - Feature type
|
||||||
# - Feature qualifier
|
# - Feature qualifier
|
||||||
@ -312,10 +321,12 @@ readLines("./data/SGD_features.tab", n = 5)
|
|||||||
# - confirm: are all rows of the expression data set represented in
|
# - confirm: are all rows of the expression data set represented in
|
||||||
# the feature table? Hint: use setdiff() to print all that
|
# the feature table? Hint: use setdiff() to print all that
|
||||||
# are not.
|
# are not.
|
||||||
# Example: A <- c("duck", "crow", "gull", "tern")
|
# Example usage of setdiff():
|
||||||
# B <- c("gull", "rook", "tern", "kite", "myna")
|
# A <- c("duck", "crow", "gull", "tern")
|
||||||
# setdiff(A, B)
|
# B <- c("gull", "rook", "tern", "kite", "myna")
|
||||||
# setdiff(B, A)
|
#
|
||||||
|
# setdiff(A, B) # [1] "duck" "crow"
|
||||||
|
# setdiff(B, A) # [1] "rook" "kite" "myna"
|
||||||
|
|
||||||
# If some of the features in the expression set are not listed in the
|
# If some of the features in the expression set are not listed in the
|
||||||
# systematic names, you have to be aware of that, when you try to get
|
# systematic names, you have to be aware of that, when you try to get
|
||||||
@ -331,6 +342,8 @@ readLines("./data/SGD_features.tab", n = 5)
|
|||||||
|
|
||||||
# == 4.2 Selected Expression profiles ======================================
|
# == 4.2 Selected Expression profiles ======================================
|
||||||
|
|
||||||
|
# The code below assumes that you have read ./data/SGD_features.tab and assigned
|
||||||
|
# the resulting data frame to SGD_features, with columns as specified above.
|
||||||
|
|
||||||
# Here is an expression profile for Mbp1.
|
# Here is an expression profile for Mbp1.
|
||||||
|
|
||||||
@ -484,7 +497,7 @@ for (i in 1:10) {
|
|||||||
points(seq(0, 120, by = 10), Biobase::exprs(GSE3635)[thisID, ], type = "b")
|
points(seq(0, 120, by = 10), Biobase::exprs(GSE3635)[thisID, ], type = "b")
|
||||||
}
|
}
|
||||||
|
|
||||||
# Our guess that we might discover interesting genes be selecting groups A and B
|
# Our guess that we might discover interesting genes by selecting groups A and B
|
||||||
# like we did was not bad. But limma knows nothing about the biology and though
|
# like we did was not bad. But limma knows nothing about the biology and though
|
||||||
# the expression profiles look good, there is no guarantee that these are the
|
# the expression profiles look good, there is no guarantee that these are the
|
||||||
# most biologically relevant genes. Significantly different in expression
|
# most biologically relevant genes. Significantly different in expression
|
||||||
@ -512,8 +525,8 @@ for (name in toupper(myControls)) {
|
|||||||
|
|
||||||
# == 5.1 Final task: Gene descriptions =====================================
|
# == 5.1 Final task: Gene descriptions =====================================
|
||||||
|
|
||||||
# Print the descriptions of the top ten differentially expressed genes
|
#TASK> Print the descriptions of the top ten differentially expressed genes
|
||||||
# and comment on what they have in common (or not).
|
#TASK> and comment on what they have in common (or not).
|
||||||
|
|
||||||
|
|
||||||
# = 6 Improving on Discovery by Differential Expression ===================
|
# = 6 Improving on Discovery by Differential Expression ===================
|
||||||
@ -535,9 +548,12 @@ plot(seq(0, 120, by = 10),
|
|||||||
xlab = "time (min)",
|
xlab = "time (min)",
|
||||||
ylab = "expression",
|
ylab = "expression",
|
||||||
type = "b",
|
type = "b",
|
||||||
col= "maroon")
|
pch = 16,
|
||||||
abline(h = 0, col = "#00000055")
|
cex = 1.5,
|
||||||
abline(v = 60, col = "#00000055")
|
lwd = 2,
|
||||||
|
col= "#40b886")
|
||||||
|
abline(h = 0, col = "#0000FF55")
|
||||||
|
abline(v = 60, col = "#0000FF55")
|
||||||
|
|
||||||
# Set up a vector of correlation values
|
# Set up a vector of correlation values
|
||||||
|
|
||||||
@ -548,7 +564,8 @@ for (i in 1:length(myCorrelations)) {
|
|||||||
myCorrelations[i] <- cor(Cln2Profile, Biobase::exprs(GSE3635)[i, ])
|
myCorrelations[i] <- cor(Cln2Profile, Biobase::exprs(GSE3635)[i, ])
|
||||||
}
|
}
|
||||||
|
|
||||||
myTopC <- order(myCorrelations, decreasing = TRUE)[1:10] # top ten
|
nTOP <- 20
|
||||||
|
myTopC <- order(myCorrelations, decreasing = TRUE)[1:nTOP]
|
||||||
|
|
||||||
# Number 1
|
# Number 1
|
||||||
(ID <- Biobase::featureNames(GSE3635)[myTopC[1]])
|
(ID <- Biobase::featureNames(GSE3635)[myTopC[1]])
|
||||||
@ -559,12 +576,15 @@ SGD_features[which(SGD_features$sysName == ID), ]
|
|||||||
# control for the experiment.
|
# control for the experiment.
|
||||||
|
|
||||||
# Let's plot the rest
|
# Let's plot the rest
|
||||||
for (i in 2:length(myTopC)) {
|
myPal <- colorRampPalette(c("#82f58d", "#E0F2E2", "#f6f6f6"))
|
||||||
|
|
||||||
|
for (i in 2:nTOP) {
|
||||||
ID <- Biobase::featureNames(GSE3635)[myTopC[i]]
|
ID <- Biobase::featureNames(GSE3635)[myTopC[i]]
|
||||||
points(seq(0, 120, by = 10),
|
points(seq(0, 120, by = 10),
|
||||||
Biobase::exprs(GSE3635)[ID, ],
|
Biobase::exprs(GSE3635)[ID, ],
|
||||||
type = "b",
|
type = "b",
|
||||||
col= "chartreuse")
|
cex = 0.8,
|
||||||
|
col= myPal(nTOP)[i])
|
||||||
print(SGD_features[which(SGD_features$sysName == ID),
|
print(SGD_features[which(SGD_features$sysName == ID),
|
||||||
c("name", "description")])
|
c("name", "description")])
|
||||||
}
|
}
|
||||||
@ -576,13 +596,17 @@ for (i in 2:length(myTopC)) {
|
|||||||
# mean small biological effects? Certainly not!
|
# mean small biological effects? Certainly not!
|
||||||
|
|
||||||
# And we haven't even looked at the anticorrelated genes yet...
|
# And we haven't even looked at the anticorrelated genes yet...
|
||||||
myBottomC <- order(myCorrelations, decreasing = FALSE)[1:10] # bottom ten
|
nBOT <- nTOP
|
||||||
for (i in 1:length(myBottomC)) {
|
myBottomC <- order(myCorrelations, decreasing = FALSE)[1:nBOT] # bottom nBOT
|
||||||
|
myPal <- colorRampPalette(c("#ba112a", "#E3C8CC", "#ebebeb"))
|
||||||
|
|
||||||
|
for (i in 1:nBOT) {
|
||||||
ID <- Biobase::featureNames(GSE3635)[myBottomC[i]]
|
ID <- Biobase::featureNames(GSE3635)[myBottomC[i]]
|
||||||
points(seq(0, 120, by = 10),
|
points(seq(0, 120, by = 10),
|
||||||
Biobase::exprs(GSE3635)[ID, ],
|
Biobase::exprs(GSE3635)[ID, ],
|
||||||
type = "b",
|
type = "b",
|
||||||
col= "coral")
|
cex = 0.8,
|
||||||
|
col= myPal(nBOT)[i])
|
||||||
print(SGD_features[which(SGD_features$sysName == ID),
|
print(SGD_features[which(SGD_features$sysName == ID),
|
||||||
c("name", "description")])
|
c("name", "description")])
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user