Clarifications, and better color palettes for correlations

This commit is contained in:
hyginn 2020-09-25 18:51:38 +10:00
parent b42adac3f3
commit 069d8136e3

View File

@ -1,11 +1,5 @@
# tocID <- "RPR_GEO2R.R" # tocID <- "RPR_GEO2R.R"
# #
# ---------------------------------------------------------------------------- #
# PATIENCE ... #
# Do not yet work wih this code. Updates in progress. Thank you. #
# boris.steipe@utoronto.ca #
# ---------------------------------------------------------------------------- #
#
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the RPR_GEO2R unit. # R code accompanying the RPR_GEO2R unit.
# #
@ -33,6 +27,8 @@
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# TO SUBMIT FOR CREDIT....
#
# Note: to submit tasks for credit for this unit, report on the sections # Note: to submit tasks for credit for this unit, report on the sections
# that have "Task ..." section headers, and report on the lines that are # that have "Task ..." section headers, and report on the lines that are
# identified with #TASK> comments. # identified with #TASK> comments.
@ -68,6 +64,7 @@
if (! requireNamespace("BiocManager", quietly = TRUE)) { if (! requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager") install.packages("BiocManager")
} }
if (! requireNamespace("Biobase", quietly = TRUE)) { if (! requireNamespace("Biobase", quietly = TRUE)) {
BiocManager::install("Biobase") BiocManager::install("Biobase")
} }
@ -152,47 +149,59 @@ Biobase::sampleNames(GSE3635)[1:10] # Columns. What are these columns?
# Access data # Access data
Biobase::exprs(tmp) # exprs() gives us the actual expression values. Biobase::exprs(tmp) # exprs() gives us the actual expression values.
# == 3.1 Task - understanding the data ==================================
#TASK> What are the data: #TASK> What are the data values:
#TASK> ... in each cell? #TASK> ... in each cell?
#TASK> ... in each column? #TASK> ... in each column?
#TASK> ... in each row? #TASK> ... in each row?
# = 3 Column wise analysis - time points ================================== # = 3 Column wise analysis - time points ==================================
# Each column represents one experiment. # Get an overview of the distribution of values in individual columns
#TASK> What are these experiments?
# == 3.1 Task - Comparison of experiments ==================================
# Get an overview of the distribution of data values in individual columns
summary(Biobase::exprs(GSE3635)[ , 1]) summary(Biobase::exprs(GSE3635)[ , 1])
summary(Biobase::exprs(GSE3635)[ , 4]) summary(Biobase::exprs(GSE3635)[ , 4])
summary(Biobase::exprs(GSE3635)[ , 7]) summary(Biobase::exprs(GSE3635)[ , 7])
# as a boxplot # This allows us to compare the columns, comment on the quality of the data,
cyclicPalette <- colorRampPalette(c("#00AAFF", # and get a sense for the distribution. We need to know how exactly these
"#DDDD00", # numbers were produced: obviously, if we don't know how those numbers were
"#FFAA00", # created in the first place, we would commit a major sin of Cargo Cult
"#00AAFF", # bioinformatics if we analyzed them.
"#DDDD00",
"#FFAA00",
"#00AAFF"))
# compare them in a boxplot
cyclicPalette <- colorRampPalette(c("#14b4c9",
"#d2d1e6",
"#e66594",
"#d2d1e6",
"#14b4c9",
"#d2d1e6",
"#e66594",
"#d2d1e6",
"#14b4c9"))
tCols <- cyclicPalette(13) tCols <- cyclicPalette(13)
boxplot(Biobase::exprs(GSE3635), col = tCols) boxplot(Biobase::exprs(GSE3635), col = tCols)
# == 3.1 Task - Comparison of experiments ==================================
#TASK> Study this boxplot. What's going on? Are these expression values? #TASK> Study this boxplot. What's going on? Are these expression values?
#TASK> What do the numbers that exprs() returns from the dataset mean? #TASK> What do the numbers mean? (Summarize the process and computation
#TASK> that has gone into the preprocessing. You need to understand why
#TASK> these columns all have the same mean and range.) Given what common
#TASK> sense tells you about the variability of experiments, do you
#TASK> believe your understanding is complete?
# Let's plot the distributions of values in a more fine-grained manner: # Let's plot the distributions of values in a more fine-grained manner:
hT0 <- hist(Biobase::exprs(GSE3635)[ , 1], breaks = 100) hT0 <- hist(Biobase::exprs(GSE3635)[ , 1], breaks = 100, col = tCols[1])
hT3 <- hist(Biobase::exprs(GSE3635)[ , 4], breaks = 100) hT3 <- hist(Biobase::exprs(GSE3635)[ , 4], breaks = 100, col = tCols[4])
hT6 <- hist(Biobase::exprs(GSE3635)[ , 7], breaks = 100) hT6 <- hist(Biobase::exprs(GSE3635)[ , 7], breaks = 100, col = tCols[7])
hT9 <- hist(Biobase::exprs(GSE3635)[ , 10], breaks = 100) hT9 <- hist(Biobase::exprs(GSE3635)[ , 10], breaks = 100, col = tCols[10])
hT12 <- hist(Biobase::exprs(GSE3635)[ , 13], breaks = 100) hT12 <- hist(Biobase::exprs(GSE3635)[ , 13], breaks = 100, col = tCols[13])
plot( hT0$mids, hT0$counts, type = "l", col = tCols[1], xlim = c(-0.5, 0.5)) plot( hT0$mids, hT0$counts, type = "l", col = tCols[1], xlim = c(-0.5, 0.5))
@ -289,7 +298,7 @@ readLines("./data/SGD_features.tab", n = 5)
# #
# - read "./data/SGD_features.tab" into a data frame # - read "./data/SGD_features.tab" into a data frame
# called "SGD_features" # called "SGD_features"
# - remove unneeded columns - keep the following information: # - remove unneeded columns - keep the following data columns:
# - Primary SGDID # - Primary SGDID
# - Feature type # - Feature type
# - Feature qualifier # - Feature qualifier
@ -312,10 +321,12 @@ readLines("./data/SGD_features.tab", n = 5)
# - confirm: are all rows of the expression data set represented in # - confirm: are all rows of the expression data set represented in
# the feature table? Hint: use setdiff() to print all that # the feature table? Hint: use setdiff() to print all that
# are not. # are not.
# Example: A <- c("duck", "crow", "gull", "tern") # Example usage of setdiff():
# B <- c("gull", "rook", "tern", "kite", "myna") # A <- c("duck", "crow", "gull", "tern")
# setdiff(A, B) # B <- c("gull", "rook", "tern", "kite", "myna")
# setdiff(B, A) #
# setdiff(A, B) # [1] "duck" "crow"
# setdiff(B, A) # [1] "rook" "kite" "myna"
# If some of the features in the expression set are not listed in the # If some of the features in the expression set are not listed in the
# systematic names, you have to be aware of that, when you try to get # systematic names, you have to be aware of that, when you try to get
@ -331,6 +342,8 @@ readLines("./data/SGD_features.tab", n = 5)
# == 4.2 Selected Expression profiles ====================================== # == 4.2 Selected Expression profiles ======================================
# The code below assumes that you have read ./data/SGD_features.tab and assigned
# the resulting data frame to SGD_features, with columns as specified above.
# Here is an expression profile for Mbp1. # Here is an expression profile for Mbp1.
@ -484,7 +497,7 @@ for (i in 1:10) {
points(seq(0, 120, by = 10), Biobase::exprs(GSE3635)[thisID, ], type = "b") points(seq(0, 120, by = 10), Biobase::exprs(GSE3635)[thisID, ], type = "b")
} }
# Our guess that we might discover interesting genes be selecting groups A and B # Our guess that we might discover interesting genes by selecting groups A and B
# like we did was not bad. But limma knows nothing about the biology and though # like we did was not bad. But limma knows nothing about the biology and though
# the expression profiles look good, there is no guarantee that these are the # the expression profiles look good, there is no guarantee that these are the
# most biologically relevant genes. Significantly different in expression # most biologically relevant genes. Significantly different in expression
@ -512,8 +525,8 @@ for (name in toupper(myControls)) {
# == 5.1 Final task: Gene descriptions ===================================== # == 5.1 Final task: Gene descriptions =====================================
# Print the descriptions of the top ten differentially expressed genes #TASK> Print the descriptions of the top ten differentially expressed genes
# and comment on what they have in common (or not). #TASK> and comment on what they have in common (or not).
# = 6 Improving on Discovery by Differential Expression =================== # = 6 Improving on Discovery by Differential Expression ===================
@ -535,9 +548,12 @@ plot(seq(0, 120, by = 10),
xlab = "time (min)", xlab = "time (min)",
ylab = "expression", ylab = "expression",
type = "b", type = "b",
col= "maroon") pch = 16,
abline(h = 0, col = "#00000055") cex = 1.5,
abline(v = 60, col = "#00000055") lwd = 2,
col= "#40b886")
abline(h = 0, col = "#0000FF55")
abline(v = 60, col = "#0000FF55")
# Set up a vector of correlation values # Set up a vector of correlation values
@ -548,7 +564,8 @@ for (i in 1:length(myCorrelations)) {
myCorrelations[i] <- cor(Cln2Profile, Biobase::exprs(GSE3635)[i, ]) myCorrelations[i] <- cor(Cln2Profile, Biobase::exprs(GSE3635)[i, ])
} }
myTopC <- order(myCorrelations, decreasing = TRUE)[1:10] # top ten nTOP <- 20
myTopC <- order(myCorrelations, decreasing = TRUE)[1:nTOP]
# Number 1 # Number 1
(ID <- Biobase::featureNames(GSE3635)[myTopC[1]]) (ID <- Biobase::featureNames(GSE3635)[myTopC[1]])
@ -559,12 +576,15 @@ SGD_features[which(SGD_features$sysName == ID), ]
# control for the experiment. # control for the experiment.
# Let's plot the rest # Let's plot the rest
for (i in 2:length(myTopC)) { myPal <- colorRampPalette(c("#82f58d", "#E0F2E2", "#f6f6f6"))
for (i in 2:nTOP) {
ID <- Biobase::featureNames(GSE3635)[myTopC[i]] ID <- Biobase::featureNames(GSE3635)[myTopC[i]]
points(seq(0, 120, by = 10), points(seq(0, 120, by = 10),
Biobase::exprs(GSE3635)[ID, ], Biobase::exprs(GSE3635)[ID, ],
type = "b", type = "b",
col= "chartreuse") cex = 0.8,
col= myPal(nTOP)[i])
print(SGD_features[which(SGD_features$sysName == ID), print(SGD_features[which(SGD_features$sysName == ID),
c("name", "description")]) c("name", "description")])
} }
@ -576,13 +596,17 @@ for (i in 2:length(myTopC)) {
# mean small biological effects? Certainly not! # mean small biological effects? Certainly not!
# And we haven't even looked at the anticorrelated genes yet... # And we haven't even looked at the anticorrelated genes yet...
myBottomC <- order(myCorrelations, decreasing = FALSE)[1:10] # bottom ten nBOT <- nTOP
for (i in 1:length(myBottomC)) { myBottomC <- order(myCorrelations, decreasing = FALSE)[1:nBOT] # bottom nBOT
myPal <- colorRampPalette(c("#ba112a", "#E3C8CC", "#ebebeb"))
for (i in 1:nBOT) {
ID <- Biobase::featureNames(GSE3635)[myBottomC[i]] ID <- Biobase::featureNames(GSE3635)[myBottomC[i]]
points(seq(0, 120, by = 10), points(seq(0, 120, by = 10),
Biobase::exprs(GSE3635)[ID, ], Biobase::exprs(GSE3635)[ID, ],
type = "b", type = "b",
col= "coral") cex = 0.8,
col= myPal(nBOT)[i])
print(SGD_features[which(SGD_features$sysName == ID), print(SGD_features[which(SGD_features$sysName == ID),
c("name", "description")]) c("name", "description")])
} }