Clarifications, and better color palettes for correlations

2020-09-25 18:51:38 +10:00
parent b42adac3f3
commit 069d8136e3
1 changed files with 69 additions and 45 deletions
--- a/RPR-GEO2R.R
+++ b/RPR-GEO2R.R
@@ -1,11 +1,5 @@
 # tocID <- "RPR_GEO2R.R"
 #
-# ---------------------------------------------------------------------------- #
-#  PATIENCE  ...                                                               #
-#    Do not yet work wih this code. Updates in progress. Thank you.            #
-#    boris.steipe@utoronto.ca                                                  #
-# ---------------------------------------------------------------------------- #
-#
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the RPR_GEO2R unit.
 #
@@ -33,6 +27,8 @@
 # answer, or ask your instructor. Don't continue if you don't understand what's
 # going on. That's not how it works ...
 #
+# TO SUBMIT FOR CREDIT....
+#
 # Note: to submit tasks for credit for this unit, report on the sections
 # that have "Task ..." section headers, and report on the lines that are
 # identified with #TASK>  comments.
@@ -68,6 +64,7 @@
 if (! requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
 }
+
 if (! requireNamespace("Biobase", quietly = TRUE)) {
  BiocManager::install("Biobase")
 }
@@ -152,47 +149,59 @@ Biobase::sampleNames(GSE3635)[1:10]    # Columns. What are these columns?
 # Access data
 Biobase::exprs(tmp)   # exprs() gives us the actual expression values.

+# ==   3.1  Task - understanding the data  ==================================

-#TASK> What are the data:
+#TASK> What are the data values:
 #TASK>  ... in each cell?
 #TASK>  ... in each column?
 #TASK>  ... in each row?

+
+
 # =    3  Column wise analysis - time points  ==================================

-#  Each column represents one experiment.
-#TASK>  What are these experiments?
-
-
-# ==   3.1  Task - Comparison of experiments  ==================================
-
-# Get an overview of the distribution of data values in individual columns
+# Get an overview of the distribution of values in individual columns
 summary(Biobase::exprs(GSE3635)[ , 1])
 summary(Biobase::exprs(GSE3635)[ , 4])
 summary(Biobase::exprs(GSE3635)[ , 7])

-# as a boxplot
-cyclicPalette <- colorRampPalette(c("#00AAFF",
-                                    "#DDDD00",
-                                    "#FFAA00",
-                                    "#00AAFF",
-                                    "#DDDD00",
-                                    "#FFAA00",
-                                    "#00AAFF"))
+# This allows us to com pare the columns, comment on the quality of the data,
+# and get a sense for the distribution. We need to know how exactly these
+# numbers were produced: obviously, if we don't know how those numbers were
+# created in the first place, we would produce a major sin of Cargo Cult
+# bioinformatics if we would analyze them.
+
+
+
+# compare them in a a boxplot
+cyclicPalette <- colorRampPalette(c("#14b4c9",
+                                    "#d2d1e6",
+                                    "#e66594",
+                                    "#d2d1e6",
+                                    "#14b4c9",
+                                    "#d2d1e6",
+                                    "#e66594",
+                                    "#d2d1e6",
+                                    "#14b4c9"))
 tCols <- cyclicPalette(13)
 boxplot(Biobase::exprs(GSE3635), col = tCols)


+# ==   3.1  Task - Comparison of experiments  ==================================
 #TASK>     Study this boxplot. What's going on? Are these expression values?
-#TASK>     What do the numbers that exprs() returns from the dataset mean?
+#TASK>     What do the numbers mean? (Summarize the process and computation
+#TASK>     that has gone i to the preprocessing. You need to understand why
+#TASK>     these columns all have the same mean and range.) Given what common
+#TASK>     sense tells you about the variability of experiments, do you
+#TASK>     believe your understanding is complete?


 # Lets plot the distributions of values in a more fine-grained manner:
-hT0  <- hist(Biobase::exprs(GSE3635)[ ,  1], breaks = 100)
-hT3  <- hist(Biobase::exprs(GSE3635)[ ,  4], breaks = 100)
-hT6  <- hist(Biobase::exprs(GSE3635)[ ,  7], breaks = 100)
-hT9  <- hist(Biobase::exprs(GSE3635)[ , 10], breaks = 100)
-hT12 <- hist(Biobase::exprs(GSE3635)[ , 13], breaks = 100)
+hT0  <- hist(Biobase::exprs(GSE3635)[ ,  1], breaks = 100, col =  tCols[1])
+hT3  <- hist(Biobase::exprs(GSE3635)[ ,  4], breaks = 100, col =  tCols[4])
+hT6  <- hist(Biobase::exprs(GSE3635)[ ,  7], breaks = 100, col =  tCols[7])
+hT9  <- hist(Biobase::exprs(GSE3635)[ , 10], breaks = 100, col =  tCols[10])
+hT12 <- hist(Biobase::exprs(GSE3635)[ , 13], breaks = 100, col =  tCols[13])


 plot(  hT0$mids,  hT0$counts,  type = "l", col =  tCols[1], xlim = c(-0.5, 0.5))
@@ -289,7 +298,7 @@ readLines("./data/SGD_features.tab", n = 5)
 #
 #    -  read "./data/SGD_features.tab" into a data frame
 #          called "SGD_features"
-#    -  remove unneeded columns - keep the following information:
+#    -  remove unneeded columns - keep the following data columns:
 #         -   Primary SGDID
 #         -   Feature type
 #         -   Feature qualifier
@@ -312,10 +321,12 @@ readLines("./data/SGD_features.tab", n = 5)
 #    -  confirm: are all rows of the expression data set represented in
 #                  the feature table? Hint: use setdiff() to print all that
 #                  are not.
-#                  Example:  A <- c("duck", "crow", "gull", "tern")
-#                            B <- c("gull", "rook", "tern", "kite", "myna")
-#                            setdiff(A, B)
-#                            setdiff(B, A)
+#                  Example usage of setdiff():
+#                      A <- c("duck", "crow", "gull", "tern")
+#                      B <- c("gull", "rook", "tern", "kite", "myna")
+#
+#                      setdiff(A, B)  #  [1] "duck" "crow"
+#                      setdiff(B, A)  #  [1] "rook" "kite" "myna"

 #       If some of the features in the expression set are not listed in the
 #       systematic names, you have to be aware of that, when you try to get
@@ -331,6 +342,8 @@ readLines("./data/SGD_features.tab", n = 5)

 # ==   4.2  Selected Expression profiles  ======================================

+# The code below assumes that you have read ./data/SGD_features.tab and assigned
+# the resulting data frame to SGD_features, with columns as specified above.

 # Here is an expression profile for Mbp1.

@@ -484,7 +497,7 @@ for (i in 1:10) {
  points(seq(0, 120, by = 10), Biobase::exprs(GSE3635)[thisID, ], type = "b")
 }

-# Our guess that we might discover interesting genes be selecting groups A and B
+# Our guess that we might discover interesting genes by selecting groups A and B
 # like we did was not bad. But limma knows nothing about the biology and though
 # the expression profiles look good, there is no guarantee that these are the
 # most biologically relevant genes. Significantly different in expression
@@ -512,8 +525,8 @@ for (name in toupper(myControls)) {

 # ==   5.1  Final task: Gene descriptions  =====================================

-#    Print the descriptions of the top ten differentially expressed genes
-#    and comment on what they have in common (or not).
+#TASK> Print the descriptions of the top ten differentially expressed genes
+#TASK> and comment on what they have in common (or not).


 # =    6  Improving on Discovery by Differential Expression  ===================
@@ -535,9 +548,12 @@ plot(seq(0, 120, by = 10),
     xlab = "time (min)",
     ylab = "expression",
     type = "b",
-     col= "maroon")
-abline(h =  0, col = "#00000055")
-abline(v = 60, col = "#00000055")
+     pch = 16,
+     cex = 1.5,
+     lwd = 2,
+     col= "#40b886")
+abline(h =  0, col = "#0000FF55")
+abline(v = 60, col = "#0000FF55")

 # Set up a vector of correlation values

@@ -548,7 +564,8 @@ for (i in 1:length(myCorrelations)) {
  myCorrelations[i] <- cor(Cln2Profile, Biobase::exprs(GSE3635)[i, ])
 }

-myTopC <- order(myCorrelations, decreasing = TRUE)[1:10]  # top ten
+nTOP <- 20
+myTopC <- order(myCorrelations, decreasing = TRUE)[1:nTOP]

 # Number 1
 (ID <- Biobase::featureNames(GSE3635)[myTopC[1]])
@@ -559,12 +576,15 @@ SGD_features[which(SGD_features$sysName == ID), ]
 # control for the experiment.

 # Let's plot the rest
-for (i in 2:length(myTopC)) {
+myPal <- colorRampPalette(c("#82f58d", "#E0F2E2", "#f6f6f6"))
+
+for (i in 2:nTOP) {
  ID <- Biobase::featureNames(GSE3635)[myTopC[i]]
  points(seq(0, 120, by = 10),
         Biobase::exprs(GSE3635)[ID, ],
       type = "b",
-       col= "chartreuse")
+       cex = 0.8,
+       col= myPal(nTOP)[i])
  print(SGD_features[which(SGD_features$sysName == ID),
                     c("name", "description")])
 }
@@ -576,13 +596,17 @@ for (i in 2:length(myTopC)) {
 # mean small biological effects? Certainly not!

 # And we haven't even looked at the anticorrelated genes yet...
-myBottomC <- order(myCorrelations, decreasing = FALSE)[1:10]  # bottom ten
-for (i in 1:length(myBottomC)) {
+nBOT <- nTOP
+myBottomC <- order(myCorrelations, decreasing = FALSE)[1:nBOT]  # bottom ten
+myPal <- colorRampPalette(c("#ba112a", "#E3C8CC", "#ebebeb"))
+
+for (i in 1:nBOT) {
  ID <- Biobase::featureNames(GSE3635)[myBottomC[i]]
  points(seq(0, 120, by = 10),
         Biobase::exprs(GSE3635)[ID, ],
         type = "b",
-         col= "coral")
+         cex = 0.8,
+         col= myPal(nBOT)[i])
  print(SGD_features[which(SGD_features$sysName == ID),
                     c("name", "description")])
 }