Add section on GPL annotations to RPR-GEO2R

2018-01-28 18:23:21 -05:00
parent 359434d863
commit b17a9e202a
4 changed files with 53 additions and 25 deletions
--- a/BIN-ALI-MSA.R
+++ b/BIN-ALI-MSA.R
@@ -238,7 +238,7 @@ for (i in seq_along(highScoringRanges$lengths)) {

 # We computed a T-Coffee alignment at the EBI. msa has no native import function
 # so we need to improvise, and it's a bit of a pain to do - but a good
-# illustration of startegies to convert data into any kind of object:
+# illustration of strategies to convert data into any kind of object:
 #   -  read an .aln file
 #   -  adjust the sequence names
 #   -  convert to msaAAMultipleAlignment object
@@ -452,7 +452,7 @@ legend("bottomright",
       cex = 0.7,
       bty = "n")

-# Your alignment is going to be differnte from mine, due to the inclusion of
+# Your alignment is going to be different from mine, due to the inclusion of
 # MYSPE - but what I see is that MUSCLE gives the highest score overall, and
 # achieves this with fewer indels then most, and the lowest number of gaps of
 # all algorithms.
--- a/BIN-Storing_data.R
+++ b/BIN-Storing_data.R
@@ -27,7 +27,7 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                                             Line
 #TOC> -----------------------------------------------------------------
 #TOC>   1        A Relational Datamodel in R: review                 62
@@ -50,7 +50,7 @@
 #TOC>   3.3      Create an R script to create your own database     540
 #TOC>   3.3.1    Check and validate                                 560
 #TOC>   3.4      Task: submit for credit (part 2/2)                 601
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


@@ -205,7 +205,7 @@ str(philDB)
 # go back, re-read, play with it, and ask for help. This is essential.


-# ===  1.1.1  completing the database                       
+# ===  1.1.1  completing the database


 # Next I'll add one more person, and create the other two tables:
@@ -311,7 +311,7 @@ for (ID in pID) {


 # Have a look at the structure of the yeast Mbp1 protein data:
-file.edit("./data/MBP1_SACCE.json")
+file.show("./data/MBP1_SACCE.json")

 # - The whole thing is an array: [ ... ]. This is not necessary for a single
 #     object, but we will have more objects in other files. And it's perfectly
@@ -369,7 +369,7 @@ dbSanitizeSequence(x)

 # ==   2.3  Create a protein table for our data model  =========================

-# ===  2.3.1  Initialize the database                       
+# ===  2.3.1  Initialize the database


 # The function dbInit contains all the code to return a list of empty
@@ -381,7 +381,7 @@ myDB <- dbInit()
 str(myDB)


-# ===  2.3.2  Add data                                      
+# ===  2.3.2  Add data


 # fromJSON() returns a dataframe that we can readily process to add data
@@ -428,7 +428,7 @@ source("./scripts/ABC-createRefDB.R")
 str(myDB)


-# ===  2.4.1  Examples of navigating the database           
+# ===  2.4.1  Examples of navigating the database


 # You can look at the contents of the tables in the usual way we access
@@ -552,7 +552,7 @@ myDB$taxonomy$species[sel]
 # in any of the JSON files. Later you will add more information ...


-# ===  3.3.1  Check and validate                            
+# ===  3.3.1  Check and validate


 # Is your protein named according to the pattern "MBP1_MYSPE"? It should be.
--- a/RPR-GEO2R.R
+++ b/RPR-GEO2R.R
@@ -32,17 +32,18 @@
 #TOC> 
 #TOC>   Section  Title                                                Line
 #TOC> --------------------------------------------------------------------
-#TOC>   1        Preparations                                           50
-#TOC>   2        Loading a GEO Dataset                                  81
-#TOC>   3        Column wise analysis - time points                    151
-#TOC>   3.1      Task - Comparison of experiments                      157
-#TOC>   3.2      Grouped Samples                                       204
-#TOC>   4        Row-wise Analysis: Expression Profiles                239
-#TOC>   4.1      Task - Read a table of features                       274
-#TOC>   4.2      Selected Expression profiles                          322
-#TOC>   5        Differential Expression                               363
-#TOC>   5.1      Final task: Gene descriptions                         487
-#TOC>   6        Improving on Discovery by Differential Expression     492
+#TOC>   1        Preparations                                           51
+#TOC>   2        Loading a GEO Dataset                                  82
+#TOC>   3        Column wise analysis - time points                    152
+#TOC>   3.1      Task - Comparison of experiments                      158
+#TOC>   3.2      Grouped Samples                                       205
+#TOC>   4        Row-wise Analysis: Expression Profiles                240
+#TOC>   4.1      Task - Read a table of features                       275
+#TOC>   4.2      Selected Expression profiles                          323
+#TOC>   5        Differential Expression                               364
+#TOC>   5.1      Final task: Gene descriptions                         488
+#TOC>   6        Improving on Discovery by Differential Expression     493
+#TOC>   7        Annotation data                                       575
 #TOC> 
 #TOC> ==========================================================================

@@ -74,8 +75,8 @@ if (! require(GEOquery, quietly=TRUE)) {
 }
 # Package information:
 #  library(help = GEOquery)       # basic information
-#  browseVignettes("GEOquery")  # available vignettes
-#  data(package = "GEOquery")   # available datasets
+#  browseVignettes("GEOquery")    # available vignettes
+#  data(package = "GEOquery")     # available datasets


 # =    2  Loading a GEO Dataset  ===============================================
@@ -264,7 +265,7 @@ file.show("./data/SGD_features.README.txt")
 #     Note: the file as downloaded from SGD actually crashed RStudio due to an
 #           unbalanced quotation mark which caused R to try and read the whole
 #           of the subsequent file into a single string. This was caused by an
-#           alias gene name (B"). I have removed this abomination,
+#           alias gene name (B"). I have removed this abomination
 #           by editing the file. The version in the ./data directory can be
 #           read without issues.

@@ -571,5 +572,31 @@ for (i in 1:length(myBottomC)) {
 #        and explore. There is a learning curve - but the payoffs are
 #        significant.

+# =    7  Annotation data  =====================================================
+#
+# Loading feature data "by hand", as we've done above, is usually not necessary
+# since GEO provides rich annotations in the GPL platform files, which are
+# associated with its GEO Series (GSE) files. In the code above,
+# we used getGEO("GSE3635", GSEMatrix = TRUE, getGPL = FALSE), and the GPL
+# annotations were not loaded. We could use getGPL = TRUE instead ...
+
+GSE3635annot <- getGEO("GSE3635", GSEMatrix = TRUE, getGPL = TRUE)
+GSE3635annot <- GSE3635annot[[1]]
+
+# ... and the feature data is then available in the
+# GSE3635annot@featureData@data slot:
+str(GSE3635annot@featureData@data)
+GSE3635annot@featureData@data[ 1:20 , ]
+
+# ... or we could have identified the GPL file for this set:
+GSE3635@annotation   # "GPL1914"
+
+# ... and downloaded it directly from NCBI:
+GPL1914 <- getGEO("GPL1914")
+str(GPL1914)
+
+# ... from which we can get the data - which is however NOT necessarily
+# matched to the rows of our expression dataset.
+

 # [END]
--- a/RPR-SX-PDB.R
+++ b/RPR-SX-PDB.R
@@ -16,6 +16,7 @@
 #
 # TODO:
 #          Confirm that SS residue numbers are indices
+#          Set task seed from student number
 #
 # == DO NOT SIMPLY  source()  THIS FILE! =======================================
 #
@@ -403,7 +404,7 @@ om <- c(360 + tor$omega[tor$omega < 0],
 hist(om, xlim=c(0,360))
 abline(v=180, col="red")

-# Note: a cis-peptide bond will have an omega torsion angle of around 0°
+# Note: a cis-peptide bond will have an omega torsion angle around 0°


 # =    5  H-bond lengths  ======================================================