From b17a9e202ad3429a716683538b3d0ed8d7d9ff9e Mon Sep 17 00:00:00 2001 From: hyginn Date: Sun, 28 Jan 2018 18:23:21 -0500 Subject: [PATCH] Add section on GPL annotations to RPR-GEO2R --- BIN-ALI-MSA.R | 4 ++-- BIN-Storing_data.R | 16 +++++++------- RPR-GEO2R.R | 55 ++++++++++++++++++++++++++++++++++------------ RPR-SX-PDB.R | 3 ++- 4 files changed, 53 insertions(+), 25 deletions(-) diff --git a/BIN-ALI-MSA.R b/BIN-ALI-MSA.R index 2fde3d2..164c420 100644 --- a/BIN-ALI-MSA.R +++ b/BIN-ALI-MSA.R @@ -238,7 +238,7 @@ for (i in seq_along(highScoringRanges$lengths)) { # We computed a T-Coffee alignment at the EBI. msa has no native import function # so we need to improvise, and it's a bit of a pain to do - but a good -# illustration of startegies to convert data into any kind of object: +# illustration of strategies to convert data into any kind of object: # - read an .aln file # - adjust the sequence names # - convert to msaAAMultipleAlignment object @@ -452,7 +452,7 @@ legend("bottomright", cex = 0.7, bty = "n") -# Your alignment is going to be differnte from mine, due to the inclusion of +# Your alignment is going to be different from mine, due to the inclusion of # MYSPE - but what I see is that MUSCLE gives the highest score overall, and # achieves this with fewer indels then most, and the lowest number of gaps of # all algorithms. 
diff --git a/BIN-Storing_data.R b/BIN-Storing_data.R index 350a261..f61da14 100644 --- a/BIN-Storing_data.R +++ b/BIN-Storing_data.R @@ -27,7 +27,7 @@ #TOC> ========================================================================== -#TOC> +#TOC> #TOC> Section Title Line #TOC> ----------------------------------------------------------------- #TOC> 1 A Relational Datamodel in R: review 62 @@ -50,7 +50,7 @@ #TOC> 3.3 Create an R script to create your own database 540 #TOC> 3.3.1 Check and validate 560 #TOC> 3.4 Task: submit for credit (part 2/2) 601 -#TOC> +#TOC> #TOC> ========================================================================== @@ -205,7 +205,7 @@ str(philDB) # go back, re-read, play with it, and ask for help. This is essential. -# === 1.1.1 completing the database +# === 1.1.1 completing the database # Next I'll add one more person, and create the other two tables: @@ -311,7 +311,7 @@ for (ID in pID) { # Have a look at the structure of the yeast Mbp1 protein data: -file.edit("./data/MBP1_SACCE.json") +file.show("./data/MBP1_SACCE.json") # - The whole thing is an array: [ ... ]. This is not necessary for a single # object, but we will have more objects in other files. And it's perfectly @@ -369,7 +369,7 @@ dbSanitizeSequence(x) # == 2.3 Create a protein table for our data model ========================= -# === 2.3.1 Initialize the database +# === 2.3.1 Initialize the database # The function dbInit contains all the code to return a list of empty @@ -381,7 +381,7 @@ myDB <- dbInit() str(myDB) -# === 2.3.2 Add data +# === 2.3.2 Add data # fromJSON() returns a dataframe that we can readily process to add data @@ -428,7 +428,7 @@ source("./scripts/ABC-createRefDB.R") str(myDB) -# === 2.4.1 Examples of navigating the database +# === 2.4.1 Examples of navigating the database # You can look at the contents of the tables in the usual way we access @@ -552,7 +552,7 @@ myDB$taxonomy$species[sel] # in any of the JSON files. Later you will add more information ... 
-# === 3.3.1 Check and validate +# === 3.3.1 Check and validate # Is your protein named according to the pattern "MBP1_MYSPE"? It should be. diff --git a/RPR-GEO2R.R b/RPR-GEO2R.R index fab7af3..e7f2372 100644 --- a/RPR-GEO2R.R +++ b/RPR-GEO2R.R @@ -32,17 +32,18 @@ #TOC> #TOC> Section Title Line #TOC> -------------------------------------------------------------------- -#TOC> 1 Preparations 50 -#TOC> 2 Loading a GEO Dataset 81 -#TOC> 3 Column wise analysis - time points 151 -#TOC> 3.1 Task - Comparison of experiments 157 -#TOC> 3.2 Grouped Samples 204 -#TOC> 4 Row-wise Analysis: Expression Profiles 239 -#TOC> 4.1 Task - Read a table of features 274 -#TOC> 4.2 Selected Expression profiles 322 -#TOC> 5 Differential Expression 363 -#TOC> 5.1 Final task: Gene descriptions 487 -#TOC> 6 Improving on Discovery by Differential Expression 492 +#TOC> 1 Preparations 51 +#TOC> 2 Loading a GEO Dataset 82 +#TOC> 3 Column wise analysis - time points 152 +#TOC> 3.1 Task - Comparison of experiments 158 +#TOC> 3.2 Grouped Samples 205 +#TOC> 4 Row-wise Analysis: Expression Profiles 240 +#TOC> 4.1 Task - Read a table of features 275 +#TOC> 4.2 Selected Expression profiles 323 +#TOC> 5 Differential Expression 364 +#TOC> 5.1 Final task: Gene descriptions 488 +#TOC> 6 Improving on Discovery by Differential Expression 493 +#TOC> 7 Annotation data 575 #TOC> #TOC> ========================================================================== @@ -74,8 +75,8 @@ if (! 
require(GEOquery, quietly=TRUE)) { } # Package information: # library(help = GEOquery) # basic information -# browseVignettes("GEOquery") # available vignettes -# data(package = "GEOquery") # available datasets +# browseVignettes("GEOquery") # available vignettes +# data(package = "GEOquery") # available datasets # = 2 Loading a GEO Dataset =============================================== @@ -264,7 +265,7 @@ file.show("./data/SGD_features.README.txt") # Note: the file as downloaded from SGD actually crashed RStudio due to an # unbalanced quotation mark which caused R to try and read the whole # of the subsequent file into a single string. This was caused by an -# alias gene name (B"). I have removed this abomination, +# alias gene name (B"). I have removed this abomination # by editing the file. The version in the ./data directory can be # read without issues. @@ -571,5 +572,31 @@ for (i in 1:length(myBottomC)) { # and explore. There is a learning curve - but the payoffs are # significant. +# = 7 Annotation data ===================================================== +# +# Loading feature data "by hand" as we've done above is usually not necessary +# since GEO provides rich annotations in the GPL platform files, which are +# associated with its Gene Expression Sets files. In the code above, +# we used getGEO("GSE3635", GSEMatrix = TRUE, getGPL = FALSE), and the GPL +# annotations were not loaded. We could use getGPL = TRUE instead ... + +GSE3635annot <- getGEO("GSE3635", GSEMatrix = TRUE, getGPL = TRUE) +GSE3635annot <- GSE3635annot[[1]] + +# ... and the feature data is then available in the +# GSE3635annot@featureData@data slot: +str(GSE3635annot@featureData@data) +GSE3635annot@featureData@data[ 1:20 , ] + +# ... or we could have identified the GPL file for this set: +GSE3635@annotation # "GPL1914" + +# ... and downloaded it directly from NCBI: +GPL1914 <- getGEO("GPL1914") +str(GPL1914) + +# ...
from which we can get the data - which is however NOT necessarily +# matched to the rows of our expression dataset. + # [END] diff --git a/RPR-SX-PDB.R b/RPR-SX-PDB.R index c4f0073..9928dc5 100644 --- a/RPR-SX-PDB.R +++ b/RPR-SX-PDB.R @@ -16,6 +16,7 @@ # # TODO: # Confirm that SS residue numbers are indices +# Set task seed from student number # # == DO NOT SIMPLY source() THIS FILE! ======================================= # @@ -403,7 +404,7 @@ om <- c(360 + tor$omega[tor$omega < 0], hist(om, xlim=c(0,360)) abline(v=180, col="red") -# Note: a cis-peptide bond will have an omega torsion angle of around 0° +# Note: a cis-peptide bond will have an omega torsion angle around 0° # = 5 H-bond lengths ======================================================