Add section on GPL annotations to RPR-GEO2R

This commit is contained in:
hyginn 2018-01-28 18:23:21 -05:00
parent 359434d863
commit c4fed71a7f
4 changed files with 79 additions and 28 deletions

View File

@ -238,7 +238,7 @@ for (i in seq_along(highScoringRanges$lengths)) {
# We computed a T-Coffee alignment at the EBI. msa has no native import function # We computed a T-Coffee alignment at the EBI. msa has no native import function
# so we need to improvise, and it's a bit of a pain to do - but a good # so we need to improvise, and it's a bit of a pain to do - but a good
# illustration of startegies to convert data into any kind of object: # illustration of strategies to convert data into any kind of object:
# - read an .aln file # - read an .aln file
# - adjust the sequence names # - adjust the sequence names
# - convert to msaAAMultipleAlignment object # - convert to msaAAMultipleAlignment object
@ -452,7 +452,7 @@ legend("bottomright",
cex = 0.7, cex = 0.7,
bty = "n") bty = "n")
# Your alignment is going to be differnte from mine, due to the inclusion of # Your alignment is going to be different from mine, due to the inclusion of
# MYSPE - but what I see is that MUSCLE gives the highest score overall, and # MYSPE - but what I see is that MUSCLE gives the highest score overall, and
# achieves this with fewer indels then most, and the lowest number of gaps of # achieves this with fewer indels then most, and the lowest number of gaps of
# all algorithms. # all algorithms.

View File

@ -27,7 +27,7 @@
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> ----------------------------------------------------------------- #TOC> -----------------------------------------------------------------
#TOC> 1 A Relational Datamodel in R: review 62 #TOC> 1 A Relational Datamodel in R: review 62
@ -50,7 +50,7 @@
#TOC> 3.3 Create an R script to create your own database 540 #TOC> 3.3 Create an R script to create your own database 540
#TOC> 3.3.1 Check and validate 560 #TOC> 3.3.1 Check and validate 560
#TOC> 3.4 Task: submit for credit (part 2/2) 601 #TOC> 3.4 Task: submit for credit (part 2/2) 601
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
@ -205,7 +205,7 @@ str(philDB)
# go back, re-read, play with it, and ask for help. This is essential. # go back, re-read, play with it, and ask for help. This is essential.
# === 1.1.1 completing the database # === 1.1.1 completing the database
# Next I'll add one more person, and create the other two tables: # Next I'll add one more person, and create the other two tables:
@ -311,7 +311,7 @@ for (ID in pID) {
# Have a look at the structure of the yeast Mbp1 protein data: # Have a look at the structure of the yeast Mbp1 protein data:
file.edit("./data/MBP1_SACCE.json") file.show("./data/MBP1_SACCE.json")
# - The whole thing is an array: [ ... ]. This is not necessary for a single # - The whole thing is an array: [ ... ]. This is not necessary for a single
# object, but we will have more objects in other files. And it's perfectly # object, but we will have more objects in other files. And it's perfectly
@ -369,7 +369,7 @@ dbSanitizeSequence(x)
# == 2.3 Create a protein table for our data model ========================= # == 2.3 Create a protein table for our data model =========================
# === 2.3.1 Initialize the database # === 2.3.1 Initialize the database
# The function dbInit contains all the code to return a list of empty # The function dbInit contains all the code to return a list of empty
@ -381,7 +381,7 @@ myDB <- dbInit()
str(myDB) str(myDB)
# === 2.3.2 Add data # === 2.3.2 Add data
# fromJSON() returns a dataframe that we can readily process to add data # fromJSON() returns a dataframe that we can readily process to add data
@ -428,7 +428,7 @@ source("./scripts/ABC-createRefDB.R")
str(myDB) str(myDB)
# === 2.4.1 Examples of navigating the database # === 2.4.1 Examples of navigating the database
# You can look at the contents of the tables in the usual way we access # You can look at the contents of the tables in the usual way we access
@ -552,7 +552,7 @@ myDB$taxonomy$species[sel]
# in any of the JSON files. Later you will add more information ... # in any of the JSON files. Later you will add more information ...
# === 3.3.1 Check and validate # === 3.3.1 Check and validate
# Is your protein named according to the pattern "MBP1_MYSPE"? It should be. # Is your protein named according to the pattern "MBP1_MYSPE"? It should be.

View File

@ -3,12 +3,14 @@
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the RPR_GEO2R unit. # R code accompanying the RPR_GEO2R unit.
# #
# Version: 0.1 # Version: 1.1
# #
# Date: 2017 MM DD # Date: 2017 09 - 2018 01
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe <boris.steipe@utoronto.ca>
# #
# Versions: # Versions:
# 1.1 Add section on GPL annotations
# 1.0 Updates for BCH441 2017
# 0.1 First code copied from 2016 material. # 0.1 First code copied from 2016 material.
# #
# #
@ -32,17 +34,18 @@
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> -------------------------------------------------------------------- #TOC> --------------------------------------------------------------------
#TOC> 1 Preparations 50 #TOC> 1 Preparations 53
#TOC> 2 Loading a GEO Dataset 81 #TOC> 2 Loading a GEO Dataset 84
#TOC> 3 Column wise analysis - time points 151 #TOC> 3 Column wise analysis - time points 154
#TOC> 3.1 Task - Comparison of experiments 157 #TOC> 3.1 Task - Comparison of experiments 160
#TOC> 3.2 Grouped Samples 204 #TOC> 3.2 Grouped Samples 207
#TOC> 4 Row-wise Analysis: Expression Profiles 239 #TOC> 4 Row-wise Analysis: Expression Profiles 242
#TOC> 4.1 Task - Read a table of features 274 #TOC> 4.1 Task - Read a table of features 277
#TOC> 4.2 Selected Expression profiles 322 #TOC> 4.2 Selected Expression profiles 325
#TOC> 5 Differential Expression 363 #TOC> 5 Differential Expression 366
#TOC> 5.1 Final task: Gene descriptions 487 #TOC> 5.1 Final task: Gene descriptions 490
#TOC> 6 Improving on Discovery by Differential Expression 492 #TOC> 6 Improving on Discovery by Differential Expression 495
#TOC> 7 Annotation data 577
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
@ -74,8 +77,8 @@ if (! require(GEOquery, quietly=TRUE)) {
} }
# Package information: # Package information:
# library(help = GEOquery) # basic information # library(help = GEOquery) # basic information
# browseVignettes("GEOquery") # available vignettes # browseVignettes("GEOquery") # available vignettes
# data(package = "GEOquery") # available datasets # data(package = "GEOquery") # available datasets
# = 2 Loading a GEO Dataset =============================================== # = 2 Loading a GEO Dataset ===============================================
@ -264,7 +267,7 @@ file.show("./data/SGD_features.README.txt")
# Note: the file as downloaded from SGD actually crashed RStudio due to an # Note: the file as downloaded from SGD actually crashed RStudio due to an
# unbalanced quotation mark which caused R to try and read the whole # unbalanced quotation mark which caused R to try and read the whole
# of the subsequent file into a single string. This was caused by an # of the subsequent file into a single string. This was caused by an
# alias gene name (B"). I have removed this abomination, # alias gene name (B"). I have removed this abomination
# by editing the file. The version in the ./data directory can be # by editing the file. The version in the ./data directory can be
# read without issues. # read without issues.
@ -571,5 +574,52 @@ for (i in 1:length(myBottomC)) {
# and explore. There is a learning curve - but the payoffs are # and explore. There is a learning curve - but the payoffs are
# significant. # significant.
# = 7 Annotation data =====================================================
#
# Loading feature data "by hand" as we've done above, is usually not necessary
# since GEO provides rich annotations in the GPL platform files, which are
# associated with its Gene Expression Sets files. In the code above,
# we used getGEO("GSE3635", GSEMatrix = TRUE, getGPL = FALSE), and the GPL
# annotations were not loaded. We could use getGPL = TRUE instead ...
GSE3635annot <- getGEO("GSE3635", GSEMatrix = TRUE, getGPL = TRUE)
GSE3635annot <- GSE3635annot[[1]]
# ... and the feature data is then available in the GSE3635@featureData@data
# slot:
str(GSE3635annot@featureData@data)
GSE3635annot@featureData@data[ 1:20 , ]
# ... from where you can access the columns you need, e.g. spotIDs and gene
# symbols:
myAnnot <- GSE3635annot@featureData@data[ , c("SPOT_ID", "Gene")]
str(myAnnot)
# ... Note that this is a data frame, but - oy veh - the gene symbols are
# factors. Really, we need to fix this! To convert a factor into a string,
# we need to cast it to character.
myAnnot$Gene <- as.character(myAnnot$Gene)
# ... whereupon we can find things we might be looking for ...
myAnnot[which(myAnnot$Gene == "MBP1"), ]
# ... or identify rows that might give us trouble, such as probes that
# hybridize to more than one gene.
# Alternatively, we could have identified the GPL file for this set:
GSE3635@annotation # "GPL1914"
# ... and downloaded it directly from NCBI:
GPL1914 <- getGEO("GPL1914")
str(GPL1914)
# ... from which we can get the data - which is however NOT necessarily
# matched to the rows of our expression dataset. Note that here to: the majority
# of data elements are factors and will likely have to be converted before
# use.
# [END] # [END]

View File

@ -16,6 +16,7 @@
# #
# TODO: # TODO:
# Confirm that SS residue numbers are indices # Confirm that SS residue numbers are indices
# Set task seed from student number
# #
# == DO NOT SIMPLY source() THIS FILE! ======================================= # == DO NOT SIMPLY source() THIS FILE! =======================================
# #
@ -403,7 +404,7 @@ om <- c(360 + tor$omega[tor$omega < 0],
hist(om, xlim=c(0,360)) hist(om, xlim=c(0,360))
abline(v=180, col="red") abline(v=180, col="red")
# Note: a cis-peptide bond will have an omega torsion angle of around 0° # Note: a cis-peptide bond will have an omega torsion angle around 0°
# = 5 H-bond lengths ====================================================== # = 5 H-bond lengths ======================================================