Add section on GPL annotations to RPR-GEO2R
This commit is contained in:
		@@ -238,7 +238,7 @@ for (i in seq_along(highScoringRanges$lengths)) {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
# We computed a T-Coffee alignment at the EBI. msa has no native import function
 | 
					# We computed a T-Coffee alignment at the EBI. msa has no native import function
 | 
				
			||||||
# so we need to improvise, and it's a bit of a pain to do - but a good
 | 
					# so we need to improvise, and it's a bit of a pain to do - but a good
 | 
				
			||||||
# illustration of startegies to convert data into any kind of object:
 | 
					# illustration of strategies to convert data into any kind of object:
 | 
				
			||||||
#   -  read an .aln file
 | 
					#   -  read an .aln file
 | 
				
			||||||
#   -  adjust the sequence names
 | 
					#   -  adjust the sequence names
 | 
				
			||||||
#   -  convert to msaAAMultipleAlignment object
 | 
					#   -  convert to msaAAMultipleAlignment object
 | 
				
			||||||
@@ -452,7 +452,7 @@ legend("bottomright",
 | 
				
			|||||||
       cex = 0.7,
 | 
					       cex = 0.7,
 | 
				
			||||||
       bty = "n")
 | 
					       bty = "n")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Your alignment is going to be differnte from mine, due to the inclusion of
 | 
					# Your alignment is going to be different from mine, due to the inclusion of
 | 
				
			||||||
# MYSPE - but what I see is that MUSCLE gives the highest score overall, and
 | 
					# MYSPE - but what I see is that MUSCLE gives the highest score overall, and
 | 
				
			||||||
# achieves this with fewer indels then most, and the lowest number of gaps of
 | 
					# achieves this with fewer indels then most, and the lowest number of gaps of
 | 
				
			||||||
# all algorithms.
 | 
					# all algorithms.
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -27,7 +27,7 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#TOC> ==========================================================================
 | 
					#TOC> ==========================================================================
 | 
				
			||||||
#TOC> 
 | 
					#TOC>
 | 
				
			||||||
#TOC>   Section  Title                                             Line
 | 
					#TOC>   Section  Title                                             Line
 | 
				
			||||||
#TOC> -----------------------------------------------------------------
 | 
					#TOC> -----------------------------------------------------------------
 | 
				
			||||||
#TOC>   1        A Relational Datamodel in R: review                 62
 | 
					#TOC>   1        A Relational Datamodel in R: review                 62
 | 
				
			||||||
@@ -50,7 +50,7 @@
 | 
				
			|||||||
#TOC>   3.3      Create an R script to create your own database     540
 | 
					#TOC>   3.3      Create an R script to create your own database     540
 | 
				
			||||||
#TOC>   3.3.1    Check and validate                                 560
 | 
					#TOC>   3.3.1    Check and validate                                 560
 | 
				
			||||||
#TOC>   3.4      Task: submit for credit (part 2/2)                 601
 | 
					#TOC>   3.4      Task: submit for credit (part 2/2)                 601
 | 
				
			||||||
#TOC> 
 | 
					#TOC>
 | 
				
			||||||
#TOC> ==========================================================================
 | 
					#TOC> ==========================================================================
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -205,7 +205,7 @@ str(philDB)
 | 
				
			|||||||
# go back, re-read, play with it, and ask for help. This is essential.
 | 
					# go back, re-read, play with it, and ask for help. This is essential.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# ===  1.1.1  completing the database                       
 | 
					# ===  1.1.1  completing the database
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Next I'll add one more person, and create the other two tables:
 | 
					# Next I'll add one more person, and create the other two tables:
 | 
				
			||||||
@@ -311,7 +311,7 @@ for (ID in pID) {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Have a look at the structure of the yeast Mbp1 protein data:
 | 
					# Have a look at the structure of the yeast Mbp1 protein data:
 | 
				
			||||||
file.edit("./data/MBP1_SACCE.json")
 | 
					file.show("./data/MBP1_SACCE.json")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# - The whole thing is an array: [ ... ]. This is not necessary for a single
 | 
					# - The whole thing is an array: [ ... ]. This is not necessary for a single
 | 
				
			||||||
#     object, but we will have more objects in other files. And it's perfectly
 | 
					#     object, but we will have more objects in other files. And it's perfectly
 | 
				
			||||||
@@ -369,7 +369,7 @@ dbSanitizeSequence(x)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
# ==   2.3  Create a protein table for our data model  =========================
 | 
					# ==   2.3  Create a protein table for our data model  =========================
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# ===  2.3.1  Initialize the database                       
 | 
					# ===  2.3.1  Initialize the database
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# The function dbInit contains all the code to return a list of empty
 | 
					# The function dbInit contains all the code to return a list of empty
 | 
				
			||||||
@@ -381,7 +381,7 @@ myDB <- dbInit()
 | 
				
			|||||||
str(myDB)
 | 
					str(myDB)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# ===  2.3.2  Add data                                      
 | 
					# ===  2.3.2  Add data
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# fromJSON() returns a dataframe that we can readily process to add data
 | 
					# fromJSON() returns a dataframe that we can readily process to add data
 | 
				
			||||||
@@ -428,7 +428,7 @@ source("./scripts/ABC-createRefDB.R")
 | 
				
			|||||||
str(myDB)
 | 
					str(myDB)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# ===  2.4.1  Examples of navigating the database           
 | 
					# ===  2.4.1  Examples of navigating the database
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# You can look at the contents of the tables in the usual way we access
 | 
					# You can look at the contents of the tables in the usual way we access
 | 
				
			||||||
@@ -552,7 +552,7 @@ myDB$taxonomy$species[sel]
 | 
				
			|||||||
# in any of the JSON files. Later you will add more information ...
 | 
					# in any of the JSON files. Later you will add more information ...
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# ===  3.3.1  Check and validate                            
 | 
					# ===  3.3.1  Check and validate
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Is your protein named according to the pattern "MBP1_MYSPE"? It should be.
 | 
					# Is your protein named according to the pattern "MBP1_MYSPE"? It should be.
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										84
									
								
								RPR-GEO2R.R
									
									
									
									
									
								
							
							
						
						
									
										84
									
								
								RPR-GEO2R.R
									
									
									
									
									
								
							@@ -3,12 +3,14 @@
 | 
				
			|||||||
# Purpose:  A Bioinformatics Course:
 | 
					# Purpose:  A Bioinformatics Course:
 | 
				
			||||||
#              R code accompanying the RPR_GEO2R unit.
 | 
					#              R code accompanying the RPR_GEO2R unit.
 | 
				
			||||||
#
 | 
					#
 | 
				
			||||||
# Version:  0.1
 | 
					# Version:  1.1
 | 
				
			||||||
#
 | 
					#
 | 
				
			||||||
# Date:     2017  MM  DD
 | 
					# Date:     2017 09  -  2018 01
 | 
				
			||||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
					# Author:   Boris Steipe <boris.steipe@utoronto.ca>
 | 
				
			||||||
#
 | 
					#
 | 
				
			||||||
# Versions:
 | 
					# Versions:
 | 
				
			||||||
 | 
					#           1.1    Add section on GPL annotations
 | 
				
			||||||
 | 
					#           1.0    Updates for BCH441 2017
 | 
				
			||||||
#           0.1    First code copied from 2016 material.
 | 
					#           0.1    First code copied from 2016 material.
 | 
				
			||||||
#
 | 
					#
 | 
				
			||||||
#
 | 
					#
 | 
				
			||||||
@@ -32,17 +34,18 @@
 | 
				
			|||||||
#TOC> 
 | 
					#TOC> 
 | 
				
			||||||
#TOC>   Section  Title                                                Line
 | 
					#TOC>   Section  Title                                                Line
 | 
				
			||||||
#TOC> --------------------------------------------------------------------
 | 
					#TOC> --------------------------------------------------------------------
 | 
				
			||||||
#TOC>   1        Preparations                                           50
 | 
					#TOC>   1        Preparations                                           53
 | 
				
			||||||
#TOC>   2        Loading a GEO Dataset                                  81
 | 
					#TOC>   2        Loading a GEO Dataset                                  84
 | 
				
			||||||
#TOC>   3        Column wise analysis - time points                    151
 | 
					#TOC>   3        Column wise analysis - time points                    154
 | 
				
			||||||
#TOC>   3.1      Task - Comparison of experiments                      157
 | 
					#TOC>   3.1      Task - Comparison of experiments                      160
 | 
				
			||||||
#TOC>   3.2      Grouped Samples                                       204
 | 
					#TOC>   3.2      Grouped Samples                                       207
 | 
				
			||||||
#TOC>   4        Row-wise Analysis: Expression Profiles                239
 | 
					#TOC>   4        Row-wise Analysis: Expression Profiles                242
 | 
				
			||||||
#TOC>   4.1      Task - Read a table of features                       274
 | 
					#TOC>   4.1      Task - Read a table of features                       277
 | 
				
			||||||
#TOC>   4.2      Selected Expression profiles                          322
 | 
					#TOC>   4.2      Selected Expression profiles                          325
 | 
				
			||||||
#TOC>   5        Differential Expression                               363
 | 
					#TOC>   5        Differential Expression                               366
 | 
				
			||||||
#TOC>   5.1      Final task: Gene descriptions                         487
 | 
					#TOC>   5.1      Final task: Gene descriptions                         490
 | 
				
			||||||
#TOC>   6        Improving on Discovery by Differential Expression     492
 | 
					#TOC>   6        Improving on Discovery by Differential Expression     495
 | 
				
			||||||
 | 
					#TOC>   7        Annotation data                                       577
 | 
				
			||||||
#TOC> 
 | 
					#TOC> 
 | 
				
			||||||
#TOC> ==========================================================================
 | 
					#TOC> ==========================================================================
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -74,8 +77,8 @@ if (! require(GEOquery, quietly=TRUE)) {
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
# Package information:
 | 
					# Package information:
 | 
				
			||||||
#  library(help = GEOquery)       # basic information
 | 
					#  library(help = GEOquery)       # basic information
 | 
				
			||||||
#  browseVignettes("GEOquery")  # available vignettes
 | 
					#  browseVignettes("GEOquery")    # available vignettes
 | 
				
			||||||
#  data(package = "GEOquery")   # available datasets
 | 
					#  data(package = "GEOquery")      # available datasets
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# =    2  Loading a GEO Dataset  ===============================================
 | 
					# =    2  Loading a GEO Dataset  ===============================================
 | 
				
			||||||
@@ -264,7 +267,7 @@ file.show("./data/SGD_features.README.txt")
 | 
				
			|||||||
#     Note: the file as downloaded from SGD actually crashed RStudio due to an
 | 
					#     Note: the file as downloaded from SGD actually crashed RStudio due to an
 | 
				
			||||||
#           unbalanced quotation mark which caused R to try and read the whole
 | 
					#           unbalanced quotation mark which caused R to try and read the whole
 | 
				
			||||||
#           of the subsequent file into a single string. This was caused by an
 | 
					#           of the subsequent file into a single string. This was caused by an
 | 
				
			||||||
#           alias gene name (B"). I have removed this abomination,
 | 
					#           alias gene name (B"). I have removed this abomination
 | 
				
			||||||
#           by editing the file. The version in the ./data directory can be
 | 
					#           by editing the file. The version in the ./data directory can be
 | 
				
			||||||
#           read without issues.
 | 
					#           read without issues.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -571,5 +574,52 @@ for (i in 1:length(myBottomC)) {
 | 
				
			|||||||
#        and explore. There is a learning curve - but the payoffs are
 | 
					#        and explore. There is a learning curve - but the payoffs are
 | 
				
			||||||
#        significant.
 | 
					#        significant.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# =    7  Annotation data  =====================================================
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					# Loading feature data "by hand" as we've done above, is usually not necessary
 | 
				
			||||||
 | 
					# since GEO provides rich annotations in the GPL platform files, which are
 | 
				
			||||||
 | 
					# associated with its Gene Expression Sets files. In the code above,
 | 
				
			||||||
 | 
					# we used getGEO("GSE3635", GSEMatrix = TRUE, getGPL = FALSE), and the GPL
 | 
				
			||||||
 | 
					# annotations were not loaded. We could use getGPL = TRUE instead ...
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					GSE3635annot <- getGEO("GSE3635", GSEMatrix = TRUE, getGPL = TRUE)
 | 
				
			||||||
 | 
					GSE3635annot <- GSE3635annot[[1]]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# ... and the feature data is then available in the GSE3635@featureData@data
 | 
				
			||||||
 | 
					#     slot:
 | 
				
			||||||
 | 
					str(GSE3635annot@featureData@data)
 | 
				
			||||||
 | 
					GSE3635annot@featureData@data[ 1:20 , ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# ... from where you can access the columns you need, e.g. spotIDs and gene
 | 
				
			||||||
 | 
					#     symbols:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					myAnnot <- GSE3635annot@featureData@data[ , c("SPOT_ID", "Gene")]
 | 
				
			||||||
 | 
					str(myAnnot)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# ... Note that this is a data frame, but - oy veh - the gene symbols are
 | 
				
			||||||
 | 
					#     factors. Really, we need to fix this! To convert a factor into a string,
 | 
				
			||||||
 | 
					#     we need to cast it to character.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					myAnnot$Gene <- as.character(myAnnot$Gene)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# ... whereupon we can find things we might be looking for ...
 | 
				
			||||||
 | 
					myAnnot[which(myAnnot$Gene == "MBP1"), ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# ... or identify rows that might give us trouble, such as probes that
 | 
				
			||||||
 | 
					#     hybridize to more than one gene.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Alternatively, we could have identified the GPL file for this set:
 | 
				
			||||||
 | 
					GSE3635@annotation   # "GPL1914"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# ... and downloaded it directly from NCBI:
 | 
				
			||||||
 | 
					GPL1914 <- getGEO("GPL1914")
 | 
				
			||||||
 | 
					str(GPL1914)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# ... from which we can get the data - which is however NOT necessarily
 | 
				
			||||||
 | 
					# matched to the rows of our expression dataset. Note that here to: the majority
 | 
				
			||||||
 | 
					# of data elements are factors and will likely have to be converted before
 | 
				
			||||||
 | 
					# use.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# [END]
 | 
					# [END]
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -16,6 +16,7 @@
 | 
				
			|||||||
#
 | 
					#
 | 
				
			||||||
# TODO:
 | 
					# TODO:
 | 
				
			||||||
#          Confirm that SS residue numbers are indices
 | 
					#          Confirm that SS residue numbers are indices
 | 
				
			||||||
 | 
					#          Set task seed from student number
 | 
				
			||||||
#
 | 
					#
 | 
				
			||||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
					# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
				
			||||||
#
 | 
					#
 | 
				
			||||||
@@ -403,7 +404,7 @@ om <- c(360 + tor$omega[tor$omega < 0],
 | 
				
			|||||||
hist(om, xlim=c(0,360))
 | 
					hist(om, xlim=c(0,360))
 | 
				
			||||||
abline(v=180, col="red")
 | 
					abline(v=180, col="red")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Note: a cis-peptide bond will have an omega torsion angle of around 0°
 | 
					# Note: a cis-peptide bond will have an omega torsion angle around 0°
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# =    5  H-bond lengths  ======================================================
 | 
					# =    5  H-bond lengths  ======================================================
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user