Add section on GPL annotations to RPR-GEO2R
This commit is contained in:
parent
359434d863
commit
c4fed71a7f
@ -238,7 +238,7 @@ for (i in seq_along(highScoringRanges$lengths)) {
|
|||||||
|
|
||||||
# We computed a T-Coffee alignment at the EBI. msa has no native import function
|
# We computed a T-Coffee alignment at the EBI. msa has no native import function
|
||||||
# so we need to improvise, and it's a bit of a pain to do - but a good
|
# so we need to improvise, and it's a bit of a pain to do - but a good
|
||||||
# illustration of startegies to convert data into any kind of object:
|
# illustration of strategies to convert data into any kind of object:
|
||||||
# - read an .aln file
|
# - read an .aln file
|
||||||
# - adjust the sequence names
|
# - adjust the sequence names
|
||||||
# - convert to msaAAMultipleAlignment object
|
# - convert to msaAAMultipleAlignment object
|
||||||
@ -452,7 +452,7 @@ legend("bottomright",
|
|||||||
cex = 0.7,
|
cex = 0.7,
|
||||||
bty = "n")
|
bty = "n")
|
||||||
|
|
||||||
# Your alignment is going to be differnte from mine, due to the inclusion of
|
# Your alignment is going to be different from mine, due to the inclusion of
|
||||||
# MYSPE - but what I see is that MUSCLE gives the highest score overall, and
|
# MYSPE - but what I see is that MUSCLE gives the highest score overall, and
|
||||||
# achieves this with fewer indels then most, and the lowest number of gaps of
|
# achieves this with fewer indels then most, and the lowest number of gaps of
|
||||||
# all algorithms.
|
# all algorithms.
|
||||||
|
@ -27,7 +27,7 @@
|
|||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> -----------------------------------------------------------------
|
#TOC> -----------------------------------------------------------------
|
||||||
#TOC> 1 A Relational Datamodel in R: review 62
|
#TOC> 1 A Relational Datamodel in R: review 62
|
||||||
@ -50,7 +50,7 @@
|
|||||||
#TOC> 3.3 Create an R script to create your own database 540
|
#TOC> 3.3 Create an R script to create your own database 540
|
||||||
#TOC> 3.3.1 Check and validate 560
|
#TOC> 3.3.1 Check and validate 560
|
||||||
#TOC> 3.4 Task: submit for credit (part 2/2) 601
|
#TOC> 3.4 Task: submit for credit (part 2/2) 601
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
@ -205,7 +205,7 @@ str(philDB)
|
|||||||
# go back, re-read, play with it, and ask for help. This is essential.
|
# go back, re-read, play with it, and ask for help. This is essential.
|
||||||
|
|
||||||
|
|
||||||
# === 1.1.1 completing the database
|
# === 1.1.1 completing the database
|
||||||
|
|
||||||
|
|
||||||
# Next I'll add one more person, and create the other two tables:
|
# Next I'll add one more person, and create the other two tables:
|
||||||
@ -311,7 +311,7 @@ for (ID in pID) {
|
|||||||
|
|
||||||
|
|
||||||
# Have a look at the structure of the yeast Mbp1 protein data:
|
# Have a look at the structure of the yeast Mbp1 protein data:
|
||||||
file.edit("./data/MBP1_SACCE.json")
|
file.show("./data/MBP1_SACCE.json")
|
||||||
|
|
||||||
# - The whole thing is an array: [ ... ]. This is not necessary for a single
|
# - The whole thing is an array: [ ... ]. This is not necessary for a single
|
||||||
# object, but we will have more objects in other files. And it's perfectly
|
# object, but we will have more objects in other files. And it's perfectly
|
||||||
@ -369,7 +369,7 @@ dbSanitizeSequence(x)
|
|||||||
|
|
||||||
# == 2.3 Create a protein table for our data model =========================
|
# == 2.3 Create a protein table for our data model =========================
|
||||||
|
|
||||||
# === 2.3.1 Initialize the database
|
# === 2.3.1 Initialize the database
|
||||||
|
|
||||||
|
|
||||||
# The function dbInit contains all the code to return a list of empty
|
# The function dbInit contains all the code to return a list of empty
|
||||||
@ -381,7 +381,7 @@ myDB <- dbInit()
|
|||||||
str(myDB)
|
str(myDB)
|
||||||
|
|
||||||
|
|
||||||
# === 2.3.2 Add data
|
# === 2.3.2 Add data
|
||||||
|
|
||||||
|
|
||||||
# fromJSON() returns a dataframe that we can readily process to add data
|
# fromJSON() returns a dataframe that we can readily process to add data
|
||||||
@ -428,7 +428,7 @@ source("./scripts/ABC-createRefDB.R")
|
|||||||
str(myDB)
|
str(myDB)
|
||||||
|
|
||||||
|
|
||||||
# === 2.4.1 Examples of navigating the database
|
# === 2.4.1 Examples of navigating the database
|
||||||
|
|
||||||
|
|
||||||
# You can look at the contents of the tables in the usual way we access
|
# You can look at the contents of the tables in the usual way we access
|
||||||
@ -552,7 +552,7 @@ myDB$taxonomy$species[sel]
|
|||||||
# in any of the JSON files. Later you will add more information ...
|
# in any of the JSON files. Later you will add more information ...
|
||||||
|
|
||||||
|
|
||||||
# === 3.3.1 Check and validate
|
# === 3.3.1 Check and validate
|
||||||
|
|
||||||
|
|
||||||
# Is your protein named according to the pattern "MBP1_MYSPE"? It should be.
|
# Is your protein named according to the pattern "MBP1_MYSPE"? It should be.
|
||||||
|
84
RPR-GEO2R.R
84
RPR-GEO2R.R
@ -3,12 +3,14 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the RPR_GEO2R unit.
|
# R code accompanying the RPR_GEO2R unit.
|
||||||
#
|
#
|
||||||
# Version: 0.1
|
# Version: 1.1
|
||||||
#
|
#
|
||||||
# Date: 2017 MM DD
|
# Date: 2017 09 - 2018 01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe <boris.steipe@utoronto.ca>
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.1 Add section on GPL annotations
|
||||||
|
# 1.0 Updates for BCH441 2017
|
||||||
# 0.1 First code copied from 2016 material.
|
# 0.1 First code copied from 2016 material.
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
@ -32,17 +34,18 @@
|
|||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> --------------------------------------------------------------------
|
#TOC> --------------------------------------------------------------------
|
||||||
#TOC> 1 Preparations 50
|
#TOC> 1 Preparations 53
|
||||||
#TOC> 2 Loading a GEO Dataset 81
|
#TOC> 2 Loading a GEO Dataset 84
|
||||||
#TOC> 3 Column wise analysis - time points 151
|
#TOC> 3 Column wise analysis - time points 154
|
||||||
#TOC> 3.1 Task - Comparison of experiments 157
|
#TOC> 3.1 Task - Comparison of experiments 160
|
||||||
#TOC> 3.2 Grouped Samples 204
|
#TOC> 3.2 Grouped Samples 207
|
||||||
#TOC> 4 Row-wise Analysis: Expression Profiles 239
|
#TOC> 4 Row-wise Analysis: Expression Profiles 242
|
||||||
#TOC> 4.1 Task - Read a table of features 274
|
#TOC> 4.1 Task - Read a table of features 277
|
||||||
#TOC> 4.2 Selected Expression profiles 322
|
#TOC> 4.2 Selected Expression profiles 325
|
||||||
#TOC> 5 Differential Expression 363
|
#TOC> 5 Differential Expression 366
|
||||||
#TOC> 5.1 Final task: Gene descriptions 487
|
#TOC> 5.1 Final task: Gene descriptions 490
|
||||||
#TOC> 6 Improving on Discovery by Differential Expression 492
|
#TOC> 6 Improving on Discovery by Differential Expression 495
|
||||||
|
#TOC> 7 Annotation data 577
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -74,8 +77,8 @@ if (! require(GEOquery, quietly=TRUE)) {
|
|||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = GEOquery) # basic information
|
# library(help = GEOquery) # basic information
|
||||||
# browseVignettes("GEOquery") # available vignettes
|
# browseVignettes("GEOquery") # available vignettes
|
||||||
# data(package = "GEOquery") # available datasets
|
# data(package = "GEOquery") # available datasets
|
||||||
|
|
||||||
|
|
||||||
# = 2 Loading a GEO Dataset ===============================================
|
# = 2 Loading a GEO Dataset ===============================================
|
||||||
@ -264,7 +267,7 @@ file.show("./data/SGD_features.README.txt")
|
|||||||
# Note: the file as downloaded from SGD actually crashed RStudio due to an
|
# Note: the file as downloaded from SGD actually crashed RStudio due to an
|
||||||
# unbalanced quotation mark which caused R to try and read the whole
|
# unbalanced quotation mark which caused R to try and read the whole
|
||||||
# of the subsequent file into a single string. This was caused by an
|
# of the subsequent file into a single string. This was caused by an
|
||||||
# alias gene name (B"). I have removed this abomination,
|
# alias gene name (B"). I have removed this abomination
|
||||||
# by editing the file. The version in the ./data directory can be
|
# by editing the file. The version in the ./data directory can be
|
||||||
# read without issues.
|
# read without issues.
|
||||||
|
|
||||||
@ -571,5 +574,52 @@ for (i in 1:length(myBottomC)) {
|
|||||||
# and explore. There is a learning curve - but the payoffs are
|
# and explore. There is a learning curve - but the payoffs are
|
||||||
# significant.
|
# significant.
|
||||||
|
|
||||||
|
# = 7 Annotation data =====================================================
|
||||||
|
#
|
||||||
|
# Loading feature data "by hand" as we've done above, is usually not necessary
|
||||||
|
# since GEO provides rich annotations in the GPL platform files, which are
|
||||||
|
# associated with its Gene Expression Sets files. In the code above,
|
||||||
|
# we used getGEO("GSE3635", GSEMatrix = TRUE, getGPL = FALSE), and the GPL
|
||||||
|
# annotations were not loaded. We could use getGPL = TRUE instead ...
|
||||||
|
|
||||||
|
GSE3635annot <- getGEO("GSE3635", GSEMatrix = TRUE, getGPL = TRUE)
|
||||||
|
GSE3635annot <- GSE3635annot[[1]]
|
||||||
|
|
||||||
|
# ... and the feature data is then available in the GSE3635@featureData@data
|
||||||
|
# slot:
|
||||||
|
str(GSE3635annot@featureData@data)
|
||||||
|
GSE3635annot@featureData@data[ 1:20 , ]
|
||||||
|
|
||||||
|
# ... from where you can access the columns you need, e.g. spotIDs and gene
|
||||||
|
# symbols:
|
||||||
|
|
||||||
|
myAnnot <- GSE3635annot@featureData@data[ , c("SPOT_ID", "Gene")]
|
||||||
|
str(myAnnot)
|
||||||
|
|
||||||
|
# ... Note that this is a data frame, but - oy veh - the gene symbols are
|
||||||
|
# factors. Really, we need to fix this! To convert a factor into a string,
|
||||||
|
# we need to cast it to character.
|
||||||
|
|
||||||
|
myAnnot$Gene <- as.character(myAnnot$Gene)
|
||||||
|
|
||||||
|
# ... whereupon we can find things we might be looking for ...
|
||||||
|
myAnnot[which(myAnnot$Gene == "MBP1"), ]
|
||||||
|
|
||||||
|
# ... or identify rows that might give us trouble, such as probes that
|
||||||
|
# hybridize to more than one gene.
|
||||||
|
|
||||||
|
|
||||||
|
# Alternatively, we could have identified the GPL file for this set:
|
||||||
|
GSE3635@annotation # "GPL1914"
|
||||||
|
|
||||||
|
# ... and downloaded it directly from NCBI:
|
||||||
|
GPL1914 <- getGEO("GPL1914")
|
||||||
|
str(GPL1914)
|
||||||
|
|
||||||
|
# ... from which we can get the data - which is however NOT necessarily
|
||||||
|
# matched to the rows of our expression dataset. Note that here to: the majority
|
||||||
|
# of data elements are factors and will likely have to be converted before
|
||||||
|
# use.
|
||||||
|
|
||||||
|
|
||||||
# [END]
|
# [END]
|
||||||
|
@ -16,6 +16,7 @@
|
|||||||
#
|
#
|
||||||
# TODO:
|
# TODO:
|
||||||
# Confirm that SS residue numbers are indices
|
# Confirm that SS residue numbers are indices
|
||||||
|
# Set task seed from student number
|
||||||
#
|
#
|
||||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||||
#
|
#
|
||||||
@ -403,7 +404,7 @@ om <- c(360 + tor$omega[tor$omega < 0],
|
|||||||
hist(om, xlim=c(0,360))
|
hist(om, xlim=c(0,360))
|
||||||
abline(v=180, col="red")
|
abline(v=180, col="red")
|
||||||
|
|
||||||
# Note: a cis-peptide bond will have an omega torsion angle of around 0°
|
# Note: a cis-peptide bond will have an omega torsion angle around 0°
|
||||||
|
|
||||||
|
|
||||||
# = 5 H-bond lengths ======================================================
|
# = 5 H-bond lengths ======================================================
|
||||||
|
Loading…
Reference in New Issue
Block a user