bch441-work-abc-units/BIN-FUNC-Semantic_similarity.R
2017-11-13 00:51:04 -05:00

177 lines
5.7 KiB
R

# BIN-FUNC_Semantic_similarity.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-FUNC_Semantic_similarity unit.
#
# Version: 1.0
#
# Date: 2017 11 12
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.0 New code.
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------------------------------
#TOC> 1 Preparations: Packages, AnnotationDB, Setup 39
#TOC> 2 Fetch GO Annotations 89
#TOC> 3 Semantic Similarities 98
#TOC> 4 GO Term Enrichment in Gene Sets 116
#TOC>
#TOC> ==========================================================================
# = 1 Preparations: Packages, AnnotationDB, Setup =========================
# GOSim is an R-package in the Bioconductor project.
if (! require(GOSim, quietly=TRUE)) {
if (! exists("biocLite")) {
source("https://bioconductor.org/biocLite.R")
}
biocLite("GOSim")
library(GOSim)
}
# Package information:
# library(help = GOSim) # basic information
# browseVignettes("GOSim") # available vignettes
# data(package = "GOSim") # available datasets
# GOSim loads human annotations by default. We load yeast annotations instead...
if (!require(org.Sc.sgd.db, quietly=TRUE)) {
if (! exists("biocLite")) {
source("https://bioconductor.org/biocLite.R")
}
biocLite("org.Sc.sgd.db")
library(org.Sc.sgd.db)
}
# org.Sc.sgd.db is a Bioconductor annotation database curated by SGD. Such
# databases exist for all model organisms. It's a kind of a fancy data frame
# from which we can get annotations by rows (genes) with the keys() funtion ...
keys(org.Sc.sgd.db)[1500:1510]
# ... and the types of available annotations with the columns() function
columns(org.Sc.sgd.db)
# Note that one of the columns is "GO" ... and we load that into the
# datastructures used by GOSim:
# Choose GOterms to use
setEvidenceLevel(evidences="all",
organism=org.Sc.sgdORGANISM,
gomap=org.Sc.sgdGO)
# Use Biological Process ontology
setOntology("BP", loadIC=FALSE)
# confirm that we loaded the correct ontology
head(get("gomap", envir=GOSimEnv))
# = 2 Fetch GO Annotations ================================================
# All keys being used here are yeast systematic names.
# Get one set of annotations
getGOInfo(c("YDL056W")) # Mbp1
# = 3 Semantic Similarities ===============================================
# Get semantic similarities between genes
?getGeneSim
# There are _many_ different metrics of term similarity implemented
# in this package.
# Mbp1 and...
getGeneSim("YDL056W", "YLR182W", similarity = "OA") # Swi6 - MCB complex
getGeneSim("YDL056W", "YER111C", similarity = "OA") # Swi4 - collaborators
getGeneSim("YDL056W", "YBR160W", similarity = "OA") # Cdc28 - mediator
getGeneSim("YDL056W", "YGR108W", similarity = "OA") # Clb1 - antagonist
getGeneSim("YDL056W", "YLR079W", similarity = "OA") # Sic1 - antagonist
getGeneSim("YDL056W", "YJL130C", similarity = "OA") # Pgk1 - Gluconeogenesis
# = 4 GO Term Enrichment in Gene Sets =====================================
# Calculating GO term enrichment in gene sets is done with the topGO package.
if (! require(topGO, quietly=TRUE)) {
if (! exists("biocLite")) {
source("https://bioconductor.org/biocLite.R")
}
biocLite("topGO")
library(topGO)
}
# Package information:
# library(help = topGO) # basic information
# browseVignettes("topGO") # available vignettes
# data(package = "topGO") # available datasets
# Let's define a gene set: GOterm enrichment for G1/S switch activators:
mySet <- c("YFR028C", # Cdc14
"YDL056W", # Mbp1
"YLR182W", # Swi6
"YER111C", # Swi4
"YOR083W", # Whi5
"YBR160W", # Cdc28
"YMR199W", # Cln1
"YPL256C", # Cln2
"YAL040C") # Cln3
allGenes <- keys(org.Sc.sgd.db)
allGenes <- allGenes[grep("^Y", allGenes)] # This is the context against which
# we define enrichment
myEnr <- GOenrichment(mySet, allGenes)
sort(myEnr$p.values) # Any significantly enriched terms? All of these are ...
#Yes: most significantly enriched is GO:0071931. What is this?
getGOTerm("GO:0071931") # ... makes sense.
(fullSet <- myEnr$genes$`GO:0071931`) # What genes are annotated to this term?
intersect(mySet, fullSet) # These are in both sets
setdiff(mySet, fullSet) # These mySet members are not annotated to that term
setdiff(fullSet, mySet) # These are annotated to that term but not in mySet.
# ... that's the most interesting set. From a set of
# genes we have identified a function that they
# share, and that shared function has allowed us
# to identify
# What are these genes?
# Select annotations from the annotation database:
select(org.Sc.sgd.db,
keys = setdiff(fullSet, mySet),
columns = c("COMMON", "DESCRIPTION"))
# Note that these annotations are partially redundant to several different
# aliases of the same three genes.
# [END]