New unit
This commit is contained in:
parent
93523c6533
commit
8cb69f2986
176
BIN-FUNC-Semantic_similarity.R
Normal file
176
BIN-FUNC-Semantic_similarity.R
Normal file
@ -0,0 +1,176 @@
|
||||
# BIN-FUNC_Semantic_similarity.R
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-FUNC_Semantic_similarity unit.
|
||||
#
|
||||
# Version: 1.0
|
||||
#
|
||||
# Date: 2017 11 12
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.0 New code.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> --------------------------------------------------------------
|
||||
#TOC> 1 Preparations: Packages, AnnotationDB, Setup 39
|
||||
#TOC> 2 Fetch GO Annotations 89
|
||||
#TOC> 3 Semantic Similarities 98
|
||||
#TOC> 4 GO Term Enrichment in Gene Sets 116
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Preparations: Packages, AnnotationDB, Setup =========================
|
||||
|
||||
|
||||
# GOSim is an R-package in the Bioconductor project.
|
||||
if (! require(GOSim, quietly=TRUE)) {
|
||||
if (! exists("biocLite")) {
|
||||
source("https://bioconductor.org/biocLite.R")
|
||||
}
|
||||
biocLite("GOSim")
|
||||
library(GOSim)
|
||||
}
|
||||
# Package information:
|
||||
# library(help = GOSim) # basic information
|
||||
# browseVignettes("GOSim") # available vignettes
|
||||
# data(package = "GOSim") # available datasets
|
||||
|
||||
|
||||
# GOSim loads human annotations by default. We load yeast annotations instead...
|
||||
if (!require(org.Sc.sgd.db, quietly=TRUE)) {
|
||||
if (! exists("biocLite")) {
|
||||
source("https://bioconductor.org/biocLite.R")
|
||||
}
|
||||
biocLite("org.Sc.sgd.db")
|
||||
library(org.Sc.sgd.db)
|
||||
}
|
||||
|
||||
# org.Sc.sgd.db is a Bioconductor annotation database curated by SGD. Such
|
||||
# databases exist for all model organisms. It's a kind of a fancy data frame
|
||||
# from which we can get annotations by rows (genes) with the keys() funtion ...
|
||||
keys(org.Sc.sgd.db)[1500:1510]
|
||||
|
||||
# ... and the types of available annotations with the columns() function
|
||||
columns(org.Sc.sgd.db)
|
||||
|
||||
# Note that one of the columns is "GO" ... and we load that into the
|
||||
# datastructures used by GOSim:
|
||||
|
||||
# Choose GOterms to use
|
||||
setEvidenceLevel(evidences="all",
|
||||
organism=org.Sc.sgdORGANISM,
|
||||
gomap=org.Sc.sgdGO)
|
||||
|
||||
# Use Biological Process ontology
|
||||
setOntology("BP", loadIC=FALSE)
|
||||
|
||||
# confirm that we loaded the correct ontology
|
||||
head(get("gomap", envir=GOSimEnv))
|
||||
|
||||
|
||||
|
||||
# = 2 Fetch GO Annotations ================================================
|
||||
|
||||
|
||||
# All keys being used here are yeast systematic names.
|
||||
|
||||
# Get one set of annotations
|
||||
getGOInfo(c("YDL056W")) # Mbp1
|
||||
|
||||
|
||||
# = 3 Semantic Similarities ===============================================
|
||||
|
||||
|
||||
# Get semantic similarities between genes
|
||||
?getGeneSim
|
||||
|
||||
# There are _many_ different metrics of term similarity implemented
|
||||
# in this package.
|
||||
|
||||
# Mbp1 and...
|
||||
getGeneSim("YDL056W", "YLR182W", similarity = "OA") # Swi6 - MCB complex
|
||||
getGeneSim("YDL056W", "YER111C", similarity = "OA") # Swi4 - collaborators
|
||||
getGeneSim("YDL056W", "YBR160W", similarity = "OA") # Cdc28 - mediator
|
||||
getGeneSim("YDL056W", "YGR108W", similarity = "OA") # Clb1 - antagonist
|
||||
getGeneSim("YDL056W", "YLR079W", similarity = "OA") # Sic1 - antagonist
|
||||
getGeneSim("YDL056W", "YJL130C", similarity = "OA") # Pgk1 - Gluconeogenesis
|
||||
|
||||
|
||||
# = 4 GO Term Enrichment in Gene Sets =====================================
|
||||
|
||||
|
||||
# Calculating GO term enrichment in gene sets is done with the topGO package.
|
||||
if (! require(topGO, quietly=TRUE)) {
|
||||
if (! exists("biocLite")) {
|
||||
source("https://bioconductor.org/biocLite.R")
|
||||
}
|
||||
biocLite("topGO")
|
||||
library(topGO)
|
||||
}
|
||||
# Package information:
|
||||
# library(help = topGO) # basic information
|
||||
# browseVignettes("topGO") # available vignettes
|
||||
# data(package = "topGO") # available datasets
|
||||
|
||||
|
||||
# Let's define a gene set: GOterm enrichment for G1/S switch activators:
|
||||
mySet <- c("YFR028C", # Cdc14
|
||||
"YDL056W", # Mbp1
|
||||
"YLR182W", # Swi6
|
||||
"YER111C", # Swi4
|
||||
"YOR083W", # Whi5
|
||||
"YBR160W", # Cdc28
|
||||
"YMR199W", # Cln1
|
||||
"YPL256C", # Cln2
|
||||
"YAL040C") # Cln3
|
||||
|
||||
allGenes <- keys(org.Sc.sgd.db)
|
||||
allGenes <- allGenes[grep("^Y", allGenes)] # This is the context against which
|
||||
# we define enrichment
|
||||
|
||||
myEnr <- GOenrichment(mySet, allGenes)
|
||||
|
||||
sort(myEnr$p.values) # Any significantly enriched terms? All of these are ...
|
||||
|
||||
#Yes: most significantly enriched is GO:0071931. What is this?
|
||||
getGOTerm("GO:0071931") # ... makes sense.
|
||||
|
||||
(fullSet <- myEnr$genes$`GO:0071931`) # What genes are annotated to this term?
|
||||
|
||||
intersect(mySet, fullSet) # These are in both sets
|
||||
setdiff(mySet, fullSet) # These mySet members are not annotated to that term
|
||||
setdiff(fullSet, mySet) # These are annotated to that term but not in mySet.
|
||||
# ... that's the most interesting set. From a set of
|
||||
# genes we have identified a function that they
|
||||
# share, and that shared function has allowed us
|
||||
# to identify
|
||||
|
||||
# What are these genes?
|
||||
# Select annotations from the annotation database:
|
||||
select(org.Sc.sgd.db,
|
||||
keys = setdiff(fullSet, mySet),
|
||||
columns = c("COMMON", "DESCRIPTION"))
|
||||
|
||||
# Note that these annotations are partially redundant to several different
|
||||
# aliases of the same three genes.
|
||||
|
||||
|
||||
|
||||
# [END]
|
Loading…
Reference in New Issue
Block a user