bch441-work-abc-units/BIN-FUNC-Semantic_similarity.R

# tocID <- "BIN-FUNC_Semantic_similarity.R"
#
# ---------------------------------------------------------------------------- #
#  PATIENCE  ...                                                               #
#    Do not yet work with this code. Updates in progress. Thank you.           #
#    boris.steipe@utoronto.ca                                                  #
# ---------------------------------------------------------------------------- #
#
# Purpose:  A Bioinformatics Course:
#              R code accompanying the BIN-FUNC_Semantic_similarity unit.
#
# Version:  1.1
#
# Date:     2017  11  -  2019  01
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
#           1.1    Change from require() to requireNamespace(),
#                      use <package>::<function>() idiom throughout,
#                      use BiocManager:: not biocLite()
#           1.0    New code.
#
#
# TODO:
#
#
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================


#TOC> ==========================================================================
#TOC>
#TOC>   Section  Title                                                Line
#TOC> --------------------------------------------------------------------
#TOC>   1        Preparations: Packages, AnnotationDB, Setup            42
#TOC>   2        Fetch GO Annotations                                   98
#TOC>   3        Semantic Similarities                                 107
#TOC>   4        GO Term Enrichment in Gene Sets                       125
#TOC>
#TOC> ==========================================================================


# =    1  Preparations: Packages, AnnotationDB, Setup  =========================

# --- Installation (each package is installed only if it is missing) ----------

# BiocManager is the installer used for all Bioconductor packages below.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

# GOSim is an R package in the Bioconductor project.
if (!requireNamespace("GOSim", quietly = TRUE)) {
  BiocManager::install("GOSim")
}

# GOSim loads human annotations by default; we want the yeast annotation
# package instead.
if (!requireNamespace("org.Sc.sgd.db", quietly = TRUE)) {
  BiocManager::install("org.Sc.sgd.db")
}

# Package information:
#  library(help = GOSim)       # basic information
#  browseVignettes("GOSim")    # available vignettes
#  data(package = "GOSim")     # available datasets


# --- Loading -----------------------------------------------------------------

# GOSim makes extensive assumptions about which packages are loaded, and it
# masks many base methods. It is therefore attached in its entirety - together
# with everything it depends on - via library(). The <package>::<function>()
# syntax is still used below, but mostly for didactic reasons: with the
# package attached it is no longer a syntactic requirement.
library(GOSim)

# Bioconductor annotation packages do not work stably unless they are
# actually attached.
library(org.Sc.sgd.db)


# --- Exploring the annotation database ---------------------------------------

# org.Sc.sgd.db is a Bioconductor annotation database curated by SGD; such
# databases exist for all model organisms. Think of it as a fancy data frame:
# the keys() function lists its rows (genes) ...
AnnotationDbi::keys(org.Sc.sgd.db)[1500:1510]

# ... and the columns() function lists the kinds of annotation it offers.
AnnotationDbi::columns(org.Sc.sgd.db)


# --- Configuring GOSim -------------------------------------------------------

# One of the columns is "GO". Load the yeast GO mapping into the data
# structures that GOSim uses, accepting annotations of "all" evidence levels.
GOSim::setEvidenceLevel(
  evidences = "all",
  organism = org.Sc.sgdORGANISM,
  gomap = org.Sc.sgdGO
)

# Work with the Biological Process ("BP") ontology.
GOSim::setOntology("BP", loadIC = FALSE)

# Confirm that the correct mapping was loaded: show the first entries of the
# "gomap" object stored in GOSim's environment.
head(get("gomap", envir = GOSimEnv))


# =    2  Fetch GO Annotations  ================================================


# Every gene identifier used here is a yeast systematic name.

# Retrieve the GO annotations for a single gene
GOSim::getGOInfo("YDL056W")  # Mbp1


# =    3  Semantic Similarities  ===============================================


# Get semantic similarities between genes. The help page documents the
# available similarity measures and their parameters.
?GOSim::getGeneSim

# There are _many_ different metrics of term similarity implemented
# in this package. All comparisons below use the "OA" measure and compare
# Mbp1 (YDL056W) with one other gene each, ordered roughly from close
# functional partners to an unrelated metabolic enzyme.
#
# NOTE: the last comparison previously used YJL130C, which is the systematic
# name of URA2, not PGK1. PGK1 is YCR012W, so that ID is used here to match
# the intended "Pgk1" control.

                                                           # Mbp1 and...
GOSim::getGeneSim("YDL056W", "YLR182W", similarity = "OA") # Swi6 - MCB complex
GOSim::getGeneSim("YDL056W", "YER111C", similarity = "OA") # Swi4 - collaborators
GOSim::getGeneSim("YDL056W", "YBR160W", similarity = "OA") # Cdc28 - mediator
GOSim::getGeneSim("YDL056W", "YGR108W", similarity = "OA") # Clb1 - antagonist
GOSim::getGeneSim("YDL056W", "YLR079W", similarity = "OA") # Sic1 - antagonist
GOSim::getGeneSim("YDL056W", "YCR012W", similarity = "OA") # Pgk1 - Gluconeogenesis


# =    4  GO Term Enrichment in Gene Sets  =====================================


# Calculating GO term enrichment in gene sets is done with the Bioconductor
# topGO package.
if (!requireNamespace("topGO", quietly = TRUE)) {
  BiocManager::install("topGO")
}
# Package information:
#  library(help = topGO)       # basic information
#  browseVignettes("topGO")    # available vignettes
#  data(package = "topGO")     # available datasets

# Once again - assumptions are made by GOSim that require us to load the
# topGO package wholesale:
library(topGO)

# Let's define a gene set: GOterm enrichment for G1/S switch activators:
mySet <- c("YFR028C", # Cdc14
           "YDL056W", # Mbp1
           "YLR182W", # Swi6
           "YER111C", # Swi4
           "YOR083W", # Whi5
           "YBR160W", # Cdc28
           "YMR199W", # Cln1
           "YPL256C", # Cln2
           "YAL040C") # Cln3

# The background ("universe") against which enrichment is defined: all keys
# of the yeast annotation database that start with "Y".
allGenes <- AnnotationDbi::keys(org.Sc.sgd.db)
allGenes <- allGenes[grep("^Y", allGenes)]

# Test mySet for enriched GO terms relative to allGenes. The call is
# namespace-qualified like the other package calls in this file, so it does
# not depend on what else is attached to the search path.
myEnr <- GOSim::GOenrichment(mySet, allGenes)

sort(myEnr$p.values)  # Any significantly enriched terms? All of these are ...

# Most significantly enriched is GO:0071931. What is this?
# NOTE(review): this GO ID is hard-coded from an earlier run; the top term may
# differ with other versions of the annotation packages - compare against the
# sorted p-values above.
annotate::getGOTerm("GO:0071931")  # ... makes sense.


# [END]
2020 updates - deactivate for maintenance 2020-09-18 11:56:30 +00:00			`# tocID <- "BIN-FUNC_Semantic_similarity.R"`
			`#`
			`# ---------------------------------------------------------------------------- #`
			`# PATIENCE ... #`
			`# Do not yet work wih this code. Updates in progress. Thank you. #`
			`# boris.steipe@utoronto.ca #`
			`# ---------------------------------------------------------------------------- #`
New unit 2017-11-13 05:51:04 +00:00			`#`
			`# Purpose: A Bioinformatics Course:`
			`# R code accompanying the BIN-FUNC_Semantic_similarity unit.`
			`#`
Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`# Version: 1.1`
New unit 2017-11-13 05:51:04 +00:00			`#`
Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`# Date: 2017 11 - 2019 01`
New unit 2017-11-13 05:51:04 +00:00			`# Author: Boris Steipe (boris.steipe@utoronto.ca)`
			`#`
			`# Versions:`
Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`# 1.1 Change from require() to requireNamespace(),`
			`# use <package>::<function>() idiom throughout,`
			`# use Biocmanager:: not biocLite()`
New unit 2017-11-13 05:51:04 +00:00			`# 1.0 New code.`
			`#`
			`#`
			`# TODO:`
			`#`
			`#`
			`# == DO NOT SIMPLY source() THIS FILE! =======================================`
			`#`
			`# If there are portions you don't understand, use R's help system, Google for an`
			`# answer, or ask your instructor. Don't continue if you don't understand what's`
			`# going on. That's not how it works ...`
			`#`
			`# ==============================================================================`


			`#TOC> ==========================================================================`
2020 updates - deactivate for maintenance 2020-09-18 11:56:30 +00:00			`#TOC>`
Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`#TOC> Section Title Line`
			`#TOC> --------------------------------------------------------------------`
			`#TOC> 1 Preparations: Packages, AnnotationDB, Setup 42`
			`#TOC> 2 Fetch GO Annotations 98`
			`#TOC> 3 Semantic Similarities 107`
			`#TOC> 4 GO Term Enrichment in Gene Sets 125`
2020 updates - deactivate for maintenance 2020-09-18 11:56:30 +00:00			`#TOC>`
New unit 2017-11-13 05:51:04 +00:00			`#TOC> ==========================================================================`


			`# = 1 Preparations: Packages, AnnotationDB, Setup =========================`

Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`if (! requireNamespace("BiocManager", quietly = TRUE)) {`
			`install.packages("BiocManager")`
			`}`
New unit 2017-11-13 05:51:04 +00:00
			`# GOSim is an R-package in the Bioconductor project.`
Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`if (! requireNamespace("GOSim", quietly = TRUE)) {`
			`BiocManager::install("GOSim")`
New unit 2017-11-13 05:51:04 +00:00			`}`
			`# Package information:`
			`# library(help = GOSim) # basic information`
			`# browseVignettes("GOSim") # available vignettes`
			`# data(package = "GOSim") # available datasets`

Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`# GOSim makes extensive assumptions about loaded packages, and many base`
			`# methods are masked. We will thus use library(GOSim) to load it`
			`# in its entirety and with all packages it depends on. We will still use`
			`# the <package>::<function>() syntax in the code below, but this now serves`
			`# more of a didactic purpose, rather than actual syntax requirements.`

			`library(GOSim)`
New unit 2017-11-13 05:51:04 +00:00
			`# GOSim loads human annotations by default. We load yeast annotations instead...`
Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`if (! requireNamespace("org.Sc.sgd.db", quietly = TRUE)) {`
			`BiocManager::install("org.Sc.sgd.db")`
New unit 2017-11-13 05:51:04 +00:00			`}`

Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`# Bioconductor annotation packages won't work stably unless we actually load`
			`# them:`
			`library(org.Sc.sgd.db)`

New unit 2017-11-13 05:51:04 +00:00			`# org.Sc.sgd.db is a Bioconductor annotation database curated by SGD. Such`
			`# databases exist for all model organisms. It's a kind of a fancy data frame`
			`# from which we can get annotations by rows (genes) with the keys() funtion ...`
Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`AnnotationDbi::keys(org.Sc.sgd.db)[1500:1510]`
New unit 2017-11-13 05:51:04 +00:00
			`# ... and the types of available annotations with the columns() function`
Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`AnnotationDbi::columns(org.Sc.sgd.db)`
New unit 2017-11-13 05:51:04 +00:00
			`# Note that one of the columns is "GO" ... and we load that into the`
			`# datastructures used by GOSim:`

			`# Choose GOterms to use`
Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`GOSim::setEvidenceLevel(evidences = "all",`
			`organism = org.Sc.sgdORGANISM,`
			`gomap = org.Sc.sgdGO)`
New unit 2017-11-13 05:51:04 +00:00
			`# Use Biological Process ontology`
Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`GOSim::setOntology("BP", loadIC = FALSE)`
New unit 2017-11-13 05:51:04 +00:00
			`# confirm that we loaded the correct ontology`
Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`head(get("gomap", envir = GOSimEnv))`
New unit 2017-11-13 05:51:04 +00:00


			`# = 2 Fetch GO Annotations ================================================`


			`# All keys being used here are yeast systematic names.`

			`# Get one set of annotations`
Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`GOSim::getGOInfo(c("YDL056W")) # Mbp1`
New unit 2017-11-13 05:51:04 +00:00

			`# = 3 Semantic Similarities ===============================================`


			`# Get semantic similarities between genes`
			`?getGeneSim`

			`# There are _many_ different metrics of term similarity implemented`
			`# in this package.`

Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`# Mbp1 and...`
			`GOSim::getGeneSim("YDL056W","YLR182W",similarity = "OA") # Swi6 - MCB complex`
			`GOSim::getGeneSim("YDL056W","YER111C",similarity = "OA") # Swi4 - collaborators`
			`GOSim::getGeneSim("YDL056W","YBR160W",similarity = "OA") # Cdc28 - mediator`
			`GOSim::getGeneSim("YDL056W","YGR108W",similarity = "OA") # Clb1 - antagonist`
			`GOSim::getGeneSim("YDL056W","YLR079W",similarity = "OA") # Sic1 - antagonist`
			`GOSim::getGeneSim("YDL056W","YJL130C",similarity = "OA") # Pgk1 - Gluconeogenesis`
New unit 2017-11-13 05:51:04 +00:00

			`# = 4 GO Term Enrichment in Gene Sets =====================================`


Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`# Calculating GO term enrichment in gene sets is done with the Bioconductor`
			`# topGO package.`
			`if (! requireNamespace("topGO", quietly = TRUE)) {`
			`BiocManager::install("topGO")`
New unit 2017-11-13 05:51:04 +00:00			`}`
			`# Package information:`
			`# library(help = topGO) # basic information`
			`# browseVignettes("topGO") # available vignettes`
			`# data(package = "topGO") # available datasets`

Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`# Once again - assumptions are made by GOsim that require us to load the`
			`# topGO package wholesale:`
			`library(topGO)`
New unit 2017-11-13 05:51:04 +00:00
			`# Let's define a gene set: GOterm enrichment for G1/S switch activators:`
			`mySet <- c("YFR028C", # Cdc14`
			`"YDL056W", # Mbp1`
			`"YLR182W", # Swi6`
			`"YER111C", # Swi4`
			`"YOR083W", # Whi5`
			`"YBR160W", # Cdc28`
			`"YMR199W", # Cln1`
			`"YPL256C", # Cln2`
			`"YAL040C") # Cln3`

Use requireNamespace(), <package>::<function>() idiom, Biocmanager:: - not biocLite() 2019-01-08 07:11:25 +00:00			`allGenes <- AnnotationDbi::keys(org.Sc.sgd.db)`
New unit 2017-11-13 05:51:04 +00:00			`allGenes <- allGenes[grep("^Y", allGenes)] # This is the context against which`
			`# we define enrichment`

			`myEnr <- GOenrichment(mySet, allGenes)`

			`sort(myEnr$p.values) # Any significantly enriched terms? All of these are ...`

2020 updates - deactivate for maintenance 2020-09-18 11:56:30 +00:00			`#Most significantly enriched is GO:0071931. What is this?`
			`annotate::getGOTerm("GO:0071931") # ... makes sense.`
New unit 2017-11-13 05:51:04 +00:00



			`# [END]`