From 8cb69f29860a78b919dada6393ec2926f91f16f8 Mon Sep 17 00:00:00 2001
From: hyginn <boris.steipe@utoronto.ca>
Date: Mon, 13 Nov 2017 00:51:04 -0500
Subject: [PATCH] New unit

---
 BIN-FUNC-Semantic_similarity.R | 176 +++++++++++++++++++++++++++++++++
 1 file changed, 176 insertions(+)
 create mode 100644 BIN-FUNC-Semantic_similarity.R

diff --git a/BIN-FUNC-Semantic_similarity.R b/BIN-FUNC-Semantic_similarity.R
new file mode 100644
index 0000000..c922237
--- /dev/null
+++ b/BIN-FUNC-Semantic_similarity.R
@@ -0,0 +1,176 @@
+# BIN-FUNC_Semantic_similarity.R
+#
+# Purpose:  A Bioinformatics Course:
+#              R code accompanying the BIN-FUNC_Semantic_similarity unit.
+#
+# Version:  1.0
+#
+# Date:     2017  11  12
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.0    New code.
+#
+#
+# TODO:
+#
+#
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+# going on. That's not how it works ...
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC>
+#TOC>   Section  Title                                          Line
+#TOC> --------------------------------------------------------------
+#TOC>   1        Preparations: Packages, AnnotationDB, Setup      39
+#TOC>   2        Fetch GO Annotations                             89
+#TOC>   3        Semantic Similarities                            98
+#TOC>   4        GO Term Enrichment in Gene Sets                 116
+#TOC>
+#TOC> ==========================================================================
+
+
+# =    1  Preparations: Packages, AnnotationDB, Setup  =========================
+
+
+# GOSim is an R-package in the Bioconductor project.
+if (! require(GOSim, quietly=TRUE)) {
+  if (! exists("biocLite")) {
+    source("https://bioconductor.org/biocLite.R")
+  }
+  biocLite("GOSim")
+  library(GOSim)
+}
+# Package information:
+#  library(help = GOSim)       # basic information
+#  browseVignettes("GOSim")    # available vignettes
+#  data(package = "GOSim")     # available datasets
+
+
+# GOSim loads human annotations by default. We load yeast annotations instead...
+if (!require(org.Sc.sgd.db, quietly=TRUE)) {
+  if (! exists("biocLite")) {
+    source("https://bioconductor.org/biocLite.R")
+  }
+  biocLite("org.Sc.sgd.db")
+  library(org.Sc.sgd.db)
+}
+
+# org.Sc.sgd.db is a Bioconductor annotation database curated by SGD. Such
+# databases exist for all model organisms. It's a kind of a fancy data frame
+# from which we can get annotations by rows (genes) with the keys() funtion ...
+keys(org.Sc.sgd.db)[1500:1510]
+
+# ... and the types of available annotations with the columns() function
+columns(org.Sc.sgd.db)
+
+# Note that one of the columns is "GO" ... and we load that into the
+# datastructures used by GOSim:
+
+# Choose GOterms to use
+setEvidenceLevel(evidences="all",
+                 organism=org.Sc.sgdORGANISM,
+                 gomap=org.Sc.sgdGO)
+
+# Use Biological Process ontology
+setOntology("BP", loadIC=FALSE)
+
+# confirm that we loaded the correct ontology
+head(get("gomap", envir=GOSimEnv))
+
+
+
+# =    2  Fetch GO Annotations  ================================================
+
+
+# All keys being used here are yeast systematic names.
+
+# Get one set of annotations
+getGOInfo(c("YDL056W"))  # Mbp1
+
+
+# =    3  Semantic Similarities  ===============================================
+
+
+# Get semantic similarities between genes
+?getGeneSim
+
+# There are _many_ different metrics of term similarity implemented
+# in this package.
+
+                                                     # Mbp1 and...
+getGeneSim("YDL056W", "YLR182W", similarity = "OA")  # Swi6 - MCB complex
+getGeneSim("YDL056W", "YER111C", similarity = "OA")  # Swi4 - collaborators
+getGeneSim("YDL056W", "YBR160W", similarity = "OA")  # Cdc28 - mediator
+getGeneSim("YDL056W", "YGR108W", similarity = "OA")  # Clb1 - antagonist
+getGeneSim("YDL056W", "YLR079W", similarity = "OA")  # Sic1 - antagonist
+getGeneSim("YDL056W", "YJL130C", similarity = "OA")  # Pgk1 - Gluconeogenesis
+
+
+# =    4  GO Term Enrichment in Gene Sets  =====================================
+
+
+# Calculating GO term enrichment in gene sets is done with the topGO package.
+if (! require(topGO, quietly=TRUE)) {
+  if (! exists("biocLite")) {
+    source("https://bioconductor.org/biocLite.R")
+  }
+  biocLite("topGO")
+  library(topGO)
+}
+# Package information:
+#  library(help = topGO)       # basic information
+#  browseVignettes("topGO")    # available vignettes
+#  data(package = "topGO")     # available datasets
+
+
+# Let's define a gene set: GOterm enrichment for G1/S switch activators:
+mySet <- c("YFR028C", # Cdc14
+           "YDL056W", # Mbp1
+           "YLR182W", # Swi6
+           "YER111C", # Swi4
+           "YOR083W", # Whi5
+           "YBR160W", # Cdc28
+           "YMR199W", # Cln1
+           "YPL256C", # Cln2
+           "YAL040C") # Cln3
+
+allGenes <- keys(org.Sc.sgd.db)
+allGenes <- allGenes[grep("^Y", allGenes)]  # This is the context against which
+                                            # we define enrichment
+
+myEnr <- GOenrichment(mySet, allGenes)
+
+sort(myEnr$p.values)  # Any significantly enriched terms? All of these are ...
+
+#Yes: most significantly enriched is GO:0071931. What is this?
+getGOTerm("GO:0071931")  # ... makes sense.
+
+(fullSet <- myEnr$genes$`GO:0071931`)  # What genes are annotated to this term?
+
+intersect(mySet, fullSet) # These are in both sets
+setdiff(mySet, fullSet)   # These mySet members are not annotated to that term
+setdiff(fullSet, mySet)   # These are annotated to that term but not in mySet.
+                          # ... that's the most interesting set. From a set of
+                          # genes we have identified a function that they
+                          # share, and that shared function has allowed us
+                          # to identify
+
+# What are these genes?
+# Select annotations from the annotation database:
+select(org.Sc.sgd.db,
+       keys = setdiff(fullSet, mySet),
+       columns = c("COMMON", "DESCRIPTION"))
+
+# Note that these annotations are partially redundant to several different
+# aliases of the same three genes.
+
+
+
+# [END]