From 8cb69f29860a78b919dada6393ec2926f91f16f8 Mon Sep 17 00:00:00 2001 From: hyginn Date: Mon, 13 Nov 2017 00:51:04 -0500 Subject: [PATCH] New unit --- BIN-FUNC-Semantic_similarity.R | 176 +++++++++++++++++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100644 BIN-FUNC-Semantic_similarity.R diff --git a/BIN-FUNC-Semantic_similarity.R b/BIN-FUNC-Semantic_similarity.R new file mode 100644 index 0000000..c922237 --- /dev/null +++ b/BIN-FUNC-Semantic_similarity.R @@ -0,0 +1,176 @@ +# BIN-FUNC_Semantic_similarity.R +# +# Purpose: A Bioinformatics Course: +# R code accompanying the BIN-FUNC_Semantic_similarity unit. +# +# Version: 1.0 +# +# Date: 2017 11 12 +# Author: Boris Steipe (boris.steipe@utoronto.ca) +# +# Versions: +# 1.0 New code. +# +# +# TODO: +# +# +# == DO NOT SIMPLY source() THIS FILE! ======================================= +# +# If there are portions you don't understand, use R's help system, Google for an +# answer, or ask your instructor. Don't continue if you don't understand what's +# going on. That's not how it works ... +# +# ============================================================================== + + +#TOC> ========================================================================== +#TOC> +#TOC> Section Title Line +#TOC> -------------------------------------------------------------- +#TOC> 1 Preparations: Packages, AnnotationDB, Setup 39 +#TOC> 2 Fetch GO Annotations 89 +#TOC> 3 Semantic Similarities 98 +#TOC> 4 GO Term Enrichment in Gene Sets 116 +#TOC> +#TOC> ========================================================================== + + +# = 1 Preparations: Packages, AnnotationDB, Setup ========================= + + +# GOSim is an R-package in the Bioconductor project. +if (! require(GOSim, quietly=TRUE)) { + if (! exists("biocLite")) { + source("https://bioconductor.org/biocLite.R") + } + biocLite("GOSim") + library(GOSim) +} +# Package information: +# library(help = GOSim) # basic information +# browseVignettes("GOSim") # available vignettes +# data(package = "GOSim") # available datasets + + +# GOSim loads human annotations by default. We load yeast annotations instead... +if (!require(org.Sc.sgd.db, quietly=TRUE)) { + if (! exists("biocLite")) { + source("https://bioconductor.org/biocLite.R") + } + biocLite("org.Sc.sgd.db") + library(org.Sc.sgd.db) +} + +# org.Sc.sgd.db is a Bioconductor annotation database curated by SGD. Such +# databases exist for all model organisms. It's a kind of a fancy data frame +# from which we can get annotations by rows (genes) with the keys() funtion ... +keys(org.Sc.sgd.db)[1500:1510] + +# ... and the types of available annotations with the columns() function +columns(org.Sc.sgd.db) + +# Note that one of the columns is "GO" ... and we load that into the +# datastructures used by GOSim: + +# Choose GOterms to use +setEvidenceLevel(evidences="all", + organism=org.Sc.sgdORGANISM, + gomap=org.Sc.sgdGO) + +# Use Biological Process ontology +setOntology("BP", loadIC=FALSE) + +# confirm that we loaded the correct ontology +head(get("gomap", envir=GOSimEnv)) + + + +# = 2 Fetch GO Annotations ================================================ + + +# All keys being used here are yeast systematic names. + +# Get one set of annotations +getGOInfo(c("YDL056W")) # Mbp1 + + +# = 3 Semantic Similarities =============================================== + + +# Get semantic similarities between genes +?getGeneSim + +# There are _many_ different metrics of term similarity implemented +# in this package. + + # Mbp1 and... +getGeneSim("YDL056W", "YLR182W", similarity = "OA") # Swi6 - MCB complex +getGeneSim("YDL056W", "YER111C", similarity = "OA") # Swi4 - collaborators +getGeneSim("YDL056W", "YBR160W", similarity = "OA") # Cdc28 - mediator +getGeneSim("YDL056W", "YGR108W", similarity = "OA") # Clb1 - antagonist +getGeneSim("YDL056W", "YLR079W", similarity = "OA") # Sic1 - antagonist +getGeneSim("YDL056W", "YJL130C", similarity = "OA") # Pgk1 - Gluconeogenesis + + +# = 4 GO Term Enrichment in Gene Sets ===================================== + + +# Calculating GO term enrichment in gene sets is done with the topGO package. +if (! require(topGO, quietly=TRUE)) { + if (! exists("biocLite")) { + source("https://bioconductor.org/biocLite.R") + } + biocLite("topGO") + library(topGO) +} +# Package information: +# library(help = topGO) # basic information +# browseVignettes("topGO") # available vignettes +# data(package = "topGO") # available datasets + + +# Let's define a gene set: GOterm enrichment for G1/S switch activators: +mySet <- c("YFR028C", # Cdc14 + "YDL056W", # Mbp1 + "YLR182W", # Swi6 + "YER111C", # Swi4 + "YOR083W", # Whi5 + "YBR160W", # Cdc28 + "YMR199W", # Cln1 + "YPL256C", # Cln2 + "YAL040C") # Cln3 + +allGenes <- keys(org.Sc.sgd.db) +allGenes <- allGenes[grep("^Y", allGenes)] # This is the context against which + # we define enrichment + +myEnr <- GOenrichment(mySet, allGenes) + +sort(myEnr$p.values) # Any significantly enriched terms? All of these are ... + +#Yes: most significantly enriched is GO:0071931. What is this? +getGOTerm("GO:0071931") # ... makes sense. + +(fullSet <- myEnr$genes$`GO:0071931`) # What genes are annotated to this term? + +intersect(mySet, fullSet) # These are in both sets +setdiff(mySet, fullSet) # These mySet members are not annotated to that term +setdiff(fullSet, mySet) # These are annotated to that term but not in mySet. + # ... that's the most interesting set. From a set of + # genes we have identified a function that they + # share, and that shared function has allowed us + # to identify + +# What are these genes? +# Select annotations from the annotation database: +select(org.Sc.sgd.db, + keys = setdiff(fullSet, mySet), + columns = c("COMMON", "DESCRIPTION")) + +# Note that these annotations are partially redundant to several different +# aliases of the same three genes. + + + +# [END]