# BIN-FUNC_Semantic_similarity.R # # Purpose: A Bioinformatics Course: # R code accompanying the BIN-FUNC_Semantic_similarity unit. # # Version: 1.0 # # Date: 2017 11 12 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: # 1.0 New code. # # # TODO: # # # == DO NOT SIMPLY source() THIS FILE! ======================================= # # If there are portions you don't understand, use R's help system, Google for an # answer, or ask your instructor. Don't continue if you don't understand what's # going on. That's not how it works ... # # ============================================================================== #TOC> ========================================================================== #TOC> #TOC> Section Title Line #TOC> -------------------------------------------------------------- #TOC> 1 Preparations: Packages, AnnotationDB, Setup 39 #TOC> 2 Fetch GO Annotations 89 #TOC> 3 Semantic Similarities 98 #TOC> 4 GO Term Enrichment in Gene Sets 116 #TOC> #TOC> ========================================================================== # = 1 Preparations: Packages, AnnotationDB, Setup ========================= # GOSim is an R-package in the Bioconductor project. if (! require(GOSim, quietly=TRUE)) { if (! exists("biocLite")) { source("https://bioconductor.org/biocLite.R") } biocLite("GOSim") library(GOSim) } # Package information: # library(help = GOSim) # basic information # browseVignettes("GOSim") # available vignettes # data(package = "GOSim") # available datasets # GOSim loads human annotations by default. We load yeast annotations instead... if (!require(org.Sc.sgd.db, quietly=TRUE)) { if (! exists("biocLite")) { source("https://bioconductor.org/biocLite.R") } biocLite("org.Sc.sgd.db") library(org.Sc.sgd.db) } # org.Sc.sgd.db is a Bioconductor annotation database curated by SGD. Such # databases exist for all model organisms. It's a kind of a fancy data frame # from which we can get annotations by rows (genes) with the keys() funtion ... keys(org.Sc.sgd.db)[1500:1510] # ... and the types of available annotations with the columns() function columns(org.Sc.sgd.db) # Note that one of the columns is "GO" ... and we load that into the # datastructures used by GOSim: # Choose GOterms to use setEvidenceLevel(evidences="all", organism=org.Sc.sgdORGANISM, gomap=org.Sc.sgdGO) # Use Biological Process ontology setOntology("BP", loadIC=FALSE) # confirm that we loaded the correct ontology head(get("gomap", envir=GOSimEnv)) # = 2 Fetch GO Annotations ================================================ # All keys being used here are yeast systematic names. # Get one set of annotations getGOInfo(c("YDL056W")) # Mbp1 # = 3 Semantic Similarities =============================================== # Get semantic similarities between genes ?getGeneSim # There are _many_ different metrics of term similarity implemented # in this package. # Mbp1 and... getGeneSim("YDL056W", "YLR182W", similarity = "OA") # Swi6 - MCB complex getGeneSim("YDL056W", "YER111C", similarity = "OA") # Swi4 - collaborators getGeneSim("YDL056W", "YBR160W", similarity = "OA") # Cdc28 - mediator getGeneSim("YDL056W", "YGR108W", similarity = "OA") # Clb1 - antagonist getGeneSim("YDL056W", "YLR079W", similarity = "OA") # Sic1 - antagonist getGeneSim("YDL056W", "YJL130C", similarity = "OA") # Pgk1 - Gluconeogenesis # = 4 GO Term Enrichment in Gene Sets ===================================== # Calculating GO term enrichment in gene sets is done with the topGO package. if (! require(topGO, quietly=TRUE)) { if (! exists("biocLite")) { source("https://bioconductor.org/biocLite.R") } biocLite("topGO") library(topGO) } # Package information: # library(help = topGO) # basic information # browseVignettes("topGO") # available vignettes # data(package = "topGO") # available datasets # Let's define a gene set: GOterm enrichment for G1/S switch activators: mySet <- c("YFR028C", # Cdc14 "YDL056W", # Mbp1 "YLR182W", # Swi6 "YER111C", # Swi4 "YOR083W", # Whi5 "YBR160W", # Cdc28 "YMR199W", # Cln1 "YPL256C", # Cln2 "YAL040C") # Cln3 allGenes <- keys(org.Sc.sgd.db) allGenes <- allGenes[grep("^Y", allGenes)] # This is the context against which # we define enrichment myEnr <- GOenrichment(mySet, allGenes) sort(myEnr$p.values) # Any significantly enriched terms? All of these are ... #Yes: most significantly enriched is GO:0071931. What is this? getGOTerm("GO:0071931") # ... makes sense. (fullSet <- myEnr$genes$`GO:0071931`) # What genes are annotated to this term? intersect(mySet, fullSet) # These are in both sets setdiff(mySet, fullSet) # These mySet members are not annotated to that term setdiff(fullSet, mySet) # These are annotated to that term but not in mySet. # ... that's the most interesting set. From a set of # genes we have identified a function that they # share, and that shared function has allowed us # to identify # What are these genes? # Select annotations from the annotation database: select(org.Sc.sgd.db, keys = setdiff(fullSet, mySet), columns = c("COMMON", "DESCRIPTION")) # Note that these annotations are partially redundant to several different # aliases of the same three genes. # [END]