Update the ScCC network data

2020-09-23 14:06:58 +10:00
parent 02e20ba812
commit cf90c52a61
2 changed files with 72 additions and 33 deletions
--- a/data/scCCnet.rds
+++ b/data/scCCnet.rds
--- a/scripts/ABC-makeScCCnet.R
+++ b/scripts/ABC-makeScCCnet.R
@@ -6,40 +6,70 @@
 # Boris Steipe for ABC learning units
 #
 # Notes:
+#
+#      The large source- datafiles are NOT posted to github. If you want to
+#      experiment with your own code, download them and place them into your
+#      local  ./data  directory.
+#
 #      STRING data source:
-#        Download page: https://string-db.org/cgi/download.pl?species_text=Saccharomyces+cerevisiae
-#        Data: (20.8 mb) https://string-db.org/download/protein.links.full.v10.5/4932.protein.links.full.v10.5.txt.gz
+#        Download page:
+# https://string-db.org/cgi/download.pl?species_text=Saccharomyces+cerevisiae
+#        Data: (20.1 mb)
+# https://stringdb-static.org/download/protein.links.full.v11.0/4932.protein.links.full.v11.0.txt.gz
 #
-#      GOSlim data source:
-#        Info page: http://www.geneontology.org/page/go-slim-and-subset-guide
-#        Data: (3 mb) https://downloads.yeastgenome.org/curation/literature/go_slim_mapping.tab
+#      GOSlim data source: (Note: this has moved from GO to SGD)
+#        Info page: https://www.yeastgenome.org/downloads
+#        Info page: http://sgd-archive.yeastgenome.org/curation/literature/
+#        Data: (3 mb)
+# http://sgd-archive.yeastgenome.org/curation/literature/go_slim_mapping.tab
 #
 #
-# Version:  1.1
+# Version:  1.2
 #
-# Date:     2017  10  -  2019  01
+# Date:     2017-10  -  2020-09
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
+#           1.2    2020 Update. GO Slim Yeast mow at SGD
 #           1.1    Change from require() to requireNamespace(),
 #                      use <package>::<function>() idiom throughout
 #           1.0    First code copied from 2016 material.
 #
 # TODO:
 #
-#
 # ==============================================================================
+# SRCDIR <- "./instructor"

+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                                           Line
+#TOC> ---------------------------------------------------------------
+#TOC>   1        INITIALIZE                                        58
+#TOC>   2        STRING FUNCTIONAL INTERACTION DATA                66
+#TOC>   3        GOSlim FUNCTIONAL ANNOTATIONS                     96
+#TOC>   3.1        Intersect interactions and annotations         122
+#TOC>   4        DEFINE THE CELL-CYCLE NETWORK                    128
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  INITIALIZE  ==========================================================
+
+SRCDIR <- "./data"
 if (! requireNamespace("readr", quietly = TRUE)) {
  install.packages("readr")
 }


-# STRING functional interaction data
+# =    2  STRING FUNCTIONAL INTERACTION DATA  ==================================

 # Read STRING Data (needs to be downloaded from database, see URL in Notes)
-STR <- readr::read_delim("./data/4932.protein.links.full.v10.5.txt",
-                         delim = " ")
+# The .gz compressed version is 20MB, the uncompressed versioj is 110MB -
+# really not necessary to uncompress since readr:: can read from compressed
+# files, and does so automatically, based on the file extension.
+( fn <- file.path(SRCDIR, "4932.protein.links.full.v11.0.txt.gz") )
+STR <- readr::read_delim(fn, delim = " ")

 # Subset only IDs and combined_score column
 STR <- STR[ , c("protein1", "protein2", "combined_score")]
@@ -48,22 +78,26 @@ STR <- STR[ , c("protein1", "protein2", "combined_score")]
 # sum(STR$combined_score > 909)  # 100270 edges
 # subset for 100,000 highest confidence edges
 STR <- STR[(STR$combined_score > 909), ]
+head(STR)

 # IDs are formatted like 4932.YAL005C ... drop the "4932." prefix
 STR$protein1 <- gsub("^4932\\.", "", STR$protein1)
 STR$protein2 <- gsub("^4932\\.", "", STR$protein2)
-# head(STR)
+head(STR)

 # get a vector of gene names in this list
 myIntxGenes <- unique(c(STR$protein1, STR$protein2))  # yeast systematic gene
                                                      # names
+length(myIntxGenes)
+sample(myIntxGenes, 10)  # choose 10 at random (sanity check)


-# GOSlim functional annotations
+# =    3  GOSlim FUNCTIONAL ANNOTATIONS  =======================================
 #
 # Read GOSlim data  (needs to be downloaded from database, see URL in Notes)
+( fn <- file.path(SRCDIR, "go_slim_mapping.tab") )

-Gsl <- readr::read_tsv("./data/go_slim_mapping.tab",
+Gsl <- readr::read_tsv(fn,
                       col_names = c("ID",
                                     "name",
                                     "SGDId",
@@ -72,52 +106,57 @@ Gsl <- readr::read_tsv("./data/go_slim_mapping.tab",
                                     "termID",
                                     "status"))

-# head(Gsl)
-#
+head(Gsl)
+
 # What cell cycle names does it contain?
-# myGslTermNames <- unique(Gsl$termName)  # 169 unique terms
-# myGslTermNames[grep("cycle", myGslTermNames)]
+myGslTermNames <- unique(Gsl$termName)  # 169 unique terms
+myGslTermNames[grep("cycle", myGslTermNames)]
 # [1] "regulation of cell cycle"  "mitotic cell cycle"  "meiotic cell cycle"
-#
+
 # Choose "mitotic cell cycle" as the GOslim term to subset with
-#

 scCCgenes <- unique(Gsl$ID[Gsl$termName == "mitotic cell cycle"])
-# length(scCCgenes)  # 324 genes annotated to that term
+length(scCCgenes)  # 324 genes annotated to that term

-# sum(scCCgenes %in% myIntxGenes)  # 294 of these have high-confidence
-#                                     # interactions
+# ==   3.1  Intersect interactions and annotations  ============================

+sum(scCCgenes %in% myIntxGenes)  # 307 of these have high-confidence
+#                                # functional interactions
+
+
+# =    4  DEFINE THE CELL-CYCLE NETWORK  =======================================
+#
 # Define scCCnet ... the S. Cervisiae Cell Cycle network
 # Subset all rows for which BOTH genes are in the GOslim cell cycle set
+#
 scCCnet <- STR[(STR$protein1 %in% scCCgenes) &
               (STR$protein2 %in% scCCgenes), ]

 # How many genes are there?
-# length(unique(c(scCCnet$protein1, scCCnet$protein2)))  #261
+length(unique(c(scCCnet$protein1, scCCnet$protein2)))  #283

 # Each edge is listed twice - now remove duplicates.
-#
-# Step 1: make a vector: sort two names so the frist one is alphabetically
-#         smaller han the second one. This brings the two names into a defined
+
+# Step 1: make a vector: sort two names so the fiRst one is alphabetically
+#         smaller Than the second one. This brings the two names into a defined
 #         order. Then concatenate them with a "." - the resulting string
 #         is always the same, for any order. E.g. c("A", "B") gives "A.B"
 #         and c("B", "A") also gives "A.B". This identifies duplicates.

-x <- apply(cbind(scCCnet$protein1,
-                 scCCnet$protein2),
+x <- apply(cbind(scCCnet$protein1, scCCnet$protein2),
           1,
           FUN = function(x) { return(paste(sort(x), collapse = ".")) })
-# head(x) # "YAL016W.YGR040W" "YAL016W.YOR014W" "YAL016W.YDL188C" ... etc.
+head(x) # "YAL016W.YGR040W" "YAL016W.YOR014W" "YAL016W.YDL188C" ... etc.

-# sum(duplicated(x))  # 1280
+sum(duplicated(x))  # 1453

 # Step 2: drop all rows that contain duplicates in x
 scCCnet <- scCCnet[! duplicated(x), ]

 # Confirm we didn't loose genes
-# length(unique(c(mySubnet$protein1, mySubnet$protein2)))  # 261, no change
-# Network has 261 nodes, 1280 edges
+length(unique(c(scCCnet$protein1, scCCnet$protein2)))  # 283, no change
+nrow(scCCnet)
+# Network has 283 nodes, 1453 edges

 saveRDS(scCCnet, file = "./data/scCCnet.rds")