Update the ScCC network data

This commit is contained in:
hyginn 2020-09-23 14:06:58 +10:00
parent 02e20ba812
commit cf90c52a61
2 changed files with 72 additions and 33 deletions

Binary file not shown.

View File

@ -6,40 +6,70 @@
# Boris Steipe for ABC learning units # Boris Steipe for ABC learning units
# #
# Notes: # Notes:
#
# The large source- datafiles are NOT posted to github. If you want to
# experiment with your own code, download them and place them into your
# local ./data directory.
#
# STRING data source: # STRING data source:
# Download page: https://string-db.org/cgi/download.pl?species_text=Saccharomyces+cerevisiae # Download page:
# Data: (20.8 mb) https://string-db.org/download/protein.links.full.v10.5/4932.protein.links.full.v10.5.txt.gz # https://string-db.org/cgi/download.pl?species_text=Saccharomyces+cerevisiae
# Data: (20.1 mb)
# https://stringdb-static.org/download/protein.links.full.v11.0/4932.protein.links.full.v11.0.txt.gz
# #
# GOSlim data source: # GOSlim data source: (Note: this has moved from GO to SGD)
# Info page: http://www.geneontology.org/page/go-slim-and-subset-guide # Info page: https://www.yeastgenome.org/downloads
# Data: (3 mb) https://downloads.yeastgenome.org/curation/literature/go_slim_mapping.tab # Info page: http://sgd-archive.yeastgenome.org/curation/literature/
# Data: (3 mb)
# http://sgd-archive.yeastgenome.org/curation/literature/go_slim_mapping.tab
# #
# #
# Version: 1.1 # Version: 1.2
# #
# Date: 2017 10 - 2019 01 # Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.2 2020 Update. GO Slim Yeast mow at SGD
# 1.1 Change from require() to requireNamespace(), # 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout # use <package>::<function>() idiom throughout
# 1.0 First code copied from 2016 material. # 1.0 First code copied from 2016 material.
# #
# TODO: # TODO:
# #
#
# ============================================================================== # ==============================================================================
# SRCDIR <- "./instructor"
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------------------------
#TOC> 1 INITIALIZE 58
#TOC> 2 STRING FUNCTIONAL INTERACTION DATA 66
#TOC> 3 GOSlim FUNCTIONAL ANNOTATIONS 96
#TOC> 3.1 Intersect interactions and annotations 122
#TOC> 4 DEFINE THE CELL-CYCLE NETWORK 128
#TOC>
#TOC> ==========================================================================
# = 1 INITIALIZE ==========================================================
SRCDIR <- "./data"
if (! requireNamespace("readr", quietly = TRUE)) { if (! requireNamespace("readr", quietly = TRUE)) {
install.packages("readr") install.packages("readr")
} }
# STRING functional interaction data # = 2 STRING FUNCTIONAL INTERACTION DATA ==================================
# Read STRING Data (needs to be downloaded from database, see URL in Notes) # Read STRING Data (needs to be downloaded from database, see URL in Notes)
STR <- readr::read_delim("./data/4932.protein.links.full.v10.5.txt", # The .gz compressed version is 20MB, the uncompressed versioj is 110MB -
delim = " ") # really not necessary to uncompress since readr:: can read from compressed
# files, and does so automatically, based on the file extension.
( fn <- file.path(SRCDIR, "4932.protein.links.full.v11.0.txt.gz") )
STR <- readr::read_delim(fn, delim = " ")
# Subset only IDs and combined_score column # Subset only IDs and combined_score column
STR <- STR[ , c("protein1", "protein2", "combined_score")] STR <- STR[ , c("protein1", "protein2", "combined_score")]
@ -48,22 +78,26 @@ STR <- STR[ , c("protein1", "protein2", "combined_score")]
# sum(STR$combined_score > 909) # 100270 edges # sum(STR$combined_score > 909) # 100270 edges
# subset for 100,000 highest confidence edges # subset for 100,000 highest confidence edges
STR <- STR[(STR$combined_score > 909), ] STR <- STR[(STR$combined_score > 909), ]
head(STR)
# IDs are formatted like 4932.YAL005C ... drop the "4932." prefix # IDs are formatted like 4932.YAL005C ... drop the "4932." prefix
STR$protein1 <- gsub("^4932\\.", "", STR$protein1) STR$protein1 <- gsub("^4932\\.", "", STR$protein1)
STR$protein2 <- gsub("^4932\\.", "", STR$protein2) STR$protein2 <- gsub("^4932\\.", "", STR$protein2)
# head(STR) head(STR)
# get a vector of gene names in this list # get a vector of gene names in this list
myIntxGenes <- unique(c(STR$protein1, STR$protein2)) # yeast systematic gene myIntxGenes <- unique(c(STR$protein1, STR$protein2)) # yeast systematic gene
# names # names
length(myIntxGenes)
sample(myIntxGenes, 10) # choose 10 at random (sanity check)
# GOSlim functional annotations # = 3 GOSlim FUNCTIONAL ANNOTATIONS =======================================
# #
# Read GOSlim data (needs to be downloaded from database, see URL in Notes) # Read GOSlim data (needs to be downloaded from database, see URL in Notes)
( fn <- file.path(SRCDIR, "go_slim_mapping.tab") )
Gsl <- readr::read_tsv("./data/go_slim_mapping.tab", Gsl <- readr::read_tsv(fn,
col_names = c("ID", col_names = c("ID",
"name", "name",
"SGDId", "SGDId",
@ -72,52 +106,57 @@ Gsl <- readr::read_tsv("./data/go_slim_mapping.tab",
"termID", "termID",
"status")) "status"))
# head(Gsl) head(Gsl)
#
# What cell cycle names does it contain? # What cell cycle names does it contain?
# myGslTermNames <- unique(Gsl$termName) # 169 unique terms myGslTermNames <- unique(Gsl$termName) # 169 unique terms
# myGslTermNames[grep("cycle", myGslTermNames)] myGslTermNames[grep("cycle", myGslTermNames)]
# [1] "regulation of cell cycle" "mitotic cell cycle" "meiotic cell cycle" # [1] "regulation of cell cycle" "mitotic cell cycle" "meiotic cell cycle"
#
# Choose "mitotic cell cycle" as the GOslim term to subset with # Choose "mitotic cell cycle" as the GOslim term to subset with
#
scCCgenes <- unique(Gsl$ID[Gsl$termName == "mitotic cell cycle"]) scCCgenes <- unique(Gsl$ID[Gsl$termName == "mitotic cell cycle"])
# length(scCCgenes) # 324 genes annotated to that term length(scCCgenes) # 324 genes annotated to that term
# sum(scCCgenes %in% myIntxGenes) # 294 of these have high-confidence # == 3.1 Intersect interactions and annotations ============================
# # interactions
sum(scCCgenes %in% myIntxGenes) # 307 of these have high-confidence
# # functional interactions
# = 4 DEFINE THE CELL-CYCLE NETWORK =======================================
#
# Define scCCnet ... the S. Cervisiae Cell Cycle network # Define scCCnet ... the S. Cervisiae Cell Cycle network
# Subset all rows for which BOTH genes are in the GOslim cell cycle set # Subset all rows for which BOTH genes are in the GOslim cell cycle set
#
scCCnet <- STR[(STR$protein1 %in% scCCgenes) & scCCnet <- STR[(STR$protein1 %in% scCCgenes) &
(STR$protein2 %in% scCCgenes), ] (STR$protein2 %in% scCCgenes), ]
# How many genes are there? # How many genes are there?
# length(unique(c(scCCnet$protein1, scCCnet$protein2))) #261 length(unique(c(scCCnet$protein1, scCCnet$protein2))) #283
# Each edge is listed twice - now remove duplicates. # Each edge is listed twice - now remove duplicates.
#
# Step 1: make a vector: sort two names so the frist one is alphabetically # Step 1: make a vector: sort two names so the fiRst one is alphabetically
# smaller han the second one. This brings the two names into a defined # smaller Than the second one. This brings the two names into a defined
# order. Then concatenate them with a "." - the resulting string # order. Then concatenate them with a "." - the resulting string
# is always the same, for any order. E.g. c("A", "B") gives "A.B" # is always the same, for any order. E.g. c("A", "B") gives "A.B"
# and c("B", "A") also gives "A.B". This identifies duplicates. # and c("B", "A") also gives "A.B". This identifies duplicates.
x <- apply(cbind(scCCnet$protein1, x <- apply(cbind(scCCnet$protein1, scCCnet$protein2),
scCCnet$protein2),
1, 1,
FUN = function(x) { return(paste(sort(x), collapse = ".")) }) FUN = function(x) { return(paste(sort(x), collapse = ".")) })
# head(x) # "YAL016W.YGR040W" "YAL016W.YOR014W" "YAL016W.YDL188C" ... etc. head(x) # "YAL016W.YGR040W" "YAL016W.YOR014W" "YAL016W.YDL188C" ... etc.
# sum(duplicated(x)) # 1280 sum(duplicated(x)) # 1453
# Step 2: drop all rows that contain duplicates in x # Step 2: drop all rows that contain duplicates in x
scCCnet <- scCCnet[! duplicated(x), ] scCCnet <- scCCnet[! duplicated(x), ]
# Confirm we didn't loose genes # Confirm we didn't loose genes
# length(unique(c(mySubnet$protein1, mySubnet$protein2))) # 261, no change length(unique(c(scCCnet$protein1, scCCnet$protein2))) # 283, no change
# Network has 261 nodes, 1280 edges nrow(scCCnet)
# Network has 283 nodes, 1453 edges
saveRDS(scCCnet, file = "./data/scCCnet.rds") saveRDS(scCCnet, file = "./data/scCCnet.rds")