2021-11-16 05:31:48 +00:00
|
|
|
# tocID <- "scripts/ABC-makeSTRINGedges.R"
|
|
|
|
#
|
|
|
|
# Create a subnetwork of high-confidence human STRING edges.
|
|
|
|
#
|
|
|
|
# Notes:
|
|
|
|
#
|
|
|
|
# The large source- datafile is NOT posted to github. If you want to
|
|
|
|
# experiment with the original data, download it and place it into your
|
|
|
|
# local ./data directory.
|
|
|
|
#
|
|
|
|
# STRING data source:
|
|
|
|
# Download page:
|
|
|
|
# https://string-db.org/cgi/download.pl?species_text=Homo+sapiens
|
|
|
|
# Data: (127.6 Mb)
|
|
|
|
# https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz
|
|
|
|
#
|
|
|
|
# Version: 1.0
|
|
|
|
#
|
|
|
|
# Date: 2020-09
|
|
|
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
|
|
|
#
|
|
|
|
# Versions:
|
|
|
|
# 1.0 Rewrite
|
|
|
|
#
|
|
|
|
# TODO:
|
|
|
|
#
|
|
|
|
# ==============================================================================
|
|
|
|
|
|
|
|
|
|
|
|
#TOC> ==========================================================================
|
|
|
|
#TOC>
|
|
|
|
#TOC> Section Title Line
|
|
|
|
#TOC> -------------------------------------------------
|
|
|
|
#TOC> 1 Initialize 44
|
|
|
|
#TOC> 2 Read STRING Data 51
|
|
|
|
#TOC> 3 Define cutoff and subset 63
|
|
|
|
#TOC> 4 Drop duplicates 103
|
|
|
|
#TOC> 5 Simple statistics 127
|
|
|
|
#TOC> 6 Write to file 160
|
|
|
|
#TOC>
|
|
|
|
#TOC> ==========================================================================
|
|
|
|
|
|
|
|
|
|
|
|
# = 1 Initialize ==========================================================
|
|
|
|
|
|
|
|
if (! requireNamespace("readr", quietly = TRUE)) {
|
|
|
|
install.packages("readr")
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
# = 2 Read STRING Data ====================================================
|
|
|
|
|
|
|
|
# Read STRING Data (needs to be downloaded from database, see URL in Notes)
|
|
|
|
# The .gz compressed version is 127.6MB, the uncompressed version is probably
|
|
|
|
# 848 Mb. Fortunately readr:: can read from compressed
|
|
|
|
# files, and does so automatically, based on the file extension.
|
|
|
|
( fn <- file.path("~", "9606.protein.links.full.v11.0.txt.gz") )
|
|
|
|
STR <- readr::read_delim(fn, delim = " ")
|
|
|
|
nrow(STR) # 11,759,454 rows
|
|
|
|
head(STR)
|
|
|
|
|
|
|
|
|
|
|
|
# = 3 Define cutoff and subset ============================================
|
|
|
|
|
|
|
|
# approximate distribution of combined_score
|
|
|
|
hist(sample(STR$combined_score, 10000), breaks = 50, col = "#6699FF")
|
|
|
|
|
|
|
|
# Let's table the counts >= 850 and plot them for better resolution.
|
|
|
|
|
|
|
|
myTb <- table(STR$combined_score[STR$combined_score >= 850])
|
|
|
|
is.unsorted(as.integer(names(myTb))) # Good - they are all in order
|
|
|
|
|
|
|
|
plot(myTb, type = "b", cex = 0.5, col = "#BB0000")
|
|
|
|
myTb[myTb == max(myTb)] # Apparently there is an algorithmic effect that
|
|
|
|
# frequently assigns a combined score of 0.900
|
|
|
|
|
|
|
|
# Let's plot these counts as cumulative sums, in reverse order, scaled
|
|
|
|
# as combined scores.
|
|
|
|
myX <- 1 - (1:length(myTb)) / 1000 # x-values, decreasing
|
|
|
|
plot(myX,
|
|
|
|
cumsum(myTb[length(myTb):1]), # cumulative sum, decreasing
|
|
|
|
xlim = c(1.0, 0.85), # reverse x-axis
|
|
|
|
type = "l",
|
|
|
|
main = "STRING interactions for 9606 (top 600,000)",
|
|
|
|
xlab = "combined_score",
|
|
|
|
ylab = "cumulative counts",
|
|
|
|
col = "#CC0000")
|
|
|
|
abline(h = seq(50000, sum(myTb), by = 50000), lwd = 0.5, col = "#DDDDFF")
|
|
|
|
|
|
|
|
# What's the cutoff for 100,000 edges?
|
|
|
|
which(cumsum(myTb[length(myTb):1]) >= 100000)[1] # p = 0.964
|
|
|
|
|
|
|
|
# confirm
|
|
|
|
sum(STR$combined_score >= 964) # 101,348
|
|
|
|
abline(v = 0.964, lwd = 0.5, col = "#DDDDFF")
|
|
|
|
|
|
|
|
# subset the table, and use only the protein IDs and the combined_score
|
|
|
|
STR <- STR[STR$combined_score >= 964,
|
|
|
|
c("protein1", "protein2", "combined_score")]
|
|
|
|
colnames(STR) <- c("a", "b", "score")
|
|
|
|
|
|
|
|
|
|
|
|
# = 4 Drop duplicates ====================================================
|
|
|
|
|
|
|
|
# identify duplicate interactions by creating keys in a defined alphabetical
|
|
|
|
# sort order, then checking for duplicated().
|
|
|
|
# e.g if we have (X:U, U:X), we change U:X to X:U and now find that
|
|
|
|
# (X:U, X:U) has a duplicate.
|
|
|
|
|
|
|
|
AB <- STR$a < STR$b # logical vector: genes we need to swap
|
|
|
|
tmp <- STR$b # copy column b
|
|
|
|
STR$b[AB] <- STR$a[AB] # copy a's into b
|
|
|
|
STR$a[AB] <- tmp[AB] # copy tmp's into a
|
|
|
|
all(STR$a >= STR$b) # confirm: TRUE
|
|
|
|
|
|
|
|
# now, make combined keys, like this:
|
|
|
|
paste0(STR$a[1:10], ":", STR$b[1:10])
|
|
|
|
|
|
|
|
tmp <- paste0(STR$a, ":", STR$b)
|
|
|
|
sum(duplicated(tmp)) # That's half of them ... i.e. STRING reports
|
|
|
|
# both a:b and b:a !
|
|
|
|
|
|
|
|
# drop all duplicated interactions from tmp
|
|
|
|
STR <- STR[ ! duplicated(tmp), ] # 50,674 interactions remain
|
|
|
|
|
|
|
|
|
|
|
|
# = 5 Simple statistics ===================================================
|
|
|
|
|
|
|
|
# how many unique genes?
|
|
|
|
length(unique(c(STR$a, STR$b))) # 8,445
|
|
|
|
|
|
|
|
# how many self-edges?
|
|
|
|
sum(STR$a == STR$b) # none
|
|
|
|
|
|
|
|
# log(rank) / log(frequency)
|
|
|
|
myTbl <- table(c(STR$a, STR$b))
|
|
|
|
myTbl <- myTbl[order(myTbl, decreasing = TRUE)]
|
|
|
|
|
|
|
|
hist(myTbl, breaks = 40, col = "#FFEEBB")
|
|
|
|
|
|
|
|
# number of singletons
|
|
|
|
sum(myTbl == 1) # almost a quarter
|
|
|
|
|
|
|
|
# maximum?
|
|
|
|
myTbl[which(myTbl == max(myTbl))] # 9606.ENSP00000360532: 465
|
|
|
|
# Google: CDC5L
|
|
|
|
|
|
|
|
# Zipf-plot
|
|
|
|
plot(log(1:length(myTbl)), log(as.numeric(myTbl)),
|
|
|
|
type = "b", cex = 0.7,
|
|
|
|
main = "STRINGedges - degrees",
|
|
|
|
xlab = "log(rank)",
|
|
|
|
ylab = "log(frequency)",
|
|
|
|
col = "#FFBB88")
|
|
|
|
|
|
|
|
sprintf("Average number of interactions: %5.2f",
|
|
|
|
nrow(STR) / length(unique(c(STR$a, STR$b))))
|
|
|
|
|
|
|
|
|
|
|
|
# = 6 Write to file =======================================================
|
|
|
|
|
|
|
|
saveRDS(STR, file = "./data/STRINGedges.rds")
|
|
|
|
|
|
|
|
# STRINGedges <- readRDS("./data/STRINGedges.rds") # use this to restore the
|
|
|
|
# object when needed
|
|
|
|
|
|
|
|
|
|
|
|
# [END]
|