Update PPI-Analysis assets and code
This commit is contained in:
168
scripts/ABC-makeSTRINGedges.R
Normal file
168
scripts/ABC-makeSTRINGedges.R
Normal file
@@ -0,0 +1,168 @@
|
||||
# tocID <- "scripts/ABC-makeSTRINGedges.R"
|
||||
#
|
||||
# Create a subnetwork of high-confidence human STRING edges.
|
||||
#
|
||||
# Notes:
|
||||
#
|
||||
# The large source- datafile is NOT posted to github. If you want to
|
||||
# experiment with the original data, download it and place it into your
|
||||
# local ./data directory.
|
||||
#
|
||||
# STRING data source:
|
||||
# Download page:
|
||||
# https://string-db.org/cgi/download.pl?species_text=Homo+sapiens
|
||||
# Data: (127.6 Mb)
|
||||
# https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz
|
||||
#
|
||||
# Version: 1.0
|
||||
#
|
||||
# Date: 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.0 Rewrite
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -------------------------------------------------
|
||||
#TOC> 1 Initialize 44
|
||||
#TOC> 2 Read STRING Data 51
|
||||
#TOC> 3 Define cutoff and subset 63
|
||||
#TOC> 4 Drop duplicates 103
|
||||
#TOC> 5 Simple statistics 127
|
||||
#TOC> 6 Write to file 160
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Initialize ==========================================================
|
||||
|
||||
if (! requireNamespace("readr", quietly = TRUE)) {
|
||||
install.packages("readr")
|
||||
}
|
||||
|
||||
|
||||
# = 2 Read STRING Data ====================================================
|
||||
|
||||
# Read STRING Data (needs to be downloaded from database, see URL in Notes)
|
||||
# The .gz compressed version is 127.6MB, the uncompressed version is probably
|
||||
# 848 Mb. Fortunately readr:: can read from compressed
|
||||
# files, and does so automatically, based on the file extension.
|
||||
( fn <- file.path("~", "9606.protein.links.full.v11.0.txt.gz") )
|
||||
STR <- readr::read_delim(fn, delim = " ")
|
||||
nrow(STR) # 11,759,454 rows
|
||||
head(STR)
|
||||
|
||||
|
||||
# = 3 Define cutoff and subset ============================================
|
||||
|
||||
# approximate distribution of combined_score
|
||||
hist(sample(STR$combined_score, 10000), breaks = 50, col = "#6699FF")
|
||||
|
||||
# Let's table the counts >= 850 and plot them for better resolution.
|
||||
|
||||
myTb <- table(STR$combined_score[STR$combined_score >= 850])
|
||||
is.unsorted(as.integer(names(myTb))) # Good - they are all in order
|
||||
|
||||
plot(myTb, type = "b", cex = 0.5, col = "#BB0000")
|
||||
myTb[myTb == max(myTb)] # Apparently there is an algorithmic effect that
|
||||
# frequently assigns a combined score of 0.900
|
||||
|
||||
# Let's plot these counts as cumulative sums, in reverse order, scaled
|
||||
# as combined scores.
|
||||
myX <- 1 - (1:length(myTb)) / 1000 # x-values, decreasing
|
||||
plot(myX,
|
||||
cumsum(myTb[length(myTb):1]), # cumulative sum, decreasing
|
||||
xlim = c(1.0, 0.85), # reverse x-axis
|
||||
type = "l",
|
||||
main = "STRING interactions for 9606 (top 600,000)",
|
||||
xlab = "combined_score",
|
||||
ylab = "cumulative counts",
|
||||
col = "#CC0000")
|
||||
abline(h = seq(50000, sum(myTb), by = 50000), lwd = 0.5, col = "#DDDDFF")
|
||||
|
||||
# What's the cutoff for 100,000 edges?
|
||||
which(cumsum(myTb[length(myTb):1]) >= 100000)[1] # p = 0.964
|
||||
|
||||
# confirm
|
||||
sum(STR$combined_score >= 964) # 101,348
|
||||
abline(v = 0.964, lwd = 0.5, col = "#DDDDFF")
|
||||
|
||||
# subset the table, and use only the protein IDs and the combined_score
|
||||
STR <- STR[STR$combined_score >= 964,
|
||||
c("protein1", "protein2", "combined_score")]
|
||||
colnames(STR) <- c("a", "b", "score")
|
||||
|
||||
|
||||
# = 4 Drop duplicates ====================================================
|
||||
|
||||
# identify duplicate interactions by creating keys in a defined alphabetical
|
||||
# sort order, then checking for duplicated().
|
||||
# e.g if we have (X:U, U:X), we change U:X to X:U and now find that
|
||||
# (X:U, X:U) has a duplicate.
|
||||
|
||||
AB <- STR$a < STR$b # logical vector: genes we need to swap
|
||||
tmp <- STR$b # copy column b
|
||||
STR$b[AB] <- STR$a[AB] # copy a's into b
|
||||
STR$a[AB] <- tmp[AB] # copy tmp's into a
|
||||
all(STR$a >= STR$b) # confirm: TRUE
|
||||
|
||||
# now, make combined keys, like this:
|
||||
paste0(STR$a[1:10], ":", STR$b[1:10])
|
||||
|
||||
tmp <- paste0(STR$a, ":", STR$b)
|
||||
sum(duplicated(tmp)) # That's half of them ... i.e. STRING reports
|
||||
# both a:b and b:a !
|
||||
|
||||
# drop all duplicated interactions from tmp
|
||||
STR <- STR[ ! duplicated(tmp), ] # 50,674 interactions remain
|
||||
|
||||
|
||||
# = 5 Simple statistics ===================================================
|
||||
|
||||
# how many unique genes?
|
||||
length(unique(c(STR$a, STR$b))) # 8,445
|
||||
|
||||
# how many self-edges?
|
||||
sum(STR$a == STR$b) # none
|
||||
|
||||
# log(rank) / log(frequency)
|
||||
myTbl <- table(c(STR$a, STR$b))
|
||||
myTbl <- myTbl[order(myTbl, decreasing = TRUE)]
|
||||
|
||||
hist(myTbl, breaks = 40, col = "#FFEEBB")
|
||||
|
||||
# number of singletons
|
||||
sum(myTbl == 1) # almost a quarter
|
||||
|
||||
# maximum?
|
||||
myTbl[which(myTbl == max(myTbl))] # 9606.ENSP00000360532: 465
|
||||
# Google: CDC5L
|
||||
|
||||
# Zipf-plot
|
||||
plot(log(1:length(myTbl)), log(as.numeric(myTbl)),
|
||||
type = "b", cex = 0.7,
|
||||
main = "STRINGedges - degrees",
|
||||
xlab = "log(rank)",
|
||||
ylab = "log(frequency)",
|
||||
col = "#FFBB88")
|
||||
|
||||
sprintf("Average number of interactions: %5.2f",
|
||||
nrow(STR) / length(unique(c(STR$a, STR$b))))
|
||||
|
||||
|
||||
# = 6 Write to file =======================================================
|
||||
|
||||
saveRDS(STR, file = "./data/STRINGedges.rds")
|
||||
|
||||
# STRINGedges <- readRDS("./data/STRINGedges.rds") # use this to restore the
|
||||
# object when needed
|
||||
|
||||
|
||||
# [END]
|
Reference in New Issue
Block a user