# tocID <- "scripts/ABC-makeSTRINGedges.R"
#
# Create a subnetwork of high-confidence human STRING edges.
#
# Notes:
#
# The large source data file is NOT posted to GitHub. If you want to
# experiment with the original data, download it and place it into your
# local ./data directory (or point fn, below, to wherever you saved it).
#
# STRING data source:
# Download page:
# https://string-db.org/cgi/download.pl?species_text=Homo+sapiens
# Data: (127.6 Mb)
# https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz
#
# Version: 1.0
#
# Date: 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.0 Rewrite
#
# TODO:
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC>   Section  Title                      Line
#TOC> -------------------------------------------------
#TOC>   1        Initialize                   44
#TOC>   2        Read STRING Data             51
#TOC>   3        Define cutoff and subset     63
#TOC>   4        Drop duplicates             103
#TOC>   5        Simple statistics           127
#TOC>   6        Write to file               160
#TOC>
#TOC> ==========================================================================

# = 1 Initialize ==========================================================
if (! requireNamespace("readr", quietly = TRUE)) {
  install.packages("readr")
}

# = 2 Read STRING Data ====================================================
# Read STRING Data (needs to be downloaded from database, see URL in Notes)
# The .gz compressed version is 127.6 MB, the uncompressed version is about
# 848 MB. Fortunately readr:: can read from compressed files, and does so
# automatically, based on the file extension.
( fn <- file.path("~", "9606.protein.links.full.v11.0.txt.gz") )
STR <- readr::read_delim(fn, delim = " ")
nrow(STR) # 11,759,454 rows
head(STR)
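
# Aside: if memory is tight, the read could be restricted to the three columns
# this script actually uses - a sketch, assuming readr's cols_only()
# specification (uncomment to use it in place of the read_delim() call above):
# STR <- readr::read_delim(fn, delim = " ",
#                          col_types = readr::cols_only(
#                            protein1 = readr::col_character(),
#                            protein2 = readr::col_character(),
#                            combined_score = readr::col_integer()))
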
# = 3 Define cutoff and subset ============================================
# approximate distribution of combined_score
hist(sample(STR$combined_score, 10000), breaks = 50, col = "#6699FF")
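
# As a quick numerical complement to the sampled histogram, one can inspect
# the score values at the top percentiles of the full column (a sketch):
quantile(STR$combined_score, probs = seq(0.90, 1.00, by = 0.01))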
# Let's table the counts >= 850 and plot them for better resolution.
myTb <- table(STR$combined_score[STR$combined_score >= 850])
is.unsorted(as.integer(names(myTb))) # Good - they are all in order
plot(myTb, type = "b", cex = 0.5, col = "#BB0000")
myTb[myTb == max(myTb)]   # Apparently there is an algorithmic effect that
                          # frequently assigns a combined score of 0.900
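
# How large is that spike, as a fraction of all edges? (a quick check; this
# is the same count as max(myTb) above)
sum(STR$combined_score == 900) / nrow(STR)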
# Let's plot these counts as cumulative sums, in reverse order, scaled
# as combined scores.
myX <- 1 - (1:length(myTb)) / 1000 # x-values, decreasing
plot(myX,
     cumsum(myTb[length(myTb):1]),   # cumulative sum, decreasing
     xlim = c(1.0, 0.85),            # reverse x-axis
     type = "l",
     main = "STRING interactions for 9606 (top 600,000)",
     xlab = "combined_score",
     ylab = "cumulative counts",
     col = "#CC0000")
abline(h = seq(50000, sum(myTb), by = 50000), lwd = 0.5, col = "#DDDDFF")
# What's the cutoff for 100,000 edges?
which(cumsum(myTb[length(myTb):1]) >= 100000)[1] # p = 0.964
# confirm
sum(STR$combined_score >= 964) # 101,348
abline(v = 0.964, lwd = 0.5, col = "#DDDDFF")
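
# An equivalent way to find this cutoff is to read off the score at rank
# 100,000 of a decreasing sort (a quick cross-check):
sort(STR$combined_score, decreasing = TRUE)[100000] # 964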
# subset the table, and use only the protein IDs and the combined_score
STR <- STR[STR$combined_score >= 964,
           c("protein1", "protein2", "combined_score")]
colnames(STR) <- c("a", "b", "score")

# = 4 Drop duplicates ====================================================
# identify duplicate interactions by creating keys in a defined alphabetical
# sort order, then checking for duplicated().
# e.g. if we have (X:U, U:X), we change U:X to X:U and now find that
# (X:U, X:U) has a duplicate.
AB <- STR$a < STR$b       # logical vector: rows whose IDs we need to swap
tmp <- STR$b              # copy column b
STR$b[AB] <- STR$a[AB]    # copy a's into b
STR$a[AB] <- tmp[AB]      # copy the original b's (tmp) into a
all(STR$a >= STR$b) # confirm: TRUE
# now, make combined keys, like this:
paste0(STR$a[1:10], ":", STR$b[1:10])
tmp <- paste0(STR$a, ":", STR$b)
sum(duplicated(tmp))   # That's half of them ... i.e. STRING reports
                       # both a:b and b:a !
# drop all duplicated interactions from tmp
STR <- STR[ ! duplicated(tmp), ] # 50,674 interactions remain
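
# Note: the swap-and-paste steps above can be written more compactly with the
# element-wise pmax()/pmin() functions - a sketch, equivalent in effect:
# tmp <- paste0(pmax(STR$a, STR$b), ":", pmin(STR$a, STR$b))
# STR <- STR[ ! duplicated(tmp), ]
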
# = 5 Simple statistics ===================================================
# how many unique genes?
length(unique(c(STR$a, STR$b))) # 8,445
# how many self-edges?
sum(STR$a == STR$b) # none
# degree distribution: log(rank) vs. log(frequency)
myTbl <- table(c(STR$a, STR$b))
myTbl <- myTbl[order(myTbl, decreasing = TRUE)]
hist(myTbl, breaks = 40, col = "#FFEEBB")
# number of singletons
sum(myTbl == 1) # almost a quarter
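
# ... and as a fraction of all nodes (a quick check of "almost a quarter"):
sum(myTbl == 1) / length(myTbl)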
# maximum?
myTbl[which(myTbl == max(myTbl))] # 9606.ENSP00000360532: 465
# Google: CDC5L
# Zipf-plot
plot(log(1:length(myTbl)), log(as.numeric(myTbl)),
     type = "b", cex = 0.7,
     main = "STRINGedges - degrees",
     xlab = "log(rank)",
     ylab = "log(frequency)",
     col = "#FFBB88")
sprintf("Average number of interactions: %5.2f",
nrow(STR) / length(unique(c(STR$a, STR$b))))
# = 6 Write to file =======================================================
saveRDS(STR, file = "./data/STRINGedges.rds")
# STRINGedges <- readRDS("./data/STRINGedges.rds") # use this to restore the
# object when needed
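
# Optional sanity check: reading the file back should reproduce the object.
identical(STR, readRDS("./data/STRINGedges.rds")) # expect TRUE
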
# [END]