Update PPI-Analysis assets and code

2020-09-23 20:47:26 +10:00
parent 42ad74b2f2
commit 76b18a8a35
5 changed files with 224 additions and 45 deletions
--- a/.utilities.R
+++ b/.utilities.R
@@ -332,5 +332,16 @@ selectChi2 <- function() {
 }


+# ==   5.2  selectENSP()  ====================================================
+selectENSP <- function(x) {
+  oldSeed <- .Random.seed
+  set.seed(myStudentNumber)
+  x <- sample(x[order(x)])
+  .Random.seed <- oldSeed
+  cat(sprintf("seal: %s\n", seal(paste0(x,collapse=""))))
+  return(x)
+}
+
+

 # [END]
--- a/BIN-PPI-Analysis.R
+++ b/BIN-PPI-Analysis.R
@@ -1,10 +1,5 @@
 # tocID <- "BIN-PPI-Analysis.R"
 #
-# ---------------------------------------------------------------------------- #
-#  PATIENCE  ...                                                               #
-#    Do not yet work wih this code. Updates in progress. Thank you.            #
-#    boris.steipe@utoronto.ca                                                  #
-# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-PPI-Analysis unit.
@@ -15,7 +10,8 @@
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
-#           1.2    Deprecate save()/load() for saveRDS()/readRDS()
+#           1.2    2020 Updates; Rewrite for new STRINg V11;
+#                  Deprecate save()/load() for saveRDS()/readRDS()
 #           1.1    Change from require() to requireNamespace(),
 #                      use <package>::<function>() idiom throughout,
 #                      use Biocmanager:: not biocLite()
@@ -39,13 +35,13 @@
 #TOC> 
 #TOC>   Section  Title                                           Line
 #TOC> ---------------------------------------------------------------
-#TOC>   1        Setup and data                                    46
-#TOC>   2        Functional Edges in the Human Proteome            82
-#TOC>   2.1        Cliques                                        125
-#TOC>   2.2        Communities                                    166
-#TOC>   2.3        Betweenness Centrality                         180
-#TOC>   3        biomaRt                                          226
-#TOC>   4        Task for submission                              296
+#TOC>   1        Setup and data                                    49
+#TOC>   2        Functional Edges in the Human Proteome            85
+#TOC>   2.1        Cliques                                        128
+#TOC>   2.2        Communities                                    169
+#TOC>   2.3        Betweenness Centrality                         183
+#TOC>   3        biomaRt                                          230
+#TOC>   4        Task for submission                              301
 #TOC> 
 #TOC> ==========================================================================

@@ -68,10 +64,10 @@ if (! requireNamespace("igraph", quietly = TRUE)) {
 # from the STRING database. The full table has 8.5 million records, here is a
 # subset of records with combined confidence scores > 980

-# The selected set of edges with a confidence of > 980 is a dataframe with about
-# 50,000 edges and 6,500 unique proteins. Incidentaly, that's about the size of
+# The selected set of edges with a confidence of > 964 is a dataframe with about
+# 50,000 edges and 8,400 unique proteins. Incidentaly, that's about the size of
 # a fungal proteome. You can load the saved dataframe here (To read more about
-# what the numbers mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).
+# what the scores mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).

 STRINGedges <- readRDS("./data/STRINGedges.rds")

@@ -80,8 +76,8 @@ head(STRINGedges)
 # Note that STRING has appended the tax-ID for Homo sapiens - 9606 - to the
 # Ensemble transcript identifiers that start with ENSP. We'll remove them:

-STRINGedges$protein1 <- gsub("^9606\\.", "", STRINGedges$protein1)
-STRINGedges$protein2 <- gsub("^9606\\.", "", STRINGedges$protein2)
+STRINGedges$a <- gsub("^9606\\.", "", STRINGedges$a)
+STRINGedges$b <- gsub("^9606\\.", "", STRINGedges$b)

 head(STRINGedges)

@@ -91,9 +87,9 @@ head(STRINGedges)

 # There are many possibilities to explore interesting aspects of biological
 # networks, we will keep with some very simple procedures here but you have
-# to be aware that this is barely scratching the surface of possibilites.
+# to be aware that this is barely scratching the surface of possibilities.
 # However, once the network exists in your computer, it is comparatively
-# easy to find information nline about the many, many options to analyze.
+# easy to find information online about the many, many options to analyze.


 # Make a graph from this dataframe
@@ -101,7 +97,7 @@ head(STRINGedges)

 gSTR <- igraph::graph_from_data_frame(STRINGedges, directed = FALSE)

-# CAUTION you DON'T want to plot a graph with 6,500 nodes and 50,000 edges -
+# CAUTION you DON'T want to plot a graph with 8,000 nodes and 50,000 edges -
 # layout of such large graphs is possible, but requires specialized code. Google
 # for <layout large graphs> if you are curious. Also, consider what one can
 # really learn from plotting such a graph ...
@@ -119,7 +115,7 @@ plot(log10(as.numeric(names(freqRank)) + 1),
     log10(as.numeric(freqRank)), type = "b",
     pch = 21, bg = "#FEE0AF",
     xlab = "log(Rank)", ylab = "log(frequency)",
-     main = "6,500 nodes from the human functional interaction network")
+     main = "8,400 nodes from the human functional interaction network")

 # This looks very scale-free indeed.

@@ -136,11 +132,11 @@ abline(regressionLine, col = "firebrick")
 # Biological complexes often appear as cliques in interaction graphs.

 igraph::clique_num(gSTR)
-# The largest clique has 63 members.
+# The largest clique has 81 members.

 (C <- igraph::largest_cliques(gSTR)[[1]])

-# Pick one of the proteins and find out what this fully connected cluster of 63
+# Pick one of the proteins and find out what this fully connected cluster of 81
 # proteins is (you can simply Google for any of the IDs). Is this expected?

 # Plot this ...
@@ -166,7 +162,7 @@ plot(R,
 par(oPar)

 # ... well: remember: a clique means every node is connected to every other
-# node. We have 63 * 63 = 3,969 edges. This is what a matrix model of PPI
+# node. We have 81 * 81 = 6,561 edges. This is what a matrix model of PPI
 # networks looks like for large complexes.


@@ -179,9 +175,9 @@ set.seed(NULL)                         # reset the RNG
 igraph::modularity(gSTRclusters) # ... measures how separated the different
                                 # membership types are from each other
 tMem <- table(igraph::membership(gSTRclusters))
-length(tMem)  # More than 2000 communities identified
+length(tMem)  # About 700 communities identified
 hist(tMem, breaks = 50, col = "skyblue")  # most clusters are small ...
-range(tMem) # ... but one has > 100 members
+range(tMem) # ... but one has > 200 members


 # ==   2.3  Betweenness Centrality  ============================================
@@ -214,13 +210,14 @@ head(sBC)
 # IDs...
 (ENSPsel <- names(igraph::V(gSTR)[BCsel]))

+# Task:
+# =====
+# IMPORTANT, IF YOU INTEND TO SUBMIT YOUR ANALYSIS FOR CREDIT
 # We are going to use these IDs to produce some output for a submitted task:
-# so I need you to personalize ENSPsel with the following
-# three lines of code:
+# therefore I need you to execute the following line, note the "seal" that this
+# returns, and not change ENSPsel later:

-set.seed(<myStudentNumber>)         # enter your student number here
-(ENSPsel <- sample(ENSPsel))
-set.seed(NULL)                      # reset the random number generator
+myENSPsel <- seal(ENSPsel)

 #  Next, to find what these proteins are...

@@ -251,15 +248,15 @@ if (! requireNamespace("biomaRt", quietly = TRUE)) {
 #  browseVignettes("biomaRt")    # available vignettes
 #  data(package = "biomaRt")     # available datasets

-# define which dataset to use ...
+# define which dataset to use ... this takes a while for download
 myMart <- biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl")

 # what filters are defined?
-(filters <- biomaRt::listFilters(myMart))
+( filters <- biomaRt::listFilters(myMart) )


 # and what attributes can we filter for?
-(attributes <- biomaRt::listAttributes(myMart))
+( attributes <- biomaRt::listAttributes(myMart) )


 # Soooo many options - let's look for the correct name of filters that are
@@ -296,13 +293,13 @@ for (ID in ENSPsel) {
                                 mart = myMart)
 }

+
 # So what are the proteins with the ten highest betweenness centralities?
 #  ... are you surprised? (I am! Really.)


 # =    4  Task for submission  =================================================

-
 # Write a loop that will go through your personalized list of Ensemble IDs and
 #    for each ID:
 #    --  print the ID,
@@ -310,11 +307,14 @@ for (ID in ENSPsel) {
 #    --  print the first row's wikigene description.
 #    --  print the first row's phenotype.
 #
+# Write your thoughts about this group of genes.
+#
 # (Hint, you can structure your loop in the same way as the loop that
 # created CPdefs. )

-# Place the R code for this loop and its output into your report if you are
-# submitting a report for this unit. Please read the requirements carefully.
+# Submit the "seal" for your ENSP vector, the ENSP vector itself, the R code
+# for this loop and its output into your report if you are submitting
+# anything for credit for this unit. Please read the requirements carefully.



--- a/FND-STA-Probability_distribution.R
+++ b/FND-STA-Probability_distribution.R
@@ -628,7 +628,7 @@ sum(divs < KLdiv(pmfL1, pmfL2)) # 933
 # we used above.


-# ==   4.3  Kolmogorov-Smirnov test for continuous distributions  ==============
+# ==   4.3  Continuous distributions: Kolmogorov-Smirnov test   ==============

 # The Kolmogorov-Smirnov (KS) test is meant for continuous distributions, i.e.
 # the probability it calculates assumes that the function values are all
--- a/data/STRINGedges.rds
+++ b/data/STRINGedges.rds
--- a/scripts/ABC-makeSTRINGedges.R
+++ b/scripts/ABC-makeSTRINGedges.R
@@ -0,0 +1,168 @@
+# tocID <- "scripts/ABC-makeSTRINGedges.R"
+#
+# Create a subnetwork of high-confidence human STRING edges.
+#
+# Notes:
+#
+#      The large source- datafile is NOT posted to github. If you want to
+#      experiment with the original data, download it and place it into your
+#      local  ./data  directory.
+#
+#      STRING data source:
+#        Download page:
+# https://string-db.org/cgi/download.pl?species_text=Homo+sapiens
+#        Data: (127.6 Mb)
+# https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz
+#
+# Version:  1.0
+#
+# Date:     2020-09
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.0    Rewrite
+#
+# TODO:
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                             Line
+#TOC> -------------------------------------------------
+#TOC>   1        Initialize                          44
+#TOC>   2        Read STRING Data                    51
+#TOC>   3        Define cutoff and subset            63
+#TOC>   4        Drop  duplicates                   103
+#TOC>   5        Simple statistics                  127
+#TOC>   6        Write to file                      160
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  Initialize  ==========================================================
+
+if (! requireNamespace("readr", quietly = TRUE)) {
+  install.packages("readr")
+}
+
+
+# =    2  Read STRING Data  ====================================================
+
+# Read STRING Data (needs to be downloaded from database, see URL in Notes)
+# The .gz compressed version is 127.6MB, the uncompressed version is probably
+# 848 Mb. Fortunately readr:: can read from compressed
+# files, and does so automatically, based on the file extension.
+( fn <- file.path("~", "9606.protein.links.full.v11.0.txt.gz") )
+STR <- readr::read_delim(fn, delim = " ")
+nrow(STR)  #  11,759,454 rows
+head(STR)
+
+
+# =    3  Define cutoff and subset  ============================================
+
+# approximate distribution of combined_score
+hist(sample(STR$combined_score, 10000), breaks = 50, col = "#6699FF")
+
+# Let's table the counts >= 850 and plot them for better resolution.
+
+myTb <- table(STR$combined_score[STR$combined_score >= 850])
+is.unsorted(as.integer(names(myTb)))  # Good - they are all in order
+
+plot(myTb, type = "b", cex = 0.5, col = "#BB0000")
+myTb[myTb == max(myTb)]  # Apparently there is an algorithmic effect that
+                         # frequently assigns a combined score of 0.900
+
+# Let's plot these counts as cumulative sums, in reverse order, scaled
+# as combined scores.
+myX <- 1 - (1:length(myTb)) / 1000   # x-values, decreasing
+plot(myX,
+     cumsum(myTb[length(myTb):1]),   # cumulative sum, decreasing
+     xlim = c(1.0, 0.85),            # reverse x-axis
+     type = "l",
+     main = "STRING interactions for 9606 (top 600,000)",
+     xlab = "combined_score",
+     ylab = "cumulative counts",
+     col = "#CC0000")
+abline(h = seq(50000, sum(myTb), by = 50000), lwd = 0.5, col = "#DDDDFF")
+
+# What's the cutoff for 100,000 edges?
+which(cumsum(myTb[length(myTb):1]) >= 100000)[1] # p = 0.964
+
+# confirm
+sum(STR$combined_score >= 964) # 101,348
+abline(v = 0.964, lwd = 0.5, col = "#DDDDFF")
+
+# subset the table, and use only the protein IDs and the combined_score
+STR <- STR[STR$combined_score >= 964,
+            c("protein1", "protein2", "combined_score")]
+colnames(STR) <- c("a", "b", "score")
+
+
+# =    4  Drop  duplicates  ====================================================
+
+# identify duplicate interactions by creating keys in a defined alphabetical
+# sort order, then checking for  duplicated().
+# e.g  if we have (X:U, U:X), we change U:X to X:U and now find that
+# (X:U, X:U) has a duplicate.
+
+AB <- STR$a < STR$b        # logical vector: genes we need to swap
+tmp <- STR$b               # copy column b
+STR$b[AB] <- STR$a[AB]     # copy a's into b
+STR$a[AB] <- tmp[AB]       # copy tmp's into a
+all(STR$a >= STR$b)        # confirm: TRUE
+
+# now, make combined keys, like this:
+paste0(STR$a[1:10], ":", STR$b[1:10])
+
+tmp <- paste0(STR$a, ":", STR$b)
+sum(duplicated(tmp)) # That's half of them ... i.e. STRING reports
+                     # both a:b and b:a !
+
+# drop all duplicated interactions from tmp
+STR <- STR[ ! duplicated(tmp), ]   # 50,674 interactions remain
+
+
+# =    5  Simple statistics  ===================================================
+
+# how many unique genes?
+length(unique(c(STR$a, STR$b)))   # 8,445
+
+# how many self-edges?
+sum(STR$a == STR$b)  # none
+
+# log(rank) / log(frequency)
+myTbl <- table(c(STR$a, STR$b))
+myTbl <- myTbl[order(myTbl, decreasing = TRUE)]
+
+hist(myTbl, breaks = 40, col = "#FFEEBB")
+
+# number of singletons
+sum(myTbl == 1) # almost a quarter
+
+# maximum?
+myTbl[which(myTbl == max(myTbl))]  # 9606.ENSP00000360532: 465
+                                   # Google: CDC5L
+
+# Zipf-plot
+plot(log(1:length(myTbl)), log(as.numeric(myTbl)),
+     type = "b", cex = 0.7,
+     main = "STRINGedges - degrees",
+     xlab = "log(rank)",
+     ylab = "log(frequency)",
+     col = "#FFBB88")
+
+sprintf("Average number of interactions: %5.2f",
+         nrow(STR) / length(unique(c(STR$a, STR$b))))
+
+
+# =    6  Write to file  =======================================================
+
+saveRDS(STR, file = "./data/STRINGedges.rds")
+
+# STRINGedges <- readRDS("./data/STRINGedges.rds")  # use this to restore the
+                                                    # object when needed
+
+
+# [END]