Update PPI-Analysis assets and code
This commit is contained in:
		
							
								
								
									
										11
									
								
								.utilities.R
									
									
									
									
									
								
							
							
						
						
									
										11
									
								
								.utilities.R
									
									
									
									
									
								
							@@ -332,5 +332,16 @@ selectChi2 <- function() {
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# ==   5.2  selectENSP()  ====================================================
 | 
				
			||||||
 | 
					selectENSP <- function(x) {
 | 
				
			||||||
 | 
					  oldSeed <- .Random.seed
 | 
				
			||||||
 | 
					  set.seed(myStudentNumber)
 | 
				
			||||||
 | 
					  x <- sample(x[order(x)])
 | 
				
			||||||
 | 
					  .Random.seed <- oldSeed
 | 
				
			||||||
 | 
					  cat(sprintf("seal: %s\n", seal(paste0(x,collapse=""))))
 | 
				
			||||||
 | 
					  return(x)
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# [END]
 | 
					# [END]
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,10 +1,5 @@
 | 
				
			|||||||
# tocID <- "BIN-PPI-Analysis.R"
 | 
					# tocID <- "BIN-PPI-Analysis.R"
 | 
				
			||||||
#
 | 
					#
 | 
				
			||||||
# ---------------------------------------------------------------------------- #
 | 
					 | 
				
			||||||
#  PATIENCE  ...                                                               #
 | 
					 | 
				
			||||||
#    Do not yet work wih this code. Updates in progress. Thank you.            #
 | 
					 | 
				
			||||||
#    boris.steipe@utoronto.ca                                                  #
 | 
					 | 
				
			||||||
# ---------------------------------------------------------------------------- #
 | 
					 | 
				
			||||||
#
 | 
					#
 | 
				
			||||||
# Purpose:  A Bioinformatics Course:
 | 
					# Purpose:  A Bioinformatics Course:
 | 
				
			||||||
#              R code accompanying the BIN-PPI-Analysis unit.
 | 
					#              R code accompanying the BIN-PPI-Analysis unit.
 | 
				
			||||||
@@ -15,7 +10,8 @@
 | 
				
			|||||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
					# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
				
			||||||
#
 | 
					#
 | 
				
			||||||
# Versions:
 | 
					# Versions:
 | 
				
			||||||
#           1.2    Deprecate save()/load() for saveRDS()/readRDS()
 | 
					#           1.2    2020 Updates; Rewrite for new STRINg V11;
 | 
				
			||||||
 | 
					#                  Deprecate save()/load() for saveRDS()/readRDS()
 | 
				
			||||||
#           1.1    Change from require() to requireNamespace(),
 | 
					#           1.1    Change from require() to requireNamespace(),
 | 
				
			||||||
#                      use <package>::<function>() idiom throughout,
 | 
					#                      use <package>::<function>() idiom throughout,
 | 
				
			||||||
#                      use Biocmanager:: not biocLite()
 | 
					#                      use Biocmanager:: not biocLite()
 | 
				
			||||||
@@ -36,17 +32,17 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#TOC> ==========================================================================
 | 
					#TOC> ==========================================================================
 | 
				
			||||||
#TOC>
 | 
					#TOC> 
 | 
				
			||||||
#TOC>   Section  Title                                           Line
 | 
					#TOC>   Section  Title                                           Line
 | 
				
			||||||
#TOC> ---------------------------------------------------------------
 | 
					#TOC> ---------------------------------------------------------------
 | 
				
			||||||
#TOC>   1        Setup and data                                    46
 | 
					#TOC>   1        Setup and data                                    49
 | 
				
			||||||
#TOC>   2        Functional Edges in the Human Proteome            82
 | 
					#TOC>   2        Functional Edges in the Human Proteome            85
 | 
				
			||||||
#TOC>   2.1        Cliques                                        125
 | 
					#TOC>   2.1        Cliques                                        128
 | 
				
			||||||
#TOC>   2.2        Communities                                    166
 | 
					#TOC>   2.2        Communities                                    169
 | 
				
			||||||
#TOC>   2.3        Betweenness Centrality                         180
 | 
					#TOC>   2.3        Betweenness Centrality                         183
 | 
				
			||||||
#TOC>   3        biomaRt                                          226
 | 
					#TOC>   3        biomaRt                                          230
 | 
				
			||||||
#TOC>   4        Task for submission                              296
 | 
					#TOC>   4        Task for submission                              301
 | 
				
			||||||
#TOC>
 | 
					#TOC> 
 | 
				
			||||||
#TOC> ==========================================================================
 | 
					#TOC> ==========================================================================
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -68,10 +64,10 @@ if (! requireNamespace("igraph", quietly = TRUE)) {
 | 
				
			|||||||
# from the STRING database. The full table has 8.5 million records, here is a
 | 
					# from the STRING database. The full table has 8.5 million records, here is a
 | 
				
			||||||
# subset of records with combined confidence scores > 980
 | 
					# subset of records with combined confidence scores > 980
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# The selected set of edges with a confidence of > 980 is a dataframe with about
 | 
					# The selected set of edges with a confidence of > 964 is a dataframe with about
 | 
				
			||||||
# 50,000 edges and 6,500 unique proteins. Incidentaly, that's about the size of
 | 
					# 50,000 edges and 8,400 unique proteins. Incidentaly, that's about the size of
 | 
				
			||||||
# a fungal proteome. You can load the saved dataframe here (To read more about
 | 
					# a fungal proteome. You can load the saved dataframe here (To read more about
 | 
				
			||||||
# what the numbers mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).
 | 
					# what the scores mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).
 | 
				
			||||||
 | 
					
 | 
				
			||||||
STRINGedges <- readRDS("./data/STRINGedges.rds")
 | 
					STRINGedges <- readRDS("./data/STRINGedges.rds")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -80,8 +76,8 @@ head(STRINGedges)
 | 
				
			|||||||
# Note that STRING has appended the tax-ID for Homo sapiens - 9606 - to the
 | 
					# Note that STRING has appended the tax-ID for Homo sapiens - 9606 - to the
 | 
				
			||||||
# Ensemble transcript identifiers that start with ENSP. We'll remove them:
 | 
					# Ensemble transcript identifiers that start with ENSP. We'll remove them:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
STRINGedges$protein1 <- gsub("^9606\\.", "", STRINGedges$protein1)
 | 
					STRINGedges$a <- gsub("^9606\\.", "", STRINGedges$a)
 | 
				
			||||||
STRINGedges$protein2 <- gsub("^9606\\.", "", STRINGedges$protein2)
 | 
					STRINGedges$b <- gsub("^9606\\.", "", STRINGedges$b)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
head(STRINGedges)
 | 
					head(STRINGedges)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -91,9 +87,9 @@ head(STRINGedges)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
# There are many possibilities to explore interesting aspects of biological
 | 
					# There are many possibilities to explore interesting aspects of biological
 | 
				
			||||||
# networks, we will keep with some very simple procedures here but you have
 | 
					# networks, we will keep with some very simple procedures here but you have
 | 
				
			||||||
# to be aware that this is barely scratching the surface of possibilites.
 | 
					# to be aware that this is barely scratching the surface of possibilities.
 | 
				
			||||||
# However, once the network exists in your computer, it is comparatively
 | 
					# However, once the network exists in your computer, it is comparatively
 | 
				
			||||||
# easy to find information nline about the many, many options to analyze.
 | 
					# easy to find information online about the many, many options to analyze.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Make a graph from this dataframe
 | 
					# Make a graph from this dataframe
 | 
				
			||||||
@@ -101,7 +97,7 @@ head(STRINGedges)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
gSTR <- igraph::graph_from_data_frame(STRINGedges, directed = FALSE)
 | 
					gSTR <- igraph::graph_from_data_frame(STRINGedges, directed = FALSE)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# CAUTION you DON'T want to plot a graph with 6,500 nodes and 50,000 edges -
 | 
					# CAUTION you DON'T want to plot a graph with 8,000 nodes and 50,000 edges -
 | 
				
			||||||
# layout of such large graphs is possible, but requires specialized code. Google
 | 
					# layout of such large graphs is possible, but requires specialized code. Google
 | 
				
			||||||
# for <layout large graphs> if you are curious. Also, consider what one can
 | 
					# for <layout large graphs> if you are curious. Also, consider what one can
 | 
				
			||||||
# really learn from plotting such a graph ...
 | 
					# really learn from plotting such a graph ...
 | 
				
			||||||
@@ -119,7 +115,7 @@ plot(log10(as.numeric(names(freqRank)) + 1),
 | 
				
			|||||||
     log10(as.numeric(freqRank)), type = "b",
 | 
					     log10(as.numeric(freqRank)), type = "b",
 | 
				
			||||||
     pch = 21, bg = "#FEE0AF",
 | 
					     pch = 21, bg = "#FEE0AF",
 | 
				
			||||||
     xlab = "log(Rank)", ylab = "log(frequency)",
 | 
					     xlab = "log(Rank)", ylab = "log(frequency)",
 | 
				
			||||||
     main = "6,500 nodes from the human functional interaction network")
 | 
					     main = "8,400 nodes from the human functional interaction network")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# This looks very scale-free indeed.
 | 
					# This looks very scale-free indeed.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -136,11 +132,11 @@ abline(regressionLine, col = "firebrick")
 | 
				
			|||||||
# Biological complexes often appear as cliques in interaction graphs.
 | 
					# Biological complexes often appear as cliques in interaction graphs.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
igraph::clique_num(gSTR)
 | 
					igraph::clique_num(gSTR)
 | 
				
			||||||
# The largest clique has 63 members.
 | 
					# The largest clique has 81 members.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
(C <- igraph::largest_cliques(gSTR)[[1]])
 | 
					(C <- igraph::largest_cliques(gSTR)[[1]])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Pick one of the proteins and find out what this fully connected cluster of 63
 | 
					# Pick one of the proteins and find out what this fully connected cluster of 81
 | 
				
			||||||
# proteins is (you can simply Google for any of the IDs). Is this expected?
 | 
					# proteins is (you can simply Google for any of the IDs). Is this expected?
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Plot this ...
 | 
					# Plot this ...
 | 
				
			||||||
@@ -166,7 +162,7 @@ plot(R,
 | 
				
			|||||||
par(oPar)
 | 
					par(oPar)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# ... well: remember: a clique means every node is connected to every other
 | 
					# ... well: remember: a clique means every node is connected to every other
 | 
				
			||||||
# node. We have 63 * 63 = 3,969 edges. This is what a matrix model of PPI
 | 
					# node. We have 81 * 81 = 6,561 edges. This is what a matrix model of PPI
 | 
				
			||||||
# networks looks like for large complexes.
 | 
					# networks looks like for large complexes.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -179,9 +175,9 @@ set.seed(NULL)                         # reset the RNG
 | 
				
			|||||||
igraph::modularity(gSTRclusters) # ... measures how separated the different
 | 
					igraph::modularity(gSTRclusters) # ... measures how separated the different
 | 
				
			||||||
                                 # membership types are from each other
 | 
					                                 # membership types are from each other
 | 
				
			||||||
tMem <- table(igraph::membership(gSTRclusters))
 | 
					tMem <- table(igraph::membership(gSTRclusters))
 | 
				
			||||||
length(tMem)  # More than 2000 communities identified
 | 
					length(tMem)  # About 700 communities identified
 | 
				
			||||||
hist(tMem, breaks = 50, col = "skyblue")  # most clusters are small ...
 | 
					hist(tMem, breaks = 50, col = "skyblue")  # most clusters are small ...
 | 
				
			||||||
range(tMem) # ... but one has > 100 members
 | 
					range(tMem) # ... but one has > 200 members
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# ==   2.3  Betweenness Centrality  ============================================
 | 
					# ==   2.3  Betweenness Centrality  ============================================
 | 
				
			||||||
@@ -214,13 +210,14 @@ head(sBC)
 | 
				
			|||||||
# IDs...
 | 
					# IDs...
 | 
				
			||||||
(ENSPsel <- names(igraph::V(gSTR)[BCsel]))
 | 
					(ENSPsel <- names(igraph::V(gSTR)[BCsel]))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Task:
 | 
				
			||||||
 | 
					# =====
 | 
				
			||||||
 | 
					# IMPORTANT, IF YOU INTEND TO SUBMIT YOUR ANALYSIS FOR CREDIT
 | 
				
			||||||
# We are going to use these IDs to produce some output for a submitted task:
 | 
					# We are going to use these IDs to produce some output for a submitted task:
 | 
				
			||||||
# so I need you to personalize ENSPsel with the following
 | 
					# therefore I need you to execute the following line, note the "seal" that this
 | 
				
			||||||
# three lines of code:
 | 
					# returns, and not change ENSPsel later:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
set.seed(<myStudentNumber>)         # enter your student number here
 | 
					myENSPsel <- seal(ENSPsel)
 | 
				
			||||||
(ENSPsel <- sample(ENSPsel))
 | 
					 | 
				
			||||||
set.seed(NULL)                      # reset the random number generator
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
#  Next, to find what these proteins are...
 | 
					#  Next, to find what these proteins are...
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -251,15 +248,15 @@ if (! requireNamespace("biomaRt", quietly = TRUE)) {
 | 
				
			|||||||
#  browseVignettes("biomaRt")    # available vignettes
 | 
					#  browseVignettes("biomaRt")    # available vignettes
 | 
				
			||||||
#  data(package = "biomaRt")     # available datasets
 | 
					#  data(package = "biomaRt")     # available datasets
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# define which dataset to use ...
 | 
					# define which dataset to use ... this takes a while for download
 | 
				
			||||||
myMart <- biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl")
 | 
					myMart <- biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# what filters are defined?
 | 
					# what filters are defined?
 | 
				
			||||||
(filters <- biomaRt::listFilters(myMart))
 | 
					( filters <- biomaRt::listFilters(myMart) )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# and what attributes can we filter for?
 | 
					# and what attributes can we filter for?
 | 
				
			||||||
(attributes <- biomaRt::listAttributes(myMart))
 | 
					( attributes <- biomaRt::listAttributes(myMart) )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Soooo many options - let's look for the correct name of filters that are
 | 
					# Soooo many options - let's look for the correct name of filters that are
 | 
				
			||||||
@@ -296,13 +293,13 @@ for (ID in ENSPsel) {
 | 
				
			|||||||
                                 mart = myMart)
 | 
					                                 mart = myMart)
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# So what are the proteins with the ten highest betweenness centralities?
 | 
					# So what are the proteins with the ten highest betweenness centralities?
 | 
				
			||||||
#  ... are you surprised? (I am! Really.)
 | 
					#  ... are you surprised? (I am! Really.)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# =    4  Task for submission  =================================================
 | 
					# =    4  Task for submission  =================================================
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 | 
				
			||||||
# Write a loop that will go through your personalized list of Ensemble IDs and
 | 
					# Write a loop that will go through your personalized list of Ensemble IDs and
 | 
				
			||||||
#    for each ID:
 | 
					#    for each ID:
 | 
				
			||||||
#    --  print the ID,
 | 
					#    --  print the ID,
 | 
				
			||||||
@@ -310,11 +307,14 @@ for (ID in ENSPsel) {
 | 
				
			|||||||
#    --  print the first row's wikigene description.
 | 
					#    --  print the first row's wikigene description.
 | 
				
			||||||
#    --  print the first row's phenotype.
 | 
					#    --  print the first row's phenotype.
 | 
				
			||||||
#
 | 
					#
 | 
				
			||||||
 | 
					# Write your thoughts about this group of genes.
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
# (Hint, you can structure your loop in the same way as the loop that
 | 
					# (Hint, you can structure your loop in the same way as the loop that
 | 
				
			||||||
# created CPdefs. )
 | 
					# created CPdefs. )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Place the R code for this loop and its output into your report if you are
 | 
					# Submit the "seal" for your ENSP vector, the ENSP vector itself, the R code
 | 
				
			||||||
# submitting a report for this unit. Please read the requirements carefully.
 | 
					# for this loop and its output into your report if you are submitting
 | 
				
			||||||
 | 
					# anything for credit for this unit. Please read the requirements carefully.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -30,7 +30,7 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#TOC> ==========================================================================
 | 
					#TOC> ==========================================================================
 | 
				
			||||||
#TOC> 
 | 
					#TOC>
 | 
				
			||||||
#TOC>   Section  Title                                                         Line
 | 
					#TOC>   Section  Title                                                         Line
 | 
				
			||||||
#TOC> -----------------------------------------------------------------------------
 | 
					#TOC> -----------------------------------------------------------------------------
 | 
				
			||||||
#TOC>   1        Introduction                                                    54
 | 
					#TOC>   1        Introduction                                                    54
 | 
				
			||||||
@@ -47,7 +47,7 @@
 | 
				
			|||||||
#TOC>   4.2.1          An example from tossing dice                             465
 | 
					#TOC>   4.2.1          An example from tossing dice                             465
 | 
				
			||||||
#TOC>   4.2.2          An example from lognormal distributions                  588
 | 
					#TOC>   4.2.2          An example from lognormal distributions                  588
 | 
				
			||||||
#TOC>   4.3        Kolmogorov-Smirnov test for continuous distributions         631
 | 
					#TOC>   4.3        Kolmogorov-Smirnov test for continuous distributions         631
 | 
				
			||||||
#TOC> 
 | 
					#TOC>
 | 
				
			||||||
#TOC> ==========================================================================
 | 
					#TOC> ==========================================================================
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -462,7 +462,7 @@ chisq.test(countsL1, countsG1.9, simulate.p.value = TRUE, B = 10000)
 | 
				
			|||||||
# be applied to discrete distributions. But we need to talk a bit about
 | 
					# be applied to discrete distributions. But we need to talk a bit about
 | 
				
			||||||
# converting counts to p.m.f.'s.
 | 
					# converting counts to p.m.f.'s.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# ===   4.2.1  An example from tossing dice                        
 | 
					# ===   4.2.1  An example from tossing dice
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#  The p.m.f of an honest die is (1:1/6, 2:1/6, 3:1/6, 4:1/6, 5:1/6, 6:1/6). But
 | 
					#  The p.m.f of an honest die is (1:1/6, 2:1/6, 3:1/6, 4:1/6, 5:1/6, 6:1/6). But
 | 
				
			||||||
#  there is an issue when we convert sampled counts to frequencies, and estimate
 | 
					#  there is an issue when we convert sampled counts to frequencies, and estimate
 | 
				
			||||||
@@ -585,7 +585,7 @@ abline(v = KLdiv(rep(1/6, 6), pmfPC(counts, 1:6)), col="firebrick")
 | 
				
			|||||||
# somewhat but not drastically atypical.
 | 
					# somewhat but not drastically atypical.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# ===   4.2.2  An example from lognormal distributions             
 | 
					# ===   4.2.2  An example from lognormal distributions
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# We had compared a set of lognormal and gamma distributions above, now we
 | 
					# We had compared a set of lognormal and gamma distributions above, now we
 | 
				
			||||||
# can use KL-divergence to quantify their similarity:
 | 
					# can use KL-divergence to quantify their similarity:
 | 
				
			||||||
@@ -628,7 +628,7 @@ sum(divs < KLdiv(pmfL1, pmfL2)) # 933
 | 
				
			|||||||
# we used above.
 | 
					# we used above.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# ==   4.3  Kolmogorov-Smirnov test for continuous distributions  ==============
 | 
					# ==   4.3  Continuous distributions: Kolmogorov-Smirnov test   ==============
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# The Kolmogorov-Smirnov (KS) test is meant for continuous distributions, i.e.
 | 
					# The Kolmogorov-Smirnov (KS) test is meant for continuous distributions, i.e.
 | 
				
			||||||
# the probability it calculates assumes that the function values are all
 | 
					# the probability it calculates assumes that the function values are all
 | 
				
			||||||
 
 | 
				
			|||||||
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										168
									
								
								scripts/ABC-makeSTRINGedges.R
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										168
									
								
								scripts/ABC-makeSTRINGedges.R
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,168 @@
 | 
				
			|||||||
 | 
					# tocID <- "scripts/ABC-makeSTRINGedges.R"
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					# Create a subnetwork of high-confidence human STRING edges.
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					# Notes:
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					#      The large source- datafile is NOT posted to github. If you want to
 | 
				
			||||||
 | 
					#      experiment with the original data, download it and place it into your
 | 
				
			||||||
 | 
					#      local  ./data  directory.
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					#      STRING data source:
 | 
				
			||||||
 | 
					#        Download page:
 | 
				
			||||||
 | 
					# https://string-db.org/cgi/download.pl?species_text=Homo+sapiens
 | 
				
			||||||
 | 
					#        Data: (127.6 Mb)
 | 
				
			||||||
 | 
					# https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					# Version:  1.0
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					# Date:     2020-09
 | 
				
			||||||
 | 
					# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					# Versions:
 | 
				
			||||||
 | 
					#           1.0    Rewrite
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					# TODO:
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					# ==============================================================================
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#TOC> ==========================================================================
 | 
				
			||||||
 | 
					#TOC> 
 | 
				
			||||||
 | 
					#TOC>   Section  Title                             Line
 | 
				
			||||||
 | 
					#TOC> -------------------------------------------------
 | 
				
			||||||
 | 
					#TOC>   1        Initialize                          44
 | 
				
			||||||
 | 
					#TOC>   2        Read STRING Data                    51
 | 
				
			||||||
 | 
					#TOC>   3        Define cutoff and subset            63
 | 
				
			||||||
 | 
					#TOC>   4        Drop  duplicates                   103
 | 
				
			||||||
 | 
					#TOC>   5        Simple statistics                  127
 | 
				
			||||||
 | 
					#TOC>   6        Write to file                      160
 | 
				
			||||||
 | 
					#TOC> 
 | 
				
			||||||
 | 
					#TOC> ==========================================================================
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# =    1  Initialize  ==========================================================
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if (! requireNamespace("readr", quietly = TRUE)) {
 | 
				
			||||||
 | 
					  install.packages("readr")
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# =    2  Read STRING Data  ====================================================
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Read STRING Data (needs to be downloaded from database, see URL in Notes)
 | 
				
			||||||
 | 
					# The .gz compressed version is 127.6MB, the uncompressed version is probably
 | 
				
			||||||
 | 
					# 848 Mb. Fortunately readr:: can read from compressed
 | 
				
			||||||
 | 
					# files, and does so automatically, based on the file extension.
 | 
				
			||||||
 | 
					( fn <- file.path("~", "9606.protein.links.full.v11.0.txt.gz") )
 | 
				
			||||||
 | 
					STR <- readr::read_delim(fn, delim = " ")
 | 
				
			||||||
 | 
					nrow(STR)  #  11,759,454 rows
 | 
				
			||||||
 | 
					head(STR)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# =    3  Define cutoff and subset  ============================================
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# approximate distribution of combined_score
 | 
				
			||||||
 | 
					hist(sample(STR$combined_score, 10000), breaks = 50, col = "#6699FF")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Let's table the counts >= 850 and plot them for better resolution.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					myTb <- table(STR$combined_score[STR$combined_score >= 850])
 | 
				
			||||||
 | 
					is.unsorted(as.integer(names(myTb)))  # Good - they are all in order
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					plot(myTb, type = "b", cex = 0.5, col = "#BB0000")
 | 
				
			||||||
 | 
					myTb[myTb == max(myTb)]  # Apparently there is an algorithmic effect that
 | 
				
			||||||
 | 
					                         # frequently assigns a combined score of 0.900
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Let's plot these counts as cumulative sums, in reverse order, scaled
 | 
				
			||||||
 | 
					# as combined scores.
 | 
				
			||||||
 | 
					myX <- 1 - (1:length(myTb)) / 1000   # x-values, decreasing
 | 
				
			||||||
 | 
					plot(myX,
 | 
				
			||||||
 | 
					     cumsum(myTb[length(myTb):1]),   # cumulative sum, decreasing
 | 
				
			||||||
 | 
					     xlim = c(1.0, 0.85),            # reverse x-axis
 | 
				
			||||||
 | 
					     type = "l",
 | 
				
			||||||
 | 
					     main = "STRING interactions for 9606 (top 600,000)",
 | 
				
			||||||
 | 
					     xlab = "combined_score",
 | 
				
			||||||
 | 
					     ylab = "cumulative counts",
 | 
				
			||||||
 | 
					     col = "#CC0000")
 | 
				
			||||||
 | 
					abline(h = seq(50000, sum(myTb), by = 50000), lwd = 0.5, col = "#DDDDFF")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# What's the cutoff for 100,000 edges?
 | 
				
			||||||
 | 
					which(cumsum(myTb[length(myTb):1]) >= 100000)[1] # p = 0.964
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# confirm
 | 
				
			||||||
 | 
					sum(STR$combined_score >= 964) # 101,348
 | 
				
			||||||
 | 
					abline(v = 0.964, lwd = 0.5, col = "#DDDDFF")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# subset the table, and use only the protein IDs and the combined_score
 | 
				
			||||||
 | 
					STR <- STR[STR$combined_score >= 964,
 | 
				
			||||||
 | 
					            c("protein1", "protein2", "combined_score")]
 | 
				
			||||||
 | 
					colnames(STR) <- c("a", "b", "score")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# =    4  Drop  duplicates  ====================================================
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# identify duplicate interactions by creating keys in a defined alphabetical
 | 
				
			||||||
 | 
					# sort order, then checking for  duplicated().
 | 
				
			||||||
 | 
					# e.g  if we have (X:U, U:X), we change U:X to X:U and now find that
 | 
				
			||||||
 | 
					# (X:U, X:U) has a duplicate.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					AB <- STR$a < STR$b        # logical vector: genes we need to swap
 | 
				
			||||||
 | 
					tmp <- STR$b               # copy column b
 | 
				
			||||||
 | 
					STR$b[AB] <- STR$a[AB]     # copy a's into b
 | 
				
			||||||
 | 
					STR$a[AB] <- tmp[AB]       # copy tmp's into a
 | 
				
			||||||
 | 
					all(STR$a >= STR$b)        # confirm: TRUE
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# now, make combined keys, like this:
 | 
				
			||||||
 | 
					paste0(STR$a[1:10], ":", STR$b[1:10])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					tmp <- paste0(STR$a, ":", STR$b)
 | 
				
			||||||
 | 
					sum(duplicated(tmp)) # That's half of them ... i.e. STRING reports
 | 
				
			||||||
 | 
					                     # both a:b and b:a !
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# drop all duplicated interactions from tmp
 | 
				
			||||||
 | 
					STR <- STR[ ! duplicated(tmp), ]   # 50,674 interactions remain
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# =    5  Simple statistics  ===================================================
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# how many unique genes?
 | 
				
			||||||
 | 
					length(unique(c(STR$a, STR$b)))   # 8,445
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# how many self-edges?
 | 
				
			||||||
 | 
					sum(STR$a == STR$b)  # none
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# log(rank) / log(frequency)
 | 
				
			||||||
 | 
					myTbl <- table(c(STR$a, STR$b))
 | 
				
			||||||
 | 
					myTbl <- myTbl[order(myTbl, decreasing = TRUE)]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					hist(myTbl, breaks = 40, col = "#FFEEBB")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# number of singletons
 | 
				
			||||||
 | 
					sum(myTbl == 1) # almost a quarter
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# maximum?
 | 
				
			||||||
 | 
					myTbl[which(myTbl == max(myTbl))]  # 9606.ENSP00000360532: 465
 | 
				
			||||||
 | 
					                                   # Google: CDC5L
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Zipf-plot
 | 
				
			||||||
 | 
					plot(log(1:length(myTbl)), log(as.numeric(myTbl)),
 | 
				
			||||||
 | 
					     type = "b", cex = 0.7,
 | 
				
			||||||
 | 
					     main = "STRINGedges - degrees",
 | 
				
			||||||
 | 
					     xlab = "log(rank)",
 | 
				
			||||||
 | 
					     ylab = "log(frequency)",
 | 
				
			||||||
 | 
					     col = "#FFBB88")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					sprintf("Average number of interactions: %5.2f",
 | 
				
			||||||
 | 
					         nrow(STR) / length(unique(c(STR$a, STR$b))))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# =    6  Write to file  =======================================================
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					saveRDS(STR, file = "./data/STRINGedges.rds")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# STRINGedges <- readRDS("./data/STRINGedges.rds")  # use this to restore the
 | 
				
			||||||
 | 
					                                                    # object when needed
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# [END]
 | 
				
			||||||
		Reference in New Issue
	
	Block a user