Update PPI-Analysis assets and code
This commit is contained in:
parent
42ad74b2f2
commit
76b18a8a35
11
.utilities.R
11
.utilities.R
@ -332,5 +332,16 @@ selectChi2 <- function() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# == 5.2 selectENSP() ====================================================
|
||||||
|
selectENSP <- function(x) {
|
||||||
|
oldSeed <- .Random.seed
|
||||||
|
set.seed(myStudentNumber)
|
||||||
|
x <- sample(x[order(x)])
|
||||||
|
.Random.seed <- oldSeed
|
||||||
|
cat(sprintf("seal: %s\n", seal(paste0(x,collapse=""))))
|
||||||
|
return(x)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# [END]
|
# [END]
|
||||||
|
@ -1,10 +1,5 @@
|
|||||||
# tocID <- "BIN-PPI-Analysis.R"
|
# tocID <- "BIN-PPI-Analysis.R"
|
||||||
#
|
#
|
||||||
# ---------------------------------------------------------------------------- #
|
|
||||||
# PATIENCE ... #
|
|
||||||
# Do not yet work wih this code. Updates in progress. Thank you. #
|
|
||||||
# boris.steipe@utoronto.ca #
|
|
||||||
# ---------------------------------------------------------------------------- #
|
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-PPI-Analysis unit.
|
# R code accompanying the BIN-PPI-Analysis unit.
|
||||||
@ -15,7 +10,8 @@
|
|||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
# 1.2 Deprecate save()/load() for saveRDS()/readRDS()
|
# 1.2 2020 Updates; Rewrite for new STRINg V11;
|
||||||
|
# Deprecate save()/load() for saveRDS()/readRDS()
|
||||||
# 1.1 Change from require() to requireNamespace(),
|
# 1.1 Change from require() to requireNamespace(),
|
||||||
# use <package>::<function>() idiom throughout,
|
# use <package>::<function>() idiom throughout,
|
||||||
# use Biocmanager:: not biocLite()
|
# use Biocmanager:: not biocLite()
|
||||||
@ -36,17 +32,17 @@
|
|||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ---------------------------------------------------------------
|
#TOC> ---------------------------------------------------------------
|
||||||
#TOC> 1 Setup and data 46
|
#TOC> 1 Setup and data 49
|
||||||
#TOC> 2 Functional Edges in the Human Proteome 82
|
#TOC> 2 Functional Edges in the Human Proteome 85
|
||||||
#TOC> 2.1 Cliques 125
|
#TOC> 2.1 Cliques 128
|
||||||
#TOC> 2.2 Communities 166
|
#TOC> 2.2 Communities 169
|
||||||
#TOC> 2.3 Betweenness Centrality 180
|
#TOC> 2.3 Betweenness Centrality 183
|
||||||
#TOC> 3 biomaRt 226
|
#TOC> 3 biomaRt 230
|
||||||
#TOC> 4 Task for submission 296
|
#TOC> 4 Task for submission 301
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
@ -68,10 +64,10 @@ if (! requireNamespace("igraph", quietly = TRUE)) {
|
|||||||
# from the STRING database. The full table has 8.5 million records, here is a
|
# from the STRING database. The full table has 8.5 million records, here is a
|
||||||
# subset of records with combined confidence scores > 980
|
# subset of records with combined confidence scores > 980
|
||||||
|
|
||||||
# The selected set of edges with a confidence of > 980 is a dataframe with about
|
# The selected set of edges with a confidence of > 964 is a dataframe with about
|
||||||
# 50,000 edges and 6,500 unique proteins. Incidentaly, that's about the size of
|
# 50,000 edges and 8,400 unique proteins. Incidentaly, that's about the size of
|
||||||
# a fungal proteome. You can load the saved dataframe here (To read more about
|
# a fungal proteome. You can load the saved dataframe here (To read more about
|
||||||
# what the numbers mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).
|
# what the scores mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).
|
||||||
|
|
||||||
STRINGedges <- readRDS("./data/STRINGedges.rds")
|
STRINGedges <- readRDS("./data/STRINGedges.rds")
|
||||||
|
|
||||||
@ -80,8 +76,8 @@ head(STRINGedges)
|
|||||||
# Note that STRING has appended the tax-ID for Homo sapiens - 9606 - to the
|
# Note that STRING has appended the tax-ID for Homo sapiens - 9606 - to the
|
||||||
# Ensemble transcript identifiers that start with ENSP. We'll remove them:
|
# Ensemble transcript identifiers that start with ENSP. We'll remove them:
|
||||||
|
|
||||||
STRINGedges$protein1 <- gsub("^9606\\.", "", STRINGedges$protein1)
|
STRINGedges$a <- gsub("^9606\\.", "", STRINGedges$a)
|
||||||
STRINGedges$protein2 <- gsub("^9606\\.", "", STRINGedges$protein2)
|
STRINGedges$b <- gsub("^9606\\.", "", STRINGedges$b)
|
||||||
|
|
||||||
head(STRINGedges)
|
head(STRINGedges)
|
||||||
|
|
||||||
@ -91,9 +87,9 @@ head(STRINGedges)
|
|||||||
|
|
||||||
# There are many possibilities to explore interesting aspects of biological
|
# There are many possibilities to explore interesting aspects of biological
|
||||||
# networks, we will keep with some very simple procedures here but you have
|
# networks, we will keep with some very simple procedures here but you have
|
||||||
# to be aware that this is barely scratching the surface of possibilites.
|
# to be aware that this is barely scratching the surface of possibilities.
|
||||||
# However, once the network exists in your computer, it is comparatively
|
# However, once the network exists in your computer, it is comparatively
|
||||||
# easy to find information nline about the many, many options to analyze.
|
# easy to find information online about the many, many options to analyze.
|
||||||
|
|
||||||
|
|
||||||
# Make a graph from this dataframe
|
# Make a graph from this dataframe
|
||||||
@ -101,7 +97,7 @@ head(STRINGedges)
|
|||||||
|
|
||||||
gSTR <- igraph::graph_from_data_frame(STRINGedges, directed = FALSE)
|
gSTR <- igraph::graph_from_data_frame(STRINGedges, directed = FALSE)
|
||||||
|
|
||||||
# CAUTION you DON'T want to plot a graph with 6,500 nodes and 50,000 edges -
|
# CAUTION you DON'T want to plot a graph with 8,000 nodes and 50,000 edges -
|
||||||
# layout of such large graphs is possible, but requires specialized code. Google
|
# layout of such large graphs is possible, but requires specialized code. Google
|
||||||
# for <layout large graphs> if you are curious. Also, consider what one can
|
# for <layout large graphs> if you are curious. Also, consider what one can
|
||||||
# really learn from plotting such a graph ...
|
# really learn from plotting such a graph ...
|
||||||
@ -119,7 +115,7 @@ plot(log10(as.numeric(names(freqRank)) + 1),
|
|||||||
log10(as.numeric(freqRank)), type = "b",
|
log10(as.numeric(freqRank)), type = "b",
|
||||||
pch = 21, bg = "#FEE0AF",
|
pch = 21, bg = "#FEE0AF",
|
||||||
xlab = "log(Rank)", ylab = "log(frequency)",
|
xlab = "log(Rank)", ylab = "log(frequency)",
|
||||||
main = "6,500 nodes from the human functional interaction network")
|
main = "8,400 nodes from the human functional interaction network")
|
||||||
|
|
||||||
# This looks very scale-free indeed.
|
# This looks very scale-free indeed.
|
||||||
|
|
||||||
@ -136,11 +132,11 @@ abline(regressionLine, col = "firebrick")
|
|||||||
# Biological complexes often appear as cliques in interaction graphs.
|
# Biological complexes often appear as cliques in interaction graphs.
|
||||||
|
|
||||||
igraph::clique_num(gSTR)
|
igraph::clique_num(gSTR)
|
||||||
# The largest clique has 63 members.
|
# The largest clique has 81 members.
|
||||||
|
|
||||||
(C <- igraph::largest_cliques(gSTR)[[1]])
|
(C <- igraph::largest_cliques(gSTR)[[1]])
|
||||||
|
|
||||||
# Pick one of the proteins and find out what this fully connected cluster of 63
|
# Pick one of the proteins and find out what this fully connected cluster of 81
|
||||||
# proteins is (you can simply Google for any of the IDs). Is this expected?
|
# proteins is (you can simply Google for any of the IDs). Is this expected?
|
||||||
|
|
||||||
# Plot this ...
|
# Plot this ...
|
||||||
@ -166,7 +162,7 @@ plot(R,
|
|||||||
par(oPar)
|
par(oPar)
|
||||||
|
|
||||||
# ... well: remember: a clique means every node is connected to every other
|
# ... well: remember: a clique means every node is connected to every other
|
||||||
# node. We have 63 * 63 = 3,969 edges. This is what a matrix model of PPI
|
# node. We have 81 * 81 = 6,561 edges. This is what a matrix model of PPI
|
||||||
# networks looks like for large complexes.
|
# networks looks like for large complexes.
|
||||||
|
|
||||||
|
|
||||||
@ -179,9 +175,9 @@ set.seed(NULL) # reset the RNG
|
|||||||
igraph::modularity(gSTRclusters) # ... measures how separated the different
|
igraph::modularity(gSTRclusters) # ... measures how separated the different
|
||||||
# membership types are from each other
|
# membership types are from each other
|
||||||
tMem <- table(igraph::membership(gSTRclusters))
|
tMem <- table(igraph::membership(gSTRclusters))
|
||||||
length(tMem) # More than 2000 communities identified
|
length(tMem) # About 700 communities identified
|
||||||
hist(tMem, breaks = 50, col = "skyblue") # most clusters are small ...
|
hist(tMem, breaks = 50, col = "skyblue") # most clusters are small ...
|
||||||
range(tMem) # ... but one has > 100 members
|
range(tMem) # ... but one has > 200 members
|
||||||
|
|
||||||
|
|
||||||
# == 2.3 Betweenness Centrality ============================================
|
# == 2.3 Betweenness Centrality ============================================
|
||||||
@ -214,13 +210,14 @@ head(sBC)
|
|||||||
# IDs...
|
# IDs...
|
||||||
(ENSPsel <- names(igraph::V(gSTR)[BCsel]))
|
(ENSPsel <- names(igraph::V(gSTR)[BCsel]))
|
||||||
|
|
||||||
|
# Task:
|
||||||
|
# =====
|
||||||
|
# IMPORTANT, IF YOU INTEND TO SUBMIT YOUR ANALYSIS FOR CREDIT
|
||||||
# We are going to use these IDs to produce some output for a submitted task:
|
# We are going to use these IDs to produce some output for a submitted task:
|
||||||
# so I need you to personalize ENSPsel with the following
|
# therefore I need you to execute the following line, note the "seal" that this
|
||||||
# three lines of code:
|
# returns, and not change ENSPsel later:
|
||||||
|
|
||||||
set.seed(<myStudentNumber>) # enter your student number here
|
myENSPsel <- seal(ENSPsel)
|
||||||
(ENSPsel <- sample(ENSPsel))
|
|
||||||
set.seed(NULL) # reset the random number generator
|
|
||||||
|
|
||||||
# Next, to find what these proteins are...
|
# Next, to find what these proteins are...
|
||||||
|
|
||||||
@ -251,15 +248,15 @@ if (! requireNamespace("biomaRt", quietly = TRUE)) {
|
|||||||
# browseVignettes("biomaRt") # available vignettes
|
# browseVignettes("biomaRt") # available vignettes
|
||||||
# data(package = "biomaRt") # available datasets
|
# data(package = "biomaRt") # available datasets
|
||||||
|
|
||||||
# define which dataset to use ...
|
# define which dataset to use ... this takes a while for download
|
||||||
myMart <- biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl")
|
myMart <- biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl")
|
||||||
|
|
||||||
# what filters are defined?
|
# what filters are defined?
|
||||||
(filters <- biomaRt::listFilters(myMart))
|
( filters <- biomaRt::listFilters(myMart) )
|
||||||
|
|
||||||
|
|
||||||
# and what attributes can we filter for?
|
# and what attributes can we filter for?
|
||||||
(attributes <- biomaRt::listAttributes(myMart))
|
( attributes <- biomaRt::listAttributes(myMart) )
|
||||||
|
|
||||||
|
|
||||||
# Soooo many options - let's look for the correct name of filters that are
|
# Soooo many options - let's look for the correct name of filters that are
|
||||||
@ -296,13 +293,13 @@ for (ID in ENSPsel) {
|
|||||||
mart = myMart)
|
mart = myMart)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# So what are the proteins with the ten highest betweenness centralities?
|
# So what are the proteins with the ten highest betweenness centralities?
|
||||||
# ... are you surprised? (I am! Really.)
|
# ... are you surprised? (I am! Really.)
|
||||||
|
|
||||||
|
|
||||||
# = 4 Task for submission =================================================
|
# = 4 Task for submission =================================================
|
||||||
|
|
||||||
|
|
||||||
# Write a loop that will go through your personalized list of Ensemble IDs and
|
# Write a loop that will go through your personalized list of Ensemble IDs and
|
||||||
# for each ID:
|
# for each ID:
|
||||||
# -- print the ID,
|
# -- print the ID,
|
||||||
@ -310,11 +307,14 @@ for (ID in ENSPsel) {
|
|||||||
# -- print the first row's wikigene description.
|
# -- print the first row's wikigene description.
|
||||||
# -- print the first row's phenotype.
|
# -- print the first row's phenotype.
|
||||||
#
|
#
|
||||||
|
# Write your thoughts about this group of genes.
|
||||||
|
#
|
||||||
# (Hint, you can structure your loop in the same way as the loop that
|
# (Hint, you can structure your loop in the same way as the loop that
|
||||||
# created CPdefs. )
|
# created CPdefs. )
|
||||||
|
|
||||||
# Place the R code for this loop and its output into your report if you are
|
# Submit the "seal" for your ENSP vector, the ENSP vector itself, the R code
|
||||||
# submitting a report for this unit. Please read the requirements carefully.
|
# for this loop and its output into your report if you are submitting
|
||||||
|
# anything for credit for this unit. Please read the requirements carefully.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -30,7 +30,7 @@
|
|||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> -----------------------------------------------------------------------------
|
#TOC> -----------------------------------------------------------------------------
|
||||||
#TOC> 1 Introduction 54
|
#TOC> 1 Introduction 54
|
||||||
@ -47,7 +47,7 @@
|
|||||||
#TOC> 4.2.1 An example from tossing dice 465
|
#TOC> 4.2.1 An example from tossing dice 465
|
||||||
#TOC> 4.2.2 An example from lognormal distributions 588
|
#TOC> 4.2.2 An example from lognormal distributions 588
|
||||||
#TOC> 4.3 Kolmogorov-Smirnov test for continuous distributions 631
|
#TOC> 4.3 Kolmogorov-Smirnov test for continuous distributions 631
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
@ -462,7 +462,7 @@ chisq.test(countsL1, countsG1.9, simulate.p.value = TRUE, B = 10000)
|
|||||||
# be applied to discrete distributions. But we need to talk a bit about
|
# be applied to discrete distributions. But we need to talk a bit about
|
||||||
# converting counts to p.m.f.'s.
|
# converting counts to p.m.f.'s.
|
||||||
|
|
||||||
# === 4.2.1 An example from tossing dice
|
# === 4.2.1 An example from tossing dice
|
||||||
|
|
||||||
# The p.m.f of an honest die is (1:1/6, 2:1/6, 3:1/6, 4:1/6, 5:1/6, 6:1/6). But
|
# The p.m.f of an honest die is (1:1/6, 2:1/6, 3:1/6, 4:1/6, 5:1/6, 6:1/6). But
|
||||||
# there is an issue when we convert sampled counts to frequencies, and estimate
|
# there is an issue when we convert sampled counts to frequencies, and estimate
|
||||||
@ -585,7 +585,7 @@ abline(v = KLdiv(rep(1/6, 6), pmfPC(counts, 1:6)), col="firebrick")
|
|||||||
# somewhat but not drastically atypical.
|
# somewhat but not drastically atypical.
|
||||||
|
|
||||||
|
|
||||||
# === 4.2.2 An example from lognormal distributions
|
# === 4.2.2 An example from lognormal distributions
|
||||||
|
|
||||||
# We had compared a set of lognormal and gamma distributions above, now we
|
# We had compared a set of lognormal and gamma distributions above, now we
|
||||||
# can use KL-divergence to quantify their similarity:
|
# can use KL-divergence to quantify their similarity:
|
||||||
@ -628,7 +628,7 @@ sum(divs < KLdiv(pmfL1, pmfL2)) # 933
|
|||||||
# we used above.
|
# we used above.
|
||||||
|
|
||||||
|
|
||||||
# == 4.3 Kolmogorov-Smirnov test for continuous distributions ==============
|
# == 4.3 Continuous distributions: Kolmogorov-Smirnov test ==============
|
||||||
|
|
||||||
# The Kolmogorov-Smirnov (KS) test is meant for continuous distributions, i.e.
|
# The Kolmogorov-Smirnov (KS) test is meant for continuous distributions, i.e.
|
||||||
# the probability it calculates assumes that the function values are all
|
# the probability it calculates assumes that the function values are all
|
||||||
|
Binary file not shown.
168
scripts/ABC-makeSTRINGedges.R
Normal file
168
scripts/ABC-makeSTRINGedges.R
Normal file
@ -0,0 +1,168 @@
|
|||||||
|
# tocID <- "scripts/ABC-makeSTRINGedges.R"
|
||||||
|
#
|
||||||
|
# Create a subnetwork of high-confidence human STRING edges.
|
||||||
|
#
|
||||||
|
# Notes:
|
||||||
|
#
|
||||||
|
# The large source- datafile is NOT posted to github. If you want to
|
||||||
|
# experiment with the original data, download it and place it into your
|
||||||
|
# local ./data directory.
|
||||||
|
#
|
||||||
|
# STRING data source:
|
||||||
|
# Download page:
|
||||||
|
# https://string-db.org/cgi/download.pl?species_text=Homo+sapiens
|
||||||
|
# Data: (127.6 Mb)
|
||||||
|
# https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz
|
||||||
|
#
|
||||||
|
# Version: 1.0
|
||||||
|
#
|
||||||
|
# Date: 2020-09
|
||||||
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
|
#
|
||||||
|
# Versions:
|
||||||
|
# 1.0 Rewrite
|
||||||
|
#
|
||||||
|
# TODO:
|
||||||
|
#
|
||||||
|
# ==============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
#TOC> ==========================================================================
|
||||||
|
#TOC>
|
||||||
|
#TOC> Section Title Line
|
||||||
|
#TOC> -------------------------------------------------
|
||||||
|
#TOC> 1 Initialize 44
|
||||||
|
#TOC> 2 Read STRING Data 51
|
||||||
|
#TOC> 3 Define cutoff and subset 63
|
||||||
|
#TOC> 4 Drop duplicates 103
|
||||||
|
#TOC> 5 Simple statistics 127
|
||||||
|
#TOC> 6 Write to file 160
|
||||||
|
#TOC>
|
||||||
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
|
# = 1 Initialize ==========================================================
|
||||||
|
|
||||||
|
if (! requireNamespace("readr", quietly = TRUE)) {
|
||||||
|
install.packages("readr")
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# = 2 Read STRING Data ====================================================
|
||||||
|
|
||||||
|
# Read STRING Data (needs to be downloaded from database, see URL in Notes)
|
||||||
|
# The .gz compressed version is 127.6MB, the uncompressed version is probably
|
||||||
|
# 848 Mb. Fortunately readr:: can read from compressed
|
||||||
|
# files, and does so automatically, based on the file extension.
|
||||||
|
( fn <- file.path("~", "9606.protein.links.full.v11.0.txt.gz") )
|
||||||
|
STR <- readr::read_delim(fn, delim = " ")
|
||||||
|
nrow(STR) # 11,759,454 rows
|
||||||
|
head(STR)
|
||||||
|
|
||||||
|
|
||||||
|
# = 3 Define cutoff and subset ============================================
|
||||||
|
|
||||||
|
# approximate distribution of combined_score
|
||||||
|
hist(sample(STR$combined_score, 10000), breaks = 50, col = "#6699FF")
|
||||||
|
|
||||||
|
# Let's table the counts >= 850 and plot them for better resolution.
|
||||||
|
|
||||||
|
myTb <- table(STR$combined_score[STR$combined_score >= 850])
|
||||||
|
is.unsorted(as.integer(names(myTb))) # Good - they are all in order
|
||||||
|
|
||||||
|
plot(myTb, type = "b", cex = 0.5, col = "#BB0000")
|
||||||
|
myTb[myTb == max(myTb)] # Apparently there is an algorithmic effect that
|
||||||
|
# frequently assigns a combined score of 0.900
|
||||||
|
|
||||||
|
# Let's plot these counts as cumulative sums, in reverse order, scaled
|
||||||
|
# as combined scores.
|
||||||
|
myX <- 1 - (1:length(myTb)) / 1000 # x-values, decreasing
|
||||||
|
plot(myX,
|
||||||
|
cumsum(myTb[length(myTb):1]), # cumulative sum, decreasing
|
||||||
|
xlim = c(1.0, 0.85), # reverse x-axis
|
||||||
|
type = "l",
|
||||||
|
main = "STRING interactions for 9606 (top 600,000)",
|
||||||
|
xlab = "combined_score",
|
||||||
|
ylab = "cumulative counts",
|
||||||
|
col = "#CC0000")
|
||||||
|
abline(h = seq(50000, sum(myTb), by = 50000), lwd = 0.5, col = "#DDDDFF")
|
||||||
|
|
||||||
|
# What's the cutoff for 100,000 edges?
|
||||||
|
which(cumsum(myTb[length(myTb):1]) >= 100000)[1] # p = 0.964
|
||||||
|
|
||||||
|
# confirm
|
||||||
|
sum(STR$combined_score >= 964) # 101,348
|
||||||
|
abline(v = 0.964, lwd = 0.5, col = "#DDDDFF")
|
||||||
|
|
||||||
|
# subset the table, and use only the protein IDs and the combined_score
|
||||||
|
STR <- STR[STR$combined_score >= 964,
|
||||||
|
c("protein1", "protein2", "combined_score")]
|
||||||
|
colnames(STR) <- c("a", "b", "score")
|
||||||
|
|
||||||
|
|
||||||
|
# = 4 Drop duplicates ====================================================
|
||||||
|
|
||||||
|
# identify duplicate interactions by creating keys in a defined alphabetical
|
||||||
|
# sort order, then checking for duplicated().
|
||||||
|
# e.g if we have (X:U, U:X), we change U:X to X:U and now find that
|
||||||
|
# (X:U, X:U) has a duplicate.
|
||||||
|
|
||||||
|
AB <- STR$a < STR$b # logical vector: genes we need to swap
|
||||||
|
tmp <- STR$b # copy column b
|
||||||
|
STR$b[AB] <- STR$a[AB] # copy a's into b
|
||||||
|
STR$a[AB] <- tmp[AB] # copy tmp's into a
|
||||||
|
all(STR$a >= STR$b) # confirm: TRUE
|
||||||
|
|
||||||
|
# now, make combined keys, like this:
|
||||||
|
paste0(STR$a[1:10], ":", STR$b[1:10])
|
||||||
|
|
||||||
|
tmp <- paste0(STR$a, ":", STR$b)
|
||||||
|
sum(duplicated(tmp)) # That's half of them ... i.e. STRING reports
|
||||||
|
# both a:b and b:a !
|
||||||
|
|
||||||
|
# drop all duplicated interactions from tmp
|
||||||
|
STR <- STR[ ! duplicated(tmp), ] # 50,674 interactions remain
|
||||||
|
|
||||||
|
|
||||||
|
# = 5 Simple statistics ===================================================
|
||||||
|
|
||||||
|
# how many unique genes?
|
||||||
|
length(unique(c(STR$a, STR$b))) # 8,445
|
||||||
|
|
||||||
|
# how many self-edges?
|
||||||
|
sum(STR$a == STR$b) # none
|
||||||
|
|
||||||
|
# log(rank) / log(frequency)
|
||||||
|
myTbl <- table(c(STR$a, STR$b))
|
||||||
|
myTbl <- myTbl[order(myTbl, decreasing = TRUE)]
|
||||||
|
|
||||||
|
hist(myTbl, breaks = 40, col = "#FFEEBB")
|
||||||
|
|
||||||
|
# number of singletons
|
||||||
|
sum(myTbl == 1) # almost a quarter
|
||||||
|
|
||||||
|
# maximum?
|
||||||
|
myTbl[which(myTbl == max(myTbl))] # 9606.ENSP00000360532: 465
|
||||||
|
# Google: CDC5L
|
||||||
|
|
||||||
|
# Zipf-plot
|
||||||
|
plot(log(1:length(myTbl)), log(as.numeric(myTbl)),
|
||||||
|
type = "b", cex = 0.7,
|
||||||
|
main = "STRINGedges - degrees",
|
||||||
|
xlab = "log(rank)",
|
||||||
|
ylab = "log(frequency)",
|
||||||
|
col = "#FFBB88")
|
||||||
|
|
||||||
|
sprintf("Average number of interactions: %5.2f",
|
||||||
|
nrow(STR) / length(unique(c(STR$a, STR$b))))
|
||||||
|
|
||||||
|
|
||||||
|
# = 6 Write to file =======================================================
|
||||||
|
|
||||||
|
saveRDS(STR, file = "./data/STRINGedges.rds")
|
||||||
|
|
||||||
|
# STRINGedges <- readRDS("./data/STRINGedges.rds") # use this to restore the
|
||||||
|
# object when needed
|
||||||
|
|
||||||
|
|
||||||
|
# [END]
|
Loading…
x
Reference in New Issue
Block a user