diff --git a/.utilities.R b/.utilities.R
index 94dbbe6..fc8f88a 100644
--- a/.utilities.R
+++ b/.utilities.R
@@ -332,5 +332,16 @@ selectChi2 <- function() {
 }
 
+# ==   5.2  selectENSP()  ====================================================
+selectENSP <- function(x) {
+  oldSeed <- .Random.seed                  # save the RNG state ...
+  set.seed(myStudentNumber)                # ... and re-seed it reproducibly
+  x <- sample(x[order(x)])                 # sort, then shuffle deterministically
+  assign(".Random.seed", oldSeed, envir = .GlobalEnv)  # restore the RNG state
+  cat(sprintf("seal: %s\n", seal(paste0(x, collapse = ""))))
+  return(x)
+}
+
+
 # [END]
diff --git a/BIN-PPI-Analysis.R b/BIN-PPI-Analysis.R
index 2eaeaf4..daa4577 100644
--- a/BIN-PPI-Analysis.R
+++ b/BIN-PPI-Analysis.R
@@ -1,10 +1,5 @@
 # tocID <- "BIN-PPI-Analysis.R"
 #
-# ---------------------------------------------------------------------------- #
-#                                 PATIENCE ...                                  #
-#        Do not yet work wih this code. Updates in progress. Thank you.         #
-#                           boris.steipe@utoronto.ca                            #
-# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-PPI-Analysis unit.
@@ -15,7 +10,8 @@
 # Author: Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
-#           1.2    Deprecate save()/load() for saveRDS()/readRDS()
+#           1.2    2020 Updates; Rewrite for new STRING V11;
+#                  Deprecate save()/load() for saveRDS()/readRDS()
 #           1.1    Change from require() to requireNamespace(),
 #                      use ::() idiom throughout,
 #                      use Biocmanager:: not biocLite()
@@ -36,17 +32,17 @@
 
 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                                           Line
 #TOC> ---------------------------------------------------------------
-#TOC>   1        Setup and data                                    46
-#TOC>   2        Functional Edges in the Human Proteome            82
-#TOC>   2.1        Cliques                                        125
-#TOC>   2.2        Communities                                    166
-#TOC>   2.3        Betweenness Centrality                         180
-#TOC>   3        biomaRt                                          226
-#TOC>   4        Task for submission                              296
-#TOC> 
+#TOC>   1        Setup and data                                    49
+#TOC>   2        Functional Edges in the Human Proteome            85
+#TOC>   2.1        Cliques                                        128
+#TOC>   2.2        Communities                                    169
+#TOC>   2.3        Betweenness Centrality                         183
+#TOC>   3        biomaRt                                          230
+#TOC>   4        Task for submission                              301
+#TOC>
 #TOC> ==========================================================================
@@ -68,10 +64,10 @@ if (! requireNamespace("igraph", quietly = TRUE)) {
 
 # from the STRING database. The full table has 8.5 million records, here is a
 # subset of records with combined confidence scores > 980
-# The selected set of edges with a confidence of > 980 is a dataframe with about
-# 50,000 edges and 6,500 unique proteins. Incidentaly, that's about the size of
+# The selected set of edges with a confidence of > 964 is a dataframe with about
+# 50,000 edges and 8,400 unique proteins. Incidentally, that's about the size of
 # a fungal proteome. You can load the saved dataframe here (To read more about
-# what the numbers mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).
+# what the scores mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).
 
 STRINGedges <- readRDS("./data/STRINGedges.rds")
@@ -80,8 +76,8 @@ head(STRINGedges)
 
 # Note that STRING has appended the tax-ID for Homo sapiens - 9606 - to the
 # Ensemble transcript identifiers that start with ENSP. We'll remove them:
-STRINGedges$protein1 <- gsub("^9606\\.", "", STRINGedges$protein1)
-STRINGedges$protein2 <- gsub("^9606\\.", "", STRINGedges$protein2)
+STRINGedges$a <- gsub("^9606\\.", "", STRINGedges$a)
+STRINGedges$b <- gsub("^9606\\.", "", STRINGedges$b)
 
 head(STRINGedges)
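+
+# As an aside - a minimal illustration of what the substitution does. The
+# ID below is made up for demonstration; the pattern is anchored at the
+# start of the string, so only a leading "9606." is removed:
+gsub("^9606\\.", "", c("9606.ENSP0000000TEST", "ENSP0000000TEST"))
+#  -> "ENSP0000000TEST" "ENSP0000000TEST"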
@@ -91,9 +87,9 @@ head(STRINGedges)
 
 # There are many possibilities to explore interesting aspects of biological
 # networks, we will keep with some very simple procedures here but you have
-# to be aware that this is barely scratching the surface of possibilites.
+# to be aware that this is barely scratching the surface of possibilities.
 # However, once the network exists in your computer, it is comparatively
-# easy to find information nline about the many, many options to analyze.
+# easy to find information online about the many, many options to analyze.
 
 # Make a graph from this dataframe
@@ -101,7 +97,7 @@ head(STRINGedges)
 
 gSTR <- igraph::graph_from_data_frame(STRINGedges, directed = FALSE)
 
-# CAUTION you DON'T want to plot a graph with 6,500 nodes and 50,000 edges -
+# CAUTION you DON'T want to plot a graph with 8,400 nodes and 50,000 edges -
 # layout of such large graphs is possible, but requires specialized code. Google
 # for  if you are curious. Also, consider what one can
 # really learn from plotting such a graph ...
@@ -119,7 +115,7 @@ plot(log10(as.numeric(names(freqRank)) + 1),
      log10(as.numeric(freqRank)), type = "b",
      pch = 21, bg = "#FEE0AF",
      xlab = "log(Rank)", ylab = "log(frequency)",
-     main = "6,500 nodes from the human functional interaction network")
+     main = "8,400 nodes from the human functional interaction network")
 
 # This looks very scale-free indeed.
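+
+# As an aside - "regressionLine", used below, is computed in a part of the
+# script that this diff does not show. Purely as an illustration, a fit of
+# this kind could be obtained from freqRank like so ("myFit" is a stand-in
+# name, not the script's own):
+x <- log10(as.numeric(names(freqRank)) + 1)
+y <- log10(as.numeric(freqRank))
+myFit <- lm(y ~ x)
+coef(myFit)   # the (negative) slope estimates the power-law exponent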
@@ -136,11 +132,11 @@ abline(regressionLine, col = "firebrick")
 
 # Biological complexes often appear as cliques in interaction graphs.
 
 igraph::clique_num(gSTR)
-# The largest clique has 63 members.
+# The largest clique has 81 members.
 
 (C <- igraph::largest_cliques(gSTR)[[1]])
 
-# Pick one of the proteins and find out what this fully connected cluster of 63
+# Pick one of the proteins and find out what this fully connected cluster of 81
 # proteins is (you can simply Google for any of the IDs). Is this expected?
 
 # Plot this ...
@@ -166,7 +162,7 @@ plot(R,
 par(oPar)
 
 # ... well: remember: a clique means every node is connected to every other
-# node. We have 63 * 63 = 3,969 edges. This is what a matrix model of PPI
+# node. We have 81 * 81 = 6,561 matrix cells. This is what a matrix model of PPI
 # networks looks like for large complexes.
@@ -179,9 +175,9 @@ set.seed(NULL)                          # reset the RNG
 igraph::modularity(gSTRclusters) # ... measures how separated the different
                                  # membership types are from each other
 tMem <- table(igraph::membership(gSTRclusters))
-length(tMem)  # More than 2000 communities identified
+length(tMem)  # About 700 communities identified
 hist(tMem, breaks = 50, col = "skyblue")  # most clusters are small ...
-range(tMem) # ... but one has > 100 members
+range(tMem) # ... but one has > 200 members
 
 
 # ==   2.3  Betweenness Centrality  ============================================
@@ -214,13 +210,14 @@ head(sBC)
 # IDs...
 (ENSPsel <- names(igraph::V(gSTR)[BCsel]))
 
+# Task:
+# =====
+# IMPORTANT, IF YOU INTEND TO SUBMIT YOUR ANALYSIS FOR CREDIT
 # We are going to use these IDs to produce some output for a submitted task:
-# so I need you to personalize ENSPsel with the following
-# three lines of code:
+# therefore I need you to execute the following line, note the "seal" that this
+# returns, and do not change ENSPsel later:
 
-set.seed()                       # enter your student number here
-(ENSPsel <- sample(ENSPsel))
-set.seed(NULL)                   # reset the random number generator
+myENSPsel <- seal(ENSPsel)
 
 
 # Next, to find what these proteins are...
@@ -251,15 +248,15 @@ if (! requireNamespace("biomaRt", quietly = TRUE)) {
 # browseVignettes("biomaRt")   # available vignettes
 # data(package = "biomaRt")    # available datasets
 
-# define which dataset to use ...
+# define which dataset to use ... this takes a while to download
 myMart <- biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl")
 
 # what filters are defined?
-(filters <- biomaRt::listFilters(myMart))
+( filters <- biomaRt::listFilters(myMart) )
 
 # and what attributes can we filter for?
-(attributes <- biomaRt::listAttributes(myMart))
+( attributes <- biomaRt::listAttributes(myMart) )
 
 
 # Soooo many options - let's look for the correct name of filters that are
@@ -296,13 +293,13 @@ for (ID in ENSPsel) {
                                  mart = myMart)
 }
 
+
 # So what are the proteins with the ten highest betweenness centralities?
 #  ... are you surprised? (I am! Really.)
 
 
 # =    4  Task for submission  =================================================
 
-
 # Write a loop that will go through your personalized list of Ensemble IDs and
 # for each ID:
 #    -- print the ID,
@@ -310,11 +307,14 @@ for (ID in ENSPsel) {
 #    -- print the first row's wikigene description.
 #    -- print the first row's phenotype.
 #
+# Write your thoughts about this group of genes.
+#
 # (Hint, you can structure your loop in the same way as the loop that
 #  created CPdefs. )
 
-# Place the R code for this loop and its output into your report if you are
-# submitting a report for this unit. Please read the requirements carefully.
+# Include the "seal" for your ENSP vector, the ENSP vector itself, and the R
+# code for this loop and its output in your report if you are submitting
+# anything for credit for this unit. Please read the requirements carefully.
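+
+# Note: seal() is defined in the course's .utilities.R and is not part of
+# this diff. Purely for illustration - a checksum of this kind might be
+# built on the digest package ("mySeal" is a hypothetical stand-in, NOT
+# the actual implementation):
+if (! requireNamespace("digest", quietly = TRUE)) {
+  install.packages("digest")
+}
+mySeal <- function(x) {
+  digest::digest(paste0(x, collapse = ""), algo = "md5")
+}
+mySeal(c("ENSP0000000AAA", "ENSP0000000BBB"))  # a short, reproducible tag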
But
# there is an issue when we convert sampled counts to frequencies, and estimate
diff --git a/FND-STA-Probability_distribution.R b/FND-STA-Probability_distribution.R
index 719168b..35711f3 100644
--- a/FND-STA-Probability_distribution.R
+++ b/FND-STA-Probability_distribution.R
@@ -30,7 +30,7 @@
 
 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                                                  Line
 #TOC> -----------------------------------------------------------------------------
 #TOC>   1        Introduction                                             54
@@ -47,7 +47,7 @@
 #TOC>   4.2.1          An example from tossing dice                      465
 #TOC>   4.2.2          An example from lognormal distributions           588
 #TOC>   4.3      Kolmogorov-Smirnov test for continuous distributions    631
-#TOC> 
+#TOC>
 #TOC> ==========================================================================
@@ -462,7 +462,7 @@ chisq.test(countsL1, countsG1.9, simulate.p.value = TRUE, B = 10000)
 # be applied to discrete distributions. But we need to talk a bit about
 # converting counts to p.m.f.'s.
 
-# ===  4.2.1  An example from tossing dice 
+# ===  4.2.1  An example from tossing dice
 
 # The p.m.f of an honest die is (1:1/6, 2:1/6, 3:1/6, 4:1/6, 5:1/6, 6:1/6). But
 # there is an issue when we convert sampled counts to frequencies, and estimate
@@ -585,7 +585,7 @@ abline(v = KLdiv(rep(1/6, 6), pmfPC(counts, 1:6)), col="firebrick")
 # somewhat but not drastically atypical.
 
-# ===  4.2.2  An example from lognormal distributions 
+# ===  4.2.2  An example from lognormal distributions
 
 # We had compared a set of lognormal and gamma distributions above, now we
 # can use KL-divergence to quantify their similarity:
@@ -628,7 +628,7 @@ sum(divs < KLdiv(pmfL1, pmfL2)) # 933
 #   we used above.
 
-# ==   4.3  Kolmogorov-Smirnov test for continuous distributions  ==============
+# ==   4.3  Continuous distributions: Kolmogorov-Smirnov test  =================
 
 # The Kolmogorov-Smirnov (KS) test is meant for continuous distributions, i.e.
 # the probability it calculates assumes that the function values are all
diff --git a/data/STRINGedges.rds b/data/STRINGedges.rds
index a546b87..33b3ebc 100644
Binary files a/data/STRINGedges.rds and b/data/STRINGedges.rds differ
diff --git a/scripts/ABC-makeSTRINGedges.R b/scripts/ABC-makeSTRINGedges.R
new file mode 100644
index 0000000..0da774a
--- /dev/null
+++ b/scripts/ABC-makeSTRINGedges.R
@@ -0,0 +1,168 @@
+# tocID <- "scripts/ABC-makeSTRINGedges.R"
+#
+# Create a subnetwork of high-confidence human STRING edges.
+#
+# Notes:
+#
+# The large source datafile is NOT posted to GitHub. If you want to
+# experiment with the original data, download it and place it into your
+# local ./data directory.
+#
+# STRING data source:
+#   Download page:
+#     https://string-db.org/cgi/download.pl?species_text=Homo+sapiens
+#   Data: (127.6 MB)
+#     https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz
+#
+# Version:  1.0
+#
+# Date:     2020-09
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#   1.0    Rewrite
+#
+# TODO:
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC>
+#TOC>   Section  Title                       Line
+#TOC> -------------------------------------------
+#TOC>   1        Initialize                    44
+#TOC>   2        Read STRING Data              51
+#TOC>   3        Define cutoff and subset      63
+#TOC>   4        Drop duplicates              103
+#TOC>   5        Simple statistics            127
+#TOC>   6        Write to file                160
+#TOC>
+#TOC> ==========================================================================
+
+
+# =    1  Initialize  ==========================================================
+
+if (! requireNamespace("readr", quietly = TRUE)) {
+  install.packages("readr")
+}
+
+
+# =    2  Read STRING Data  ====================================================
+
+# Read STRING Data (needs to be downloaded from database, see URL in Notes)
+# The .gz compressed version is 127.6 MB, the uncompressed version is probably
+# 848 MB. Fortunately readr:: can read from compressed
+# files, and does so automatically, based on the file extension.
+( fn <- file.path("~", "9606.protein.links.full.v11.0.txt.gz") )
+STR <- readr::read_delim(fn, delim = " ")
+nrow(STR)  # 11,759,454 rows
+head(STR)
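+
+# (A quick sanity check one might add here - illustrative only: STRING's
+# combined_score is an integer on a 0 ... 1000 scale, which is why the
+# cutoff of 964 below corresponds to a confidence of 0.964.)
+range(STR$combined_score)   # expect integers between 0 and 1000
+colnames(STR)               # includes protein1, protein2, combined_score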
+
+
+# =    3  Define cutoff and subset  ============================================
+
+# approximate distribution of combined_score
+hist(sample(STR$combined_score, 10000), breaks = 50, col = "#6699FF")
+
+# Let's table the counts >= 850 and plot them for better resolution.
+
+myTb <- table(STR$combined_score[STR$combined_score >= 850])
+is.unsorted(as.integer(names(myTb)))  # Good - they are all in order
+
+plot(myTb, type = "b", cex = 0.5, col = "#BB0000")
+myTb[myTb == max(myTb)]  # Apparently there is an algorithmic effect that
+                         # frequently assigns a combined score of 900 (0.900)
+
+# Let's plot these counts as cumulative sums, in reverse order, scaled
+# as combined scores.
+myX <- 1 - (1:length(myTb)) / 1000   # x-values, decreasing
+plot(myX,
+     cumsum(myTb[length(myTb):1]),   # cumulative sum, decreasing
+     xlim = c(1.0, 0.85),            # reverse x-axis
+     type = "l",
+     main = "STRING interactions for 9606 (top 600,000)",
+     xlab = "combined_score",
+     ylab = "cumulative counts",
+     col = "#CC0000")
+abline(h = seq(50000, sum(myTb), by = 50000), lwd = 0.5, col = "#DDDDFF")
+
+# What's the cutoff for 100,000 edges?
+which(cumsum(myTb[length(myTb):1]) >= 100000)[1]  # p = 0.964
+
+# confirm
+sum(STR$combined_score >= 964)  # 101,348
+abline(v = 0.964, lwd = 0.5, col = "#DDDDFF")
+
+# subset the table, and use only the protein IDs and the combined_score
+STR <- STR[STR$combined_score >= 964,
+           c("protein1", "protein2", "combined_score")]
+colnames(STR) <- c("a", "b", "score")
+
+
+# =    4  Drop duplicates  =====================================================
+
+# identify duplicate interactions by creating keys in a defined alphabetical
+# sort order, then checking for duplicated().
+# e.g. if we have (X:U, U:X), we change U:X to X:U and now find that
+# (X:U, X:U) has a duplicate.
+
+AB <- STR$a < STR$b        # logical vector: rows we need to swap
+tmp <- STR$b               # copy column b
+STR$b[AB] <- STR$a[AB]     # copy a's into b
+STR$a[AB] <- tmp[AB]       # copy tmp's into a
+all(STR$a >= STR$b)        # confirm: TRUE
+
+# now, make combined keys, like this:
+paste0(STR$a[1:10], ":", STR$b[1:10])
+
+tmp <- paste0(STR$a, ":", STR$b)
+sum(duplicated(tmp))  # That's half of them ... i.e. STRING reports
+                      # both a:b and b:a !
+
+# drop all duplicated interactions from tmp
+STR <- STR[ ! duplicated(tmp), ]   # 50,674 interactions remain
+
+
+# =    5  Simple statistics  ===================================================
+
+# how many unique proteins?
+length(unique(c(STR$a, STR$b)))   # 8,445
+
+# how many self-edges?
+sum(STR$a == STR$b)   # none
+
+# degree distribution
+myTbl <- table(c(STR$a, STR$b))
+myTbl <- myTbl[order(myTbl, decreasing = TRUE)]
+
+hist(myTbl, breaks = 40, col = "#FFEEBB")
+
+# number of singletons
+sum(myTbl == 1)   # almost a quarter
+
+# maximum?
+myTbl[which(myTbl == max(myTbl))]   # 9606.ENSP00000360532: 465
+                                    # Google: CDC5L
+
+# Zipf-plot: log(rank) / log(frequency)
+plot(log(1:length(myTbl)), log(as.numeric(myTbl)),
+     type = "b", cex = 0.7,
+     main = "STRINGedges - degrees",
+     xlab = "log(rank)",
+     ylab = "log(frequency)",
+     col = "#FFBB88")
+
+sprintf("Average number of interactions: %5.2f",
+        nrow(STR) / length(unique(c(STR$a, STR$b))))
+
+
+# =    6  Write to file  =======================================================
+
+saveRDS(STR, file = "./data/STRINGedges.rds")
+
+# STRINGedges <- readRDS("./data/STRINGedges.rds")  # use this to restore the
+                                                    # object when needed
+
+
+# [END]
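
# Usage note (not part of the diff): once this script has been run, the saved
# subnetwork can be restored and checked against the figures quoted in
# BIN-PPI-Analysis.R above - a minimal sketch, assuming the script completed
# successfully:
#
#   STRINGedges <- readRDS("./data/STRINGedges.rds")
#   nrow(STRINGedges)                                # ~ 50,674 edges
#   length(unique(c(STRINGedges$a, STRINGedges$b)))  # ~  8,445 proteins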