New unit and data
This commit is contained in:
parent
943dbeba8f
commit
16d96d79f4
@ -3,54 +3,94 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-PPI-Analysis unit.
|
# R code accompanying the BIN-PPI-Analysis unit.
|
||||||
#
|
#
|
||||||
# Version: 0.1
|
# Version: 1.0
|
||||||
#
|
#
|
||||||
# Date: 2017 08 28
|
# Date: 2017 08 - 2017 11
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.0 First live version
|
||||||
# 0.1 First code copied from 2016 material.
|
# 0.1 First code copied from 2016 material.
|
||||||
|
#
|
||||||
#
|
#
|
||||||
# TODO:
|
# TODO:
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||||
|
#
|
||||||
# If there are portions you don't understand, use R's help system, Google for an
|
# If there are portions you don't understand, use R's help system, Google for an
|
||||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||||
# going on. That's not how it works ...
|
# going on. That's not how it works ...
|
||||||
|
#
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
|
|
||||||
# = 1 ___Section___
|
|
||||||
|
#TOC> ==========================================================================
|
||||||
|
#TOC>
|
||||||
|
#TOC> Section Title Line
|
||||||
|
#TOC> ---------------------------------------------------------
|
||||||
|
#TOC> 1 Setup and data 43
|
||||||
|
#TOC> 2 Functional Edges in the Human Proteome 80
|
||||||
|
#TOC> 2.1 Cliques 123
|
||||||
|
#TOC> 2.2 Communities 164
|
||||||
|
#TOC> 2.3 Betweenness Centrality 176
|
||||||
|
#TOC> 3 biomaRt 220
|
||||||
|
#TOC> 4 Task for submission 291
|
||||||
|
#TOC>
|
||||||
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
|
# = 1 Setup and data ======================================================
|
||||||
|
|
||||||
|
|
||||||
# ==============================================================================
|
# Not surprisingly, the analysis of PPI networks needs iGraph:
|
||||||
# PART FOUR: EXPLORE FUNCTIONAL EDGES IN THE HUMAN PROTEOME
|
|
||||||
# ==============================================================================
|
if (!require(igraph, quietly=TRUE)) {
|
||||||
|
install.packages("igraph")
|
||||||
|
library(igraph)
|
||||||
|
}
|
||||||
|
# Package information:
|
||||||
|
# library(help = igraph) # basic information
|
||||||
|
# browseVignettes("igraph") # available vignettes
|
||||||
|
# data(package = "igraph") # available datasets
|
||||||
|
|
||||||
# In order for you to explore some real, biological networks, I give you a
|
# In order for you to explore some real, biological networks, I give you a
|
||||||
# dataframe of functional relationships of human proteins that I have downloaded from the
|
# dataframe of functional relationships of human proteins that I have downloaded
|
||||||
# STRING database. The full table has 8.5 million records, here is a subset of
|
# from the STRING database. The full table has 8.5 million records, here is a
|
||||||
# records with combined confidence scores > 980
|
# subset of records with combined confidence scores > 980
|
||||||
|
|
||||||
# The selected set of edges with a confidence of > 980 is a dataframe with about
|
# The selected set of edges with a confidence of > 980 is a dataframe with about
|
||||||
# 50,000 edges and 6,500 unique proteins. You can load the saved dataframe here
|
# 50,000 edges and 6,500 unique proteins. Incidentaly, that's about the size of
|
||||||
# (and also read more about what the numbers mean at
|
# a fungal proteome. You can load the saved dataframe here (To read more about
|
||||||
# http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).
|
# what the numbers mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).
|
||||||
|
|
||||||
load("STRINGedges.RData")
|
load("./data/STRINGedges.RData")
|
||||||
|
|
||||||
|
head(STRINGedges)
|
||||||
|
|
||||||
|
# Note that STRING has appended the tax-ID for Homo sapiens - 9606 - to the
|
||||||
|
# Ensemble transcript identifiers that start with ENSP. We'll remove them:
|
||||||
|
|
||||||
|
STRINGedges$protein1 <- gsub("^9606\\.", "", STRINGedges$protein1)
|
||||||
|
STRINGedges$protein2 <- gsub("^9606\\.", "", STRINGedges$protein2)
|
||||||
|
|
||||||
head(STRINGedges)
|
head(STRINGedges)
|
||||||
|
|
||||||
|
|
||||||
# make a graph from this dataframe
|
# = 2 Functional Edges in the Human Proteome ==============================
|
||||||
|
|
||||||
|
|
||||||
|
# There are many possibilities to explore interesting aspects of biological
|
||||||
|
# networks, we will keep with some very simple procedures here but you have
|
||||||
|
# to be aware that this is barely scratching the surface of possibilites.
|
||||||
|
# However, once the network exists in your computer, it is comparatively
|
||||||
|
# easy to find information nline about the many, many options to analyze.
|
||||||
|
|
||||||
|
|
||||||
|
# Make a graph from this dataframe
|
||||||
?graph_from_data_frame
|
?graph_from_data_frame
|
||||||
|
|
||||||
gSTR <- graph_from_data_frame(STRINGedges)
|
gSTR <- graph_from_data_frame(STRINGedges, directed = FALSE)
|
||||||
|
|
||||||
# CAUTION you DON'T want to plot a graph with 6,500 nodes and 50,000 edges -
|
# CAUTION you DON'T want to plot a graph with 6,500 nodes and 50,000 edges -
|
||||||
# layout of such large graphs is possible, but requires specialized code. Google
|
# layout of such large graphs is possible, but requires specialized code. Google
|
||||||
@ -62,11 +102,10 @@ gSTR <- graph_from_data_frame(STRINGedges)
|
|||||||
compSTR <- components(gSTR)
|
compSTR <- components(gSTR)
|
||||||
summary(compSTR) # our graph is fully connected!
|
summary(compSTR) # our graph is fully connected!
|
||||||
|
|
||||||
dg <- degree(gSTR)
|
hist(log(degree(gSTR)), col="#FEE0AF")
|
||||||
hist(log(dg), col="#FEE0AF")
|
|
||||||
# this actually does look rather scale-free
|
# this actually does look rather scale-free
|
||||||
|
|
||||||
(freqRank <- table(dg))
|
(freqRank <- table(degree(gSTR)))
|
||||||
plot(log10(as.numeric(names(freqRank)) + 1),
|
plot(log10(as.numeric(names(freqRank)) + 1),
|
||||||
log10(as.numeric(freqRank)), type = "b",
|
log10(as.numeric(freqRank)), type = "b",
|
||||||
pch = 21, bg = "#FEE0AF",
|
pch = 21, bg = "#FEE0AF",
|
||||||
@ -74,10 +113,15 @@ plot(log10(as.numeric(names(freqRank)) + 1),
|
|||||||
main = "6,500 nodes from the human functional interaction network")
|
main = "6,500 nodes from the human functional interaction network")
|
||||||
|
|
||||||
# This looks very scale-free indeed.
|
# This looks very scale-free indeed.
|
||||||
#
|
|
||||||
|
(regressionLine <- lm(log10(as.numeric(freqRank)) ~
|
||||||
|
log10(as.numeric(names(freqRank)) + 1)))
|
||||||
|
abline(regressionLine, col = "firebrick")
|
||||||
|
|
||||||
# Now explore some more:
|
# Now explore some more:
|
||||||
|
|
||||||
# === CLIQUES ========
|
# == 2.1 Cliques ===========================================================
|
||||||
|
|
||||||
# Let's find the largest cliques. Remember: a clique is a fully connected
|
# Let's find the largest cliques. Remember: a clique is a fully connected
|
||||||
# subgraph, i.e. a subgraph in which every node is connected to every other.
|
# subgraph, i.e. a subgraph in which every node is connected to every other.
|
||||||
# Biological complexes often appear as cliques in interaction graphs.
|
# Biological complexes often appear as cliques in interaction graphs.
|
||||||
@ -85,14 +129,51 @@ plot(log10(as.numeric(names(freqRank)) + 1),
|
|||||||
clique_num(gSTR)
|
clique_num(gSTR)
|
||||||
# The largest clique has 63 members.
|
# The largest clique has 63 members.
|
||||||
|
|
||||||
largest_cliques(gSTR)[[1]]
|
(C <- largest_cliques(gSTR)[[1]])
|
||||||
|
|
||||||
# Pick one of the proteins and find out what this fully connected cluster of 63
|
# Pick one of the proteins and find out what this fully connected cluster of 63
|
||||||
# proteins is (you can simply Google for the ID). Is this expected?
|
# proteins is (you can simply Google for any of the IDs). Is this expected?
|
||||||
|
|
||||||
|
# Plot this ...
|
||||||
|
R <- induced_subgraph(gSTR, C) # makes a graph from a selected set of vertices
|
||||||
|
|
||||||
|
# color the vertices along a color spectrum
|
||||||
|
vCol <- rainbow(gorder(R)) # gorder(): order of a graph = number of nodes
|
||||||
|
|
||||||
|
# color the edges to have the same color as the originating node
|
||||||
|
eCol <- character()
|
||||||
|
for (i in seq_along(vCol)) {
|
||||||
|
eCol <- c(eCol, rep(vCol[i], gorder(R)))
|
||||||
|
}
|
||||||
|
|
||||||
|
oPar <- par(mar= rep(0,4)) # Turn margins off
|
||||||
|
plot(R,
|
||||||
|
layout = layout_in_circle(R),
|
||||||
|
vertex.size = 3,
|
||||||
|
vertex.color = vCol,
|
||||||
|
edge.color = eCol,
|
||||||
|
edge.width = 0.1,
|
||||||
|
vertex.label = NA)
|
||||||
|
par(oPar)
|
||||||
|
|
||||||
|
# ... well: remember: a clique means every node is connected to every other
|
||||||
|
# node. We have 63 * 63 = 3,969 edges. This is what a matrix model of PPI
|
||||||
|
# networks looks like for large complexes.
|
||||||
|
|
||||||
|
|
||||||
|
# == 2.2 Communities =======================================================
|
||||||
|
|
||||||
# === BETWEENNESS CENTRALITY =======================================
|
set.seed(112358)
|
||||||
|
gSTRclusters <- cluster_infomap(gSTR)
|
||||||
|
modularity(gSTRclusters) # ... measures how separated the different membership
|
||||||
|
# types are from each other
|
||||||
|
tMem <- table(membership(gSTRclusters))
|
||||||
|
length(tMem) # More than 2000 communities identified
|
||||||
|
hist(tMem, breaks = 50) # most clusters are small ...
|
||||||
|
range(tMem) # ... but one has > 100 members
|
||||||
|
|
||||||
|
|
||||||
|
# == 2.3 Betweenness Centrality ============================================
|
||||||
|
|
||||||
# Let's find the nodes with the 10 - highest betweenness centralities.
|
# Let's find the nodes with the 10 - highest betweenness centralities.
|
||||||
#
|
#
|
||||||
@ -105,7 +186,7 @@ BC$res[1] # betweeness centrality of node 1 in the graph ...
|
|||||||
# ... which one is node 1?
|
# ... which one is node 1?
|
||||||
V(gSTR)[1]
|
V(gSTR)[1]
|
||||||
|
|
||||||
# to get the ten-highest nodes, we simply label the elements BC with their
|
# to get the ten-highest nodes, we simply label the elements of BC with their
|
||||||
# index ...
|
# index ...
|
||||||
names(BC$res) <- as.character(1:length(BC$res))
|
names(BC$res) <- as.character(1:length(BC$res))
|
||||||
|
|
||||||
@ -116,27 +197,17 @@ head(sBC)
|
|||||||
# This ordered vector means: node 3,862 has the highest betweeness centrality,
|
# This ordered vector means: node 3,862 has the highest betweeness centrality,
|
||||||
# node 1,720 has the second highest.. etc.
|
# node 1,720 has the second highest.. etc.
|
||||||
|
|
||||||
BCsel <- as.numeric(names(sBC)[1:10])
|
(BCsel <- as.numeric(names(sBC)[1:10]))
|
||||||
BCsel
|
|
||||||
# We can use the first ten labels to subset the nodes in gSTR and fetch the
|
# We can use the first ten labels to subset the nodes in gSTR and fetch the
|
||||||
# IDs...
|
# IDs...
|
||||||
ENSPsel <- names(V(gSTR)[BCsel])
|
(ENSPsel <- names(V(gSTR)[BCsel]))
|
||||||
|
|
||||||
# We are going to use these IDs to produce some output for you to print out and
|
# We are going to use these IDs to produce some output for a submitted task:
|
||||||
# bring to class, so I need you to personalize ENSPsel with the following
|
# so I need you to personalize ENSPsel with the following
|
||||||
# two lines of code:
|
# two lines of code:
|
||||||
set.seed(myStudentNumber)
|
set.seed(<myStudentNumber>) # enter your student number here
|
||||||
ENSPsel <- sample(ENSPsel)
|
(ENSPsel <- sample(ENSPsel))
|
||||||
|
|
||||||
# We also need to remove the string "9606." from the ID:
|
|
||||||
ENSPsel <- gsub("9606\\.", "", ENSPsel)
|
|
||||||
|
|
||||||
# This is the final vector of IDs.:
|
|
||||||
ENSPsel
|
|
||||||
|
|
||||||
# Could you define in a short answer quiz what these IDs are? And what their
|
|
||||||
# biological significance is? I'm probably not going to ask you to that on the
|
|
||||||
# quiz, but I expect you to be able to.
|
|
||||||
|
|
||||||
# Next, to find what these proteins are...
|
# Next, to find what these proteins are...
|
||||||
|
|
||||||
@ -145,7 +216,9 @@ ENSPsel
|
|||||||
# the very, very useful biomaRt package to translate these Ensemble IDs into
|
# the very, very useful biomaRt package to translate these Ensemble IDs into
|
||||||
# gene symbols.
|
# gene symbols.
|
||||||
|
|
||||||
# == biomaRt =========================================================
|
|
||||||
|
# = 3 biomaRt =============================================================
|
||||||
|
|
||||||
|
|
||||||
# IDs are just labels, but for _bio_informatics we need to learn more about the
|
# IDs are just labels, but for _bio_informatics we need to learn more about the
|
||||||
# biological function of the genes or proteins that we retrieve via graph data
|
# biological function of the genes or proteins that we retrieve via graph data
|
||||||
@ -170,12 +243,12 @@ if (!require(biomaRt, quietly=TRUE)) {
|
|||||||
myMart <- useMart("ensembl", dataset="hsapiens_gene_ensembl")
|
myMart <- useMart("ensembl", dataset="hsapiens_gene_ensembl")
|
||||||
|
|
||||||
# what filters are defined?
|
# what filters are defined?
|
||||||
filters <- listFilters(myMart)
|
(filters <- listFilters(myMart))
|
||||||
filters
|
|
||||||
|
|
||||||
# and what attributes can we filter for?
|
# and what attributes can we filter for?
|
||||||
attributes <- listAttributes(myMart)
|
(attributes <- listAttributes(myMart))
|
||||||
attributes
|
|
||||||
|
|
||||||
# Soooo many options - let's look for the correct name of filters that are
|
# Soooo many options - let's look for the correct name of filters that are
|
||||||
# useful for ENSP IDs ...
|
# useful for ENSP IDs ...
|
||||||
@ -214,20 +287,22 @@ for (ID in ENSPsel) {
|
|||||||
# So what are the proteins with the ten highest betweenness centralities?
|
# So what are the proteins with the ten highest betweenness centralities?
|
||||||
# ... are you surprised? (I am! Really.)
|
# ... are you surprised? (I am! Really.)
|
||||||
|
|
||||||
# Final task: Write a loop that will go through your list and
|
|
||||||
|
# = 4 Task for submission =================================================
|
||||||
|
|
||||||
|
|
||||||
|
# Write a loop that will go through your personalized list of Ensemble IDs and
|
||||||
# for each ID:
|
# for each ID:
|
||||||
# -- print the ID,
|
# -- print the ID,
|
||||||
# -- print the first row's symbol, and
|
# -- print the first row's HGNC symbol,
|
||||||
# -- print the first row's wikigene description.
|
# -- print the first row's wikigene description.
|
||||||
|
# -- print the first row's phenotype.
|
||||||
#
|
#
|
||||||
# (Hint, you can structure your loop in the same way as the loop that
|
# (Hint, you can structure your loop in the same way as the loop that
|
||||||
# created CPdefs. )
|
# created CPdefs. )
|
||||||
|
|
||||||
# Print the R code for your loop and its output for the ten genes onto a sheet
|
# Place the R code for this loop and its output into your report if you are
|
||||||
# of paper, write your student number and name on it, and bring this to class.
|
# submitting a report for this unit. Please read the requirements carefully.
|
||||||
|
|
||||||
|
|
||||||
# = 1 Tasks
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
BIN
data/STRINGedges.RData
Normal file
BIN
data/STRINGedges.RData
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user