New unit and data

2017-11-09 23:44:25 -05:00
parent 943dbeba8f
commit 16d96d79f4
2 changed files with 131 additions and 56 deletions
--- a/BIN-PPI-Analysis.R
+++ b/BIN-PPI-Analysis.R
@@ -3,54 +3,94 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-PPI-Analysis unit.
 #
-# Version:  0.1
+# Version:   1.0
 #
-# Date:     2017  08  28
+# Date:     2017  08  - 2017 11
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
+#           1.0    First live version
 #           0.1    First code copied from 2016 material.
-
+#
 #
 # TODO:
 #
 #
 # == DO NOT SIMPLY  source()  THIS FILE! =======================================
-
+#
 # If there are portions you don't understand, use R's help system, Google for an
 # answer, or ask your instructor. Don't continue if you don't understand what's
 # going on. That's not how it works ...
-
+#
 # ==============================================================================

-# = 1 ___Section___
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                                     Line
+#TOC> ---------------------------------------------------------
+#TOC>   1        Setup and data                              43
+#TOC>   2        Functional Edges in the Human Proteome      80
+#TOC>   2.1      Cliques                                    123
+#TOC>   2.2      Communities                                164
+#TOC>   2.3      Betweenness Centrality                     176
+#TOC>   3        biomaRt                                    220
+#TOC>   4        Task for submission                        291
+#TOC> 
+#TOC> ==========================================================================


+# =    1  Setup and data  ======================================================


-# ==============================================================================
-#        PART FOUR: EXPLORE FUNCTIONAL EDGES IN THE HUMAN PROTEOME
-# ==============================================================================
+# Not surprisingly, the analysis of PPI networks needs iGraph:
+
+if (!require(igraph, quietly=TRUE)) {
+  install.packages("igraph")
+  library(igraph)
+}
+# Package information:
+#  library(help = igraph)       # basic information
+#  browseVignettes("igraph")    # available vignettes
+#  data(package = "igraph")     # available datasets

 # In order for you to explore some real, biological networks, I give you a
-# dataframe of functional relationships of human proteins that I have downloaded from the
-# STRING database. The full table has 8.5 million records, here is a subset of
-# records with combined confidence scores > 980
+# dataframe of functional relationships of human proteins that I have downloaded
+# from the STRING database. The full table has 8.5 million records, here is a
+# subset of records with combined confidence scores > 980

 # The selected set of edges with a confidence of > 980 is a dataframe with about
-# 50,000 edges and 6,500 unique proteins. You can load the saved dataframe here
-# (and also read more about what the numbers mean at
-# http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).
+# 50,000 edges and 6,500 unique proteins. Incidentaly, that's about the size of
+# a fungal proteome. You can load the saved dataframe here (To read more about
+# what the numbers mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).

-load("STRINGedges.RData")
+load("./data/STRINGedges.RData")
+
+head(STRINGedges)
+
+# Note that STRING has appended the tax-ID for Homo sapiens - 9606 - to the
+# Ensemble transcript identifiers that start with ENSP. We'll remove them:
+
+STRINGedges$protein1 <- gsub("^9606\\.", "", STRINGedges$protein1)
+STRINGedges$protein2 <- gsub("^9606\\.", "", STRINGedges$protein2)

 head(STRINGedges)


-# make a graph from this dataframe
+# =    2  Functional Edges in the Human Proteome  ==============================
+
+
+# There are many possibilities to explore interesting aspects of biological
+# networks, we will keep with some very simple procedures here but you have
+# to be aware that this is barely scratching the surface of possibilites.
+# However, once the network exists in your computer, it is comparatively
+# easy to find information nline about the many, many options to analyze.
+
+
+# Make a graph from this dataframe
 ?graph_from_data_frame

-gSTR <- graph_from_data_frame(STRINGedges)
+gSTR <- graph_from_data_frame(STRINGedges, directed = FALSE)

 # CAUTION you DON'T want to plot a graph with 6,500 nodes and 50,000 edges -
 # layout of such large graphs is possible, but requires specialized code. Google
@@ -62,11 +102,10 @@ gSTR <- graph_from_data_frame(STRINGedges)
 compSTR <- components(gSTR)
 summary(compSTR) # our graph is fully connected!

-dg <- degree(gSTR)
-hist(log(dg), col="#FEE0AF")
+hist(log(degree(gSTR)), col="#FEE0AF")
 # this actually does look rather scale-free

-(freqRank <- table(dg))
+(freqRank <- table(degree(gSTR)))
 plot(log10(as.numeric(names(freqRank)) + 1),
     log10(as.numeric(freqRank)), type = "b",
     pch = 21, bg = "#FEE0AF",
@@ -74,10 +113,15 @@ plot(log10(as.numeric(names(freqRank)) + 1),
     main = "6,500 nodes from the human functional interaction network")

 # This looks very scale-free indeed.
-#
+
+(regressionLine <- lm(log10(as.numeric(freqRank)) ~
+                      log10(as.numeric(names(freqRank)) + 1)))
+abline(regressionLine, col = "firebrick")
+
 # Now explore some more:

-# === CLIQUES   ========
+# ==   2.1  Cliques  ===========================================================
+
 # Let's find the largest cliques. Remember: a clique is a fully connected
 # subgraph, i.e. a subgraph in which every node is connected to every other.
 # Biological complexes often appear as cliques in interaction graphs.
@@ -85,14 +129,51 @@ plot(log10(as.numeric(names(freqRank)) + 1),
 clique_num(gSTR)
 # The largest clique has 63 members.

-largest_cliques(gSTR)[[1]]
+(C <- largest_cliques(gSTR)[[1]])

 # Pick one of the proteins and find out what this fully connected cluster of 63
-# proteins is (you can simply Google for the ID). Is this expected?
+# proteins is (you can simply Google for any of the IDs). Is this expected?
+
+# Plot this ...
+R <- induced_subgraph(gSTR, C) # makes a graph from a selected set of vertices
+
+# color the vertices along a color spectrum
+vCol <- rainbow(gorder(R)) # gorder(): order of a graph = number of nodes
+
+# color the edges to have the same color as the originating node
+eCol <- character()
+for (i in seq_along(vCol)) {
+  eCol <- c(eCol, rep(vCol[i], gorder(R)))
+}
+
+oPar <- par(mar= rep(0,4)) # Turn margins off
+plot(R,
+     layout = layout_in_circle(R),
+     vertex.size = 3,
+     vertex.color = vCol,
+     edge.color = eCol,
+     edge.width = 0.1,
+     vertex.label = NA)
+par(oPar)
+
+# ... well: remember: a clique means every node is connected to every other
+# node. We have 63 * 63 = 3,969 edges. This is what a matrix model of PPI
+# networks looks like for large complexes.


+# ==   2.2  Communities  =======================================================

-# === BETWEENNESS CENTRALITY   =======================================
+set.seed(112358)
+gSTRclusters <- cluster_infomap(gSTR)
+modularity(gSTRclusters) # ... measures how separated the different membership
+                         # types are from each other
+tMem <- table(membership(gSTRclusters))
+length(tMem)  # More than 2000 communities identified
+hist(tMem, breaks = 50)  # most clusters are small ...
+range(tMem) # ... but one has > 100 members
+
+
+# ==   2.3  Betweenness Centrality  ============================================

 # Let's find the nodes with the 10 - highest betweenness centralities.
 #
@@ -105,7 +186,7 @@ BC$res[1]   # betweeness centrality of node 1 in the graph ...
 # ... which one is node 1?
 V(gSTR)[1]

-# to get the ten-highest nodes, we simply label the elements BC with their
+# to get the ten-highest nodes, we simply label the elements of BC with their
 # index ...
 names(BC$res) <- as.character(1:length(BC$res))

@@ -116,27 +197,17 @@ head(sBC)
 # This ordered vector means: node 3,862 has the highest betweeness centrality,
 # node 1,720 has the second highest.. etc.

-BCsel <- as.numeric(names(sBC)[1:10])
-BCsel
+(BCsel <- as.numeric(names(sBC)[1:10]))
+
 # We can use the first ten labels to subset the nodes in gSTR and fetch the
 # IDs...
-ENSPsel <- names(V(gSTR)[BCsel])
+(ENSPsel <- names(V(gSTR)[BCsel]))

-# We are going to use these IDs to produce some output for you to print out and
-# bring to class, so I need you to personalize ENSPsel with the following
+# We are going to use these IDs to produce some output for a submitted task:
+# so I need you to personalize ENSPsel with the following
 # two lines of code:
-set.seed(myStudentNumber)
-ENSPsel <- sample(ENSPsel)
-
-# We also need to remove the string "9606." from the ID:
-ENSPsel <- gsub("9606\\.", "", ENSPsel)
-
-# This is the final vector of IDs.:
-ENSPsel
-
-# Could you define in a short answer quiz what these IDs are? And what their
-# biological significance is? I'm probably not going to ask you to that on the
-# quiz, but I expect you to be able to.
+set.seed(<myStudentNumber>) # enter your student number here
+(ENSPsel <- sample(ENSPsel))

 #  Next, to find what these proteins are...

@@ -145,7 +216,9 @@ ENSPsel
 # the very, very useful biomaRt package to translate these Ensemble IDs into
 # gene symbols.

-# == biomaRt =========================================================
+
+# =    3  biomaRt  =============================================================
+

 # IDs are just labels, but for _bio_informatics we need to learn more about the
 # biological function of the genes or proteins that we retrieve via graph data
@@ -170,12 +243,12 @@ if (!require(biomaRt, quietly=TRUE)) {
 myMart <- useMart("ensembl", dataset="hsapiens_gene_ensembl")

 # what filters are defined?
-filters <- listFilters(myMart)
-filters
+(filters <- listFilters(myMart))
+

 # and what attributes can we filter for?
-attributes <- listAttributes(myMart)
-attributes
+(attributes <- listAttributes(myMart))
+

 # Soooo many options - let's look for the correct name of filters that are
 # useful for ENSP IDs ...
@@ -214,20 +287,22 @@ for (ID in ENSPsel) {
 # So what are the proteins with the ten highest betweenness centralities?
 #  ... are you surprised? (I am! Really.)

-# Final task: Write a loop that will go through your list and
+
+# =    4  Task for submission  =================================================
+
+
+# Write a loop that will go through your personalized list of Ensemble IDs and
 #    for each ID:
 #    --  print the ID,
-#    --  print the first row's symbol, and
+#    --  print the first row's HGNC symbol,
 #    --  print the first row's wikigene description.
+#    --  print the first row's phenotype.
 #
 # (Hint, you can structure your loop in the same way as the loop that
 # created CPdefs. )

-# Print the R code for your loop and its output for the ten genes onto a sheet
-# of paper, write your student number and name on it, and bring this to class.
-
-
-# = 1 Tasks
+# Place the R code for this loop and its output into your report if you are
+# submitting a report for this unit. Please read the requirements carefully.



--- a/data/STRINGedges.RData
+++ b/data/STRINGedges.RData