Reorganized proportional plots into a "further reading" section; added nested-box, and sankey plot visualization of proportions. Introduced plotly.
This commit is contained in:
parent
8a5c5b3961
commit
8839248362
303
BIN-MYSPE.R
303
BIN-MYSPE.R
@ -3,11 +3,14 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-MYSPE unit
|
# R code accompanying the BIN-MYSPE unit
|
||||||
#
|
#
|
||||||
# Version: 1.1
|
# Version: 1.2
|
||||||
#
|
#
|
||||||
# Date: 2020-09-18
|
# Date: 2020-10-01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
|
# V 1.2 Reorganized proportional plot section into a "further reading"
|
||||||
|
# section, added nested-box, and sankey plot visualization of
|
||||||
|
# proportions. Introduced plotly.
|
||||||
# V 1.1 2020 Workflow changes
|
# V 1.1 2020 Workflow changes
|
||||||
# V 1.0.1 Move ABC-makeMYSPElist.R to ./scripts directory
|
# V 1.0.1 Move ABC-makeMYSPElist.R to ./scripts directory
|
||||||
# V 1.0 Final code, after rewriting BLAST parser and updating MYSPElist
|
# V 1.0 Final code, after rewriting BLAST parser and updating MYSPElist
|
||||||
@ -29,16 +32,21 @@
|
|||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> -----------------------------------------------
|
#TOC> -----------------------------------------------------------------
|
||||||
#TOC> 1 Preparations 47
|
#TOC> 1 PREPARATIONS 49
|
||||||
#TOC> 2 Suitable MYSPE Species 59
|
#TOC> 2 SUITABLE MYSPE SPECIES 61
|
||||||
#TOC> 3 Adopt "MYSPE" 83
|
#TOC> 3 ADOPT "MYSPE" 85
|
||||||
|
#TOC> 4 FURTHER READING: PLOTTING PROPORTIONS 124
|
||||||
|
#TOC> 4.1 Percentages 142
|
||||||
|
#TOC> 4.2 Visualizing proportions: Pie chart 161
|
||||||
|
#TOC> 4.3 Visualizing proportions: Nested squares 238
|
||||||
|
#TOC> 4.4 Visualizing proportions: Sankey diagrams 275
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
# = 1 Preparations ========================================================
|
# = 1 PREPARATIONS ========================================================
|
||||||
#
|
#
|
||||||
|
|
||||||
# Execute the two conditionals below:
|
# Execute the two conditionals below:
|
||||||
@ -50,12 +58,12 @@ if (! exists("myStudentNumber")) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# = 2 Suitable MYSPE Species ==============================================
|
# = 2 SUITABLE MYSPE SPECIES ==============================================
|
||||||
|
|
||||||
|
|
||||||
# In this unit we will select one species from a list of genome sequenced fungi
|
# In this unit we will select one species from a list of genome sequenced fungi
|
||||||
# and write it into your personalized profile file. This species will be called
|
# and write it into your personalized profile file. This species will be called
|
||||||
# "MYSPE" (Your Favourite Organism) for other learning units and exercises.
|
# "MYSPE" (My Species) for other learning units and exercises.
|
||||||
|
|
||||||
# A detailed description of the process of compiling the list of genome
|
# A detailed description of the process of compiling the list of genome
|
||||||
# sequenced fungi with protein annotations and Mbp1 homologues is in the file
|
# sequenced fungi with protein annotations and Mbp1 homologues is in the file
|
||||||
@ -74,69 +82,258 @@ if (! exists("myStudentNumber")) {
|
|||||||
# implemented in practice.
|
# implemented in practice.
|
||||||
|
|
||||||
|
|
||||||
# = 3 Adopt "MYSPE" =======================================================
|
# = 3 ADOPT "MYSPE" =======================================================
|
||||||
|
|
||||||
# Execute:
|
# Execute:
|
||||||
( MYSPE <- getMYSPE(myStudentNumber) )
|
( MYSPE <- getMYSPE(myStudentNumber) )
|
||||||
|
|
||||||
# If this produced an error, this session has not been properly set up. You
|
# If this produced an error, this session has not been properly set up. You
|
||||||
# may not yet have run init() and edited .myProfile.R , or that file is not
|
# may not yet have run init() and edited .myProfile.R , or that file is not
|
||||||
# in your myScripts/ folder. Fix this, and execute source(".Rprofile") .
|
# in your myScripts/ folder. Fix this, and execute:
|
||||||
|
#
|
||||||
|
# source(".Rprofile") .
|
||||||
|
|
||||||
# If this produced NA, your Student Number may not be correct, or you are not
|
# If this produced NA, your Student Number may not be correct, or you are not in
|
||||||
# in my class-list. Contact me.
|
# my class-list. Contact me. Otherwise, this should have printed a species name,
|
||||||
# Otherwise, this should have printed a species name. Your unique species
|
# and the taxonomy ID of its genome-sequenced strain. This is your unique species
|
||||||
# for this course.
|
# for this course. Note it in your journal ...
|
||||||
|
|
||||||
biCode(MYSPE) # and what is it's "BiCode" ... ?
|
biCode(MYSPE) # and also note it's "BiCode" ...
|
||||||
|
( myTaxID <- names(MYSPE) ) # and its taxID
|
||||||
|
|
||||||
# Task: Note down the species name and its five letter BiCode on your Student
|
|
||||||
# Wiki user page. Use this species whenever this or future assignments refer
|
|
||||||
# to MYSPE. Whenever you start a session, it will automatically be loaded
|
|
||||||
# from myScripts/.myProfile.R and is available as MYSPE .
|
|
||||||
|
|
||||||
# Here is some more information:
|
# Task:
|
||||||
|
# =====
|
||||||
|
# Note down the species name and its five letter BiCode on your Student
|
||||||
|
# Wiki user page. Use this species whenever this or future assignments refer
|
||||||
|
# to MYSPE. Whenever you start a session, it will automatically be loaded
|
||||||
|
# from myScripts/.myProfile.R and is available as MYSPE .
|
||||||
|
|
||||||
|
# Here is some more information about MYSPE, taken from the table of genome-
|
||||||
|
# sequenced fungi that is in your ./data folder.
|
||||||
fungiDat <- read.csv("data/Species.csv")
|
fungiDat <- read.csv("data/Species.csv")
|
||||||
|
iMs <- which(fungiDat$Taxon.ID == myTaxID)
|
||||||
|
|
||||||
# number of sequenced fungal genomes:
|
( myOr <- fungiDat$Classification[iMs] ) # Taxonomic order
|
||||||
nrow(fungiDat)
|
( myGn <- gsub("\\s.*", "", MYSPE)) # Taxonomic genus
|
||||||
|
( mySt <- fungiDat$Name[iMs] ) # Taxonomic strain
|
||||||
|
|
||||||
# sequenced genomes of species:
|
# That's all.
|
||||||
sel <- MYSPE == gsub("^(\\S+\\s\\S+).*$", "\\1", fungiDat$Name)
|
|
||||||
( x <- fungiDat[sel, "Name"] )
|
|
||||||
Nspc <- length(x) # save this for later ...
|
|
||||||
|
|
||||||
# sequenced genomes of genus:
|
|
||||||
sel <- gsub("\\s.*", "", MYSPE) == gsub("\\s.*", "", fungiDat$Name)
|
|
||||||
( x <- fungiDat[sel, "Name"] )
|
|
||||||
Ngen <- length(x) - Nspc
|
|
||||||
|
|
||||||
# order:
|
# = 4 FURTHER READING: PLOTTING PROPORTIONS ===============================
|
||||||
( x <- unique(fungiDat[sel, "Classification"]) )
|
|
||||||
Nord <- sum(fungiDat$Classification == x) - Ngen - Nspc
|
|
||||||
Nfng <- nrow(fungiDat) - Nord - Ngen - Nspc
|
|
||||||
|
|
||||||
# proportions
|
# The material below is an exploration of data-preparation and plotting
|
||||||
|
# techniques; you can treat this as additional practice and further reading and
|
||||||
|
# I expect that some of the code and plotting examples may be useful in a
|
||||||
|
# different context.
|
||||||
|
|
||||||
|
# A frequent task is to visualize the proportion of elements with given
|
||||||
|
# categories in a sample. For example, we might ask what the proportion of the
|
||||||
|
# different orders of fungi is the order of MYSPE? Let's first collect the
|
||||||
|
# numbers.
|
||||||
|
|
||||||
|
( nFungi <- nrow(fungiDat) ) # sequenced fungi
|
||||||
|
( nOrder <- sum(grepl(myOr, fungiDat$Classification)) ) # same order as MYSPE
|
||||||
|
( nGenus <- sum(grepl(myGn, fungiDat$Name)) ) # same genus as MYSPE
|
||||||
|
( nSpecies <- sum(grepl(MYSPE, fungiDat$Name)) ) # same species as MYSPE
|
||||||
|
|
||||||
|
|
||||||
|
# == 4.1 Percentages =======================================================
|
||||||
|
|
||||||
|
# The zeroth-order approach to visualization is simply to print percentages:
|
||||||
|
|
||||||
|
cat(sprintf("\n%s comprise %5.2f%% of fungi.",
|
||||||
|
myOr,
|
||||||
|
(nOrder * 100) / nFungi))
|
||||||
|
|
||||||
|
# ... or, adding the actual numbers:
|
||||||
|
|
||||||
|
cat(sprintf("\n%s comprise %5.2f%% of fungi (%d of %d).",
|
||||||
|
myOr,
|
||||||
|
(nOrder * 100) / nFungi,
|
||||||
|
nOrder,
|
||||||
|
nFungi))
|
||||||
|
|
||||||
|
# But that's hard to visualize for most of us, and anyway, we don't know how
|
||||||
|
# that relates to other orders.
|
||||||
|
|
||||||
|
# == 4.2 Visualizing proportions: Pie chart ================================
|
||||||
|
|
||||||
|
# Often, we will use a pie chart instead. Pie charts are rather informal types of plots, not well suited for analysis. But easy to do:
|
||||||
|
|
||||||
|
# Define four colors to identify the four categories
|
||||||
pCol <- c("#ed394e", "#ff9582", "#ffd5c4", "#f2f2f0")
|
pCol <- c("#ed394e", "#ff9582", "#ffd5c4", "#f2f2f0")
|
||||||
oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))
|
|
||||||
pie(c(Nspc, Ngen, Nord, Nfng),
|
oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1)) # set margins to ~ 0
|
||||||
labels = "",
|
# and remember the
|
||||||
|
# previous setting
|
||||||
|
|
||||||
|
pie(c(nSpecies, # subtract numbers since these
|
||||||
|
nGenus - nSpecies, # categories are mutually contained
|
||||||
|
nOrder - nGenus - nSpecies, # in each other
|
||||||
|
nFungi - nOrder - nGenus -nSpecies),
|
||||||
|
labels = "",
|
||||||
|
radius = 0.9,
|
||||||
|
main = "MYSPE in genome-sequenced fungi",
|
||||||
|
lty = 0, # turn borders for wedges off
|
||||||
|
col = pCol,
|
||||||
|
clockwise = TRUE,
|
||||||
|
init.angle = 90)
|
||||||
|
|
||||||
|
title(main=MYSPE, line=0, cex.main=0.7) # add a title to the plot
|
||||||
|
|
||||||
|
legend(x = 0.95, y = 0.8, # place at legend here
|
||||||
|
legend = c("Species", "Genus", "Order", "Fungi"),
|
||||||
|
y.intersp = 2, # line spacing for labels
|
||||||
|
cex = 0.8, # character size for labels
|
||||||
|
bty = "n", # "no" box around the legend
|
||||||
|
pt.cex = 2, # size of colour boxes
|
||||||
|
pch = 15, # a filled square
|
||||||
|
col = pCol)
|
||||||
|
|
||||||
|
par(oPar) # reset graphics state
|
||||||
|
|
||||||
|
# Unless MYSPE is one of the frequently sequenced species, there will only be a
|
||||||
|
# very thin wedge visible. Pie charts are not well suited to visualize small
|
||||||
|
# proportions.
|
||||||
|
|
||||||
|
# It is a little more useful if we have non-nested proportions - like the
|
||||||
|
# number of species in the same order overall:
|
||||||
|
|
||||||
|
myTbl <- sort(table(fungiDat$Classification), decreasing = TRUE)
|
||||||
|
head(myTbl)
|
||||||
|
|
||||||
|
# pie() does a reasonable job out of the box to interpret table() data:
|
||||||
|
pie(myTbl)
|
||||||
|
|
||||||
|
# ... we can improve this quickly with a bit of tweaking:
|
||||||
|
|
||||||
|
N <- length(myTbl)
|
||||||
|
sel <- myOrder == names(myTbl) # TRUE for the MYSPE order, FALSE elsewhere
|
||||||
|
|
||||||
|
myCol <- rep(pCol[4], N) # N elements of pCol[1]
|
||||||
|
myCol[sel] <- pCol[1] # replace this one color
|
||||||
|
|
||||||
|
myLbl <- rep("", N) # N labels of ""
|
||||||
|
myLbl[sel] <- myOr # replace this one label with the MYSPE order
|
||||||
|
|
||||||
|
|
||||||
|
oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1)) # set margins to ~ 0
|
||||||
|
|
||||||
|
pie(myTbl,
|
||||||
|
labels = myLbl,
|
||||||
radius = 0.9,
|
radius = 0.9,
|
||||||
main = "MYSPE in genome-sequenced fungi",
|
main = "MYSPE order",
|
||||||
lty = 0, # no borders for wedges
|
border = "#DDDDDD",
|
||||||
col = pCol,
|
col = myCol,
|
||||||
clockwise = TRUE,
|
clockwise = TRUE,
|
||||||
init.angle = 90)
|
init.angle = 90)
|
||||||
title(main = MYSPE, line = 0, cex.main = 0.7)
|
|
||||||
legend(x = 0.95, y = 0.8, # position
|
|
||||||
legend = c("Species", "Genus", "Order", "Fungi"),
|
|
||||||
y.intersp = 2, # line spacing for labels
|
|
||||||
cex = 0.8, # character size for labels
|
|
||||||
bty = "n", # "no" box around the legend
|
|
||||||
pt.cex = 2, # size of colour boxes
|
|
||||||
pch = 15,
|
|
||||||
col = pCol)
|
|
||||||
par(oPar)
|
|
||||||
|
|
||||||
|
par(oPar) # reset graphics state
|
||||||
|
|
||||||
|
# But the overall problem remains.
|
||||||
|
|
||||||
|
|
||||||
|
# == 4.3 Visualizing proportions: Nested squares ===========================
|
||||||
|
|
||||||
|
# A simple alternative is to draw such proportions as nested squares:
|
||||||
|
|
||||||
|
x <- sqrt(nFungi)
|
||||||
|
|
||||||
|
# set margins to ~ 0 and type to square
|
||||||
|
oPar <- par(mar = c(0.1, 0.1, 0.1, 0.1), pty = "s")
|
||||||
|
|
||||||
|
# empty, square plot
|
||||||
|
plot(c(0, x), c(0, x), xlim = c(0, x), ylim = c(0, x),
|
||||||
|
type="n", axes=FALSE, xlab="", ylab="")
|
||||||
|
|
||||||
|
# basic square for all genomes
|
||||||
|
rect(0, 0, x, x, col = pCol[4])
|
||||||
|
|
||||||
|
# grid
|
||||||
|
u <- 0:floor(x)
|
||||||
|
N <- length(u)
|
||||||
|
segments(rep(0, N), u, rep(x, N), u, col = "#0000FF18")
|
||||||
|
segments(u, rep(0, N), u, rep(x, N), col = "#0000FF18")
|
||||||
|
# each square on this grid is one genome
|
||||||
|
|
||||||
|
# colored squares
|
||||||
|
rect(0, 0, sqrt(nOrder), sqrt(nOrder), col = pCol[3])
|
||||||
|
rect(0, 0, sqrt(nGenus), sqrt(nGenus), col = pCol[2])
|
||||||
|
rect(0, 0, sqrt(nSpecies), sqrt(nSpecies), col = pCol[1])
|
||||||
|
|
||||||
|
# labels
|
||||||
|
text(x/2, x/2, "Fungi")
|
||||||
|
text(x * 0.08, x * 0.11, myOr, pos = 4, cex = 0.9)
|
||||||
|
text(x * 0.08, x * 0.06, myGn, pos = 4, cex = 0.8)
|
||||||
|
text(x * 0.08, x * 0.02, MYSPE, pos = 4, cex = 0.7)
|
||||||
|
|
||||||
|
par(oPar) # reset graphics state
|
||||||
|
|
||||||
|
|
||||||
|
# == 4.4 Visualizing proportions: Sankey diagrams ==========================
|
||||||
|
|
||||||
|
# Sankey diagrams are an excellent way to visualize complicated nested
|
||||||
|
# proportions and their changes (see here for example:
|
||||||
|
# https://www.r-graph-gallery.com/sankey-diagram.html). Here is a very simple
|
||||||
|
# example with the MYSPE proportions, as an illustration of the plotting
|
||||||
|
# principle.
|
||||||
|
|
||||||
|
if (! requireNamespace("plotly")) {
|
||||||
|
install.packages("plotly")
|
||||||
|
}
|
||||||
|
# Package information:
|
||||||
|
# library(help = plotly) # basic information
|
||||||
|
# browseVignettes("plotly") # available vignettes
|
||||||
|
# data(package = "plotly") # available datasets
|
||||||
|
|
||||||
|
# Here, we use the plotly package that wraps a very well developed javascript library with many options for interactive plots.
|
||||||
|
|
||||||
|
|
||||||
|
myNodes <- list(label = c("Fungi (1014)", # 0 <- node ID
|
||||||
|
"Ophiostomatales (6)", # 1
|
||||||
|
"Other...", # 2
|
||||||
|
"Sporothrix (4)", # 3
|
||||||
|
"Other...", # 4
|
||||||
|
"Sporothrix schenckii (2)", # 5
|
||||||
|
"Other..." # 6
|
||||||
|
),
|
||||||
|
x = c(0.1, 0.4, 0.4, 0.7, 0.7, 1.0, 1.0),
|
||||||
|
y = c(0.3, 0.1, 0.7, 0.2, 0.7, 0.3, 0.7),
|
||||||
|
color = c("#f2f2f0", #
|
||||||
|
"#ffd5c4",
|
||||||
|
"#CCCCCC",
|
||||||
|
"#ff9582",
|
||||||
|
"#CCCCCC",
|
||||||
|
"#ed394e",
|
||||||
|
"#CCCCCC"
|
||||||
|
),
|
||||||
|
pad = 15,
|
||||||
|
thickness = 20,
|
||||||
|
line = list(color = "black",
|
||||||
|
width = 0.5))
|
||||||
|
|
||||||
|
myLinks <- list(source = c(0, 0, 1, 1, 3, 3), # i.e. there is a link of
|
||||||
|
target = c(1, 2, 3, 4, 5, 6), # weight 6 between node 0
|
||||||
|
value = c(6, 18, 4, 2, 2, 2)) # and node 1
|
||||||
|
|
||||||
|
# Setting up the actual plot ...
|
||||||
|
fig <- plotly::plot_ly(type = "sankey",
|
||||||
|
arrangement = "snap",
|
||||||
|
orientation = "h",
|
||||||
|
node = myNodes,
|
||||||
|
link = myLinks)
|
||||||
|
|
||||||
|
# Adding and adjusting a few layout parameters
|
||||||
|
fig <- plotly::layout(fig,
|
||||||
|
title = "Fungi Genomes - Classification",
|
||||||
|
font = list(size = 10))
|
||||||
|
|
||||||
|
fig # plot the diagram
|
||||||
|
|
||||||
|
# Note that the plot appears in the Viewer window, not the Plot window, and that
|
||||||
|
# it is interactive: you can hover over nodes and links, and drag the nodes
|
||||||
|
# around.
|
||||||
|
|
||||||
# [END]
|
# [END]
|
||||||
|
Loading…
Reference in New Issue
Block a user