Reorganized proportional plots into a "further reading" section; added nested-box, and sankey plot visualization of proportions. Introduced plotly.
This commit is contained in:
parent
8a5c5b3961
commit
8839248362
277
BIN-MYSPE.R
277
BIN-MYSPE.R
@ -3,11 +3,14 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-MYSPE unit
|
||||
#
|
||||
# Version: 1.1
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2020-09-18
|
||||
# Date: 2020-10-01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# V 1.2 Reorganized proportional plot section into a "further reading"
|
||||
# section, added nested-box, and sankey plot visualization of
|
||||
# proportions. Introduced plotly.
|
||||
# V 1.1 2020 Workflow changes
|
||||
# V 1.0.1 Move ABC-makeMYSPElist.R to ./scripts directory
|
||||
# V 1.0 Final code, after rewriting BLAST parser and updating MYSPElist
|
||||
@ -30,15 +33,20 @@
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -----------------------------------------------
|
||||
#TOC> 1 Preparations 47
|
||||
#TOC> 2 Suitable MYSPE Species 59
|
||||
#TOC> 3 Adopt "MYSPE" 83
|
||||
#TOC> -----------------------------------------------------------------
|
||||
#TOC> 1 PREPARATIONS 49
|
||||
#TOC> 2 SUITABLE MYSPE SPECIES 61
|
||||
#TOC> 3 ADOPT "MYSPE" 85
|
||||
#TOC> 4 FURTHER READING: PLOTTING PROPORTIONS 124
|
||||
#TOC> 4.1 Percentages 142
|
||||
#TOC> 4.2 Visualizing proportions: Pie chart 161
|
||||
#TOC> 4.3 Visualizing proportions: Nested squares 238
|
||||
#TOC> 4.4 Visualizing proportions: Sankey diagrams 275
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Preparations ========================================================
|
||||
# = 1 PREPARATIONS ========================================================
|
||||
#
|
||||
|
||||
# Execute the two conditionals below:
|
||||
@ -50,12 +58,12 @@ if (! exists("myStudentNumber")) {
|
||||
}
|
||||
|
||||
|
||||
# = 2 Suitable MYSPE Species ==============================================
|
||||
# = 2 SUITABLE MYSPE SPECIES ==============================================
|
||||
|
||||
|
||||
# In this unit we will select one species from a list of genome sequenced fungi
|
||||
# and write it into your personalized profile file. This species will be called
|
||||
# "MYSPE" (Your Favourite Organism) for other learning units and exercises.
|
||||
# "MYSPE" (My Species) for other learning units and exercises.
|
||||
|
||||
# A detailed description of the process of compiling the list of genome
|
||||
# sequenced fungi with protein annotations and Mbp1 homologues is in the file
|
||||
@ -74,69 +82,258 @@ if (! exists("myStudentNumber")) {
|
||||
# implemented in practice.
|
||||
|
||||
|
||||
# = 3 Adopt "MYSPE" =======================================================
|
||||
# = 3 ADOPT "MYSPE" =======================================================
|
||||
|
||||
# Execute:
|
||||
( MYSPE <- getMYSPE(myStudentNumber) )
|
||||
|
||||
# If this produced an error, this session has not been properly set up. You
|
||||
# may not yet have run init() and edited .myProfile.R , or that file is not
|
||||
# in your myScripts/ folder. Fix this, and execute source(".Rprofile") .
|
||||
# in your myScripts/ folder. Fix this, and execute:
|
||||
#
|
||||
# source(".Rprofile") .
|
||||
|
||||
# If this produced NA, your Student Number may not be correct, or you are not
|
||||
# in my class-list. Contact me.
|
||||
# Otherwise, this should have printed a species name. Your unique species
|
||||
# for this course.
|
||||
# If this produced NA, your Student Number may not be correct, or you are not in
|
||||
# my class-list. Contact me. Otherwise, this should have printed a species name,
|
||||
# and the taxonomy ID of its genome-sequenced strain. This is your unique species
|
||||
# for this course. Note it in your journal ...
|
||||
|
||||
biCode(MYSPE) # and what is it's "BiCode" ... ?
|
||||
biCode(MYSPE) # and also note it's "BiCode" ...
|
||||
( myTaxID <- names(MYSPE) ) # and its taxID
|
||||
|
||||
# Task: Note down the species name and its five letter BiCode on your Student
|
||||
|
||||
# Task:
|
||||
# =====
|
||||
# Note down the species name and its five letter BiCode on your Student
|
||||
# Wiki user page. Use this species whenever this or future assignments refer
|
||||
# to MYSPE. Whenever you start a session, it will automatically be loaded
|
||||
# from myScripts/.myProfile.R and is available as MYSPE .
|
||||
|
||||
# Here is some more information:
|
||||
# Here is some more information about MYSPE, taken from the table of genome-
|
||||
# sequenced fungi that is in your ./data folder.
|
||||
fungiDat <- read.csv("data/Species.csv")
|
||||
iMs <- which(fungiDat$Taxon.ID == myTaxID)
|
||||
|
||||
# number of sequenced fungal genomes:
|
||||
nrow(fungiDat)
|
||||
( myOr <- fungiDat$Classification[iMs] ) # Taxonomic order
|
||||
( myGn <- gsub("\\s.*", "", MYSPE)) # Taxonomic genus
|
||||
( mySt <- fungiDat$Name[iMs] ) # Taxonomic strain
|
||||
|
||||
# sequenced genomes of species:
|
||||
sel <- MYSPE == gsub("^(\\S+\\s\\S+).*$", "\\1", fungiDat$Name)
|
||||
( x <- fungiDat[sel, "Name"] )
|
||||
Nspc <- length(x) # save this for later ...
|
||||
# That's all.
|
||||
|
||||
# sequenced genomes of genus:
|
||||
sel <- gsub("\\s.*", "", MYSPE) == gsub("\\s.*", "", fungiDat$Name)
|
||||
( x <- fungiDat[sel, "Name"] )
|
||||
Ngen <- length(x) - Nspc
|
||||
|
||||
# order:
|
||||
( x <- unique(fungiDat[sel, "Classification"]) )
|
||||
Nord <- sum(fungiDat$Classification == x) - Ngen - Nspc
|
||||
Nfng <- nrow(fungiDat) - Nord - Ngen - Nspc
|
||||
# = 4 FURTHER READING: PLOTTING PROPORTIONS ===============================
|
||||
|
||||
# proportions
|
||||
# The material below is an exploration of data-preparation and plotting
|
||||
# techniques; you can treat this as additional practice and further reading and
|
||||
# I expect that some of the code and plotting examples may be useful in a
|
||||
# different context.
|
||||
|
||||
# A frequent task is to visualize the proportion of elements with given
|
||||
# categories in a sample. For example, we might ask what the proportion of the
|
||||
# different orders of fungi is the order of MYSPE? Let's first collect the
|
||||
# numbers.
|
||||
|
||||
( nFungi <- nrow(fungiDat) ) # sequenced fungi
|
||||
( nOrder <- sum(grepl(myOr, fungiDat$Classification)) ) # same order as MYSPE
|
||||
( nGenus <- sum(grepl(myGn, fungiDat$Name)) ) # same genus as MYSPE
|
||||
( nSpecies <- sum(grepl(MYSPE, fungiDat$Name)) ) # same species as MYSPE
|
||||
|
||||
|
||||
# == 4.1 Percentages =======================================================
|
||||
|
||||
# The zeroth-order approach to visualization is simply to print percentages:
|
||||
|
||||
cat(sprintf("\n%s comprise %5.2f%% of fungi.",
|
||||
myOr,
|
||||
(nOrder * 100) / nFungi))
|
||||
|
||||
# ... or, adding the actual numbers:
|
||||
|
||||
cat(sprintf("\n%s comprise %5.2f%% of fungi (%d of %d).",
|
||||
myOr,
|
||||
(nOrder * 100) / nFungi,
|
||||
nOrder,
|
||||
nFungi))
|
||||
|
||||
# But that's hard to visualize for most of us, and anyway, we don't know how
|
||||
# that relates to other orders.
|
||||
|
||||
# == 4.2 Visualizing proportions: Pie chart ================================
|
||||
|
||||
# Often, we will use a pie chart instead. Pie charts are rather informal types of plots, not well suited for analysis. But easy to do:
|
||||
|
||||
# Define four colors to identify the four categories
|
||||
pCol <- c("#ed394e", "#ff9582", "#ffd5c4", "#f2f2f0")
|
||||
oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))
|
||||
pie(c(Nspc, Ngen, Nord, Nfng),
|
||||
|
||||
oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1)) # set margins to ~ 0
|
||||
# and remember the
|
||||
# previous setting
|
||||
|
||||
pie(c(nSpecies, # subtract numbers since these
|
||||
nGenus - nSpecies, # categories are mutually contained
|
||||
nOrder - nGenus - nSpecies, # in each other
|
||||
nFungi - nOrder - nGenus -nSpecies),
|
||||
labels = "",
|
||||
radius = 0.9,
|
||||
main = "MYSPE in genome-sequenced fungi",
|
||||
lty = 0, # no borders for wedges
|
||||
lty = 0, # turn borders for wedges off
|
||||
col = pCol,
|
||||
clockwise = TRUE,
|
||||
init.angle = 90)
|
||||
title(main = MYSPE, line = 0, cex.main = 0.7)
|
||||
legend(x = 0.95, y = 0.8, # position
|
||||
|
||||
title(main=MYSPE, line=0, cex.main=0.7) # add a title to the plot
|
||||
|
||||
legend(x = 0.95, y = 0.8, # place at legend here
|
||||
legend = c("Species", "Genus", "Order", "Fungi"),
|
||||
y.intersp = 2, # line spacing for labels
|
||||
cex = 0.8, # character size for labels
|
||||
bty = "n", # "no" box around the legend
|
||||
pt.cex = 2, # size of colour boxes
|
||||
pch = 15,
|
||||
pch = 15, # a filled square
|
||||
col = pCol)
|
||||
par(oPar)
|
||||
|
||||
par(oPar) # reset graphics state
|
||||
|
||||
# Unless MYSPE is one of the frequently sequenced species, there will only be a
|
||||
# very thin wedge visible. Pie charts are not well suited to visualize small
|
||||
# proportions.
|
||||
|
||||
# It is a little more useful if we have non-nested proportions - like the
|
||||
# number of species in the same order overall:
|
||||
|
||||
myTbl <- sort(table(fungiDat$Classification), decreasing = TRUE)
|
||||
head(myTbl)
|
||||
|
||||
# pie() does a reasonable job out of the box to interpret table() data:
|
||||
pie(myTbl)
|
||||
|
||||
# ... we can improve this quickly with a bit of tweaking:
|
||||
|
||||
N <- length(myTbl)
|
||||
sel <- myOrder == names(myTbl) # TRUE for the MYSPE order, FALSE elsewhere
|
||||
|
||||
myCol <- rep(pCol[4], N) # N elements of pCol[1]
|
||||
myCol[sel] <- pCol[1] # replace this one color
|
||||
|
||||
myLbl <- rep("", N) # N labels of ""
|
||||
myLbl[sel] <- myOr # replace this one label with the MYSPE order
|
||||
|
||||
|
||||
oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1)) # set margins to ~ 0
|
||||
|
||||
pie(myTbl,
|
||||
labels = myLbl,
|
||||
radius = 0.9,
|
||||
main = "MYSPE order",
|
||||
border = "#DDDDDD",
|
||||
col = myCol,
|
||||
clockwise = TRUE,
|
||||
init.angle = 90)
|
||||
|
||||
par(oPar) # reset graphics state
|
||||
|
||||
# But the overall problem remains.
|
||||
|
||||
|
||||
# == 4.3 Visualizing proportions: Nested squares ===========================
|
||||
|
||||
# A simple alternative is to draw such proportions as nested squares:
|
||||
|
||||
x <- sqrt(nFungi)
|
||||
|
||||
# set margins to ~ 0 and type to square
|
||||
oPar <- par(mar = c(0.1, 0.1, 0.1, 0.1), pty = "s")
|
||||
|
||||
# empty, square plot
|
||||
plot(c(0, x), c(0, x), xlim = c(0, x), ylim = c(0, x),
|
||||
type="n", axes=FALSE, xlab="", ylab="")
|
||||
|
||||
# basic square for all genomes
|
||||
rect(0, 0, x, x, col = pCol[4])
|
||||
|
||||
# grid
|
||||
u <- 0:floor(x)
|
||||
N <- length(u)
|
||||
segments(rep(0, N), u, rep(x, N), u, col = "#0000FF18")
|
||||
segments(u, rep(0, N), u, rep(x, N), col = "#0000FF18")
|
||||
# each square on this grid is one genome
|
||||
|
||||
# colored squares
|
||||
rect(0, 0, sqrt(nOrder), sqrt(nOrder), col = pCol[3])
|
||||
rect(0, 0, sqrt(nGenus), sqrt(nGenus), col = pCol[2])
|
||||
rect(0, 0, sqrt(nSpecies), sqrt(nSpecies), col = pCol[1])
|
||||
|
||||
# labels
|
||||
text(x/2, x/2, "Fungi")
|
||||
text(x * 0.08, x * 0.11, myOr, pos = 4, cex = 0.9)
|
||||
text(x * 0.08, x * 0.06, myGn, pos = 4, cex = 0.8)
|
||||
text(x * 0.08, x * 0.02, MYSPE, pos = 4, cex = 0.7)
|
||||
|
||||
par(oPar) # reset graphics state
|
||||
|
||||
|
||||
# == 4.4 Visualizing proportions: Sankey diagrams ==========================
|
||||
|
||||
# Sankey diagrams are an excellent way to visualize complicated nested
|
||||
# proportions and their changes (see here for example:
|
||||
# https://www.r-graph-gallery.com/sankey-diagram.html). Here is a very simple
|
||||
# example with the MYSPE proportions, as an illustration of the plotting
|
||||
# principle.
|
||||
|
||||
if (! requireNamespace("plotly")) {
|
||||
install.packages("plotly")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = plotly) # basic information
|
||||
# browseVignettes("plotly") # available vignettes
|
||||
# data(package = "plotly") # available datasets
|
||||
|
||||
# Here, we use the plotly package that wraps a very well developed javascript library with many options for interactive plots.
|
||||
|
||||
|
||||
myNodes <- list(label = c("Fungi (1014)", # 0 <- node ID
|
||||
"Ophiostomatales (6)", # 1
|
||||
"Other...", # 2
|
||||
"Sporothrix (4)", # 3
|
||||
"Other...", # 4
|
||||
"Sporothrix schenckii (2)", # 5
|
||||
"Other..." # 6
|
||||
),
|
||||
x = c(0.1, 0.4, 0.4, 0.7, 0.7, 1.0, 1.0),
|
||||
y = c(0.3, 0.1, 0.7, 0.2, 0.7, 0.3, 0.7),
|
||||
color = c("#f2f2f0", #
|
||||
"#ffd5c4",
|
||||
"#CCCCCC",
|
||||
"#ff9582",
|
||||
"#CCCCCC",
|
||||
"#ed394e",
|
||||
"#CCCCCC"
|
||||
),
|
||||
pad = 15,
|
||||
thickness = 20,
|
||||
line = list(color = "black",
|
||||
width = 0.5))
|
||||
|
||||
myLinks <- list(source = c(0, 0, 1, 1, 3, 3), # i.e. there is a link of
|
||||
target = c(1, 2, 3, 4, 5, 6), # weight 6 between node 0
|
||||
value = c(6, 18, 4, 2, 2, 2)) # and node 1
|
||||
|
||||
# Setting up the actual plot ...
|
||||
fig <- plotly::plot_ly(type = "sankey",
|
||||
arrangement = "snap",
|
||||
orientation = "h",
|
||||
node = myNodes,
|
||||
link = myLinks)
|
||||
|
||||
# Adding and adjusting a few layout parameters
|
||||
fig <- plotly::layout(fig,
|
||||
title = "Fungi Genomes - Classification",
|
||||
font = list(size = 10))
|
||||
|
||||
fig # plot the diagram
|
||||
|
||||
# Note that the plot appears in the Viewer window, not the Plot window, and that
|
||||
# it is interactive: you can hover over nodes and links, and drag the nodes
|
||||
# around.
|
||||
|
||||
# [END]
|
||||
|
Loading…
Reference in New Issue
Block a user