Reorganized proportional plots into a "further reading" section; added nested-box, and sankey plot visualization of proportions. Introduced plotly.

This commit is contained in:
hyginn 2020-10-01 18:18:51 +10:00
parent 8a5c5b3961
commit 8839248362

View File

@ -3,11 +3,14 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-MYSPE unit
#
# Version: 1.1
# Version: 1.2
#
# Date: 2020-09-18
# Date: 2020-10-01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# V 1.2 Reorganized proportional plot section into a "further reading"
# section, added nested-box, and sankey plot visualization of
# proportions. Introduced plotly.
# V 1.1 2020 Workflow changes
# V 1.0.1 Move ABC-makeMYSPElist.R to ./scripts directory
# V 1.0 Final code, after rewriting BLAST parser and updating MYSPElist
@ -29,16 +32,21 @@
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -----------------------------------------------
#TOC> 1 Preparations 47
#TOC> 2 Suitable MYSPE Species 59
#TOC> 3 Adopt "MYSPE" 83
#TOC> Section Title Line
#TOC> -----------------------------------------------------------------
#TOC> 1 PREPARATIONS 49
#TOC> 2 SUITABLE MYSPE SPECIES 61
#TOC> 3 ADOPT "MYSPE" 85
#TOC> 4 FURTHER READING: PLOTTING PROPORTIONS 124
#TOC> 4.1 Percentages 142
#TOC> 4.2 Visualizing proportions: Pie chart 161
#TOC> 4.3 Visualizing proportions: Nested squares 238
#TOC> 4.4 Visualizing proportions: Sankey diagrams 275
#TOC>
#TOC> ==========================================================================
# = 1 Preparations ========================================================
# = 1 PREPARATIONS ========================================================
#
# Execute the two conditionals below:
@ -50,12 +58,12 @@ if (! exists("myStudentNumber")) {
}
# = 2 Suitable MYSPE Species ==============================================
# = 2 SUITABLE MYSPE SPECIES ==============================================
# In this unit we will select one species from a list of genome sequenced fungi
# and write it into your personalized profile file. This species will be called
# "MYSPE" (Your Favourite Organism) for other learning units and exercises.
# "MYSPE" (My Species) for other learning units and exercises.
# A detailed description of the process of compiling the list of genome
# sequenced fungi with protein annotations and Mbp1 homologues is in the file
@ -74,69 +82,258 @@ if (! exists("myStudentNumber")) {
# implemented in practice.
# = 3 Adopt "MYSPE" =======================================================
# = 3 ADOPT "MYSPE" =======================================================
# Execute:
( MYSPE <- getMYSPE(myStudentNumber) )
# If this produced an error, this session has not been properly set up. You
# may not yet have run init() and edited .myProfile.R , or that file is not
# in your myScripts/ folder. Fix this, and execute source(".Rprofile") .
# in your myScripts/ folder. Fix this, and execute:
#
# source(".Rprofile") .
# If this produced NA, your Student Number may not be correct, or you are not
# in my class-list. Contact me.
# Otherwise, this should have printed a species name. Your unique species
# for this course.
# If this produced NA, your Student Number may not be correct, or you are not in
# my class-list. Contact me. Otherwise, this should have printed a species name,
# and the taxonomy ID of its genome-sequenced strain. This is your unique species
# for this course. Note it in your journal ...
biCode(MYSPE) # and what is it's "BiCode" ... ?
biCode(MYSPE) # and also note it's "BiCode" ...
( myTaxID <- names(MYSPE) ) # and its taxID
# Task: Note down the species name and its five letter BiCode on your Student
# Wiki user page. Use this species whenever this or future assignments refer
# to MYSPE. Whenever you start a session, it will automatically be loaded
# from myScripts/.myProfile.R and is available as MYSPE .
# Here is some more information:
# Task:
# =====
# Note down the species name and its five letter BiCode on your Student
# Wiki user page. Use this species whenever this or future assignments refer
# to MYSPE. Whenever you start a session, it will automatically be loaded
# from myScripts/.myProfile.R and is available as MYSPE .
# Here is some more information about MYSPE, taken from the table of genome-
# sequenced fungi that is in your ./data folder.
fungiDat <- read.csv("data/Species.csv")
iMs <- which(fungiDat$Taxon.ID == myTaxID)
# number of sequenced fungal genomes:
nrow(fungiDat)
( myOr <- fungiDat$Classification[iMs] ) # Taxonomic order
( myGn <- gsub("\\s.*", "", MYSPE)) # Taxonomic genus
( mySt <- fungiDat$Name[iMs] ) # Taxonomic strain
# sequenced genomes of species:
sel <- MYSPE == gsub("^(\\S+\\s\\S+).*$", "\\1", fungiDat$Name)
( x <- fungiDat[sel, "Name"] )
Nspc <- length(x) # save this for later ...
# That's all.
# sequenced genomes of genus:
sel <- gsub("\\s.*", "", MYSPE) == gsub("\\s.*", "", fungiDat$Name)
( x <- fungiDat[sel, "Name"] )
Ngen <- length(x) - Nspc
# order:
( x <- unique(fungiDat[sel, "Classification"]) )
Nord <- sum(fungiDat$Classification == x) - Ngen - Nspc
Nfng <- nrow(fungiDat) - Nord - Ngen - Nspc
# = 4 FURTHER READING: PLOTTING PROPORTIONS ===============================
# proportions
# The material below is an exploration of data-preparation and plotting
# techniques; you can treat this as additional practice and further reading and
# I expect that some of the code and plotting examples may be useful in a
# different context.
# A frequent task is to visualize the proportion of elements with given
# categories in a sample. For example, we might ask what the proportion of the
# different orders of fungi is the order of MYSPE? Let's first collect the
# numbers.
( nFungi <- nrow(fungiDat) ) # sequenced fungi
( nOrder <- sum(grepl(myOr, fungiDat$Classification)) ) # same order as MYSPE
( nGenus <- sum(grepl(myGn, fungiDat$Name)) ) # same genus as MYSPE
( nSpecies <- sum(grepl(MYSPE, fungiDat$Name)) ) # same species as MYSPE
# == 4.1 Percentages =======================================================
# The zeroth-order approach to visualization is simply to print percentages:
cat(sprintf("\n%s comprise %5.2f%% of fungi.",
myOr,
(nOrder * 100) / nFungi))
# ... or, adding the actual numbers:
cat(sprintf("\n%s comprise %5.2f%% of fungi (%d of %d).",
myOr,
(nOrder * 100) / nFungi,
nOrder,
nFungi))
# But that's hard to visualize for most of us, and anyway, we don't know how
# that relates to other orders.
# == 4.2 Visualizing proportions: Pie chart ================================
# Often, we will use a pie chart instead. Pie charts are rather informal types of plots, not well suited for analysis. But easy to do:
# Define four colors to identify the four categories
pCol <- c("#ed394e", "#ff9582", "#ffd5c4", "#f2f2f0")
oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))
pie(c(Nspc, Ngen, Nord, Nfng),
labels = "",
oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1)) # set margins to ~ 0
# and remember the
# previous setting
pie(c(nSpecies, # subtract numbers since these
nGenus - nSpecies, # categories are mutually contained
nOrder - nGenus - nSpecies, # in each other
nFungi - nOrder - nGenus -nSpecies),
labels = "",
radius = 0.9,
main = "MYSPE in genome-sequenced fungi",
lty = 0, # turn borders for wedges off
col = pCol,
clockwise = TRUE,
init.angle = 90)
title(main=MYSPE, line=0, cex.main=0.7) # add a title to the plot
legend(x = 0.95, y = 0.8, # place at legend here
legend = c("Species", "Genus", "Order", "Fungi"),
y.intersp = 2, # line spacing for labels
cex = 0.8, # character size for labels
bty = "n", # "no" box around the legend
pt.cex = 2, # size of colour boxes
pch = 15, # a filled square
col = pCol)
par(oPar) # reset graphics state
# Unless MYSPE is one of the frequently sequenced species, there will only be a
# very thin wedge visible. Pie charts are not well suited to visualize small
# proportions.
# It is a little more useful if we have non-nested proportions - like the
# number of species in the same order overall:
myTbl <- sort(table(fungiDat$Classification), decreasing = TRUE)
head(myTbl)
# pie() does a reasonable job out of the box to interpret table() data:
pie(myTbl)
# ... we can improve this quickly with a bit of tweaking:
N <- length(myTbl)
sel <- myOrder == names(myTbl) # TRUE for the MYSPE order, FALSE elsewhere
myCol <- rep(pCol[4], N) # N elements of pCol[1]
myCol[sel] <- pCol[1] # replace this one color
myLbl <- rep("", N) # N labels of ""
myLbl[sel] <- myOr # replace this one label with the MYSPE order
oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1)) # set margins to ~ 0
pie(myTbl,
labels = myLbl,
radius = 0.9,
main = "MYSPE in genome-sequenced fungi",
lty = 0, # no borders for wedges
col = pCol,
main = "MYSPE order",
border = "#DDDDDD",
col = myCol,
clockwise = TRUE,
init.angle = 90)
title(main = MYSPE, line = 0, cex.main = 0.7)
legend(x = 0.95, y = 0.8, # position
legend = c("Species", "Genus", "Order", "Fungi"),
y.intersp = 2, # line spacing for labels
cex = 0.8, # character size for labels
bty = "n", # "no" box around the legend
pt.cex = 2, # size of colour boxes
pch = 15,
col = pCol)
par(oPar)
par(oPar) # reset graphics state
# But the overall problem remains.
# == 4.3 Visualizing proportions: Nested squares ===========================
# A simple alternative is to draw such proportions as nested squares:
x <- sqrt(nFungi)
# set margins to ~ 0 and type to square
oPar <- par(mar = c(0.1, 0.1, 0.1, 0.1), pty = "s")
# empty, square plot
plot(c(0, x), c(0, x), xlim = c(0, x), ylim = c(0, x),
type="n", axes=FALSE, xlab="", ylab="")
# basic square for all genomes
rect(0, 0, x, x, col = pCol[4])
# grid
u <- 0:floor(x)
N <- length(u)
segments(rep(0, N), u, rep(x, N), u, col = "#0000FF18")
segments(u, rep(0, N), u, rep(x, N), col = "#0000FF18")
# each square on this grid is one genome
# colored squares
rect(0, 0, sqrt(nOrder), sqrt(nOrder), col = pCol[3])
rect(0, 0, sqrt(nGenus), sqrt(nGenus), col = pCol[2])
rect(0, 0, sqrt(nSpecies), sqrt(nSpecies), col = pCol[1])
# labels
text(x/2, x/2, "Fungi")
text(x * 0.08, x * 0.11, myOr, pos = 4, cex = 0.9)
text(x * 0.08, x * 0.06, myGn, pos = 4, cex = 0.8)
text(x * 0.08, x * 0.02, MYSPE, pos = 4, cex = 0.7)
par(oPar) # reset graphics state
# == 4.4 Visualizing proportions: Sankey diagrams ==========================
# Sankey diagrams are an excellent way to visualize complicated nested
# proportions and their changes (see here for example:
# https://www.r-graph-gallery.com/sankey-diagram.html). Here is a very simple
# example with the MYSPE proportions, as an illustration of the plotting
# principle.
if (! requireNamespace("plotly")) {
install.packages("plotly")
}
# Package information:
# library(help = plotly) # basic information
# browseVignettes("plotly") # available vignettes
# data(package = "plotly") # available datasets
# Here, we use the plotly package that wraps a very well developed javascript library with many options for interactive plots.
myNodes <- list(label = c("Fungi (1014)", # 0 <- node ID
"Ophiostomatales (6)", # 1
"Other...", # 2
"Sporothrix (4)", # 3
"Other...", # 4
"Sporothrix schenckii (2)", # 5
"Other..." # 6
),
x = c(0.1, 0.4, 0.4, 0.7, 0.7, 1.0, 1.0),
y = c(0.3, 0.1, 0.7, 0.2, 0.7, 0.3, 0.7),
color = c("#f2f2f0", #
"#ffd5c4",
"#CCCCCC",
"#ff9582",
"#CCCCCC",
"#ed394e",
"#CCCCCC"
),
pad = 15,
thickness = 20,
line = list(color = "black",
width = 0.5))
myLinks <- list(source = c(0, 0, 1, 1, 3, 3), # i.e. there is a link of
target = c(1, 2, 3, 4, 5, 6), # weight 6 between node 0
value = c(6, 18, 4, 2, 2, 2)) # and node 1
# Setting up the actual plot ...
fig <- plotly::plot_ly(type = "sankey",
arrangement = "snap",
orientation = "h",
node = myNodes,
link = myLinks)
# Adding and adjusting a few layout parameters
fig <- plotly::layout(fig,
title = "Fungi Genomes - Classification",
font = list(size = 10))
fig # plot the diagram
# Note that the plot appears in the Viewer window, not the Plot window, and that
# it is interactive: you can hover over nodes and links, and drag the nodes
# around.
# [END]