bch441-work-abc-units/BIN-MYSPE.R

352 lines
14 KiB
R
Raw Permalink Normal View History

2021-11-16 05:31:48 +00:00
# tocID <- "BIN-MYSPE.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-MYSPE unit
#
#
# Version: 1.4
#
# Date: 2017-09 - 2021-10
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# V 1.4 Add troubleshooting hints via errText[[...]]
# V 1.3 2021 update of MYSPE mechanics; fix a bug no one had complained about
# V 1.2 Reorganized proportional plot section into a "further reading"
# section, added nested-box, and sankey plot visualization of
# proportions. Introduced plotly.
# V 1.1 2020 Workflow changes
# V 1.0.1 Move ABC-makeMYSPElist.R to ./scripts directory
# V 1.0 Final code, after rewriting BLAST parser and updating MYSPElist
# V 0.1 First code copied from BCH441_A03_makeMYSPElist.R
#
# TODO: Sample solution for sankey plot function.
#
#
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
#
# DO NOT SIMPLY source() THESE FILES!
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -----------------------------------------------------------------
#TOC> 1 PREPARATIONS 52
#TOC> 2 SUITABLE MYSPE SPECIES 65
#TOC> 3 ADOPT "MYSPE" 89
#TOC> 4 FURTHER READING: PLOTTING PROPORTIONS 128
#TOC> 4.1 Percentages 146
#TOC> 4.2 Visualizing proportions: Pie chart 165
#TOC> 4.3 Visualizing proportions: Nested squares 243
#TOC> 4.4 Visualizing proportions: Sankey diagrams 280
#TOC>
#TOC> ==========================================================================
# = 1 PREPARATIONS ========================================================
#
# Execute the two conditionals below:
if (! file.exists("./myScripts/.myProfile.R")) {
stop(errText[["noProfileFile"]]) # message defined in .Rprofile
}
if (! exists("myStudentNumber")) {
stop(errText[["noStudentNumber"]]) # message defined in .Rprofile
}
# = 2 SUITABLE MYSPE SPECIES ==============================================
# In this unit we will select one species from a list of genome sequenced fungi
# and write it into your personalized profile file. This species will be called
# "MYSPE" (My Species) for other learning units and exercises.
# A detailed description of the process of compiling the list of genome
# sequenced fungi with protein annotations and Mbp1 homologues is in the file
# ./scripts/ABC-makeMYSPElist.R In brief, data for genome-sequenced fungi
# was retrieved from https://fungi.ensembl.org; a search for homologues to
# yeast Mbp1 was performed with BLAST at the NCBI, and the data was merged.
# A representative organism at each genus-level was chosen from those hits
# that actual;ly have a homologue. Finally, a mapping table was constructed to
# asymmetrically retrieve unique species: a student number will retrieve
# a species, but (public) knowledge of the species cannot reconstruct the
# student number.
# Task: Study ./scripts/ABC-makeMYSPElist.R, it implements a typical workflow
# of selecting and combining data from various data resources. Studying
# it will give you a better sense of how such workflows can be
# implemented in practice.
# = 3 ADOPT "MYSPE" =======================================================
# Execute:
( MYSPE <- getMYSPE(myStudentNumber) )
# If this produced an error, this session has not been properly set up. You
# may not yet have run init() and edited .myProfile.R , or that file is not
# in your myScripts/ folder. Fix this, and execute:
#
# source(".Rprofile") .
# If this produced NA, your Student Number may not be correct, or you are not in
# my class-list. Contact me. Otherwise, this should have printed a species name,
# and the taxonomy ID of its genome-sequenced strain. This is your unique
# speciesfor this course. Note it in your journal ...
biCode(MYSPE) # and also note it's "BiCode" ...
( myTaxID <- names(MYSPE) ) # and its taxID
# Task:
# =====
# Note down the species name and its five letter BiCode on your Student
# Wiki user page. Use this species whenever this or future assignments refer
# to MYSPE. Whenever you start a session, it will automatically be loaded
# from myScripts/.myProfile.R and is available as MYSPE .
# Here is some more information about MYSPE, taken from the table of genome-
# sequenced fungi that is in your ./data folder.
fungiDat <- read.csv("data/Species.csv")
iMs <- which(fungiDat$Taxon.ID == myTaxID)
( myOr <- fungiDat$Classification[iMs] ) # Taxonomic order
( myGn <- gsub("\\s.*", "", MYSPE)) # Taxonomic genus
( mySt <- fungiDat$Name[iMs] ) # Taxonomic strain
# That's all.
# = 4 FURTHER READING: PLOTTING PROPORTIONS ===============================
# The material below is an exploration of data-preparation and plotting
# techniques; you can treat this as additional practice and further reading and
# I expect that some of the code and plotting examples may be useful in a
# different context.
# A frequent task is to visualize the proportion of elements with given
# categories in a sample. For example, we might ask what the proportion of the
# different orders of fungi is the order of MYSPE? Let's first collect the
# numbers.
( nFungi <- nrow(fungiDat) ) # sequenced fungi
( nOrder <- sum(grepl(myOr, fungiDat$Classification)) ) # same order as MYSPE
( nGenus <- sum(grepl(myGn, fungiDat$Name)) ) # same genus as MYSPE
( nSpecies <- sum(grepl(MYSPE, fungiDat$Name)) ) # same species as MYSPE
# == 4.1 Percentages =======================================================
# The zeroth-order approach to visualization is simply to print percentages:
cat(sprintf("\n%s comprise %5.2f%% of fungi.",
myOr,
(nOrder * 100) / nFungi))
# ... or, adding the actual numbers:
cat(sprintf("\n%s comprise %5.2f%% of fungi (%d of %d).",
myOr,
(nOrder * 100) / nFungi,
nOrder,
nFungi))
# But that's hard to visualize for most of us, and anyway, we don't know how
# that relates to other orders.
# == 4.2 Visualizing proportions: Pie chart ================================
# Often, we will use a pie chart instead. Pie charts are rather informal types
# of plots, not well suited for analysis. But easy to do:
# Define four colors to identify the four categories
pCol <- c("#ed394e", "#ff9582", "#ffd5c4", "#f2f2f0")
oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1)) # set margins to ~ 0
# and remember the
# previous setting
pie(c(nSpecies, # subtract numbers since these
nGenus - nSpecies, # categories are mutually contained
nOrder - nGenus - nSpecies, # in each other
nFungi - nOrder - nGenus - nSpecies),
labels = "",
radius = 0.9,
main = "MYSPE in genome-sequenced fungi",
lty = 0, # turn borders for wedges off
col = pCol,
clockwise = TRUE,
init.angle = 90)
title(main=MYSPE, line=0, cex.main=0.7) # add a title to the plot
legend(x = 0.95, y = 0.8, # place at legend here
legend = c("Species", "Genus", "Order", "Fungi"),
y.intersp = 2, # line spacing for labels
cex = 0.8, # character size for labels
bty = "n", # "no" box around the legend
pt.cex = 2, # size of colour boxes
pch = 15, # a filled square
col = pCol)
par(oPar) # reset graphics state
# Unless MYSPE is one of the frequently sequenced species, there will only be a
# very thin wedge visible. Pie charts are not well suited to visualize small
# proportions.
# It is a little more useful if we have non-nested proportions - like the
# number of species in the same order overall:
myTbl <- sort(table(fungiDat$Classification), decreasing = TRUE)
head(myTbl)
# pie() does a reasonable job out of the box to interpret table() data:
pie(myTbl)
# ... we can improve this quickly with a bit of tweaking:
N <- length(myTbl)
sel <- myOr == names(myTbl) # TRUE for the MYSPE order, FALSE elsewhere
myCol <- rep(pCol[4], N) # N elements of pCol[1]
myCol[sel] <- pCol[1] # replace this one color
myLbl <- rep("", N) # N labels of ""
myLbl[sel] <- myOr # replace this one label with the MYSPE order
oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1)) # set margins to ~ 0
pie(myTbl,
labels = myLbl,
radius = 0.9,
main = "MYSPE order",
border = "#DDDDDD",
col = myCol,
clockwise = TRUE,
init.angle = 90)
par(oPar) # reset graphics state
# But the overall problem remains.
# == 4.3 Visualizing proportions: Nested squares ===========================
# A simple alternative is to draw such proportions as nested squares:
x <- sqrt(nFungi)
# set margins to ~ 0 and type to square
oPar <- par(mar = c(0.1, 0.1, 0.1, 0.1), pty = "s")
# empty, square plot
plot(c(0, x), c(0, x), xlim = c(0, x), ylim = c(0, x),
type="n", axes=FALSE, xlab="", ylab="")
# basic square for all genomes
rect(0, 0, x, x, col = pCol[4])
# grid
u <- 0:floor(x)
N <- length(u)
segments(rep(0, N), u, rep(x, N), u, col = "#0000FF18")
segments(u, rep(0, N), u, rep(x, N), col = "#0000FF18")
# each square on this grid is one genome
# colored squares
rect(0, 0, sqrt(nOrder), sqrt(nOrder), col = pCol[3])
rect(0, 0, sqrt(nGenus), sqrt(nGenus), col = pCol[2])
rect(0, 0, sqrt(nSpecies), sqrt(nSpecies), col = pCol[1])
# labels
text(x/2, x/2, "Fungi")
text(x * 0.08, x * 0.11, myOr, pos = 4, cex = 0.9)
text(x * 0.08, x * 0.06, myGn, pos = 4, cex = 0.8)
text(x * 0.08, x * 0.02, MYSPE, pos = 4, cex = 0.7)
par(oPar) # reset graphics state
# == 4.4 Visualizing proportions: Sankey diagrams ==========================
# Sankey diagrams are an excellent way to visualize complicated nested
# proportions and their changes (see here for example:
# https://www.r-graph-gallery.com/sankey-diagram.html). Here is a very simple
# example with the MYSPE proportions, as an illustration of the plotting
# principle.
if (! requireNamespace("plotly")) {
install.packages("plotly")
}
# Package information:
# library(help = plotly) # basic information
# browseVignettes("plotly") # available vignettes
# data(package = "plotly") # available datasets
# Here, we use the plotly package that wraps a very well developed javascript
# library with many options for interactive plots. I am producing this plot
# hard-coded for the sample organism "Sporothrix schenkii"; you would need
# to change the code to adapt it to your own MYSPE - or even build a function
# for this. Do try this if you have a bit of coding experience, sankey diagrams
# are a good way to show hierarchical data relations - and if you get this
# working for your own organism you can be proud that you have understood
# how preparing the data works.
myNodes <- list(label = c("Fungi (1014)", # 0 <- node ID
"Ophiostomatales (6)", # 1
"Other...", # 2
"Sporothrix (4)", # 3
"Other...", # 4
"Sporothrix schenckii (2)", # 5
"Other..." # 6
),
x = c(0.1, 0.4, 0.4, 0.7, 0.7, 1.0, 1.0),
y = c(0.3, 0.1, 0.7, 0.2, 0.7, 0.3, 0.7),
color = c("#f2f2f0", #
"#ffd5c4",
"#CCCCCC",
"#ff9582",
"#CCCCCC",
"#ed394e",
"#CCCCCC"
),
pad = 15,
thickness = 20,
line = list(color = "black",
width = 0.5))
myLinks <- list(source = c(0, 0, 1, 1, 3, 3), # i.e. there is a link of
target = c(1, 2, 3, 4, 5, 6), # weight 6 between node 0
value = c(6, 18, 4, 2, 2, 2)) # and node 1
# Setting up the actual plot ...
fig <- plotly::plot_ly(type = "sankey",
arrangement = "snap",
orientation = "h",
node = myNodes,
link = myLinks)
# Adding and adjusting a few layout parameters
fig <- plotly::layout(fig,
title = "Fungi Genomes - Classification",
font = list(size = 10))
fig # plot the diagram
# Note that the plot appears in the Viewer window, not the Plot window, and that
# it is interactive: you can hover over nodes and links, and drag the nodes
# around.
# [END]