# tocID <- "BIN-MYSPE.R" # # Purpose: A Bioinformatics Course: # R code accompanying the BIN-MYSPE unit # # # Version: 1.4 # # Date: 2017-09 - 2021-10 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # V 1.4 Add troubleshooting hints via errText[[...]] # V 1.3 2021 update of MYSPE mechanics; fix a bug no one had complained about # V 1.2 Reorganized proportional plot section into a "further reading" # section, added nested-box, and sankey plot visualization of # proportions. Introduced plotly. # V 1.1 2020 Workflow changes # V 1.0.1 Move ABC-makeMYSPElist.R to ./scripts directory # V 1.0 Final code, after rewriting BLAST parser and updating MYSPElist # V 0.1 First code copied from BCH441_A03_makeMYSPElist.R # # TODO: Sample solution for sankey plot function. # # # == HOW TO WORK WITH LEARNING UNIT FILES ====================================== # # DO NOT SIMPLY source() THESE FILES! # # If there are portions you don't understand, use R's help system, Google for an # answer, or ask your instructor. Don't continue if you don't understand what's # going on. That's not how it works ... # # ============================================================================== #TOC> ========================================================================== #TOC> #TOC> Section Title Line #TOC> ----------------------------------------------------------------- #TOC> 1 PREPARATIONS 52 #TOC> 2 SUITABLE MYSPE SPECIES 65 #TOC> 3 ADOPT "MYSPE" 89 #TOC> 4 FURTHER READING: PLOTTING PROPORTIONS 128 #TOC> 4.1 Percentages 146 #TOC> 4.2 Visualizing proportions: Pie chart 165 #TOC> 4.3 Visualizing proportions: Nested squares 243 #TOC> 4.4 Visualizing proportions: Sankey diagrams 280 #TOC> #TOC> ========================================================================== # = 1 PREPARATIONS ======================================================== # # Execute the two conditionals below: if (! file.exists("./myScripts/.myProfile.R")) { stop(errText[["noProfileFile"]]) # message defined in .Rprofile } if (! exists("myStudentNumber")) { stop(errText[["noStudentNumber"]]) # message defined in .Rprofile } # = 2 SUITABLE MYSPE SPECIES ============================================== # In this unit we will select one species from a list of genome sequenced fungi # and write it into your personalized profile file. This species will be called # "MYSPE" (My Species) for other learning units and exercises. # A detailed description of the process of compiling the list of genome # sequenced fungi with protein annotations and Mbp1 homologues is in the file # ./scripts/ABC-makeMYSPElist.R In brief, data for genome-sequenced fungi # was retrieved from https://fungi.ensembl.org; a search for homologues to # yeast Mbp1 was performed with BLAST at the NCBI, and the data was merged. # A representative organism at each genus-level was chosen from those hits # that actual;ly have a homologue. Finally, a mapping table was constructed to # asymmetrically retrieve unique species: a student number will retrieve # a species, but (public) knowledge of the species cannot reconstruct the # student number. # Task: Study ./scripts/ABC-makeMYSPElist.R, it implements a typical workflow # of selecting and combining data from various data resources. Studying # it will give you a better sense of how such workflows can be # implemented in practice. # = 3 ADOPT "MYSPE" ======================================================= # Execute: ( MYSPE <- getMYSPE(myStudentNumber) ) # If this produced an error, this session has not been properly set up. You # may not yet have run init() and edited .myProfile.R , or that file is not # in your myScripts/ folder. Fix this, and execute: # # source(".Rprofile") . # If this produced NA, your Student Number may not be correct, or you are not in # my class-list. Contact me. Otherwise, this should have printed a species name, # and the taxonomy ID of its genome-sequenced strain. This is your unique # speciesfor this course. Note it in your journal ... biCode(MYSPE) # and also note it's "BiCode" ... ( myTaxID <- names(MYSPE) ) # and its taxID # Task: # ===== # Note down the species name and its five letter BiCode on your Student # Wiki user page. Use this species whenever this or future assignments refer # to MYSPE. Whenever you start a session, it will automatically be loaded # from myScripts/.myProfile.R and is available as MYSPE . # Here is some more information about MYSPE, taken from the table of genome- # sequenced fungi that is in your ./data folder. fungiDat <- read.csv("data/Species.csv") iMs <- which(fungiDat$Taxon.ID == myTaxID) ( myOr <- fungiDat$Classification[iMs] ) # Taxonomic order ( myGn <- gsub("\\s.*", "", MYSPE)) # Taxonomic genus ( mySt <- fungiDat$Name[iMs] ) # Taxonomic strain # That's all. # = 4 FURTHER READING: PLOTTING PROPORTIONS =============================== # The material below is an exploration of data-preparation and plotting # techniques; you can treat this as additional practice and further reading and # I expect that some of the code and plotting examples may be useful in a # different context. # A frequent task is to visualize the proportion of elements with given # categories in a sample. For example, we might ask what the proportion of the # different orders of fungi is the order of MYSPE? Let's first collect the # numbers. ( nFungi <- nrow(fungiDat) ) # sequenced fungi ( nOrder <- sum(grepl(myOr, fungiDat$Classification)) ) # same order as MYSPE ( nGenus <- sum(grepl(myGn, fungiDat$Name)) ) # same genus as MYSPE ( nSpecies <- sum(grepl(MYSPE, fungiDat$Name)) ) # same species as MYSPE # == 4.1 Percentages ======================================================= # The zeroth-order approach to visualization is simply to print percentages: cat(sprintf("\n%s comprise %5.2f%% of fungi.", myOr, (nOrder * 100) / nFungi)) # ... or, adding the actual numbers: cat(sprintf("\n%s comprise %5.2f%% of fungi (%d of %d).", myOr, (nOrder * 100) / nFungi, nOrder, nFungi)) # But that's hard to visualize for most of us, and anyway, we don't know how # that relates to other orders. # == 4.2 Visualizing proportions: Pie chart ================================ # Often, we will use a pie chart instead. Pie charts are rather informal types # of plots, not well suited for analysis. But easy to do: # Define four colors to identify the four categories pCol <- c("#ed394e", "#ff9582", "#ffd5c4", "#f2f2f0") oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1)) # set margins to ~ 0 # and remember the # previous setting pie(c(nSpecies, # subtract numbers since these nGenus - nSpecies, # categories are mutually contained nOrder - nGenus - nSpecies, # in each other nFungi - nOrder - nGenus - nSpecies), labels = "", radius = 0.9, main = "MYSPE in genome-sequenced fungi", lty = 0, # turn borders for wedges off col = pCol, clockwise = TRUE, init.angle = 90) title(main=MYSPE, line=0, cex.main=0.7) # add a title to the plot legend(x = 0.95, y = 0.8, # place at legend here legend = c("Species", "Genus", "Order", "Fungi"), y.intersp = 2, # line spacing for labels cex = 0.8, # character size for labels bty = "n", # "no" box around the legend pt.cex = 2, # size of colour boxes pch = 15, # a filled square col = pCol) par(oPar) # reset graphics state # Unless MYSPE is one of the frequently sequenced species, there will only be a # very thin wedge visible. Pie charts are not well suited to visualize small # proportions. # It is a little more useful if we have non-nested proportions - like the # number of species in the same order overall: myTbl <- sort(table(fungiDat$Classification), decreasing = TRUE) head(myTbl) # pie() does a reasonable job out of the box to interpret table() data: pie(myTbl) # ... we can improve this quickly with a bit of tweaking: N <- length(myTbl) sel <- myOr == names(myTbl) # TRUE for the MYSPE order, FALSE elsewhere myCol <- rep(pCol[4], N) # N elements of pCol[1] myCol[sel] <- pCol[1] # replace this one color myLbl <- rep("", N) # N labels of "" myLbl[sel] <- myOr # replace this one label with the MYSPE order oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1)) # set margins to ~ 0 pie(myTbl, labels = myLbl, radius = 0.9, main = "MYSPE order", border = "#DDDDDD", col = myCol, clockwise = TRUE, init.angle = 90) par(oPar) # reset graphics state # But the overall problem remains. # == 4.3 Visualizing proportions: Nested squares =========================== # A simple alternative is to draw such proportions as nested squares: x <- sqrt(nFungi) # set margins to ~ 0 and type to square oPar <- par(mar = c(0.1, 0.1, 0.1, 0.1), pty = "s") # empty, square plot plot(c(0, x), c(0, x), xlim = c(0, x), ylim = c(0, x), type="n", axes=FALSE, xlab="", ylab="") # basic square for all genomes rect(0, 0, x, x, col = pCol[4]) # grid u <- 0:floor(x) N <- length(u) segments(rep(0, N), u, rep(x, N), u, col = "#0000FF18") segments(u, rep(0, N), u, rep(x, N), col = "#0000FF18") # each square on this grid is one genome # colored squares rect(0, 0, sqrt(nOrder), sqrt(nOrder), col = pCol[3]) rect(0, 0, sqrt(nGenus), sqrt(nGenus), col = pCol[2]) rect(0, 0, sqrt(nSpecies), sqrt(nSpecies), col = pCol[1]) # labels text(x/2, x/2, "Fungi") text(x * 0.08, x * 0.11, myOr, pos = 4, cex = 0.9) text(x * 0.08, x * 0.06, myGn, pos = 4, cex = 0.8) text(x * 0.08, x * 0.02, MYSPE, pos = 4, cex = 0.7) par(oPar) # reset graphics state # == 4.4 Visualizing proportions: Sankey diagrams ========================== # Sankey diagrams are an excellent way to visualize complicated nested # proportions and their changes (see here for example: # https://www.r-graph-gallery.com/sankey-diagram.html). Here is a very simple # example with the MYSPE proportions, as an illustration of the plotting # principle. if (! requireNamespace("plotly")) { install.packages("plotly") } # Package information: # library(help = plotly) # basic information # browseVignettes("plotly") # available vignettes # data(package = "plotly") # available datasets # Here, we use the plotly package that wraps a very well developed javascript # library with many options for interactive plots. I am producing this plot # hard-coded for the sample organism "Sporothrix schenkii"; you would need # to change the code to adapt it to your own MYSPE - or even build a function # for this. Do try this if you have a bit of coding experience, sankey diagrams # are a good way to show hierarchical data relations - and if you get this # working for your own organism you can be proud that you have understood # how preparing the data works. myNodes <- list(label = c("Fungi (1014)", # 0 <- node ID "Ophiostomatales (6)", # 1 "Other...", # 2 "Sporothrix (4)", # 3 "Other...", # 4 "Sporothrix schenckii (2)", # 5 "Other..." # 6 ), x = c(0.1, 0.4, 0.4, 0.7, 0.7, 1.0, 1.0), y = c(0.3, 0.1, 0.7, 0.2, 0.7, 0.3, 0.7), color = c("#f2f2f0", # "#ffd5c4", "#CCCCCC", "#ff9582", "#CCCCCC", "#ed394e", "#CCCCCC" ), pad = 15, thickness = 20, line = list(color = "black", width = 0.5)) myLinks <- list(source = c(0, 0, 1, 1, 3, 3), # i.e. there is a link of target = c(1, 2, 3, 4, 5, 6), # weight 6 between node 0 value = c(6, 18, 4, 2, 2, 2)) # and node 1 # Setting up the actual plot ... fig <- plotly::plot_ly(type = "sankey", arrangement = "snap", orientation = "h", node = myNodes, link = myLinks) # Adding and adjusting a few layout parameters fig <- plotly::layout(fig, title = "Fungi Genomes - Classification", font = list(size = 10)) fig # plot the diagram # Note that the plot appears in the Viewer window, not the Plot window, and that # it is interactive: you can hover over nodes and links, and drag the nodes # around. # [END]