# tocID <- "plottingReference.R" # # Purpose: Reference to graphical output in R. # # # Version: 2.1 # # Date: 2017 09 - 2021 09 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # V 2.1 First minor updates 2021 # V 2.0 Comprehensive reference with basic and advanced options based on # an integrated yeast gene expression dataset. Full rewrite of most # sections. # V 1.1 Stylistic improvements, code polish, and additional examples # V 1.0 Digest from "plotting reference" files # # ToDo: Demo 6.3.2 in parallel plots # # #TOC> ========================================================================== #TOC> #TOC> Section Title Line #TOC> ------------------------------------------------------------------- #TOC> 01 INITIALIZE 69 #TOC> 02 THIS REFERENCE ... 74 #TOC> 02.1 Dataset Documentation 81 #TOC> 03 PROPORTIONS AND DISTRIBUTIONS 193 #TOC> 03.1 barplot() 198 #TOC> 03.2 pie() 222 #TOC> 03.3 boxplot() 254 #TOC> 03.4 hist() 312 #TOC> 03.4.1 overlaying histograms 357 #TOC> 04 THE plot() FUNCTION 400 #TOC> 04.1 line plots 404 #TOC> 05 ENCODING INFORMATION: SYMBOL, SIZE, COLOuR 503 #TOC> 05.1 pch ("plotting character" symbols) 509 #TOC> 05.1.1 Line types 684 #TOC> 05.2 cex ("character expansion" size) 708 #TOC> 06 COLOUR 768 #TOC> 06.1 Colours by number 776 #TOC> 06.2 Colours by name 794 #TOC> 06.3 Colours as hex-triplets 820 #TOC> 06.3.1 Inbuilt palettes 875 #TOC> 06.3.2 Custom palettes 952 #TOC> 06.3.3 Transparency: The Alpha Channel 1002 #TOC> 06.4 abline(), lines() and segments() 1047 #TOC> 07 AXES 1081 #TOC> 08 LEGENDS 1118 #TOC> 08.1 basic legends 1121 #TOC> 08.2 Color bars 1125 #TOC> 09 LAYOUT 1246 #TOC> 10 TEXT 1281 #TOC> 11 DRAWING ON PLOTS 1309 #TOC> 12 IMAGES 1376 #TOC> 13 CONTOUR LINES 1381 #TOC> 14 3D PLOTS 1386 #TOC> 15 GRAPHS AND NETWORKS 1391 #TOC> 16 OTHER GRPAHICS PACKAGES 1396 #TOC> 17 INTERACTIVE PLOTS 1420 #TOC> 17.1 locator() 1424 #TOC> 17.2 plotly:: 1427 #TOC> #TOC> ========================================================================== # = 01 INITIALIZE ========================================================= SC <- readRDS("data/SC.rds") # <<<- execute this line first # = 02 THIS REFERENCE ... ================================================= # This script covers basic plotting and graphical output options of R. The # functions are demonstrated with an integrated data set of yeast gene # expression data and annotations. # == 02.1 Dataset Documentation ============================================ # You do not need to study the dataset in detail but it will be useful to refer # to this documentation from time to time to understand what data is being # plotted and what one can therefore learn from the results. # The integrated dataset SC is a list that contains an analysis of yeast gene # expression profiles originally published by Pramila et al. (2002; PMID: # 12464633) and accessible as GSE3653 on GEO # (https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE3635). This is a # high-resolution (5 min. interval) data set spanning two cycles of a # synchronized yeast culture. It includes 6228 expression profiles with 25 # time-points assigned to yeast systematic names. It is complemented with gene # annotations taken from SGD, and augmented with a set of GO slims. GO slims # were rerieved by navigating to https://www.yeastgenome.org/ and using the menu # to choose Function >> Gene Ontology >> GOslim Mapping File. This downloads # go_slim_mapping.tab which was further processed to annotate each gene with its # corresponding GO terms. To evaluate cyclical regulation, a curve-fitting # protocol was followed to fit a cyclical epxpression model with parameters # amplitude, phase, frequency, exponential damping, and baseline shift. After # fitting the profiles, correlations with the fitted model were calculated and # the first peak of the model was determined as a marker of when the expression # was ON in the cell-cycle. Finally, the expression peak markers were assigned # to seven phases of the cell cycle, and GO term enrichments were computed. A # A subset was selected with the following parameters: # selCCl <- # nlsParams$A > 0.075 & # reasonably high Amplitude # nlsParams$cor > 0.6 & # good correlation with the parametrized model # nlsParams$f > 0.75 & # period between 0.75 ... # nlsParams$f < 1.333 & # ... and 1.333 hours # nlsParams$k < 0.03 & # limiting the exponential damping # nlsParams$k > -0.001 # sum(selCCl) # 1,297 genes in the data set show some level # # of periodic expression-variation in the cell # # cycle. # # Thus the dataset comprises numeric and categorical data # Here are the data details: # SC$xpr : expression profiles. Numeric matrix with 1,297 rows of genes # ====== and 25 colums of time points in 5 minute # intervals # Example: # -------- # SC$xpr[169, 1:5] # > t.0 t.5 t.10 t.15 t.20 # > -0.055703429 -0.116124955 -0.135965733 -0.068665035 -0.001819363 # # # SC$mdl : parametrized model. Data frame with 1,297 rows of genes # ====== # SC$mdl$A : model parameter: amplitude (log ratio) # SC$mdl$phi : model parameter: phase (degrees) # SC$mdl$f : model parameter: 1/frequency (hours) # SC$mdl$k : model parameter: damping # SC$mdl$B : model parameter: baseline shift # SC$mdl$cor : correlation of observation and model # SC$mdl$peaks : timepoint of first expression peak of model (minutes) # # # Example: # -------- # SC$mdl[169, ] # > A phi f k B cor peaks # > 0.1007908 -81.04413 1.012092 0.00656052 -0.01857916 0.8437175 37.52217 # # # SC$ann : annotations. Data frame with 1,297 rows of genes # ====== # SC$ann$SGD : Saccharomyces Gene Database identifier # SC$ann$sysName : Yeast gene systematic name # SC$ann$stdName : The standard name under which the gene is known # SC$ann$alias : common alias name(s) # SC$ann$description : free-text description of the gene # SC$ann$GO : GO IDs of the SGD GO slim subset annotated to the gene # # Example: # -------- # SC$ann[169, ] # > SGD sysName stdName alias # > S000002214 YDL056W MBP1 transcription factor MBP1 # > description # > YDL056W Transcription factor; involved in regulation [...] # > GO # > YDL056W GO:0005634 GO:0003677 GO:0001071 GO:0000278 # # SC$phases # --------- # Data frame with a definition of yeast cell cycle phase names and # time-points derived from inspection of the SC$mdl$peaks distribution. # names start ends # 1 Sense 0 6 # 2 Prep 6 15 # 3 Replicate 15 30 # 4 Assemble 30 37 # 5 Segregate 37 43 # 6 Duplicate 43 55 # 7 Stabilize 55 65 # # SC$GO # ----- # Data frame with GO term information and GO term enrichment data # SC$GO$ontology : {C|F|P} identifying the "cellular component", "molecular # function" or "biological process" ontology # SC$GO$GOid : Gene Ontyology ID of the term # SC$GO$label : Short label of the term # SC$GO$tAll : Count of times the term is annotated in any gene # SC$GO$tCC : ... times term is annotated to a gene in this dataset # SC$GO$t... : ... annotated to one ofg the cell-cycle phases # SC$GO$xs... : ... enrichment factor in each cell cycle phase # # = 03 PROPORTIONS AND DISTRIBUTIONS ====================================== # Distributions of numeric variable characterize the values. Proportions compare values. A typical use case is to characterize a set of measurements, or compare several sets of measurements. # == 03.1 barplot() ======================================================== # Draws a bar with height proportional to a given number. barplot(mean(SC$xpr[ , "t.20"])) # barplot of the mean expression at t.20 # Barplots show only a single number. They tell us nothing about the underlying distribution. They are commonly used to compare several distributions: barplot(colMeans(SC$xpr[ , c("t.0", "t.20", "t.40", "t.60")])) # barplots take the usual plotting parameters myPal <- colorRampPalette(c("#00FF0055", "#0066FF55", # a "palette" function "#00FFFF55", "#CCCCAA55", "#FF00FF55", "#FF006655", "#FFFFFF55"), alpha = TRUE) # transparent colors barplot(colMeans(SC$xpr[ , 1:13]), # mean expression changes, first hour main = "Mean expression changes", xlab = "time points", cex.names = 0.6, # scale column names ylab = "Expression (log ratio)", cex.axis = 0.8, # scale of y-axis labels col = myPal(13)) # get 13 color values from myPal() # == 03.2 pie() ============================================================ # Generally not preferred, but simple to do table(SC$GO$ontology) # Example: how many genes are anotated to the GO terms in the Cellular Component # category? CCids <- SC$GO$GOid[SC$GO$ontology == "C"] # get GO ids for "C" ontology x <- unlist(strsplit(SC$ann$GO, " ")) # split GO term strings on blank-spaces # and dump them all into one single vector x <- x[x %in% CCids] # subset the ones in CCIDs (about 40%) pie(table(x), cex = 0.5) # This can be better visualized in a barplot myT <- sort(table(x), decreasing = TRUE) b <- barplot(myT, # assign the plot - we need the x-coordinates main = "Genes annotated to GO terms in the Cellular Component Ontology", cex.main = 0.8, names.arg = "", # blank the names - otherwise they are ylab = "counts", # taken from the names of the object cex.axis = 0.8, col = colorRampPalette(c("#CCAAFF", "#FFDDFF", "#FFFFFF"))(length(myT)), ylim = c(0, 1.5 * max(myT))) # make space at the top # Add the GOids as text() text(b, myT, # b holds the x coordinates, myT is y labels = names(myT), # take the names from the table - names srt = 90, # rotate to vertical adj = c(-0.1, 0.5), # align center, top cex = 0.5) # == 03.3 boxplot() ======================================================== # A boxplot() is almost always preferred to a barplot, since it includes an # estimate of the distribution: boxplot(SC$xpr[ , "t.20"]) # boxplot of expression values at t.20 # What are these elements? # Assigning the plot to a variable makes the numbers available so we can # use them for annotation: oPar <- par(mar = c(0.2,3,0.2,0.2)) # reduce the margins ( b <- boxplot(SC$xpr[ , "t.20"])) xT <- 1.3 cT <- "#DD0000" # outliers myOut <- mean(b$out[b$out > b$stats[5]]) # outliers above the whiskers myOut[2] <- mean(b$out[b$out < b$stats[1]]) # outliers below the whiskers text(1.04, myOut, adj=c(0,0.5), cex=0.7, col=cT, "outliers") # whiskers text(1.12, b$stats[1,1], adj=c(0,0.5), cex=0.7, col=cT, "min(x_i > Q1-1.5*IQR)") text(1.12, b$stats[5,1], adj=c(0,0.5), cex=0.7, col=cT, "max(x_i < Q3+1.5*IQR)") # quartiles # "min(x_i > Q1-1.5*IQR)" is "the smallest x in the data that still lies # within 1.5 times the inter-quartile range below Q1 text(1.23, b$stats[2,1], adj=c(0,0.5), cex=0.7, col=cT, "Q1 (lower quartile)") text(1.23, b$stats[4,1], adj=c(0,0.5), cex=0.7, col=cT, "Q3 (upper quartile)") # median text(1.23, b$stats[3,1], adj=c(0,0.5), cex=0.9, col=cT, "median") # the mean is not shown text(0.79, mean(SC$xpr[,"t.20"]), adj=c(1,0.5), cex=0.7, col="#999999", "mean") par(oPar) # reset the margins # If we have datasets with more columns, each column gets its own boxplot boxplot(SC$xpr[ , c("t.0", "t.20", "t.40", "t.60")]) # boxplots take the usual plotting parameters myPal <- colorRampPalette(c("#00FF0055", "#0066FF55", # a "palette" function "#00FFFF55", "#CCCCAA55", "#FF00FF55", "#FF006655", "#FFFFFF55"), alpha = TRUE) # transparent colors boxplot(SC$xpr[ , 1:13], # expression changes, first hour main = "Expression levels", xlab = "time points", cex.axis = 0.6, # scale axis names ylab = "Expression (log ratio)", col = myPal(13)) # get 13 color values from myPal() # This plot shows that there is a noticable fluctuation of expression levels # over the course of the first 60 minutes of the experiment. This may be due to # these genes being selected as having cyclically varying expression profiles. # == 03.4 hist() =========================================================== # Histograms are superbly informative and one of the workhorses of our analyses. A histogram answers the question how many observations do we have in a particular range of a (continuously varying) variable. # Here we plot a histogram of the first expression peaks of genes, computed as # the first peak of the fitted model for every gene in our set. We might be # interested in whether the expression of different genes peaks continuously # over the cycle, or whetehr there are time intervals where waves of expression # of different genes can be observed in response to cycle-control signals. hist(SC$mdl$peaks) # We can explicitly set breakpoints for the histogram: # Here we set breaks at 4 minute intervals ( myBreaks <- seq(0, 80, by = 4) ) # Note: 21 values bound 20 intervals hist(SC$mdl$peaks, breaks = myBreaks) # Assigning the output of hist() makes the values # used in constructing the histogram accessible: ( H <- hist(SC$mdl$peaks) ) # for example, we can use this to plot the actual numbers: text(H$mids, # midpoints of the bars H$counts, # counts -> probabilities labels = H$counts, # add labels to each bar: count numbers adj = c(0.5, -0.5), # text is centered on x and raised in y cex = 0.6, col = "#8888FF") # histograms take the usual plotting parameters myPal <- colorRampPalette(c("#00FF0055", "#0066FF55", # a "palette" function "#00FFFF55", "#CCCCAA55", "#FF00FF55", "#FF006655", "#FFFFFF55"), alpha = TRUE) # transparent colors myBreaks <- seq(0, 80, length.out = 81) # one-minute intervals hist(SC$mdl$peaks, # gene expression first peaks breaks = myBreaks, main = "Timing of first peaks of during the cell-cycle", sub = "The bimodal distribution distinguishes replication from division", cex.sub = 0.8, xlab = "t (min.)", ylab = "counts", col = myPal(81)) # get 81 color values from myPal() # === 03.4.1 overlaying histograms # Histograms can be plotted one-over another to compare them # Example: when do genes with different GO term annotations first peak? # GO:0005730 "nucleolus" # GO:0032200 "telomere organization" # GO:0006260 "DNA replication" sel1 <- grep("GO:0005730", SC$ann$GO) # 103 genes sel2 <- grep("GO:0032200", SC$ann$GO) # 32 genes sel3 <- grep("GO:0006260", SC$ann$GO) # 67 genes # define breaks. This is important to make the histograms comparable myBreaks <- seq(0, 80, by = 4) # plot the first histogram hist(SC$mdl$peaks[sel1], breaks = myBreaks, main = "Peak timing for different GO terms", xlab = "t (min)", ylab = "Counts", ylim = c(0, 50), col = "#FF007955") # first overlay hist(SC$mdl$peaks[sel2], breaks = myBreaks, col = "#00B2FF55", add = TRUE) # second overlay hist(SC$mdl$peaks[sel3], breaks = myBreaks, col = "#7FDFC955", add = TRUE) # Legend legend("topright", fill = c("#FF007955", "#00B2FF55", "#7FDFC955"), legend = c("GO:0005730\n(nucleolus)\n", # "\n" is a line-break "GO:0032200\n(telomere organization)\n", "GO:0006260\n(DNA replication)\n"), cex = 0.6, bty = "n") # no box around the legend # = 04 THE plot() FUNCTION ================================================ # plot() is the workhorse of data visualization in R. But plot() is also a "generic" - i.e. different "classes" can define their own plot-methods and plot will "dispatch" the right method for a class. # == 04.1 line plots ======================================================= # Lines are useful to visually associate ralated data points, like points in a # sequence of events. # plot an expression profile: value over time plot(SC$xpr[169, ]) # note the x is implied and plotted as "index", the expression values are used # for the y-axis by default. # plot types plot(SC$xpr[169, ], type = "p") # points, the default plot(SC$xpr[169, ], type = "l") # lines plot(SC$xpr[169, ], type = "b") # both points and lines plot(SC$xpr[169, ], type = "c") # empty points and lines plot(SC$xpr[169, ], type = "o") # overplotted plot(SC$xpr[169, ], type = "s") # steps plot(SC$xpr[169, ], type = "h") # like histogram plot(SC$xpr[169, ], type = "n") # none - useful to create an empty frame # to add other elements later text(SC$xpr[169, ], colnames(SC$xpr), cex = 0.5) # ... such as labels # title and axis labels iRow <- 169 t <- seq(0, 120, by = 5) # set actual x-values plot(t, SC$xpr[iRow, ], type = "b", main = "Expression profile", sub = "Data: GSE3653 (Pramila et al. 2002)", xlab = "time(min)", ylab = "Expression (log ratio)") # adjusting font sizes plot(t, SC$xpr[iRow, ], type = "b", main = "Expression profile", sub = "Data: GSE3653 (Pramila et al. 2002)", xlab = "time(min)", ylab = "Expression (log ratio)", cex.main = 1.1, # title cex.sub = 0.6, # subtitle cex.axis = 0.7, # axis values cex.lab = 0.8) # axis labels # x and y axis ranges can be defined with a two-element vector plot(t, SC$xpr[iRow, ], type = "b", xlim = c(0, 120), # tight against the values ylim = c(-max(abs(SC$xpr[iRow, ])), max(abs(SC$xpr[iRow, ]))), # symmetric around 0 main = "", sub = "", xlab = "", ylab = "") abline(h = 0, col = "#FF000044") # pull in annotations with sprintf() plot(t, SC$xpr[iRow, ], type = "b", main = sprintf("Expression profile: %s (%s)", SC$ann$stdName[iRow], SC$ann$sysName[iRow]), sub = "", xlab = "", ylab = "") # When exploring data, it is useful to have a standard view of data as a # function: xprPlot <- function(iRow) { t <- seq(0, 120, by=5) yMax <- max(abs(SC$xpr[iRow, ])) * 1.1 plot(t, SC$xpr[iRow, ], ylim = c(-yMax, yMax), type = "b", main = sprintf("Expression profile: %s (%s)", SC$ann$stdName[iRow], SC$ann$sysName[iRow]), xlab = "time(min)", ylab = "Expression (log ratio)", cex.main = 1.1, cex.axis = 0.7, cex.lab = 0.8) abline(h = 0, col = "#FF000044") } xprPlot(169) xprPlot(1124) # better to visualize by plotting one OVER the other. # This is done with the function points() xprPlot(1124) points(t, SC$xpr[169, ], type = "b", cex = 0.7, col = "#EE5500CC") # In general, when we plot(), we produce a "scatterplot" of X and y values. # scatterplot: one against the other plot(SC$xpr[169, ], SC$xpr[1124, ]) # these two profiles are anticorrelated # = 05 ENCODING INFORMATION: SYMBOL, SIZE, COLOuR ========================= # We have many options to encode information in scatter plots (and others), in order to be able to visualize and discover patterns and relationships. The most important ones are the type of symbol to use, its size, and its color. # == 05.1 pch ("plotting character" symbols) =============================== # There are many types of symbols available to plot points and they can all be varied by type, size, and color to present additional information. Which one to use is defined in the argument pch . The default is pch = 1: the open circle. Characters 1:20 are regular symbols: # reduce margins oPar <- par(mar = c(1, 1, 1, 1)) # Empty plot frame ... plot(c(0,11), c(0,11), type = "n", axes = FALSE, xlab = "", ylab = "") # coordinates for first 25 symbols x1 <- c(1:10, 1:10) y1 <- c(rep(10, 10),rep(9, 10)) myPch <- 1:20 points(x1, y1, pch = myPch, cex = 1.2) # pch 21:25 can have different border and fill colours. Borders are defined with # "col", fills are defined with "fill": x2 <- 1:5 y2 <- rep(8,5) myCols <- c("#4B0055", "#00588B", "#009B95", "#53CC67", "#FDE333") points(x2, y2, pch=21:25, col="#708090", bg = myCols, cex = 1.8) # ten more symbols are defined as characters x3 <- 1:10 y3 <- rep(6,10) myPch <- c(".", "o", "O", "0","a","A", "*", "+","-","|") points(x3, y3, pch = myPch) # note: "myPCh" is a character vector while # the other pch were identified by integers # The ASCII codes for characters 32 to 126 can also be used as plotting symbols myPch <- 32:126 x4 <- (((myPch - 32) %% 19) + 2) / 2 y4 <- 5 - ((myPch - 32) %/% 19) points(x4, y4, pch=32:126, col="#0055AA") # I don't think I have ever actually used characters as plotting characters in # this way; it is much more straightforward to just use the text() function in # that case, rather than having to look up which ASCII code is which character, # and remembering which ones are available as numbers, and which ones are # identified by the character itself. But the first twenty five I use a lot, and # I also end up looking up their codes a lot. So here is a function to plot them for reference. You can paste it into your .Rprofile to keep it at hand: pchRef <- function() { # a reference plot for the first twenty five plotting characters oPar <- par(mar = c(1, 1, 1, 1), mfrow = c(1, 1)) plot(c(0,6), c(0,6), type = "n", axes = FALSE, xlab = "", ylab = "") x <- ((0:24) %% 5) + 1 y <- rep(5:1, each = 5) idx <- 1:20 points(x[idx], y[idx], pch = idx, cex = 2) idx <- 21:25 points(x[idx], y[idx], pch = idx, cex = 2, col = "#708090", bg=c("#CC0033AA", "#991566AA", "#662A99AA", "#323FCCAA", "#0055FFAA")) text(x - 0.3, y, labels = sprintf("%d:", 1:25), col = "#7766AA", cex = 0.8) par(oPar) } # reset margins par(oPar) # In addition to the default fonts, a large set of Hershey vector fonts is # available which gives access to many more plotting and labeling options via # text() demo(Hershey) # Plotting other symbols: # In the most general way, Unicode characters can be plotted as text. # The code is passed in hexadecimal, long integer, with a negative sign. # Here is a quarter note (Unicode: 266a) using plot() plot(0.5,0.5, pch=-0x266aL, cex=5, xlab="", ylab="") # However, rendering may vary across platforms since it depends on unicode # support. If your character can't be rendered, you'll only get an empty box. # 1.1.1 Using pch to emphasize different categories # In general, when we plot(), we produce a "scatterplot" of X and y values. # scatterplot: one against the other tRep <- c("t.15", "t.20", "t.25", "t.30") # Replication-phase time points tDup <- c("t.40", "t.45", "t.50", "t.55") # Duplication-phase time points plot(rowMeans(SC$xpr[ , tRep]), rowMeans(SC$xpr[ , tDup])) # select top ten replication enriched GO terms sel <- order(SC$GO$xsReplicate, decreasing = TRUE)[1:10] SC$GO$label[sel] # what are these? GOrep <- SC$GO$GOid[sel] # GO terms that are enriched in the Replication phase sel <- order(SC$GO$xsDuplicate, decreasing = TRUE)[1:10] SC$GO$label[sel] # what are these? GOdup <- SC$GO$GOid[sel] # GO terms that are enriched in the Duplication phase # define sets of genes that are annotated to one or the other GO id set repGenes <- logical(nrow(SC$ann)) # prepare an empty vector for (id in GOrep) { # whenever we find one of the repGenes <- repGenes | grepl(id, SC$ann$GO) # characteristic GO IDs annotated } # to a a gene, we put TRUE in its sum(repGenes) # spot of the repGenes vector. xRep <- rowMeans(SC$xpr[ repGenes, tRep]) # repGenes in the replication phase yRep <- rowMeans(SC$xpr[ repGenes, tDup]) # repGenes in the duplication phase # same thing for the genes that are enriched in the duplication phase dupGenes <- logical(nrow(SC$ann)) for (id in GOdup) { dupGenes <- dupGenes | grepl(id, SC$ann$GO) } sum(dupGenes) xDup <- rowMeans(SC$xpr[ dupGenes, tRep]) # dupGenes in the replication phase yDup <- rowMeans(SC$xpr[ dupGenes, tDup]) # dupGenes in the duplication phase # Ready to plot: # base plot without the ref... and dup... genes basePlot <- function(...) { sel <- ! (repGenes | dupGenes) m <- sprintf("Expression changes between phases (%d genes)", nrow(SC$xpr)) plot(rowMeans(SC$xpr[ sel, tRep]), rowMeans(SC$xpr[ sel, tDup]), main = m, xlab = "Expression in Replication phase", ylab = "Expression in Duplication phase", cex.main = 0.9, cex.lab = 0.8, ...) abline(h = 0, col = "#0088FF44") # horizontal zero abline(v = 0, col = "#0088FF44") # vertical zero } basePlot() # plotting the rep.. and dup... sets with different pch points(xRep, yRep, pch=24, col="#000000", bg="#FFFFFF") # white up triangle points(xDup, yDup, pch=25, col="#000000", bg="#FFFFFF") # white down triangle # These are clearly different distributions, but the plot is overall too busy. # Using color improves the plot: basePlot(pch=21, col="#AAAAAA", bg="#DDDDDD") points(xRep, yRep, pch=24, col="#000000", bg="#66DDFF") points(xDup, yDup, pch=25, col="#000000", bg="#FF77FF") # Even better may be to use transparent colors, to address the overlap: basePlot(pch=19, col="#00000022") points(xRep, yRep, pch=19, col="#33BBFF66") points(xDup, yDup, pch=19, col="#FF44AA44") # this needs a legend: legend ("bottom", legend = c( sprintf("%d Genes with Replication-\nenriched GO terms\n ", sum(repGenes)), sprintf("%d Genes with Duplication-\nenriched GO terms\n ", sum(dupGenes)), "Other Genes\n " ), horiz = TRUE, pch = 19, col = c("#33BBFF66", "#FF44AA44", "#00000022"), cex = 0.6, pt.cex = 1.0, inset = 0.02, box.col = "#DDDDDD", bg = "#FFFFFF") # === 05.1.1 Line types # Basically all plots take arguments lty to define the line type, and lwd # to define line width # empty plot ... plot(c(0,10), c(0,10), type = "n", axes = FALSE, xlab = "", ylab = "") # Line type for (i in 1:8) { y <- 10.5-(i/2) segments(1,y,5,y, lty=i) text(6, y, paste("lty = ", i), col="grey60", adj=0, cex=0.75) } # Line width for (i in 1:10) { y <- 5.5-(i/2) segments(1,y,5,y, lwd=(0.3*i)^2) text(6, y, paste("lwd = ", (0.3*i)^2), col="grey60", adj=0, cex=0.75) } # == 05.2 cex ("character expansion" size) ================================= # The size of characters can be controlled with the cex parameter. cex takes a # vector of numbers, mapped to the vecors of plotted elements. The usual R # conventions of vector recycling apply, so if cex s only a single number, it is # applied to all plotted elements. Here is an example plotting expression # amplitudes in a comparison of pre-replication vs. replication phase # expression. tPre <- c("t.0", "t.5", "t.10") # Pre replication-phase time points tMid <- c("t.25", "t.30", "t.35") # Midway between replication and duplication x <- rowMeans(SC$xpr[ , tPre]) # row means average out fluctuations y <- rowMeans(SC$xpr[ , tMid]) plot(x, y) myAmps <- SC$mdl$A # fetch the correlations of the fitted # low correlations are not well modeled # with a periodic function hist(myAmps, breaks = 40) # examine the distribution hist(log(myAmps), breaks = 40) # sometimes log() is be better suited # to map to symbol size # useful cex sizes are approximately between 0.5 and 5 # Example: N <- 10 cexMin <- 0.5 cexMax <- 5.0 myCex <- seq(cexMax, cexMin, length.out=N) # Vector of cex values of length N # We plot these points from largest to smallest, so we don't overplot x1 <- runif(N) y1 <- runif(N) plot(x1, y1, xlim = 0:1, ylim = 0:1, xlab = "", ylab = "", pch = 16, cex = myCex, col= colorRampPalette(c("#F4F0F8", "#3355DD"))(N)) # Note that the smallest cex at 0.5 is _really_ small. points(x1[N], y1[N], cex = 3.0, col = "#FF000077") # Here it is! # To rescale a vector into the desired interval (lo, up) we write a little # function. First we normalize the vector into the interval (0, 1), then we # multiply it by up-lo, and add lo. # reScale <- function(x, lo = 0, up = 1) { return((((x-min(x)) / (max(x)-min(x))) * (up-lo)) + lo ) } # with this, our coding amplitudes as plot character size becomes quite easy: plot(x, y, pch=16, cex = reScale(myAmps, lo=0.5, up=5), # <<<---- That's all we need col="#4499BB22") abline(a = 0, b = 1, col = "#DD333344") abline(h = 0, col = "#33333344") abline(v = 0, col = "#33333344") # ToDo: add titles, scale legend, and interpretation # = 06 COLOUR ============================================================= # Colour is immensely important for data visualization, and colours in R can be # specified by number, by name, as hex-triplets, as rgb or hsv values, and # through several color-space specifications. The can be used individually, as # vectors that are defined manually, and through colour palettes. And the can be # solid and transparent. # == 06.1 Colours by number ================================================ # The col=... parameter for plots is 1 by default and you can # set it to the range 0:8. # 0: white # 1: black (the default) # 2: red # 3: green # 4: blue # 5: cyan # 6: magenta # 7: yellow # 8: grey barplot(rep(1,9), col=0:8, axes=FALSE, names.arg=c(0:8)) # As you can see, the default primary colours are garish and offend even the # most rudimentary sense of aesthetics. Using these colors won't earn you # respect. Fortunately we have many more sophisticated ways to define colours. # == 06.2 Colours by name ================================================== # You may have noticed that "red", "green", and "blue" work for the col=... # parameter, but you probably would not have imagined that "peachpuff", # "firebrick" and "goldenrod" are valid as well. In fact, there are 657 named # colours in R. Access them all by typing: colors() myCols <- c( "firebrick2", "tomato", "goldenrod1", "peachpuff", "papayawhip", "seashell", "whitesmoke" ) pie(c(1, 1, 2, 3, 5, 8, 13), col = myCols, labels = myCols) # Read more about named colours (and related topics) in Melanie Frazier's pdf: # https://www.nceas.ucsb.edu/sites/default/files/2020-04/colorPaletteCheatsheet.pdf # I almost never use named colours anymore but use hex-triplets instead... # == 06.3 Colours as hex-triplets ========================================== # Hex triplets in R work exactly as in HTML: a triplet of # RGB values in two-digit hexadecimal representation. The # first two digits specify the red value, the second two # are for green, then blue. R accepts a fourth pair of # digits to optionally specify the transparency - the "Alpha" channel. The # semantics of the hex-triplet is thus "#RRGGBB" or "#RRGGBBAA". # Read more e.g. at http://en.wikipedia.org/wiki/Web_colors # The function col2rgb() converts colour names to rgb values ... col2rgb("violetred") # ... and rgb() converts rgb values to hex-code: rgb(1, 0.5, 0.23) # to get hexcodes from names takes a bit more: divide and transpose - rgb(t(col2rgb("violetred")/255)) # Here is a utility function to convert all manners of color-vectors to # hex-triplets. You can copy it into your .Rprofile if you want to keep it at hand. col2hex <- function(colV, dput = FALSE, pal = FALSE, N = NULL) { # colV a vector or scalar of color names, hex-codes or integers # dput logical. If TRUE, character output is suitable for pasting into code. # pal logical. If TRUE, the colors will be used in a palette. N must # be defined in that case # value: a vector of hex-codes of the same length, or a dput() string h <- rgb(t(col2rgb(colV) / 255)) if (pal == TRUE) { if (is.null(N)) { stop("N must be defined if pal is TRUE.") } h <- colorRampPalette(h)(N) } if (dput == TRUE) { dput(h) return(invisible(h)) } return(h) } # Examples: myCols <- c( "firebrick2", "tomato", "goldenrod1", "peachpuff", "papayawhip", "seashell", "whitesmoke") col2hex(myCols) col2hex(myCols, dput = TRUE) # barplot with the original colors barplot(rep(1, length(myCols)), col=myCols) # expanding the palette with col2hex() N <- 40 barplot(rep(1, N), col = col2hex(myCols,pal=TRUE,N=N) ) # === 06.3.1 Inbuilt palettes # In R, a palette is a function (!) that takes a number as its argument and # returns that number of colors, those colors can then be used to color points # in a plot or other elements. In principle, such colors can serve three # distinct puposes: # Qualitative colors have a different color for different types of entities. # Here the color serves simply to distinguish the types, it represents # "categorical information". For example, we can assign different colors to each # individual amino acid. # Sequential colors distinguish order in a sequence: the colors distinguish the first from the last element in the order. For example we often color the N-terminus of a protein blue(ish) and the C-terminus red(dish). # Diverging colors encode some property and are scaled to represent particular # values of the property. For example we might represent temperature with a # black-body radiation color palette. Note that "sequential" can be seen as # "diverging on order". # R has are a number of inbuilt palettes; Here is a convenience function to view # palettes. Input can be either a palette, the name of a hcl.colors palette, or a vector of colors: plotPal <- function(pal, N = 40) { if (is.character(pal)) { if (length(pal) == 1) { # name of a hcl palette myCols <- hcl.colors(N, pal) m <- pal } else { # already a color vector myCols <- colorRampPalette(pal)(N) m <- sprintf("Custom palette (spread to %d values)", N) } } else { # palette function myCols <- pal(N) m <- as.character(substitute(pal)) } barplot(rep(1,N), main = m, axes = FALSE, col = myCols) } plotPal(rainbow) # rainbow() is the quintessential qualitative palette plotPal(cm.colors) # cm.colors() is sequential: cyan-white-magenta plotPal(terrain.colors) # a diverging palette: map elevation colors # A lot of thought has gone into the construction of palettes in the # colorspace:: package, now available as hcl.colors in R (hcl: hue, chroma, # luminance), which is a better way to specify perceptually equidistant colors. # It's worthwhile to read through the help-file for more information - ?hcl.colors # ... and to sample the 110 available palettes: hcl.pals() # Examples: plotPal("Viridis") plotPal("Pastel 1") plotPal("Spectral") plotPal("OrRd") plotPal("Cold") plotPal("Mint") plotPal("Berlin") # If you are curious, you can execute the code below, then select the line that # draws the plot and press ctrl+enter repeatedly to step through the entire # set of palettes. # i <- 0 # plotPal(hcl.pals()[( i<-i+1 )]) # Worthy of special mention are the color-Brewer palettes, in particular for # their accessibility considerations. Do consider that not all of us view colors # in the same way! if (! requireNamespace("RColorBrewer", quietly = TRUE)) { install.packages("RColorBrewer") } ?RColorBrewer RColorBrewer::display.brewer.all(colorblindFriendly = TRUE) plotPal(RColorBrewer::brewer.pal(11, "PuOr"), N = 11) # === 06.3.2 Custom palettes # # Bespoke palettes are easily created with colorRampPalette(). The function # returns a palette, i.e. a function (not a vector of colors). You assign the # function to a variable (your palette), and you call it with an argument of how # many color values you want it to return. myPal <- colorRampPalette(c("#000000", "#FFFFFF", "#FF0000")) myPal(10) plotPal(myPal) # The parameter "bias" controls the spacing of colors at the end of the palette: plotPal(colorRampPalette(c("#000000", "#FFFFFF", "#FF0000"), bias = 0.5)) plotPal(colorRampPalette(c("#000000", "#FFFFFF", "#FF0000"), bias = 1.0)) plotPal(colorRampPalette(c("#000000", "#FFFFFF", "#FF0000"), bias = 2.0)) # But a similar effect can be obtained by duplicating one or more colours to # spread the range ... plotPal(colorRampPalette(c("#000000", "#FFFFFF", "#FF0000", "#FF0000"))) # ... and this can also be used to spread the middle plotPal(colorRampPalette(c("#000000", "#FFFFFF", "#FFFFFF", "#FF0000"))) # nicer pastels myPastels <- colorRampPalette(c("#FFFFFF", "#FF9AA2", "#FFB7B2", "#FFDAC1", "#E2F0CB", "#B5EAD7", "#C7CEEA", "#FFFFFF")) plotPal(myPastels) # outliers myOutliers <- colorRampPalette(c("#66666A", "#9999A6", "#CCCCCA", "#EEEEF9", "#FFBBA8", "#F30000"), bias = 0.5) plotPal(myOutliers) # Other many sources on the Web that help to generate matching colors upon which # to build palette. One such site with a very large selection of # community-contributed palettes is: https://color.adobe.com/ ... and there is # also an option to extract palettes from user-supplied images. # === 06.3.3 Transparency: The Alpha Channel # R colours are actually specified as quartets: the fourth value # the "Alpha channel" defines the transparency. Setting this to # values other than "FF" (the default) can be useful for very # crowded plots, or for creating overlays. x <- SC$xpr[, "t.10"] # towards the beginning of the cycle y <- SC$xpr[, "t.50"] # towards the end of the cycle # compare: plot(x,y, pch = 19, col = "#EE3A8C") plot(x,y, pch = 19, col = "#EE3A8C12") # Alpha at ~ 10% # or with multiple overlays of varying size using points() ... plot( x,y, pch = 16, cex = 1, col = "#AA330009") points(x,y, pch = 19, cex = 2, col = "#44558803") points(x,y, pch = 20, cex = 0.5, col = "#EE3A8C08") # A similar behaviour can be obtained from "density adapted colors with the # densCols() function plot (x, y, col = densCols(x, y), pch = 19, cex = 1.5) # ... or with the smootScatter function smoothScatter(x, y) smoothScatter(x, y, colramp = colorRampPalette(c("#66666A", "#9999A6", "#CCCCCA", "#EEEEF9", "#FFBBA8", "#F30000"), bias = 0.5), nrpoints = 0, xlim = c(-0.4, 0.3), ylim = c(-0.4, 0.3), xlab = "Expression (t = 10 min.)", ylab = "Expression (t = 50 min.)") abline(h = 0, col = "#FFFFFF", lwd = 0.5) abline(v = 0, col = "#FFFFFF", lwd = 0.5) # ToDo: expand to "real" density plots (cf. ) # == 06.4 abline(), lines() and segments() ================================ # lines() draws an arbitrary line from the supplied points N <- 300 x <- seq(0.8 * pi, 40 * pi, length.out = N) y <- sin(x)/x plot(x, y, type = "n") # only set up the axes abline(h = 0) lines(x, y, lwd = 4, col = "#FFFFFF") # "erase" parts of the axis lines(x, y, col = "#CC0000") # draw the line we want # Varying the color of a plotted line can't be done in base r. Use # ggplot2:: or plotrix::color.scale.lines() instead if (! requireNamespace("plotrix", quietly = TRUE)) { install.packages("plotrix") } # plotrix::color.scale.lines() can change color as well as width of a line # to give you more ways to visually display features of your data N <- 300 x <- seq(0.8 * pi, 40 * pi, length.out = N) y <- sin(x)/x myCol <- myPastels(N) # colors myLwd <- seq(0.5, 10, length.out = N) # line width plot(x, y, type = "n") # only set up the axes abline(h = 0, lwd = 2, col = "#CCCCCC") plotrix::color.scale.lines(x, y, col = myCol, lwd = myLwd) # = 07 AXES =============================================================== # For Details, see: ?plot.default n <- 1000 x <- rnorm(n) y <- x^3 * 0.25 + rnorm(n, sd=0.75) plot(x,y) # Default # Axes plot(x,y, xlim=c(-4, 4)) # fixed limits plot(x,y, xlim=c(-4, 4), ylim=c(10, -10)) # reverse is possible plot(x,y, log="xy") # log axes # The axis parameters in the default plot are limited. # If you want more control, suppress the printing of an axis # in the plot and use the axis() function instead. ?axis # Axis-labels and title are straightforward parameters of plot plot(x,y, xlab="rnorm(n)", ylab="x^3 * 0.25 + rnorm(n, sd=0.75)", cex.main=1.3, main="Sample\nPlot", cex.sub=0.75, col.sub="grey", sub="Scatterplot of noisy 3d-degree polynomial" ) # Add gridlines ?grid grid() # = 08 LEGENDS ============================================================ # == 08.1 basic legends ==================================================== # # ToDo: basic legend syntax # == 08.2 Color bars ======================================================= # Legends for a plot with a sequential or divergent spectrum are a special case, since we need to map a continuum of colors to a scale # Example: plotting expression levels in two cell-cycle phases against each other, and applying a divergent color spectrum that displays the model correlations: genes with low correlations have expression profiles that can not be well approximated with a periodic function. tRep <- c("t.15", "t.20", "t.25") # Replication-phase time points tDup <- c("t.40", "t.45", "t.50") # Duplication-phase time points x <- rowMeans(SC$xpr[ , tRep]) y <- rowMeans(SC$xpr[ , tDup]) plot(x, y) myCors <- SC$mdl$cor hist(myCors, breaks = 40) # The values are of course continuous, but we need to map them to a discrete set of colors. The strategy is as follows: # - scale the values we wish to map into the interval [0, 1] # - multiply with the desired number of intervals minus one, round, and add one # - these integers can be used as indices into a vector of colors, ... # - ... which gives the colors for the plot # A divergent palette corPal <- colorRampPalette(c("#90005DAA", "#D44F99AA", "#F4C2B8AA", "#F8DEB9AA", "#EEFAE9AA", "#D7F1C8AA", "#BBE9C4AA", "#7CB8A6AA"), alpha = TRUE) hist(myCors, breaks = seq(min(myCors), max(myCors), length.out = 21), col = corPal(20)) N <- 10 # We'll map into N intervals #$$$ # The usual way to scale a vector x to the [0,1] interval # is: x-min / max-min idx <- (myCors - min(myCors)) / (max(myCors) - min(myCors)) # Now: expand this to N-1, round, and add 1 idx <- round(idx * (N-1)) + 1 # confirm that these numbers are now suitable as indices into an 1:N vector table(idx) # create the color vector ( corCols <- corPal(N) ) # and make a vector of colors, one for each gene myCols <- corCols[idx] # then we plot ... plot(x, y, pch = 16, col = myCols, cex = 1.2) # there is a lot of overlap - let's re-plot in order of decreasing correlations # so the low correlations (reds) get plotted last. Ordering this way will give a # "cleaner" appearance for divergent plots. ord <- order(myCors, decreasing = TRUE) plot(x[ord], y[ord], pch = 16, col = myCols[ord], cex = 1.2) #: Titles etc. basePlot <- function() { m <- sprintf("Expression changes between phases (%d genes)", length(x)) plot(x[ord], y[ord], main = m, xlab = "mean Expression in Replication phase", ylab = "mean Expression in Duplication phase", sub = paste("Coloured by Correlation of", "Expression Profile (xpr) to", "Fitted Periodic Function (mdl)."), cex.main = 0.9, cex.lab = 0.7, cex.sub = 0.7, pch = 16, col = myCols[ord], cex = 1.2) abline(h = 0, col = "#0088FF44") # horizontal zero abline(v = 0, col = "#0088FF44") # vertical zero } basePlot() # to print a color-bar legend, we use the plotrix package if (! requireNamespace("plotrix", quietly = TRUE)) { install.packages("plotrix") } corLabels <- sprintf("%4.2f", # craft the labels to plot seq(min(myCors), max(myCors), length.out = length(corCols))) oPar <- par(mar=c(5,4,4,7)) # more space on the margin par("xpd" = FALSE) basePlot() # we need to figure out where to put the legend. par("usr") gives us the coordinates of the last plot window. We'll space the legend between 1.05 and 1.1 times the plot width, and centred along the y-axis with a height of 0.8 times the plot height. dx <- par("usr")[2] - par("usr")[1] dy <- par("usr")[4] - par("usr")[3] xl <- par("usr")[2] + (dx * 0.05) # left xr <- xl + (0.05 * dx) # right yb <- par("usr")[4] - (0.9 * dy) # bottom yt <- yb + (0.8 * dy) # top plotrix::color.legend(xl, yb, xr, yt, corLabels, corCols, align = "rb", # labels to the right of the bar cex = 0.8, gradient = "y") # vertical gradient # Almost perfect. Except we need a title for the color bar. # in this case the title is "r:" ... but the "r" should be in italics. par("xpd" = TRUE) # enable plotting into the margin text(xr + (0.01 * dx), yt + 0.015 * dy, expression(italic(r)["xpr,mdl"]), adj = c(0, 0), cex = 0.9) par(oPar) # ToDo - this is a common plot prototype - cast it into a function # = 09 LAYOUT ============================================================= # par, lattice, constant aspect ratio # Most parameters of the plot window can be set via # the functions plot(), hist() etc., but some need to # be set via the par() function. Calling par() without # arguments lists the current state of the plotting # parameters. Calling it with arguments, returns the # old parameters and sets new parameters. Thus setting # new parameters and saving the old ones can be done # in one step. The parameters that have to be set via # par include: # - multiple plots in one window (mfrow, mfcol, mfg) # - margin layout (mai, mar mex, oma, omd, omi) # - controlling position and size of a plot in the figure (fig, plt, ps, pty) # - see ?par for details. n <- 1000 x <- rnorm(n) y <- x^3 * 0.25 + rnorm(n, sd=0.75) # set window background and plotting axes via par opar <- par(bg="steelblue", fg="lightyellow") # set axis lables and titles via plot parameters plot(x,y, col.axis="lightyellow", col.lab="lightyellow") par(opar) # reset to old values plot(x,y) # confirm reset # aspect! ... # # = 10 TEXT =============================================================== # ToDo: sprintf() # Plotting arbitrary text # use the text() function to plot characters and strings to coordinates ?text # Example: add labels to the symbols # first set: plain symbols (1 to 20) text(x1-0.4, y1, paste(1:20), cex=0.75) # symbols with separate background (21 to 25) text(x2-0.4, y2, paste(21:25), cex=0.75) # third set: special characters, change font for clarity text(x3-0.4, y3, extra, col="slateblue", cex=0.75, vfont=c("serif", "plain")) # ToDo: writing into the margin ... # par("mar") # par("xpd") # par("srt") # # ToDo: plotmath() # demo(plotmath) # ToDo: expression() # = 11 DRAWING ON PLOTS =================================================== # To plot lines on top of plots there are several options: # segments() # for lines with known endpoints # lines() # joining x,y coordinate lists with lines # arrows() # ... but to get a filled arrow use polygon() # curve() # drawing a curve based on a given expression # abline() # drawing horizontal or vertical lines or lines with # # given slope and intercept () # Example: dividing a plot into 60° regions, centred on a point. # A general approach to "lines" on a plot is provided by segments(). # However in this special case one can use abline(). # We have to take care though that the aspect ratio for the # plot is exactly 1 - otherwise our angles are not right. # Therefore we need to set the asp parameter for plots. # For a general sketch # - we plot the frame a bit larger, don't draw axes # - draw the ablines # - draw two arrows to symbolize the coordinate axes p <- c(4, 2) plot(p[1], p[2], xlim=c(-0.5,10.5), ylim=c(-0.5,10.5), xlab="", ylab="", axes=FALSE, asp=1.0) abline(h=p[2], lty=2) # horizontal abline(p[2] - (p[1]*tan(pi/3)), tan(pi/3), lty=2) # intercept, slope abline(p[2] + (p[1]*tan(pi/3)), -tan(pi/3), lty=2) # intercept, slope arrows(0, 0, 10, 0, length=0.1) # length of arrow arrows(0, 0, 0, 10, length=0.1) # curves() # rect() # polygon() # example: plot the area under the normal distribution as a polygon, and overlay # a histogram set.seed(12357) x <- rnorm(50) sig <- 1.0 myBreaks <- seq(-3 * sig, 3 * sig, by = 0.5 * sig) # first, draw a histogram but don't show the bars hist(x, breaks = myBreaks, freq = FALSE, col = "#FFFFFF", border = "#FFFFFF") # next, plot the curve as a polygon() into the existing plot area dn <- seq(-3, 3, length.out = 100) polygon(dn, dnorm(dn), col = "#00FF8822", border = "#FFFFFF") # finally, replot the histogram. Setting "add = TRUE" plots it over the # previous histogram hist(x, breaks = myBreaks, freq = FALSE, col = "#0000FF22", border = "#00000033", # note: transparent colours add = TRUE) # # More: see the Index of functions for the graphics package # = 12 IMAGES ============================================================= # Todo # = 13 CONTOUR LINES ====================================================== # ToDo # = 14 3D PLOTS =========================================================== # ToDo # = 15 GRAPHS AND NETWORKS ================================================ # ToDo # = 16 OTHER GRPAHICS PACKAGES ============================================ # Packages in the standard distribution ... # # graphics:: # grid:: # lattice:: # Packages that can be downloaded from CRAN # ... use with install.packages("package"), then # library("package") # hexbin:: # ggplot2:: # Packages that can be downloaded from BioConductor # prada: # if (! requireNamespace("prada", quietly=TRUE)) { # if (! requireNamespace("BiocManager", quietly=TRUE)) { # install.packages("BiocManager") # } # BiocManager::install("prada") # } # = 17 INTERACTIVE PLOTS ================================================== # ToDo # == 17.1 locator() ======================================================== # ToDo # == 17.2 plotly:: ========================================================= # ToDo # [End]