# Data frame with a definition of yeast cell cycle phase names and
# time-points derived from inspection of the SC$mdl$peaks distribution.
# names start ends
# 1 Sense 0 6
# 2 Prep 6 15
# 3 Replicate 15 30
# 4 Assemble 30 37
# 5 Segregate 37 43
# 6 Duplicate 43 55
# 7 Stabilize 55 65
#
# SC$GO
# -----
# Data frame with GO term information and GO term enrichment data
# SC$GO$ontology : {C|F|P} identifying the "cellular component", "molecular
# function" or "biological process" ontology
# SC$GO$GOid : Gene Ontyology ID of the term
# SC$GO$label : Short label of the term
# SC$GO$tAll : Count of times the term is annotated in any gene
# SC$GO$tCC : ... times term is annotated to a gene in this dataset
# SC$GO$t... : ... annotated to one ofg the cell-cycle phases
# SC$GO$xs... : ... enrichment factor in each cell cycle phase
#
# = 03 PROPORTIONS AND DISTRIBUTIONS ======================================
# Distributions of numeric variable characterize the values. Proportions compare values. A typical use case is to characterize a set of measurements, or compare several sets of measurements.
# Histograms are superbly informative and one of the workhorses of our analyses. A histogram answers the question how many observations do we have in a particular range of a (continuously varying) variable.
# Here we plot a histogram of the first expression peaks of genes, computed as
# the first peak of the fitted model for every gene in our set. We might be
# interested in whether the expression of different genes peaks continuously
# over the cycle, or whetehr there are time intervals where waves of expression
# of different genes can be observed in response to cycle-control signals.
hist(SC$mdl$peaks)
# We can explicitly set breakpoints for the histogram:
main="Timing of first peaks of during the cell-cycle",
sub="The bimodal distribution distinguishes replication from division",
cex.sub=0.8,
xlab="t (min.)",
ylab="counts",
col=myPal(81))# get 81 color values from myPal()
# === 03.4.1 overlaying histograms
# Histograms can be plotted one-over another to compare them
# Example: when do genes with different GO term annotations first peak?
# GO:0005730 "nucleolus"
# GO:0032200 "telomere organization"
# GO:0006260 "DNA replication"
sel1<-grep("GO:0005730",SC$ann$GO)# 103 genes
sel2<-grep("GO:0032200",SC$ann$GO)# 32 genes
sel3<-grep("GO:0006260",SC$ann$GO)# 67 genes
# define breaks. This is important to make the histograms comparable
myBreaks<-seq(0,80,by=4)
# plot the first histogram
hist(SC$mdl$peaks[sel1],breaks=myBreaks,
main="Peak timing for different GO terms",
xlab="t (min)",
ylab="Counts",
ylim=c(0,50),
col="#FF007955")
# first overlay
hist(SC$mdl$peaks[sel2],breaks=myBreaks,
col="#00B2FF55",
add=TRUE)
# second overlay
hist(SC$mdl$peaks[sel3],breaks=myBreaks,
col="#7FDFC955",
add=TRUE)
# Legend
legend("topright",
fill=c("#FF007955","#00B2FF55","#7FDFC955"),
legend=c("GO:0005730\n(nucleolus)\n",# "\n" is a line-break
"GO:0032200\n(telomere organization)\n",
"GO:0006260\n(DNA replication)\n"),
cex=0.6,
bty="n")# no box around the legend
# = 04 THE plot() FUNCTION ================================================
# plot() is the workhorse of data visualization in R. But plot() is also a "generic" - i.e. different "classes" can define their own plot-methods and plot will "dispatch" the right method for a class.
# == 04.1 line plots =======================================================
# Lines are useful to visually associate ralated data points, like points in a
# sequence of events.
# plot an expression profile: value over time
plot(SC$xpr[169,])
# note the x is implied and plotted as "index", the expression values are used
# for the y-axis by default.
# plot types
plot(SC$xpr[169,],type="p")# points, the default
plot(SC$xpr[169,],type="l")# lines
plot(SC$xpr[169,],type="b")# both points and lines
plot(SC$xpr[169,],type="c")# empty points and lines
plot(SC$xpr[169,],type="o")# overplotted
plot(SC$xpr[169,],type="s")# steps
plot(SC$xpr[169,],type="h")# like histogram
plot(SC$xpr[169,],type="n")# none - useful to create an empty frame
# to add other elements later
text(SC$xpr[169,],colnames(SC$xpr),cex=0.5)# ... such as labels
# title and axis labels
iRow<-169
t<-seq(0,120,by=5)# set actual x-values
plot(t,SC$xpr[iRow,],
type="b",
main="Expression profile",
sub="Data: GSE3653 (Pramila et al. 2002)",
xlab="time(min)",
ylab="Expression (log ratio)")
# adjusting font sizes
plot(t,SC$xpr[iRow,],
type="b",
main="Expression profile",
sub="Data: GSE3653 (Pramila et al. 2002)",
xlab="time(min)",
ylab="Expression (log ratio)",
cex.main=1.1,# title
cex.sub=0.6,# subtitle
cex.axis=0.7,# axis values
cex.lab=0.8)# axis labels
# x and y axis ranges can be defined with a two-element vector
plot(t,SC$xpr[iRow,],
type="b",
xlim=c(0,120),# tight against the values
ylim=c(-max(abs(SC$xpr[iRow,])),
max(abs(SC$xpr[iRow,]))),# symmetric around 0
main="",sub="",xlab="",ylab="")
abline(h=0,col="#FF000044")
# pull in annotations with sprintf()
plot(t,SC$xpr[iRow,],
type="b",
main=sprintf("Expression profile: %s (%s)",
SC$ann$stdName[iRow],
SC$ann$sysName[iRow]),
sub="",xlab="",ylab="")
# When exploring data, it is useful to have a standard view of data as a
# function:
xprPlot<-function(iRow){
t<-seq(0,120,by=5)
yMax<-max(abs(SC$xpr[iRow,]))*1.1
plot(t,SC$xpr[iRow,],
ylim=c(-yMax,yMax),
type="b",
main=sprintf("Expression profile: %s (%s)",
SC$ann$stdName[iRow],
SC$ann$sysName[iRow]),
xlab="time(min)",
ylab="Expression (log ratio)",
cex.main=1.1,
cex.axis=0.7,
cex.lab=0.8)
abline(h=0,col="#FF000044")
}
xprPlot(169)
xprPlot(1124)
# better to visualize by plotting one OVER the other.
# We have many options to encode information in scatter plots (and others), in order to be able to visualize and discover patterns and relationships. The most important ones are the type of symbol to use, its size, and its color.
# There are many types of symbols available to plot points and they can all be varied by type, size, and color to present additional information. Which one to use is defined in the argument pch . The default is pch = 1: the open circle. Characters 1:20 are regular symbols:
points(x3,y3,pch=myPch)# note: "myPCh" is a character vector while
# the other pch were identified by integers
# The ASCII codes for characters 32 to 126 can also be used as plotting symbols
myPch<-32:126
x4<-(((myPch-32)%%19)+2)/2
y4<-5-((myPch-32)%/%19)
points(x4,y4,pch=32:126,col="#0055AA")
# I don't think I have ever actually used characters as plotting characters in
# this way; it is much more straightforward to just use the text() function in
# that case, rather than having to look up which ASCII code is which character,
# and remembering which ones are available as numbers, and which ones are
# identified by the character itself. But the first twenty five I use a lot, and
# I also end up looking up their codes a lot. So here is a function to plot them for reference. You can paste it into your .Rprofile to keep it at hand:
pchRef<-function(){
# a reference plot for the first twenty five plotting characters
# In R, a palette is a function (!) that takes a number as its argument and
# returns that number of colors, those colors can then be used to color points
# in a plot or other elements. In principle, such colors can serve three
# distinct puposes:
# Qualitative colors have a different color for different types of entities.
# Here the color serves simply to distinguish the types, it represents
# "categorical information". For example, we can assign different colors to each
# individual amino acid.
# Sequential colors distinguish order in a sequence: the colors distinguish the first from the last element in the order. For example we often color the N-terminus of a protein blue(ish) and the C-terminus red(dish).
# Diverging colors encode some property and are scaled to represent particular
# values of the property. For example we might represent temperature with a
# black-body radiation color palette. Note that "sequential" can be seen as
# "diverging on order".
# R has are a number of inbuilt palettes; Here is a convenience function to view
# palettes. Input can be either a palette, the name of a hcl.colors palette, or a vector of colors:
plotPal<-function(pal,N=40){
if(is.character(pal)){
if(length(pal)==1){# name of a hcl palette
myCols<-hcl.colors(N,pal)
m<-pal
}else{# already a color vector
myCols<-colorRampPalette(pal)(N)
m<-sprintf("Custom palette (spread to %d values)",N)
}
}else{# palette function
myCols<-pal(N)
m<-as.character(substitute(pal))
}
barplot(rep(1,N),main=m,axes=FALSE,col=myCols)
}
plotPal(rainbow)# rainbow() is the quintessential qualitative palette
plotPal(cm.colors)# cm.colors() is sequential: cyan-white-magenta
plotPal(terrain.colors)# a diverging palette: map elevation colors
# A lot of thought has gone into the construction of palettes in the
# colorspace:: package, now available as hcl.colors in R (hcl: hue, chroma,
# luminance), which is a better way to specify perceptually equidistant colors.
# It's worthwhile to read through the help-file for more information -
?hcl.colors
# ... and to sample the 110 available palettes:
hcl.pals()
# Examples:
plotPal("Viridis")
plotPal("Pastel 1")
plotPal("Spectral")
plotPal("OrRd")
plotPal("Cold")
plotPal("Mint")
plotPal("Berlin")
# If you are curious, you can execute the code below, then select the line that
# draws the plot and press ctrl+enter repeatedly to step through the entire
# set of palettes.
# i <- 0
# plotPal(hcl.pals()[( i<-i+1 )])
# Worthy of special mention are the color-Brewer palettes, in particular for
# their accessibility considerations. Do consider that not all of us view colors
# == 08.2 Color bars =======================================================
# Legends for a plot with a sequential or divergent spectrum are a special case, since we need to map a continuum of colors to a scale
# Example: plotting expression levels in two cell-cycle phases against each other, and applying a divergent color spectrum that displays the model correlations: genes with low correlations have expression profiles that can not be well approximated with a periodic function.
tRep<-c("t.15","t.20","t.25")# Replication-phase time points
tDup<-c("t.40","t.45","t.50")# Duplication-phase time points
x<-rowMeans(SC$xpr[,tRep])
y<-rowMeans(SC$xpr[,tDup])
plot(x,y)
myCors<-SC$mdl$cor
hist(myCors,breaks=40)
# The values are of course continuous, but we need to map them to a discrete set of colors. The strategy is as follows:
# - scale the values we wish to map into the interval [0, 1]
# - multiply with the desired number of intervals minus one, round, and add one
# - these integers can be used as indices into a vector of colors, ...
m<-sprintf("Expression changes between phases (%d genes)",
length(x))
plot(x[ord],y[ord],
main=m,
xlab="mean Expression in Replication phase",
ylab="mean Expression in Duplication phase",
sub=paste("Coloured by Correlation of",
"Expression Profile (xpr) to",
"Fitted Periodic Function (mdl)."),
cex.main=0.9,
cex.lab=0.7,
cex.sub=0.7,
pch=16,
col=myCols[ord],
cex=1.2)
abline(h=0,col="#0088FF44")# horizontal zero
abline(v=0,col="#0088FF44")# vertical zero
}
basePlot()
# to print a color-bar legend, we use the plotrix package
if(!requireNamespace("plotrix",quietly=TRUE)){
install.packages("plotrix")
}
corLabels<-sprintf("%4.2f",# craft the labels to plot
seq(min(myCors),
max(myCors),
length.out=length(corCols)))
oPar<-par(mar=c(5,4,4,7))# more space on the margin
par("xpd"=FALSE)
basePlot()
# we need to figure out where to put the legend. par("usr") gives us the coordinates of the last plot window. We'll space the legend between 1.05 and 1.1 times the plot width, and centred along the y-axis with a height of 0.8 times the plot height.
dx<-par("usr")[2]-par("usr")[1]
dy<-par("usr")[4]-par("usr")[3]
xl<-par("usr")[2]+(dx*0.05)# left
xr<-xl+(0.05*dx)# right
yb<-par("usr")[4]-(0.9*dy)# bottom
yt<-yb+(0.8*dy)# top
plotrix::color.legend(xl,yb,xr,yt,
corLabels,
corCols,
align="rb",# labels to the right of the bar
cex=0.8,
gradient="y")# vertical gradient
# Almost perfect. Except we need a title for the color bar.
# in this case the title is "r:" ... but the "r" should be in italics.
par("xpd"=TRUE)# enable plotting into the margin
text(xr+(0.01*dx),yt+0.015*dy,
expression(italic(r)["xpr,mdl"]),
adj=c(0,0),
cex=0.9)
par(oPar)
# ToDo - this is a common plot prototype - cast it into a function