bch441-work-abc-units/RPR-SX-PDB.R

816 lines
27 KiB
R
Raw Normal View History

2017-09-12 20:09:20 +00:00
# RPR-SX-PDB.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-SX-PDB unit.
#
2017-10-22 01:01:29 +00:00
# Version: 1.0
2017-09-12 20:09:20 +00:00
#
2017-10-22 01:01:29 +00:00
# Date: 2017 10 19
2017-09-12 20:09:20 +00:00
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
2017-10-22 01:01:29 +00:00
# 1.0 First live version, completely refactores 2016 code
# with remarkable speed gains. Added section on x, y, z
# (density) plots.
2017-09-12 20:09:20 +00:00
# 0.1 First code copied from 2016 material.
#
# TODO:
2017-11-12 09:38:15 +00:00
# Confirm that SS residue numbers are indices
2017-09-12 20:09:20 +00:00
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
2017-10-22 01:01:29 +00:00
#
2017-09-12 20:09:20 +00:00
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
2017-10-22 01:01:29 +00:00
#
2017-09-12 20:09:20 +00:00
# ==============================================================================
2017-10-22 01:01:29 +00:00
#TOC> ==========================================================================
2017-11-12 09:38:15 +00:00
#TOC>
2017-10-22 01:01:29 +00:00
#TOC> Section Title Line
#TOC> ----------------------------------------------------
#TOC> 1 Introduction to the bio3D package 63
#TOC> 2 A Ramachandran plot 151
#TOC> 3 Density plots 227
#TOC> 3.1 Density-based colours 241
#TOC> 3.2 Plotting with smoothScatter() 260
#TOC> 3.3 Plotting hexbins 275
#TOC> 3.4 Plotting density contours 299
#TOC> 3.4.1 ... as overlay on a colored grid 333
#TOC> 3.4.2 ... as filled countour 350
#TOC> 3.4.3 ... as a perspective plot 381
#TOC> 4 cis-peptide bonds 399
#TOC> 5 H-bond lengths 414
2017-11-12 09:38:15 +00:00
#TOC>
2017-10-22 01:01:29 +00:00
#TOC> ==========================================================================
2017-09-12 20:09:20 +00:00
# In this example of protein structure interpretation, we ...
# - load the library "bio3D" which supports work with
# protein structure files,
# - explore some elementary functions of the library
2017-10-22 01:01:29 +00:00
# - explore plotting of density values with scatterplots
2017-09-12 20:09:20 +00:00
# - assemble a script to see whether H-bond lengths systematically differ
# between alpha-helical and beta-sheet structures.
2017-10-22 01:01:29 +00:00
# = 1 Introduction to the bio3D package ===================================
if (! require(bio3d, quietly=TRUE)) {
install.packages("bio3d")
2017-09-12 20:09:20 +00:00
library(bio3d)
}
# Package information:
# library(help = bio3d) # basic information
# browseVignettes("bio3d") # available vignettes
# data(package = "bio3d") # available datasets
2017-09-12 20:09:20 +00:00
# bio3d can load molecules directly from the PDB servers, you don't _have_ to
# store them locally, but you could.
#
2017-10-22 01:01:29 +00:00
# But before you _load_ a file, have a look what such a file contains. I have
# packaged the 1BM8 file into the project:
file.show("./data/1BM8.pdb")
2017-09-12 20:09:20 +00:00
# Have a look at the header section, the atom records, the coordinate records
2017-10-22 01:01:29 +00:00
# etc.
2017-09-12 20:09:20 +00:00
#
2017-10-22 01:01:29 +00:00
# Task: Answer the following questions:
#
# What is the resolution of the structure?
# Is the first residue in the SEQRES section also the first residue
# with an ATOM record?
# How many water molecules does the structure contain?
# Can you explain REMARK 525 regarding HOH 459 and HOH 473?
# Are all atoms of the N-terminal residue present?
# Are all atoms of the C-terminal residue present?
2017-09-12 20:09:20 +00:00
2017-10-22 01:01:29 +00:00
apses <- read.pdb("1bm8") # load a molecule directly from the PDB via the
# Internet. (This is not your local version.)
2017-09-12 20:09:20 +00:00
# check what we have:
apses
# what is this actually?
str(apses)
# bio3d's pdb objects are simple lists. Great! You know lists!
2017-10-22 01:01:29 +00:00
# You see that there is a list element called $atom which is a data frame in
# which the columns arevectors of the same length - namely the number of atoms
# in the structure file. And there is a matrix of (x, y, z) triplets called xyz.
# And there is a vector that holds sequence, and two tables called helix and
# sheet. Let's pull out a few values to confirm how selection and subsetting
# works here:
2017-09-12 20:09:20 +00:00
# selection by atom ...
i <- 5
apses$atom[i,]
apses$atom[i, c("x", "y", "z")] # here we are selecting with column names!
apses$xyz[c(i*3-2, i*3-1, i*3)] # here we are selcting with row numbers
# all atoms of a residue ...
2017-10-22 01:01:29 +00:00
i <- 48
2017-09-12 20:09:20 +00:00
apses$atom[apses$atom[,"resno"] == i, ]
# sequence of the first ten residues
2017-10-22 01:01:29 +00:00
apses$seqres[1:10] # the "A"s here identify chain "A"
2017-09-12 20:09:20 +00:00
# Can we convert this to one letter code?
aa321(apses$seqres[1:10])
# Lets get the implicit sequence:
aa321((apses$atom$resid[apses$calpha])[1:10]) # Do you understand this code?
# Do explicit and implicit sequence have the same length?
length(apses$seqres)
length(apses$atom$resid[apses$calpha])
# Are the sequences the same?
sum(apses$seqres == apses$atom$resid[apses$calpha])
# get a list of all CA atoms of arginine residues
sel <- apses$atom$resid == "ARG" & apses$atom$elety == "CA"
apses$atom[sel, c("eleno", "elety", "resid", "chain", "resno", "insert")]
# The introduction to bio3d tutorial at
# http://thegrantlab.org/bio3d/tutorials/structure-analysis
# has the following example:
plot.bio3d(apses$atom$b[apses$calpha], sse=apses, typ="l", ylab="B-factor")
# Good for now. Let's do some real work.
2017-10-22 01:01:29 +00:00
# = 2 A Ramachandran plot =================================================
2017-09-12 20:09:20 +00:00
2017-10-22 01:01:29 +00:00
# Calculate a Ramachandran plot for the structure. The torsion.pdb() function
# calculates all dihedral angles for backbone and sidechain bonds, NA where
# the bond does not exist in an amino acid.
2017-09-12 20:09:20 +00:00
tor <- torsion.pdb(apses)
2017-10-22 01:01:29 +00:00
plot(tor$phi, tor$psi,
xlim = c(-180, 180), ylim = c(-180, 180),
main = "Ramachandran plot for 1BM8",
xlab = expression(phi),
ylab = expression(psi))
abline(h = 0, lwd = 0.5, col = "#00000044")
abline(v = 0, lwd = 0.5, col = "#00000044")
2017-09-12 20:09:20 +00:00
# As you can see, there are a number of points in the upper-right
# quadrant of the plot. This combination of phi-psi angles defines
# the conformation of a left-handed alpha helix and is generally
# only observed for glycine residues. Let's replot the data, but
# color the points for glycine residues differently. First, we
# get a vector of glycine residue indices in the structure:
2017-10-22 01:01:29 +00:00
mySeq <- pdbseq(apses)
2017-09-12 20:09:20 +00:00
# Explore the result object and extract the indices of GLY resiues.
2017-10-22 01:01:29 +00:00
mySeq == "G"
which(mySeq == "G")
iGly <- which(mySeq == "G")
2017-09-12 20:09:20 +00:00
# Now plot all non-gly residues.
# Remember: negative indices exclude items from a vector
2017-10-22 01:01:29 +00:00
plot(tor$phi[-iGly], tor$psi[-iGly],
xlim=c(-180,180), ylim=c(-180,180),
main = "Ramachandran plot for 1BM8",
xlab = expression(phi),
ylab = expression(psi))
abline(h = 0, lwd = 0.5, col = "#00000044")
abline(v = 0, lwd = 0.5, col = "#00000044")
2017-09-12 20:09:20 +00:00
# Now plot GLY only, but with green dots:
points(tor$phi[iGly], tor$psi[iGly], pch=21, cex=0.9, bg="#00CC00")
# As you see, four residues in the upper-right quadrant are
# not glycine. But what residues are these? Is there an
# error in our script? Let's get their coordinate records:
2017-10-22 01:01:29 +00:00
# subset CA records
CA <- apses$atom[apses$calpha, c("eleno", "elety", "resid", "chain", "resno")]
# get index of outliers
2017-09-12 20:09:20 +00:00
iOutliers <- which(tor$phi > 30 & tor$phi < 90 &
tor$psi > 0 & tor$psi < 90)
2017-10-22 01:01:29 +00:00
# cbind records together
(dat <- cbind(CA[iOutliers, ],
phi = tor$phi[iOutliers],
psi = tor$psi[iOutliers]))
# remove the glycine ...
(dat <- dat[dat$resid != "GLY", ])
2017-09-12 20:09:20 +00:00
# let's add the residue numbers to the plot with the text function:
for (i in 1:nrow(dat)) {
points(dat$phi[i], dat$psi[i], pch=21, cex=0.9, bg="#CC0000")
text(dat$phi[i],
dat$psi[i],
labels = sprintf("%s%d", aa321(dat$resid[i]), dat$resno[i]),
pos = 4,
offset = 0.4,
cex = 0.7)
}
2017-10-22 01:01:29 +00:00
# You can check the residues in Chimera. Is there anything unusual about these
# residues?
# = 3 Density plots =======================================================
# Such x, y scatter-plots of data that is sampled from a distribution can tell
# us a lot about what shapes the distribution. The distribution is governed by
# the free energy of the phi-psi landscape in folded proteins, since folded
# proteins generally minimize the free energy of their conformations. We observe
# empirically, from comparing frequency statistics and mutation experiments,
# that this generall follows a Boltzmann distribution, where the free energy
# changes we observe in experments that change one conformation into another are
# proportional to the log-ratio of the number of times we observe each
# observation in the protein structure database (after correcting for
# observation bias). The proper way to visualize such 2D landscapes is with
# contour plots.
# == 3.1 Density-based colours =============================================
# A first approximation to scatterplots that visualize the density of the
# underlying distribution is coloring via the densCols() function.
?densCols
iNA <- c(which(is.na(tor$phi)), which(is.na(tor$psi)))
phi <- tor$phi[-iNA]
psi <- tor$psi[-iNA]
plot (phi, psi,
xlim = c(-180, 180), ylim = c(-180, 180),
col=densCols(phi,psi),
pch=20, cex=2,
main = "Ramachandran plot for 1BM8",
xlab = expression(phi),
ylab = expression(psi))
abline(h = 0, lwd = 0.5, col = "#00000044")
abline(v = 0, lwd = 0.5, col = "#00000044")
# == 3.2 Plotting with smoothScatter() =====================================
# A better way, that spreads out the underlying density is smoothScatter()
smoothScatter(phi,psi)
smoothScatter(phi, psi,
xlim = c(-180, 180), ylim = c(-180, 180),
col = "#0033BB33",
pch = 3, cex = 0.6,
main = "Ramachandran plot for 1BM8",
xlab = expression(phi),
ylab = expression(psi))
abline(h = 0, lwd = 0.5, col = "#00000044")
abline(v = 0, lwd = 0.5, col = "#00000044")
# == 3.3 Plotting hexbins ==================================================
# If we wish to approximate values in a histogram-like fashion, we can use
# hexbin()
if (! require(hexbin, quietly=TRUE)) {
2017-10-22 01:01:29 +00:00
install.packages("hexbin")
library(hexbin)
}
# Package information:
# library(help = hexbin) # basic information
# browseVignettes("hexbin") # available vignettes
# data(package = "hexbin") # available datasets
2017-10-22 01:01:29 +00:00
myColorRamp <- colorRampPalette(c("#EEEEEE",
"#3399CC",
"#2266DD"))
plot(hexbin(phi, psi, xbins = 10),
colramp = myColorRamp,
main = "phi-psi Density Bins for 1BM8",
xlab = expression(phi),
ylab = expression(psi))
# == 3.4 Plotting density contours =========================================
# The best way to handle such data is provided by the function contour():
?contour
# Contour plots are not produced along the haphazardly sampled values of a data
# set, but on a regular grid. This means, we need to convert observed values
# into estimated densities. Density estimation is an important topic for
# exploratory data analysis, base R has the density() function for 1D
# distributions. But for 2D data like or phi-psi plots, we need a function from
# the MASS package: kde2d()
if (! require(MASS, quietly=TRUE)) {
2017-10-22 01:01:29 +00:00
install.packages("MASS")
library(MASS)
}
# Package information:
# library(help = MASS) # basic information
# browseVignettes("MASS") # available vignettes
# data(package = "MASS") # available datasets
2017-10-22 01:01:29 +00:00
?kde2d
dPhiPsi <-kde2d(phi, psi,
n = 60,
lims = c(-180, 180, -180, 180))
str(dPhiPsi)
# This is a list, with gridpoints in x and y, and the estimated densities in z.
# Generic plot with default parameters
contour(dPhiPsi)
2017-11-12 09:38:15 +00:00
# === 3.4.1 ... as overlay on a colored grid
2017-10-22 01:01:29 +00:00
image(dPhiPsi,
col = myColorRamp(100),
main = "Ramachandran plot for 1BM8",
xlab = expression(phi),
ylab = expression(psi))
contour(dPhiPsi, col = "royalblue",
add = TRUE,
method = "edge",
nlevels = 10,
lty = 2)
points(phi, psi, col = "#00338866", pch = 3, cex = 0.7)
abline(h = 0, lwd = 0.5, col = "#00000044")
abline(v = 0, lwd = 0.5, col = "#00000044")
2017-11-12 09:38:15 +00:00
# === 3.4.2 ... as filled countour
2017-10-22 01:01:29 +00:00
filled.contour(dPhiPsi,
xlim = c(-180, 180), ylim = c(-180, 180),
nlevels = 10,
color.palette = myColorRamp,
main = "Ramachandran plot for 1BM8",
xlab = expression(phi),
ylab = expression(psi))
# Note: we can pass additional plotting and overlay commands to the counter plot
# in a block of expressions passed via the plot.axes parameter:
filled.contour(dPhiPsi,
xlim = c(-180, 180), ylim = c(-180, 180),
nlevels = 10,
color.palette = myColorRamp,
main = "Ramachandran plot for 1BM8",
xlab = expression(phi),
ylab = expression(psi),
plot.axes = {
contour(dPhiPsi, col = "#00000044",
add = TRUE,
method = "edge",
nlevels = 10,
lty = 2)
points(phi, psi, col = "#00338866", pch = 3, cex = 0.7)
abline(h = 0, lwd = 0.5, col = "#00000044")
abline(v = 0, lwd = 0.5, col = "#00000044")
})
2017-11-12 09:38:15 +00:00
# === 3.4.3 ... as a perspective plot
2017-10-22 01:01:29 +00:00
persp(dPhiPsi,
xlab = "phi",
ylab = "psi",
zlab = "Density")
persp(dPhiPsi,
theta = 40,
phi = 10,
col = "#99AACC",
xlab = "phi",
ylab = "psi",
zlab = "Density")
# = 4 cis-peptide bonds ===================================================
2017-09-12 20:09:20 +00:00
# Are there any cis-peptide bonds in the structure?
tor$omega
#... gives us a quick answer. But wait - what values do we
2017-10-22 01:01:29 +00:00
# expect? And why are the values so different, ranging from -180° to 180°?
2017-09-12 20:09:20 +00:00
# Consider this plot: what am I doing here and why?
2017-10-22 01:01:29 +00:00
om <- c(360 + tor$omega[tor$omega < 0],
tor$omega[tor$omega >= 0])
hist(om, xlim=c(0,360))
2017-09-12 20:09:20 +00:00
abline(v=180, col="red")
2017-10-22 01:01:29 +00:00
# Note: a cis-peptide bond will have an omega torsion angle of around 0°
2017-09-12 20:09:20 +00:00
2017-10-22 01:01:29 +00:00
# = 5 H-bond lengths ======================================================
2017-09-12 20:09:20 +00:00
# Let's do something a little less trivial and compare
# backbone H-bond lengths between helices and strands.
2017-10-22 01:01:29 +00:00
# Secondary structure is defined in the bio3d object's ...$helix and ...$strand
# list elements.
str(apses)
2017-09-12 20:09:20 +00:00
# We need to
2017-10-22 01:01:29 +00:00
# - collect all N atoms in alpha helices resp.
# beta strands;
# - collect all O atoms in alpha helices resp.
2017-09-12 20:09:20 +00:00
# beta strands;
# - fetch the atom coordinates;
2017-10-22 01:01:29 +00:00
# - calculate all N, O distances;
2017-09-12 20:09:20 +00:00
# - filter them for distances that indicate H-bonds; and,
# - plot the results.
2017-10-22 01:01:29 +00:00
# Secondary structure can either be obtained from definitions contained in most
# PDB files, or by running the DSSP algorithm IF(!) you have it installed on
# your machine. See the dssp() function of bio3d for instructions how to obtain
# and install DSSP and STRIDE. This is highly recommended for "real" work with
# structure coordinate files. The 1BM8 file contains secondary structure
# definitions:
2017-09-12 20:09:20 +00:00
apses$helix
apses$sheet
2017-10-22 01:01:29 +00:00
# A function to collect atom indices for particular type of secondary structure
ssSelect <- function(PDB, myChain = "A", ssType, myElety) {
# Function to retrieve specified atom types from specified secondary
# structure types in a PDB object.
# Parameters:
# PDB A bio3D PDB object
# myChain chr The chain to use. Default: chain "A".
# ssType chr A vector of keywords "helix" and/or "sheet"
# myElety chr A vector of $eletype atom types to return
# Value: num Indices of the matching atom rows in PDB$atom
# Build a vector of $resno numbers
starts <- numeric()
ends <- numeric()
if ("helix" %in% ssType) {
sel <- PDB$helix$chain %in% myChain
starts <- c(starts, PDB$helix$start[sel])
ends <- c(ends, PDB$helix$end[sel])
2017-09-12 20:09:20 +00:00
}
2017-10-22 01:01:29 +00:00
if ("sheet" %in% ssType) {
sel <- PDB$sheet$chain %in% myChain
starts <- c(starts, PDB$sheet$start[sel])
ends <- c(ends, PDB$sheet$end[sel])
}
myResno <- numeric()
for (i in seq_along(starts)) {
myResno <- c(myResno, starts[i]:ends[i])
2017-09-12 20:09:20 +00:00
}
2017-10-22 01:01:29 +00:00
# get id's from PDB
2017-09-12 20:09:20 +00:00
2017-10-22 01:01:29 +00:00
x <- atom.select(PDB,
string = "protein",
type = "ATOM",
chain = myChain,
resno = myResno,
elety = myElety)
2017-09-12 20:09:20 +00:00
2017-10-22 01:01:29 +00:00
return(x$atom)
}
2017-09-12 20:09:20 +00:00
2017-10-22 01:01:29 +00:00
# Example:
ssSelect(apses, ssType = "sheet", myElety = "N")
ssSelect(apses, ssType = "sheet", myElety = "O")
2017-09-12 20:09:20 +00:00
2017-10-22 01:01:29 +00:00
# That looks correct: O atoms should be stored three index position after N: the
# sequence of atoms in a PDB file is usually N, CA, C, O ... followed by the
# side chain coordinates.
2017-09-12 20:09:20 +00:00
2017-10-22 01:01:29 +00:00
# Now to extract the coordinates and calculate distances. Our function needs to
# take the PDB object and two vectors of atom indices as argument, and return a
# vector of pair-distances (actually dist.xyz() returns a matrix).
2017-09-12 20:09:20 +00:00
2017-10-22 01:01:29 +00:00
pairDist <- function(PDB, a, b) {
# Calculate pairwise distances between atoms indicated by a and b
# Parameters:
# PDB A bio3D PDB object
# a int A vector of atom indexes
# b int A vector of atom indexes
# Value: num matrix of Euclidian distances between the atoms given in a, b.
# There are as many rows as atoms in a, as many columns as
# atoms in b.
2017-09-12 20:09:20 +00:00
2017-10-22 01:01:29 +00:00
dMat <- numeric()
if (length(a) > 0 && length(b) > 0) {
2017-09-12 20:09:20 +00:00
2017-10-22 01:01:29 +00:00
A <- PDB$atom[a, c("x", "y", "z")]
B <- PDB$atom[b, c("x", "y", "z")]
dMat <- dist.xyz(A, B)
2017-09-12 20:09:20 +00:00
}
2017-10-22 01:01:29 +00:00
return(dMat)
2017-09-12 20:09:20 +00:00
}
2017-10-22 01:01:29 +00:00
# Let's see if this looks correct. Let's look at all the pairwise distances
# between N and O atoms in both types of secondary structure:
iN <- ssSelect(apses, ssType = c("helix", "sheet"), myElety = "N")
iO <- ssSelect(apses, ssType = c("helix", "sheet"), myElety = "O")
x <- pairDist(apses, iN, iO)
hist(x,
breaks = 80,
col = "lavenderblush",
main = "",
xlab = "N-O distances (Å)")
# Since we are collecting distance from all secondary structure elements, we
# are just seing a big peak of (meaningless) long-distance separations. We
# need to zoom in on the shorter distances, in which we expect
2017-09-12 20:09:20 +00:00
# hydrogen bonds:
2017-10-22 01:01:29 +00:00
hist(x[x < 4.2], # restrict to N-O distance less than 4.2 Å long
breaks=seq(2.0, 4.2, 0.1),
xlim=c(2.0,4.2),
col = "lavenderblush",
main = "",
xlab = "N-O distances (Å)")
2017-09-12 20:09:20 +00:00
# There is a large peak at about 2.2Å, and another
# large peak above 3.5Å. But these are not typical hydrogen
# bond distances! Rather these are (N,O) pairs in peptide
2017-10-22 01:01:29 +00:00
# bonds, and within residues. That's not good, because these will contaminate
# our statistics.
# We need to exclude all distances between N of a residue
2017-09-12 20:09:20 +00:00
# and O of a preceeding residue, and all (N,O) pairs in the
2017-10-22 01:01:29 +00:00
# same residue. We need a function to filter distances by residue numbers. And while we are filtering, we might as well throw away the non-H bond distances too.
filterHB <- function(PDB, iN, iO, dMat, cutoff = 3.9) {
# Filters distances between O(i-1) and N(i), and between N(i) and O(i)
# in a distance matrix where there is one row per N-atom and one
# column per O atom.
# Parameters:
# PDB a bio3D PDB object
# iN int a vector of N atom indexes
# iO int a vector of O atom indexes
# dMat num a distance matrix created by pairDist()
# cutoff num only return distances that are shorter than "cutoff
# Value: a distance matrix in which values that do not match the
# filter criteria have bee set to NA.
if (length(iN) > 0 && length(iO) > 0) {
resN <- PDB$atom$resno[iN]
resO <- PDB$atom$resno[iO]
for (i in seq_along(resN)) { # for all N atoms
for (j in seq_along(resO)) { # for all O atoms
if (dMat[i, j] > cutoff || # if: distance > cutoff, or ...
(resN[i] - 1) == resO[j] || # distance is N(i)---O(i-1), or ...
resN[i] == resO[j]) { # distance is N(i)---O(i), then:
dMat[i, j] <- NA # set this distance to NA.
2017-09-12 20:09:20 +00:00
}
}
}
}
2017-10-22 01:01:29 +00:00
return(dMat)
2017-09-12 20:09:20 +00:00
}
2017-10-22 01:01:29 +00:00
# Inspect the result:
hist(filterHB(apses, iN, iO, x),
breaks=seq(2.0, 4.2, 0.1),
xlim=c(2.0,4.2),
col = "paleturquoise",
main = "",
xlab = "N-O distances (Å)")
2017-09-12 20:09:20 +00:00
# Finally we can calculate alpha- and beta- structure
# bonds and compare them. In this section we'll explore
# different options for histogram plots.
2017-10-22 01:01:29 +00:00
# H-bonds in helices ...
iN <- ssSelect(apses, ssType = c("helix"), myElety = "N")
iO <- ssSelect(apses, ssType = c("helix"), myElety = "O")
dH <- filterHB(apses, iN, iO, pairDist(apses, iN, iO))
dH <- dH[!is.na(dH)]
2017-09-12 20:09:20 +00:00
2017-10-22 01:01:29 +00:00
# H-bonds in sheets. (We commonly use the letter "E" to symbolize a beta
# strand or sheet, because "E" visually evokes an extended strand with
# protruding sidechains.)
iN <- ssSelect(apses, ssType = c("sheet"), myElety = "N")
iO <- ssSelect(apses, ssType = c("sheet"), myElety = "O")
dE <- filterHB(apses, iN, iO, pairDist(apses, iN, iO))
dE <- dE[!is.na(dE)]
2017-09-12 20:09:20 +00:00
# The plain histogram functions without parameters
# give us white stacks.
hist(dH)
# and ...
hist(dE)
# We can see that the histrograms look different
# but that is better visualized by showing two plots
# in the same window. We use the par() function, for
# more flexible layout, look up the layout() function.
?par
?layout
opar <- par(no.readonly=TRUE) # store current state
par(mfrow=c(2,1)) # set graphics parameters: 2 rows, one column
# plot two histograms
hist(dH)
hist(dE)
# add color:
hist(dH, col="#DD0055")
hist(dE, col="#00AA70")
# For better comparison, plot both in the
# same window:
hist(dH, col="#DD0055")
hist(dE, col="#00AA70", add=TRUE)
2017-10-22 01:01:29 +00:00
# ... oops, we dind't reset the graphics parameters. You can either close the
# window, a new window will open with default parameters, or ...
2017-09-12 20:09:20 +00:00
par(opar) # ... reset the graphics parameters
hist(dH, col="#DD0055")
hist(dE, col="#00AA70", add=TRUE)
2017-10-22 01:01:29 +00:00
# We see that the leftmost column of the sheet bonds hides the helix bonds in
# that column. Not good. But we can make the colors transparent! We just need to
# add a fourth set of two hexadecimal-numbers to the #RRGGBB triplet. Lets use
# 2/3 transparent, in hexadecimal, 1/3 of 256 is x55 - i.e. an RGB triplet
# specied as #RRGGBB55 is only 33% opaque:
2017-09-12 20:09:20 +00:00
hist(dH, col="#DD005555")
hist(dE, col="#00AA7055", add=TRUE)
2017-10-22 01:01:29 +00:00
# To finalize the plots, let's do two more things: Explicitly define the breaks,
# to make sure they match up - otherwise they would not need to... like in this
# example:
2017-09-12 20:09:20 +00:00
hist(dH, col="#DD005555")
hist(dE[dE < 3], col="#00AA7055", add=TRUE)
2017-10-22 01:01:29 +00:00
# Breaks are a parameter in hist() that can either be a scalar, to define how
# many columns you want, or a vector, that defines the actual breakpoints.
2017-09-12 20:09:20 +00:00
brk=seq(2.4, 4.0, 0.1)
hist(dH, col="#DD005555", breaks=brk)
hist(dE, col="#00AA7055", breaks=brk, add=TRUE)
2017-10-22 01:01:29 +00:00
# The last thing to do is to think about rescaling the plot. You notice that the
# y-axis is scaled in absolute frequency (i.e. counts). That gives us some
# impression of the relative frequency, but it is of course skewed by observing
# relatively more or less of one type of secondary structure in a protein. As
# part of the hist() function we can rescale the values so that the sum over all
# is one: set the prameter freq=FALSE.
2017-09-12 20:09:20 +00:00
hist(dH, col="#DD005555", breaks=brk, freq=FALSE)
hist(dE, col="#00AA7055", breaks=brk, freq=FALSE, add=TRUE)
# Adding labels and legend ...
hH <- hist(dH,
freq=FALSE,
breaks=brk,
col="#DD005550",
xlab="(N,O) distance (Å)",
ylab="Density",
ylim=c(0,4),
main="Helix and Sheet H-bond lengths")
hE <- hist(dE,
freq=FALSE,
breaks=brk,
col="#00AA7060",
add=TRUE)
legend("topright",
c(sprintf("alpha (N = %3d)", sum(hH$counts)),
sprintf("beta (N = %3d)", sum(hE$counts))),
fill = c("#DD005550", "#00AA7060"), bty = 'n',
border = NA)
2017-10-22 01:01:29 +00:00
2017-09-12 20:09:20 +00:00
# With all the functions we have defined,
# it is easy to try this with a larger protein.
2017-10-22 01:01:29 +00:00
# 3ugj for example is VERY large.
2017-09-12 20:09:20 +00:00
pdb <- read.pdb("3ugj")
2017-10-22 01:01:29 +00:00
# helices...
iN <- ssSelect(pdb, ssType = c("helix"), myElety = "N")
iO <- ssSelect(pdb, ssType = c("helix"), myElety = "O")
dH <- filterHB(pdb, iN, iO, pairDist(pdb, iN, iO))
dH <- dH[!is.na(dH)]
2017-09-12 20:09:20 +00:00
2017-10-22 01:01:29 +00:00
# sheets
iN <- ssSelect(pdb, ssType = c("sheet"), myElety = "N")
iO <- ssSelect(pdb, ssType = c("sheet"), myElety = "O")
dE <- filterHB(pdb, iN, iO, pairDist(pdb, iN, iO))
dE <- dE[!is.na(dE)]
2017-09-12 20:09:20 +00:00
2017-10-22 01:01:29 +00:00
# histograms ...
2017-09-12 20:09:20 +00:00
brk=seq(2.4, 4.0, 0.1)
hH <- hist(dH,
freq=FALSE,
breaks=brk,
col="#DD005550",
xlab="(N,O) distance (Å)",
ylab="Density",
ylim=c(0,4),
main="Helix and Sheet H-bond lengths")
hE <- hist(dE,
freq=FALSE,
breaks=brk,
col="#00AA7060",
add=TRUE)
legend('topright',
c(paste("alpha (N = ", sum(hH$counts), ")"),
paste("beta (N = ", sum(hE$counts), ")")),
fill = c("#DD005550", "#00AA7060"), bty = 'n',
border = NA,
inset = 0.1)
2017-10-22 01:01:29 +00:00
# It looks more and more that the distribution is indeed different. Our sample
# is large, but derives from a single protein. To do database scale statistics,
# we should look at many more proteins. To give you a sense of how, let's do
# this for just ten proteins, randomly selected from non-homologous,
# high-resolution, single domain structures. I have provided a utility function
# for such a selection (details beyond the scope of this project).
myPDBs <- selectPDBrep(10)
# My selection is "2OVJ", "1HQS", "3BON", "4JZX", "3BQ3", "2IUM", "2C9E",
# "4X1F", "2V3I", "3GE3". Yours will be different.
# We are loading the files online - don't do this if you have bandwidth
# limitations.
dH <- c() # collect all helix H-bonds here
dE <- c() # collect all sheet H-bonds here
for (i in seq_along(myPDBs)) {
pdb <- read.pdb(myPDBs[i])
# helices...
iN <- ssSelect(pdb, ssType = c("helix"), myElety = "N")
iO <- ssSelect(pdb, ssType = c("helix"), myElety = "O")
x <- filterHB(pdb, iN, iO, pairDist(pdb, iN, iO))
dH <- c(dH, x[!is.na(x)])
# sheets
iN <- ssSelect(pdb, ssType = c("sheet"), myElety = "N")
iO <- ssSelect(pdb, ssType = c("sheet"), myElety = "O")
x <- filterHB(pdb, iN, iO, pairDist(pdb, iN, iO))
dE <- c(dE, x[!is.na(x)])
2017-09-12 20:09:20 +00:00
}
2017-10-22 01:01:29 +00:00
# Inspect the results
length(dH) # 4415 (your numbers are different, but it should be a lot)
length(dE) # 262
2017-09-12 20:09:20 +00:00
brk=seq(2.0, 4.0, 0.1)
hH <- hist(dH,
freq=FALSE,
breaks=brk,
col="#DD005550",
xlab="(N,O) distance (Å)",
ylab="Density",
ylim=c(0,4),
2017-10-22 01:01:29 +00:00
cex.main = 0.8,
main="Helix and Sheet H-bond lengths (10 representative structures)")
2017-09-12 20:09:20 +00:00
hE <- hist(dE,
freq=FALSE,
breaks=brk,
col="#00AA7060",
add=TRUE)
legend('topright',
c(paste("alpha (N = ", sum(hH$counts), ")"),
paste("beta (N = ", sum(hE$counts), ")")),
fill = c("#DD005550", "#00AA7060"), bty = 'n',
border = NA,
inset = 0.1)
2017-10-22 01:01:29 +00:00
# Task:
# Why do you think these distributions are different?
# At what distance do you think H-bonds have the lowest energy?
# For alpha-helices? For beta-strands?
2017-09-12 20:09:20 +00:00
# [END]