2017-09-12 20:09:20 +00:00
# RPR-SX-PDB.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-SX-PDB unit.
#
2019-01-08 07:11:25 +00:00
# Version: 1.1
2017-09-12 20:09:20 +00:00
#
2019-01-08 07:11:25 +00:00
# Date: 2017 10 - 2019 01
2017-09-12 20:09:20 +00:00
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
2019-01-08 07:11:25 +00:00
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
2017-10-22 01:01:29 +00:00
# 1.0 First live version, completely refactores 2016 code
# with remarkable speed gains. Added section on x, y, z
# (density) plots.
2017-09-12 20:09:20 +00:00
# 0.1 First code copied from 2016 material.
#
# TODO:
2017-11-12 09:38:15 +00:00
# Confirm that SS residue numbers are indices
2018-01-28 23:23:21 +00:00
# Set task seed from student number
2017-09-12 20:09:20 +00:00
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
2017-10-22 01:01:29 +00:00
#
2017-09-12 20:09:20 +00:00
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
2017-10-22 01:01:29 +00:00
#
2017-09-12 20:09:20 +00:00
# ==============================================================================
2017-10-29 03:05:53 +00:00
2017-10-22 01:01:29 +00:00
#TOC> ==========================================================================
2019-01-08 07:11:25 +00:00
#TOC>
#TOC> Section Title Line
#TOC> ----------------------------------------------------------
#TOC> 1 Introduction to the bio3D package 61
#TOC> 2 A Ramachandran plot 152
#TOC> 3 Density plots 228
#TOC> 3.1 Density-based colours 242
#TOC> 3.2 Plotting with smoothScatter() 261
#TOC> 3.3 Plotting hexbins 276
#TOC> 3.4 Plotting density contours 304
#TOC> 3.4.1 ... as overlay on a colored grid 337
#TOC> 3.4.2 ... as filled countour 354
#TOC> 3.4.3 ... as a perspective plot 385
#TOC> 4 cis-peptide bonds 403
#TOC> 5 H-bond lengths 418
#TOC>
2017-10-22 01:01:29 +00:00
#TOC> ==========================================================================
2017-09-12 20:09:20 +00:00
# In this example of protein structure interpretation, we ...
# - load the library "bio3D" which supports work with
# protein structure files,
# - explore some elementary functions of the library
2017-10-22 01:01:29 +00:00
# - explore plotting of density values with scatterplots
2017-09-12 20:09:20 +00:00
# - assemble a script to see whether H-bond lengths systematically differ
# between alpha-helical and beta-sheet structures.
2017-10-22 01:01:29 +00:00
# = 1 Introduction to the bio3D package ===================================
2019-01-08 07:11:25 +00:00
if ( ! requireNamespace ( " bio3d" , quietly = TRUE ) ) {
2017-10-29 03:05:53 +00:00
install.packages ( " bio3d" )
2017-09-12 20:09:20 +00:00
}
2017-10-29 03:05:53 +00:00
# Package information:
# library(help = bio3d) # basic information
# browseVignettes("bio3d") # available vignettes
# data(package = "bio3d") # available datasets
2017-09-12 20:09:20 +00:00
# bio3d can load molecules directly from the PDB servers, you don't _have_ to
# store them locally, but you could.
#
2017-10-22 01:01:29 +00:00
# But before you _load_ a file, have a look what such a file contains. I have
# packaged the 1BM8 file into the project:
file.show ( " ./data/1BM8.pdb" )
2017-09-12 20:09:20 +00:00
# Have a look at the header section, the atom records, the coordinate records
2017-10-22 01:01:29 +00:00
# etc.
2017-09-12 20:09:20 +00:00
#
2017-10-22 01:01:29 +00:00
# Task: Answer the following questions:
#
# What is the resolution of the structure?
# Is the first residue in the SEQRES section also the first residue
# with an ATOM record?
# How many water molecules does the structure contain?
# Can you explain REMARK 525 regarding HOH 459 and HOH 473?
# Are all atoms of the N-terminal residue present?
# Are all atoms of the C-terminal residue present?
2017-09-12 20:09:20 +00:00
2019-01-08 07:11:25 +00:00
apses <- bio3d :: read.pdb ( " 1bm8" ) # load a molecule directly from the PDB via
# the Internet.
2017-09-12 20:09:20 +00:00
# check what we have:
apses
# what is this actually?
str ( apses )
# bio3d's pdb objects are simple lists. Great! You know lists!
2017-10-22 01:01:29 +00:00
# You see that there is a list element called $atom which is a data frame in
# which the columns arevectors of the same length - namely the number of atoms
# in the structure file. And there is a matrix of (x, y, z) triplets called xyz.
# And there is a vector that holds sequence, and two tables called helix and
# sheet. Let's pull out a few values to confirm how selection and subsetting
# works here:
2017-09-12 20:09:20 +00:00
# selection by atom ...
i <- 5
apses $ atom [i , ]
apses $ atom [i , c ( " x" , " y" , " z" ) ] # here we are selecting with column names!
apses $ xyz [c ( i * 3 -2 , i * 3 -1 , i * 3 ) ] # here we are selcting with row numbers
# all atoms of a residue ...
2017-10-22 01:01:29 +00:00
i <- 48
2017-09-12 20:09:20 +00:00
apses $ atom [apses $ atom [ , " resno" ] == i , ]
# sequence of the first ten residues
2017-10-22 01:01:29 +00:00
apses $ seqres [1 : 10 ] # the "A"s here identify chain "A"
2017-09-12 20:09:20 +00:00
# Can we convert this to one letter code?
2019-01-08 07:11:25 +00:00
bio3d :: aa321 ( apses $ seqres [1 : 10 ] )
2017-09-12 20:09:20 +00:00
# Lets get the implicit sequence:
2019-01-08 07:11:25 +00:00
bio3d :: aa321 ( ( apses $ atom $ resid [apses $ calpha ] ) [1 : 10 ] )
# Do you understand this code?
2017-09-12 20:09:20 +00:00
# Do explicit and implicit sequence have the same length?
length ( apses $ seqres )
length ( apses $ atom $ resid [apses $ calpha ] )
# Are the sequences the same?
sum ( apses $ seqres == apses $ atom $ resid [apses $ calpha ] )
# get a list of all CA atoms of arginine residues
sel <- apses $ atom $ resid == " ARG" & apses $ atom $ elety == " CA"
apses $ atom [sel , c ( " eleno" , " elety" , " resid" , " chain" , " resno" , " insert" ) ]
# The introduction to bio3d tutorial at
# http://thegrantlab.org/bio3d/tutorials/structure-analysis
# has the following example:
2019-01-08 07:11:25 +00:00
bio3d :: plot.bio3d ( apses $ atom $ b [apses $ calpha ] ,
sse = apses ,
typ = " l" ,
ylab = " B-factor" )
2017-09-12 20:09:20 +00:00
# Good for now. Let's do some real work.
2017-10-22 01:01:29 +00:00
# = 2 A Ramachandran plot =================================================
2017-09-12 20:09:20 +00:00
2017-10-22 01:01:29 +00:00
# Calculate a Ramachandran plot for the structure. The torsion.pdb() function
# calculates all dihedral angles for backbone and sidechain bonds, NA where
# the bond does not exist in an amino acid.
2019-01-08 07:11:25 +00:00
tor <- bio3d :: torsion.pdb ( apses )
2017-10-22 01:01:29 +00:00
plot ( tor $ phi , tor $ psi ,
xlim = c ( -180 , 180 ) , ylim = c ( -180 , 180 ) ,
main = " Ramachandran plot for 1BM8" ,
xlab = expression ( phi ) ,
ylab = expression ( psi ) )
abline ( h = 0 , lwd = 0.5 , col = " #00000044" )
abline ( v = 0 , lwd = 0.5 , col = " #00000044" )
2017-09-12 20:09:20 +00:00
# As you can see, there are a number of points in the upper-right
# quadrant of the plot. This combination of phi-psi angles defines
# the conformation of a left-handed alpha helix and is generally
# only observed for glycine residues. Let's replot the data, but
# color the points for glycine residues differently. First, we
# get a vector of glycine residue indices in the structure:
2019-01-08 07:11:25 +00:00
mySeq <- bio3d :: pdbseq ( apses )
2017-09-12 20:09:20 +00:00
# Explore the result object and extract the indices of GLY resiues.
2017-10-22 01:01:29 +00:00
mySeq == " G"
which ( mySeq == " G" )
iGly <- which ( mySeq == " G" )
2017-09-12 20:09:20 +00:00
# Now plot all non-gly residues.
# Remember: negative indices exclude items from a vector
2017-10-22 01:01:29 +00:00
plot ( tor $ phi [ - iGly ] , tor $ psi [ - iGly ] ,
xlim = c ( -180 , 180 ) , ylim = c ( -180 , 180 ) ,
main = " Ramachandran plot for 1BM8" ,
xlab = expression ( phi ) ,
ylab = expression ( psi ) )
abline ( h = 0 , lwd = 0.5 , col = " #00000044" )
abline ( v = 0 , lwd = 0.5 , col = " #00000044" )
2017-09-12 20:09:20 +00:00
# Now plot GLY only, but with green dots:
points ( tor $ phi [iGly ] , tor $ psi [iGly ] , pch = 21 , cex = 0.9 , bg = " #00CC00" )
# As you see, four residues in the upper-right quadrant are
# not glycine. But what residues are these? Is there an
# error in our script? Let's get their coordinate records:
2017-10-22 01:01:29 +00:00
# subset CA records
CA <- apses $ atom [apses $ calpha , c ( " eleno" , " elety" , " resid" , " chain" , " resno" ) ]
# get index of outliers
2017-09-12 20:09:20 +00:00
iOutliers <- which ( tor $ phi > 30 & tor $ phi < 90 &
tor $ psi > 0 & tor $ psi < 90 )
2017-10-22 01:01:29 +00:00
# cbind records together
( dat <- cbind ( CA [iOutliers , ] ,
phi = tor $ phi [iOutliers ] ,
psi = tor $ psi [iOutliers ] ) )
# remove the glycine ...
( dat <- dat [dat $ resid != " GLY" , ] )
2017-09-12 20:09:20 +00:00
# let's add the residue numbers to the plot with the text function:
for ( i in 1 : nrow ( dat ) ) {
points ( dat $ phi [i ] , dat $ psi [i ] , pch = 21 , cex = 0.9 , bg = " #CC0000" )
text ( dat $ phi [i ] ,
dat $ psi [i ] ,
2019-01-08 07:11:25 +00:00
labels = sprintf ( " %s%d" , bio3d :: aa321 ( dat $ resid [i ] ) , dat $ resno [i ] ) ,
2017-09-12 20:09:20 +00:00
pos = 4 ,
offset = 0.4 ,
cex = 0.7 )
}
2017-10-22 01:01:29 +00:00
# You can check the residues in Chimera. Is there anything unusual about these
# residues?
# = 3 Density plots =======================================================
# Such x, y scatter-plots of data that is sampled from a distribution can tell
# us a lot about what shapes the distribution. The distribution is governed by
# the free energy of the phi-psi landscape in folded proteins, since folded
# proteins generally minimize the free energy of their conformations. We observe
# empirically, from comparing frequency statistics and mutation experiments,
# that this generall follows a Boltzmann distribution, where the free energy
# changes we observe in experments that change one conformation into another are
# proportional to the log-ratio of the number of times we observe each
# observation in the protein structure database (after correcting for
# observation bias). The proper way to visualize such 2D landscapes is with
# contour plots.
# == 3.1 Density-based colours =============================================
# A first approximation to scatterplots that visualize the density of the
# underlying distribution is coloring via the densCols() function.
? densCols
iNA <- c ( which ( is.na ( tor $ phi ) ) , which ( is.na ( tor $ psi ) ) )
phi <- tor $ phi [ - iNA ]
psi <- tor $ psi [ - iNA ]
plot ( phi , psi ,
xlim = c ( -180 , 180 ) , ylim = c ( -180 , 180 ) ,
col = densCols ( phi , psi ) ,
pch = 20 , cex = 2 ,
main = " Ramachandran plot for 1BM8" ,
xlab = expression ( phi ) ,
ylab = expression ( psi ) )
abline ( h = 0 , lwd = 0.5 , col = " #00000044" )
abline ( v = 0 , lwd = 0.5 , col = " #00000044" )
# == 3.2 Plotting with smoothScatter() =====================================
# A better way, that spreads out the underlying density is smoothScatter()
smoothScatter ( phi , psi )
smoothScatter ( phi , psi ,
xlim = c ( -180 , 180 ) , ylim = c ( -180 , 180 ) ,
col = " #0033BB33" ,
pch = 3 , cex = 0.6 ,
main = " Ramachandran plot for 1BM8" ,
xlab = expression ( phi ) ,
ylab = expression ( psi ) )
abline ( h = 0 , lwd = 0.5 , col = " #00000044" )
abline ( v = 0 , lwd = 0.5 , col = " #00000044" )
# == 3.3 Plotting hexbins ==================================================
# If we wish to approximate values in a histogram-like fashion, we can use
# hexbin()
2019-01-08 07:11:25 +00:00
if ( ! requireNamespace ( " hexbin" , quietly = TRUE ) ) {
2017-10-22 01:01:29 +00:00
install.packages ( " hexbin" )
}
2017-10-29 03:05:53 +00:00
# Package information:
# library(help = hexbin) # basic information
# browseVignettes("hexbin") # available vignettes
# data(package = "hexbin") # available datasets
2017-10-22 01:01:29 +00:00
myColorRamp <- colorRampPalette ( c ( " #EEEEEE" ,
" #3399CC" ,
" #2266DD" ) )
2019-01-08 07:11:25 +00:00
hexbin :: gplot.hexbin ( hexbin :: hexbin ( phi , psi , xbins = 10 ) ,
colramp = myColorRamp ,
main = " phi-psi Density Bins for 1BM8" ,
xlab = expression ( phi ) ,
ylab = expression ( psi ) )
# Note:
# Had we loaded hexbin with library(hexbin), the plot function would have
# been dispatched by the plot() generic, and we could simply have written:
# plot(hexbin(phi, psi, xbins = 10), ...
2017-10-22 01:01:29 +00:00
# == 3.4 Plotting density contours =========================================
# The best way to handle such data is provided by the function contour():
? contour
# Contour plots are not produced along the haphazardly sampled values of a data
# set, but on a regular grid. This means, we need to convert observed values
# into estimated densities. Density estimation is an important topic for
# exploratory data analysis, base R has the density() function for 1D
# distributions. But for 2D data like or phi-psi plots, we need a function from
# the MASS package: kde2d()
2019-01-08 07:11:25 +00:00
if ( ! requireNamespace ( " MASS" , quietly = TRUE ) ) {
2017-10-22 01:01:29 +00:00
install.packages ( " MASS" )
}
2017-10-29 03:05:53 +00:00
# Package information:
# library(help = MASS) # basic information
# browseVignettes("MASS") # available vignettes
# data(package = "MASS") # available datasets
2017-10-22 01:01:29 +00:00
2019-01-08 07:11:25 +00:00
? MASS :: kde2d
dPhiPsi <- MASS :: kde2d ( phi , psi ,
n = 60 ,
lims = c ( -180 , 180 , -180 , 180 ) )
2017-10-22 01:01:29 +00:00
str ( dPhiPsi )
# This is a list, with gridpoints in x and y, and the estimated densities in z.
# Generic plot with default parameters
contour ( dPhiPsi )
2019-01-08 07:11:25 +00:00
# === 3.4.1 ... as overlay on a colored grid
2017-10-22 01:01:29 +00:00
image ( dPhiPsi ,
col = myColorRamp ( 100 ) ,
main = " Ramachandran plot for 1BM8" ,
xlab = expression ( phi ) ,
ylab = expression ( psi ) )
contour ( dPhiPsi , col = " royalblue" ,
add = TRUE ,
method = " edge" ,
nlevels = 10 ,
lty = 2 )
points ( phi , psi , col = " #00338866" , pch = 3 , cex = 0.7 )
abline ( h = 0 , lwd = 0.5 , col = " #00000044" )
abline ( v = 0 , lwd = 0.5 , col = " #00000044" )
2019-01-08 07:11:25 +00:00
# === 3.4.2 ... as filled countour
2017-10-22 01:01:29 +00:00
filled.contour ( dPhiPsi ,
xlim = c ( -180 , 180 ) , ylim = c ( -180 , 180 ) ,
nlevels = 10 ,
color.palette = myColorRamp ,
main = " Ramachandran plot for 1BM8" ,
xlab = expression ( phi ) ,
ylab = expression ( psi ) )
# Note: we can pass additional plotting and overlay commands to the counter plot
# in a block of expressions passed via the plot.axes parameter:
filled.contour ( dPhiPsi ,
xlim = c ( -180 , 180 ) , ylim = c ( -180 , 180 ) ,
nlevels = 10 ,
color.palette = myColorRamp ,
main = " Ramachandran plot for 1BM8" ,
xlab = expression ( phi ) ,
ylab = expression ( psi ) ,
plot.axes = {
contour ( dPhiPsi , col = " #00000044" ,
add = TRUE ,
method = " edge" ,
nlevels = 10 ,
lty = 2 )
points ( phi , psi , col = " #00338866" , pch = 3 , cex = 0.7 )
abline ( h = 0 , lwd = 0.5 , col = " #00000044" )
abline ( v = 0 , lwd = 0.5 , col = " #00000044" )
} )
2019-01-08 07:11:25 +00:00
# === 3.4.3 ... as a perspective plot
2017-10-22 01:01:29 +00:00
persp ( dPhiPsi ,
xlab = " phi" ,
ylab = " psi" ,
zlab = " Density" )
persp ( dPhiPsi ,
theta = 40 ,
phi = 10 ,
col = " #99AACC" ,
xlab = " phi" ,
ylab = " psi" ,
zlab = " Density" )
# = 4 cis-peptide bonds ===================================================
2017-09-12 20:09:20 +00:00
# Are there any cis-peptide bonds in the structure?
tor $ omega
#... gives us a quick answer. But wait - what values do we
2017-10-22 01:01:29 +00:00
# expect? And why are the values so different, ranging from -180° to 180°?
2017-09-12 20:09:20 +00:00
# Consider this plot: what am I doing here and why?
2017-10-22 01:01:29 +00:00
om <- c ( 360 + tor $ omega [tor $ omega < 0 ] ,
tor $ omega [tor $ omega >= 0 ] )
hist ( om , xlim = c ( 0 , 360 ) )
2017-09-12 20:09:20 +00:00
abline ( v = 180 , col = " red" )
2018-01-28 23:23:21 +00:00
# Note: a cis-peptide bond will have an omega torsion angle around 0°
2017-09-12 20:09:20 +00:00
2017-10-22 01:01:29 +00:00
# = 5 H-bond lengths ======================================================
2017-09-12 20:09:20 +00:00
# Let's do something a little less trivial and compare
# backbone H-bond lengths between helices and strands.
2017-10-22 01:01:29 +00:00
# Secondary structure is defined in the bio3d object's ...$helix and ...$strand
# list elements.
str ( apses )
2017-09-12 20:09:20 +00:00
# We need to
2017-10-22 01:01:29 +00:00
# - collect all N atoms in alpha helices resp.
# beta strands;
# - collect all O atoms in alpha helices resp.
2017-09-12 20:09:20 +00:00
# beta strands;
# - fetch the atom coordinates;
2017-10-22 01:01:29 +00:00
# - calculate all N, O distances;
2017-09-12 20:09:20 +00:00
# - filter them for distances that indicate H-bonds; and,
# - plot the results.
2017-10-22 01:01:29 +00:00
# Secondary structure can either be obtained from definitions contained in most
# PDB files, or by running the DSSP algorithm IF(!) you have it installed on
# your machine. See the dssp() function of bio3d for instructions how to obtain
# and install DSSP and STRIDE. This is highly recommended for "real" work with
# structure coordinate files. The 1BM8 file contains secondary structure
# definitions:
2017-09-12 20:09:20 +00:00
apses $ helix
apses $ sheet
2017-10-22 01:01:29 +00:00
# A function to collect atom indices for particular type of secondary structure
ssSelect <- function ( PDB , myChain = " A" , ssType , myElety ) {
# Function to retrieve specified atom types from specified secondary
# structure types in a PDB object.
# Parameters:
# PDB A bio3D PDB object
# myChain chr The chain to use. Default: chain "A".
# ssType chr A vector of keywords "helix" and/or "sheet"
# myElety chr A vector of $eletype atom types to return
# Value: num Indices of the matching atom rows in PDB$atom
# Build a vector of $resno numbers
starts <- numeric ( )
ends <- numeric ( )
if ( " helix" %in% ssType ) {
sel <- PDB $ helix $ chain %in% myChain
starts <- c ( starts , PDB $ helix $ start [sel ] )
ends <- c ( ends , PDB $ helix $ end [sel ] )
2017-09-12 20:09:20 +00:00
}
2017-10-22 01:01:29 +00:00
if ( " sheet" %in% ssType ) {
sel <- PDB $ sheet $ chain %in% myChain
starts <- c ( starts , PDB $ sheet $ start [sel ] )
ends <- c ( ends , PDB $ sheet $ end [sel ] )
}
myResno <- numeric ( )
for ( i in seq_along ( starts ) ) {
myResno <- c ( myResno , starts [i ] : ends [i ] )
2017-09-12 20:09:20 +00:00
}
2017-10-22 01:01:29 +00:00
# get id's from PDB
2017-09-12 20:09:20 +00:00
2019-01-08 07:11:25 +00:00
x <- bio3d :: atom.select ( PDB ,
string = " protein" ,
type = " ATOM" ,
chain = myChain ,
resno = myResno ,
elety = myElety )
2017-09-12 20:09:20 +00:00
2017-10-22 01:01:29 +00:00
return ( x $ atom )
}
2017-09-12 20:09:20 +00:00
2017-10-22 01:01:29 +00:00
# Example:
ssSelect ( apses , ssType = " sheet" , myElety = " N" )
ssSelect ( apses , ssType = " sheet" , myElety = " O" )
2017-09-12 20:09:20 +00:00
2017-10-22 01:01:29 +00:00
# That looks correct: O atoms should be stored three index position after N: the
# sequence of atoms in a PDB file is usually N, CA, C, O ... followed by the
# side chain coordinates.
2017-09-12 20:09:20 +00:00
2017-10-22 01:01:29 +00:00
# Now to extract the coordinates and calculate distances. Our function needs to
# take the PDB object and two vectors of atom indices as argument, and return a
# vector of pair-distances (actually dist.xyz() returns a matrix).
2017-09-12 20:09:20 +00:00
2017-10-22 01:01:29 +00:00
pairDist <- function ( PDB , a , b ) {
# Calculate pairwise distances between atoms indicated by a and b
# Parameters:
# PDB A bio3D PDB object
# a int A vector of atom indexes
# b int A vector of atom indexes
# Value: num matrix of Euclidian distances between the atoms given in a, b.
# There are as many rows as atoms in a, as many columns as
# atoms in b.
2017-09-12 20:09:20 +00:00
2017-10-22 01:01:29 +00:00
dMat <- numeric ( )
if ( length ( a ) > 0 && length ( b ) > 0 ) {
2017-09-12 20:09:20 +00:00
2017-10-22 01:01:29 +00:00
A <- PDB $ atom [a , c ( " x" , " y" , " z" ) ]
B <- PDB $ atom [b , c ( " x" , " y" , " z" ) ]
2019-01-08 07:11:25 +00:00
dMat <- bio3d :: dist.xyz ( A , B )
2017-09-12 20:09:20 +00:00
}
2017-10-22 01:01:29 +00:00
return ( dMat )
2017-09-12 20:09:20 +00:00
}
2017-10-22 01:01:29 +00:00
# Let's see if this looks correct. Let's look at all the pairwise distances
# between N and O atoms in both types of secondary structure:
iN <- ssSelect ( apses , ssType = c ( " helix" , " sheet" ) , myElety = " N" )
iO <- ssSelect ( apses , ssType = c ( " helix" , " sheet" ) , myElety = " O" )
x <- pairDist ( apses , iN , iO )
hist ( x ,
breaks = 80 ,
col = " lavenderblush" ,
main = " " ,
xlab = " N-O distances (Å)" )
# Since we are collecting distance from all secondary structure elements, we
# are just seing a big peak of (meaningless) long-distance separations. We
# need to zoom in on the shorter distances, in which we expect
2017-09-12 20:09:20 +00:00
# hydrogen bonds:
2017-10-22 01:01:29 +00:00
hist ( x [x < 4.2 ] , # restrict to N-O distance less than 4.2 Å long
breaks = seq ( 2.0 , 4.2 , 0.1 ) ,
xlim = c ( 2.0 , 4.2 ) ,
col = " lavenderblush" ,
main = " " ,
xlab = " N-O distances (Å)" )
2017-09-12 20:09:20 +00:00
# There is a large peak at about 2.2Å, and another
# large peak above 3.5Å. But these are not typical hydrogen
# bond distances! Rather these are (N,O) pairs in peptide
2017-10-22 01:01:29 +00:00
# bonds, and within residues. That's not good, because these will contaminate
# our statistics.
# We need to exclude all distances between N of a residue
2017-09-12 20:09:20 +00:00
# and O of a preceeding residue, and all (N,O) pairs in the
2017-10-22 01:01:29 +00:00
# same residue. We need a function to filter distances by residue numbers. And while we are filtering, we might as well throw away the non-H bond distances too.
filterHB <- function ( PDB , iN , iO , dMat , cutoff = 3.9 ) {
# Filters distances between O(i-1) and N(i), and between N(i) and O(i)
# in a distance matrix where there is one row per N-atom and one
# column per O atom.
# Parameters:
# PDB a bio3D PDB object
# iN int a vector of N atom indexes
# iO int a vector of O atom indexes
# dMat num a distance matrix created by pairDist()
# cutoff num only return distances that are shorter than "cutoff
# Value: a distance matrix in which values that do not match the
# filter criteria have bee set to NA.
if ( length ( iN ) > 0 && length ( iO ) > 0 ) {
resN <- PDB $ atom $ resno [iN ]
resO <- PDB $ atom $ resno [iO ]
for ( i in seq_along ( resN ) ) { # for all N atoms
for ( j in seq_along ( resO ) ) { # for all O atoms
if ( dMat [i , j ] > cutoff || # if: distance > cutoff, or ...
( resN [i ] - 1 ) == resO [j ] || # distance is N(i)---O(i-1), or ...
resN [i ] == resO [j ] ) { # distance is N(i)---O(i), then:
dMat [i , j ] <- NA # set this distance to NA.
2017-09-12 20:09:20 +00:00
}
}
}
}
2017-10-22 01:01:29 +00:00
return ( dMat )
2017-09-12 20:09:20 +00:00
}
2017-10-22 01:01:29 +00:00
# Inspect the result:
hist ( filterHB ( apses , iN , iO , x ) ,
breaks = seq ( 2.0 , 4.2 , 0.1 ) ,
xlim = c ( 2.0 , 4.2 ) ,
col = " paleturquoise" ,
main = " " ,
xlab = " N-O distances (Å)" )
2017-09-12 20:09:20 +00:00
# Finally we can calculate alpha- and beta- structure
# bonds and compare them. In this section we'll explore
# different options for histogram plots.
2017-10-22 01:01:29 +00:00
# H-bonds in helices ...
iN <- ssSelect ( apses , ssType = c ( " helix" ) , myElety = " N" )
iO <- ssSelect ( apses , ssType = c ( " helix" ) , myElety = " O" )
dH <- filterHB ( apses , iN , iO , pairDist ( apses , iN , iO ) )
dH <- dH [ ! is.na ( dH ) ]
2017-09-12 20:09:20 +00:00
2017-10-22 01:01:29 +00:00
# H-bonds in sheets. (We commonly use the letter "E" to symbolize a beta
# strand or sheet, because "E" visually evokes an extended strand with
# protruding sidechains.)
iN <- ssSelect ( apses , ssType = c ( " sheet" ) , myElety = " N" )
iO <- ssSelect ( apses , ssType = c ( " sheet" ) , myElety = " O" )
dE <- filterHB ( apses , iN , iO , pairDist ( apses , iN , iO ) )
dE <- dE [ ! is.na ( dE ) ]
2017-09-12 20:09:20 +00:00
# The plain histogram functions without parameters
# give us white stacks.
hist ( dH )
# and ...
hist ( dE )
# We can see that the histrograms look different
# but that is better visualized by showing two plots
# in the same window. We use the par() function, for
# more flexible layout, look up the layout() function.
? par
? layout
opar <- par ( no.readonly = TRUE ) # store current state
par ( mfrow = c ( 2 , 1 ) ) # set graphics parameters: 2 rows, one column
# plot two histograms
hist ( dH )
hist ( dE )
# add color:
hist ( dH , col = " #DD0055" )
hist ( dE , col = " #00AA70" )
# For better comparison, plot both in the
# same window:
hist ( dH , col = " #DD0055" )
hist ( dE , col = " #00AA70" , add = TRUE )
2017-10-22 01:01:29 +00:00
# ... oops, we dind't reset the graphics parameters. You can either close the
# window, a new window will open with default parameters, or ...
2017-09-12 20:09:20 +00:00
par ( opar ) # ... reset the graphics parameters
hist ( dH , col = " #DD0055" )
hist ( dE , col = " #00AA70" , add = TRUE )
2017-10-22 01:01:29 +00:00
# We see that the leftmost column of the sheet bonds hides the helix bonds in
# that column. Not good. But we can make the colors transparent! We just need to
# add a fourth set of two hexadecimal-numbers to the #RRGGBB triplet. Lets use
# 2/3 transparent, in hexadecimal, 1/3 of 256 is x55 - i.e. an RGB triplet
# specied as #RRGGBB55 is only 33% opaque:
2017-09-12 20:09:20 +00:00
hist ( dH , col = " #DD005555" )
hist ( dE , col = " #00AA7055" , add = TRUE )
2017-10-22 01:01:29 +00:00
# To finalize the plots, let's do two more things: Explicitly define the breaks,
# to make sure they match up - otherwise they would not need to... like in this
# example:
2017-09-12 20:09:20 +00:00
hist ( dH , col = " #DD005555" )
hist ( dE [dE < 3 ] , col = " #00AA7055" , add = TRUE )
2017-10-22 01:01:29 +00:00
# Breaks are a parameter in hist() that can either be a scalar, to define how
# many columns you want, or a vector, that defines the actual breakpoints.
2017-09-12 20:09:20 +00:00
brk = seq ( 2.4 , 4.0 , 0.1 )
hist ( dH , col = " #DD005555" , breaks = brk )
hist ( dE , col = " #00AA7055" , breaks = brk , add = TRUE )
2017-10-22 01:01:29 +00:00
# The last thing to do is to think about rescaling the plot. You notice that the
# y-axis is scaled in absolute frequency (i.e. counts). That gives us some
# impression of the relative frequency, but it is of course skewed by observing
# relatively more or less of one type of secondary structure in a protein. As
# part of the hist() function we can rescale the values so that the sum over all
# is one: set the prameter freq=FALSE.
2017-09-12 20:09:20 +00:00
hist ( dH , col = " #DD005555" , breaks = brk , freq = FALSE )
hist ( dE , col = " #00AA7055" , breaks = brk , freq = FALSE , add = TRUE )
# Adding labels and legend ...
hH <- hist ( dH ,
freq = FALSE ,
breaks = brk ,
col = " #DD005550" ,
xlab = " (N,O) distance (Å)" ,
ylab = " Density" ,
ylim = c ( 0 , 4 ) ,
main = " Helix and Sheet H-bond lengths" )
hE <- hist ( dE ,
freq = FALSE ,
breaks = brk ,
col = " #00AA7060" ,
add = TRUE )
legend ( " topright" ,
c ( sprintf ( " alpha (N = %3d)" , sum ( hH $ counts ) ) ,
sprintf ( " beta (N = %3d)" , sum ( hE $ counts ) ) ) ,
fill = c ( " #DD005550" , " #00AA7060" ) , bty = ' n' ,
border = NA )
2017-10-22 01:01:29 +00:00
2017-09-12 20:09:20 +00:00
# With all the functions we have defined,
# it is easy to try this with a larger protein.
2017-10-22 01:01:29 +00:00
# 3ugj for example is VERY large.
2017-09-12 20:09:20 +00:00
pdb <- read.pdb ( " 3ugj" )
2017-10-22 01:01:29 +00:00
# helices...
iN <- ssSelect ( pdb , ssType = c ( " helix" ) , myElety = " N" )
iO <- ssSelect ( pdb , ssType = c ( " helix" ) , myElety = " O" )
dH <- filterHB ( pdb , iN , iO , pairDist ( pdb , iN , iO ) )
dH <- dH [ ! is.na ( dH ) ]
2017-09-12 20:09:20 +00:00
2017-10-22 01:01:29 +00:00
# sheets
iN <- ssSelect ( pdb , ssType = c ( " sheet" ) , myElety = " N" )
iO <- ssSelect ( pdb , ssType = c ( " sheet" ) , myElety = " O" )
dE <- filterHB ( pdb , iN , iO , pairDist ( pdb , iN , iO ) )
dE <- dE [ ! is.na ( dE ) ]
2017-09-12 20:09:20 +00:00
2017-10-22 01:01:29 +00:00
# histograms ...
2017-09-12 20:09:20 +00:00
brk = seq ( 2.4 , 4.0 , 0.1 )
hH <- hist ( dH ,
freq = FALSE ,
breaks = brk ,
col = " #DD005550" ,
xlab = " (N,O) distance (Å)" ,
ylab = " Density" ,
ylim = c ( 0 , 4 ) ,
main = " Helix and Sheet H-bond lengths" )
hE <- hist ( dE ,
freq = FALSE ,
breaks = brk ,
col = " #00AA7060" ,
add = TRUE )
legend ( ' topright' ,
c ( paste ( " alpha (N = " , sum ( hH $ counts ) , " )" ) ,
paste ( " beta (N = " , sum ( hE $ counts ) , " )" ) ) ,
fill = c ( " #DD005550" , " #00AA7060" ) , bty = ' n' ,
border = NA ,
inset = 0.1 )
2017-10-22 01:01:29 +00:00
# It looks more and more that the distribution is indeed different. Our sample
# is large, but derives from a single protein. To do database scale statistics,
# we should look at many more proteins. To give you a sense of how, let's do
# this for just ten proteins, randomly selected from non-homologous,
# high-resolution, single domain structures. I have provided a utility function
# for such a selection (details beyond the scope of this project).
myPDBs <- selectPDBrep ( 10 )
# My selection is "2OVJ", "1HQS", "3BON", "4JZX", "3BQ3", "2IUM", "2C9E",
# "4X1F", "2V3I", "3GE3". Yours will be different.
# We are loading the files online - don't do this if you have bandwidth
# limitations.
dH <- c ( ) # collect all helix H-bonds here
dE <- c ( ) # collect all sheet H-bonds here
for ( i in seq_along ( myPDBs ) ) {
pdb <- read.pdb ( myPDBs [i ] )
# helices...
iN <- ssSelect ( pdb , ssType = c ( " helix" ) , myElety = " N" )
iO <- ssSelect ( pdb , ssType = c ( " helix" ) , myElety = " O" )
x <- filterHB ( pdb , iN , iO , pairDist ( pdb , iN , iO ) )
dH <- c ( dH , x [ ! is.na ( x ) ] )
# sheets
iN <- ssSelect ( pdb , ssType = c ( " sheet" ) , myElety = " N" )
iO <- ssSelect ( pdb , ssType = c ( " sheet" ) , myElety = " O" )
x <- filterHB ( pdb , iN , iO , pairDist ( pdb , iN , iO ) )
dE <- c ( dE , x [ ! is.na ( x ) ] )
2017-09-12 20:09:20 +00:00
}
2017-10-22 01:01:29 +00:00
# Inspect the results
length ( dH ) # 4415 (your numbers are different, but it should be a lot)
length ( dE ) # 262
2017-09-12 20:09:20 +00:00
brk = seq ( 2.0 , 4.0 , 0.1 )
hH <- hist ( dH ,
freq = FALSE ,
breaks = brk ,
col = " #DD005550" ,
xlab = " (N,O) distance (Å)" ,
ylab = " Density" ,
ylim = c ( 0 , 4 ) ,
2017-10-22 01:01:29 +00:00
cex.main = 0.8 ,
main = " Helix and Sheet H-bond lengths (10 representative structures)" )
2017-09-12 20:09:20 +00:00
hE <- hist ( dE ,
freq = FALSE ,
breaks = brk ,
col = " #00AA7060" ,
add = TRUE )
legend ( ' topright' ,
c ( paste ( " alpha (N = " , sum ( hH $ counts ) , " )" ) ,
paste ( " beta (N = " , sum ( hE $ counts ) , " )" ) ) ,
fill = c ( " #DD005550" , " #00AA7060" ) , bty = ' n' ,
border = NA ,
inset = 0.1 )
2017-10-22 01:01:29 +00:00
# Task:
# Why do you think these distributions are different?
# At what distance do you think H-bonds have the lowest energy?
# For alpha-helices? For beta-strands?
2017-09-12 20:09:20 +00:00
# [END]