718 lines
22 KiB
R
718 lines
22 KiB
R
|
# RPR-SX-PDB.R
|
||
|
#
|
||
|
# Purpose: A Bioinformatics Course:
|
||
|
# R code accompanying the RPR-SX-PDB unit.
|
||
|
#
|
||
|
# Version: 0.1
|
||
|
#
|
||
|
# Date: 2017 08 28
|
||
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||
|
#
|
||
|
# Versions:
|
||
|
# 0.1 First code copied from 2016 material.
|
||
|
|
||
|
#
|
||
|
# TODO:
|
||
|
#
|
||
|
#
|
||
|
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||
|
|
||
|
# If there are portions you don't understand, use R's help system, Google for an
|
||
|
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||
|
# going on. That's not how it works ...
|
||
|
|
||
|
# ==============================================================================
|
||
|
|
||
|
# = 1 ___Section___
|
||
|
|
||
|
# In this example of protein structure interpretation, we ...
|
||
|
# - load the library "bio3d" which supports work with
|
||
|
# protein structure files,
|
||
|
# - explore some elementary functions of the library
|
||
|
# - assemble a script to see whether H-bond lengths systematically differ
|
||
|
# between alpha-helical and beta-sheet structures.
|
||
|
|
||
|
# Install bio3d only if it is not available yet, then attach it.
# requireNamespace() merely tests for the package; the unconditional library()
# call stops with an error if the package still cannot be loaded, whereas
# require() would only return FALSE.
if (!requireNamespace("bio3d", quietly = TRUE)) {
  install.packages("bio3d", dependencies = TRUE)
}
library(bio3d)
|
||
|
|
||
|
lbio3d() # ... lists the newly installed functions,
|
||
|
# they all have help files associated.
|
||
|
# More information is available in the so-called
|
||
|
# "vignettes" that are distributed with most R packages:
|
||
|
vignette("bio3d_vignettes")
|
||
|
|
||
|
# bio3d can load molecules directly from the PDB servers, you don't _have_ to
|
||
|
# store them locally, but you could.
|
||
|
#
|
||
|
# But before you _load_ a file, have a look at what such a file contains. I
# have packaged the 1BM8 file into the project:
|
||
|
file.show("./assets/1BM8.pdb")
|
||
|
|
||
|
# Have a look at the header section, the atom records, the coordinate records
|
||
|
# etc. Answer the following questions:
|
||
|
#
|
||
|
# What is the resolution of the structure?
|
||
|
# Is the first residue in the SEQRES section also the first residue
|
||
|
# with an ATOM record?
|
||
|
# How many water molecules does the structure contain?
|
||
|
# Can you explain REMARK 525 regarding HOH 459 and HOH 473?
|
||
|
# Are all atoms of the N-terminal residue present?
|
||
|
# Are all atoms of the C-terminal residue present?
|
||
|
|
||
|
apses <- read.pdb("1bm8") # load a molecule directly from PDB
|
||
|
|
||
|
# check what we have:
|
||
|
apses
|
||
|
|
||
|
# what is this actually?
|
||
|
str(apses)
|
||
|
|
||
|
# bio3d's pdb objects are simple lists. Great! You know lists!
|
||
|
|
||
|
# You see that there is a table called atom in which the elements are vectors
# of the same length - namely the number of atoms in the structure file. And
# there is a matrix of (x, y, z) triplets called xyz. And there is a vector
# that holds sequence, and two tables called helix and sheet that look a lot
# like our feature annotation tables - in fact many of the principles to store
# this structure data are similar to how we constructed our protein database.
# Let's pull out a few values to confirm how selection and subsetting works
# here:
|
||
|
|
||
|
# selection by atom ...
|
||
|
i <- 5
|
||
|
apses$atom[i,]
|
||
|
apses$atom[i, c("x", "y", "z")] # here we are selecting with column names!
|
||
|
apses$xyz[c(i*3-2, i*3-1, i*3)] # here we are selcting with row numbers
|
||
|
|
||
|
# all atoms of a residue ...
|
||
|
i <- "48" #string!
|
||
|
apses$atom[apses$atom[,"resno"] == i, ]
|
||
|
|
||
|
# sequence of the first ten residues
|
||
|
apses$seqres[1:10] # the As here identify the chain
|
||
|
|
||
|
# Can we convert this to one letter code?
|
||
|
aa321(apses$seqres[1:10])
|
||
|
|
||
|
# Lets get the implicit sequence:
|
||
|
aa321((apses$atom$resid[apses$calpha])[1:10]) # Do you understand this code?
|
||
|
|
||
|
# Do explicit and implicit sequence have the same length?
|
||
|
length(apses$seqres)
|
||
|
length(apses$atom$resid[apses$calpha])
|
||
|
|
||
|
# Are the sequences the same?
|
||
|
sum(apses$seqres == apses$atom$resid[apses$calpha])
|
||
|
|
||
|
# get a list of all CA atoms of arginine residues
|
||
|
sel <- apses$atom$resid == "ARG" & apses$atom$elety == "CA"
|
||
|
apses$atom[sel, c("eleno", "elety", "resid", "chain", "resno", "insert")]
|
||
|
|
||
|
# The introduction to bio3d tutorial at
|
||
|
# http://thegrantlab.org/bio3d/tutorials/structure-analysis
|
||
|
# has the following example:
|
||
|
plot.bio3d(apses$atom$b[apses$calpha], sse=apses, typ="l", ylab="B-factor")
|
||
|
|
||
|
# Good for now. Let's do some real work.
|
||
|
|
||
|
# ==============================================================================
|
||
|
# PART TWO: A Ramachandran plot
|
||
|
# ==============================================================================
|
||
|
|
||
|
# Calculate a Ramachandran plot for the structure
|
||
|
tor <- torsion.pdb(apses)
|
||
|
plot(tor$phi, tor$psi)
|
||
|
|
||
|
# As you can see, there are a number of points in the upper-right
|
||
|
# quadrant of the plot. This combination of phi-psi angles defines
|
||
|
# the conformation of a left-handed alpha helix and is generally
|
||
|
# only observed for glycine residues. Let's replot the data, but
|
||
|
# color the points for glycine residues differently. First, we
|
||
|
# get a vector of glycine residue indices in the structure:
|
||
|
|
||
|
sSeq <- pdbseq(apses)
|
||
|
|
||
|
# Explore the result object and extract the indices of GLY residues.
|
||
|
sSeq == "G"
|
||
|
which(sSeq == "G")
|
||
|
as.numeric(which(sSeq == "G"))
|
||
|
iGly <- as.numeric(which(sSeq == "G"))
|
||
|
|
||
|
# Now plot all non-gly residues.
|
||
|
# Remember: negative indices exclude items from a vector
|
||
|
plot(tor$phi[-iGly], tor$psi[-iGly], xlim=c(-180,180), ylim=c(-180,180))
|
||
|
|
||
|
# Now plot GLY only, but with green dots:
|
||
|
points(tor$phi[iGly], tor$psi[iGly], pch=21, cex=0.9, bg="#00CC00")
|
||
|
|
||
|
# As you see, four residues in the upper-right quadrant are
|
||
|
# not glycine. But what residues are these? Is there an
|
||
|
# error in our script? Let's get their coordinate records:
|
||
|
|
||
|
iOutliers <- which(tor$phi > 30 & tor$phi < 90 &
|
||
|
tor$psi > 0 & tor$psi < 90)
|
||
|
CA <- apses$atom[apses$calpha, c("eleno", "elety", "resid", "chain", "resno")]
|
||
|
dat <- cbind(CA[iOutliers, ], phi = tor$phi[iOutliers], psi = tor$psi[iOutliers])
|
||
|
dat
|
||
|
|
||
|
# remove the glycine from our table ...
|
||
|
dat <- dat[dat$resid != "GLY", ]
|
||
|
dat
|
||
|
|
||
|
# let's add the residue numbers to the plot with the text function:
|
||
|
# Mark the remaining (non-glycine) outliers in red and label each of them with
# its one-letter residue code and residue number.
# points() and text() are vectorized, so the whole table is drawn in one call
# each. The guard skips plotting when no rows are left in dat; a loop over
# 1:nrow(dat) would run over c(1, 0) in that case.
if (nrow(dat) > 0) {
  points(dat$phi, dat$psi, pch=21, cex=0.9, bg="#CC0000")
  text(dat$phi,
       dat$psi,
       labels = sprintf("%s%d", aa321(dat$resid), dat$resno),
       pos = 4,
       offset = 0.4,
       cex = 0.7)
}
|
||
|
|
||
|
# You can check the residues in Chimera. Is there anything unusual?
|
||
|
|
||
|
# Are there any cis-peptide bonds in the structure?
|
||
|
tor$omega
|
||
|
#... gives us a quick answer. But wait - what values do we
|
||
|
# expect? And why are the values so different?
|
||
|
# Consider this plot: what am I doing here and why?
|
||
|
x <- tor$omega[tor$omega > 0]
|
||
|
x <- c(x, 360 + tor$omega[tor$omega < 0])
|
||
|
hist(x, xlim=c(90,270))
|
||
|
abline(v=180, col="red")
|
||
|
|
||
|
|
||
|
|
||
|
# ==============================================================================
|
||
|
# PART THREE: H-bond lengths
|
||
|
# ==============================================================================
|
||
|
|
||
|
# Let's do something a little less trivial and compare
|
||
|
# backbone H-bond lengths between helices and strands.
|
||
|
#
|
||
|
# Secondary structure is defined in the list components ...$helix
|
||
|
# and ...$sheet.
|
||
|
|
||
|
# We need to
|
||
|
# - collect all residue indices for alpha helices resp.
|
||
|
# beta strands;
|
||
|
# - fetch the atom coordinates;
|
||
|
# - calculate all N, O distances using dist.xyz();
|
||
|
# - filter them for distances that indicate H-bonds; and,
|
||
|
# - plot the results.
|
||
|
|
||
|
# Secondary structure can either be obtained from
|
||
|
# definitions contained in many PDB files, or by running
|
||
|
# the DSSP algorithm IF(!) you have it installed on your
|
||
|
# machine. The 1BM8 file contains definitions
|
||
|
|
||
|
apses$helix
|
||
|
apses$sheet
|
||
|
|
||
|
|
||
|
# (1): collect the residue numbers
|
||
|
# between the segment boundaries.
|
||
|
|
||
|
# Collect the residue numbers of all helix segments.
# apses$helix holds parallel components (start, end, ...), so
# length(apses$helix) counts those components, not the helix segments.
# We therefore iterate over the segment start positions instead.
H <- numeric()          # This will contain the helix residue numbers
for (i in seq_along(apses$helix$start)) {
  H <- c(H, apses$helix$start[i]:apses$helix$end[i])
}
|
||
|
|
||
|
# Doing the same for the sheet residue numbers requires
|
||
|
# very similar code. Rather than retype the code, it is
|
||
|
# better to write a function that can do both.
|
||
|
|
||
|
getSecondary <- function(sec) {
  # Expand secondary structure segments into a vector of residue numbers.
  #
  # sec: a list (or table) with the parallel components "start" and "end",
  #      e.g. the $helix or $sheet element of a pdb object.
  # Returns the concatenation of start[i]:end[i] over all segments, in the
  # order in which the segments are listed; NULL if there are no segments.
  #
  # seq_along() (rather than 1:length()) makes an empty or missing
  # annotation yield an empty result instead of an error.
  segments <- lapply(seq_along(sec$start),
                     function(i) { sec$start[i]:sec$end[i] })
  return(unlist(segments))
}
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
# Compare ...
|
||
|
H
|
||
|
getSecondary(apses$helix)
|
||
|
|
||
|
# ... and use for strands
|
||
|
|
||
|
E <- getSecondary(apses$sheet)
|
||
|
|
||
|
|
||
|
# Now here's a problem: these numbers refer to the
|
||
|
# residue numbers as defined in the atom records. They
|
||
|
# can't be used directly to address e.g. the first, second
|
||
|
# third residue etc. since the first residue has the
|
||
|
# residue number 4...
|
||
|
apses$atom[1,]
|
||
|
|
||
|
# Therefore we need to
|
||
|
# 1: convert the numbers to strings;
|
||
|
# 2: subset the atom table for rows that contain these strings.
|
||
|
#
|
||
|
# Essentially, we don't treat the "residue numbers" as numbers,
|
||
|
# but as labels. That's fine, as long as they are unique.
|
||
|
|
||
|
# (2): fetch coordinates of N and O atoms
|
||
|
# for residues in alpha- and beta- conformation.
|
||
|
|
||
|
# For one residue, the procedure goes as follows:
|
||
|
|
||
|
res <- H[17] # pick an arbitrary alpha-helix residue to illustrate
|
||
|
res
|
||
|
res <- as.character(res)
|
||
|
res
|
||
|
|
||
|
# all atom rows for this residue
|
||
|
apses$atom[apses$atom[,"resno"] == res, ]
|
||
|
|
||
|
# add condition: row with "N" atom only
|
||
|
apses$atom[apses$atom[,"resno"] == res &
|
||
|
apses$atom[,"elety"] == "N", ]
|
||
|
|
||
|
# add column selection: "x", "y", "z"
|
||
|
apses$atom[apses$atom[,"resno"] == res &
|
||
|
apses$atom[,"elety"] == "N",
|
||
|
c("x", "y", "z")]
|
||
|
|
||
|
# convert to numbers
|
||
|
as.numeric (
|
||
|
apses$atom[apses$atom[,"resno"] == res &
|
||
|
apses$atom[,"elety"] == "N",
|
||
|
c("x", "y", "z")]
|
||
|
)
|
||
|
|
||
|
# Now we need to add this into a matrix as we iterate over the desired residues.
|
||
|
# We need to execute the procedure four times for alpha and beta Ns and Os
|
||
|
# respectively. Rather than duplicate the code four times over, we write a
|
||
|
# function. Never duplicate code, because if you need to make changes it is too
|
||
|
# easy to forget making the change consistently in all copies.
|
||
|
|
||
|
|
||
|
getAtom <- function(PDB, r, AT) {
  # Return the coordinates of one atom type for a set of residues.
  #
  # PDB: a pdb object as returned by read.pdb(); only PDB$atom with the
  #      columns "resno", "elety", "x", "y" and "z" is used.
  # r:   vector of residue numbers; they are matched as labels (strings).
  # AT:  atom type to select, e.g. "N" or "O".
  # Returns a numeric matrix with one row of (x, y, z) per element of r,
  # in the order of r (zero rows if r is empty).
  #
  # Note: the chain is not considered here, so every residue number must
  # identify exactly one atom of the requested type; otherwise the function
  # stops with an error.
  mat <- matrix(NA_real_, nrow = length(r), ncol = 3)  # preallocate result
  for (i in seq_along(r)) {
    res <- as.character(r[i])
    sel <- which(PDB$atom[,"resno"] == res &
                 PDB$atom[,"elety"] == AT)
    if (length(sel) != 1) {
      stop(sprintf("getAtom(): expected exactly one \"%s\" atom for residue %s, found %d.",
                   AT, res, length(sel)),
           call. = FALSE)
    }
    mat[i, ] <- as.numeric(PDB$atom[sel, c("x", "y", "z")])
  }
  return(mat)
}
|
||
|
|
||
|
# Now run the functions with the parameters we need...
|
||
|
H.xyz.N <- getAtom(apses, H, "N") # backbone N atoms in helix
|
||
|
H.xyz.O <- getAtom(apses, H, "O") # backbone O atoms in helix
|
||
|
E.xyz.N <- getAtom(apses, E, "N") # backbone N atoms in strand
|
||
|
E.xyz.O <- getAtom(apses, E, "O") # backbone O atoms in strand
|
||
|
|
||
|
|
||
|
# (3): calculate distances between N and O atoms to find H-bonds.
|
||
|
|
||
|
# We spent most of our effort so far just preparing our raw data for analysis.
|
||
|
# Now we can actually start measuring. bio3d provides the function dist.xyz() -
|
||
|
# but lets agree it builds character to code this ourselves.
|
||
|
|
||
|
# Consider the distance of the first (N,O) pair.
|
||
|
H.xyz.N[1,]
|
||
|
H.xyz.O[1,]
|
||
|
|
||
|
a <- H.xyz.N[1,]
|
||
|
b <- H.xyz.O[1,]
|
||
|
|
||
|
dist.xyz(a, b)
|
||
|
|
||
|
# or ...
|
||
|
sqrt( (a[1]-b[1])^2 + (a[2]-b[2])^2 + (a[3]-b[3])^2 )
|
||
|
# ... i.e. pythagoras theorem in 3D.
|
||
|
|
||
|
|
||
|
# Calculating all pairwise distances from a matrix of
|
||
|
# xyz coordinates sounds like a useful function.
|
||
|
|
||
|
PairDist.xyz <- function(xyzA, xyzB) {
  # Compute all pairwise distances between two sets of points.
  #
  # xyzA, xyzB: matrices with one (x, y, z) triplet per row.
  # Returns a numeric vector with nrow(xyzA) * nrow(xyzB) distances; the
  # rows of xyzB vary fastest. The vector is empty if either input has no
  # rows.
  #
  # Each distance is computed with dist.xyz(), which this script gets
  # from bio3d.
  nA <- NROW(xyzA)
  nB <- NROW(xyzB)
  PD <- numeric(nA * nB)   # preallocate instead of growing the vector
  k <- 0
  for (i in seq_len(nA)) {
    for (j in seq_len(nB)) {
      k <- k + 1
      PD[k] <- dist.xyz(xyzA[i,], xyzB[j,])
    }
  }
  return(PD)
}
|
||
|
|
||
|
# And see what we get:
|
||
|
D <- PairDist.xyz(H.xyz.N, H.xyz.O)
|
||
|
hist(D)
|
||
|
|
||
|
# let's zoom in on the shorter distances, in which we expect
|
||
|
# hydrogen bonds:
|
||
|
hist(D[D < 4.0], breaks=seq(2.0, 4.0, 0.1), xlim=c(2.0,4.0))
|
||
|
|
||
|
# There is a large peak at about 2.2Å, and another
|
||
|
# large peak above 3.5Å. But these are not typical hydrogen
|
||
|
# bond distances! Rather these are (N,O) pairs in peptide
|
||
|
# bonds, and within residues. That's not good, it will
|
||
|
# cause all sorts of problems with statistics later.
|
||
|
# We should exclude all distances between N of a residue
|
||
|
# and O of a preceding residue, and all (N,O) pairs in the
|
||
|
# same residue. But for this, we need to store atom type
|
||
|
# and residue information with the coordinates. Our code
|
||
|
# will get a bit bulkier. It's often hard to find a good
|
||
|
# balance between terse generic code, and code that
|
||
|
# handles special cases.
|
||
|
|
||
|
# We could do two things:
|
||
|
# - add a column with residue information to the coordinates
|
||
|
# - add a column with atom type information
|
||
|
# ... but actually we also would need chain information, and
|
||
|
# then we really have almost everything that is contained in the record
|
||
|
# in the first place.
|
||
|
|
||
|
# This suggests a different strategy: let's rewrite our function
|
||
|
# getAtom() to return indices, not coordinates, and use the indices
|
||
|
# to extract coordinates, and THEN we can add all sorts of
|
||
|
# additional constraints.
|
||
|
|
||
|
# Here we set up the function with a default chain argument
|
||
|
|
||
|
getAtomIndex <- function(PDB, V_res, elety, chain="A") {
  # Find the rows of PDB$atom that hold atoms of type "elety" in chain
  # "chain" for every residue listed in V_res. The residue numbers are
  # compared as strings. The result is a vector of row indices, ordered
  # like V_res.
  # Nb. the "alt" and "insert" fields are deliberately not tested:
  # bio3d stores NA in them where the value should be " " (a single
  # blank).

  atoms <- PDB$atom
  # chain and atom type do not depend on the residue: evaluate them once
  wanted <- atoms[,"chain"] == chain &
            atoms[,"elety"] == elety
  hits <- lapply(as.character(V_res),
                 function(label) { which(atoms[,"resno"] == label & wanted) })
  return(as.integer(unlist(hits)))
}
|
||
|
|
||
|
# test this ...
|
||
|
getAtomIndex(apses, H, "N")
|
||
|
getAtomIndex(apses, H, "O")
|
||
|
|
||
|
# That looks correct: O atoms should be stored three
|
||
|
# rows further down: the sequence of atoms in a PDB
|
||
|
# file is usually N, CA, C, O ... followed by the side
|
||
|
# chain coordinates.
|
||
|
|
||
|
# Now to extract the coordinates and calculate distances.
|
||
|
# Our function needs to take the PDB object and
|
||
|
# two vectors of atom indices as argument, and return
|
||
|
# a vector of pair-distances.
|
||
|
|
||
|
PairDist <- function(PDB, a, b) {
  # Compute all pairwise distances between two sets of atoms.
  #
  # PDB:  a pdb object; only the "x", "y", "z" columns of PDB$atom are used.
  # a, b: vectors of row indices into PDB$atom.
  # Returns a numeric vector with length(a) * length(b) distances; the
  # elements of b vary fastest. The vector is empty if a or b is empty.
  #
  # Each distance is computed with dist.xyz(), which this script gets
  # from bio3d.
  xyz <- function(k) { as.numeric(PDB$atom[k, c("x", "y", "z")]) }

  # The coordinates of b are the same for every element of a:
  # look them up once, not once per pair.
  coordsB <- lapply(b, xyz)

  PD <- numeric(length(a) * length(b))   # preallocate the result
  k <- 0
  for (i in seq_along(a)) {
    p <- xyz(a[i])
    for (q in coordsB) {
      k <- k + 1
      PD[k] <- dist.xyz(p, q)
    }
  }
  return(PD)
}
|
||
|
|
||
|
# Let's see if this looks correct:
|
||
|
|
||
|
H.N <- getAtomIndex(apses, H, "N")
|
||
|
H.O <- getAtomIndex(apses, H, "O")
|
||
|
X <- PairDist(apses, H.N, H.O)
|
||
|
hist(X[X < 4.0], breaks=seq(2.0, 4.0, 0.1), xlim=c(2.0,4.0))
|
||
|
|
||
|
# Now we are back where we started out from, but with
|
||
|
# a different logic of the function that we can easily
|
||
|
# modify to exclude (N_i, O_i-1) distances (peptide bond),
|
||
|
# and (N_i, O_i) distances (within residue).
|
||
|
|
||
|
HB <- function(PDB, a, b) {
  # Collect the distances between two sets of atoms that are short enough
  # to indicate a hydrogen bond.
  #
  # PDB:  a pdb object; the "x", "y", "z" and "resno" columns of PDB$atom
  #       are used.
  # a:    row indices into PDB$atom of the first atom set (the backbone N
  #       atoms in this script).
  # b:    row indices into PDB$atom of the second atom set (the backbone O
  #       atoms in this script).
  # Returns a vector of all pair distances below HBcutoff. A pair
  # (a[i], b[j]) is skipped if both atoms are in the same residue, or if
  # the residue of a[i] directly follows the residue of b[j] (peptide
  # bond). The result has length zero if no pair qualifies or if a or b
  # is empty.
  #
  # Each distance is computed with dist.xyz(), which this script gets
  # from bio3d.
  HBcutoff <- 4.0   # largest (N,O) distance we accept as an H-bond

  xyz <- function(k) { as.numeric(PDB$atom[k, c("x", "y", "z")]) }
  resno <- function(k) { as.numeric(PDB$atom[k, "resno"]) }

  # Coordinates and residue numbers of b are the same for every element
  # of a: look them up once. Note that the residue number of the second
  # atom must be read from b[j] (not a[j]); the two only coincide if a
  # and b happen to list the same residues in the same order.
  coordsB <- lapply(b, xyz)
  resB <- vapply(b, resno, numeric(1))

  PD <- c()
  for (i in seq_along(a)) {
    p <- xyz(a[i])
    res_i <- resno(a[i])
    for (j in seq_along(b)) {
      res_j <- resB[j]
      if (res_i != res_j + 1 &&
          res_i != res_j ) {
        d <- dist.xyz(p, coordsB[[j]])
        if (d < HBcutoff) {
          PD <- c(PD, d)
        }
      }
    }
  }
  return(PD)
}
|
||
|
|
||
|
# test this:
|
||
|
X <- HB(apses, H.N, H.O)
|
||
|
hist(X)
|
||
|
|
||
|
# ... and this looks much more like the distribution we are
|
||
|
# seeking.
|
||
|
|
||
|
# Why did we go along this detour for coding the
|
||
|
# function? The point is that there are usually
|
||
|
# several ways to use or define datastructures and
|
||
|
# functions. Which one is the best way may not be
|
||
|
# obvious until you understand the problem better.
|
||
|
# At first, we wrote a very generic function that
|
||
|
# extracts atom coordinates to be able to compute
|
||
|
# with them. This is simple and elegant. But we
|
||
|
# recognized limitations in that we could not
|
||
|
# make more sophisticated selections that we needed
|
||
|
# to reflect our biological idea of hydrogen
|
||
|
# bonds. Thus we changed our datastructure
|
||
|
# and functions to accommodate our new requirements
|
||
|
# better. You have to be flexible and able to look
|
||
|
# at a task from different angles to succeed.
|
||
|
|
||
|
# Finally we can calculate alpha- and beta- structure
|
||
|
# bonds and compare them. In this section we'll explore
|
||
|
# different options for histogram plots.
|
||
|
|
||
|
H.N <- getAtomIndex(apses, H, "N")
|
||
|
H.O <- getAtomIndex(apses, H, "O")
|
||
|
dH <- HB(apses, H.N, H.O)
|
||
|
|
||
|
E.N <- getAtomIndex(apses, E, "N")
|
||
|
E.O <- getAtomIndex(apses, E, "O")
|
||
|
dE <- HB(apses, E.N, E.O)
|
||
|
|
||
|
# The plain histogram functions without parameters
|
||
|
# give us white stacks.
|
||
|
|
||
|
hist(dH)
|
||
|
|
||
|
# and ...
|
||
|
hist(dE)
|
||
|
|
||
|
# We can see that the histograms look different
|
||
|
# but that is better visualized by showing two plots
|
||
|
# in the same window. We use the par() function, for
|
||
|
# more flexible layout, look up the layout() function.
|
||
|
?par
|
||
|
?layout
|
||
|
|
||
|
opar <- par(no.readonly=TRUE) # store current state
|
||
|
par(mfrow=c(2,1)) # set graphics parameters: 2 rows, one column
|
||
|
|
||
|
# plot two histograms
|
||
|
hist(dH)
|
||
|
hist(dE)
|
||
|
|
||
|
|
||
|
# add color:
|
||
|
hist(dH, col="#DD0055")
|
||
|
hist(dE, col="#00AA70")
|
||
|
|
||
|
|
||
|
|
||
|
# For better comparison, plot both in the
|
||
|
# same window:
|
||
|
|
||
|
hist(dH, col="#DD0055")
|
||
|
hist(dE, col="#00AA70", add=TRUE)
|
||
|
|
||
|
# ... oops, we didn't reset the graphics parameters.
|
||
|
# You can either close the window, a new window
|
||
|
# will open with default parameters, or ...
|
||
|
par(opar) # ... reset the graphics parameters
|
||
|
|
||
|
hist(dH, col="#DD0055")
|
||
|
hist(dE, col="#00AA70", add=TRUE)
|
||
|
|
||
|
# We see that the leftmost column of the sheet bonds
|
||
|
# overlaps the helix bonds. Not good. But we
|
||
|
# can make the colors transparent! We just need to
|
||
|
# add a fourth set of two hexadecimal-numbers to
|
||
|
# the #RRGGBB triplet. Lets use 2/3 transparent,
|
||
|
# in hexadecimal, 1/3 of 256 is x55 - i.e. an
|
||
|
# RGB triplet specified as #RRGGBB55 is only 33%
|
||
|
# opaque:
|
||
|
|
||
|
hist(dH, col="#DD005555")
|
||
|
hist(dE, col="#00AA7055", add=TRUE)
|
||
|
|
||
|
# To finalize the plots, let's do two more things:
|
||
|
# Explicitly define the breaks, to make sure they
|
||
|
# match up - otherwise they would not need to...
|
||
|
# see for example:
|
||
|
|
||
|
hist(dH, col="#DD005555")
|
||
|
hist(dE[dE < 3], col="#00AA7055", add=TRUE)
|
||
|
|
||
|
# Breaks are a parameter in hist() that can
|
||
|
# either be a scalar, to define how many columns
|
||
|
# you want, or a vector, that defines the actual
|
||
|
# breakpoints.
|
||
|
brk=seq(2.4, 4.0, 0.1)
|
||
|
|
||
|
hist(dH, col="#DD005555", breaks=brk)
|
||
|
hist(dE, col="#00AA7055", breaks=brk, add=TRUE)
|
||
|
|
||
|
# The last thing to do is to think about rescaling the plot.
|
||
|
# You notice that the y-axis is scaled in absolute frequency.
|
||
|
# That gives us some impression of the relative frequency,
|
||
|
# but it is of course skewed by observing relatively more
|
||
|
# or less of one type of secondary structure in a protein.
|
||
|
# As part of the hist() function we can rescale the values so
|
||
|
# that the sum over all is one: set the parameter freq=FALSE.
|
||
|
|
||
|
hist(dH, col="#DD005555", breaks=brk, freq=FALSE)
|
||
|
hist(dE, col="#00AA7055", breaks=brk, freq=FALSE, add=TRUE)
|
||
|
|
||
|
# Adding labels and legend ...
|
||
|
|
||
|
hH <- hist(dH,
|
||
|
freq=FALSE,
|
||
|
breaks=brk,
|
||
|
col="#DD005550",
|
||
|
xlab="(N,O) distance (Å)",
|
||
|
ylab="Density",
|
||
|
ylim=c(0,4),
|
||
|
main="Helix and Sheet H-bond lengths")
|
||
|
hE <- hist(dE,
|
||
|
freq=FALSE,
|
||
|
breaks=brk,
|
||
|
col="#00AA7060",
|
||
|
add=TRUE)
|
||
|
|
||
|
legend("topright",
|
||
|
c(sprintf("alpha (N = %3d)", sum(hH$counts)),
|
||
|
sprintf("beta (N = %3d)", sum(hE$counts))),
|
||
|
fill = c("#DD005550", "#00AA7060"), bty = 'n',
|
||
|
border = NA)
|
||
|
# ===========================================================
|
||
|
# With all the functions we have defined,
|
||
|
# it is easy to try this with a larger protein.
|
||
|
# 3ugj for example is VERY large. The calculation will take a few
|
||
|
# minutes:
|
||
|
|
||
|
pdb <- read.pdb("3ugj")
|
||
|
|
||
|
H <- getSecondary(pdb$helix)
|
||
|
E <- getSecondary(pdb$sheet)
|
||
|
|
||
|
H.N <- getAtomIndex(pdb, H, "N")
|
||
|
H.O <- getAtomIndex(pdb, H, "O")
|
||
|
dH <- HB(pdb, H.N, H.O)
|
||
|
|
||
|
E.N <- getAtomIndex(pdb, E, "N")
|
||
|
E.O <- getAtomIndex(pdb, E, "O")
|
||
|
dE <- HB(pdb, E.N, E.O)
|
||
|
|
||
|
brk=seq(2.4, 4.0, 0.1)
|
||
|
|
||
|
hH <- hist(dH,
|
||
|
freq=FALSE,
|
||
|
breaks=brk,
|
||
|
col="#DD005550",
|
||
|
xlab="(N,O) distance (Å)",
|
||
|
ylab="Density",
|
||
|
ylim=c(0,4),
|
||
|
main="Helix and Sheet H-bond lengths")
|
||
|
hE <- hist(dE,
|
||
|
freq=FALSE,
|
||
|
breaks=brk,
|
||
|
col="#00AA7060",
|
||
|
add=TRUE)
|
||
|
|
||
|
legend('topright',
|
||
|
c(paste("alpha (N = ", sum(hH$counts), ")"),
|
||
|
paste("beta (N = ", sum(hE$counts), ")")),
|
||
|
fill = c("#DD005550", "#00AA7060"), bty = 'n',
|
||
|
border = NA,
|
||
|
inset = 0.1)
|
||
|
|
||
|
# It looks more and more that the distribution is
|
||
|
# indeed different. Our sample is large, but derives
|
||
|
# from a single protein.
|
||
|
# To do database scale statistics, we should look
|
||
|
# at many more proteins. To give you a sense of how,
|
||
|
# let's do this for just a handful of proteins, taken from
|
||
|
# the architecture level of the CATH database for
|
||
|
# mixed alpha-beta proteins (see:
|
||
|
# http://www.cathdb.info/browse/browse_hierarchy_tree):
|
||
|
|
||
|
# Table of the structures to analyze: one row per structure, holding the PDB
# ID and the identifier of the chain we use. The table is a character matrix
# that is built in one step; add further structures as additional rows.
PDBarchitectures <- matrix(c("3A4R", "A",
                             "1EWF", "A",
                             "2VXN", "A",
                             "1I3K", "A",
                             "1C0P", "A",
                             "3QVP", "A",
                             "1J5U", "A",
                             "2IMH", "A",
                             "3NVS", "A",
                             "1UD9", "A",
                             "1XKN", "A",
                             "1OZN", "A",
                             "2DKJ", "A"),
                           ncol = 2,
                           byrow = TRUE,
                           dimnames = list(NULL, c("ID", "chain")))
|
||
|
|
||
|
dH <- c()
|
||
|
dE <- c()
|
||
|
|
||
|
for (i in 1:nrow(PDBarchitectures)) {
|
||
|
pdb <- read.pdb(PDBarchitectures[i,1])
|
||
|
chain <- PDBarchitectures[i,2]
|
||
|
H <- getSecondary(pdb$helix)
|
||
|
H.N <- getAtomIndex(pdb, H, "N", chain)
|
||
|
H.O <- getAtomIndex(pdb, H, "O", chain)
|
||
|
dH <- c(dH, HB(pdb, H.N, H.O))
|
||
|
|
||
|
E <- getSecondary(pdb$sheet)
|
||
|
E.N <- getAtomIndex(pdb, E, "N", chain)
|
||
|
E.O <- getAtomIndex(pdb, E, "O", chain)
|
||
|
dE <- c(dE, HB(pdb, E.N, E.O))
|
||
|
}
|
||
|
|
||
|
brk=seq(2.0, 4.0, 0.1)
|
||
|
|
||
|
hH <- hist(dH,
|
||
|
freq=FALSE,
|
||
|
breaks=brk,
|
||
|
col="#DD005550",
|
||
|
xlab="(N,O) distance (Å)",
|
||
|
ylab="Density",
|
||
|
ylim=c(0,4),
|
||
|
main="Helix and Sheet H-bond lengths")
|
||
|
hE <- hist(dE,
|
||
|
freq=FALSE,
|
||
|
breaks=brk,
|
||
|
col="#00AA7060",
|
||
|
add=TRUE)
|
||
|
|
||
|
legend('topright',
|
||
|
c(paste("alpha (N = ", sum(hH$counts), ")"),
|
||
|
paste("beta (N = ", sum(hE$counts), ")")),
|
||
|
fill = c("#DD005550", "#00AA7060"), bty = 'n',
|
||
|
border = NA,
|
||
|
inset = 0.1)
|
||
|
|
||
|
# Why do you think these distributions are different?
|
||
|
# At what distance do you think H-bonds have the lowest energy?
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
# = 1 Tasks
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
# [END]
|