bch441-work-abc-units/RPR-SX-PDB.R
2017-09-12 16:09:20 -04:00

718 lines
22 KiB
R

# RPR-SX-PDB.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-SX-PDB unit.
#
# Version: 0.1
#
# Date: 2017 08 28
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 0.1 First code copied from 2016 material.
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
# ==============================================================================
# = 1 ___Section___
# In this example of protein structure interpretation, we ...
# - load the library "bio3D" which supports work with
# protein structure files,
# - explore some elementary functions of the library
# - assemble a script to see whether H-bond lengths systematically differ
# between alpha-helical and beta-sheet structures.
if(!require(bio3d)) {
install.packages("bio3d", dependencies=TRUE)
library(bio3d)
}
lbio3d() # ... lists the newly installed functions,
# they all have help files associated.
# More information is available in the so-called
# "vignettes" that are distributed with most R packages:
vignette("bio3d_vignettes")
# bio3d can load molecules directly from the PDB servers, you don't _have_ to
# store them locally, but you could.
#
# But before you _load_ a file, have a look what such a file contains. I have packaged the 1BM8 file into the project:
file.show("./assets/1BM8.pdb")
# Have a look at the header section, the atom records, the coordinate records
# etc. Answer the following questions:
#
# What is the resolution of the structure?
# Is the first residue in the SEQRES section also the first residue
# with an ATOM record?
# How many water molecules does the structure contain?
# Can you explain REMARK 525 regarding HOH 459 and HOH 473?
# Are all atoms of the N-terminal residue present?
# Are all atoms of the C-terminal residue present?
apses <- read.pdb("1bm8") # load a molecule directly from PDB
# check what we have:
apses
# what is this actually?
str(apses)
# bio3d's pdb objects are simple lists. Great! You know lists!
# You see that there is a table called atom in which the elements are vectors of the same length - namely the number of atoms in the structure file. And there is a matrix of (x, y, z) triplets called xyz. And there is a vector that holds sequence, and two tables called helix and sheet that look a lot like our feature annotation tables - in fact many of the principles to store this strutcure data are similar to how we constructed our protein database. Let's pull out a few values to confirm how selection and subsetting works here:
# selection by atom ...
i <- 5
apses$atom[i,]
apses$atom[i, c("x", "y", "z")] # here we are selecting with column names!
apses$xyz[c(i*3-2, i*3-1, i*3)] # here we are selcting with row numbers
# all atoms of a residue ...
i <- "48" #string!
apses$atom[apses$atom[,"resno"] == i, ]
# sequence of the first ten residues
apses$seqres[1:10] # the As here identify the chain
# Can we convert this to one letter code?
aa321(apses$seqres[1:10])
# Lets get the implicit sequence:
aa321((apses$atom$resid[apses$calpha])[1:10]) # Do you understand this code?
# Do explicit and implicit sequence have the same length?
length(apses$seqres)
length(apses$atom$resid[apses$calpha])
# Are the sequences the same?
sum(apses$seqres == apses$atom$resid[apses$calpha])
# get a list of all CA atoms of arginine residues
sel <- apses$atom$resid == "ARG" & apses$atom$elety == "CA"
apses$atom[sel, c("eleno", "elety", "resid", "chain", "resno", "insert")]
# The introduction to bio3d tutorial at
# http://thegrantlab.org/bio3d/tutorials/structure-analysis
# has the following example:
plot.bio3d(apses$atom$b[apses$calpha], sse=apses, typ="l", ylab="B-factor")
# Good for now. Let's do some real work.
# ==============================================================================
# PART TWO: A Ramachandran plot
# ==============================================================================
# Calculate a Ramachandran plot for the structure
tor <- torsion.pdb(apses)
plot(tor$phi, tor$psi)
# As you can see, there are a number of points in the upper-right
# quadrant of the plot. This combination of phi-psi angles defines
# the conformation of a left-handed alpha helix and is generally
# only observed for glycine residues. Let's replot the data, but
# color the points for glycine residues differently. First, we
# get a vector of glycine residue indices in the structure:
sSeq <- pdbseq(apses)
# Explore the result object and extract the indices of GLY resiues.
sSeq == "G"
which(sSeq == "G")
as.numeric(which(sSeq == "G"))
iGly <- as.numeric(which(sSeq == "G"))
# Now plot all non-gly residues.
# Remember: negative indices exclude items from a vector
plot(tor$phi[-iGly], tor$psi[-iGly], xlim=c(-180,180), ylim=c(-180,180))
# Now plot GLY only, but with green dots:
points(tor$phi[iGly], tor$psi[iGly], pch=21, cex=0.9, bg="#00CC00")
# As you see, four residues in the upper-right quadrant are
# not glycine. But what residues are these? Is there an
# error in our script? Let's get their coordinate records:
iOutliers <- which(tor$phi > 30 & tor$phi < 90 &
tor$psi > 0 & tor$psi < 90)
CA <- apses$atom[apses$calpha, c("eleno", "elety", "resid", "chain", "resno")]
dat <- cbind(CA[iOutliers, ], phi = tor$phi[iOutliers], psi = tor$psi[iOutliers])
dat
# remove the glycine from our table ...
dat <- dat[dat$resid != "GLY", ]
dat
# let's add the residue numbers to the plot with the text function:
for (i in 1:nrow(dat)) {
points(dat$phi[i], dat$psi[i], pch=21, cex=0.9, bg="#CC0000")
text(dat$phi[i],
dat$psi[i],
labels = sprintf("%s%d", aa321(dat$resid[i]), dat$resno[i]),
pos = 4,
offset = 0.4,
cex = 0.7)
}
# You can check the residues in Chimera. Is there anything unusual?
# Are there any cis-peptide bonds in the structure?
tor$omega
#... gives us a quick answer. But wait - what values do we
# expect? And why are the values so different?
# Consider this plot: what am I doing here and why?
x <- tor$omega[tor$omega > 0]
x <- c(x, 360 + tor$omega[tor$omega < 0])
hist(x, xlim=c(90,270))
abline(v=180, col="red")
# ==============================================================================
# PART THREE: H-bond lengths
# ==============================================================================
# Let's do something a little less trivial and compare
# backbone H-bond lengths between helices and strands.
#
# Secondary structure is defined in the list components ...$helix
# and ...$strand.
# We need to
# - collect all residue indices for alpha helices resp.
# beta strands;
# - fetch the atom coordinates;
# - calculate all N, O distances using dist.xyz();
# - filter them for distances that indicate H-bonds; and,
# - plot the results.
# Secondary structure can either be obtained from
# definitions contained in many PDB files, or by running
# the DSSP algorithm IF(!) you have it installed on your
# machine. The 1BM8 file contains definitions
apses$helix
apses$sheet
# (1): collect the residue numbers
# between the segment boundaries.
H <- numeric() # This will contain the helix residue numbers
for (i in 1:length(apses$helix)) {
H <- c(H, apses$helix$start[i]:apses$helix$end[i])
}
# Doing the same for the sheet residue numbers requires
# very similar code. Rather than retype the code, it is
# better to write a function that can do both.
getSecondary <- function(sec) {
iRes <- c()
for (i in 1:length(sec$start)) {
iRes <- c(iRes, sec$start[i]:sec$end[i])
}
return(iRes)
}
# Compare ...
H
getSecondary(apses$helix)
# ... and use for strands
E <- getSecondary(apses$sheet)
# Now here's a problem: these numbers refer to the
# residue numbers as defined in the atom records. They
# can't be used directly to address e.g. the first, second
# third residue etc. since the first residue has the
# residue number 4...
apses$atom[1,]
# Therefore we need to
# 1: convert the numbers to strings;
# 2: subset the atom table for rows contain these strings.
#
# Essentially, we don't treat the "residue numbers" as numbers,
# but as labels. That's fine, as long as they are unique.
# (2): fetch coordinates of N and O atoms
# for residues in alpha- and beta- conformation.
# For one residue, the procedure goes as follows:
res <- H[17] # pick an arbitrary alpha-helix residue to illustrate
res
res <- as.character(res)
res
# all atom rows for this residue
apses$atom[apses$atom[,"resno"] == res, ]
# add condition: row with "N" atom only
apses$atom[apses$atom[,"resno"] == res &
apses$atom[,"elety"] == "N", ]
# add column selection: "x", "y", "z"
apses$atom[apses$atom[,"resno"] == res &
apses$atom[,"elety"] == "N",
c("x", "y", "z")]
# convert to numbers
as.numeric (
apses$atom[apses$atom[,"resno"] == res &
apses$atom[,"elety"] == "N",
c("x", "y", "z")]
)
# Now we need to add this into a matrix as we iterate over the desired residues.
# We need to execute the procedure four times for alpha and beta Ns and Os
# respectively. Rather than duplicate the code four times over, we write a
# function. Never duplicate code, because if you need to make changes it is too
# easy to forget making the change consistently in all copies.
getAtom <- function(PDB, r, AT) {
# Function to iterate over residue number strings and
# return a matrix of x, y, z triplets for each atom
# of a requested type.
mat <- c() #initialize as empty matrix
for (i in 1:length(r)) {
res <- as.character(r[i])
v <- as.numeric (
PDB$atom[PDB$atom[,"resno"] == res &
PDB$atom[,"elety"] == AT,
c("x", "y", "z")]
)
mat <- rbind(mat, v)
}
return(mat)
}
# Now run the functions with the parameters we need...
H.xyz.N <- getAtom(apses, H, "N") # backbone N atoms in helix
H.xyz.O <- getAtom(apses, H, "O") # backbone O atoms in helix
E.xyz.N <- getAtom(apses, E, "N") # backbone N atoms in strand
E.xyz.O <- getAtom(apses, E, "O") # backbone O atoms in strand
# (3): calculate distances between N and O atoms to find H-bonds.
# We spent most of our effort so far just preparing our raw data for analysis.
# Now we can actually start measuring. bio3d provides the function dist.xyz() -
# but lets agree it builds character to code this ourselves.
# Consider the distance of the first (N,O) pair.
H.xyz.N[1,]
H.xyz.O[1,]
a <- H.xyz.N[1,]
b <- H.xyz.O[1,]
dist.xyz(a, b)
# or ...
sqrt( (a[1]-b[1])^2 + (a[2]-b[2])^2 + (a[3]-b[3])^2 )
# ... i.e. pythagoras theorem in 3D.
# Calculating all pairwise distances from a matrix of
# xyz coordinates sounds like a useful function.
PairDist.xyz <- function(xyzA, xyzB) {
PD <- c()
for (i in 1:nrow(xyzA)) {
for (j in 1:nrow(xyzB)) {
PD <- c(PD, dist.xyz(xyzA[i,], xyzB[j,]))
}
}
return(PD)
}
# And see what we get:
D <- PairDist.xyz(H.xyz.N, H.xyz.O)
hist(D)
# let's zoom in on the shorter distances, in which we expect
# hydrogen bonds:
hist(D[D < 4.0], breaks=seq(2.0, 4.0, 0.1), xlim=c(2.0,4.0))
# There is a large peak at about 2.2Å, and another
# large peak above 3.5Å. But these are not typical hydrogen
# bond distances! Rather these are (N,O) pairs in peptide
# bonds, and within residues. That's not good, it will
# cause all sorts of problems with statistics later.
# We should exclude all distances between N of a residue
# and O of a preceeding residue, and all (N,O) pairs in the
# same residue. But for this, we need to store atom type
# and residue information with the coordinates. Our code
# will get a bit bulkier. It's often hard to find a good
# balance between terse generic code, and code that
# handles special cases.
# We could do two things:
# - add a column with residue information to the coordinates
# - add a column with atom type information
# ... but actually we also would need chain information, and
# then we really have almost everything that is contained in the record
# in the first place.
# This suggests a different strategy: let's rewrite our function
# getAtom() to return indices, not coordinates, and use the indices
# to extract coordinates, and THEN we can add all sorts of
# additional constraints.
# Here we set up the function with a default chain argument
getAtomIndex <- function(PDB, V_res, elety, chain="A") {
# Function to access a bio3d pdb object, iterate over
# a vector of residues and return a vector of indices
# to matching atom elements. Nb. bio3d handles insert
# and alt fields incorrectly: their values should not
# be NA but " ", i.e. a single blank. Therefore this
# function does not test for "alt" and "insert".
v <- c() #initialize as empty vector
for (i in 1:length(V_res)) {
res <- as.character(V_res[i])
x <- which(PDB$atom[,"resno"] == res &
PDB$atom[,"chain"] == chain &
PDB$atom[,"elety"] == elety)
v <- c(v, x)
}
return(v)
}
# test this ...
getAtomIndex(apses, H, "N")
getAtomIndex(apses, H, "O")
# That looks correct: O atoms should be stored three
# rows further down: the sequence of atoms in a PDB
# file is usually N, CA, C, O ... followed by the side
# chain coordinates.
# Now to extract the coordinates and calculate distances.
# Our function needs to take the PDB object and
# two vectors of atom indices as argument, and return
# a vector of pair-distances.
PairDist <- function(PDB, a, b) {
PD <- c()
for (i in 1:length(a)) {
p <- as.numeric(PDB$atom[a[i], c("x", "y", "z")])
for (j in 1:length(b)) {
q <- as.numeric(PDB$atom[b[j], c("x", "y", "z")])
PD <- c(PD, dist.xyz(p, q))
}
}
return(PD)
}
# Let's see if this looks correct:
H.N <- getAtomIndex(apses, H, "N")
H.O <- getAtomIndex(apses, H, "O")
X <- PairDist(apses, H.N, H.O)
hist(X[X < 4.0], breaks=seq(2.0, 4.0, 0.1), xlim=c(2.0,4.0))
# Now we are back where we started out from, but with
# a different logic of the function that we can easily
# modify to exclude (N_i, O_i-1) distances (peptide bond),
# and (N_i, O_i) distances (within residue).
HB <- function(PDB, a, b) {
HBcutoff <- 4.0
PD <- c()
for (i in 1:length(a)) {
p <- as.numeric(PDB$atom[a[i], c("x", "y", "z")])
res_i <- as.numeric(PDB$atom[a[i], "resno"])
for (j in 1:length(b)) {
q <- as.numeric(PDB$atom[b[j], c("x", "y", "z")])
res_j <- as.numeric(PDB$atom[a[j], "resno"])
if (res_i != res_j+1 &
res_i != res_j ) {
d <- dist.xyz(p, q)
if (d < HBcutoff) {
PD <- c(PD, d)
}
}
}
}
return(PD)
}
# test this:
X <- HB(apses, H.N, H.O)
hist(X)
# ... and this looks much more like the distribution we are
# seeking.
# Why did we go along this detour for coding the
# function? The point is that there are usually
# several ways to use or define datastructures and
# functions. Which one is the best way may not be
# obvious until you understand the problem better.
# At first, we wrote a very generic function that
# extracts atom coordinates to be able to compute
# with them. This is simple and elegant. But we
# recognized limitations in that we could not
# make more sophisticated selections that we needed
# to reflect our biological idea of hydrogen
# bonds. Thus we changed our datastructure
# and functions to accomodate our new requirements
# better. You have to be flexible and able to look
# at a task from different angles to succeed.
# Finally we can calculate alpha- and beta- structure
# bonds and compare them. In this section we'll explore
# different options for histogram plots.
H.N <- getAtomIndex(apses, H, "N")
H.O <- getAtomIndex(apses, H, "O")
dH <- HB(apses, H.N, H.O)
E.N <- getAtomIndex(apses, E, "N")
E.O <- getAtomIndex(apses, E, "O")
dE <- HB(apses, E.N, E.O)
# The plain histogram functions without parameters
# give us white stacks.
hist(dH)
# and ...
hist(dE)
# We can see that the histrograms look different
# but that is better visualized by showing two plots
# in the same window. We use the par() function, for
# more flexible layout, look up the layout() function.
?par
?layout
opar <- par(no.readonly=TRUE) # store current state
par(mfrow=c(2,1)) # set graphics parameters: 2 rows, one column
# plot two histograms
hist(dH)
hist(dE)
# add color:
hist(dH, col="#DD0055")
hist(dE, col="#00AA70")
# For better comparison, plot both in the
# same window:
hist(dH, col="#DD0055")
hist(dE, col="#00AA70", add=TRUE)
# ... oops, we dind't reset the graphics parameters.
# You can either close the window, a new window
# will open with default parameters, or ...
par(opar) # ... reset the graphics parameters
hist(dH, col="#DD0055")
hist(dE, col="#00AA70", add=TRUE)
# We see that the leftmost column of the sheet bonds
# overlaps the helix bonds. Not good. But we
# can make the colors transparent! We just need to
# add a fourth set of two hexadecimal-numbers to
# the #RRGGBB triplet. Lets use 2/3 transparent,
# in hexadecimal, 1/3 of 256 is x55 - i.e. an
# RGB triplet specied as #RRGGBB55 is only 33%
# opaque:
hist(dH, col="#DD005555")
hist(dE, col="#00AA7055", add=TRUE)
# To finalize the plots, let's do two more things:
# Explicitly define the breaks, to make sure they
# match up - otherwise they would not need to...
# see for example:
hist(dH, col="#DD005555")
hist(dE[dE < 3], col="#00AA7055", add=TRUE)
# Breaks are a parameter in hist() that can
# either be a scalar, to define how many columns
# you want, or a vector, that defines the actual
# breakpoints.
brk=seq(2.4, 4.0, 0.1)
hist(dH, col="#DD005555", breaks=brk)
hist(dE, col="#00AA7055", breaks=brk, add=TRUE)
# The last thing to do is to think about rescaling the plot.
# You notice that the y-axis is scaled in absolute frequency.
# That gives us some impression of the relative frequency,
# but it is of course skewed by observing relatively more
# or less of one type of secondary structure in a protein.
# As part of the hist() function we can rescale the values so
# that the sum over all is one: set the prameter freq=FALSE.
hist(dH, col="#DD005555", breaks=brk, freq=FALSE)
hist(dE, col="#00AA7055", breaks=brk, freq=FALSE, add=TRUE)
# Adding labels and legend ...
hH <- hist(dH,
freq=FALSE,
breaks=brk,
col="#DD005550",
xlab="(N,O) distance (Å)",
ylab="Density",
ylim=c(0,4),
main="Helix and Sheet H-bond lengths")
hE <- hist(dE,
freq=FALSE,
breaks=brk,
col="#00AA7060",
add=TRUE)
legend("topright",
c(sprintf("alpha (N = %3d)", sum(hH$counts)),
sprintf("beta (N = %3d)", sum(hE$counts))),
fill = c("#DD005550", "#00AA7060"), bty = 'n',
border = NA)
# ===========================================================
# With all the functions we have defined,
# it is easy to try this with a larger protein.
# 3ugj for example is VERY large. The calculation will take a few
# minutes:
pdb <- read.pdb("3ugj")
H <- getSecondary(pdb$helix)
E <- getSecondary(pdb$sheet)
H.N <- getAtomIndex(pdb, H, "N")
H.O <- getAtomIndex(pdb, H, "O")
dH <- HB(pdb, H.N, H.O)
E.N <- getAtomIndex(pdb, E, "N")
E.O <- getAtomIndex(pdb, E, "O")
dE <- HB(pdb, E.N, E.O)
brk=seq(2.4, 4.0, 0.1)
hH <- hist(dH,
freq=FALSE,
breaks=brk,
col="#DD005550",
xlab="(N,O) distance (Å)",
ylab="Density",
ylim=c(0,4),
main="Helix and Sheet H-bond lengths")
hE <- hist(dE,
freq=FALSE,
breaks=brk,
col="#00AA7060",
add=TRUE)
legend('topright',
c(paste("alpha (N = ", sum(hH$counts), ")"),
paste("beta (N = ", sum(hE$counts), ")")),
fill = c("#DD005550", "#00AA7060"), bty = 'n',
border = NA,
inset = 0.1)
# It looks more and more that the distribution is
# indeed different. Our sample is large, but derives
# from a single protein.
# To do database scale statistics, we should look
# at many more proteins. To give you a sense of how,
# let's do this for just ten proteins, taken from
# the architecture level of the CATH database for
# mixed alpha-beta proteins (see:
# http://www.cathdb.info/browse/browse_hierarchy_tree):
PDBarchitectures <- c("3A4R", "A")
names(PDBarchitectures) <- c("ID", "chain")
PDBarchitectures <- rbind(PDBarchitectures, c("1EWF","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("2VXN","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("1I3K","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("1C0P","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("3QVP","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("1J5U","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("2IMH","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("3NVS","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("1UD9","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("1XKN","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("1OZN","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("2DKJ","A"))
dH <- c()
dE <- c()
for (i in 1:nrow(PDBarchitectures)) {
pdb <- read.pdb(PDBarchitectures[i,1])
chain <- PDBarchitectures[i,2]
H <- getSecondary(pdb$helix)
H.N <- getAtomIndex(pdb, H, "N", chain)
H.O <- getAtomIndex(pdb, H, "O", chain)
dH <- c(dH, HB(pdb, H.N, H.O))
E <- getSecondary(pdb$sheet)
E.N <- getAtomIndex(pdb, E, "N", chain)
E.O <- getAtomIndex(pdb, E, "O", chain)
dE <- c(dE, HB(pdb, E.N, E.O))
}
brk=seq(2.0, 4.0, 0.1)
hH <- hist(dH,
freq=FALSE,
breaks=brk,
col="#DD005550",
xlab="(N,O) distance (Å)",
ylab="Density",
ylim=c(0,4),
main="Helix and Sheet H-bond lengths")
hE <- hist(dE,
freq=FALSE,
breaks=brk,
col="#00AA7060",
add=TRUE)
legend('topright',
c(paste("alpha (N = ", sum(hH$counts), ")"),
paste("beta (N = ", sum(hE$counts), ")")),
fill = c("#DD005550", "#00AA7060"), bty = 'n',
border = NA,
inset = 0.1)
# Why do you think these distributions are different?
# At what distance do you think H-bonds have the lowest energy?
# = 1 Tasks
# [END]