# RPR-SX-PDB.R # # Purpose: A Bioinformatics Course: # R code accompanying the RPR-SX-PDB unit. # # Version: 0.1 # # Date: 2017 08 28 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: # 0.1 First code copied from 2016 material. # # TODO: # # # == DO NOT SIMPLY source() THIS FILE! ======================================= # If there are portions you don't understand, use R's help system, Google for an # answer, or ask your instructor. Don't continue if you don't understand what's # going on. That's not how it works ... # ============================================================================== # = 1 ___Section___ # In this example of protein structure interpretation, we ... # - load the library "bio3D" which supports work with # protein structure files, # - explore some elementary functions of the library # - assemble a script to see whether H-bond lengths systematically differ # between alpha-helical and beta-sheet structures. if(!require(bio3d)) { install.packages("bio3d", dependencies=TRUE) library(bio3d) } lbio3d() # ... lists the newly installed functions, # they all have help files associated. # More information is available in the so-called # "vignettes" that are distributed with most R packages: vignette("bio3d_vignettes") # bio3d can load molecules directly from the PDB servers, you don't _have_ to # store them locally, but you could. # # But before you _load_ a file, have a look what such a file contains. I have packaged the 1BM8 file into the project: file.show("./assets/1BM8.pdb") # Have a look at the header section, the atom records, the coordinate records # etc. Answer the following questions: # # What is the resolution of the structure? # Is the first residue in the SEQRES section also the first residue # with an ATOM record? # How many water molecules does the structure contain? # Can you explain REMARK 525 regarding HOH 459 and HOH 473? # Are all atoms of the N-terminal residue present? # Are all atoms of the C-terminal residue present? apses <- read.pdb("1bm8") # load a molecule directly from PDB # check what we have: apses # what is this actually? str(apses) # bio3d's pdb objects are simple lists. Great! You know lists! # You see that there is a table called atom in which the elements are vectors of the same length - namely the number of atoms in the structure file. And there is a matrix of (x, y, z) triplets called xyz. And there is a vector that holds sequence, and two tables called helix and sheet that look a lot like our feature annotation tables - in fact many of the principles to store this strutcure data are similar to how we constructed our protein database. Let's pull out a few values to confirm how selection and subsetting works here: # selection by atom ... i <- 5 apses$atom[i,] apses$atom[i, c("x", "y", "z")] # here we are selecting with column names! apses$xyz[c(i*3-2, i*3-1, i*3)] # here we are selcting with row numbers # all atoms of a residue ... i <- "48" #string! apses$atom[apses$atom[,"resno"] == i, ] # sequence of the first ten residues apses$seqres[1:10] # the As here identify the chain # Can we convert this to one letter code? aa321(apses$seqres[1:10]) # Lets get the implicit sequence: aa321((apses$atom$resid[apses$calpha])[1:10]) # Do you understand this code? # Do explicit and implicit sequence have the same length? length(apses$seqres) length(apses$atom$resid[apses$calpha]) # Are the sequences the same? sum(apses$seqres == apses$atom$resid[apses$calpha]) # get a list of all CA atoms of arginine residues sel <- apses$atom$resid == "ARG" & apses$atom$elety == "CA" apses$atom[sel, c("eleno", "elety", "resid", "chain", "resno", "insert")] # The introduction to bio3d tutorial at # http://thegrantlab.org/bio3d/tutorials/structure-analysis # has the following example: plot.bio3d(apses$atom$b[apses$calpha], sse=apses, typ="l", ylab="B-factor") # Good for now. Let's do some real work. # ============================================================================== # PART TWO: A Ramachandran plot # ============================================================================== # Calculate a Ramachandran plot for the structure tor <- torsion.pdb(apses) plot(tor$phi, tor$psi) # As you can see, there are a number of points in the upper-right # quadrant of the plot. This combination of phi-psi angles defines # the conformation of a left-handed alpha helix and is generally # only observed for glycine residues. Let's replot the data, but # color the points for glycine residues differently. First, we # get a vector of glycine residue indices in the structure: sSeq <- pdbseq(apses) # Explore the result object and extract the indices of GLY resiues. sSeq == "G" which(sSeq == "G") as.numeric(which(sSeq == "G")) iGly <- as.numeric(which(sSeq == "G")) # Now plot all non-gly residues. # Remember: negative indices exclude items from a vector plot(tor$phi[-iGly], tor$psi[-iGly], xlim=c(-180,180), ylim=c(-180,180)) # Now plot GLY only, but with green dots: points(tor$phi[iGly], tor$psi[iGly], pch=21, cex=0.9, bg="#00CC00") # As you see, four residues in the upper-right quadrant are # not glycine. But what residues are these? Is there an # error in our script? Let's get their coordinate records: iOutliers <- which(tor$phi > 30 & tor$phi < 90 & tor$psi > 0 & tor$psi < 90) CA <- apses$atom[apses$calpha, c("eleno", "elety", "resid", "chain", "resno")] dat <- cbind(CA[iOutliers, ], phi = tor$phi[iOutliers], psi = tor$psi[iOutliers]) dat # remove the glycine from our table ... dat <- dat[dat$resid != "GLY", ] dat # let's add the residue numbers to the plot with the text function: for (i in 1:nrow(dat)) { points(dat$phi[i], dat$psi[i], pch=21, cex=0.9, bg="#CC0000") text(dat$phi[i], dat$psi[i], labels = sprintf("%s%d", aa321(dat$resid[i]), dat$resno[i]), pos = 4, offset = 0.4, cex = 0.7) } # You can check the residues in Chimera. Is there anything unusual? # Are there any cis-peptide bonds in the structure? tor$omega #... gives us a quick answer. But wait - what values do we # expect? And why are the values so different? # Consider this plot: what am I doing here and why? x <- tor$omega[tor$omega > 0] x <- c(x, 360 + tor$omega[tor$omega < 0]) hist(x, xlim=c(90,270)) abline(v=180, col="red") # ============================================================================== # PART THREE: H-bond lengths # ============================================================================== # Let's do something a little less trivial and compare # backbone H-bond lengths between helices and strands. # # Secondary structure is defined in the list components ...$helix # and ...$strand. # We need to # - collect all residue indices for alpha helices resp. # beta strands; # - fetch the atom coordinates; # - calculate all N, O distances using dist.xyz(); # - filter them for distances that indicate H-bonds; and, # - plot the results. # Secondary structure can either be obtained from # definitions contained in many PDB files, or by running # the DSSP algorithm IF(!) you have it installed on your # machine. The 1BM8 file contains definitions apses$helix apses$sheet # (1): collect the residue numbers # between the segment boundaries. H <- numeric() # This will contain the helix residue numbers for (i in 1:length(apses$helix)) { H <- c(H, apses$helix$start[i]:apses$helix$end[i]) } # Doing the same for the sheet residue numbers requires # very similar code. Rather than retype the code, it is # better to write a function that can do both. getSecondary <- function(sec) { iRes <- c() for (i in 1:length(sec$start)) { iRes <- c(iRes, sec$start[i]:sec$end[i]) } return(iRes) } # Compare ... H getSecondary(apses$helix) # ... and use for strands E <- getSecondary(apses$sheet) # Now here's a problem: these numbers refer to the # residue numbers as defined in the atom records. They # can't be used directly to address e.g. the first, second # third residue etc. since the first residue has the # residue number 4... apses$atom[1,] # Therefore we need to # 1: convert the numbers to strings; # 2: subset the atom table for rows contain these strings. # # Essentially, we don't treat the "residue numbers" as numbers, # but as labels. That's fine, as long as they are unique. # (2): fetch coordinates of N and O atoms # for residues in alpha- and beta- conformation. # For one residue, the procedure goes as follows: res <- H[17] # pick an arbitrary alpha-helix residue to illustrate res res <- as.character(res) res # all atom rows for this residue apses$atom[apses$atom[,"resno"] == res, ] # add condition: row with "N" atom only apses$atom[apses$atom[,"resno"] == res & apses$atom[,"elety"] == "N", ] # add column selection: "x", "y", "z" apses$atom[apses$atom[,"resno"] == res & apses$atom[,"elety"] == "N", c("x", "y", "z")] # convert to numbers as.numeric ( apses$atom[apses$atom[,"resno"] == res & apses$atom[,"elety"] == "N", c("x", "y", "z")] ) # Now we need to add this into a matrix as we iterate over the desired residues. # We need to execute the procedure four times for alpha and beta Ns and Os # respectively. Rather than duplicate the code four times over, we write a # function. Never duplicate code, because if you need to make changes it is too # easy to forget making the change consistently in all copies. getAtom <- function(PDB, r, AT) { # Function to iterate over residue number strings and # return a matrix of x, y, z triplets for each atom # of a requested type. mat <- c() #initialize as empty matrix for (i in 1:length(r)) { res <- as.character(r[i]) v <- as.numeric ( PDB$atom[PDB$atom[,"resno"] == res & PDB$atom[,"elety"] == AT, c("x", "y", "z")] ) mat <- rbind(mat, v) } return(mat) } # Now run the functions with the parameters we need... H.xyz.N <- getAtom(apses, H, "N") # backbone N atoms in helix H.xyz.O <- getAtom(apses, H, "O") # backbone O atoms in helix E.xyz.N <- getAtom(apses, E, "N") # backbone N atoms in strand E.xyz.O <- getAtom(apses, E, "O") # backbone O atoms in strand # (3): calculate distances between N and O atoms to find H-bonds. # We spent most of our effort so far just preparing our raw data for analysis. # Now we can actually start measuring. bio3d provides the function dist.xyz() - # but lets agree it builds character to code this ourselves. # Consider the distance of the first (N,O) pair. H.xyz.N[1,] H.xyz.O[1,] a <- H.xyz.N[1,] b <- H.xyz.O[1,] dist.xyz(a, b) # or ... sqrt( (a[1]-b[1])^2 + (a[2]-b[2])^2 + (a[3]-b[3])^2 ) # ... i.e. pythagoras theorem in 3D. # Calculating all pairwise distances from a matrix of # xyz coordinates sounds like a useful function. PairDist.xyz <- function(xyzA, xyzB) { PD <- c() for (i in 1:nrow(xyzA)) { for (j in 1:nrow(xyzB)) { PD <- c(PD, dist.xyz(xyzA[i,], xyzB[j,])) } } return(PD) } # And see what we get: D <- PairDist.xyz(H.xyz.N, H.xyz.O) hist(D) # let's zoom in on the shorter distances, in which we expect # hydrogen bonds: hist(D[D < 4.0], breaks=seq(2.0, 4.0, 0.1), xlim=c(2.0,4.0)) # There is a large peak at about 2.2Å, and another # large peak above 3.5Å. But these are not typical hydrogen # bond distances! Rather these are (N,O) pairs in peptide # bonds, and within residues. That's not good, it will # cause all sorts of problems with statistics later. # We should exclude all distances between N of a residue # and O of a preceeding residue, and all (N,O) pairs in the # same residue. But for this, we need to store atom type # and residue information with the coordinates. Our code # will get a bit bulkier. It's often hard to find a good # balance between terse generic code, and code that # handles special cases. # We could do two things: # - add a column with residue information to the coordinates # - add a column with atom type information # ... but actually we also would need chain information, and # then we really have almost everything that is contained in the record # in the first place. # This suggests a different strategy: let's rewrite our function # getAtom() to return indices, not coordinates, and use the indices # to extract coordinates, and THEN we can add all sorts of # additional constraints. # Here we set up the function with a default chain argument getAtomIndex <- function(PDB, V_res, elety, chain="A") { # Function to access a bio3d pdb object, iterate over # a vector of residues and return a vector of indices # to matching atom elements. Nb. bio3d handles insert # and alt fields incorrectly: their values should not # be NA but " ", i.e. a single blank. Therefore this # function does not test for "alt" and "insert". v <- c() #initialize as empty vector for (i in 1:length(V_res)) { res <- as.character(V_res[i]) x <- which(PDB$atom[,"resno"] == res & PDB$atom[,"chain"] == chain & PDB$atom[,"elety"] == elety) v <- c(v, x) } return(v) } # test this ... getAtomIndex(apses, H, "N") getAtomIndex(apses, H, "O") # That looks correct: O atoms should be stored three # rows further down: the sequence of atoms in a PDB # file is usually N, CA, C, O ... followed by the side # chain coordinates. # Now to extract the coordinates and calculate distances. # Our function needs to take the PDB object and # two vectors of atom indices as argument, and return # a vector of pair-distances. PairDist <- function(PDB, a, b) { PD <- c() for (i in 1:length(a)) { p <- as.numeric(PDB$atom[a[i], c("x", "y", "z")]) for (j in 1:length(b)) { q <- as.numeric(PDB$atom[b[j], c("x", "y", "z")]) PD <- c(PD, dist.xyz(p, q)) } } return(PD) } # Let's see if this looks correct: H.N <- getAtomIndex(apses, H, "N") H.O <- getAtomIndex(apses, H, "O") X <- PairDist(apses, H.N, H.O) hist(X[X < 4.0], breaks=seq(2.0, 4.0, 0.1), xlim=c(2.0,4.0)) # Now we are back where we started out from, but with # a different logic of the function that we can easily # modify to exclude (N_i, O_i-1) distances (peptide bond), # and (N_i, O_i) distances (within residue). HB <- function(PDB, a, b) { HBcutoff <- 4.0 PD <- c() for (i in 1:length(a)) { p <- as.numeric(PDB$atom[a[i], c("x", "y", "z")]) res_i <- as.numeric(PDB$atom[a[i], "resno"]) for (j in 1:length(b)) { q <- as.numeric(PDB$atom[b[j], c("x", "y", "z")]) res_j <- as.numeric(PDB$atom[a[j], "resno"]) if (res_i != res_j+1 & res_i != res_j ) { d <- dist.xyz(p, q) if (d < HBcutoff) { PD <- c(PD, d) } } } } return(PD) } # test this: X <- HB(apses, H.N, H.O) hist(X) # ... and this looks much more like the distribution we are # seeking. # Why did we go along this detour for coding the # function? The point is that there are usually # several ways to use or define datastructures and # functions. Which one is the best way may not be # obvious until you understand the problem better. # At first, we wrote a very generic function that # extracts atom coordinates to be able to compute # with them. This is simple and elegant. But we # recognized limitations in that we could not # make more sophisticated selections that we needed # to reflect our biological idea of hydrogen # bonds. Thus we changed our datastructure # and functions to accomodate our new requirements # better. You have to be flexible and able to look # at a task from different angles to succeed. # Finally we can calculate alpha- and beta- structure # bonds and compare them. In this section we'll explore # different options for histogram plots. H.N <- getAtomIndex(apses, H, "N") H.O <- getAtomIndex(apses, H, "O") dH <- HB(apses, H.N, H.O) E.N <- getAtomIndex(apses, E, "N") E.O <- getAtomIndex(apses, E, "O") dE <- HB(apses, E.N, E.O) # The plain histogram functions without parameters # give us white stacks. hist(dH) # and ... hist(dE) # We can see that the histrograms look different # but that is better visualized by showing two plots # in the same window. We use the par() function, for # more flexible layout, look up the layout() function. ?par ?layout opar <- par(no.readonly=TRUE) # store current state par(mfrow=c(2,1)) # set graphics parameters: 2 rows, one column # plot two histograms hist(dH) hist(dE) # add color: hist(dH, col="#DD0055") hist(dE, col="#00AA70") # For better comparison, plot both in the # same window: hist(dH, col="#DD0055") hist(dE, col="#00AA70", add=TRUE) # ... oops, we dind't reset the graphics parameters. # You can either close the window, a new window # will open with default parameters, or ... par(opar) # ... reset the graphics parameters hist(dH, col="#DD0055") hist(dE, col="#00AA70", add=TRUE) # We see that the leftmost column of the sheet bonds # overlaps the helix bonds. Not good. But we # can make the colors transparent! We just need to # add a fourth set of two hexadecimal-numbers to # the #RRGGBB triplet. Lets use 2/3 transparent, # in hexadecimal, 1/3 of 256 is x55 - i.e. an # RGB triplet specied as #RRGGBB55 is only 33% # opaque: hist(dH, col="#DD005555") hist(dE, col="#00AA7055", add=TRUE) # To finalize the plots, let's do two more things: # Explicitly define the breaks, to make sure they # match up - otherwise they would not need to... # see for example: hist(dH, col="#DD005555") hist(dE[dE < 3], col="#00AA7055", add=TRUE) # Breaks are a parameter in hist() that can # either be a scalar, to define how many columns # you want, or a vector, that defines the actual # breakpoints. brk=seq(2.4, 4.0, 0.1) hist(dH, col="#DD005555", breaks=brk) hist(dE, col="#00AA7055", breaks=brk, add=TRUE) # The last thing to do is to think about rescaling the plot. # You notice that the y-axis is scaled in absolute frequency. # That gives us some impression of the relative frequency, # but it is of course skewed by observing relatively more # or less of one type of secondary structure in a protein. # As part of the hist() function we can rescale the values so # that the sum over all is one: set the prameter freq=FALSE. hist(dH, col="#DD005555", breaks=brk, freq=FALSE) hist(dE, col="#00AA7055", breaks=brk, freq=FALSE, add=TRUE) # Adding labels and legend ... hH <- hist(dH, freq=FALSE, breaks=brk, col="#DD005550", xlab="(N,O) distance (Å)", ylab="Density", ylim=c(0,4), main="Helix and Sheet H-bond lengths") hE <- hist(dE, freq=FALSE, breaks=brk, col="#00AA7060", add=TRUE) legend("topright", c(sprintf("alpha (N = %3d)", sum(hH$counts)), sprintf("beta (N = %3d)", sum(hE$counts))), fill = c("#DD005550", "#00AA7060"), bty = 'n', border = NA) # =========================================================== # With all the functions we have defined, # it is easy to try this with a larger protein. # 3ugj for example is VERY large. The calculation will take a few # minutes: pdb <- read.pdb("3ugj") H <- getSecondary(pdb$helix) E <- getSecondary(pdb$sheet) H.N <- getAtomIndex(pdb, H, "N") H.O <- getAtomIndex(pdb, H, "O") dH <- HB(pdb, H.N, H.O) E.N <- getAtomIndex(pdb, E, "N") E.O <- getAtomIndex(pdb, E, "O") dE <- HB(pdb, E.N, E.O) brk=seq(2.4, 4.0, 0.1) hH <- hist(dH, freq=FALSE, breaks=brk, col="#DD005550", xlab="(N,O) distance (Å)", ylab="Density", ylim=c(0,4), main="Helix and Sheet H-bond lengths") hE <- hist(dE, freq=FALSE, breaks=brk, col="#00AA7060", add=TRUE) legend('topright', c(paste("alpha (N = ", sum(hH$counts), ")"), paste("beta (N = ", sum(hE$counts), ")")), fill = c("#DD005550", "#00AA7060"), bty = 'n', border = NA, inset = 0.1) # It looks more and more that the distribution is # indeed different. Our sample is large, but derives # from a single protein. # To do database scale statistics, we should look # at many more proteins. To give you a sense of how, # let's do this for just ten proteins, taken from # the architecture level of the CATH database for # mixed alpha-beta proteins (see: # http://www.cathdb.info/browse/browse_hierarchy_tree): PDBarchitectures <- c("3A4R", "A") names(PDBarchitectures) <- c("ID", "chain") PDBarchitectures <- rbind(PDBarchitectures, c("1EWF","A")) PDBarchitectures <- rbind(PDBarchitectures, c("2VXN","A")) PDBarchitectures <- rbind(PDBarchitectures, c("1I3K","A")) PDBarchitectures <- rbind(PDBarchitectures, c("1C0P","A")) PDBarchitectures <- rbind(PDBarchitectures, c("3QVP","A")) PDBarchitectures <- rbind(PDBarchitectures, c("1J5U","A")) PDBarchitectures <- rbind(PDBarchitectures, c("2IMH","A")) PDBarchitectures <- rbind(PDBarchitectures, c("3NVS","A")) PDBarchitectures <- rbind(PDBarchitectures, c("1UD9","A")) PDBarchitectures <- rbind(PDBarchitectures, c("1XKN","A")) PDBarchitectures <- rbind(PDBarchitectures, c("1OZN","A")) PDBarchitectures <- rbind(PDBarchitectures, c("2DKJ","A")) dH <- c() dE <- c() for (i in 1:nrow(PDBarchitectures)) { pdb <- read.pdb(PDBarchitectures[i,1]) chain <- PDBarchitectures[i,2] H <- getSecondary(pdb$helix) H.N <- getAtomIndex(pdb, H, "N", chain) H.O <- getAtomIndex(pdb, H, "O", chain) dH <- c(dH, HB(pdb, H.N, H.O)) E <- getSecondary(pdb$sheet) E.N <- getAtomIndex(pdb, E, "N", chain) E.O <- getAtomIndex(pdb, E, "O", chain) dE <- c(dE, HB(pdb, E.N, E.O)) } brk=seq(2.0, 4.0, 0.1) hH <- hist(dH, freq=FALSE, breaks=brk, col="#DD005550", xlab="(N,O) distance (Å)", ylab="Density", ylim=c(0,4), main="Helix and Sheet H-bond lengths") hE <- hist(dE, freq=FALSE, breaks=brk, col="#00AA7060", add=TRUE) legend('topright', c(paste("alpha (N = ", sum(hH$counts), ")"), paste("beta (N = ", sum(hE$counts), ")")), fill = c("#DD005550", "#00AA7060"), bty = 'n', border = NA, inset = 0.1) # Why do you think these distributions are different? # At what distance do you think H-bonds have the lowest energy? # = 1 Tasks # [END]