# tocID <- "RPR-SX-PDB.R" # # Purpose: A Bioinformatics Course: # R code accompanying the RPR-SX-PDB unit. # # Version: 1.3 # # Date: 2017-10 - 2020-09 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: # 1.3 2020 Maintenance # 1.2 2019 Maintenance # 1.1 Change from require() to requireNamespace(), # use ::() idiom throughout # 1.0 First live version, completely refactores 2016 code # with remarkable speed gains. Added section on x, y, z # (density) plots. # 0.1 First code copied from 2016 material. # # TODO: # Confirm that SS residue numbers are indices # Set task seed from student number # # == DO NOT SIMPLY source() THIS FILE! ======================================= # # If there are portions you don't understand, use R's help system, Google for an # answer, or ask your instructor. Don't continue if you don't understand what's # going on. That's not how it works ... # # ============================================================================== #TOC> ========================================================================== #TOC> #TOC> Section Title Line #TOC> ---------------------------------------------------------- #TOC> 1 Introduction to the bio3D package 63 #TOC> 2 A Ramachandran plot 155 #TOC> 3 Density plots 231 #TOC> 3.1 Density-based colours 245 #TOC> 3.2 Plotting with smoothScatter() 264 #TOC> 3.3 Plotting hexbins 279 #TOC> 3.4 Plotting density contours 307 #TOC> 3.4.1 ... as overlay on a coloured grid 340 #TOC> 3.4.2 ... as filled countour 357 #TOC> 3.4.3 ... as a perspective plot 388 #TOC> 4 cis-peptide bonds 406 #TOC> 5 H-bond lengths 421 #TOC> #TOC> ========================================================================== # In this example of protein structure interpretation, we ... # - download the package bio3D:: which supports work with # protein structure files, # - explore some elementary functions of the package # - explore plotting of density values with scatterplots # - assemble a script to see whether H-bond lengths systematically differ # between alpha-helical and beta-sheet structures. # = 1 Introduction to the bio3D package =================================== if (! requireNamespace("bio3d", quietly = TRUE)) { install.packages("bio3d") } # Package information: # library(help = bio3d) # basic information # browseVignettes("bio3d") # available vignettes # data(package = "bio3d") # available datasets # bio3d:: can load molecules directly from the PDB servers, you don't _have_ to # store them locally, but you could. # # But before you _load_ a file, have a look what such a file contains. I have # packaged the 1BM8 file into the project: file.show("./data/1BM8.pdb") # Have a look at the header section, the atom records, the coordinate records # etc. # # Task: # Answer the following questions: # What is the resolution of the structure? # Is the first residue in the SEQRES section also the first residue # with an ATOM record? # How many water molecules does the structure contain? # Can you explain REMARK 525 regarding HOH 459 and HOH 473? # Are all atoms of the N-terminal residue present? # Are all atoms of the C-terminal residue present? apses <- bio3d::read.pdb("1bm8") # load a molecule directly from the PDB via # the Internet. # check what we have: apses # what is this actually? str(apses) # bio3d's pdb objects are simple lists. Great! You know lists! # You see that there is a list element called $atom which is a data frame in # which the columns are vectors of the same length - namely the number of atoms # in the structure file. And there is a matrix of (x, y, z) triplets called xyz. # And there is a vector that holds sequence, and two tables called helix and # sheet. Let's pull out a few values to confirm how selection and subsetting # works here: # selection by atom ... i <- 5 apses$atom[i,] apses$atom[i, c("x", "y", "z")] # here we are selecting with column names! apses$xyz[c(i*3-2, i*3-1, i*3)] # here we are selecting with row numbers # all atoms of a residue ... i <- 48 apses$atom[apses$atom[,"resno"] == i, ] # sequence of the first ten residues apses$seqres[1:10] # the "A"s here identify chain "A" # Can we convert this to one letter code? bio3d::aa321(apses$seqres[1:10]) # Lets get the implicit sequence: bio3d::aa321((apses$atom$resid[apses$calpha])[1:10]) # Do you understand this code? # Do explicit and implicit sequence have the same length? length(apses$seqres) length(apses$atom$resid[apses$calpha]) # Are the sequences the same? any(apses$seqres != apses$atom$resid[apses$calpha]) # get a list of all CA atoms of arginine residues cols <- c("eleno", "elety", "resid", "chain", "resno", "insert") sel <- apses$atom$resid == "ARG" & apses$atom$elety == "CA" apses$atom[sel, cols] # The introduction to bio3d tutorial at # http://thegrantlab.org/bio3d/tutorials/structure-analysis # has the following example: bio3d::plot.bio3d(apses$atom$b[apses$calpha], sse=apses, typ="l", ylab="B-factor") # Good for now. Let's do some real work. # = 2 A Ramachandran plot ================================================= # Calculate a Ramachandran plot for the structure. The torsion.pdb() function # calculates all dihedral angles for backbone and sidechain bonds, NA where # the bond does not exist in an amino acid. tor <- bio3d::torsion.pdb(apses) plot(tor$phi, tor$psi, xlim = c(-180, 180), ylim = c(-180, 180), main = "Ramachandran plot for 1BM8", xlab = expression(phi), ylab = expression(psi)) abline(h = 0, lwd = 0.5, col = "#00000044") abline(v = 0, lwd = 0.5, col = "#00000044") # As you can see, there are a number of points in the upper-right # quadrant of the plot. This combination of phi-psi angles defines # the conformation of a left-handed alpha helix and is generally # only observed for glycine residues. Let's replot the data, but # colour the points for glycine residues differently. First, we # get a vector of glycine residue indices in the structure: mySeq <- bio3d::pdbseq(apses) # Explore the result object and extract the indices of GLY resiues. mySeq == "G" which(mySeq == "G") iGly <- which(mySeq == "G") # Now plot all non-gly residues. # Remember: negative indices exclude items from a vector plot(tor$phi[-iGly], tor$psi[-iGly], xlim=c(-180,180), ylim=c(-180,180), main = "Ramachandran plot for 1BM8", xlab = expression(phi), ylab = expression(psi)) abline(h = 0, lwd = 0.5, col = "#00000044") abline(v = 0, lwd = 0.5, col = "#00000044") # Now plot GLY only, but with green dots: points(tor$phi[iGly], tor$psi[iGly], pch=21, cex=0.9, bg="#00CC00") # As you see, four residues in the upper-right quadrant are # not glycine. But what residues are these? Is there an # error in our script? Let's get their coordinate records: # subset CA records CA <- apses$atom[apses$calpha, c("eleno", "elety", "resid", "chain", "resno")] # get index of outliers iOutliers <- which(tor$phi > 30 & tor$phi < 90 & tor$psi > 0 & tor$psi < 90) # cbind records together (dat <- cbind(CA[iOutliers, ], phi = tor$phi[iOutliers], psi = tor$psi[iOutliers])) # remove the glycine ... (dat <- dat[dat$resid != "GLY", ]) # let's add the residue numbers to the plot with the text function: for (i in 1:nrow(dat)) { points(dat$phi[i], dat$psi[i], pch=21, cex=0.9, bg="#CC0000") text(dat$phi[i], dat$psi[i], labels = sprintf("%s%d", bio3d::aa321(dat$resid[i]), dat$resno[i]), pos = 4, offset = 0.4, cex = 0.7) } # You can check the residues in Chimera. Is there anything unusual about these # residues? # = 3 Density plots ======================================================= # Such x, y scatter-plots of data that is sampled from a distribution can tell # us a lot about what shapes the distribution. The distribution is governed by # the free energy of the phi-psi landscape in folded proteins, since folded # proteins generally minimize the free energy of their conformations. We observe # empirically, from comparing frequency statistics and mutation experiments, # that this generall follows a Boltzmann distribution, where the free energy # changes we observe in experments that change one conformation into another are # proportional to the log-ratio of the number of times we observe each # observation in the protein structure database (after correcting for # observation bias). The proper way to visualize such 2D landscapes is with # contour plots. # == 3.1 Density-based colours ============================================= # A first approximation to scatterplots that visualize the density of the # underlying distribution is colouring via the densCols() function. ?densCols iNA <- c(which(is.na(tor$phi)), which(is.na(tor$psi))) phi <- tor$phi[-iNA] psi <- tor$psi[-iNA] plot (phi, psi, xlim = c(-180, 180), ylim = c(-180, 180), col=densCols(phi,psi), pch=20, cex=2, main = "Ramachandran plot for 1BM8", xlab = expression(phi), ylab = expression(psi)) abline(h = 0, lwd = 0.5, col = "#00000044") abline(v = 0, lwd = 0.5, col = "#00000044") # == 3.2 Plotting with smoothScatter() ===================================== # A better way, that spreads out the underlying density is smoothScatter() smoothScatter(phi,psi) smoothScatter(phi, psi, xlim = c(-180, 180), ylim = c(-180, 180), col = "#0033BB33", pch = 3, cex = 0.6, main = "Ramachandran plot for 1BM8", xlab = expression(phi), ylab = expression(psi)) abline(h = 0, lwd = 0.5, col = "#00000044") abline(v = 0, lwd = 0.5, col = "#00000044") # == 3.3 Plotting hexbins ================================================== # If we wish to approximate values in a histogram-like fashion, we can use # hexbin() if (! requireNamespace("hexbin", quietly = TRUE)) { install.packages("hexbin") } # Package information: # library(help = hexbin) # basic information # browseVignettes("hexbin") # available vignettes # data(package = "hexbin") # available datasets myColorRamp <- colorRampPalette(c("#EEEEEE", "#3399CC", "#2266DD")) hexbin::gplot.hexbin(hexbin::hexbin(phi, psi, xbins = 10), colramp = myColorRamp, main = "phi-psi Density Bins for 1BM8", xlab = expression(phi), ylab = expression(psi)) # Note: # Had we loaded hexbin with library(hexbin), the plot function would have # been dispatched by the plot() generic, and we could simply have written: # plot(hexbin(phi, psi, xbins = 10), ... # == 3.4 Plotting density contours ========================================= # The best way to handle such data is provided by the function contour(): ?contour # Contour plots are not produced along the haphazardly sampled values of a data # set, but on a regular grid. This means, we need to convert observed values # into estimated densities. Density estimation is an important topic for # exploratory data analysis, base R has the density() function for 1D # distributions. But for 2D data like or phi-psi plots, we need a function from # the MASS package: kde2d() if (! requireNamespace("MASS", quietly = TRUE)) { install.packages("MASS") } # Package information: # library(help = MASS) # basic information # browseVignettes("MASS") # available vignettes # data(package = "MASS") # available datasets ?MASS::kde2d dPhiPsi <-MASS::kde2d(phi, psi, n = 60, lims = c(-180, 180, -180, 180)) str(dPhiPsi) # This is a list, with gridpoints in x and y, and the estimated densities in z. # Generic plot with default parameters contour(dPhiPsi) # === 3.4.1 ... as overlay on a coloured grid image(dPhiPsi, col = myColorRamp(100), main = "Ramachandran plot for 1BM8", xlab = expression(phi), ylab = expression(psi)) contour(dPhiPsi, col = "royalblue", add = TRUE, method = "edge", nlevels = 10, lty = 2) points(phi, psi, col = "#00338866", pch = 3, cex = 0.7) abline(h = 0, lwd = 0.5, col = "#00000044") abline(v = 0, lwd = 0.5, col = "#00000044") # === 3.4.2 ... as filled countour filled.contour(dPhiPsi, xlim = c(-180, 180), ylim = c(-180, 180), nlevels = 10, color.palette = myColorRamp, main = "Ramachandran plot for 1BM8", xlab = expression(phi), ylab = expression(psi)) # Note: we can pass additional plotting and overlay commands to the counter plot # in a block of expressions passed via the plot.axes parameter: filled.contour(dPhiPsi, xlim = c(-180, 180), ylim = c(-180, 180), nlevels = 10, color.palette = myColorRamp, main = "Ramachandran plot for 1BM8", xlab = expression(phi), ylab = expression(psi), plot.axes = { contour(dPhiPsi, col = "#00000044", add = TRUE, method = "edge", nlevels = 10, lty = 2) points(phi, psi, col = "#00338866", pch = 3, cex = 0.7) abline(h = 0, lwd = 0.5, col = "#00000044") abline(v = 0, lwd = 0.5, col = "#00000044") }) # === 3.4.3 ... as a perspective plot persp(dPhiPsi, xlab = "phi", ylab = "psi", zlab = "Density") persp(dPhiPsi, theta = 40, phi = 10, col = "#99AACC", xlab = "phi", ylab = "psi", zlab = "Density") # = 4 cis-peptide bonds =================================================== # Are there any cis-peptide bonds in the structure? tor$omega #... gives us a quick answer. But wait - what values do we # expect? And why are the values so different, ranging from -180° to 180°? # Consider this plot: what am I doing here and why? om <- c(360 + tor$omega[tor$omega < 0], tor$omega[tor$omega >= 0]) hist(om, xlim=c(0,360)) abline(v=180, col="red") # Note: a cis-peptide bond will have an omega torsion angle around 0° # = 5 H-bond lengths ====================================================== # Let's do something a little less trivial and compare # backbone H-bond lengths between helices and strands. # Secondary structure is defined in the bio3d object's ...$helix and ...$strand # list elements. str(apses) # We need to # - collect all N atoms in alpha helices resp. # beta strands; # - collect all O atoms in alpha helices resp. # beta strands; # - fetch the atom coordinates; # - calculate all N, O distances; # - filter them for distances that indicate H-bonds; and, # - plot the results. # Secondary structure can either be obtained from definitions contained in most # PDB files, or by running the DSSP algorithm IF(!) you have it installed on # your machine. See the dssp() function of bio3d for instructions how to obtain # and install DSSP and STRIDE. This is highly recommended for "real" work with # structure coordinate files. The 1BM8 file contains secondary structure # definitions: apses$helix apses$sheet # A function to collect atom indices for particular type of secondary structure ssSelect <- function(PDB, myChain = "A", ssType, myElety) { # Function to retrieve specified atom types from specified secondary # structure types in a PDB object. # Parameters: # PDB A bio3D PDB object # myChain chr The chain to use. Default: chain "A". # ssType chr A vector of keywords "helix" and/or "sheet" # myElety chr A vector of $eletype atom types to return # Value: num Indices of the matching atom rows in PDB$atom # Build a vector of $resno numbers starts <- numeric() ends <- numeric() if ("helix" %in% ssType) { sel <- PDB$helix$chain %in% myChain starts <- c(starts, PDB$helix$start[sel]) ends <- c(ends, PDB$helix$end[sel]) } if ("sheet" %in% ssType) { sel <- PDB$sheet$chain %in% myChain starts <- c(starts, PDB$sheet$start[sel]) ends <- c(ends, PDB$sheet$end[sel]) } myResno <- numeric() for (i in seq_along(starts)) { myResno <- c(myResno, starts[i]:ends[i]) } # get id's from PDB x <- bio3d::atom.select(PDB, string = "protein", type = "ATOM", chain = myChain, resno = myResno, elety = myElety) return(x$atom) } # Example: ssSelect(apses, ssType = "sheet", myElety = "N") ssSelect(apses, ssType = "sheet", myElety = "O") # That looks correct: O atoms should be stored three index position after N: the # sequence of atoms in a PDB file is usually N, CA, C, O ... followed by the # side chain coordinates. # Now to extract the coordinates and calculate distances. Our function needs to # take the PDB object and two vectors of atom indices as argument, and return a # vector of pair-distances (actually dist.xyz() returns a matrix). pairDist <- function(PDB, a, b) { # Calculate pairwise distances between atoms indicated by a and b # Parameters: # PDB A bio3D PDB object # a int A vector of atom indexes # b int A vector of atom indexes # Value: num matrix of Euclidian distances between the atoms given in a, b. # There are as many rows as atoms in a, as many columns as # atoms in b. dMat <- numeric() if (length(a) > 0 && length(b) > 0) { A <- PDB$atom[a, c("x", "y", "z")] B <- PDB$atom[b, c("x", "y", "z")] dMat <- bio3d::dist.xyz(A, B) } return(dMat) } # Let's see if this looks correct. Let's look at all the pairwise distances # between N and O atoms in both types of secondary structure: iN <- ssSelect(apses, ssType = c("helix", "sheet"), myElety = "N") iO <- ssSelect(apses, ssType = c("helix", "sheet"), myElety = "O") x <- pairDist(apses, iN, iO) hist(x, breaks = 80, col = "lavenderblush", main = "", xlab = "N-O distances (Å)") # Since we are collecting distance from all secondary structure elements, we # are just seing a big peak of (meaningless) long-distance separations. We # need to zoom in on the shorter distances, in which we expect # hydrogen bonds: hist(x[x < 4.2], # restrict to N-O distance less than 4.2 Å long breaks=seq(2.0, 4.2, 0.1), xlim=c(2.0,4.2), col = "lavenderblush", main = "", xlab = "N-O distances (Å)") # There is a large peak at about 2.2Å, and another # large peak above 3.5Å. But these are not typical hydrogen # bond distances! Rather these are (N,O) pairs in peptide # bonds, and within residues. That's not good, because these will contaminate # our statistics. # We need to exclude all distances between N of a residue # and O of a preceeding residue, and all (N,O) pairs in the # same residue. We need a function to filter distances by residue numbers. And while we are filtering, we might as well throw away the non-H bond distances too. filterHB <- function(PDB, iN, iO, dMat, cutoff = 3.9) { # Filters distances between O(i-1) and N(i), and between N(i) and O(i) # in a distance matrix where there is one row per N-atom and one # column per O atom. # Parameters: # PDB a bio3D PDB object # iN int a vector of N atom indexes # iO int a vector of O atom indexes # dMat num a distance matrix created by pairDist() # cutoff num only return distances that are shorter than "cutoff # Value: a distance matrix in which values that do not match the # filter criteria have bee set to NA. if (length(iN) > 0 && length(iO) > 0) { resN <- PDB$atom$resno[iN] resO <- PDB$atom$resno[iO] for (i in seq_along(resN)) { # for all N atoms for (j in seq_along(resO)) { # for all O atoms if (dMat[i, j] > cutoff || # if: distance > cutoff, or ... (resN[i] - 1) == resO[j] || # distance is N(i)---O(i-1), or ... resN[i] == resO[j]) { # distance is N(i)---O(i), then: dMat[i, j] <- NA # set this distance to NA. } } } } return(dMat) } # Inspect the result: hist(filterHB(apses, iN, iO, x), breaks=seq(2.0, 4.2, 0.1), xlim=c(2.0,4.2), col = "paleturquoise", main = "", xlab = "N-O distances (Å)") # Finally we can calculate alpha- and beta- structure # bonds and compare them. In this section we'll explore # different options for histogram plots. # H-bonds in helices ... iN <- ssSelect(apses, ssType = c("helix"), myElety = "N") iO <- ssSelect(apses, ssType = c("helix"), myElety = "O") dH <- filterHB(apses, iN, iO, pairDist(apses, iN, iO)) dH <- dH[!is.na(dH)] # H-bonds in sheets. (We commonly use the letter "E" to symbolize a beta # strand or sheet, because "E" visually evokes an extended strand with # protruding sidechains.) iN <- ssSelect(apses, ssType = c("sheet"), myElety = "N") iO <- ssSelect(apses, ssType = c("sheet"), myElety = "O") dE <- filterHB(apses, iN, iO, pairDist(apses, iN, iO)) dE <- dE[!is.na(dE)] # The plain histogram functions without parameters # give us white stacks. hist(dH) # and ... hist(dE) # We can see that the histrograms look different # but that is better visualized by showing two plots # in the same window. We use the par() function, for # more flexible layout, look up the layout() function. ?par ?layout opar <- par(no.readonly=TRUE) # store current state par(mfrow=c(2,1)) # set graphics parameters: 2 rows, one column # plot two histograms hist(dH) hist(dE) # add colour: hist(dH, col="#DD0055") hist(dE, col="#00AA70") # For better comparison, plot both in the # same window (use the "add = TRUE" parameter in hist() ): hist(dH, col="#DD0055") hist(dE, col="#00AA70", add = TRUE) # ... oops, we dind't reset the graphics parameters. You can either use # the broom icon of the Plot tab to clear asll plots, or ... par(opar) # ... reset the graphics parameters hist(dH, col = "#DD0055") hist(dE, col = "#00AA70", add = TRUE) # We see that the leftmost column of the sheet bonds hides the helix bonds in # that column. Not good. But we can make the colours transparent! We just need to # add a fourth set of two hexadecimal-numbers to the #RRGGBB triplet. (This # sets the transparency of the colour, it is called the "alpha channel"). Lets # use 2/3 transparent, in hexadecimal, 1/3 of 256 is x55 - i.e. an RGB triplet # specied as #RRGGBB55 is only 33% opaque: hist(dH, col = "#DD005555") hist(dE, col = "#00AA7055", add = TRUE) # To finalize the plots, let's do two more things: Explicitly define the breaks, # to make sure they match up - otherwise they would not need to... like in this # example: hist(dH, col ="#DD005555") hist(dE[dE < 3], col = "#00AA7055", add = TRUE) # Breaks are a parameter in hist() that can either be a scalar, to define how # many columns you want, or a vector, that defines the actual breakpoints. brk=seq(2.4, 4.0, 0.1) hist(dH, col="#DD005555", breaks=brk) hist(dE, col="#00AA7055", breaks=brk, add=TRUE) # The last thing to do is to think about rescaling the plot. You notice that the # y-axis is scaled in absolute frequency (i.e. counts). That gives us some # impression of the relative frequency, but it is of course skewed by observing # relatively more or less of one type of secondary structure in a protein. As # part of the hist() function we can rescale the values so that the sum over all # is one: set the parameter freq=FALSE. And we explicitly set y-axis limits # to accommodate bot distributions. hist(dH, col="#DD005555", breaks=brk, freq=FALSE, ylim = c(0, 4)) hist(dE, col="#00AA7055", breaks=brk, freq=FALSE, add=TRUE) # Adding labels and legend ... hH <- hist(dH, freq=FALSE, breaks=brk, col="#DD005550", xlab="(N,O) distance (Å)", ylab="Density", ylim=c(0,4), main="Helix and Sheet H-bond lengths") hE <- hist(dE, freq=FALSE, breaks=brk, col="#00AA7060", add=TRUE) legend("topright", c(sprintf("alpha (N = %3d)", sum(hH$counts)), sprintf("beta (N = %3d)", sum(hE$counts))), fill = c("#DD005550", "#00AA7060"), bty = 'n', border = NA) # With all the functions we have defined, # it is easy to try this with a larger protein. # 3ugj for example is VERY large. pdb <- bio3d::read.pdb("3ugj") # helices... iN <- ssSelect(pdb, ssType = c("helix"), myElety = "N") iO <- ssSelect(pdb, ssType = c("helix"), myElety = "O") dH <- filterHB(pdb, iN, iO, pairDist(pdb, iN, iO)) dH <- dH[!is.na(dH)] # sheets iN <- ssSelect(pdb, ssType = c("sheet"), myElety = "N") iO <- ssSelect(pdb, ssType = c("sheet"), myElety = "O") dE <- filterHB(pdb, iN, iO, pairDist(pdb, iN, iO)) dE <- dE[!is.na(dE)] # histograms ... brk=seq(2.4, 4.0, 0.1) hH <- hist(dH, freq=FALSE, breaks=brk, col="#DD005550", xlab="(N,O) distance (Å)", ylab="Density", ylim=c(0,4), main="Helix and Sheet H-bond lengths") hE <- hist(dE, freq=FALSE, breaks=brk, col="#00AA7060", add=TRUE) legend('topright', c(paste("alpha (N = ", sum(hH$counts), ")"), paste("beta (N = ", sum(hE$counts), ")")), fill = c("#DD005550", "#00AA7060"), bty = 'n', border = NA, inset = 0.1) # It's apparent that the distributions are indeed different. Our sample # is large, but derives from a single protein. To do database scale statistics, # we should look at many more proteins. To give you a sense of how, let's do # this for just ten proteins, randomly selected from non-homologous, # high-resolution, single domain structures. I have provided a utility function # for such a selection (details beyond the scope of this project). myPDBs <- selectPDBrep(10) # My selection is "2OVJ", "1HQS", "3BON", "4JZX", "3BQ3", "2IUM", "2C9E", # "4X1F", "2V3I", "3GE3". Yours will be different. # We are loading the files online - don't do this if you have bandwidth # limitations. dH <- c() # collect all helix H-bonds here dE <- c() # collect all sheet H-bonds here for (i in seq_along(myPDBs)) { pdb <- bio3d::read.pdb(myPDBs[i]) # helices... iN <- ssSelect(pdb, ssType = c("helix"), myElety = "N") iO <- ssSelect(pdb, ssType = c("helix"), myElety = "O") x <- filterHB(pdb, iN, iO, pairDist(pdb, iN, iO)) dH <- c(dH, x[!is.na(x)]) # sheets iN <- ssSelect(pdb, ssType = c("sheet"), myElety = "N") iO <- ssSelect(pdb, ssType = c("sheet"), myElety = "O") x <- filterHB(pdb, iN, iO, pairDist(pdb, iN, iO)) dE <- c(dE, x[!is.na(x)]) } # Inspect the results length(dH) # 4415 (your numbers are different, but there should be many) length(dE) # 262 brk=seq(2.0, 4.0, 0.1) hH <- hist(dH, freq=FALSE, breaks=brk, col="#DD005550", xlab="(N,O) distance (Å)", ylab="Density", ylim=c(0,4), cex.main = 0.8, main="Helix and Sheet H-bond lengths (10 representative structures)") hE <- hist(dE, freq=FALSE, breaks=brk, col="#00AA7060", add=TRUE) legend('topright', c(paste("alpha (N = ", sum(hH$counts), ")"), paste("beta (N = ", sum(hE$counts), ")")), fill = c("#DD005550", "#00AA7060"), bty = 'n', border = NA, inset = 0.1) # Task: # Why do you think these distributions are different? # At what distance do you think H-bonds have the lowest energy? # For alpha-helices? For beta-strands? # [END]