2020-09-18 11:56:30 +00:00
# tocID <- "BIN-ALI-Similarity.R"
#
2017-09-12 20:09:20 +00:00
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-Similarity unit.
#
2020-09-25 11:06:24 +00:00
# Version: 1.2
2017-09-12 20:09:20 +00:00
#
2020-09-25 11:06:24 +00:00
# Date: 2017-10 - 2020-09
2017-09-12 20:09:20 +00:00
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
2020-09-25 11:06:24 +00:00
# 1.2 2020 Updates
2019-01-08 07:11:25 +00:00
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
2017-10-23 03:39:12 +00:00
# 1.0 Refactored for 2017; add aaindex, ternary plot.
2017-09-12 20:09:20 +00:00
# 0.1 First code copied from 2016 material.
2017-10-23 03:39:12 +00:00
#
2017-09-12 20:09:20 +00:00
#
# TODO:
2020-09-25 11:06:24 +00:00
# Update ggtern:: ternary plot to use aacol dots under text
2017-09-12 20:09:20 +00:00
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
2017-10-23 03:39:12 +00:00
#
2017-09-12 20:09:20 +00:00
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
2017-10-23 03:39:12 +00:00
#
2017-09-12 20:09:20 +00:00
# ==============================================================================
2017-10-29 03:05:53 +00:00
2017-10-23 03:39:12 +00:00
#TOC> ==========================================================================
2020-09-25 11:07:25 +00:00
#TOC>
2019-01-08 07:11:25 +00:00
#TOC> Section Title Line
#TOC> ----------------------------------------------
2020-09-25 11:07:25 +00:00
#TOC> 1 Amino Acid Properties 43
#TOC> 2 Mutation Data matrix 189
#TOC> 3 Background score 230
#TOC>
2017-10-23 03:39:12 +00:00
#TOC> ==========================================================================
# = 1 Amino Acid Properties ===============================================
# A large collection of amino acid property tables is available via the seqinr
# package:
2019-01-08 07:11:25 +00:00
if ( ! requireNamespace ( " seqinr" , quietly = TRUE ) ) {
2017-10-23 03:39:12 +00:00
install.packages ( " seqinr" )
}
2017-10-29 03:05:53 +00:00
# Package information:
# library(help = seqinr) # basic information
# browseVignettes("seqinr") # available vignettes
# data(package = "seqinr") # available datasets
2017-10-23 03:39:12 +00:00
# A true Labor of Love has gone into the compilation of the seqinr "aaindex"
# data:
? aaindex
2020-09-25 11:06:24 +00:00
data ( aaindex , package = " seqinr" ) # load the aaindex list from the package
2017-10-23 03:39:12 +00:00
length ( aaindex )
# Here are all the index descriptions
for ( i in 1 : length ( aaindex ) ) {
cat ( paste ( i , " : " , aaindex [ [i ] ] $ D , " \n" , sep = " " ) )
}
# It's a bit cumbersome to search through the descriptions ... here is a
# function to make this easier:
searchAAindex <- function ( patt ) {
# Searches the aaindex descriptions for regular expression "patt"
# and prints index number and description.
hits <- which ( sapply ( aaindex , function ( x ) length ( grep ( patt , x $ D ) ) > 0 ) )
for ( i in seq_along ( hits ) ) {
cat ( sprintf ( " %3d\t%s\n" , hits [i ] , aaindex [ [ hits [i ] ] ] $ D ) )
}
}
searchAAindex ( " free energy" ) # Search for "free energy"
searchAAindex ( " (size)|(volume)" ) # Search for "size" or "volume":
2017-09-12 20:09:20 +00:00
2017-10-23 03:39:12 +00:00
# Let's examine ...
# ... a hydrophobicity index
( Y <- aaindex [ [528 ] ] [c ( " D" , " I" ) ] )
# ... a volume index
( V <- aaindex [ [150 ] ] [c ( " D" , " I" ) ] )
# ... and one of our own: side-chain pK values as reported by
# Pace et al. (2009) JBC 284:13285-13289, with non-ionizable pKs set
# to 7.4 (physiological pH)
K <- list ( I = c ( 7.4 , # Ala
12.3 , # Arg
7.4 , # Asn
3.9 , # Asp
8.6 , # Cys
7.4 , # Gln
4.3 , # Glu
7.4 , # Gly
6.5 , # His
7.4 , # Ile
7.4 , # Leu
10.4 , # Lys
7.4 , # Met
7.4 , # Phe
7.4 , # Pro
7.4 , # Ser
7.4 , # Thr
7.4 , # Trp
9.8 , # Tyr
7.4 ) ) # Val
names ( K $ I ) <- c ( " Ala" , " Arg" , " Asn" , " Asp" , " Cys" , " Gln" , " Glu" , " Gly" , " His" , " Ile" ,
" Leu" , " Lys" , " Met" , " Phe" , " Pro" , " Ser" , " Thr" , " Trp" , " Tyr" , " Val" )
# Given these biophysical indices, how similar are the amino acids? We have three-dimensions of measures here. Scatterplots can only display two dimensions ...
2020-09-25 11:06:24 +00:00
# pull the names from Y$I, convert them to single letter code, and reorder the
# AACOLS palette accordingly ...
aac <- AACOLS [toupper ( seqinr :: a ( names ( Y $ I ) ) ) ]
plot ( Y $ I , V $ I ,
xlab = " hydrophobicity" , ylab = " volume" ,
pch = 21 ,
cex = 6 ,
col = aac ,
bg = aac )
text ( Y $ I , V $ I , names ( Y $ I ) , cex = 0.8 )
plot ( Y $ I , K $ I ,
xlab = " hydrophobicity" , ylab = " pK" ,
pch = 21 ,
cex = 6 ,
col = aac ,
bg = aac )
text ( Y $ I , K $ I , names ( Y $ I ) , cex = 0.8 )
2017-10-23 03:39:12 +00:00
# ... but how do we plot 3D data? Plotting into a 3D cube is possible, but such
# plots are in general unintuitive and hard to interpret. One alternative is a
# so-called "ternary plot":
2019-01-08 07:11:25 +00:00
if ( ! requireNamespace ( " ggtern" , quietly = TRUE ) ) {
2017-10-23 03:39:12 +00:00
install.packages ( " ggtern" )
}
2017-10-29 03:05:53 +00:00
# Package information:
# library(help = ggtern) # basic information
# browseVignettes("ggtern") # available vignettes
# data(package = "ggtern") # available datasets
2017-10-23 03:39:12 +00:00
# collect into data frame, normalize to (0.05, 0.95)
myDat <- data.frame ( " phi" = 0.9 * ( ( ( Y $ I - min ( Y $ I ) ) / ( max ( Y $ I ) - min ( Y $ I ) ) ) ) +0.05 ,
" vol" = 0.9 * ( ( ( V $ I - min ( V $ I ) ) / ( max ( V $ I ) - min ( V $ I ) ) ) ) +0.05 ,
" pK" = 0.9 * ( ( ( K $ I - min ( K $ I ) ) / ( max ( K $ I ) - min ( K $ I ) ) ) ) +0.05 ,
stringsAsFactors = FALSE )
rownames ( myDat ) <- names ( Y $ I )
2019-01-08 07:11:25 +00:00
ggtern :: ggtern ( data = myDat ,
ggplot2 :: aes ( x = vol ,
y = phi ,
z = pK ,
label = rownames ( myDat ) ) ) + ggplot2 :: geom_text ( )
2017-10-23 03:39:12 +00:00
# This results in a mapping of amino acids relative to each other that is
# similar to the Venn diagram you have seen in the notes.
2020-09-25 11:06:24 +00:00
# ... or we could use principal components analysis, to pull out the
# best projection of the three feature dimensions into two. (Done here without delving
# into the theory ...)
prc <- prcomp ( myDat )
plot ( prc $ x [ , 1 ] , prc $ x [ , 2 ] , xlab = " " , ylab = " " , xaxt = " n" , yaxt = " n" ,
pch = 19 , cex = 6 , col = aad , cex.main = 0.7 ,
main = " Principal Component Analysis of Amino Acid Features" )
text ( prc $ x [ , 1 ] , prc $ x [ , 2 ] , names ( Y $ I ) , cex = 0.8 , col = " #00000088" )
# This matches the intuition rather well in that "similar" amino acids are close
# on the plot. But we can't interpret the distances in terms of just one of the
# parameters. Whatever - nature has a different way to define similarity:
# mutations to similar amino acids are less likely to break the protein.
2017-10-23 03:39:12 +00:00
# = 2 Mutation Data matrix ================================================
# A mutation data matrix encodes all amino acid pairscores in a matrix.
# The Biostrings package contains the most common mutation data matrices.
2017-09-12 20:09:20 +00:00
2019-01-08 07:11:25 +00:00
if ( ! requireNamespace ( " BiocManager" , quietly = TRUE ) ) {
install.packages ( " BiocManager" )
}
if ( ! requireNamespace ( " Biostrings" , quietly = TRUE ) ) {
BiocManager :: install ( " Biostrings" )
2017-09-12 20:09:20 +00:00
}
2017-10-29 03:05:53 +00:00
# Package information:
# library(help=Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets
2017-09-12 20:09:20 +00:00
2020-10-02 10:59:29 +00:00
# Let's attach the BLOSUM62 mutation data matrix from the package
data ( BLOSUM62 , package = " Biostrings" )
2017-09-12 20:09:20 +00:00
2017-10-23 03:39:12 +00:00
# ... and see what it contains. (You've seen this matrix before.)
2017-09-12 20:09:20 +00:00
BLOSUM62
2017-10-23 03:39:12 +00:00
# We can simply access values via the row/column names.
# Identical amino acids have high scores ...
BLOSUM62 [ " H" , " H" ] # Score for a pair of two histidines
BLOSUM62 [ " S" , " S" ] # Score for a pair of two serines
# Similar amino acids have low positive scores ...
BLOSUM62 [ " L" , " I" ] # Score for a leucine / lysine pair
BLOSUM62 [ " F" , " Y" ] # etc.
# Dissimilar amino acids have negative scores ...
BLOSUM62 [ " L" , " K" ] # Score for a leucine / lysine pair
BLOSUM62 [ " Q" , " P" ] # etc.
2017-09-12 20:09:20 +00:00
2017-10-23 03:39:12 +00:00
BLOSUM62 [ " R" , " W" ] # the matrix is symmetric!
BLOSUM62 [ " W" , " R" ]
2017-09-12 20:09:20 +00:00
2017-10-23 03:39:12 +00:00
# = 3 Background score ====================================================
2017-09-12 20:09:20 +00:00
2019-01-08 07:11:25 +00:00
# The mutation data matrix is designed to give high scores to homologous
# sequences, low scores to non-homologous sequences. What score on average
# should we expect for a random sequence?
2017-09-12 20:09:20 +00:00
2017-10-23 03:39:12 +00:00
# If we sample amino acid pairs at random, we will get a score that is the
# average of the individual pairscores in the matrix. Omitting the ambiguity
# codes and the gap character:
2017-09-12 20:09:20 +00:00
2017-10-23 03:39:12 +00:00
sum ( BLOSUM62 [1 : 20 , 1 : 20 ] ) / 400
2017-09-12 20:09:20 +00:00
2017-10-23 03:39:12 +00:00
# But that score could be higher for real sequences, for which the amino acid
# distribution is not random. For example membrane proteins have a large number
# of hydrophobic residues - an alignment of unrelated proteins might produce
# positive scores. And there are other proteins with biased amino acid
# compositions, in particular poteins that interact with multiple other
# proteins. Let's test how this impacts the background score by comparing a
# sequence with shuffled sequences. These have the same composition, but are
# obvioulsy not homologous. The data directory contains the FASTA file for the
# PDB ID 3FG7 - a villin headpiece structure with a large amount of
# low-complexity amino acid sequence ...
2017-09-12 20:09:20 +00:00
2019-01-08 07:11:25 +00:00
aa3FG7 <- Biostrings :: readAAStringSet ( " ./data/3FG7.fa" ) [ [1 ] ]
2017-09-12 20:09:20 +00:00
2017-10-23 03:39:12 +00:00
# ... and the FASTA file for the E. coli OmpG outer membrane porin (PDB: 2F1C)
# with an exceptionally high percentage of hydrophobic residues.
2019-01-08 07:11:25 +00:00
aa2F1C <- Biostrings :: readAAStringSet ( " ./data/2F1C.fa" ) [ [1 ] ]
2017-10-23 03:39:12 +00:00
# Here is a function that takes two sequences and
# returns their average pairscore.
averagePairScore <- function ( a , b , MDM = BLOSUM62 ) {
# Returns average pairscore of two sequences.
# Parameters:
# a, b chr amino acid sequence string
# MDM mutation data matrix. Default is BLOSUM62
# Value: num average pairscore.
a <- unlist ( strsplit ( a , " " ) )
b <- unlist ( strsplit ( b , " " ) )
v <- 0
for ( i in seq_along ( a ) ) {
v <- v + MDM [ a [i ] , b [i ] ]
}
return ( v / length ( a ) )
}
orig3FG7 <- toString ( aa3FG7 )
orig2F1C <- toString ( aa2F1C )
N <- 1000
scores3FG7 <- numeric ( N )
scores2F1C <- numeric ( N )
for ( i in 1 : N ) {
scores3FG7 [i ] <- averagePairScore ( orig3FG7 , toString ( sample ( aa3FG7 ) ) )
scores2F1C [i ] <- averagePairScore ( orig2F1C , toString ( sample ( aa2F1C ) ) )
}
2017-09-12 20:09:20 +00:00
2017-10-23 03:39:12 +00:00
# Plot the distributions
hist ( scores3FG7 ,
col = " #5599EE33" ,
breaks = seq ( -1.5 , 0 , by = 0.1 ) ,
main = " Pairscores for randomly shuffled sequences" ,
xlab = " Average pairscore from BLOSUM 62" )
hist ( scores2F1C ,
col = " #55EE9933" ,
breaks = seq ( -1.5 , 0 , by = 0.1 ) ,
add = TRUE )
abline ( v = sum ( BLOSUM62 [1 : 20 , 1 : 20 ] ) / 400 , col = " firebrick" , lwd = 2 )
legend ( ' topright' ,
c ( " 3FG7 (villin)" , " 2F1C (OmpG)" ) ,
fill = c ( " #5599EE33" , " #55EE9933" ) , bty = ' n' ,
inset = 0.1 )
2017-09-12 20:09:20 +00:00
2017-10-23 03:39:12 +00:00
# This is an important result: even though we have shuffled significantly biased
# sequences, and the average scores trend above the average of the mutation data
# matrix, the average scores still remain comfortably below zero. This means
# that we can't (in general) improve a high-scoring alignment by simply
# extending it with randomly matched residues. We will only improve the score if
# the similarity of newly added residues is larger than what we expect to get by
# random chance!
2017-09-12 20:09:20 +00:00
# [END]