2017-09-12 20:09:20 +00:00
|
|
|
# BIN-ALI-Optimal_sequence_alignment.R
|
|
|
|
#
|
|
|
|
# Purpose: A Bioinformatics Course:
|
|
|
|
# R code accompanying the BIN-ALI-Optimal_sequence_alignment unit.
|
|
|
|
#
|
2017-11-16 21:37:53 +00:00
|
|
|
# Version: 1.4
|
2017-09-12 20:09:20 +00:00
|
|
|
#
|
2017-11-14 07:57:13 +00:00
|
|
|
# Date: 2017 09 - 2017 11
|
2017-09-12 20:09:20 +00:00
|
|
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
|
|
|
#
|
|
|
|
# Versions:
|
2017-11-16 21:37:53 +00:00
|
|
|
# 1.4 Pull s2c() from seqinr package, rather then loading the
|
|
|
|
# entire library.
|
2017-11-16 21:01:56 +00:00
|
|
|
# 1.3 Updated confirmation task with correct logic
|
2017-11-16 19:46:15 +00:00
|
|
|
# 1.2 Added missing load of seqinr package
|
2017-11-14 07:57:13 +00:00
|
|
|
# 1.1 Update annotation file logic - it could already have been
|
|
|
|
# prepared in the BIN-FUNC-Annotation unit.
|
2017-10-30 21:24:09 +00:00
|
|
|
# 1.0.1 bugfix
|
|
|
|
# 1.0 First 2017 live version.
|
2017-09-12 20:09:20 +00:00
|
|
|
# 0.1 First code copied from 2016 material.
|
|
|
|
#
|
|
|
|
# TODO:
|
|
|
|
#
|
|
|
|
#
|
|
|
|
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
2017-10-23 03:38:32 +00:00
|
|
|
#
|
2017-09-12 20:09:20 +00:00
|
|
|
# If there are portions you don't understand, use R's help system, Google for an
|
|
|
|
# answer, or ask your instructor. Don't continue if you don't understand what's
|
|
|
|
# going on. That's not how it works ...
|
2017-10-23 03:38:32 +00:00
|
|
|
#
|
2017-09-12 20:09:20 +00:00
|
|
|
# ==============================================================================
|
|
|
|
|
2017-10-29 03:05:53 +00:00
|
|
|
|
2017-10-23 03:38:32 +00:00
|
|
|
#TOC> ==========================================================================
|
2017-11-16 21:37:53 +00:00
|
|
|
#TOC>
|
2017-11-14 07:57:13 +00:00
|
|
|
#TOC> Section Title Line
|
|
|
|
#TOC> --------------------------------------------------------------------
|
2017-11-16 21:37:53 +00:00
|
|
|
#TOC> 1 Prepare 52
|
|
|
|
#TOC> 2 Biostrings Pairwise Alignment 66
|
|
|
|
#TOC> 2.1 Optimal global alignment 84
|
|
|
|
#TOC> 2.2 Optimal local alignment 147
|
|
|
|
#TOC> 3 APSES Domain annotation by alignment 171
|
|
|
|
#TOC> 4 Update your database script 252
|
|
|
|
#TOC> 4.1 Preparing an annotation file ... 258
|
|
|
|
#TOC> 4.1.1 If you HAVE NOT done the BIN-FUNC-Annotation unit 260
|
|
|
|
#TOC> 4.1.2 If you HAVE done the BIN-FUNC-Annotation unit 303
|
|
|
|
#TOC> 4.2 Execute and Validate 327
|
|
|
|
#TOC>
|
2017-10-23 03:38:32 +00:00
|
|
|
#TOC> ==========================================================================
|
|
|
|
|
|
|
|
|
|
|
|
# = 1 Prepare =============================================================
|
|
|
|
|
2017-11-16 21:37:53 +00:00
|
|
|
# To simplify code, we pull the function s2c(x) from the seqinr package,
|
|
|
|
# rather than using the lengthier idiom unlist(strsplit(x, "").
|
|
|
|
# This assumes that the seqinr package has been installed previously.
|
|
|
|
s2c <- seqinr::s2c
|
2017-11-16 19:46:15 +00:00
|
|
|
|
|
|
|
|
2017-10-23 03:38:32 +00:00
|
|
|
# You need to recreate the protein database that you have constructed in the
|
|
|
|
# BIN-Storing_data unit.
|
|
|
|
|
|
|
|
source("makeProteinDB.R")
|
|
|
|
|
|
|
|
|
|
|
|
# = 2 Biostrings Pairwise Alignment =======================================
|
|
|
|
|
2017-11-16 19:46:15 +00:00
|
|
|
|
2017-10-23 03:38:32 +00:00
|
|
|
if (!require(Biostrings, quietly=TRUE)) {
|
2017-10-29 03:05:53 +00:00
|
|
|
if (! exists("biocLite")) {
|
|
|
|
source("https://bioconductor.org/biocLite.R")
|
|
|
|
}
|
2017-10-23 03:38:32 +00:00
|
|
|
biocLite("Biostrings")
|
|
|
|
library(Biostrings)
|
|
|
|
}
|
2017-10-29 03:05:53 +00:00
|
|
|
# library(help = Biostrings) # basic information
|
|
|
|
# browseVignettes("Biostrings") # available vignettes
|
|
|
|
# data(package = "Biostrings") # available datasets
|
|
|
|
|
2017-09-12 20:09:20 +00:00
|
|
|
|
2017-10-29 03:05:53 +00:00
|
|
|
# Biostrings stores sequences in "XString" objects. Once we have converted our
|
|
|
|
# target sequences to AAString objects, the alignment itself is straightforward.
|
2017-09-12 20:09:20 +00:00
|
|
|
|
2017-10-23 03:38:32 +00:00
|
|
|
# == 2.1 Optimal global alignment ==========================================
|
|
|
|
|
|
|
|
# The pairwiseAlignment() function was written to behave
|
|
|
|
# exactly like the functions you encountered on the EMBOSS server.
|
2017-09-12 20:09:20 +00:00
|
|
|
|
|
|
|
# First: make AAString objects ...
|
|
|
|
sel <- myDB$protein$name == "MBP1_SACCE"
|
|
|
|
aaMBP1_SACCE <- AAString(myDB$protein$sequence[sel])
|
|
|
|
|
2017-10-04 03:38:48 +00:00
|
|
|
sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
|
|
|
|
aaMBP1_MYSPE <- AAString(myDB$protein$sequence[sel])
|
2017-09-12 20:09:20 +00:00
|
|
|
|
|
|
|
?pairwiseAlignment
|
|
|
|
# ... and align.
|
2017-10-23 03:38:32 +00:00
|
|
|
# Global optimal alignment with end-gap penalties is default.
|
2017-09-12 20:09:20 +00:00
|
|
|
ali1 <- pairwiseAlignment(
|
|
|
|
aaMBP1_SACCE,
|
2017-10-04 03:38:48 +00:00
|
|
|
aaMBP1_MYSPE,
|
2017-09-12 20:09:20 +00:00
|
|
|
substitutionMatrix = "BLOSUM62",
|
|
|
|
gapOpening = 10,
|
|
|
|
gapExtension = 0.5)
|
|
|
|
|
2017-10-23 03:38:32 +00:00
|
|
|
str(ali1) # ... it's complicated
|
2017-09-12 20:09:20 +00:00
|
|
|
|
|
|
|
# This is a Biostrings alignment object. But we can use Biostrings functions to
|
|
|
|
# tame it:
|
|
|
|
ali1
|
|
|
|
writePairwiseAlignments(ali1) # That should look familiar
|
|
|
|
|
|
|
|
# And we can make the internal structure work for us (@ is for classes as
|
|
|
|
# $ is for lists ...)
|
|
|
|
str(ali1@pattern)
|
|
|
|
ali1@pattern
|
|
|
|
ali1@pattern@range
|
|
|
|
ali1@pattern@indel
|
|
|
|
ali1@pattern@mismatch
|
|
|
|
|
|
|
|
# or work with "normal" R functions
|
|
|
|
# the alignment length
|
|
|
|
nchar(ali1@pattern)
|
|
|
|
|
|
|
|
# the number of identities
|
|
|
|
sum(s2c(as.character(ali1@pattern)) ==
|
2017-11-16 19:46:15 +00:00
|
|
|
s2c(as.character(ali1@subject)))
|
2017-09-12 20:09:20 +00:00
|
|
|
|
|
|
|
# ... e.g. to calculate the percentage of identities
|
|
|
|
100 *
|
|
|
|
sum(s2c(as.character(ali1@pattern)) ==
|
2017-11-16 19:46:15 +00:00
|
|
|
s2c(as.character(ali1@subject))) /
|
2017-09-12 20:09:20 +00:00
|
|
|
nchar(ali1@pattern)
|
|
|
|
# ... which should be the same as reported in the writePairwiseAlignments()
|
|
|
|
# output. Awkward to type? Then it calls for a function:
|
|
|
|
#
|
|
|
|
percentID <- function(al) {
|
|
|
|
# returns the percent-identity of a Biostrings alignment object
|
|
|
|
return(100 *
|
2017-11-16 21:37:53 +00:00
|
|
|
sum(seqinr::s2c(as.character(al@pattern)) ==
|
|
|
|
seqinr::s2c(as.character(al@subject))) /
|
2017-09-12 20:09:20 +00:00
|
|
|
nchar(al@pattern))
|
|
|
|
}
|
|
|
|
|
|
|
|
percentID(ali1)
|
|
|
|
|
2017-10-23 03:38:32 +00:00
|
|
|
# == 2.2 Optimal local alignment ===========================================
|
|
|
|
|
2017-09-12 20:09:20 +00:00
|
|
|
# Compare with local optimal alignment (like EMBOSS Water)
|
|
|
|
ali2 <- pairwiseAlignment(
|
|
|
|
aaMBP1_SACCE,
|
2017-10-04 03:38:48 +00:00
|
|
|
aaMBP1_MYSPE,
|
2017-09-12 20:09:20 +00:00
|
|
|
type = "local",
|
|
|
|
substitutionMatrix = "BLOSUM62",
|
|
|
|
gapOpening = 50,
|
|
|
|
gapExtension = 10)
|
|
|
|
|
|
|
|
writePairwiseAlignments(ali2) # This has probably only aligned the N-terminal
|
2017-10-23 03:38:32 +00:00
|
|
|
# DNA binding domain - but that one has quite
|
|
|
|
# high sequence identity:
|
2017-09-12 20:09:20 +00:00
|
|
|
percentID(ali2)
|
|
|
|
|
|
|
|
# == TASK: ==
|
|
|
|
|
|
|
|
# Compare the two alignments. I have weighted the local alignment heavily
|
|
|
|
# towards an ungapped alignment by setting very high gap penalties. Try changing
|
|
|
|
# the gap penalties and see what happens: how does the number of indels change,
|
|
|
|
# how does the length of indels change...
|
|
|
|
|
|
|
|
|
2017-10-23 03:38:32 +00:00
|
|
|
# = 3 APSES Domain annotation by alignment ================================
|
2017-09-12 20:09:20 +00:00
|
|
|
|
2017-10-04 03:38:48 +00:00
|
|
|
# In this section we define the MYSPE APSES sequence by performing a global,
|
2017-10-23 03:38:32 +00:00
|
|
|
# optimal sequence alignment of the yeast APSES domain with the full length
|
|
|
|
# protein sequence of the protein that was the most similar to the yeast APSES
|
|
|
|
# domain.
|
2017-09-12 20:09:20 +00:00
|
|
|
#
|
|
|
|
|
2017-10-23 03:38:32 +00:00
|
|
|
# I have annotated the yeast APSES domain as a feature in the
|
2017-09-12 20:09:20 +00:00
|
|
|
# database. To view the annotation, we can retrieve it via the proteinID and
|
|
|
|
# featureID. Here is the yeast protein ID:
|
2017-10-23 03:38:32 +00:00
|
|
|
(proID <- myDB$protein$ID[myDB$protein$name == "MBP1_SACCE"])
|
2017-09-12 20:09:20 +00:00
|
|
|
|
|
|
|
|
|
|
|
# ... and if you look at the feature table, you can identify the feature ID
|
2017-10-23 03:38:32 +00:00
|
|
|
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
|
2017-09-12 20:09:20 +00:00
|
|
|
|
2017-10-23 03:38:32 +00:00
|
|
|
# ... and with the two annotations we can get the corresponding ID from the
|
2017-09-12 20:09:20 +00:00
|
|
|
# annotation table
|
2017-10-23 03:38:32 +00:00
|
|
|
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
|
|
|
|
myDB$annotation$featureID == ftrID])
|
|
|
|
|
2017-10-30 21:24:09 +00:00
|
|
|
myDB$annotation[myDB$annotation$ID == proID &
|
|
|
|
myDB$annotation$ID == ftrID, ]
|
2017-09-12 20:09:20 +00:00
|
|
|
|
|
|
|
# The annotation record contains the start and end coordinates which we can use
|
|
|
|
# to define the APSES domain sequence with a substr() expression.
|
2017-10-23 03:38:32 +00:00
|
|
|
|
|
|
|
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
|
|
|
|
(end <- myDB$annotation$end[myDB$annotation$ID == fanID])
|
|
|
|
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
|
|
|
|
start,
|
|
|
|
end))
|
2017-09-12 20:09:20 +00:00
|
|
|
|
|
|
|
# Lots of code. But don't get lost. Let's recapitulate what we have done: we
|
|
|
|
# have selected from the sequence column of the protein table the sequence whose
|
2017-10-23 03:38:32 +00:00
|
|
|
# name is "MBP1_SACCE", and selected from the annotation table the start
|
2017-09-12 20:09:20 +00:00
|
|
|
# and end coordinates of the annotation that joins an "APSES fold" feature with
|
|
|
|
# the sequence, and used the start and end coordinates to extract a substring.
|
|
|
|
|
|
|
|
# Let's convert this to an AAstring and assign it:
|
2017-10-23 03:38:32 +00:00
|
|
|
aaMB1_SACCE_APSES <- AAString(apses)
|
2017-09-12 20:09:20 +00:00
|
|
|
|
|
|
|
# Now let's align these two sequences of very different length without end-gap
|
|
|
|
# penalties using the "overlap" type. "overlap" turns the
|
|
|
|
# end-gap penalties off and that is crucially important since
|
|
|
|
# the sequences have very different length.
|
|
|
|
|
|
|
|
aliApses <- pairwiseAlignment(
|
|
|
|
aaMB1_SACCE_APSES,
|
2017-10-04 03:38:48 +00:00
|
|
|
aaMBP1_MYSPE,
|
2017-09-12 20:09:20 +00:00
|
|
|
type = "overlap",
|
|
|
|
substitutionMatrix = "BLOSUM62",
|
|
|
|
gapOpening = 10,
|
|
|
|
gapExtension = 0.5)
|
|
|
|
|
|
|
|
# Inspect the result. The aligned sequences should be clearly
|
|
|
|
# homologous, and have (almost) no indels. The entire "pattern"
|
|
|
|
# sequence from QIYSAR ... to ... KPLFDF should be matched
|
|
|
|
# with the "query". Is this correct?
|
|
|
|
writePairwiseAlignments(aliApses)
|
|
|
|
|
|
|
|
# If this is correct, you can extract the matched sequence from
|
|
|
|
# the alignment object. The syntax is a bit different from what
|
|
|
|
# you have seen before: this is an "S4 object", not a list. No
|
|
|
|
# worries: as.character() returns a normal string.
|
|
|
|
as.character(aliApses@subject)
|
|
|
|
|
|
|
|
# Now, what are the aligned start and end coordinates? You can read them from
|
|
|
|
# the output of writePairwiseAlignments(), or you can get them from the range of
|
|
|
|
# the match.
|
|
|
|
|
|
|
|
str(aliApses@subject@range)
|
|
|
|
|
|
|
|
# start is:
|
|
|
|
aliApses@subject@range@start
|
|
|
|
|
|
|
|
# ... and end is:
|
|
|
|
aliApses@subject@range@start + aliApses@subject@range@width - 1
|
|
|
|
|
|
|
|
|
2017-10-23 03:38:32 +00:00
|
|
|
# = 4 Update your database script =========================================
|
2017-09-12 20:09:20 +00:00
|
|
|
|
|
|
|
|
2017-10-23 03:38:32 +00:00
|
|
|
# Since we have this feature defined now, we can create a feature annotation
|
2017-11-14 07:57:13 +00:00
|
|
|
# right away and store it in myDB.
|
|
|
|
|
|
|
|
# == 4.1 Preparing an annotation file ... ==================================
|
2017-10-23 03:38:32 +00:00
|
|
|
#
|
2017-11-14 07:57:13 +00:00
|
|
|
# === 4.1.1 If you HAVE NOT done the BIN-FUNC-Annotation unit
|
2017-10-23 03:38:32 +00:00
|
|
|
#
|
|
|
|
#
|
2017-11-14 07:57:13 +00:00
|
|
|
# You DON'T already have a file called "<MYSPE>-Annotations.json" in the
|
|
|
|
# ./data/ directory:
|
|
|
|
#
|
|
|
|
# - Make a copy of the file "./data/refAnnotations.json" and put it in your
|
|
|
|
# project directory.
|
|
|
|
#
|
|
|
|
# - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.
|
|
|
|
# if MYSPE is called "Crptycoccus neoformans", your file should be called
|
|
|
|
# "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is
|
|
|
|
# "MBP1_CRYNE").
|
|
|
|
#
|
|
|
|
# - Open the file in the RStudio editor and delete all blocks for
|
|
|
|
# the Mbp1 protein annotations except the first one.
|
|
|
|
#
|
|
|
|
# - From that block, delete all lines except for the line that says:
|
|
|
|
#
|
|
|
|
# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"},
|
|
|
|
#
|
|
|
|
# - Then delete the comma at the end of the line (your file will just have
|
|
|
|
# this one annotation).
|
2017-10-23 03:38:32 +00:00
|
|
|
#
|
2017-11-14 07:57:13 +00:00
|
|
|
# - Edit that annotation: change MBP1_SACCE to MBP1_<MYSPE> and change the
|
|
|
|
# "start" and "end" features to the coordinates you just discovered for the
|
|
|
|
# APSES domain in your sequence.
|
|
|
|
#
|
|
|
|
# - Save your file.
|
|
|
|
#
|
|
|
|
## - Validate your file online at https://jsonlint.com/
|
|
|
|
#
|
|
|
|
# - Update your "makeProteinDB.R" script to load your new
|
2017-10-23 03:38:32 +00:00
|
|
|
# annotation when you recreate the database. Open the script in the
|
2017-11-14 07:57:13 +00:00
|
|
|
# RStudio editor, and add the following command at the end:
|
|
|
|
#
|
|
|
|
# myDB <- dbAddAnnotation(myDB, fromJSON("<MYSPE>-Annotations.json"))
|
|
|
|
#
|
|
|
|
# - save and close the file.
|
|
|
|
#
|
|
|
|
# Then SKIP the next section.
|
|
|
|
#
|
2017-10-23 03:38:32 +00:00
|
|
|
#
|
2017-11-16 21:37:53 +00:00
|
|
|
# === 4.1.2 If you HAVE done the BIN-FUNC-Annotation unit
|
2017-10-23 03:38:32 +00:00
|
|
|
#
|
2017-11-14 07:57:13 +00:00
|
|
|
#
|
|
|
|
# You DO already have a file called "<MYSPE>-Annotations.json" in the
|
|
|
|
# ./data/ directory:
|
|
|
|
#
|
|
|
|
# - Open the file in the RStudio editor.
|
|
|
|
#
|
|
|
|
# - Below the last feature lines (but before the closing "]") add the
|
|
|
|
# following feature line (without the "#")
|
|
|
|
#
|
|
|
|
# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"}
|
|
|
|
#
|
|
|
|
# - Edit that annotation: change MBP1_SACCE to MBP1_<MYSPE> and change the
|
|
|
|
# "start" and "end" features to the coordinates you just discovered for the
|
|
|
|
# APSES domain in your sequence.
|
|
|
|
#
|
|
|
|
# - Add a comma after the preceding feature line.
|
|
|
|
#
|
|
|
|
# - Save your file.
|
|
|
|
#
|
|
|
|
# - Validate your file online at https://jsonlint.com/
|
|
|
|
#
|
|
|
|
#
|
|
|
|
# == 4.2 Execute and Validate ==============================================
|
|
|
|
#
|
|
|
|
# - source() your database creation script:
|
2017-10-23 03:38:32 +00:00
|
|
|
#
|
|
|
|
# source("makeProteinDB.R")
|
|
|
|
#
|
|
|
|
# This should run without errors or warnings. If it doesn't work and you
|
2017-11-14 07:57:13 +00:00
|
|
|
# can't figure out quickly what's happening, ask on the mailing list for
|
2017-10-23 03:38:32 +00:00
|
|
|
# help.
|
|
|
|
#
|
|
|
|
# - Confirm
|
|
|
|
# The following commands should retrieve the correct start and end
|
2017-11-14 07:57:13 +00:00
|
|
|
# coordinates and sequence of the MBP1_MYSPE APSES domain:
|
2017-09-12 20:09:20 +00:00
|
|
|
|
2017-11-16 21:01:56 +00:00
|
|
|
sel <- which(myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = ""))
|
2017-09-12 20:09:20 +00:00
|
|
|
|
2017-11-16 21:01:56 +00:00
|
|
|
(proID <- myDB$protein$ID[sel])
|
2017-10-23 03:38:32 +00:00
|
|
|
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
|
|
|
|
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
|
2017-11-16 21:01:56 +00:00
|
|
|
myDB$annotation$featureID == ftrID])
|
2017-10-23 03:38:32 +00:00
|
|
|
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
|
|
|
|
(end <- myDB$annotation$end[myDB$annotation$ID == fanID])
|
|
|
|
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
|
|
|
|
start,
|
|
|
|
end))
|
2017-09-12 20:09:20 +00:00
|
|
|
|
|
|
|
|
|
|
|
# [END]
|