bch441-work-abc-units/BIN-ALI-Optimal_sequence_alignment.R

366 lines
13 KiB
R
Raw Normal View History

# tocID <- "BIN-ALI-Optimal_sequence_alignment.R"
#
2017-09-12 20:09:20 +00:00
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-Optimal_sequence_alignment unit.
#
2021-09-16 05:29:19 +00:00
# ==============================================================================
# Version: 1.7.1
2017-09-12 20:09:20 +00:00
#
# Date: 2017-09 - 2020-10
2017-09-12 20:09:20 +00:00
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
2020-10-02 08:31:06 +00:00
# 1.7.1 add jsonlite:: to fromjJSON() in code sample and ./myScripts/
2020-09-25 11:28:17 +00:00
# 1.7 2020 updates
2019-11-15 03:40:45 +00:00
# 1.6 Maintenance
# 1.5 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 1.4 Pull s2c() from seqinr package, rather then loading the
# entire library.
2017-11-16 21:01:56 +00:00
# 1.3 Updated confirmation task with correct logic
2017-11-16 19:46:15 +00:00
# 1.2 Added missing load of seqinr package
# 1.1 Update annotation file logic - it could already have been
# prepared in the BIN-FUNC-Annotation unit.
2017-10-30 21:24:09 +00:00
# 1.0.1 bugfix
# 1.0 First 2017 live version.
2017-09-12 20:09:20 +00:00
# 0.1 First code copied from 2016 material.
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
2017-10-23 03:38:32 +00:00
#
2017-09-12 20:09:20 +00:00
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
2017-10-23 03:38:32 +00:00
#
2017-09-12 20:09:20 +00:00
# ==============================================================================
2017-10-23 03:38:32 +00:00
#TOC> ==========================================================================
2021-10-26 15:04:31 +00:00
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------------------------------------------
2021-10-26 15:04:31 +00:00
#TOC> 1 Prepare 58
#TOC> 2 Biostrings Pairwise Alignment 75
#TOC> 2.1 Optimal global alignment 93
#TOC> 2.2 Optimal local alignment 156
#TOC> 3 APSES Domain annotation by alignment 180
#TOC> 4 Update your database script 261
#TOC> 4.1 Preparing an annotation file ... 267
#TOC> 4.1.1 If you HAVE NOT done the BIN-FUNC-Annotation unit 269
#TOC> 4.1.2 If you HAVE done the BIN-FUNC-Annotation unit 314
#TOC> 4.2 Execute and Validate 338
#TOC>
2017-10-23 03:38:32 +00:00
#TOC> ==========================================================================
# = 1 Prepare =============================================================
if (! requireNamespace("seqinr", quietly=TRUE)) {
install.packages("seqinr")
}
# You can get package information with the following commands:
# library(help = seqinr) # basic information
# browseVignettes("seqinr") # available vignettes
# data(package = "seqinr") # available datasets
2017-11-16 19:46:15 +00:00
2017-10-23 03:38:32 +00:00
# You need to recreate the protein database that you have constructed in the
# BIN-Storing_data unit.
2020-09-25 11:28:17 +00:00
source("./myScripts/makeProteinDB.R")
2017-10-23 03:38:32 +00:00
# = 2 Biostrings Pairwise Alignment =======================================
2017-11-16 19:46:15 +00:00
if (!requireNamespace("BiocManager", quietly=TRUE)) {
install.packages("BiocManager")
}
if (!requireNamespace("Biostrings", quietly=TRUE)) {
BiocManager::install("Biostrings")
2017-10-23 03:38:32 +00:00
}
# Package information:
# library(help = Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets
2017-09-12 20:09:20 +00:00
# Biostrings stores sequences in "XString" objects. Once we have converted our
# target sequences to AAString objects, the alignment itself is straightforward.
2017-09-12 20:09:20 +00:00
2017-10-23 03:38:32 +00:00
# == 2.1 Optimal global alignment ==========================================
# The pairwiseAlignment() function was written to behave
# exactly like the functions you encountered on the EMBOSS server.
2017-09-12 20:09:20 +00:00
# First: make AAString objects ...
sel <- myDB$protein$name == "MBP1_SACCE"
aaMBP1_SACCE <- Biostrings::AAString(myDB$protein$sequence[sel])
2017-09-12 20:09:20 +00:00
2017-10-04 03:38:48 +00:00
sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
aaMBP1_MYSPE <- Biostrings::AAString(myDB$protein$sequence[sel])
2017-09-12 20:09:20 +00:00
?pairwiseAlignment
# ... and align.
2017-10-23 03:38:32 +00:00
# Global optimal alignment with end-gap penalties is default.
ali1 <- Biostrings::pairwiseAlignment(
2017-09-12 20:09:20 +00:00
aaMBP1_SACCE,
2017-10-04 03:38:48 +00:00
aaMBP1_MYSPE,
2017-09-12 20:09:20 +00:00
substitutionMatrix = "BLOSUM62",
gapOpening = 10,
gapExtension = 0.5)
2017-10-23 03:38:32 +00:00
str(ali1) # ... it's complicated
2017-09-12 20:09:20 +00:00
# This is a Biostrings alignment object. But we can use Biostrings functions to
# tame it:
ali1
Biostrings::writePairwiseAlignments(ali1) # That should look familiar
2017-09-12 20:09:20 +00:00
# And we can make the internal structure work for us (@ is for classes as
# $ is for lists ...)
str(ali1@pattern)
ali1@pattern
ali1@pattern@range
ali1@pattern@indel
ali1@pattern@mismatch
# or work with "normal" R functions
# the alignment length
2019-11-15 03:40:45 +00:00
nchar(as.character(ali1@pattern))
2017-09-12 20:09:20 +00:00
# the number of identities
2019-11-15 03:40:45 +00:00
sum(seqinr::s2c(as.character(ali1@pattern)) ==
seqinr::s2c(as.character(ali1@subject)))
2017-09-12 20:09:20 +00:00
# ... e.g. to calculate the percentage of identities
100 *
2019-11-15 03:40:45 +00:00
sum(seqinr::s2c(as.character(ali1@pattern)) ==
seqinr::s2c(as.character(ali1@subject))) /
nchar(as.character(ali1@pattern))
2017-09-12 20:09:20 +00:00
# ... which should be the same as reported in the writePairwiseAlignments()
# output. Awkward to type? Then it calls for a function:
#
percentID <- function(al) {
# returns the percent-identity of a Biostrings alignment object
return(100 *
sum(seqinr::s2c(as.character(al@pattern)) ==
seqinr::s2c(as.character(al@subject))) /
2019-11-15 03:40:45 +00:00
nchar(as.character(al@pattern)))
2017-09-12 20:09:20 +00:00
}
percentID(ali1)
2017-10-23 03:38:32 +00:00
# == 2.2 Optimal local alignment ===========================================
2017-09-12 20:09:20 +00:00
# Compare with local optimal alignment (like EMBOSS Water)
ali2 <- Biostrings::pairwiseAlignment(
2017-09-12 20:09:20 +00:00
aaMBP1_SACCE,
2017-10-04 03:38:48 +00:00
aaMBP1_MYSPE,
2017-09-12 20:09:20 +00:00
type = "local",
substitutionMatrix = "BLOSUM62",
gapOpening = 50,
gapExtension = 10)
Biostrings::writePairwiseAlignments(ali2)
# This has probably only aligned the N-terminal DNA binding domain - but that
# one has quite high sequence identity:
2017-09-12 20:09:20 +00:00
percentID(ali2)
# == TASK: ==
# Compare the two alignments. I have weighted the local alignment heavily
# towards an ungapped alignment by setting very high gap penalties. Try changing
# the gap penalties and see what happens: how does the number of indels change,
# how does the length of indels change...
2017-10-23 03:38:32 +00:00
# = 3 APSES Domain annotation by alignment ================================
2017-09-12 20:09:20 +00:00
2017-10-04 03:38:48 +00:00
# In this section we define the MYSPE APSES sequence by performing a global,
2017-10-23 03:38:32 +00:00
# optimal sequence alignment of the yeast APSES domain with the full length
# protein sequence of the protein that was the most similar to the yeast APSES
# domain.
2017-09-12 20:09:20 +00:00
#
2017-10-23 03:38:32 +00:00
# I have annotated the yeast APSES domain as a feature in the
2017-09-12 20:09:20 +00:00
# database. To view the annotation, we can retrieve it via the proteinID and
# featureID. Here is the yeast protein ID:
2017-10-23 03:38:32 +00:00
(proID <- myDB$protein$ID[myDB$protein$name == "MBP1_SACCE"])
2017-09-12 20:09:20 +00:00
# ... and if you look at the feature table, you can identify the feature ID
2017-10-23 03:38:32 +00:00
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
2017-09-12 20:09:20 +00:00
2017-10-23 03:38:32 +00:00
# ... and with the two annotations we can get the corresponding ID from the
2017-09-12 20:09:20 +00:00
# annotation table
2017-10-23 03:38:32 +00:00
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
myDB$annotation$featureID == ftrID])
2017-10-30 21:24:09 +00:00
myDB$annotation[myDB$annotation$ID == proID &
myDB$annotation$ID == ftrID, ]
2017-09-12 20:09:20 +00:00
# The annotation record contains the start and end coordinates which we can use
# to define the APSES domain sequence with a substr() expression.
2017-10-23 03:38:32 +00:00
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
(end <- myDB$annotation$end[myDB$annotation$ID == fanID])
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
start,
end))
2017-09-12 20:09:20 +00:00
# Lots of code. But don't get lost. Let's recapitulate what we have done: we
# have selected from the sequence column of the protein table the sequence whose
2017-10-23 03:38:32 +00:00
# name is "MBP1_SACCE", and selected from the annotation table the start
2017-09-12 20:09:20 +00:00
# and end coordinates of the annotation that joins an "APSES fold" feature with
# the sequence, and used the start and end coordinates to extract a substring.
# Let's convert this to an AAstring and assign it:
aaMB1_SACCE_APSES <- Biostrings::AAString(apses)
2017-09-12 20:09:20 +00:00
# Now let's align these two sequences of very different length without end-gap
# penalties using the "overlap" type. "overlap" turns the
# end-gap penalties off and that is crucially important since
# the sequences have very different length.
aliApses <- Biostrings::pairwiseAlignment(
2017-09-12 20:09:20 +00:00
aaMB1_SACCE_APSES,
2017-10-04 03:38:48 +00:00
aaMBP1_MYSPE,
2017-09-12 20:09:20 +00:00
type = "overlap",
substitutionMatrix = "BLOSUM62",
gapOpening = 10,
gapExtension = 0.5)
# Inspect the result. The aligned sequences should be clearly
# homologous, and have (almost) no indels. The entire "pattern"
# sequence from QIYSAR ... to ... KPLFDF should be matched
# with the "query". Is this correct?
Biostrings::writePairwiseAlignments(aliApses)
2017-09-12 20:09:20 +00:00
# If this is correct, you can extract the matched sequence from
# the alignment object. The syntax is a bit different from what
# you have seen before: this is an "S4 object", not a list. No
# worries: as.character() returns a normal string.
as.character(aliApses@subject)
# Now, what are the aligned start and end coordinates? You can read them from
# the output of writePairwiseAlignments(), or you can get them from the range of
# the match.
str(aliApses@subject@range)
# start is:
aliApses@subject@range@start
# ... and end is:
aliApses@subject@range@start + aliApses@subject@range@width - 1
2017-10-23 03:38:32 +00:00
# = 4 Update your database script =========================================
2017-09-12 20:09:20 +00:00
2017-10-23 03:38:32 +00:00
# Since we have this feature defined now, we can create a feature annotation
# right away and store it in myDB.
# == 4.1 Preparing an annotation file ... ==================================
2017-10-23 03:38:32 +00:00
#
# === 4.1.1 If you HAVE NOT done the BIN-FUNC-Annotation unit
2017-10-23 03:38:32 +00:00
#
#
# You DON'T already have a file called "<MYSPE>-Annotations.json" in the
2020-10-02 08:38:01 +00:00
# ./myScripts/ directory:
#
# - Make a copy of the file "./data/refAnnotations.json" and put it in your
2020-10-02 08:38:01 +00:00
# myScripts/ directory.
#
# - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.
# if MYSPE is called "Crptycoccus neoformans", your file should be called
# "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is
# "MBP1_CRYNE").
#
# - Open the file in the RStudio editor and delete all blocks for
# the Mbp1 protein annotations except the first one.
#
# - From that block, delete all lines except for the line that says:
#
# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"},
#
# - Then delete the comma at the end of the line (your file will just have
# this one annotation).
2017-10-23 03:38:32 +00:00
#
# - Edit that annotation: change MBP1_SACCE to MBP1_<MYSPE> and change the
# "start" and "end" features to the coordinates you just discovered for the
# APSES domain in your sequence.
#
2020-10-02 08:32:29 +00:00
# - Save the file in your myScripts/ directory
#
## - Validate your file online at https://jsonlint.com/
#
2020-10-02 08:38:01 +00:00
# - Update your "./myScripts/makeProteinDB.R" script to load your new
2017-10-23 03:38:32 +00:00
# annotation when you recreate the database. Open the script in the
# RStudio editor, and add the following command at the end:
#
# myDB <- dbAddAnnotation(myDB,
2020-10-02 08:31:06 +00:00
# jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json"))
# ^^^^^^^
# edit this!
# - save and close the file.
#
# Then SKIP the next section.
#
2017-10-23 03:38:32 +00:00
#
2021-10-26 15:04:31 +00:00
# === 4.1.2 If you HAVE done the BIN-FUNC-Annotation unit
2017-10-23 03:38:32 +00:00
#
#
# You DO already have a file called "<MYSPE>-Annotations.json" in the
2020-10-02 08:38:01 +00:00
# ./myScripts/ directory:
#
# - Open the file in the RStudio editor.
#
# - Below the last feature lines (but before the closing "]") add the
# following feature line (without the "#")
#
# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"}
#
# - Edit that annotation: change MBP1_SACCE to MBP1_<MYSPE> and change the
# "start" and "end" features to the coordinates you just discovered for the
# APSES domain in your sequence.
#
# - Add a comma after the preceding feature line.
#
# - Save your file.
#
# - Validate your file online at https://jsonlint.com/
#
#
# == 4.2 Execute and Validate ==============================================
#
# - source() your database creation script:
2017-10-23 03:38:32 +00:00
#
2020-10-02 08:38:01 +00:00
# source("./myScripts/makeProteinDB.R")
2017-10-23 03:38:32 +00:00
#
# This should run without errors or warnings. If it doesn't work and you
# can't figure out quickly what's happening, ask on the mailing list for
2017-10-23 03:38:32 +00:00
# help.
#
# - Confirm
# The following commands should retrieve the correct start and end
# coordinates and sequence of the MBP1_MYSPE APSES domain:
2017-09-12 20:09:20 +00:00
2017-11-16 21:01:56 +00:00
sel <- which(myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = ""))
2017-09-12 20:09:20 +00:00
2017-11-16 21:01:56 +00:00
(proID <- myDB$protein$ID[sel])
2017-10-23 03:38:32 +00:00
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
2017-11-16 21:01:56 +00:00
myDB$annotation$featureID == ftrID])
2017-10-23 03:38:32 +00:00
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
(end <- myDB$annotation$end[myDB$annotation$ID == fanID])
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
start,
end))
2017-09-12 20:09:20 +00:00
# [END]