# bch441-work-abc-units/BIN-ALI-Optimal_sequence_alignment.R

# BIN-ALI-Optimal_sequence_alignment.R
#
# Purpose:  A Bioinformatics Course:
#              R code accompanying the BIN-ALI-Optimal_sequence_alignment unit.
#
# Version:  0.1
#
# Date:     2017  08  28
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
#           0.1    First code copied from 2016 material.
#
# TODO:
#
#
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================

#TOC> ==========================================================================
#TOC>
#TOC>   Section  Title                                   Line
#TOC> -------------------------------------------------------
#TOC>   1        Prepare                                   41
#TOC>   2        Biostrings Pairwise Alignment             49
#TOC>   2.1      Optimal global alignment                  60
#TOC>   2.2      Optimal local alignment                  123
#TOC>   3        APSES Domain annotation by alignment     147
#TOC>   4        Update your database script              228
#TOC>
#TOC> ==========================================================================


# =    1  Prepare  =============================================================

# You need to recreate the protein database that you have constructed in the
# BIN-Storing_data unit.

source("makeProteinDB.R")


# =    2  Biostrings Pairwise Alignment  =======================================

# Make sure the Bioconductor package Biostrings is installed, then attach it.
# The former installer script (biocLite.R) has been retired by Bioconductor;
# Bioconductor packages are now installed with BiocManager::install().
if (!requireNamespace("Biostrings", quietly = TRUE)) {
  if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
  }
  BiocManager::install("Biostrings")
}
library(Biostrings)

# Biostrings stores sequences in "XString" objects. Once we have converted our
# target sequences to AAString objects, the alignment itself is straightforward.

# ==   2.1  Optimal global alignment  ==========================================

# The pairwiseAlignment() function was written to behave
# exactly like the functions you encountered on the EMBOSS server.

# First: make AAString objects ...
# Yeast Mbp1: select its row in the protein table by name, and wrap the
# sequence in an AAString object.
sel <- myDB$protein$name == "MBP1_SACCE"
aaMBP1_SACCE <- AAString(myDB$protein$sequence[sel])

# MYSPE Mbp1: the protein name is "MBP1_" followed by whatever biCode()
# returns for MYSPE. paste0() concatenates without a separator.
sel <- myDB$protein$name == paste0("MBP1_", biCode(MYSPE))
aaMBP1_MYSPE <- AAString(myDB$protein$sequence[sel])

# Have a look at the help page of the alignment function ...
?pairwiseAlignment
# ... then compute the alignment. No alignment type is passed here: global
# optimal alignment with end-gap penalties is the default.
ali1 <- pairwiseAlignment(
  pattern = aaMBP1_SACCE,
  subject = aaMBP1_MYSPE,
  gapExtension = 0.5,
  gapOpening = 10,
  substitutionMatrix = "BLOSUM62")

str(ali1)  # ... it's complicated: str() shows the nested structure of ali1

# ali1 is a Biostrings alignment object. Typing its name prints a summary;
# writePairwiseAlignments() writes the alignment out as text.
ali1
writePairwiseAlignments(ali1)   # That should look familiar

# We can also make the internal structure work for us. The @ operator accesses
# the slots of a class instance, in the same way that $ accesses the elements
# of a list.
str(ali1@pattern)       # structure of the "pattern" slot
ali1@pattern            # ... and its printed form
ali1@pattern@range      # slot "range" (see section 3 for @start and @width)
ali1@pattern@indel      # slot "indel" - presumably the gaps; inspect to confirm
ali1@pattern@mismatch   # slot "mismatch" - presumably mismatched positions

# ... or work with "normal" R functions.
# NOTE(review): s2c() is not defined in this file - presumably it splits a
# string into a vector of single characters; confirm that your setup loads it.

# The alignment length: number of characters of the aligned pattern.
nchar(ali1@pattern)

# The number of identities: compare the aligned pattern and the aligned subject
# character by character, and count the positions at which they are equal.
sum(s2c(as.character(ali1@pattern)) ==
      s2c(as.character(ali1@subject)))

# ... e.g. to calculate the percentage of identities: 100 times the number of
# identities, divided by the alignment length.
100 *
  sum(s2c(as.character(ali1@pattern)) ==
        s2c(as.character(ali1@subject))) /
  nchar(ali1@pattern)
# ... which should be the same as reported in the writePairwiseAlignments()
# output. Awkward to type? Then it calls for a function:
#
percentID <- function(al) {
  # Percent identity of an alignment object.
  # Parameters:
  #   al   an object with slots "pattern" and "subject" that hold the two
  #        aligned sequences (e.g. a Biostrings pairwise alignment)
  # Value: 100 * (number of positions at which the aligned pattern and the
  #        aligned subject carry the same character) / (aligned length)
  patternChars <- s2c(as.character(al@pattern))
  subjectChars <- s2c(as.character(al@subject))
  nIdentical <- sum(patternChars == subjectChars)
  100 * nIdentical / nchar(al@pattern)
}

percentID(ali1)  # should reproduce the percentage computed inline above

# ==   2.2  Optimal local alignment  ===========================================

# Compare with local optimal alignment (like EMBOSS Water)
ali2 <- pairwiseAlignment(
  pattern = aaMBP1_SACCE,
  subject = aaMBP1_MYSPE,
  substitutionMatrix = "BLOSUM62",
  gapExtension = 10,
  gapOpening = 50,
  type = "local")

# This has probably only aligned the N-terminal DNA binding domain - but that
# one has quite high sequence identity:
writePairwiseAlignments(ali2)
percentID(ali2)

# == TASK: ==

# Compare the two alignments. I have weighted the local alignment heavily
# towards an ungapped alignment by setting very high gap penalties. Try changing
# the gap penalties and see what happens: how does the number of indels change,
# how does the length of indels change...


# =    3  APSES Domain annotation by alignment  ================================

# In this section we define the MYSPE APSES sequence by performing a global,
# optimal sequence alignment of the yeast APSES domain with the full length
# protein sequence of the protein that was the most similar to the yeast APSES
# domain.
#

# I have annotated the yeast APSES domain as a feature in the
# database. To view the annotation, we can retrieve it via the proteinID and
# featureID. Here is the yeast protein ID:
# The enclosing parentheses make R print each value as it is assigned.
(proID <- myDB$protein$ID[myDB$protein$name == "MBP1_SACCE"])


# ... and if you look at the feature table, you can identify the feature ID
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])

# ... and with these two IDs we can get the ID of the corresponding record in
# the annotation table, which joins a protein with a feature
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
                             myDB$annotation$featureID == ftrID])

# Show the complete annotation record. This reads the same table and the same
# key columns (proteinID, featureID) that were used to find fanID.
# (Assumes myDB$annotation is a data frame, so that rows can be selected.)
myDB$annotation[myDB$annotation$proteinID == proID &
                  myDB$annotation$featureID == ftrID, ]

# The annotation record contains the start and end coordinates which we can use
# to define the APSES domain sequence with a substr() expression.

# Read the start and end coordinates from the annotation record whose ID is
# fanID. The enclosing parentheses print each value as it is assigned.
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
# Cut the domain out of the sequence of the protein whose ID is proID.
# substr() positions are 1-based and include both the first and last position.
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
                 start,
                 end))

# Lots of code. But don't get lost. Let's recapitulate what we have done: we
# have selected from the sequence column of the protein table the sequence whose
# name is "MBP1_SACCE", and selected from the annotation table the start
# and end coordinates of the annotation that joins an "APSES fold" feature with
# the sequence, and used the start and end coordinates to extract a substring.

# Let's convert this to an AAString and assign it:
aaMB1_SACCE_APSES <- AAString(apses)

# The two sequences have very different lengths: one is a single domain, the
# other a full-length protein. We therefore align them with the "overlap"
# type, which turns the end-gap penalties off. That is crucially important
# here.

aliApses <- pairwiseAlignment(
  pattern = aaMB1_SACCE_APSES,
  subject = aaMBP1_MYSPE,
  substitutionMatrix = "BLOSUM62",
  gapExtension = 0.5,
  gapOpening = 10,
  type = "overlap")

# Inspect the result. The aligned sequences should be clearly
# homologous, and have (almost) no indels. The entire "pattern"
# sequence from QIYSAR ... to ... KPLFDF  should be matched
# with the "subject". Is this correct?
writePairwiseAlignments(aliApses)

# If this is correct, you can extract the matched sequence from
# the alignment object. The syntax is a bit different from what
# you have seen before: this is an "S4 object", not a list, so its
# parts ("slots") are accessed with @ rather than $. No
# worries: as.character() returns a normal string.
as.character(aliApses@subject)

# Now, what are the aligned start and end coordinates? You can read them from
# the output of writePairwiseAlignments(), or you can get them from the range of
# the match, which is stored in the "range" slot of the aligned subject.

str(aliApses@subject@range)

# start is stored directly in the range:
aliApses@subject@range@start

# ... and end is computed from start and width: a range that begins at
# "start" and covers "width" positions ends at start + width - 1.
aliApses@subject@range@start + aliApses@subject@range@width - 1


# =    4  Update your database script  =========================================


# Since we have this feature defined now, we can create a feature annotation
# right away and store it in myDB.  Follow the following steps carefully:
#
#
#   - Make a copy of the file "./data/refAnnotations.json" in your project
#     directory and give it a new name that corresponds to MYSPE - e.g. if
#     MYSPE is called "Cryptococcus neoformans", your file should be called
#     "CRYNEAnnotations.json"; in that case "MBP1_CRYNE" would also be the
#     "name" of your protein. Open the file in the RStudio editor and delete
#     all annotations but one for an "APSES fold". Edit that annotation to
#     correspond to your MBP1_MYSPE protein and enter the start and end
#     coordinates you have just discovered for the APSES domain in your
#     sequence. Save your file.
#
#   - Validate your file online at https://jsonlint.com/
#
#   - Next, you need to update your "makeProteinDB.R" script to load the
#     annotation when you recreate the database. Open the script in the
#     RStudio editor, and add the following command at the end:
#
#     myDB <- dbAddAnnotation(myDB, fromJSON("<MYSPE>Annotations.json"))
#
#   - save the file and source() it:
#
#     source("makeProteinDB.R")
#
#     This should run without errors or warnings. If it doesn't work and you
#     can't figure out quickly what's happening, ask on the mailing list for
#     help.
#
#   - Confirm
#     The following commands should retrieve the correct start and end
#     coordinates for the MBP1_MYSPE APSES domain:

# Select the MYSPE Mbp1 protein in the rebuilt database. Its name is "MBP1_"
# followed by whatever biCode() returns for MYSPE.
sel <- myDB$protein$name == paste0("MBP1_", biCode(MYSPE))
aaMBP1_MYSPE <- AAString(myDB$protein$sequence[sel])


# Reuse the same selection to get the protein ID, so that no placeholder name
# has to be edited by hand. Then follow the IDs through the feature and
# annotation tables to the start and end coordinates, and extract the
# annotated subsequence. The enclosing parentheses print each value.
(proID <- myDB$protein$ID[sel])
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
                             myDB$annotation$featureID == ftrID])
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
                 start,
                 end))


# [END]