bch441-work-abc-units/BIN-ALI-Optimal_sequence_alignment.R

# BIN-ALI-Optimal_sequence_alignment.R
#
# Purpose:  A Bioinformatics Course:
#              R code accompanying the BIN-ALI-Optimal_sequence_alignment unit.
#
# Version:  1.4
#
# Date:     2017  09   -   2017  11
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
#           1.4    Pull s2c() from seqinr package, rather then loading the
#                    entire library.
#           1.3    Updated confirmation task with correct logic
#           1.2    Added missing load of seqinr package
#           1.1    Update annotation file logic - it could already have been
#                    prepared in the BIN-FUNC-Annotation unit.
#           1.0.1  bugfix
#           1.0    First 2017 live version.
#           0.1    First code copied from 2016 material.
#
# TODO:
#
#
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================


#TOC> ==========================================================================
#TOC>
#TOC>   Section  Title                                                Line
#TOC> --------------------------------------------------------------------
#TOC>   1        Prepare                                                52
#TOC>   2        Biostrings Pairwise Alignment                          66
#TOC>   2.1      Optimal global alignment                               84
#TOC>   2.2      Optimal local alignment                               147
#TOC>   3        APSES Domain annotation by alignment                  171
#TOC>   4        Update your database script                           252
#TOC>   4.1      Preparing an annotation file ...                      258
#TOC>   4.1.1    If you HAVE NOT done the BIN-FUNC-Annotation unit     260
#TOC>   4.1.2    If you HAVE done the BIN-FUNC-Annotation unit         303
#TOC>   4.2      Execute and Validate                                  327
#TOC>
#TOC> ==========================================================================


# =    1  Prepare  =============================================================

# To simplify code, we pull the function s2c(x) from the seqinr package,
# rather than using the lengthier idiom unlist(strsplit(x, "").
# This assumes that the seqinr package has been installed previously.
s2c <- seqinr::s2c


# You need to recreate the protein database that you have constructed in the
# BIN-Storing_data unit.

source("makeProteinDB.R")


# =    2  Biostrings Pairwise Alignment  =======================================


if (!require(Biostrings, quietly=TRUE)) {
  if (! exists("biocLite")) {
    source("https://bioconductor.org/biocLite.R")
  }
  biocLite("Biostrings")
  library(Biostrings)
}
#  library(help = Biostrings)       # basic information
#  browseVignettes("Biostrings")    # available vignettes
#  data(package = "Biostrings")     # available datasets


# Biostrings stores sequences in "XString" objects. Once we have converted our
# target sequences to AAString objects, the alignment itself is straightforward.

# ==   2.1  Optimal global alignment  ==========================================

# The pairwiseAlignment() function was written to behave
# exactly like the functions you encountered on the EMBOSS server.

# First: make AAString objects ...
sel <- myDB$protein$name == "MBP1_SACCE"
aaMBP1_SACCE <- AAString(myDB$protein$sequence[sel])

sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
aaMBP1_MYSPE <-   AAString(myDB$protein$sequence[sel])

?pairwiseAlignment
# ... and align.
# Global optimal alignment with end-gap penalties is default.
ali1 <-  pairwiseAlignment(
  aaMBP1_SACCE,
  aaMBP1_MYSPE,
  substitutionMatrix = "BLOSUM62",
  gapOpening = 10,
  gapExtension = 0.5)

str(ali1)  # ... it's complicated

# This is a Biostrings alignment object. But we can use Biostrings functions to
# tame it:
ali1
writePairwiseAlignments(ali1)   # That should look familiar

# And we can make the internal structure work for us  (@ is for classes as
# $ is for lists ...)
str(ali1@pattern)
ali1@pattern
ali1@pattern@range
ali1@pattern@indel
ali1@pattern@mismatch

# or work with "normal" R functions
# the alignment length
nchar(ali1@pattern)

# the number of identities
sum(s2c(as.character(ali1@pattern)) ==
    s2c(as.character(ali1@subject)))

# ... e.g. to calculate the percentage of identities
100 *
  sum(s2c(as.character(ali1@pattern)) ==
      s2c(as.character(ali1@subject))) /
  nchar(ali1@pattern)
# ... which should be the same as reported in the writePairwiseAlignments()
# output. Awkward to type? Then it calls for a function:
#
percentID <- function(al) {
  # returns the percent-identity of a Biostrings alignment object
  return(100 *
           sum(seqinr::s2c(as.character(al@pattern)) ==
               seqinr::s2c(as.character(al@subject))) /
           nchar(al@pattern))
}

percentID(ali1)

# ==   2.2  Optimal local alignment  ===========================================

# Compare with local optimal alignment (like EMBOSS Water)
ali2 <-  pairwiseAlignment(
  aaMBP1_SACCE,
  aaMBP1_MYSPE,
  type = "local",
  substitutionMatrix = "BLOSUM62",
  gapOpening = 50,
  gapExtension = 10)

writePairwiseAlignments(ali2)   # This has probably only aligned the N-terminal
                                # DNA binding domain - but that one has quite
                                # high sequence identity:
percentID(ali2)

# == TASK: ==

# Compare the two alignments. I have weighted the local alignment heavily
# towards an ungapped alignment by setting very high gap penalties. Try changing
# the gap penalties and see what happens: how does the number of indels change,
# how does the length of indels change...


# =    3  APSES Domain annotation by alignment  ================================

# In this section we define the MYSPE APSES sequence by performing a global,
# optimal sequence alignment of the yeast APSES domain with the full length
# protein sequence of the protein that was the most similar to the yeast APSES
# domain.
#

# I have annotated the yeast APSES domain as a feature in the
# database. To view the annotation, we can retrieve it via the proteinID and
# featureID. Here is the yeast protein ID:
(proID <- myDB$protein$ID[myDB$protein$name == "MBP1_SACCE"])


# ... and if you look at the feature table, you can identify the feature ID
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])

# ... and with the two annotations we can get the corresponding ID from the
# annotation table
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
                             myDB$annotation$featureID == ftrID])

myDB$annotation[myDB$annotation$ID == proID &
                myDB$annotation$ID == ftrID, ]

# The annotation record contains the start and end coordinates which we can use
# to define the APSES domain sequence with a substr() expression.

(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
                 start,
                 end))

# Lots of code. But don't get lost. Let's recapitulate what we have done: we
# have selected from the sequence column of the protein table the sequence whose
# name is "MBP1_SACCE", and selected from the annotation table the start
# and end coordinates of the annotation that joins an "APSES fold" feature with
# the sequence, and used the start and end coordinates to extract a substring.

# Let's convert this to an AAstring and assign it:
aaMB1_SACCE_APSES <- AAString(apses)

# Now let's align these two sequences of very different length without end-gap
# penalties using the "overlap" type. "overlap" turns the
# end-gap penalties off and that is crucially important since
# the sequences have very different length.

aliApses <-  pairwiseAlignment(
  aaMB1_SACCE_APSES,
  aaMBP1_MYSPE,
  type = "overlap",
  substitutionMatrix = "BLOSUM62",
  gapOpening = 10,
  gapExtension = 0.5)

# Inspect the result. The aligned sequences should be clearly
# homologous, and have (almost) no indels. The entire "pattern"
# sequence from QIYSAR ... to ... KPLFDF  should be matched
# with the "query". Is this correct?
writePairwiseAlignments(aliApses)

# If this is correct, you can extract the matched sequence from
# the alignment object. The syntax is a bit different from what
# you have seen before: this is an "S4 object", not a list. No
# worries: as.character() returns a normal string.
as.character(aliApses@subject)

# Now, what are the aligned start and end coordinates? You can read them from
# the output of writePairwiseAlignments(), or you can get them from the range of
# the match.

str(aliApses@subject@range)

# start is:
aliApses@subject@range@start

# ... and end is:
aliApses@subject@range@start + aliApses@subject@range@width - 1


# =    4  Update your database script  =========================================


# Since we have this feature defined now, we can create a feature annotation
# right away and store it in myDB.

# ==   4.1  Preparing an annotation file ...  ==================================
#
# ===  4.1.1  If you HAVE NOT done the BIN-FUNC-Annotation unit
#
#
#   You DON'T already have a file called "<MYSPE>-Annotations.json" in the
#   ./data/ directory:
#
#   - Make a copy of the file "./data/refAnnotations.json" and put it in your
#     project directory.
#
#   - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.
#     if MYSPE is called "Crptycoccus neoformans", your file should be called
#     "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is
#     "MBP1_CRYNE").
#
#   - Open the file in the RStudio editor and delete all blocks for
#     the Mbp1 protein annotations except the first one.
#
#   - From that block, delete all lines except for the line that says:
#
# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"},
#
#   - Then delete the comma at the end of the line (your file will just have
#     this one annotation).
#
#   - Edit that annotation: change MBP1_SACCE  to MBP1_<MYSPE> and change the
#     "start" and "end" features to the coordinates you just discovered for the
#     APSES domain in your sequence.
#
#   - Save your file.
#
##   - Validate your file online at https://jsonlint.com/
#
#   - Update your "makeProteinDB.R" script to load your new
#     annotation when you recreate the database. Open the script in the
#     RStudio editor, and add the following command at the end:
#
#     myDB <- dbAddAnnotation(myDB, fromJSON("<MYSPE>-Annotations.json"))
#
#   - save and close the file.
#
# Then SKIP the next section.
#
#
# ===  4.1.2  If you HAVE done the BIN-FUNC-Annotation unit
#
#
#   You DO already have a file called "<MYSPE>-Annotations.json" in the
#   ./data/ directory:
#
#   - Open the file in the RStudio editor.
#
#   - Below the last feature lines (but before the closing "]") add the
#     following feature line (without the "#")
#
# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"}
#
#   - Edit that annotation: change MBP1_SACCE  to MBP1_<MYSPE> and change the
#     "start" and "end" features to the coordinates you just discovered for the
#     APSES domain in your sequence.
#
#   - Add a comma after the preceding feature line.
#
#   - Save your file.
#
#   - Validate your file online at https://jsonlint.com/
#
#
# ==   4.2  Execute and Validate  ==============================================
#
#   - source() your database creation script:
#
#     source("makeProteinDB.R")
#
#     This should run without errors or warnings. If it doesn't work and you
#     can't figure out quickly what's happening, ask on the mailing list for
#     help.
#
#   - Confirm
#     The following commands should retrieve the correct start and end
#     coordinates and sequence of the MBP1_MYSPE APSES domain:

sel <- which(myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = ""))

(proID <- myDB$protein$ID[sel])
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
                             myDB$annotation$featureID == ftrID])
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
                 start,
                 end))


# [END]
Added and updated learning units 2017-09-12 20:09:20 +00:00			`# BIN-ALI-Optimal_sequence_alignment.R`
			`#`
			`# Purpose: A Bioinformatics Course:`
			`# R code accompanying the BIN-ALI-Optimal_sequence_alignment unit.`
			`#`
update to function <- package::function idiom 2017-11-16 21:37:53 +00:00			`# Version: 1.4`
Added and updated learning units 2017-09-12 20:09:20 +00:00			`#`
New unit, and update to annotation file logic 2017-11-14 07:57:13 +00:00			`# Date: 2017 09 - 2017 11`
Added and updated learning units 2017-09-12 20:09:20 +00:00			`# Author: Boris Steipe (boris.steipe@utoronto.ca)`
			`#`
			`# Versions:`
update to function <- package::function idiom 2017-11-16 21:37:53 +00:00			`# 1.4 Pull s2c() from seqinr package, rather then loading the`
			`# entire library.`
Updated logic 2017-11-16 21:01:56 +00:00			`# 1.3 Updated confirmation task with correct logic`
load seqinr 2017-11-16 19:46:15 +00:00			`# 1.2 Added missing load of seqinr package`
New unit, and update to annotation file logic 2017-11-14 07:57:13 +00:00			`# 1.1 Update annotation file logic - it could already have been`
			`# prepared in the BIN-FUNC-Annotation unit.`
bugfix (old code carryover) 2017-10-30 21:24:09 +00:00			`# 1.0.1 bugfix`
			`# 1.0 First 2017 live version.`
Added and updated learning units 2017-09-12 20:09:20 +00:00			`# 0.1 First code copied from 2016 material.`
			`#`
			`# TODO:`
			`#`
			`#`
			`# == DO NOT SIMPLY source() THIS FILE! =======================================`
New unit 2017-10-23 03:38:32 +00:00			`#`
Added and updated learning units 2017-09-12 20:09:20 +00:00			`# If there are portions you don't understand, use R's help system, Google for an`
			`# answer, or ask your instructor. Don't continue if you don't understand what's`
			`# going on. That's not how it works ...`
New unit 2017-10-23 03:38:32 +00:00			`#`
Added and updated learning units 2017-09-12 20:09:20 +00:00			`# ==============================================================================`

Added package information code after every library() call. 2017-10-29 03:05:53 +00:00
New unit 2017-10-23 03:38:32 +00:00			`#TOC> ==========================================================================`
update to function <- package::function idiom 2017-11-16 21:37:53 +00:00			`#TOC>`
New unit, and update to annotation file logic 2017-11-14 07:57:13 +00:00			`#TOC> Section Title Line`
			`#TOC> --------------------------------------------------------------------`
update to function <- package::function idiom 2017-11-16 21:37:53 +00:00			`#TOC> 1 Prepare 52`
			`#TOC> 2 Biostrings Pairwise Alignment 66`
			`#TOC> 2.1 Optimal global alignment 84`
			`#TOC> 2.2 Optimal local alignment 147`
			`#TOC> 3 APSES Domain annotation by alignment 171`
			`#TOC> 4 Update your database script 252`
			`#TOC> 4.1 Preparing an annotation file ... 258`
			`#TOC> 4.1.1 If you HAVE NOT done the BIN-FUNC-Annotation unit 260`
			`#TOC> 4.1.2 If you HAVE done the BIN-FUNC-Annotation unit 303`
			`#TOC> 4.2 Execute and Validate 327`
			`#TOC>`
New unit 2017-10-23 03:38:32 +00:00			`#TOC> ==========================================================================`


			`# = 1 Prepare =============================================================`

update to function <- package::function idiom 2017-11-16 21:37:53 +00:00			`# To simplify code, we pull the function s2c(x) from the seqinr package,`
			`# rather than using the lengthier idiom unlist(strsplit(x, "").`
			`# This assumes that the seqinr package has been installed previously.`
			`s2c <- seqinr::s2c`
load seqinr 2017-11-16 19:46:15 +00:00

New unit 2017-10-23 03:38:32 +00:00			`# You need to recreate the protein database that you have constructed in the`
			`# BIN-Storing_data unit.`

			`source("makeProteinDB.R")`


			`# = 2 Biostrings Pairwise Alignment =======================================`

load seqinr 2017-11-16 19:46:15 +00:00
New unit 2017-10-23 03:38:32 +00:00			`if (!require(Biostrings, quietly=TRUE)) {`
Added package information code after every library() call. 2017-10-29 03:05:53 +00:00			`if (! exists("biocLite")) {`
			`source("https://bioconductor.org/biocLite.R")`
			`}`
New unit 2017-10-23 03:38:32 +00:00			`biocLite("Biostrings")`
			`library(Biostrings)`
			`}`
Added package information code after every library() call. 2017-10-29 03:05:53 +00:00			`# library(help = Biostrings) # basic information`
			`# browseVignettes("Biostrings") # available vignettes`
			`# data(package = "Biostrings") # available datasets`

Added and updated learning units 2017-09-12 20:09:20 +00:00
Added package information code after every library() call. 2017-10-29 03:05:53 +00:00			`# Biostrings stores sequences in "XString" objects. Once we have converted our`
			`# target sequences to AAString objects, the alignment itself is straightforward.`
Added and updated learning units 2017-09-12 20:09:20 +00:00
New unit 2017-10-23 03:38:32 +00:00			`# == 2.1 Optimal global alignment ==========================================`

			`# The pairwiseAlignment() function was written to behave`
			`# exactly like the functions you encountered on the EMBOSS server.`
Added and updated learning units 2017-09-12 20:09:20 +00:00
			`# First: make AAString objects ...`
			`sel <- myDB$protein$name == "MBP1_SACCE"`
			`aaMBP1_SACCE <- AAString(myDB$protein$sequence[sel])`

Changing "YFO" to "MYSPE" 2017-10-04 03:38:48 +00:00			`sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")`
			`aaMBP1_MYSPE <- AAString(myDB$protein$sequence[sel])`
Added and updated learning units 2017-09-12 20:09:20 +00:00
			`?pairwiseAlignment`
			`# ... and align.`
New unit 2017-10-23 03:38:32 +00:00			`# Global optimal alignment with end-gap penalties is default.`
Added and updated learning units 2017-09-12 20:09:20 +00:00			`ali1 <- pairwiseAlignment(`
			`aaMBP1_SACCE,`
Changing "YFO" to "MYSPE" 2017-10-04 03:38:48 +00:00			`aaMBP1_MYSPE,`
Added and updated learning units 2017-09-12 20:09:20 +00:00			`substitutionMatrix = "BLOSUM62",`
			`gapOpening = 10,`
			`gapExtension = 0.5)`

New unit 2017-10-23 03:38:32 +00:00			`str(ali1) # ... it's complicated`
Added and updated learning units 2017-09-12 20:09:20 +00:00
			`# This is a Biostrings alignment object. But we can use Biostrings functions to`
			`# tame it:`
			`ali1`
			`writePairwiseAlignments(ali1) # That should look familiar`

			`# And we can make the internal structure work for us (@ is for classes as`
			`# $ is for lists ...)`
			`str(ali1@pattern)`
			`ali1@pattern`
			`ali1@pattern@range`
			`ali1@pattern@indel`
			`ali1@pattern@mismatch`

			`# or work with "normal" R functions`
			`# the alignment length`
			`nchar(ali1@pattern)`

			`# the number of identities`
			`sum(s2c(as.character(ali1@pattern)) ==`
load seqinr 2017-11-16 19:46:15 +00:00			`s2c(as.character(ali1@subject)))`
Added and updated learning units 2017-09-12 20:09:20 +00:00
			`# ... e.g. to calculate the percentage of identities`
			`100 *`
			`sum(s2c(as.character(ali1@pattern)) ==`
load seqinr 2017-11-16 19:46:15 +00:00			`s2c(as.character(ali1@subject))) /`
Added and updated learning units 2017-09-12 20:09:20 +00:00			`nchar(ali1@pattern)`
			`# ... which should be the same as reported in the writePairwiseAlignments()`
			`# output. Awkward to type? Then it calls for a function:`
			`#`
			`percentID <- function(al) {`
			`# returns the percent-identity of a Biostrings alignment object`
			`return(100 *`
update to function <- package::function idiom 2017-11-16 21:37:53 +00:00			`sum(seqinr::s2c(as.character(al@pattern)) ==`
			`seqinr::s2c(as.character(al@subject))) /`
Added and updated learning units 2017-09-12 20:09:20 +00:00			`nchar(al@pattern))`
			`}`

			`percentID(ali1)`

New unit 2017-10-23 03:38:32 +00:00			`# == 2.2 Optimal local alignment ===========================================`

Added and updated learning units 2017-09-12 20:09:20 +00:00			`# Compare with local optimal alignment (like EMBOSS Water)`
			`ali2 <- pairwiseAlignment(`
			`aaMBP1_SACCE,`
Changing "YFO" to "MYSPE" 2017-10-04 03:38:48 +00:00			`aaMBP1_MYSPE,`
Added and updated learning units 2017-09-12 20:09:20 +00:00			`type = "local",`
			`substitutionMatrix = "BLOSUM62",`
			`gapOpening = 50,`
			`gapExtension = 10)`

			`writePairwiseAlignments(ali2) # This has probably only aligned the N-terminal`
New unit 2017-10-23 03:38:32 +00:00			`# DNA binding domain - but that one has quite`
			`# high sequence identity:`
Added and updated learning units 2017-09-12 20:09:20 +00:00			`percentID(ali2)`

			`# == TASK: ==`

			`# Compare the two alignments. I have weighted the local alignment heavily`
			`# towards an ungapped alignment by setting very high gap penalties. Try changing`
			`# the gap penalties and see what happens: how does the number of indels change,`
			`# how does the length of indels change...`


New unit 2017-10-23 03:38:32 +00:00			`# = 3 APSES Domain annotation by alignment ================================`
Added and updated learning units 2017-09-12 20:09:20 +00:00
Changing "YFO" to "MYSPE" 2017-10-04 03:38:48 +00:00			`# In this section we define the MYSPE APSES sequence by performing a global,`
New unit 2017-10-23 03:38:32 +00:00			`# optimal sequence alignment of the yeast APSES domain with the full length`
			`# protein sequence of the protein that was the most similar to the yeast APSES`
			`# domain.`
Added and updated learning units 2017-09-12 20:09:20 +00:00			`#`

New unit 2017-10-23 03:38:32 +00:00			`# I have annotated the yeast APSES domain as a feature in the`
Added and updated learning units 2017-09-12 20:09:20 +00:00			`# database. To view the annotation, we can retrieve it via the proteinID and`
			`# featureID. Here is the yeast protein ID:`
New unit 2017-10-23 03:38:32 +00:00			`(proID <- myDB$protein$ID[myDB$protein$name == "MBP1_SACCE"])`
Added and updated learning units 2017-09-12 20:09:20 +00:00

			`# ... and if you look at the feature table, you can identify the feature ID`
New unit 2017-10-23 03:38:32 +00:00			`(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])`
Added and updated learning units 2017-09-12 20:09:20 +00:00
New unit 2017-10-23 03:38:32 +00:00			`# ... and with the two annotations we can get the corresponding ID from the`
Added and updated learning units 2017-09-12 20:09:20 +00:00			`# annotation table`
New unit 2017-10-23 03:38:32 +00:00			`(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &`
			`myDB$annotation$featureID == ftrID])`

bugfix (old code carryover) 2017-10-30 21:24:09 +00:00			`myDB$annotation[myDB$annotation$ID == proID &`
			`myDB$annotation$ID == ftrID, ]`
Added and updated learning units 2017-09-12 20:09:20 +00:00
			`# The annotation record contains the start and end coordinates which we can use`
			`# to define the APSES domain sequence with a substr() expression.`
New unit 2017-10-23 03:38:32 +00:00
			`(start <- myDB$annotation$start[myDB$annotation$ID == fanID])`
			`(end <- myDB$annotation$end[myDB$annotation$ID == fanID])`
			`(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],`
			`start,`
			`end))`
Added and updated learning units 2017-09-12 20:09:20 +00:00
			`# Lots of code. But don't get lost. Let's recapitulate what we have done: we`
			`# have selected from the sequence column of the protein table the sequence whose`
New unit 2017-10-23 03:38:32 +00:00			`# name is "MBP1_SACCE", and selected from the annotation table the start`
Added and updated learning units 2017-09-12 20:09:20 +00:00			`# and end coordinates of the annotation that joins an "APSES fold" feature with`
			`# the sequence, and used the start and end coordinates to extract a substring.`

			`# Let's convert this to an AAstring and assign it:`
New unit 2017-10-23 03:38:32 +00:00			`aaMB1_SACCE_APSES <- AAString(apses)`
Added and updated learning units 2017-09-12 20:09:20 +00:00
			`# Now let's align these two sequences of very different length without end-gap`
			`# penalties using the "overlap" type. "overlap" turns the`
			`# end-gap penalties off and that is crucially important since`
			`# the sequences have very different length.`

			`aliApses <- pairwiseAlignment(`
			`aaMB1_SACCE_APSES,`
Changing "YFO" to "MYSPE" 2017-10-04 03:38:48 +00:00			`aaMBP1_MYSPE,`
Added and updated learning units 2017-09-12 20:09:20 +00:00			`type = "overlap",`
			`substitutionMatrix = "BLOSUM62",`
			`gapOpening = 10,`
			`gapExtension = 0.5)`

			`# Inspect the result. The aligned sequences should be clearly`
			`# homologous, and have (almost) no indels. The entire "pattern"`
			`# sequence from QIYSAR ... to ... KPLFDF should be matched`
			`# with the "query". Is this correct?`
			`writePairwiseAlignments(aliApses)`

			`# If this is correct, you can extract the matched sequence from`
			`# the alignment object. The syntax is a bit different from what`
			`# you have seen before: this is an "S4 object", not a list. No`
			`# worries: as.character() returns a normal string.`
			`as.character(aliApses@subject)`

			`# Now, what are the aligned start and end coordinates? You can read them from`
			`# the output of writePairwiseAlignments(), or you can get them from the range of`
			`# the match.`

			`str(aliApses@subject@range)`

			`# start is:`
			`aliApses@subject@range@start`

			`# ... and end is:`
			`aliApses@subject@range@start + aliApses@subject@range@width - 1`


New unit 2017-10-23 03:38:32 +00:00			`# = 4 Update your database script =========================================`
Added and updated learning units 2017-09-12 20:09:20 +00:00

New unit 2017-10-23 03:38:32 +00:00			`# Since we have this feature defined now, we can create a feature annotation`
New unit, and update to annotation file logic 2017-11-14 07:57:13 +00:00			`# right away and store it in myDB.`

			`# == 4.1 Preparing an annotation file ... ==================================`
New unit 2017-10-23 03:38:32 +00:00			`#`
New unit, and update to annotation file logic 2017-11-14 07:57:13 +00:00			`# === 4.1.1 If you HAVE NOT done the BIN-FUNC-Annotation unit`
New unit 2017-10-23 03:38:32 +00:00			`#`
			`#`
New unit, and update to annotation file logic 2017-11-14 07:57:13 +00:00			`# You DON'T already have a file called "<MYSPE>-Annotations.json" in the`
			`# ./data/ directory:`
			`#`
			`# - Make a copy of the file "./data/refAnnotations.json" and put it in your`
			`# project directory.`
			`#`
			`# - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.`
			`# if MYSPE is called "Crptycoccus neoformans", your file should be called`
			`# "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is`
			`# "MBP1_CRYNE").`
			`#`
			`# - Open the file in the RStudio editor and delete all blocks for`
			`# the Mbp1 protein annotations except the first one.`
			`#`
			`# - From that block, delete all lines except for the line that says:`
			`#`
			`# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"},`
			`#`
			`# - Then delete the comma at the end of the line (your file will just have`
			`# this one annotation).`
New unit 2017-10-23 03:38:32 +00:00			`#`
New unit, and update to annotation file logic 2017-11-14 07:57:13 +00:00			`# - Edit that annotation: change MBP1_SACCE to MBP1_<MYSPE> and change the`
			`# "start" and "end" features to the coordinates you just discovered for the`
			`# APSES domain in your sequence.`
			`#`
			`# - Save your file.`
			`#`
			`## - Validate your file online at https://jsonlint.com/`
			`#`
			`# - Update your "makeProteinDB.R" script to load your new`
New unit 2017-10-23 03:38:32 +00:00			`# annotation when you recreate the database. Open the script in the`
New unit, and update to annotation file logic 2017-11-14 07:57:13 +00:00			`# RStudio editor, and add the following command at the end:`
			`#`
			`# myDB <- dbAddAnnotation(myDB, fromJSON("<MYSPE>-Annotations.json"))`
			`#`
			`# - save and close the file.`
			`#`
			`# Then SKIP the next section.`
			`#`
New unit 2017-10-23 03:38:32 +00:00			`#`
update to function <- package::function idiom 2017-11-16 21:37:53 +00:00			`# === 4.1.2 If you HAVE done the BIN-FUNC-Annotation unit`
New unit 2017-10-23 03:38:32 +00:00			`#`
New unit, and update to annotation file logic 2017-11-14 07:57:13 +00:00			`#`
			`# You DO already have a file called "<MYSPE>-Annotations.json" in the`
			`# ./data/ directory:`
			`#`
			`# - Open the file in the RStudio editor.`
			`#`
			`# - Below the last feature lines (but before the closing "]") add the`
			`# following feature line (without the "#")`
			`#`
			`# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"}`
			`#`
			`# - Edit that annotation: change MBP1_SACCE to MBP1_<MYSPE> and change the`
			`# "start" and "end" features to the coordinates you just discovered for the`
			`# APSES domain in your sequence.`
			`#`
			`# - Add a comma after the preceding feature line.`
			`#`
			`# - Save your file.`
			`#`
			`# - Validate your file online at https://jsonlint.com/`
			`#`
			`#`
			`# == 4.2 Execute and Validate ==============================================`
			`#`
			`# - source() your database creation script:`
New unit 2017-10-23 03:38:32 +00:00			`#`
			`# source("makeProteinDB.R")`
			`#`
			`# This should run without errors or warnings. If it doesn't work and you`
New unit, and update to annotation file logic 2017-11-14 07:57:13 +00:00			`# can't figure out quickly what's happening, ask on the mailing list for`
New unit 2017-10-23 03:38:32 +00:00			`# help.`
			`#`
			`# - Confirm`
			`# The following commands should retrieve the correct start and end`
New unit, and update to annotation file logic 2017-11-14 07:57:13 +00:00			`# coordinates and sequence of the MBP1_MYSPE APSES domain:`
Added and updated learning units 2017-09-12 20:09:20 +00:00
Updated logic 2017-11-16 21:01:56 +00:00			`sel <- which(myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = ""))`
Added and updated learning units 2017-09-12 20:09:20 +00:00
Updated logic 2017-11-16 21:01:56 +00:00			`(proID <- myDB$protein$ID[sel])`
New unit 2017-10-23 03:38:32 +00:00			`(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])`
			`(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &`
Updated logic 2017-11-16 21:01:56 +00:00			`myDB$annotation$featureID == ftrID])`
New unit 2017-10-23 03:38:32 +00:00			`(start <- myDB$annotation$start[myDB$annotation$ID == fanID])`
			`(end <- myDB$annotation$end[myDB$annotation$ID == fanID])`
			`(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],`
			`start,`
			`end))`
Added and updated learning units 2017-09-12 20:09:20 +00:00

			`# [END]`