New unit
This commit is contained in:
parent
50bf936bd5
commit
932c81c8d1
@ -10,43 +10,57 @@
|
|||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
# 0.1 First code copied from 2016 material.
|
# 0.1 First code copied from 2016 material.
|
||||||
|
|
||||||
#
|
#
|
||||||
# TODO:
|
# TODO:
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||||
|
#
|
||||||
# If there are portions you don't understand, use R's help system, Google for an
|
# If there are portions you don't understand, use R's help system, Google for an
|
||||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||||
# going on. That's not how it works ...
|
# going on. That's not how it works ...
|
||||||
|
#
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
|
|
||||||
# = 1 Biostrings Pairwise Alignment
|
#TOC> ==========================================================================
|
||||||
|
#TOC>
|
||||||
|
#TOC> Section Title Line
|
||||||
|
#TOC> -------------------------------------------------------
|
||||||
|
#TOC> 1 Prepare 41
|
||||||
|
#TOC> 2 Biostrings Pairwise Alignment 49
|
||||||
|
#TOC> 2.1 Optimal global alignment 60
|
||||||
|
#TOC> 2.2 Optimal local alignment 123
|
||||||
|
#TOC> 3 APSES Domain annotation by alignment 147
|
||||||
|
#TOC> 4 Update your database script 228
|
||||||
|
#TOC>
|
||||||
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
# Biostrings is one of the basic packages that the Bioconductor software
|
|
||||||
# landscape builds on. It stores sequences in "AAstring" objects and these are
|
|
||||||
# complex software structures that are designed to be able to handle
|
|
||||||
# genome-scale sequences. Biostrings functions - such as the alignment functions
|
|
||||||
# - expect their input to be Biostrings objects.
|
|
||||||
?AAString
|
|
||||||
|
|
||||||
AAString("ACDE")
|
|
||||||
s <- AAString("ACDE")
|
|
||||||
str(s)
|
|
||||||
# See: it's complicated. This is an "S4" object. Bioconductor uses these objects
|
|
||||||
# almost exclusively, but we will not be poking around in their internals. Just
|
|
||||||
# this: how do we get the sequence back out of an AAString object? The help page
|
|
||||||
# for XString - the parent "class" of AAStrings - mentions the alternatives:
|
|
||||||
|
|
||||||
as.character(s) # the base R version
|
|
||||||
toString(s) # using the Biostrings function toString()
|
|
||||||
|
|
||||||
# While we need to remember to convert our sequences from the character vectors
|
# = 1 Prepare =============================================================
|
||||||
# that we store in our database, to AAStrings that we can align, the alignment
|
|
||||||
# itself is really straightforward. The pairwiseAlignment() function was written
|
# You need to recreate the protein database that you have constructed in the
|
||||||
# to behave exactly like the functions you encountered on the EMBOSS server.
|
# BIN-Storing_data unit.
|
||||||
|
|
||||||
|
source("makeProteinDB.R")
|
||||||
|
|
||||||
|
|
||||||
|
# = 2 Biostrings Pairwise Alignment =======================================
|
||||||
|
|
||||||
|
if (!require(Biostrings, quietly=TRUE)) {
|
||||||
|
source("https://bioconductor.org/biocLite.R")
|
||||||
|
biocLite("Biostrings")
|
||||||
|
library(Biostrings)
|
||||||
|
}
|
||||||
|
|
||||||
|
# Biostrings stores sequences in "XString" objects. Once we have onverted our
|
||||||
|
# traget sequences to AAString objects, the alignment itself is straightforward.
|
||||||
|
|
||||||
|
# == 2.1 Optimal global alignment ==========================================
|
||||||
|
|
||||||
|
# The pairwiseAlignment() function was written to behave
|
||||||
|
# exactly like the functions you encountered on the EMBOSS server.
|
||||||
|
|
||||||
# First: make AAString objects ...
|
# First: make AAString objects ...
|
||||||
sel <- myDB$protein$name == "MBP1_SACCE"
|
sel <- myDB$protein$name == "MBP1_SACCE"
|
||||||
@ -56,9 +70,8 @@ sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
|
|||||||
aaMBP1_MYSPE <- AAString(myDB$protein$sequence[sel])
|
aaMBP1_MYSPE <- AAString(myDB$protein$sequence[sel])
|
||||||
|
|
||||||
?pairwiseAlignment
|
?pairwiseAlignment
|
||||||
|
|
||||||
# ... and align.
|
# ... and align.
|
||||||
# Global optimal alignment with end-gap penalties is default. (like EMBOSS needle)
|
# Global optimal alignment with end-gap penalties is default.
|
||||||
ali1 <- pairwiseAlignment(
|
ali1 <- pairwiseAlignment(
|
||||||
aaMBP1_SACCE,
|
aaMBP1_SACCE,
|
||||||
aaMBP1_MYSPE,
|
aaMBP1_MYSPE,
|
||||||
@ -66,7 +79,7 @@ ali1 <- pairwiseAlignment(
|
|||||||
gapOpening = 10,
|
gapOpening = 10,
|
||||||
gapExtension = 0.5)
|
gapExtension = 0.5)
|
||||||
|
|
||||||
str(ali1) # Did you think the AAString object was complicated ?
|
str(ali1) # ... it's complicated
|
||||||
|
|
||||||
# This is a Biostrings alignment object. But we can use Biostrings functions to
|
# This is a Biostrings alignment object. But we can use Biostrings functions to
|
||||||
# tame it:
|
# tame it:
|
||||||
@ -107,6 +120,8 @@ percentID <- function(al) {
|
|||||||
|
|
||||||
percentID(ali1)
|
percentID(ali1)
|
||||||
|
|
||||||
|
# == 2.2 Optimal local alignment ===========================================
|
||||||
|
|
||||||
# Compare with local optimal alignment (like EMBOSS Water)
|
# Compare with local optimal alignment (like EMBOSS Water)
|
||||||
ali2 <- pairwiseAlignment(
|
ali2 <- pairwiseAlignment(
|
||||||
aaMBP1_SACCE,
|
aaMBP1_SACCE,
|
||||||
@ -117,8 +132,8 @@ ali2 <- pairwiseAlignment(
|
|||||||
gapExtension = 10)
|
gapExtension = 10)
|
||||||
|
|
||||||
writePairwiseAlignments(ali2) # This has probably only aligned the N-terminal
|
writePairwiseAlignments(ali2) # This has probably only aligned the N-terminal
|
||||||
# DNA binding domain - but that one has quite
|
# DNA binding domain - but that one has quite
|
||||||
# high sequence identity:
|
# high sequence identity:
|
||||||
percentID(ali2)
|
percentID(ali2)
|
||||||
|
|
||||||
# == TASK: ==
|
# == TASK: ==
|
||||||
@ -128,73 +143,49 @@ percentID(ali2)
|
|||||||
# the gap penalties and see what happens: how does the number of indels change,
|
# the gap penalties and see what happens: how does the number of indels change,
|
||||||
# how does the length of indels change...
|
# how does the length of indels change...
|
||||||
|
|
||||||
# Fine. Please return to the Wiki to study BLAST alignment...
|
|
||||||
|
|
||||||
|
# = 3 APSES Domain annotation by alignment ================================
|
||||||
# ==============================================================================
|
|
||||||
# PART FOUR: APSES Domain annotation by alignment
|
|
||||||
# ==============================================================================
|
|
||||||
|
|
||||||
# In this section we define the MYSPE APSES sequence by performing a global,
|
# In this section we define the MYSPE APSES sequence by performing a global,
|
||||||
# optimal sequence alignment of the yeast domain with the full length protein
|
# optimal sequence alignment of the yeast APSES domain with the full length
|
||||||
# sequence of the protein that was the most similar to the yeast APSES domain.
|
# protein sequence of the protein that was the most similar to the yeast APSES
|
||||||
|
# domain.
|
||||||
#
|
#
|
||||||
|
|
||||||
# I have annotated the yeast APSES domain as a proteinAnnotation in the
|
# I have annotated the yeast APSES domain as a feature in the
|
||||||
# database. To view the annotation, we can retrieve it via the proteinID and
|
# database. To view the annotation, we can retrieve it via the proteinID and
|
||||||
# featureID. Here is the yeast protein ID:
|
# featureID. Here is the yeast protein ID:
|
||||||
myDB$protein$ID[myDB$protein$name == "MBP1_SACCE"]
|
(proID <- myDB$protein$ID[myDB$protein$name == "MBP1_SACCE"])
|
||||||
|
|
||||||
# ... assign it for convenience:
|
|
||||||
proID <- myDB$protein$ID[myDB$protein$name == "MBP1_SACCE"]
|
|
||||||
|
|
||||||
# ... and if you look at the feature table, you can identify the feature ID
|
# ... and if you look at the feature table, you can identify the feature ID
|
||||||
myDB$feature[ , c("ID", "name", "description")]
|
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
|
||||||
myDB$feature$ID[myDB$feature$name == "APSES fold"]
|
|
||||||
|
|
||||||
# ... assign it for convenience:
|
# ... and with the two annotations we can get the corresponding ID from the
|
||||||
ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"]
|
|
||||||
|
|
||||||
# ... and with the two annotations we can pull the entry from the protein
|
|
||||||
# annotation table
|
# annotation table
|
||||||
|
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
|
||||||
|
myDB$annotation$featureID == ftrID])
|
||||||
|
|
||||||
myDB$proteinAnnotation[myDB$proteinAnnotation$protein.ID == proID &
|
myDB$proteinAnnotation[myDB$proteinAnnotation$protein.ID == proID &
|
||||||
myDB$proteinAnnotation$feature.ID == ftrID, ]
|
myDB$proteinAnnotation$feature.ID == ftrID, ]
|
||||||
|
|
||||||
myDB$proteinAnnotation$ID[myDB$proteinAnnotation$protein.ID == proID &
|
|
||||||
myDB$proteinAnnotation$feature.ID == ftrID]
|
|
||||||
|
|
||||||
# ... assign it for convenience:
|
|
||||||
fanID <- myDB$proteinAnnotation$ID[myDB$proteinAnnotation$protein.ID == proID &
|
|
||||||
myDB$proteinAnnotation$feature.ID == ftrID]
|
|
||||||
|
|
||||||
# The annotation record contains the start and end coordinates which we can use
|
# The annotation record contains the start and end coordinates which we can use
|
||||||
# to define the APSES domain sequence with a substr() expression.
|
# to define the APSES domain sequence with a substr() expression.
|
||||||
substr(myDB$protein$sequence[myDB$protein$ID == proID],
|
|
||||||
myDB$proteinAnnotation$start[myDB$proteinAnnotation$ID == fanID],
|
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
|
||||||
myDB$proteinAnnotation$end[myDB$proteinAnnotation$ID == fanID])
|
(end <- myDB$annotation$end[myDB$annotation$ID == fanID])
|
||||||
|
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
|
||||||
|
start,
|
||||||
|
end))
|
||||||
|
|
||||||
# Lots of code. But don't get lost. Let's recapitulate what we have done: we
|
# Lots of code. But don't get lost. Let's recapitulate what we have done: we
|
||||||
# have selected from the sequence column of the protein table the sequence whose
|
# have selected from the sequence column of the protein table the sequence whose
|
||||||
# name is "MBP1_SACCE", and selected from the proteinAnnotation table the start
|
# name is "MBP1_SACCE", and selected from the annotation table the start
|
||||||
# and end coordinates of the annotation that joins an "APSES fold" feature with
|
# and end coordinates of the annotation that joins an "APSES fold" feature with
|
||||||
# the sequence, and used the start and end coordinates to extract a substring.
|
# the sequence, and used the start and end coordinates to extract a substring.
|
||||||
# The expressions get lengthy, but it's not hard to wrap all of this into a
|
|
||||||
# function so that we only need to define name and feature.
|
|
||||||
|
|
||||||
dbGetFeatureSequence
|
|
||||||
dbGetFeatureSequence(myDB, "MBP1_SACCE", "APSES fold")
|
|
||||||
|
|
||||||
|
|
||||||
# Let's convert this to an AAstring and assign it:
|
# Let's convert this to an AAstring and assign it:
|
||||||
aaMB1_SACCE_APSES <- AAString(dbGetFeatureSequence(myDB,
|
aaMB1_SACCE_APSES <- AAString(apses)
|
||||||
"MBP1_SACCE",
|
|
||||||
"APSES fold"))
|
|
||||||
|
|
||||||
# To align, we need the MYSPE sequence. Here is it's definition again, just
|
|
||||||
# in case ...
|
|
||||||
|
|
||||||
sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
|
|
||||||
aaMBP1_MYSPE <- AAString(myDB$protein$sequence[sel])
|
|
||||||
|
|
||||||
# Now let's align these two sequences of very different length without end-gap
|
# Now let's align these two sequences of very different length without end-gap
|
||||||
# penalties using the "overlap" type. "overlap" turns the
|
# penalties using the "overlap" type. "overlap" turns the
|
||||||
@ -233,57 +224,57 @@ aliApses@subject@range@start
|
|||||||
# ... and end is:
|
# ... and end is:
|
||||||
aliApses@subject@range@start + aliApses@subject@range@width - 1
|
aliApses@subject@range@start + aliApses@subject@range@width - 1
|
||||||
|
|
||||||
# Since we have this section defined now, we can create a feature annotation
|
|
||||||
# right away and store it in myDB. Copy the code-template below to your
|
# = 4 Update your database script =========================================
|
||||||
# myCode.R file, edit it to replace the placeholder items with your data:
|
|
||||||
|
|
||||||
|
# Since we have this feature defined now, we can create a feature annotation
|
||||||
|
# right away and store it in myDB. Follow the following steps carefully:
|
||||||
#
|
#
|
||||||
# - The <PROTEIN ID> is to be replaced with the ID of MBP1_MYSPE
|
|
||||||
# - The <FEATURE ID> is to be replaced with the ID of "APSES fold"
|
|
||||||
# - <START> and <END> are to be replaced with the coordinates you got above
|
|
||||||
#
|
#
|
||||||
# Then execute the code and continue below the code template. If you make an
|
# - Make a copy of the file "./data/refAnnotations.json" in your project
|
||||||
# error, there are instructions on how to recover, below.
|
# directory and give it a new name that corresponds to MYSPE - e.g. if
|
||||||
|
# MYSPE is called "Crptycoccus neoformans", your file should be called
|
||||||
|
# "CRYNEAnnotations.json"; in that case "MBP1_CRYNE" would also be the
|
||||||
|
# "name" of your protein. Open the file in the RStudio editor and delete
|
||||||
|
# all annotations but one for an "APSES fold". Edit that annotation to
|
||||||
|
# correspond to the your MBP1_MYSPE protein and enter the start end end
|
||||||
|
# coordinates you have just discovered for the APSES domain in your
|
||||||
|
# sequence. Save your file.
|
||||||
#
|
#
|
||||||
# ===== begin code template: add a proteinAnnotation to the database =====
|
# - Validate your file online at https://jsonlint.com/
|
||||||
|
#
|
||||||
|
# - Next, you need to update your "makeProteinDB.R" script to load the
|
||||||
|
# annotation when you recreate the database. Open the script in the
|
||||||
|
# RStudio ediotr, and add the following command at the end:
|
||||||
|
#
|
||||||
|
# myDB <- dbAddAnnotation(myDB, fromJSON("<MYSPE>Annotations.json"))
|
||||||
|
#
|
||||||
|
# - save the file and source() it:
|
||||||
|
#
|
||||||
|
# source("makeProteinDB.R")
|
||||||
|
#
|
||||||
|
# This should run without errors or warnings. If it doesn't work and you
|
||||||
|
# can't figure out quickly what's happeneing, ask on the mailing list for
|
||||||
|
# help.
|
||||||
|
#
|
||||||
|
# - Confirm
|
||||||
|
# The following commands should retrieve the correct start and end
|
||||||
|
# coordinates for the MBP1_MYSPE APSES domain:
|
||||||
|
|
||||||
# == edit all placeholder items!
|
sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
|
||||||
myProteinID <- "<PROTEIN ID>"
|
aaMBP1_MYSPE <- AAString(myDB$protein$sequence[sel])
|
||||||
myFeatureID <- "<FEATURE ID>"
|
|
||||||
myStart <- <START>
|
|
||||||
myEnd <- <END>
|
|
||||||
|
|
||||||
# == create the proteinAnnotation entry
|
|
||||||
panRow <- data.frame(ID = dbAutoincrement(myDB$proteinAnnotation$ID),
|
|
||||||
protein.ID = myProteinID,
|
|
||||||
feature.ID = myFeatureID,
|
|
||||||
start = myStart,
|
|
||||||
end = myEnd,
|
|
||||||
stringsAsFactors = FALSE)
|
|
||||||
myDB$proteinAnnotation <- rbind(myDB$proteinAnnotation, panRow)
|
|
||||||
|
|
||||||
# == check that this was successful and has the right data
|
|
||||||
myDB$proteinAnnotation[nrow(myDB$proteinAnnotation), ]
|
|
||||||
|
|
||||||
# ===== end code template ===========================================
|
|
||||||
|
|
||||||
# ... continue here.
|
|
||||||
# I expect that a correct result would look something like
|
|
||||||
# ID protein.ID feature.ID start end
|
|
||||||
# 63 my_fan_1 my_pro_1 ref_ftr_1 6 104
|
|
||||||
|
|
||||||
# If you made a mistake, simply overwrite the current version of myDB by loading
|
|
||||||
# your saved, good version: load("myDB.01.RData") and correct your mistake
|
|
||||||
|
|
||||||
# If this is correct, save it
|
|
||||||
save(myDB, file = "myDB.02.RData") # Note that it gets a new version number!
|
|
||||||
|
|
||||||
# Done with this part. Copy the sequence of the APSES domain of MBP1_MYSPE - you
|
|
||||||
# need it for the reverse BLAST search, and return to the course Wiki.
|
|
||||||
|
|
||||||
|
|
||||||
|
(proID <- myDB$protein$ID[myDB$protein$name == "MBP1_<MYSSPE>"]) # <<< EDIT
|
||||||
# = 1 Tasks
|
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
|
||||||
|
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
|
||||||
|
myDB$annotation$featureID == ftrID])
|
||||||
|
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
|
||||||
|
(end <- myDB$annotation$end[myDB$annotation$ID == fanID])
|
||||||
|
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
|
||||||
|
start,
|
||||||
|
end))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user