Changing "YFO" to "MYSPE"
This commit is contained in:
parent
a83f2166a5
commit
9ac45565f4
9
.init.R
9
.init.R
@ -24,6 +24,15 @@ if (! file.exists(".myProfile.R")) {
|
|||||||
rm(e, n, conn)
|
rm(e, n, conn)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Patch YFO -> MYSPE if necessary:
|
||||||
|
tmp <- readLines(".myProfile.R")
|
||||||
|
if (length(grep("^YFO", tmp)) > 0) {
|
||||||
|
idx <- grep("^YFO", tmp)
|
||||||
|
tmp[idx] <- gsub("^YFO", "MYSPE", tmp[idx])
|
||||||
|
writeLines(tmp, ".myProfile.R")
|
||||||
|
}
|
||||||
|
rm(tmp)
|
||||||
|
|
||||||
source(".myProfile.R")
|
source(".myProfile.R")
|
||||||
|
|
||||||
source(".utilities.R")
|
source(".utilities.R")
|
||||||
|
@ -1,9 +1,9 @@
|
|||||||
# ABC_makeYFOlist.R
|
# ABC_makeMYSPElist.R
|
||||||
#
|
#
|
||||||
# Purpose: Create a list of genome sequenced fungi with protein annotations and
|
# Purpose: Create a list of genome sequenced fungi with protein annotations and
|
||||||
# Mbp1 homologues.
|
# Mbp1 homologues.
|
||||||
#
|
#
|
||||||
# Version: 1.1
|
# Version: 1.1.1
|
||||||
#
|
#
|
||||||
# Date: 2016 09 - 2017 09
|
# Date: 2016 09 - 2017 09
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
@ -53,8 +53,8 @@
|
|||||||
|
|
||||||
# = 1 The strategy ========================================================
|
# = 1 The strategy ========================================================
|
||||||
|
|
||||||
# This script will create a list of "YFO" species and save it in an R object
|
# This script will create a list of "MYSPE" species and save it in an R object
|
||||||
# YFOspecies that is stored in the data subdirectory of this project from where
|
# MYSPEspecies that is stored in the data subdirectory of this project from where
|
||||||
# it can be loaded. The strategy is as follows: we download a list of all
|
# it can be loaded. The strategy is as follows: we download a list of all
|
||||||
# genome projects and then select species for which protein annotations are
|
# genome projects and then select species for which protein annotations are
|
||||||
# available - i.e. these are all genome-sequenced species that have been
|
# available - i.e. these are all genome-sequenced species that have been
|
||||||
@ -251,7 +251,7 @@ length(BLASTspecies)
|
|||||||
# etc. See here:
|
# etc. See here:
|
||||||
?union
|
?union
|
||||||
|
|
||||||
YFOspecies <- intersect(GOLDspecies, BLASTspecies)
|
MYSPEspecies <- intersect(GOLDspecies, BLASTspecies)
|
||||||
|
|
||||||
# Again: interpret this:
|
# Again: interpret this:
|
||||||
# - what is the number of GOLDspecies?
|
# - what is the number of GOLDspecies?
|
||||||
@ -272,9 +272,9 @@ YFOspecies <- intersect(GOLDspecies, BLASTspecies)
|
|||||||
|
|
||||||
REFspecies
|
REFspecies
|
||||||
|
|
||||||
YFOspecies <- sort(setdiff(YFOspecies, REFspecies))
|
MYSPEspecies <- sort(setdiff(MYSPEspecies, REFspecies))
|
||||||
|
|
||||||
# save(YFOspecies, file = "data/YFOspecies.RData")
|
# save(MYSPEspecies, file = "data/MYSPEspecies.RData")
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -46,31 +46,31 @@ data(BLOSUM62)
|
|||||||
sel <- myDB$protein$name == "MBP1_SACCE"
|
sel <- myDB$protein$name == "MBP1_SACCE"
|
||||||
MBP1_SACCE <- s2c(myDB$protein$sequence[sel])
|
MBP1_SACCE <- s2c(myDB$protein$sequence[sel])
|
||||||
|
|
||||||
sel <- myDB$protein$name == paste("MBP1_", biCode(YFO), sep = "")
|
sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
|
||||||
MBP1_YFO <- s2c(myDB$protein$sequence[sel])
|
MBP1_MYSPE <- s2c(myDB$protein$sequence[sel])
|
||||||
|
|
||||||
# Check that we have two character vectors of the expected length.
|
# Check that we have two character vectors of the expected length.
|
||||||
str(MBP1_SACCE)
|
str(MBP1_SACCE)
|
||||||
str(MBP1_YFO)
|
str(MBP1_MYSPE)
|
||||||
|
|
||||||
# How do we get the pairscore values? Consider: a single pair of amino acids can
|
# How do we get the pairscore values? Consider: a single pair of amino acids can
|
||||||
# be obtained from sequence SACCE and YFO eg. from position 13 and 21 ...
|
# be obtained from sequence SACCE and MYSPE eg. from position 13 and 21 ...
|
||||||
MBP1_SACCE[13]
|
MBP1_SACCE[13]
|
||||||
MBP1_YFO[21]
|
MBP1_MYSPE[21]
|
||||||
|
|
||||||
# ... using these as subsetting expressions, we can pull the pairscore
|
# ... using these as subsetting expressions, we can pull the pairscore
|
||||||
# from the MDM
|
# from the MDM
|
||||||
BLOSUM62[MBP1_SACCE[13], MBP1_YFO[21]]
|
BLOSUM62[MBP1_SACCE[13], MBP1_MYSPE[21]]
|
||||||
|
|
||||||
# First we build an empty matrix that will hold all pairscores ...
|
# First we build an empty matrix that will hold all pairscores ...
|
||||||
dotMat <- matrix(numeric(length(MBP1_SACCE) * length(MBP1_YFO)),
|
dotMat <- matrix(numeric(length(MBP1_SACCE) * length(MBP1_MYSPE)),
|
||||||
nrow = length(MBP1_SACCE), ncol = length(MBP1_YFO))
|
nrow = length(MBP1_SACCE), ncol = length(MBP1_MYSPE))
|
||||||
|
|
||||||
# ... then we loop over the sequences and store the scores in the matrix.
|
# ... then we loop over the sequences and store the scores in the matrix.
|
||||||
#
|
#
|
||||||
for (i in 1:length(MBP1_SACCE)) {
|
for (i in 1:length(MBP1_SACCE)) {
|
||||||
for (j in 1:length(MBP1_YFO)) {
|
for (j in 1:length(MBP1_MYSPE)) {
|
||||||
dotMat[i, j] <- BLOSUM62[MBP1_SACCE[i], MBP1_YFO[j]]
|
dotMat[i, j] <- BLOSUM62[MBP1_SACCE[i], MBP1_MYSPE[j]]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -80,7 +80,7 @@ for (i in 1:length(MBP1_SACCE)) {
|
|||||||
dotMat[1:10, 1:10]
|
dotMat[1:10, 1:10]
|
||||||
|
|
||||||
# Rows in this matrix correspond to an amino acid from MBP1_SACCE, columns in
|
# Rows in this matrix correspond to an amino acid from MBP1_SACCE, columns in
|
||||||
# the matrix correspond to an amino acid from MBP1_YFO.
|
# the matrix correspond to an amino acid from MBP1_MYSPE.
|
||||||
|
|
||||||
# To plot this, we use the image() function. Here, with default parameters.
|
# To plot this, we use the image() function. Here, with default parameters.
|
||||||
|
|
||||||
@ -110,13 +110,13 @@ image(x = 1:200, y = 1:200, dotMat[1:200, 1:200], ylim=c(200,1))
|
|||||||
|
|
||||||
# ... and labels! Axis labels would be nice ...
|
# ... and labels! Axis labels would be nice ...
|
||||||
image(x = 1:200, y = 1:200, dotMat[1:200, 1:200], ylim=c(200,1),
|
image(x = 1:200, y = 1:200, dotMat[1:200, 1:200], ylim=c(200,1),
|
||||||
xlab = "MBP1_YFO", ylab = "MBP1_SACCE" )
|
xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE" )
|
||||||
|
|
||||||
# ... and why don't we have axis-numbers on all four sides? Go, make that right
|
# ... and why don't we have axis-numbers on all four sides? Go, make that right
|
||||||
# too ...
|
# too ...
|
||||||
len <- 200
|
len <- 200
|
||||||
image(x = 1:len, y = 1:len, dotMat[1:len, 1:len], ylim=c(len,1),
|
image(x = 1:len, y = 1:len, dotMat[1:len, 1:len], ylim=c(len,1),
|
||||||
xlab = "MBP1_YFO", ylab = "MBP1_SACCE", axes = FALSE)
|
xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE", axes = FALSE)
|
||||||
box()
|
box()
|
||||||
axis(1, at = c(1, seq(10, len, by=10)))
|
axis(1, at = c(1, seq(10, len, by=10)))
|
||||||
axis(2, at = c(1, seq(10, len, by=10)))
|
axis(2, at = c(1, seq(10, len, by=10)))
|
||||||
@ -129,8 +129,8 @@ axis(4, at = c(1, seq(10, len, by=10)))
|
|||||||
# utilities file and called it dotPlot2(). Why not dotPlot() ... that's because
|
# utilities file and called it dotPlot2(). Why not dotPlot() ... that's because
|
||||||
# there already is a dotplot function in the seqinr package:
|
# there already is a dotplot function in the seqinr package:
|
||||||
|
|
||||||
dotPlot(MBP1_SACCE, MBP1_YFO) # seqinr
|
dotPlot(MBP1_SACCE, MBP1_MYSPE) # seqinr
|
||||||
dotPlot2(MBP1_SACCE, MBP1_YFO, xlab = "SACCE", ylab = "YFO") # Ours
|
dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE") # Ours
|
||||||
|
|
||||||
# Which one do you prefer? You can probably see the block patterns that arise
|
# Which one do you prefer? You can probably see the block patterns that arise
|
||||||
# from segments of repetitive, low complexity sequence. But you probably have to
|
# from segments of repetitive, low complexity sequence. But you probably have to
|
||||||
@ -153,7 +153,7 @@ myFilter[5, ] <- c( 0, 0, 0, 0, 1)
|
|||||||
|
|
||||||
# I have added the option to read such filters (or others that you could define on your own) as a parameter of the function.
|
# I have added the option to read such filters (or others that you could define on your own) as a parameter of the function.
|
||||||
|
|
||||||
dotPlot2(MBP1_SACCE, MBP1_YFO, xlab = "SACCE", ylab = "YFO", f = myFilter)
|
dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE", f = myFilter)
|
||||||
|
|
||||||
# I think the result shows quite nicely how the two sequences are globally
|
# I think the result shows quite nicely how the two sequences are globally
|
||||||
# related and where the regions of sequence similarity are. Play with this a bit
|
# related and where the regions of sequence similarity are. Play with this a bit
|
||||||
|
@ -52,8 +52,8 @@ toString(s) # using the Biostrings function toString()
|
|||||||
sel <- myDB$protein$name == "MBP1_SACCE"
|
sel <- myDB$protein$name == "MBP1_SACCE"
|
||||||
aaMBP1_SACCE <- AAString(myDB$protein$sequence[sel])
|
aaMBP1_SACCE <- AAString(myDB$protein$sequence[sel])
|
||||||
|
|
||||||
sel <- myDB$protein$name == paste("MBP1_", biCode(YFO), sep = "")
|
sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
|
||||||
aaMBP1_YFO <- AAString(myDB$protein$sequence[sel])
|
aaMBP1_MYSPE <- AAString(myDB$protein$sequence[sel])
|
||||||
|
|
||||||
?pairwiseAlignment
|
?pairwiseAlignment
|
||||||
|
|
||||||
@ -61,7 +61,7 @@ aaMBP1_YFO <- AAString(myDB$protein$sequence[sel])
|
|||||||
# Global optimal alignment with end-gap penalties is default. (like EMBOSS needle)
|
# Global optimal alignment with end-gap penalties is default. (like EMBOSS needle)
|
||||||
ali1 <- pairwiseAlignment(
|
ali1 <- pairwiseAlignment(
|
||||||
aaMBP1_SACCE,
|
aaMBP1_SACCE,
|
||||||
aaMBP1_YFO,
|
aaMBP1_MYSPE,
|
||||||
substitutionMatrix = "BLOSUM62",
|
substitutionMatrix = "BLOSUM62",
|
||||||
gapOpening = 10,
|
gapOpening = 10,
|
||||||
gapExtension = 0.5)
|
gapExtension = 0.5)
|
||||||
@ -110,7 +110,7 @@ percentID(ali1)
|
|||||||
# Compare with local optimal alignment (like EMBOSS Water)
|
# Compare with local optimal alignment (like EMBOSS Water)
|
||||||
ali2 <- pairwiseAlignment(
|
ali2 <- pairwiseAlignment(
|
||||||
aaMBP1_SACCE,
|
aaMBP1_SACCE,
|
||||||
aaMBP1_YFO,
|
aaMBP1_MYSPE,
|
||||||
type = "local",
|
type = "local",
|
||||||
substitutionMatrix = "BLOSUM62",
|
substitutionMatrix = "BLOSUM62",
|
||||||
gapOpening = 50,
|
gapOpening = 50,
|
||||||
@ -135,7 +135,7 @@ percentID(ali2)
|
|||||||
# PART FOUR: APSES Domain annotation by alignment
|
# PART FOUR: APSES Domain annotation by alignment
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
|
|
||||||
# In this section we define the YFO APSES sequence by performing a global,
|
# In this section we define the MYSPE APSES sequence by performing a global,
|
||||||
# optimal sequence alignment of the yeast domain with the full length protein
|
# optimal sequence alignment of the yeast domain with the full length protein
|
||||||
# sequence of the protein that was the most similar to the yeast APSES domain.
|
# sequence of the protein that was the most similar to the yeast APSES domain.
|
||||||
#
|
#
|
||||||
@ -190,11 +190,11 @@ aaMB1_SACCE_APSES <- AAString(dbGetFeatureSequence(myDB,
|
|||||||
"MBP1_SACCE",
|
"MBP1_SACCE",
|
||||||
"APSES fold"))
|
"APSES fold"))
|
||||||
|
|
||||||
# To align, we need the YFO sequence. Here is its definition again, just
|
# To align, we need the MYSPE sequence. Here is its definition again, just
|
||||||
# in case ...
|
# in case ...
|
||||||
|
|
||||||
sel <- myDB$protein$name == paste("MBP1_", biCode(YFO), sep = "")
|
sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
|
||||||
aaMBP1_YFO <- AAString(myDB$protein$sequence[sel])
|
aaMBP1_MYSPE <- AAString(myDB$protein$sequence[sel])
|
||||||
|
|
||||||
# Now let's align these two sequences of very different length without end-gap
|
# Now let's align these two sequences of very different length without end-gap
|
||||||
# penalties using the "overlap" type. "overlap" turns the
|
# penalties using the "overlap" type. "overlap" turns the
|
||||||
@ -203,7 +203,7 @@ aaMBP1_YFO <- AAString(myDB$protein$sequence[sel])
|
|||||||
|
|
||||||
aliApses <- pairwiseAlignment(
|
aliApses <- pairwiseAlignment(
|
||||||
aaMB1_SACCE_APSES,
|
aaMB1_SACCE_APSES,
|
||||||
aaMBP1_YFO,
|
aaMBP1_MYSPE,
|
||||||
type = "overlap",
|
type = "overlap",
|
||||||
substitutionMatrix = "BLOSUM62",
|
substitutionMatrix = "BLOSUM62",
|
||||||
gapOpening = 10,
|
gapOpening = 10,
|
||||||
@ -237,7 +237,7 @@ aliApses@subject@range@start + aliApses@subject@range@width - 1
|
|||||||
# right away and store it in myDB. Copy the code-template below to your
|
# right away and store it in myDB. Copy the code-template below to your
|
||||||
# myCode.R file, edit it to replace the placeholder items with your data:
|
# myCode.R file, edit it to replace the placeholder items with your data:
|
||||||
#
|
#
|
||||||
# - The <PROTEIN ID> is to be replaced with the ID of MBP1_YFO
|
# - The <PROTEIN ID> is to be replaced with the ID of MBP1_MYSPE
|
||||||
# - The <FEATURE ID> is to be replaced with the ID of "APSES fold"
|
# - The <FEATURE ID> is to be replaced with the ID of "APSES fold"
|
||||||
# - <START> and <END> are to be replaced with the coordinates you got above
|
# - <START> and <END> are to be replaced with the coordinates you got above
|
||||||
#
|
#
|
||||||
@ -277,7 +277,7 @@ myDB$proteinAnnotation[nrow(myDB$proteinAnnotation), ]
|
|||||||
# If this is correct, save it
|
# If this is correct, save it
|
||||||
save(myDB, file = "myDB.02.RData") # Note that it gets a new version number!
|
save(myDB, file = "myDB.02.RData") # Note that it gets a new version number!
|
||||||
|
|
||||||
# Done with this part. Copy the sequence of the APSES domain of MBP1_<YFO> - you
|
# Done with this part. Copy the sequence of the APSES domain of MBP1_MYSPE - you
|
||||||
# need it for the reverse BLAST search, and return to the course Wiki.
|
# need it for the reverse BLAST search, and return to the course Wiki.
|
||||||
|
|
||||||
|
|
||||||
|
@ -43,7 +43,7 @@ save(myDB, file = "myDB.04.RData") # save the new version
|
|||||||
# from your myCode.R script. Here is again the table of feature IDs:
|
# from your myCode.R script. Here is again the table of feature IDs:
|
||||||
myDB$feature[ , c("ID", "name", "description")]
|
myDB$feature[ , c("ID", "name", "description")]
|
||||||
|
|
||||||
# Add every SMART annotated feature for MBP1_YFO to the database. If you make
|
# Add every SMART annotated feature for MBP1_MYSPE to the database. If you make
|
||||||
# mistakes, just reload the latest version (probably "myDB.04.RData"), then run
|
# mistakes, just reload the latest version (probably "myDB.04.RData"), then run
|
||||||
# your corrected annotation script again. Execute ...
|
# your corrected annotation script again. Execute ...
|
||||||
myDB$proteinAnnotation
|
myDB$proteinAnnotation
|
||||||
|
@ -1,15 +1,15 @@
|
|||||||
# BIN-YFO.R
|
# BIN-MYSPE.R
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-YFO unit
|
# R code accompanying the BIN-MYSPE unit
|
||||||
#
|
#
|
||||||
# Version: 1.0
|
# Version: 1.0
|
||||||
#
|
#
|
||||||
# Date: 2017 09 21
|
# Date: 2017 09 21
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# V 1.0 Final code, after rewriting BLAST parser and creating current YFOlist
|
# V 1.0 Final code, after rewriting BLAST parser and creating current MYSPElist
|
||||||
# V 0.1 First code copied from BCH441_A03_makeYFOlist.R
|
# V 0.1 First code copied from BCH441_A03_makeMYSPElist.R
|
||||||
#
|
#
|
||||||
# TODO:
|
# TODO:
|
||||||
#
|
#
|
||||||
@ -29,8 +29,8 @@
|
|||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ---------------------------------------
|
#TOC> ---------------------------------------
|
||||||
#TOC> 1 Preparations 38
|
#TOC> 1 Preparations 38
|
||||||
#TOC> 2 Suitable YFO Species 50
|
#TOC> 2 Suitable MYSPE Species 50
|
||||||
#TOC> 3 Adopt "YFO" 64
|
#TOC> 3 Adopt "MYSPE" 64
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -47,39 +47,39 @@ if (! exists("myStudentNumber")) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# = 2 Suitable YFO Species ================================================
|
# = 2 Suitable MYSPE Species ==============================================
|
||||||
|
|
||||||
|
|
||||||
# In this unit we will select one species from a list of genome sequenced fungi
|
# In this unit we will select one species from a list of genome sequenced fungi
|
||||||
# and write it into your personalized profile file. This species will be called
|
# and write it into your personalized profile file. This species will be called
|
||||||
# "YFO" (Your Favourite Organism) for other learning units and exercises.
|
# "MYSPE" (My Species) for other learning units and exercises.
|
||||||
|
|
||||||
# A detailed description of the process of compiling the list of genome
|
# A detailed description of the process of compiling the list of genome
|
||||||
# sequenced fungi with protein annotations and Mbp1 homologues is in the file
|
# sequenced fungi with protein annotations and Mbp1 homologues is in the file
|
||||||
# ABC-makeYFOlist.R
|
# ABC-makeMYSPElist.R
|
||||||
|
|
||||||
# Task: Study ABC-makeYFOlist.R, it implements a rather typical workflow of
|
# Task: Study ABC-makeMYSPElist.R, it implements a rather typical workflow of
|
||||||
# selecting and combining data from various public-domain data resources.
|
# selecting and combining data from various public-domain data resources.
|
||||||
|
|
||||||
# = 3 Adopt "YFO" =========================================================
|
# = 3 Adopt "MYSPE" =======================================================
|
||||||
|
|
||||||
|
|
||||||
# In the code below, we load the resulting vector of species name, then pick one
|
# In the code below, we load the resulting vector of species name, then pick one
|
||||||
# of them in a random but reproducible way, determined by your student number.
|
# of them in a random but reproducible way, determined by your student number.
|
||||||
|
|
||||||
load("data/YFOspecies.RData") # load the species names
|
load("data/MYSPEspecies.RData") # load the species names
|
||||||
set.seed(myStudentNumber) # seed the random number generator
|
set.seed(myStudentNumber) # seed the random number generator
|
||||||
YFO <- sample(YFOspecies, 1) # pick a species at random
|
MYSPE <- sample(MYSPEspecies, 1) # pick a species at random
|
||||||
# write the result to your personalized profile data so we can use the result in
|
# write the result to your personalized profile data so we can use the result in
|
||||||
# other functions
|
# other functions
|
||||||
cat(sprintf("YFO <- \"%s\"\n", YFO), file = ".myProfile.R", append = TRUE)
|
cat(sprintf("MYSPE <- \"%s\"\n", MYSPE), file = ".myProfile.R", append = TRUE)
|
||||||
|
|
||||||
YFO # so, which species is it ... ?
|
MYSPE # so, which species is it ... ?
|
||||||
biCode(YFO) # and what is its "BiCode" ... ?
|
biCode(MYSPE) # and what is its "BiCode" ... ?
|
||||||
|
|
||||||
# Task: Note down the species name and its five letter label on your Student
|
# Task: Note down the species name and its five letter label on your Student
|
||||||
# Wiki user page. Use this species whenever this or future assignments refer
|
# Wiki user page. Use this species whenever this or future assignments refer
|
||||||
# to YFO. In code, we will automatically load it from your .myProfile.R file.
|
# to MYSPE. In code, we will automatically load it from your .myProfile.R file.
|
||||||
|
|
||||||
|
|
||||||
# [END]
|
# [END]
|
@ -41,7 +41,7 @@ list.files(pattern = "myDB.*")
|
|||||||
load("myDB.05.RData")
|
load("myDB.05.RData")
|
||||||
|
|
||||||
# The database contains the ten Mbp1 orthologues from the reference species
|
# The database contains the ten Mbp1 orthologues from the reference species
|
||||||
# and the Mbp1 RBM for YFO.
|
# and the Mbp1 RBM for MYSPE.
|
||||||
#
|
#
|
||||||
# We will construct a phylogenetic tree from the proteins' APSES domains.
|
# We will construct a phylogenetic tree from the proteins' APSES domains.
|
||||||
# You have annotated their ranges as a feature.
|
# You have annotated their ranges as a feature.
|
||||||
|
@ -156,7 +156,7 @@ layout(matrix(1), widths=1.0, heights=1.0)
|
|||||||
# ... or we can plot the tree so it corresponds as well as possible to a
|
# ... or we can plot the tree so it corresponds as well as possible to a
|
||||||
# predefined tip ordering. Here we use the ordering that NCBI Global Tree
|
# predefined tip ordering. Here we use the ordering that NCBI Global Tree
|
||||||
# returns for the reference species - we have used it above to make the vector
|
# returns for the reference species - we have used it above to make the vector
|
||||||
# apsMbp1Names. You inserted your YFO name into that vector - but you should
|
# apsMbp1Names. You inserted your MYSPE name into that vector - but you should
|
||||||
# move it to its correct position in the cladogram.
|
# move it to its correct position in the cladogram.
|
||||||
|
|
||||||
# (Nb. we need to reverse the ordering for the plot. This is why we use the
|
# (Nb. we need to reverse the ordering for the plot. This is why we use the
|
||||||
|
@ -39,7 +39,7 @@ help(package = seqinr) # shows the available functions
|
|||||||
?computePI
|
?computePI
|
||||||
|
|
||||||
# This takes as input a vector of upper-case AA codes
|
# This takes as input a vector of upper-case AA codes
|
||||||
# Let's retrieve the YFO sequence from our datamodel
|
# Let's retrieve the MYSPE sequence from our datamodel
|
||||||
# (assuming it is the last one that was added):
|
# (assuming it is the last one that was added):
|
||||||
|
|
||||||
db$protein[nrow(db$protein), "sequence"]
|
db$protein[nrow(db$protein), "sequence"]
|
||||||
|
@ -468,8 +468,8 @@ myDB$taxonomy$species[sel]
|
|||||||
# = 3 Add your own data ===================================================
|
# = 3 Add your own data ===================================================
|
||||||
|
|
||||||
|
|
||||||
# You have chosen an organism as "YFO", and your final task will be to find the
|
# You have chosen an organism as "MYSPE", and your final task will be to find the
|
||||||
# protein in YFO that is most similar to yeast Mbp1 and enter its information
|
# protein in MYSPE that is most similar to yeast Mbp1 and enter its information
|
||||||
# into the database.
|
# into the database.
|
||||||
|
|
||||||
|
|
||||||
@ -483,7 +483,7 @@ myDB$taxonomy$species[sel]
|
|||||||
# Protein BLAST.
|
# Protein BLAST.
|
||||||
# - Enter NP_010227 into the "Query Sequence" field.
|
# - Enter NP_010227 into the "Query Sequence" field.
|
||||||
# - Choose "Reference proteins (refseq_protein)" as the "Database".
|
# - Choose "Reference proteins (refseq_protein)" as the "Database".
|
||||||
# - Paste the YFO species name into the "Organism" field.
|
# - Paste the MYSPE species name into the "Organism" field.
|
||||||
#
|
#
|
||||||
# - Click "BLAST".
|
# - Click "BLAST".
|
||||||
|
|
||||||
@ -493,28 +493,28 @@ myDB$taxonomy$species[sel]
|
|||||||
|
|
||||||
# Otherwise, look for the top-hit in the "Alignments" section. In some cases
|
# Otherwise, look for the top-hit in the "Alignments" section. In some cases
|
||||||
# there will be more than one hit with nearly similar E-values. If this is the
|
# there will be more than one hit with nearly similar E-values. If this is the
|
||||||
# case for YFO, choose the one with the higher degree of similarity (more
|
# case for MYSPE, choose the one with the higher degree of similarity (more
|
||||||
# identities) with the N-terminus of the query - i.e. the Query sequence of
|
# identities) with the N-terminus of the query - i.e. the Query sequence of
|
||||||
# the first ~ 100 amino acids.
|
# the first ~ 100 amino acids.
|
||||||
|
|
||||||
# - Follow the link to the protein data page, linked from "Sequence ID".
|
# - Follow the link to the protein data page, linked from "Sequence ID".
|
||||||
# - From there, in a separate tab, open the link to the taxonomy database page
|
# - From there, in a separate tab, open the link to the taxonomy database page
|
||||||
# for YFO which is linked from the "ORGANISM" record.
|
# for MYSPE which is linked from the "ORGANISM" record.
|
||||||
|
|
||||||
|
|
||||||
# == 3.2 Put the information into JSON files ===============================
|
# == 3.2 Put the information into JSON files ===============================
|
||||||
|
|
||||||
|
|
||||||
# - Next make a copy of the file "./data/MBP1_SACCE.json" in your project
|
# - Next make a copy of the file "./data/MBP1_SACCE.json" in your project
|
||||||
# directory and give it a new name that corresponds to YFO - e.g. if
|
# directory and give it a new name that corresponds to MYSPE - e.g. if
|
||||||
# YFO is called "Cryptococcus neoformans", your file should be called
|
# MYSPE is called "Cryptococcus neoformans", your file should be called
|
||||||
# "MBP1_CRYNE.json"; in that case "MBP1_CRYNE" would also be the
|
# "MBP1_CRYNE.json"; in that case "MBP1_CRYNE" would also be the
|
||||||
# "name" of your protein. Open the file in the RStudio editor and replace
|
# "name" of your protein. Open the file in the RStudio editor and replace
|
||||||
# all of the MBP1_SACCE data with the corresponding data of your protein.
|
# all of the MBP1_SACCE data with the corresponding data of your protein.
|
||||||
#
|
#
|
||||||
# - Do a similar thing for the YFO taxonomy entry. Copy
|
# - Do a similar thing for the MYSPE taxonomy entry. Copy
|
||||||
# "./data/refTaxonomy.json" and make a new file named "YFOtaxonomy.json".
|
# "./data/refTaxonomy.json" and make a new file named "MYSPEtaxonomy.json".
|
||||||
# Create a valid JSON file with only one single entry - that of YFO.
|
# Create a valid JSON file with only one single entry - that of MYSPE.
|
||||||
#
|
#
|
||||||
# - Validate your two files online at https://jsonlint.com/
|
# - Validate your two files online at https://jsonlint.com/
|
||||||
|
|
||||||
@ -529,7 +529,7 @@ myDB$taxonomy$species[sel]
|
|||||||
# - then add the two commands that add your protein and taxonomy data,
|
# - then add the two commands that add your protein and taxonomy data,
|
||||||
# they should look like:
|
# they should look like:
|
||||||
# myDB <- dbAddProtein( myDB, fromJSON("MBP1_<code>.json"))
|
# myDB <- dbAddProtein( myDB, fromJSON("MBP1_<code>.json"))
|
||||||
# myDB <- dbAddTaxonomy( myDB, fromJSON("YFOtaxonomy.json"))
|
# myDB <- dbAddTaxonomy( myDB, fromJSON("MYSPEtaxonomy.json"))
|
||||||
#
|
#
|
||||||
# - save the file and source() it:
|
# - save the file and source() it:
|
||||||
# source("makeProteinDB.R")
|
# source("makeProteinDB.R")
|
||||||
@ -542,9 +542,9 @@ myDB$taxonomy$species[sel]
|
|||||||
# === 3.3.1 Check and validate
|
# === 3.3.1 Check and validate
|
||||||
|
|
||||||
|
|
||||||
# Is your protein named according to the pattern "MBP1_<YFO>"? It should be.
|
# Is your protein named according to the pattern "MBP1_MYSPE"? It should be.
|
||||||
# And does the taxonomy table contain the systematic name? It should be the same
|
# And does the taxonomy table contain the systematic name? It should be the same
|
||||||
# that you get when you type YFO into the console.
|
# that you get when you type MYSPE into the console.
|
||||||
|
|
||||||
# Let's compute sequence lengths on the fly (with the function nchar() ), and
|
# Let's compute sequence lengths on the fly (with the function nchar() ), and
|
||||||
# open this with the table viewer function View()
|
# open this with the table viewer function View()
|
||||||
@ -562,18 +562,18 @@ View(cbind(myDB$protein[ , c("ID", "name", "RefSeqID")],
|
|||||||
myDB$protein$sequence[nrow(myDB$protein)]
|
myDB$protein$sequence[nrow(myDB$protein)]
|
||||||
|
|
||||||
# If not, don't continue! Fix the problem first.
|
# If not, don't continue! Fix the problem first.
|
||||||
# Let me repeat: If this does not give you the right sequence of the YFO
|
# Let me repeat: If this does not give you the right sequence of the MYSPE
|
||||||
# Mbp1 homologue, DO NOT CONTINUE. Fix the problem.
|
# Mbp1 homologue, DO NOT CONTINUE. Fix the problem.
|
||||||
|
|
||||||
# Is that the right taxonomy ID and binomial name for YFO?
|
# Is that the right taxonomy ID and binomial name for MYSPE?
|
||||||
sel <- myDB$taxonomy$species == YFO
|
sel <- myDB$taxonomy$species == MYSPE
|
||||||
myDB$taxonomy[sel, ]
|
myDB$taxonomy[sel, ]
|
||||||
|
|
||||||
# If not, or if the result was "<0 rows> ... " then DO NOT CONTINUE.
|
# If not, or if the result was "<0 rows> ... " then DO NOT CONTINUE.
|
||||||
# Fix the problem first.
|
# Fix the problem first.
|
||||||
|
|
||||||
# Does this give you the right refseq ID for MBP1_<YFO>?
|
# Does this give you the right refseq ID for MBP1_MYSPE?
|
||||||
sel <- myDB$protein$name == paste0("MBP1_", biCode(YFO))
|
sel <- myDB$protein$name == paste0("MBP1_", biCode(MYSPE))
|
||||||
myDB$protein$RefSeqID[sel]
|
myDB$protein$RefSeqID[sel]
|
||||||
|
|
||||||
# If not, or if the result was "<0 rows> ... " then DO NOT CONTINUE.
|
# If not, or if the result was "<0 rows> ... " then DO NOT CONTINUE.
|
||||||
@ -589,8 +589,8 @@ myDB$protein$RefSeqID[sel]
|
|||||||
# page on the Student Wiki
|
# page on the Student Wiki
|
||||||
# - Execute the two commands below and show the result on your submission page
|
# - Execute the two commands below and show the result on your submission page
|
||||||
|
|
||||||
biCode(myDB$taxonomy$species) %in% biCode(YFO)
|
biCode(myDB$taxonomy$species) %in% biCode(MYSPE)
|
||||||
myDB$protein$taxonomyID %in% myDB$taxonomy$ID[(myDB$taxonomy$species == YFO)]
|
myDB$protein$taxonomyID %in% myDB$taxonomy$ID[(myDB$taxonomy$species == MYSPE)]
|
||||||
|
|
||||||
# That is all.
|
# That is all.
|
||||||
|
|
||||||
|
BIN
data/MYSPEspecies.RData
Normal file
BIN
data/MYSPEspecies.RData
Normal file
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue
Block a user