Changing "YFO" to "MYSPE"

This commit is contained in:
hyginn 2017-10-03 23:38:48 -04:00
parent a83f2166a5
commit 9ac45565f4
12 changed files with 102 additions and 93 deletions

View File

@ -24,6 +24,15 @@ if (! file.exists(".myProfile.R")) {
rm(e, n, conn) rm(e, n, conn)
} }
# Patch YFO -> MYSPE if necessary:
tmp <- readLines(".myProfile.R")
if (length(grep("^YFO", tmp)) > 0) {
idx <- grep("^YFO", tmp)
tmp[idx] <- gsub("^YFO", "MYSPE", tmp[idx])
writeLines(tmp, ".myProfile.R")
}
rm(tmp)
source(".myProfile.R") source(".myProfile.R")
source(".utilities.R") source(".utilities.R")

View File

@ -1,9 +1,9 @@
# ABC_makeYFOlist.R # ABC_makeMYSPElist.R
# #
# Purpose: Create a list of genome sequenced fungi with protein annotations and # Purpose: Create a list of genome sequenced fungi with protein annotations and
# Mbp1 homologues. # Mbp1 homologues.
# #
# Version: 1.1 # Version: 1.1.1
# #
# Date: 2016 09 - 2017 09 # Date: 2016 09 - 2017 09
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
@ -29,9 +29,9 @@
# those parts. If you only want to study the general workflow, just load() # those parts. If you only want to study the general workflow, just load()
# the respective intermediate results. # the respective intermediate results.
# #
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> --------------------------------------------------- #TOC> ---------------------------------------------------
#TOC> 1 The strategy 54 #TOC> 1 The strategy 54
@ -44,17 +44,17 @@
#TOC> 3.2 Identify species in "hits" 202 #TOC> 3.2 Identify species in "hits" 202
#TOC> 4 Intersect GOLD and BLAST species 247 #TOC> 4 Intersect GOLD and BLAST species 247
#TOC> 5 Cleanup and finish 265 #TOC> 5 Cleanup and finish 265
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> #TOC>
# = 1 The strategy ======================================================== # = 1 The strategy ========================================================
# This script will create a list of "YFO" species and save it in an R object # This script will create a list of "MYSPE" species and save it in an R object
# YFOspecies that is stored in the data subdirectory of this project from where # MYSPEspecies that is stored in the data subdirectory of this project from where
# it can be loaded. The strategy is as follows: we download a list of all # it can be loaded. The strategy is as follows: we download a list of all
# genome projects and then select species for which protein annotations are # genome projects and then select species for which protein annotations are
# available - i.e. these are all genome-sequenced species that have been # available - i.e. these are all genome-sequenced species that have been
@ -251,7 +251,7 @@ length(BLASTspecies)
# etc. See here: # etc. See here:
?union ?union
YFOspecies <- intersect(GOLDspecies, BLASTspecies) MYSPEspecies <- intersect(GOLDspecies, BLASTspecies)
# Again: interpret this: # Again: interpret this:
# - what is the number of GOLDspecies? # - what is the number of GOLDspecies?
@ -272,9 +272,9 @@ YFOspecies <- intersect(GOLDspecies, BLASTspecies)
REFspecies REFspecies
YFOspecies <- sort(setdiff(YFOspecies, REFspecies)) MYSPEspecies <- sort(setdiff(MYSPEspecies, REFspecies))
# save(YFOspecies, file = "data/YFOspecies.RData") # save(MYSPEspecies, file = "data/MYSPEspecies.RData")

View File

@ -46,31 +46,31 @@ data(BLOSUM62)
sel <- myDB$protein$name == "MBP1_SACCE" sel <- myDB$protein$name == "MBP1_SACCE"
MBP1_SACCE <- s2c(myDB$protein$sequence[sel]) MBP1_SACCE <- s2c(myDB$protein$sequence[sel])
sel <- myDB$protein$name == paste("MBP1_", biCode(YFO), sep = "") sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
MBP1_YFO <- s2c(myDB$protein$sequence[sel]) MBP1_MYSPE <- s2c(myDB$protein$sequence[sel])
# Check that we have two character vectors of the expected length. # Check that we have two character vectors of the expected length.
str(MBP1_SACCE) str(MBP1_SACCE)
str(MBP1_YFO) str(MBP1_MYSPE)
# How do we get the pairscore values? Consider: a single pair of amino acids can # How do we get the pairscore values? Consider: a single pair of amino acids can
# be obtained from sequence SACCE and YFO eg. from position 13 and 21 ... # be obtained from sequence SACCE and MYSPE eg. from position 13 and 21 ...
MBP1_SACCE[13] MBP1_SACCE[13]
MBP1_YFO[21] MBP1_MYSPE[21]
# ... using these as subsetting expressions, we can pull the pairscore # ... using these as subsetting expressions, we can pull the pairscore
# from the MDM # from the MDM
BLOSUM62[MBP1_SACCE[13], MBP1_YFO[21]] BLOSUM62[MBP1_SACCE[13], MBP1_MYSPE[21]]
# First we build an empty matrix that will hold all pairscores ... # First we build an empty matrix that will hold all pairscores ...
dotMat <- matrix(numeric(length(MBP1_SACCE) * length(MBP1_YFO)), dotMat <- matrix(numeric(length(MBP1_SACCE) * length(MBP1_MYSPE)),
nrow = length(MBP1_SACCE), ncol = length(MBP1_YFO)) nrow = length(MBP1_SACCE), ncol = length(MBP1_MYSPE))
# ... then we loop over the sequences and store the scores in the matrix. # ... then we loop over the sequences and store the scores in the matrix.
# #
for (i in 1:length(MBP1_SACCE)) { for (i in 1:length(MBP1_SACCE)) {
for (j in 1:length(MBP1_YFO)) { for (j in 1:length(MBP1_MYSPE)) {
dotMat[i, j] <- BLOSUM62[MBP1_SACCE[i], MBP1_YFO[j]] dotMat[i, j] <- BLOSUM62[MBP1_SACCE[i], MBP1_MYSPE[j]]
} }
} }
@ -80,7 +80,7 @@ for (i in 1:length(MBP1_SACCE)) {
dotMat[1:10, 1:10] dotMat[1:10, 1:10]
# Rows in this matrix correspond to an amino acid from MBP1_SACCE, columns in # Rows in this matrix correspond to an amino acid from MBP1_SACCE, columns in
# the matrix correspond to an amino acid from MBP1_YFO. # the matrix correspond to an amino acid from MBP1_MYSPE.
# To plot this, we use the image() function. Here, with default parameters. # To plot this, we use the image() function. Here, with default parameters.
@ -110,13 +110,13 @@ image(x = 1:200, y = 1:200, dotMat[1:200, 1:200], ylim=c(200,1))
# ... and labels! Axis labels would be nice ... # ... and labels! Axis labels would be nice ...
image(x = 1:200, y = 1:200, dotMat[1:200, 1:200], ylim=c(200,1), image(x = 1:200, y = 1:200, dotMat[1:200, 1:200], ylim=c(200,1),
xlab = "MBP1_YFO", ylab = "MBP1_SACCE" ) xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE" )
# ... and why don't we have axis-numbers on all four sides? Go, make that right # ... and why don't we have axis-numbers on all four sides? Go, make that right
# too ... # too ...
len <- 200 len <- 200
image(x = 1:len, y = 1:len, dotMat[1:len, 1:len], ylim=c(len,1), image(x = 1:len, y = 1:len, dotMat[1:len, 1:len], ylim=c(len,1),
xlab = "MBP1_YFO", ylab = "MBP1_SACCE", axes = FALSE) xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE", axes = FALSE)
box() box()
axis(1, at = c(1, seq(10, len, by=10))) axis(1, at = c(1, seq(10, len, by=10)))
axis(2, at = c(1, seq(10, len, by=10))) axis(2, at = c(1, seq(10, len, by=10)))
@ -129,8 +129,8 @@ axis(4, at = c(1, seq(10, len, by=10)))
# utilities file and called it dotPlot2(). Why not dotPlot() ... that's because # utilities file and called it dotPlot2(). Why not dotPlot() ... that's because
# there already is a dotplot function in the seqinr package: # there already is a dotplot function in the seqinr package:
dotPlot(MBP1_SACCE, MBP1_YFO) # seqinr dotPlot(MBP1_SACCE, MBP1_MYSPE) # seqinr
dotPlot2(MBP1_SACCE, MBP1_YFO, xlab = "SACCE", ylab = "YFO") # Ours dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE") # Ours
# Which one do you prefer? You can probably see the block patterns that arise # Which one do you prefer? You can probably see the block patterns that arise
# from segments of repetitive, low complexity sequence. But you probably have to # from segments of repetitive, low complexity sequence. But you probably have to
@ -153,7 +153,7 @@ myFilter[5, ] <- c( 0, 0, 0, 0, 1)
# I have added the option to read such filters (or others that you could define on your own) as a parameter of the function. # I have added the option to read such filters (or others that you could define on your own) as a parameter of the function.
dotPlot2(MBP1_SACCE, MBP1_YFO, xlab = "SACCE", ylab = "YFO", f = myFilter) dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE", f = myFilter)
# I think the result shows quite nicely how the two sequences are globally # I think the result shows quite nicely how the two sequences are globally
# related and where the regions of sequence similarity are. Play with this a bit # related and where the regions of sequence similarity are. Play with this a bit

View File

@ -52,8 +52,8 @@ toString(s) # using the Biostrings function toString()
sel <- myDB$protein$name == "MBP1_SACCE" sel <- myDB$protein$name == "MBP1_SACCE"
aaMBP1_SACCE <- AAString(myDB$protein$sequence[sel]) aaMBP1_SACCE <- AAString(myDB$protein$sequence[sel])
sel <- myDB$protein$name == paste("MBP1_", biCode(YFO), sep = "") sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
aaMBP1_YFO <- AAString(myDB$protein$sequence[sel]) aaMBP1_MYSPE <- AAString(myDB$protein$sequence[sel])
?pairwiseAlignment ?pairwiseAlignment
@ -61,7 +61,7 @@ aaMBP1_YFO <- AAString(myDB$protein$sequence[sel])
# Global optimal alignment with end-gap penalties is default. (like EMBOSS needle) # Global optimal alignment with end-gap penalties is default. (like EMBOSS needle)
ali1 <- pairwiseAlignment( ali1 <- pairwiseAlignment(
aaMBP1_SACCE, aaMBP1_SACCE,
aaMBP1_YFO, aaMBP1_MYSPE,
substitutionMatrix = "BLOSUM62", substitutionMatrix = "BLOSUM62",
gapOpening = 10, gapOpening = 10,
gapExtension = 0.5) gapExtension = 0.5)
@ -110,7 +110,7 @@ percentID(ali1)
# Compare with local optimal alignment (like EMBOSS Water) # Compare with local optimal alignment (like EMBOSS Water)
ali2 <- pairwiseAlignment( ali2 <- pairwiseAlignment(
aaMBP1_SACCE, aaMBP1_SACCE,
aaMBP1_YFO, aaMBP1_MYSPE,
type = "local", type = "local",
substitutionMatrix = "BLOSUM62", substitutionMatrix = "BLOSUM62",
gapOpening = 50, gapOpening = 50,
@ -135,7 +135,7 @@ percentID(ali2)
# PART FOUR: APSES Domain annotation by alignment # PART FOUR: APSES Domain annotation by alignment
# ============================================================================== # ==============================================================================
# In this section we define the YFO APSES sequence by performing a global, # In this section we define the MYSPE APSES sequence by performing a global,
# optimal sequence alignment of the yeast domain with the full length protein # optimal sequence alignment of the yeast domain with the full length protein
# sequence of the protein that was the most similar to the yeast APSES domain. # sequence of the protein that was the most similar to the yeast APSES domain.
# #
@ -190,11 +190,11 @@ aaMB1_SACCE_APSES <- AAString(dbGetFeatureSequence(myDB,
"MBP1_SACCE", "MBP1_SACCE",
"APSES fold")) "APSES fold"))
# To align, we need the YFO sequence. Here is its definition again, just # To align, we need the MYSPE sequence. Here is its definition again, just
# in case ... # in case ...
sel <- myDB$protein$name == paste("MBP1_", biCode(YFO), sep = "") sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
aaMBP1_YFO <- AAString(myDB$protein$sequence[sel]) aaMBP1_MYSPE <- AAString(myDB$protein$sequence[sel])
# Now let's align these two sequences of very different length without end-gap # Now let's align these two sequences of very different length without end-gap
# penalties using the "overlap" type. "overlap" turns the # penalties using the "overlap" type. "overlap" turns the
@ -203,7 +203,7 @@ aaMBP1_YFO <- AAString(myDB$protein$sequence[sel])
aliApses <- pairwiseAlignment( aliApses <- pairwiseAlignment(
aaMB1_SACCE_APSES, aaMB1_SACCE_APSES,
aaMBP1_YFO, aaMBP1_MYSPE,
type = "overlap", type = "overlap",
substitutionMatrix = "BLOSUM62", substitutionMatrix = "BLOSUM62",
gapOpening = 10, gapOpening = 10,
@ -237,7 +237,7 @@ aliApses@subject@range@start + aliApses@subject@range@width - 1
# right away and store it in myDB. Copy the code-template below to your # right away and store it in myDB. Copy the code-template below to your
# myCode.R file, edit it to replace the placeholder items with your data: # myCode.R file, edit it to replace the placeholder items with your data:
# #
# - The <PROTEIN ID> is to be replaced with the ID of MBP1_YFO # - The <PROTEIN ID> is to be replaced with the ID of MBP1_MYSPE
# - The <FEATURE ID> is to be replaced with the ID of "APSES fold" # - The <FEATURE ID> is to be replaced with the ID of "APSES fold"
# - <START> and <END> are to be replaced with the coordinates you got above # - <START> and <END> are to be replaced with the coordinates you got above
# #
@ -277,7 +277,7 @@ myDB$proteinAnnotation[nrow(myDB$proteinAnnotation), ]
# If this is correct, save it # If this is correct, save it
save(myDB, file = "myDB.02.RData") # Note that it gets a new version number! save(myDB, file = "myDB.02.RData") # Note that it gets a new version number!
# Done with this part. Copy the sequence of the APSES domain of MBP1_<YFO> - you # Done with this part. Copy the sequence of the APSES domain of MBP1_MYSPE - you
# need it for the reverse BLAST search, and return to the course Wiki. # need it for the reverse BLAST search, and return to the course Wiki.

View File

@ -43,7 +43,7 @@ save(myDB, file = "myDB.04.RData") # save the new version
# from your myCode.R script. Here is again the table of feature IDs: # from your myCode.R script. Here is again the table of feature IDs:
myDB$feature[ , c("ID", "name", "description")] myDB$feature[ , c("ID", "name", "description")]
# Add every SMART annotated feature for MBP1_YFO to the database. If you make # Add every SMART annotated feature for MBP1_MYSPE to the database. If you make
# mistakes, just reload the latest version (probably "myDB.04.RData"), then run # mistakes, just reload the latest version (probably "myDB.04.RData"), then run
# your corrected annotation script again. Execute ... # your corrected annotation script again. Execute ...
myDB$proteinAnnotation myDB$proteinAnnotation

View File

@ -1,15 +1,15 @@
# BIN-YFO.R # BIN-MYSPE.R
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the BIN-YFO unit # R code accompanying the BIN-MYSPE unit
# #
# Version: 1.0 # Version: 1.0
# #
# Date: 2017 09 21 # Date: 2017 09 21
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# V 1.0 Final code, after rewriting BLAST parser and creating current YFOlist # V 1.0 Final code, after rewriting BLAST parser and creating current MYSPElist
# V 0.1 First code copied from BCH441_A03_makeYFOlist.R # V 0.1 First code copied from BCH441_A03_makeYFOlist.R
# #
# TODO: # TODO:
# #
@ -23,17 +23,17 @@
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> --------------------------------------- #TOC> ---------------------------------------
#TOC> 1 Preparations 38 #TOC> 1 Preparations 38
#TOC> 2 Suitable YFO Species 50 #TOC> 2 Suitable MYSPE Species 50
#TOC> 3 Adopt "YFO" 64 #TOC> 3 Adopt "MYSPE" 64
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 Preparations ======================================================== # = 1 Preparations ========================================================
# #
@ -47,39 +47,39 @@ if (! exists("myStudentNumber")) {
} }
# = 2 Suitable YFO Species ================================================ # = 2 Suitable MYSPE Species ==============================================
# In this unit we will select one species from a list of genome sequenced fungi # In this unit we will select one species from a list of genome sequenced fungi
# and write it into your personalized profile file. This species will be called # and write it into your personalized profile file. This species will be called
# "YFO" (Your Favourite Organism) for other learning units and exercises. # "MYSPE" (My Species) for other learning units and exercises.
# A detailed description of the process of compiling the list of genome # A detailed description of the process of compiling the list of genome
# sequenced fungi with protein annotations and Mbp1 homologues is in the file # sequenced fungi with protein annotations and Mbp1 homologues is in the file
# ABC-makeYFOlist.R # ABC-makeMYSPElist.R
# Task: Study ABC-makeYFOlist.R, it implements a rather typical workflow of # Task: Study ABC-makeMYSPElist.R, it implements a rather typical workflow of
# selecting and combining data from various public-domain data resources. # selecting and combining data from various public-domain data resources.
# = 3 Adopt "YFO" ========================================================= # = 3 Adopt "MYSPE" =======================================================
# In the code below, we load the resulting vector of species name, then pick one # In the code below, we load the resulting vector of species name, then pick one
# of them in a random but reproducible way, determined by your student number. # of them in a random but reproducible way, determined by your student number.
load("data/YFOspecies.RData") # load the species names load("data/MYSPEspecies.RData") # load the species names
set.seed(myStudentNumber) # seed the random number generator set.seed(myStudentNumber) # seed the random number generator
YFO <- sample(YFOspecies, 1) # pick a species at random MYSPE <- sample(MYSPEspecies, 1) # pick a species at random
# write the result to your personalized profile data so we can use the result in # write the result to your personalized profile data so we can use the result in
# other functions # other functions
cat(sprintf("YFO <- \"%s\"\n", YFO), file = ".myProfile.R", append = TRUE) cat(sprintf("MYSPE <- \"%s\"\n", MYSPE), file = ".myProfile.R", append = TRUE)
YFO # so, which species is it ... ? MYSPE # so, which species is it ... ?
biCode(YFO) # and what is its "BiCode" ... ? biCode(MYSPE) # and what is its "BiCode" ... ?
# Task: Note down the species name and its five letter label on your Student # Task: Note down the species name and its five letter label on your Student
# Wiki user page. Use this species whenever this or future assignments refer # Wiki user page. Use this species whenever this or future assignments refer
# to YFO. In code, we will automatically load it from your .myProfile.R file. # to MYSPE. In code, we will automatically load it from your .myProfile.R file.
# [END] # [END]

View File

@ -41,7 +41,7 @@ list.files(pattern = "myDB.*")
load("myDB.05.RData") load("myDB.05.RData")
# The database contains the ten Mbp1 orthologues from the reference species # The database contains the ten Mbp1 orthologues from the reference species
# and the Mbp1 RBM for YFO. # and the Mbp1 RBM for MYSPE.
# #
# We will construct a phylogenetic tree from the proteins' APSES domains. # We will construct a phylogenetic tree from the proteins' APSES domains.
# You have annotated their ranges as a feature. # You have annotated their ranges as a feature.

View File

@ -156,7 +156,7 @@ layout(matrix(1), widths=1.0, heights=1.0)
# ... or we can plot the tree so it corresponds as well as possible to a # ... or we can plot the tree so it corresponds as well as possible to a
# predefined tip ordering. Here we use the ordering that NCBI Global Tree # predefined tip ordering. Here we use the ordering that NCBI Global Tree
# returns for the reference species - we have used it above to make the vector # returns for the reference species - we have used it above to make the vector
# apsMbp1Names. You inserted your YFO name into that vector - but you should # apsMbp1Names. You inserted your MYSPE name into that vector - but you should
# move it to its correct position in the cladogram. # move it to its correct position in the cladogram.
# (Nb. we need to reverse the ordering for the plot. This is why we use the # (Nb. we need to reverse the ordering for the plot. This is why we use the

View File

@ -39,7 +39,7 @@ help(package = seqinr) # shows the available functions
?computePI ?computePI
# This takes as input a vector of upper-case AA codes # This takes as input a vector of upper-case AA codes
# Let's retrieve the YFO sequence from our datamodel # Let's retrieve the MYSPE sequence from our datamodel
# (assuming it is the last one that was added): # (assuming it is the last one that was added):
db$protein[nrow(db$protein), "sequence"] db$protein[nrow(db$protein), "sequence"]

View File

@ -23,9 +23,9 @@
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> ------------------------------------------------------------ #TOC> ------------------------------------------------------------
#TOC> 1 A Relational Datamodel in R: review 55 #TOC> 1 A Relational Datamodel in R: review 55
@ -48,9 +48,9 @@
#TOC> 3.3 Create an R script to create the database 522 #TOC> 3.3 Create an R script to create the database 522
#TOC> 3.3.1 Check and validate 542 #TOC> 3.3.1 Check and validate 542
#TOC> 3.4 Task: submit for credit (part 2/2) 583 #TOC> 3.4 Task: submit for credit (part 2/2) 583
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 A Relational Datamodel in R: review ================================= # = 1 A Relational Datamodel in R: review =================================
@ -203,7 +203,7 @@ str(philDB)
# go back, re-read, play with it, and ask for help. This is essential. # go back, re-read, play with it, and ask for help. This is essential.
# === 1.1.1 completing the database # === 1.1.1 completing the database
# Next I'll add one more person, and create the other two tables: # Next I'll add one more person, and create the other two tables:
@ -362,7 +362,7 @@ dbSanitizeSequence(x)
# == 2.3 Create a protein table for our data model ========================= # == 2.3 Create a protein table for our data model =========================
# === 2.3.1 Initialize the database # === 2.3.1 Initialize the database
# The function dbInit contains all the code to return a list of empty # The function dbInit contains all the code to return a list of empty
@ -374,7 +374,7 @@ myDB <- dbInit()
str(myDB) str(myDB)
# === 2.3.2 Add data # === 2.3.2 Add data
# fromJSON() returns a dataframe that we can readily process to add data # fromJSON() returns a dataframe that we can readily process to add data
@ -421,7 +421,7 @@ source("./scripts/ABC-createRefDB.R")
str(myDB) str(myDB)
# === 2.4.1 Examples of navigating the database # === 2.4.1 Examples of navigating the database
# You can look at the contents of the tables in the usual way we access # You can look at the contents of the tables in the usual way we access
@ -468,8 +468,8 @@ myDB$taxonomy$species[sel]
# = 3 Add your own data =================================================== # = 3 Add your own data ===================================================
# You have chosen an organism as "YFO", and your final task will be to find the # You have chosen an organism as "MYSPE", and your final task will be to find the
# protein in YFO that is most similar to yeast Mbp1 and enter its information # protein in MYSPE that is most similar to yeast Mbp1 and enter its information
# into the database. # into the database.
@ -483,7 +483,7 @@ myDB$taxonomy$species[sel]
# Protein BLAST. # Protein BLAST.
# - Enter NP_010227 into the "Query Sequence" field. # - Enter NP_010227 into the "Query Sequence" field.
# - Choose "Reference proteins (refseq_protein)" as the "Database". # - Choose "Reference proteins (refseq_protein)" as the "Database".
# - Paste the YFO species name into the "Organism" field. # - Paste the MYSPE species name into the "Organism" field.
# #
# - Click "BLAST". # - Click "BLAST".
@ -493,28 +493,28 @@ myDB$taxonomy$species[sel]
# Otherwise, look for the top-hit in the "Alignments" section. In some cases # Otherwise, look for the top-hit in the "Alignments" section. In some cases
# there will be more than one hit with nearly similar E-values. If this is the # there will be more than one hit with nearly similar E-values. If this is the
# case for YFO, choose the one with the higher degree of similarity (more # case for MYSPE, choose the one with the higher degree of similarity (more
# identities) with the N-terminus of the query - i.e. the Query sequence of # identities) with the N-terminus of the query - i.e. the Query sequence of
# the first ~ 100 amino acids. # the first ~ 100 amino acids.
# - Follow the link to the protein data page, linked from "Sequence ID". # - Follow the link to the protein data page, linked from "Sequence ID".
# - From there, in a separate tab, open the link to the taxonomy database page # - From there, in a separate tab, open the link to the taxonomy database page
# for YFO which is linked from the "ORGANISM" record. # for MYSPE which is linked from the "ORGANISM" record.
# == 3.2 Put the information into JSON files =============================== # == 3.2 Put the information into JSON files ===============================
# - Next make a copy of the file "./data/MBP1_SACCE.json" in your project # - Next make a copy of the file "./data/MBP1_SACCE.json" in your project
# directory and give it a new name that corresponds to YFO - e.g. if # directory and give it a new name that corresponds to MYSPE - e.g. if
# YFO is called "Cryptococcus neoformans", your file should be called # MYSPE is called "Cryptococcus neoformans", your file should be called
# "MBP1_CRYNE.json"; in that case "MBP1_CRYNE" would also be the # "MBP1_CRYNE.json"; in that case "MBP1_CRYNE" would also be the
# "name" of your protein. Open the file in the RStudio editor and replace # "name" of your protein. Open the file in the RStudio editor and replace
# all of the MBP1_SACCE data with the corresponding data of your protein. # all of the MBP1_SACCE data with the corresponding data of your protein.
# #
# - Do a similar thing for the YFO taxonomy entry. Copy # - Do a similar thing for the MYSPE taxonomy entry. Copy
# "./data/refTaxonomy.json" and make a new file named "YFOtaxonomy.json". # "./data/refTaxonomy.json" and make a new file named "MYSPEtaxonomy.json".
# Create a valid JSON file with only one single entry - that of YFO. # Create a valid JSON file with only one single entry - that of MYSPE.
# #
# - Validate your two files online at https://jsonlint.com/ # - Validate your two files online at https://jsonlint.com/
@ -529,7 +529,7 @@ myDB$taxonomy$species[sel]
# - then add the two commands that add your protein and taxonomy data, # - then add the two commands that add your protein and taxonomy data,
# they should look like: # they should look like:
# myDB <- dbAddProtein( myDB, fromJSON("MBP1_<code>.json")) # myDB <- dbAddProtein( myDB, fromJSON("MBP1_<code>.json"))
# myDB <- dbAddTaxonomy( myDB, fromJSON("YFOtaxonomy.json")) # myDB <- dbAddTaxonomy( myDB, fromJSON("MYSPEtaxonomy.json"))
# #
# - save the file and source() it: # - save the file and source() it:
# source("makeProteinDB.R") # source("makeProteinDB.R")
@ -539,12 +539,12 @@ myDB$taxonomy$species[sel]
# in any of the JSON files. Later you will add more information ... # in any of the JSON files. Later you will add more information ...
# === 3.3.1 Check and validate # === 3.3.1 Check and validate
# Is your protein named according to the pattern "MBP1_<YFO>"? It should be. # Is your protein named according to the pattern "MBP1_MYSPE"? It should be.
# And does the taxonomy table contain the systematic name? It should be the same # And does the taxonomy table contain the systematic name? It should be the same
# that you get when you type YFO into the console. # that you get when you type MYSPE into the console.
# Let's compute sequence lengths on the fly (with the function nchar() ), and # Let's compute sequence lengths on the fly (with the function nchar() ), and
# open this with the table viewer function View() # open this with the table viewer function View()
@ -562,18 +562,18 @@ View(cbind(myDB$protein[ , c("ID", "name", "RefSeqID")],
myDB$protein$sequence[nrow(myDB$protein)] myDB$protein$sequence[nrow(myDB$protein)]
# If not, don't continue! Fix the problem first. # If not, don't continue! Fix the problem first.
# Let me repeat: If this does not give you the right sequence of the YFO # Let me repeat: If this does not give you the right sequence of the MYSPE
# Mbp1 homologue, DO NOT CONTINUE. Fix the problem. # Mbp1 homologue, DO NOT CONTINUE. Fix the problem.
# Is that the right taxonomy ID and binomial name for YFO? # Is that the right taxonomy ID and binomial name for MYSPE?
sel <- myDB$taxonomy$species == YFO sel <- myDB$taxonomy$species == MYSPE
myDB$taxonomy[sel, ] myDB$taxonomy[sel, ]
# If not, or if the result was "<0 rows> ... " then DO NOT CONTINUE. # If not, or if the result was "<0 rows> ... " then DO NOT CONTINUE.
# Fix the problem first. # Fix the problem first.
# Does this give you the right refseq ID for MBP1_<YFO>? # Does this give you the right refseq ID for MBP1_MYSPE?
sel <- myDB$protein$name == paste0("MBP1_", biCode(YFO)) sel <- myDB$protein$name == paste0("MBP1_", biCode(MYSPE))
myDB$protein$RefSeqID[sel] myDB$protein$RefSeqID[sel]
# If not, or if the result was "<0 rows> ... " then DO NOT CONTINUE. # If not, or if the result was "<0 rows> ... " then DO NOT CONTINUE.
@ -589,8 +589,8 @@ myDB$protein$RefSeqID[sel]
# page on the Student Wiki # page on the Student Wiki
# - Execute the two commands below and show the result on your submission page # - Execute the two commands below and show the result on your submission page
biCode(myDB$taxonomy$species) %in% biCode(YFO) biCode(myDB$taxonomy$species) %in% biCode(MYSPE)
myDB$protein$taxonomyID %in% myDB$taxonomy$ID[(myDB$taxonomy$species == YFO)] myDB$protein$taxonomyID %in% myDB$taxonomy$ID[(myDB$taxonomy$species == MYSPE)]
# That is all. # That is all.

BIN
data/MYSPEspecies.RData Normal file

Binary file not shown.

Binary file not shown.