Changing "YFO" to "MYSPE"
This commit is contained in:
parent
a83f2166a5
commit
9ac45565f4
9
.init.R
9
.init.R
@ -24,6 +24,15 @@ if (! file.exists(".myProfile.R")) {
|
||||
rm(e, n, conn)
|
||||
}
|
||||
|
||||
# Patch YFO -> MYSPE if necessary:
|
||||
tmp <- readLines(".myProfile.R")
|
||||
if (length(grep("^YFO", tmp)) > 0) {
|
||||
idx <- grep("^YFO", tmp)
|
||||
tmp[idx] <- gsub("^YFO", "MYSPE", tmp[idx])
|
||||
writeLines(tmp, ".myProfile.R")
|
||||
}
|
||||
rm(tmp)
|
||||
|
||||
source(".myProfile.R")
|
||||
|
||||
source(".utilities.R")
|
||||
|
@ -1,9 +1,9 @@
|
||||
# ABC_makeYFOlist.R
|
||||
# ABC_makeMYSPElist.R
|
||||
#
|
||||
# Purpose: Create a list of genome sequenced fungi with protein annotations and
|
||||
# Mbp1 homologues.
|
||||
#
|
||||
# Version: 1.1
|
||||
# Version: 1.1.1
|
||||
#
|
||||
# Date: 2016 09 - 2017 09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
@ -29,9 +29,9 @@
|
||||
# those parts. If you only want to study the general workflow, just load()
|
||||
# the respective intermediate results.
|
||||
#
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------------------
|
||||
#TOC> 1 The strategy 54
|
||||
@ -44,17 +44,17 @@
|
||||
#TOC> 3.2 Identify species in "hits" 202
|
||||
#TOC> 4 Intersect GOLD and BLAST species 247
|
||||
#TOC> 5 Cleanup and finish 265
|
||||
#TOC>
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
|
||||
#TOC>
|
||||
#TOC>
|
||||
|
||||
# = 1 The strategy ========================================================
|
||||
|
||||
# This script will create a list of "YFO" species and save it in an R object
|
||||
# YFOspecies that is stored in the data subdirectory of this project from where
|
||||
# This script will create a list of "MYSPE" species and save it in an R object
|
||||
# MYSPEspecies that is stored in the data subdirectory of this project from where
|
||||
# it can be loaded. The strategy is as follows: we download a list of all
|
||||
# genome projects and then select species for which protein annotations are
|
||||
# available - i.e. these are all genome-sequenced species that have been
|
||||
@ -251,7 +251,7 @@ length(BLASTspecies)
|
||||
# etc. See here:
|
||||
?union
|
||||
|
||||
YFOspecies <- intersect(GOLDspecies, BLASTspecies)
|
||||
MYSPEspecies <- intersect(GOLDspecies, BLASTspecies)
|
||||
|
||||
# Again: interpret this:
|
||||
# - what is the number of GOLDspecies?
|
||||
@ -272,9 +272,9 @@ YFOspecies <- intersect(GOLDspecies, BLASTspecies)
|
||||
|
||||
REFspecies
|
||||
|
||||
YFOspecies <- sort(setdiff(YFOspecies, REFspecies))
|
||||
MYSPEspecies <- sort(setdiff(MYSPEspecies, REFspecies))
|
||||
|
||||
# save(YFOspecies, file = "data/YFOspecies.RData")
|
||||
# save(MYSPEspecies, file = "data/MYSPEspecies.RData")
|
||||
|
||||
|
||||
|
@ -46,31 +46,31 @@ data(BLOSUM62)
|
||||
sel <- myDB$protein$name == "MBP1_SACCE"
|
||||
MBP1_SACCE <- s2c(myDB$protein$sequence[sel])
|
||||
|
||||
sel <- myDB$protein$name == paste("MBP1_", biCode(YFO), sep = "")
|
||||
MBP1_YFO <- s2c(myDB$protein$sequence[sel])
|
||||
sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
|
||||
MBP1_MYSPE <- s2c(myDB$protein$sequence[sel])
|
||||
|
||||
# Check that we have two character vectors of the expected length.
|
||||
str(MBP1_SACCE)
|
||||
str(MBP1_YFO)
|
||||
str(MBP1_MYSPE)
|
||||
|
||||
# How do we get the pairscore values? Consider: a single pair of amino acids can
|
||||
# be obtained from sequence SACCE and YFO eg. from position 13 and 21 ...
|
||||
# be obtained from sequence SACCE and MYSPE eg. from position 13 and 21 ...
|
||||
MBP1_SACCE[13]
|
||||
MBP1_YFO[21]
|
||||
MBP1_MYSPE[21]
|
||||
|
||||
# ... using these as subsetting expressions, we can pull the pairscore
|
||||
# from the MDM
|
||||
BLOSUM62[MBP1_SACCE[13], MBP1_YFO[21]]
|
||||
BLOSUM62[MBP1_SACCE[13], MBP1_MYSPE[21]]
|
||||
|
||||
# First we build an empty matrix that will hold all pairscores ...
|
||||
dotMat <- matrix(numeric(length(MBP1_SACCE) * length(MBP1_YFO)),
|
||||
nrow = length(MBP1_SACCE), ncol = length(MBP1_YFO))
|
||||
dotMat <- matrix(numeric(length(MBP1_SACCE) * length(MBP1_MYSPE)),
|
||||
nrow = length(MBP1_SACCE), ncol = length(MBP1_MYSPE))
|
||||
|
||||
# ... then we loop over the sequences and store the scores in the matrix.
|
||||
#
|
||||
for (i in 1:length(MBP1_SACCE)) {
|
||||
for (j in 1:length(MBP1_YFO)) {
|
||||
dotMat[i, j] <- BLOSUM62[MBP1_SACCE[i], MBP1_YFO[j]]
|
||||
for (j in 1:length(MBP1_MYSPE)) {
|
||||
dotMat[i, j] <- BLOSUM62[MBP1_SACCE[i], MBP1_MYSPE[j]]
|
||||
}
|
||||
}
|
||||
|
||||
@ -80,7 +80,7 @@ for (i in 1:length(MBP1_SACCE)) {
|
||||
dotMat[1:10, 1:10]
|
||||
|
||||
# Rows in this matrix correspond to an amino acid from MBP1_SACCE, columns in
|
||||
# the matrix correspond to an amino acid from MBP1_YFO.
|
||||
# the matrix correspond to an amino acid from MBP1_MYSPE.
|
||||
|
||||
# To plot this, we use the image() function. Here, with default parameters.
|
||||
|
||||
@ -110,13 +110,13 @@ image(x = 1:200, y = 1:200, dotMat[1:200, 1:200], ylim=c(200,1))
|
||||
|
||||
# ... and labels! Axis labels would be nice ...
|
||||
image(x = 1:200, y = 1:200, dotMat[1:200, 1:200], ylim=c(200,1),
|
||||
xlab = "MBP1_YFO", ylab = "MBP1_SACCE" )
|
||||
xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE" )
|
||||
|
||||
# ... and why don't we have axis-numbers on all four sides? Go, make that right
|
||||
# too ...
|
||||
len <- 200
|
||||
image(x = 1:len, y = 1:len, dotMat[1:len, 1:len], ylim=c(len,1),
|
||||
xlab = "MBP1_YFO", ylab = "MBP1_SACCE", axes = FALSE)
|
||||
xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE", axes = FALSE)
|
||||
box()
|
||||
axis(1, at = c(1, seq(10, len, by=10)))
|
||||
axis(2, at = c(1, seq(10, len, by=10)))
|
||||
@ -129,8 +129,8 @@ axis(4, at = c(1, seq(10, len, by=10)))
|
||||
# utilities file and called it dotPlot2(). Why not dotPlot() ... that's because
|
||||
# there already is a dotplot function in the seqinr package:
|
||||
|
||||
dotPlot(MBP1_SACCE, MBP1_YFO) # seqinr
|
||||
dotPlot2(MBP1_SACCE, MBP1_YFO, xlab = "SACCE", ylab = "YFO") # Our's
|
||||
dotPlot(MBP1_SACCE, MBP1_MYSPE) # seqinr
|
||||
dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE") # Our's
|
||||
|
||||
# Which one do you prefer? You can probably see the block patterns that arise
|
||||
# from segments of repetitive, low complexity sequence. But you probably have to
|
||||
@ -153,7 +153,7 @@ myFilter[5, ] <- c( 0, 0, 0, 0, 1)
|
||||
|
||||
# I have added the option to read such filters (or others that you could define on your own) as a parameter of the function.
|
||||
|
||||
dotPlot2(MBP1_SACCE, MBP1_YFO, xlab = "SACCE", ylab = "YFO", f = myFilter)
|
||||
dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE", f = myFilter)
|
||||
|
||||
# I think the result shows quite nicely how the two sequences are globally
|
||||
# related and where the regions of sequence similarity are. Play with this a bit
|
||||
|
@ -52,8 +52,8 @@ toString(s) # using the Biostrings function toString()
|
||||
sel <- myDB$protein$name == "MBP1_SACCE"
|
||||
aaMBP1_SACCE <- AAString(myDB$protein$sequence[sel])
|
||||
|
||||
sel <- myDB$protein$name == paste("MBP1_", biCode(YFO), sep = "")
|
||||
aaMBP1_YFO <- AAString(myDB$protein$sequence[sel])
|
||||
sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
|
||||
aaMBP1_MYSPE <- AAString(myDB$protein$sequence[sel])
|
||||
|
||||
?pairwiseAlignment
|
||||
|
||||
@ -61,7 +61,7 @@ aaMBP1_YFO <- AAString(myDB$protein$sequence[sel])
|
||||
# Global optimal alignment with end-gap penalties is default. (like EMBOSS needle)
|
||||
ali1 <- pairwiseAlignment(
|
||||
aaMBP1_SACCE,
|
||||
aaMBP1_YFO,
|
||||
aaMBP1_MYSPE,
|
||||
substitutionMatrix = "BLOSUM62",
|
||||
gapOpening = 10,
|
||||
gapExtension = 0.5)
|
||||
@ -110,7 +110,7 @@ percentID(ali1)
|
||||
# Compare with local optimal alignment (like EMBOSS Water)
|
||||
ali2 <- pairwiseAlignment(
|
||||
aaMBP1_SACCE,
|
||||
aaMBP1_YFO,
|
||||
aaMBP1_MYSPE,
|
||||
type = "local",
|
||||
substitutionMatrix = "BLOSUM62",
|
||||
gapOpening = 50,
|
||||
@ -135,7 +135,7 @@ percentID(ali2)
|
||||
# PART FOUR: APSES Domain annotation by alignment
|
||||
# ==============================================================================
|
||||
|
||||
# In this section we define the YFO APSES sequence by performing a global,
|
||||
# In this section we define the MYSPE APSES sequence by performing a global,
|
||||
# optimal sequence alignment of the yeast domain with the full length protein
|
||||
# sequence of the protein that was the most similar to the yeast APSES domain.
|
||||
#
|
||||
@ -190,11 +190,11 @@ aaMB1_SACCE_APSES <- AAString(dbGetFeatureSequence(myDB,
|
||||
"MBP1_SACCE",
|
||||
"APSES fold"))
|
||||
|
||||
# To align, we need the YFO sequence. Here is it's definition again, just
|
||||
# To align, we need the MYSPE sequence. Here is it's definition again, just
|
||||
# in case ...
|
||||
|
||||
sel <- myDB$protein$name == paste("MBP1_", biCode(YFO), sep = "")
|
||||
aaMBP1_YFO <- AAString(myDB$protein$sequence[sel])
|
||||
sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
|
||||
aaMBP1_MYSPE <- AAString(myDB$protein$sequence[sel])
|
||||
|
||||
# Now let's align these two sequences of very different length without end-gap
|
||||
# penalties using the "overlap" type. "overlap" turns the
|
||||
@ -203,7 +203,7 @@ aaMBP1_YFO <- AAString(myDB$protein$sequence[sel])
|
||||
|
||||
aliApses <- pairwiseAlignment(
|
||||
aaMB1_SACCE_APSES,
|
||||
aaMBP1_YFO,
|
||||
aaMBP1_MYSPE,
|
||||
type = "overlap",
|
||||
substitutionMatrix = "BLOSUM62",
|
||||
gapOpening = 10,
|
||||
@ -237,7 +237,7 @@ aliApses@subject@range@start + aliApses@subject@range@width - 1
|
||||
# right away and store it in myDB. Copy the code-template below to your
|
||||
# myCode.R file, edit it to replace the placeholder items with your data:
|
||||
#
|
||||
# - The <PROTEIN ID> is to be replaced with the ID of MBP1_YFO
|
||||
# - The <PROTEIN ID> is to be replaced with the ID of MBP1_MYSPE
|
||||
# - The <FEATURE ID> is to be replaced with the ID of "APSES fold"
|
||||
# - <START> and <END> are to be replaced with the coordinates you got above
|
||||
#
|
||||
@ -277,7 +277,7 @@ myDB$proteinAnnotation[nrow(myDB$proteinAnnotation), ]
|
||||
# If this is correct, save it
|
||||
save(myDB, file = "myDB.02.RData") # Note that it gets a new version number!
|
||||
|
||||
# Done with this part. Copy the sequence of the APSES domain of MBP1_<YFO> - you
|
||||
# Done with this part. Copy the sequence of the APSES domain of MBP1_MYSPE - you
|
||||
# need it for the reverse BLAST search, and return to the course Wiki.
|
||||
|
||||
|
||||
|
@ -43,7 +43,7 @@ save(myDB, file = "myDB.04.RData") # save the new version
|
||||
# from your myCode.R script. Here is again the table of feature IDs:
|
||||
myDB$feature[ , c("ID", "name", "description")]
|
||||
|
||||
# Add every SMART annotated feaure for MBP1_YFO to the database. If you make
|
||||
# Add every SMART annotated feaure for MBP1_MYSPE to the database. If you make
|
||||
# mistakes, just reload the latest version (probably "myDB.04.RData"), then run
|
||||
# your corrected annotation script again. Execute ...
|
||||
myDB$proteinAnnotation
|
||||
|
@ -1,15 +1,15 @@
|
||||
# BIN-YFO.R
|
||||
# BIN-MYSPE.R
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-YFO unit
|
||||
# R code accompanying the BIN-MYSPE unit
|
||||
#
|
||||
# Version: 1.0
|
||||
#
|
||||
# Date: 2017 09 21
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# V 1.0 Final code, after rewriting BLAST parser and creating current YFOlist
|
||||
# V 0.1 First code copied from BCH441_A03_makeYFOlist.R
|
||||
# V 1.0 Final code, after rewriting BLAST parser and creating current MYSPElist
|
||||
# V 0.1 First code copied from BCH441_A03_makeMYSPElist.R
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
@ -23,17 +23,17 @@
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------
|
||||
#TOC> 1 Preparations 38
|
||||
#TOC> 2 Suitable YFO Species 50
|
||||
#TOC> 3 Adopt "YFO" 64
|
||||
#TOC>
|
||||
#TOC> 2 Suitable MYSPE Species 50
|
||||
#TOC> 3 Adopt "MYSPE" 64
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
|
||||
# = 1 Preparations ========================================================
|
||||
#
|
||||
@ -47,39 +47,39 @@ if (! exists("myStudentNumber")) {
|
||||
}
|
||||
|
||||
|
||||
# = 2 Suitable YFO Species ================================================
|
||||
# = 2 Suitable MYSPE Species ==============================================
|
||||
|
||||
|
||||
# In this unit we will select one species from a list of genome sequenced fungi
|
||||
# and write it into your personalized profile file. This species will be called
|
||||
# "YFO" (Your Favourite Organism) for other learning units and exercises.
|
||||
# "MYSPE" (Your Favourite Organism) for other learning units and exercises.
|
||||
|
||||
# A detailed description of the process of compiling the list of genome
|
||||
# sequenced fungi with protein annotations and Mbp1 homologues is in the file
|
||||
# ABC-makeYFOlist.R
|
||||
# ABC-makeMYSPElist.R
|
||||
|
||||
# Task: Study ABC-makeYFOlist.R, it implements a rather typical workflow of
|
||||
# Task: Study ABC-makeMYSPElist.R, it implements a rather typical workflow of
|
||||
# selecting and combining data from various public-domain data resources.
|
||||
|
||||
# = 3 Adopt "YFO" =========================================================
|
||||
# = 3 Adopt "MYSPE" =======================================================
|
||||
|
||||
|
||||
# In the code below, we load the resulting vector of species name, then pick one
|
||||
# of them in a random but reproducible way, determined by your student number.
|
||||
|
||||
load("data/YFOspecies.RData") # load the species names
|
||||
set.seed(myStudentNumber) # seed the random number generator
|
||||
YFO <- sample(YFOspecies, 1) # pick a species at random
|
||||
load("data/MYSPEspecies.RData") # load the species names
|
||||
set.seed(myStudentNumber) # seed the random number generator
|
||||
MYSPE <- sample(MYSPEspecies, 1) # pick a species at random
|
||||
# write the result to your personalized profile data so we can use the result in
|
||||
# other functions
|
||||
cat(sprintf("YFO <- \"%s\"\n", YFO), file = ".myProfile.R", append = TRUE)
|
||||
cat(sprintf("MYSPE <- \"%s\"\n", MYSPE), file = ".myProfile.R", append = TRUE)
|
||||
|
||||
YFO # so, which species is it ... ?
|
||||
biCode(YFO) # and what is it's "BiCode" ... ?
|
||||
MYSPE # so, which species is it ... ?
|
||||
biCode(MYSPE) # and what is it's "BiCode" ... ?
|
||||
|
||||
# Task: Note down the species name and its five letter label on your Student
|
||||
# Wiki user page. Use this species whenever this or future assignments refer
|
||||
# to YFO. In code, we will automatically load it from your.myProfile.R file.
|
||||
# to MYSPE. In code, we will automatically load it from your.myProfile.R file.
|
||||
|
||||
|
||||
# [END]
|
@ -41,7 +41,7 @@ list.files(pattern = "myDB.*")
|
||||
load("myDB.05.RData")
|
||||
|
||||
# The database contains the ten Mbp1 orthologues from the reference species
|
||||
# and the Mbp1 RBM for YFO.
|
||||
# and the Mbp1 RBM for MYSPE.
|
||||
#
|
||||
# We will construct a phylogenetic tree from the proteins' APSES domains.
|
||||
# You have annotated their ranges as a feature.
|
||||
|
@ -156,7 +156,7 @@ layout(matrix(1), widths=1.0, heights=1.0)
|
||||
# ... or we can plot the tree so it corresponds as well as possible to a
|
||||
# predefined tip ordering. Here we use the ordering that NCBI Global Tree
|
||||
# returns for the reference species - we have used it above to make the vector
|
||||
# apsMbp1Names. You inserted your YFO name into that vector - but you should
|
||||
# apsMbp1Names. You inserted your MYSPE name into that vector - but you should
|
||||
# move it to its correct position in the cladogram.
|
||||
|
||||
# (Nb. we need to reverse the ordering for the plot. This is why we use the
|
||||
|
@ -39,7 +39,7 @@ help(package = seqinr) # shows the available functions
|
||||
?computePI
|
||||
|
||||
# This takes as input a vector of upper-case AA codes
|
||||
# Let's retrieve the YFO sequence from our datamodel
|
||||
# Let's retrieve the MYSPE sequence from our datamodel
|
||||
# (assuming it is the last one that was added):
|
||||
|
||||
db$protein[nrow(db$protein), "sequence"]
|
||||
|
@ -23,9 +23,9 @@
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ------------------------------------------------------------
|
||||
#TOC> 1 A Relational Datamodel in R: review 55
|
||||
@ -48,9 +48,9 @@
|
||||
#TOC> 3.3 Create an R script to create the database 522
|
||||
#TOC> 3.3.1 Check and validate 542
|
||||
#TOC> 3.4 Task: submit for credit (part 2/2) 583
|
||||
#TOC>
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
|
||||
# = 1 A Relational Datamodel in R: review =================================
|
||||
|
||||
@ -203,7 +203,7 @@ str(philDB)
|
||||
# go back, re-read, play with it, and ask for help. This is essential.
|
||||
|
||||
|
||||
# === 1.1.1 completing the database
|
||||
# === 1.1.1 completing the database
|
||||
|
||||
|
||||
# Next I'll add one more person, and create the other two tables:
|
||||
@ -362,7 +362,7 @@ dbSanitizeSequence(x)
|
||||
|
||||
# == 2.3 Create a protein table for our data model =========================
|
||||
|
||||
# === 2.3.1 Initialize the database
|
||||
# === 2.3.1 Initialize the database
|
||||
|
||||
|
||||
# The function dbInit contains all the code to return a list of empty
|
||||
@ -374,7 +374,7 @@ myDB <- dbInit()
|
||||
str(myDB)
|
||||
|
||||
|
||||
# === 2.3.2 Add data
|
||||
# === 2.3.2 Add data
|
||||
|
||||
|
||||
# fromJSON() returns a dataframe that we can readily process to add data
|
||||
@ -421,7 +421,7 @@ source("./scripts/ABC-createRefDB.R")
|
||||
str(myDB)
|
||||
|
||||
|
||||
# === 2.4.1 Examples of navigating the database
|
||||
# === 2.4.1 Examples of navigating the database
|
||||
|
||||
|
||||
# You can look at the contents of the tables in the usual way we access
|
||||
@ -468,8 +468,8 @@ myDB$taxonomy$species[sel]
|
||||
# = 3 Add your own data ===================================================
|
||||
|
||||
|
||||
# You have chosen an organism as "YFO", and you final task will be to find the
|
||||
# protein in YFO that is most similar to yeast Mbp1 and enter its information
|
||||
# You have chosen an organism as "MYSPE", and you final task will be to find the
|
||||
# protein in MYSPE that is most similar to yeast Mbp1 and enter its information
|
||||
# into the database.
|
||||
|
||||
|
||||
@ -483,7 +483,7 @@ myDB$taxonomy$species[sel]
|
||||
# Protein BLAST.
|
||||
# - Enter NP_010227 into the "Query Sequence" field.
|
||||
# - Choose "Reference proteins (refseq_protein)" as the "Database".
|
||||
# - Paste the YFO species name into the "Organism" field.
|
||||
# - Paste the MYSPE species name into the "Organism" field.
|
||||
#
|
||||
# - Click "BLAST".
|
||||
|
||||
@ -493,28 +493,28 @@ myDB$taxonomy$species[sel]
|
||||
|
||||
# Otherwise, look for the top-hit in the "Alignments" section. In some cases
|
||||
# there will be more than one hit with nearly similar E-values. If this is the
|
||||
# case for YFO, choose the one with the higher degree of similarity (more
|
||||
# case for MYSPE, choose the one with the higher degree of similarity (more
|
||||
# identities) with the N-terminus of the query - i.e. the Query sequence of
|
||||
# the first ~ 100 amino acids.
|
||||
|
||||
# - Follow the link to the protein data page, linked from "Sequence ID".
|
||||
# - From there, in a separate tab, open the link to the taxonomy database page
|
||||
# for YFO which is linked from the "ORGANISM" record.
|
||||
# for MYSPE which is linked from the "ORGANISM" record.
|
||||
|
||||
|
||||
# == 3.2 Put the information into JSON files ===============================
|
||||
|
||||
|
||||
# - Next make a copy of the file "./data/MBP1_SACCE.json" in your project
|
||||
# directory and give it a new name that corresponds to YFO - e.g. if
|
||||
# YFO is called "Crptycoccus neoformans", your file should be called
|
||||
# directory and give it a new name that corresponds to MYSPE - e.g. if
|
||||
# MYSPE is called "Crptycoccus neoformans", your file should be called
|
||||
# "MBP1_CRYNE.json"; in that case "MBP1_CRYNE" would also be the
|
||||
# "name" of your protein. Open the file in the RStudio editor and replace
|
||||
# all of the MBP1_SACCE data with the corresponding data of your protein.
|
||||
#
|
||||
# - Do a similar thing for the YFO taxonomy entry. Copy
|
||||
# "./data/refTaxonomy.json" and make a new file named "YFOtaxonomy.json".
|
||||
# Create a valid JSON file with only one single entry - that of YFO.
|
||||
# - Do a similar thing for the MYSPE taxonomy entry. Copy
|
||||
# "./data/refTaxonomy.json" and make a new file named "MYSPEtaxonomy.json".
|
||||
# Create a valid JSON file with only one single entry - that of MYSPE.
|
||||
#
|
||||
# - Validate your two files online at https://jsonlint.com/
|
||||
|
||||
@ -529,7 +529,7 @@ myDB$taxonomy$species[sel]
|
||||
# - than add the two commands that add your protein and taxonomy data,
|
||||
# they should look like:
|
||||
# myDB <- dbAddProtein( myDB, fromJSON("MBP1_<code>.json"))
|
||||
# myDB <- dbAddTaxonomy( myDB, fromJSON("YFOtaxonomy.json"))
|
||||
# myDB <- dbAddTaxonomy( myDB, fromJSON("MYSPEtaxonomy.json"))
|
||||
#
|
||||
# - save the file and source() it:
|
||||
# source("makeProteinDB.R")
|
||||
@ -539,12 +539,12 @@ myDB$taxonomy$species[sel]
|
||||
# in any of the JSON files. Later you will add more information ...
|
||||
|
||||
|
||||
# === 3.3.1 Check and validate
|
||||
# === 3.3.1 Check and validate
|
||||
|
||||
|
||||
# Is your protein named according to the pattern "MBP1_<YFO>"? It should be.
|
||||
# Is your protein named according to the pattern "MBP1_MYSPE"? It should be.
|
||||
# And does the taxonomy table contain the systematic name? It should be the same
|
||||
# that you get when you type YFO into the console.
|
||||
# that you get when you type MYSPE into the console.
|
||||
|
||||
# Let's compute sequence lengths on the fly (with the function nchar() ), and
|
||||
# open this with the table viewer function View()
|
||||
@ -562,18 +562,18 @@ View(cbind(myDB$protein[ , c("ID", "name", "RefSeqID")],
|
||||
myDB$protein$sequence[nrow(myDB$protein)]
|
||||
|
||||
# If not, don't continue! Fix the problem first.
|
||||
# Let me repeat: If this does not give you the right sequence of the YFO
|
||||
# Let me repeat: If this does not give you the right sequence of the MYSPE
|
||||
# Mbp1 homologue, DO NOT CONTINUE. Fix the problem.
|
||||
|
||||
# Is that the right taxonomy ID and binomial name for YFO?
|
||||
sel <- myDB$taxonomy$species == YFO
|
||||
# Is that the right taxonomy ID and binomial name for MYSPE?
|
||||
sel <- myDB$taxonomy$species == MYSPE
|
||||
myDB$taxonomy[sel, ]
|
||||
|
||||
# If not, or if the result was "<0 rows> ... " then DO NOT CONTINUE.
|
||||
# Fix the problem first.
|
||||
|
||||
# Does this give you the right refseq ID for MBP1_<YFO>?
|
||||
sel <- myDB$protein$name == paste0("MBP1_", biCode(YFO))
|
||||
# Does this give you the right refseq ID for MBP1_MYSPE?
|
||||
sel <- myDB$protein$name == paste0("MBP1_", biCode(MYSPE))
|
||||
myDB$protein$RefSeqID[sel]
|
||||
|
||||
# If not, or if the result was "<0 rows> ... " then DO NOT CONTINUE.
|
||||
@ -589,8 +589,8 @@ myDB$protein$RefSeqID[sel]
|
||||
# page on the Student Wiki
|
||||
# - Execute the two commands below and show the result on your submission page
|
||||
|
||||
biCode(myDB$taxonomy$species) %in% biCode(YFO)
|
||||
myDB$protein$taxonomyID %in% myDB$taxonomy$ID[(myDB$taxonomy$species == YFO)]
|
||||
biCode(myDB$taxonomy$species) %in% biCode(MYSPE)
|
||||
myDB$protein$taxonomyID %in% myDB$taxonomy$ID[(myDB$taxonomy$species == MYSPE)]
|
||||
|
||||
# That is all.
|
||||
|
||||
|
BIN
data/MYSPEspecies.RData
Normal file
BIN
data/MYSPEspecies.RData
Normal file
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue
Block a user