Changing "YFO" to "MYSPE"

This commit is contained in:
hyginn 2017-10-03 23:38:48 -04:00
parent a83f2166a5
commit 9ac45565f4
12 changed files with 102 additions and 93 deletions

View File

@ -24,6 +24,15 @@ if (! file.exists(".myProfile.R")) {
rm(e, n, conn)
}
# Patch YFO -> MYSPE if necessary:
tmp <- readLines(".myProfile.R")
if (length(grep("^YFO", tmp)) > 0) {
idx <- grep("^YFO", tmp)
tmp[idx] <- gsub("^YFO", "MYSPE", tmp[idx])
writeLines(tmp, ".myProfile.R")
}
rm(tmp)
source(".myProfile.R")
source(".utilities.R")

View File

@ -1,9 +1,9 @@
# ABC_makeYFOlist.R
# ABC_makeMYSPElist.R
#
# Purpose: Create a list of genome sequenced fungi with protein annotations and
# Mbp1 homologues.
#
# Version: 1.1
# Version: 1.1.1
#
# Date: 2016 09 - 2017 09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
@ -29,9 +29,9 @@
# those parts. If you only want to study the general workflow, just load()
# the respective intermediate results.
#
#TOC> ==========================================================================
#TOC>
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------------
#TOC> 1 The strategy 54
@ -44,17 +44,17 @@
#TOC> 3.2 Identify species in "hits" 202
#TOC> 4 Intersect GOLD and BLAST species 247
#TOC> 5 Cleanup and finish 265
#TOC>
#TOC>
#TOC> ==========================================================================
#TOC>
#TOC>
# = 1 The strategy ========================================================
# This script will create a list of "YFO" species and save it in an R object
# YFOspecies that is stored in the data subdirectory of this project from where
# This script will create a list of "MYSPE" species and save it in an R object
# MYSPEspecies that is stored in the data subdirectory of this project from where
# it can be loaded. The strategy is as follows: we download a list of all
# genome projects and then select species for which protein annotations are
# available - i.e. these are all genome-sequenced species that have been
@ -251,7 +251,7 @@ length(BLASTspecies)
# etc. See here:
?union
YFOspecies <- intersect(GOLDspecies, BLASTspecies)
MYSPEspecies <- intersect(GOLDspecies, BLASTspecies)
# Again: interpret this:
# - what is the number of GOLDspecies?
@ -272,9 +272,9 @@ YFOspecies <- intersect(GOLDspecies, BLASTspecies)
REFspecies
YFOspecies <- sort(setdiff(YFOspecies, REFspecies))
MYSPEspecies <- sort(setdiff(MYSPEspecies, REFspecies))
# save(YFOspecies, file = "data/YFOspecies.RData")
# save(MYSPEspecies, file = "data/MYSPEspecies.RData")

View File

@ -46,31 +46,31 @@ data(BLOSUM62)
sel <- myDB$protein$name == "MBP1_SACCE"
MBP1_SACCE <- s2c(myDB$protein$sequence[sel])
sel <- myDB$protein$name == paste("MBP1_", biCode(YFO), sep = "")
MBP1_YFO <- s2c(myDB$protein$sequence[sel])
sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
MBP1_MYSPE <- s2c(myDB$protein$sequence[sel])
# Check that we have two character vectors of the expected length.
str(MBP1_SACCE)
str(MBP1_YFO)
str(MBP1_MYSPE)
# How do we get the pairscore values? Consider: a single pair of amino acids can
# be obtained from sequence SACCE and YFO eg. from position 13 and 21 ...
# be obtained from sequence SACCE and MYSPE eg. from position 13 and 21 ...
MBP1_SACCE[13]
MBP1_YFO[21]
MBP1_MYSPE[21]
# ... using these as subsetting expressions, we can pull the pairscore
# from the MDM
BLOSUM62[MBP1_SACCE[13], MBP1_YFO[21]]
BLOSUM62[MBP1_SACCE[13], MBP1_MYSPE[21]]
# First we build an empty matrix that will hold all pairscores ...
dotMat <- matrix(numeric(length(MBP1_SACCE) * length(MBP1_YFO)),
nrow = length(MBP1_SACCE), ncol = length(MBP1_YFO))
dotMat <- matrix(numeric(length(MBP1_SACCE) * length(MBP1_MYSPE)),
nrow = length(MBP1_SACCE), ncol = length(MBP1_MYSPE))
# ... then we loop over the sequences and store the scores in the matrix.
#
for (i in 1:length(MBP1_SACCE)) {
for (j in 1:length(MBP1_YFO)) {
dotMat[i, j] <- BLOSUM62[MBP1_SACCE[i], MBP1_YFO[j]]
for (j in 1:length(MBP1_MYSPE)) {
dotMat[i, j] <- BLOSUM62[MBP1_SACCE[i], MBP1_MYSPE[j]]
}
}
@ -80,7 +80,7 @@ for (i in 1:length(MBP1_SACCE)) {
dotMat[1:10, 1:10]
# Rows in this matrix correspond to an amino acid from MBP1_SACCE, columns in
# the matrix correspond to an amino acid from MBP1_YFO.
# the matrix correspond to an amino acid from MBP1_MYSPE.
# To plot this, we use the image() function. Here, with default parameters.
@ -110,13 +110,13 @@ image(x = 1:200, y = 1:200, dotMat[1:200, 1:200], ylim=c(200,1))
# ... and labels! Axis labels would be nice ...
image(x = 1:200, y = 1:200, dotMat[1:200, 1:200], ylim=c(200,1),
xlab = "MBP1_YFO", ylab = "MBP1_SACCE" )
xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE" )
# ... and why don't we have axis-numbers on all four sides? Go, make that right
# too ...
len <- 200
image(x = 1:len, y = 1:len, dotMat[1:len, 1:len], ylim=c(len,1),
xlab = "MBP1_YFO", ylab = "MBP1_SACCE", axes = FALSE)
xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE", axes = FALSE)
box()
axis(1, at = c(1, seq(10, len, by=10)))
axis(2, at = c(1, seq(10, len, by=10)))
@ -129,8 +129,8 @@ axis(4, at = c(1, seq(10, len, by=10)))
# utilities file and called it dotPlot2(). Why not dotPlot() ... that's because
# there already is a dotplot function in the seqinr package:
dotPlot(MBP1_SACCE, MBP1_YFO) # seqinr
dotPlot2(MBP1_SACCE, MBP1_YFO, xlab = "SACCE", ylab = "YFO") # Our's
dotPlot(MBP1_SACCE, MBP1_MYSPE) # seqinr
dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE") # Our's
# Which one do you prefer? You can probably see the block patterns that arise
# from segments of repetitive, low complexity sequence. But you probably have to
@ -153,7 +153,7 @@ myFilter[5, ] <- c( 0, 0, 0, 0, 1)
# I have added the option to read such filters (or others that you could define on your own) as a parameter of the function.
dotPlot2(MBP1_SACCE, MBP1_YFO, xlab = "SACCE", ylab = "YFO", f = myFilter)
dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE", f = myFilter)
# I think the result shows quite nicely how the two sequences are globally
# related and where the regions of sequence similarity are. Play with this a bit

View File

@ -52,8 +52,8 @@ toString(s) # using the Biostrings function toString()
sel <- myDB$protein$name == "MBP1_SACCE"
aaMBP1_SACCE <- AAString(myDB$protein$sequence[sel])
sel <- myDB$protein$name == paste("MBP1_", biCode(YFO), sep = "")
aaMBP1_YFO <- AAString(myDB$protein$sequence[sel])
sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
aaMBP1_MYSPE <- AAString(myDB$protein$sequence[sel])
?pairwiseAlignment
@ -61,7 +61,7 @@ aaMBP1_YFO <- AAString(myDB$protein$sequence[sel])
# Global optimal alignment with end-gap penalties is default. (like EMBOSS needle)
ali1 <- pairwiseAlignment(
aaMBP1_SACCE,
aaMBP1_YFO,
aaMBP1_MYSPE,
substitutionMatrix = "BLOSUM62",
gapOpening = 10,
gapExtension = 0.5)
@ -110,7 +110,7 @@ percentID(ali1)
# Compare with local optimal alignment (like EMBOSS Water)
ali2 <- pairwiseAlignment(
aaMBP1_SACCE,
aaMBP1_YFO,
aaMBP1_MYSPE,
type = "local",
substitutionMatrix = "BLOSUM62",
gapOpening = 50,
@ -135,7 +135,7 @@ percentID(ali2)
# PART FOUR: APSES Domain annotation by alignment
# ==============================================================================
# In this section we define the YFO APSES sequence by performing a global,
# In this section we define the MYSPE APSES sequence by performing a global,
# optimal sequence alignment of the yeast domain with the full length protein
# sequence of the protein that was the most similar to the yeast APSES domain.
#
@ -190,11 +190,11 @@ aaMB1_SACCE_APSES <- AAString(dbGetFeatureSequence(myDB,
"MBP1_SACCE",
"APSES fold"))
# To align, we need the YFO sequence. Here is it's definition again, just
# To align, we need the MYSPE sequence. Here is it's definition again, just
# in case ...
sel <- myDB$protein$name == paste("MBP1_", biCode(YFO), sep = "")
aaMBP1_YFO <- AAString(myDB$protein$sequence[sel])
sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
aaMBP1_MYSPE <- AAString(myDB$protein$sequence[sel])
# Now let's align these two sequences of very different length without end-gap
# penalties using the "overlap" type. "overlap" turns the
@ -203,7 +203,7 @@ aaMBP1_YFO <- AAString(myDB$protein$sequence[sel])
aliApses <- pairwiseAlignment(
aaMB1_SACCE_APSES,
aaMBP1_YFO,
aaMBP1_MYSPE,
type = "overlap",
substitutionMatrix = "BLOSUM62",
gapOpening = 10,
@ -237,7 +237,7 @@ aliApses@subject@range@start + aliApses@subject@range@width - 1
# right away and store it in myDB. Copy the code-template below to your
# myCode.R file, edit it to replace the placeholder items with your data:
#
# - The <PROTEIN ID> is to be replaced with the ID of MBP1_YFO
# - The <PROTEIN ID> is to be replaced with the ID of MBP1_MYSPE
# - The <FEATURE ID> is to be replaced with the ID of "APSES fold"
# - <START> and <END> are to be replaced with the coordinates you got above
#
@ -277,7 +277,7 @@ myDB$proteinAnnotation[nrow(myDB$proteinAnnotation), ]
# If this is correct, save it
save(myDB, file = "myDB.02.RData") # Note that it gets a new version number!
# Done with this part. Copy the sequence of the APSES domain of MBP1_<YFO> - you
# Done with this part. Copy the sequence of the APSES domain of MBP1_MYSPE - you
# need it for the reverse BLAST search, and return to the course Wiki.

View File

@ -43,7 +43,7 @@ save(myDB, file = "myDB.04.RData") # save the new version
# from your myCode.R script. Here is again the table of feature IDs:
myDB$feature[ , c("ID", "name", "description")]
# Add every SMART annotated feaure for MBP1_YFO to the database. If you make
# Add every SMART annotated feaure for MBP1_MYSPE to the database. If you make
# mistakes, just reload the latest version (probably "myDB.04.RData"), then run
# your corrected annotation script again. Execute ...
myDB$proteinAnnotation

View File

@ -1,15 +1,15 @@
# BIN-YFO.R
# BIN-MYSPE.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-YFO unit
# R code accompanying the BIN-MYSPE unit
#
# Version: 1.0
#
# Date: 2017 09 21
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# V 1.0 Final code, after rewriting BLAST parser and creating current YFOlist
# V 0.1 First code copied from BCH441_A03_makeYFOlist.R
# V 1.0 Final code, after rewriting BLAST parser and creating current MYSPElist
# V 0.1 First code copied from BCH441_A03_makeMYSPElist.R
#
# TODO:
#
@ -23,17 +23,17 @@
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------
#TOC> 1 Preparations 38
#TOC> 2 Suitable YFO Species 50
#TOC> 3 Adopt "YFO" 64
#TOC>
#TOC> 2 Suitable MYSPE Species 50
#TOC> 3 Adopt "MYSPE" 64
#TOC>
#TOC> ==========================================================================
# = 1 Preparations ========================================================
#
@ -47,39 +47,39 @@ if (! exists("myStudentNumber")) {
}
# = 2 Suitable YFO Species ================================================
# = 2 Suitable MYSPE Species ==============================================
# In this unit we will select one species from a list of genome sequenced fungi
# and write it into your personalized profile file. This species will be called
# "YFO" (Your Favourite Organism) for other learning units and exercises.
# "MYSPE" (Your Favourite Organism) for other learning units and exercises.
# A detailed description of the process of compiling the list of genome
# sequenced fungi with protein annotations and Mbp1 homologues is in the file
# ABC-makeYFOlist.R
# ABC-makeMYSPElist.R
# Task: Study ABC-makeYFOlist.R, it implements a rather typical workflow of
# Task: Study ABC-makeMYSPElist.R, it implements a rather typical workflow of
# selecting and combining data from various public-domain data resources.
# = 3 Adopt "YFO" =========================================================
# = 3 Adopt "MYSPE" =======================================================
# In the code below, we load the resulting vector of species name, then pick one
# of them in a random but reproducible way, determined by your student number.
load("data/YFOspecies.RData") # load the species names
set.seed(myStudentNumber) # seed the random number generator
YFO <- sample(YFOspecies, 1) # pick a species at random
load("data/MYSPEspecies.RData") # load the species names
set.seed(myStudentNumber) # seed the random number generator
MYSPE <- sample(MYSPEspecies, 1) # pick a species at random
# write the result to your personalized profile data so we can use the result in
# other functions
cat(sprintf("YFO <- \"%s\"\n", YFO), file = ".myProfile.R", append = TRUE)
cat(sprintf("MYSPE <- \"%s\"\n", MYSPE), file = ".myProfile.R", append = TRUE)
YFO # so, which species is it ... ?
biCode(YFO) # and what is it's "BiCode" ... ?
MYSPE # so, which species is it ... ?
biCode(MYSPE) # and what is it's "BiCode" ... ?
# Task: Note down the species name and its five letter label on your Student
# Wiki user page. Use this species whenever this or future assignments refer
# to YFO. In code, we will automatically load it from your.myProfile.R file.
# to MYSPE. In code, we will automatically load it from your.myProfile.R file.
# [END]

View File

@ -41,7 +41,7 @@ list.files(pattern = "myDB.*")
load("myDB.05.RData")
# The database contains the ten Mbp1 orthologues from the reference species
# and the Mbp1 RBM for YFO.
# and the Mbp1 RBM for MYSPE.
#
# We will construct a phylogenetic tree from the proteins' APSES domains.
# You have annotated their ranges as a feature.

View File

@ -156,7 +156,7 @@ layout(matrix(1), widths=1.0, heights=1.0)
# ... or we can plot the tree so it corresponds as well as possible to a
# predefined tip ordering. Here we use the ordering that NCBI Global Tree
# returns for the reference species - we have used it above to make the vector
# apsMbp1Names. You inserted your YFO name into that vector - but you should
# apsMbp1Names. You inserted your MYSPE name into that vector - but you should
# move it to its correct position in the cladogram.
# (Nb. we need to reverse the ordering for the plot. This is why we use the

View File

@ -39,7 +39,7 @@ help(package = seqinr) # shows the available functions
?computePI
# This takes as input a vector of upper-case AA codes
# Let's retrieve the YFO sequence from our datamodel
# Let's retrieve the MYSPE sequence from our datamodel
# (assuming it is the last one that was added):
db$protein[nrow(db$protein), "sequence"]

View File

@ -23,9 +23,9 @@
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC>
#TOC> Section Title Line
#TOC> ------------------------------------------------------------
#TOC> 1 A Relational Datamodel in R: review 55
@ -48,9 +48,9 @@
#TOC> 3.3 Create an R script to create the database 522
#TOC> 3.3.1 Check and validate 542
#TOC> 3.4 Task: submit for credit (part 2/2) 583
#TOC>
#TOC>
#TOC> ==========================================================================
# = 1 A Relational Datamodel in R: review =================================
@ -203,7 +203,7 @@ str(philDB)
# go back, re-read, play with it, and ask for help. This is essential.
# === 1.1.1 completing the database
# === 1.1.1 completing the database
# Next I'll add one more person, and create the other two tables:
@ -362,7 +362,7 @@ dbSanitizeSequence(x)
# == 2.3 Create a protein table for our data model =========================
# === 2.3.1 Initialize the database
# === 2.3.1 Initialize the database
# The function dbInit contains all the code to return a list of empty
@ -374,7 +374,7 @@ myDB <- dbInit()
str(myDB)
# === 2.3.2 Add data
# === 2.3.2 Add data
# fromJSON() returns a dataframe that we can readily process to add data
@ -421,7 +421,7 @@ source("./scripts/ABC-createRefDB.R")
str(myDB)
# === 2.4.1 Examples of navigating the database
# === 2.4.1 Examples of navigating the database
# You can look at the contents of the tables in the usual way we access
@ -468,8 +468,8 @@ myDB$taxonomy$species[sel]
# = 3 Add your own data ===================================================
# You have chosen an organism as "YFO", and you final task will be to find the
# protein in YFO that is most similar to yeast Mbp1 and enter its information
# You have chosen an organism as "MYSPE", and you final task will be to find the
# protein in MYSPE that is most similar to yeast Mbp1 and enter its information
# into the database.
@ -483,7 +483,7 @@ myDB$taxonomy$species[sel]
# Protein BLAST.
# - Enter NP_010227 into the "Query Sequence" field.
# - Choose "Reference proteins (refseq_protein)" as the "Database".
# - Paste the YFO species name into the "Organism" field.
# - Paste the MYSPE species name into the "Organism" field.
#
# - Click "BLAST".
@ -493,28 +493,28 @@ myDB$taxonomy$species[sel]
# Otherwise, look for the top-hit in the "Alignments" section. In some cases
# there will be more than one hit with nearly similar E-values. If this is the
# case for YFO, choose the one with the higher degree of similarity (more
# case for MYSPE, choose the one with the higher degree of similarity (more
# identities) with the N-terminus of the query - i.e. the Query sequence of
# the first ~ 100 amino acids.
# - Follow the link to the protein data page, linked from "Sequence ID".
# - From there, in a separate tab, open the link to the taxonomy database page
# for YFO which is linked from the "ORGANISM" record.
# for MYSPE which is linked from the "ORGANISM" record.
# == 3.2 Put the information into JSON files ===============================
# - Next make a copy of the file "./data/MBP1_SACCE.json" in your project
# directory and give it a new name that corresponds to YFO - e.g. if
# YFO is called "Crptycoccus neoformans", your file should be called
# directory and give it a new name that corresponds to MYSPE - e.g. if
# MYSPE is called "Crptycoccus neoformans", your file should be called
# "MBP1_CRYNE.json"; in that case "MBP1_CRYNE" would also be the
# "name" of your protein. Open the file in the RStudio editor and replace
# all of the MBP1_SACCE data with the corresponding data of your protein.
#
# - Do a similar thing for the YFO taxonomy entry. Copy
# "./data/refTaxonomy.json" and make a new file named "YFOtaxonomy.json".
# Create a valid JSON file with only one single entry - that of YFO.
# - Do a similar thing for the MYSPE taxonomy entry. Copy
# "./data/refTaxonomy.json" and make a new file named "MYSPEtaxonomy.json".
# Create a valid JSON file with only one single entry - that of MYSPE.
#
# - Validate your two files online at https://jsonlint.com/
@ -529,7 +529,7 @@ myDB$taxonomy$species[sel]
# - than add the two commands that add your protein and taxonomy data,
# they should look like:
# myDB <- dbAddProtein( myDB, fromJSON("MBP1_<code>.json"))
# myDB <- dbAddTaxonomy( myDB, fromJSON("YFOtaxonomy.json"))
# myDB <- dbAddTaxonomy( myDB, fromJSON("MYSPEtaxonomy.json"))
#
# - save the file and source() it:
# source("makeProteinDB.R")
@ -539,12 +539,12 @@ myDB$taxonomy$species[sel]
# in any of the JSON files. Later you will add more information ...
# === 3.3.1 Check and validate
# === 3.3.1 Check and validate
# Is your protein named according to the pattern "MBP1_<YFO>"? It should be.
# Is your protein named according to the pattern "MBP1_MYSPE"? It should be.
# And does the taxonomy table contain the systematic name? It should be the same
# that you get when you type YFO into the console.
# that you get when you type MYSPE into the console.
# Let's compute sequence lengths on the fly (with the function nchar() ), and
# open this with the table viewer function View()
@ -562,18 +562,18 @@ View(cbind(myDB$protein[ , c("ID", "name", "RefSeqID")],
myDB$protein$sequence[nrow(myDB$protein)]
# If not, don't continue! Fix the problem first.
# Let me repeat: If this does not give you the right sequence of the YFO
# Let me repeat: If this does not give you the right sequence of the MYSPE
# Mbp1 homologue, DO NOT CONTINUE. Fix the problem.
# Is that the right taxonomy ID and binomial name for YFO?
sel <- myDB$taxonomy$species == YFO
# Is that the right taxonomy ID and binomial name for MYSPE?
sel <- myDB$taxonomy$species == MYSPE
myDB$taxonomy[sel, ]
# If not, or if the result was "<0 rows> ... " then DO NOT CONTINUE.
# Fix the problem first.
# Does this give you the right refseq ID for MBP1_<YFO>?
sel <- myDB$protein$name == paste0("MBP1_", biCode(YFO))
# Does this give you the right refseq ID for MBP1_MYSPE?
sel <- myDB$protein$name == paste0("MBP1_", biCode(MYSPE))
myDB$protein$RefSeqID[sel]
# If not, or if the result was "<0 rows> ... " then DO NOT CONTINUE.
@ -589,8 +589,8 @@ myDB$protein$RefSeqID[sel]
# page on the Student Wiki
# - Execute the two commands below and show the result on your submission page
biCode(myDB$taxonomy$species) %in% biCode(YFO)
myDB$protein$taxonomyID %in% myDB$taxonomy$ID[(myDB$taxonomy$species == YFO)]
biCode(myDB$taxonomy$species) %in% biCode(MYSPE)
myDB$protein$taxonomyID %in% myDB$taxonomy$ID[(myDB$taxonomy$species == MYSPE)]
# That is all.

BIN
data/MYSPEspecies.RData Normal file

Binary file not shown.

Binary file not shown.