Changing "YFO" to "MYSPE"

2017-10-03 23:38:48 -04:00
parent a83f2166a5
commit 9ac45565f4
12 changed files with 102 additions and 93 deletions
--- a/.init.R
+++ b/.init.R
@@ -24,6 +24,15 @@ if (! file.exists(".myProfile.R")) {
  rm(e, n, conn)
 }
 # Patch YFO -> MYSPE if necessary:
 tmp <- readLines(".myProfile.R")
 if (length(grep("^YFO", tmp)) > 0) {
  idx <- grep("^YFO", tmp)
  tmp[idx] <- gsub("^YFO", "MYSPE", tmp[idx])
  writeLines(tmp, ".myProfile.R")
 }
 rm(tmp)
 source(".myProfile.R")
 source(".utilities.R")
--- a/ABC-makeMYSPElist.R
+++ b/ABC-makeMYSPElist.R
@@ -1,9 +1,9 @@
-# ABC_makeYFOlist.R
+# ABC-makeMYSPElist.R
 #
 # Purpose:  Create a list of genome sequenced fungi with protein annotations and
 #               Mbp1 homologues.
 #
-# Version: 1.1
+# Version: 1.1.1
 #
 # Date:    2016 09 - 2017 09
 # Author:  Boris Steipe (boris.steipe@utoronto.ca)
@@ -53,8 +53,8 @@
 # =    1  The strategy  ========================================================
-# This script will create a list of "YFO" species and save it in an R object
+# This script will create a list of "MYSPE" species and save it in an R object
-# YFOspecies that is stored in the data subdirectory of this project from where
+# MYSPEspecies that is stored in the data subdirectory of this project from where
 # it can be loaded. The strategy is as follows: we download a list of all
 # genome projects and then select species for which protein annotations are
 # available - i.e. these are all genome-sequenced species that have been
@@ -251,7 +251,7 @@ length(BLASTspecies)
 # etc. See here:
 ?union
-YFOspecies <- intersect(GOLDspecies, BLASTspecies)
+MYSPEspecies <- intersect(GOLDspecies, BLASTspecies)
 # Again: interpret this:
 #  - what is the number of GOLDspecies?
@@ -272,9 +272,9 @@ YFOspecies <- intersect(GOLDspecies, BLASTspecies)
 REFspecies
-YFOspecies <- sort(setdiff(YFOspecies, REFspecies))
+MYSPEspecies <- sort(setdiff(MYSPEspecies, REFspecies))
-# save(YFOspecies, file = "data/YFOspecies.RData")
+# save(MYSPEspecies, file = "data/MYSPEspecies.RData")
--- a/BIN-ALI-Dotplot.R
+++ b/BIN-ALI-Dotplot.R
@@ -46,31 +46,31 @@ data(BLOSUM62)
 sel <- myDB$protein$name == "MBP1_SACCE"
 MBP1_SACCE <- s2c(myDB$protein$sequence[sel])
-sel <- myDB$protein$name == paste("MBP1_", biCode(YFO), sep = "")
+sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
-MBP1_YFO <- s2c(myDB$protein$sequence[sel])
+MBP1_MYSPE <- s2c(myDB$protein$sequence[sel])
 # Check that we have two character vectors of the expected length.
 str(MBP1_SACCE)
-str(MBP1_YFO)
+str(MBP1_MYSPE)
 # How do we get the pairscore values? Consider: a single pair of amino acids can
-# be obtained from sequence SACCE and YFO eg. from position 13 and 21 ...
+# be obtained from sequence SACCE and MYSPE eg. from position 13 and 21 ...
 MBP1_SACCE[13]
-MBP1_YFO[21]
+MBP1_MYSPE[21]
 # ... using these as subsetting expressions, we can pull the pairscore
 # from the MDM
-BLOSUM62[MBP1_SACCE[13], MBP1_YFO[21]]
+BLOSUM62[MBP1_SACCE[13], MBP1_MYSPE[21]]
 # First we build an empty matrix that will hold all pairscores ...
-dotMat <- matrix(numeric(length(MBP1_SACCE) * length(MBP1_YFO)),
+dotMat <- matrix(numeric(length(MBP1_SACCE) * length(MBP1_MYSPE)),
-                 nrow = length(MBP1_SACCE), ncol = length(MBP1_YFO))
+                 nrow = length(MBP1_SACCE), ncol = length(MBP1_MYSPE))
 # ... then we loop over the sequences and store the scores in the matrix.
 #
 for (i in 1:length(MBP1_SACCE)) {
-  for (j in 1:length(MBP1_YFO)) {
+  for (j in 1:length(MBP1_MYSPE)) {
-    dotMat[i, j] <- BLOSUM62[MBP1_SACCE[i], MBP1_YFO[j]]
+    dotMat[i, j] <- BLOSUM62[MBP1_SACCE[i], MBP1_MYSPE[j]]
  }
 }
@@ -80,7 +80,7 @@ for (i in 1:length(MBP1_SACCE)) {
 dotMat[1:10, 1:10]
 # Rows in this matrix correspond to an amino acid from MBP1_SACCE, columns in
-# the matrix correspond to an amino acid from MBP1_YFO.
+# the matrix correspond to an amino acid from MBP1_MYSPE.
 # To plot this, we use the image() function. Here, with default parameters.
@@ -110,13 +110,13 @@ image(x = 1:200, y = 1:200,  dotMat[1:200, 1:200], ylim=c(200,1))
 # ... and labels! Axis labels would be nice ...
 image(x = 1:200, y = 1:200,  dotMat[1:200, 1:200], ylim=c(200,1),
-      xlab = "MBP1_YFO", ylab = "MBP1_SACCE" )
+      xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE" )
 # ... and why don't we have axis-numbers on all four sides? Go, make that right
 # too ...
 len <- 200
 image(x = 1:len, y = 1:len,  dotMat[1:len, 1:len], ylim=c(len,1),
-      xlab = "MBP1_YFO", ylab = "MBP1_SACCE", axes = FALSE)
+      xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE", axes = FALSE)
 box()
 axis(1, at = c(1, seq(10, len, by=10)))
 axis(2, at = c(1, seq(10, len, by=10)))
@@ -129,8 +129,8 @@ axis(4, at = c(1, seq(10, len, by=10)))
 # utilities file and called it dotPlot2(). Why not dotPlot() ... that's because
 # there already is a dotplot function in the seqinr package:
-dotPlot(MBP1_SACCE, MBP1_YFO)                                 # seqinr
+dotPlot(MBP1_SACCE, MBP1_MYSPE)                                 # seqinr
-dotPlot2(MBP1_SACCE, MBP1_YFO, xlab = "SACCE", ylab = "YFO")  # Our's
+dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE")  # Our's
 # Which one do you prefer? You can probably see the block patterns that arise
 # from segments of repetitive, low complexity sequence. But you probably have to
@@ -153,7 +153,7 @@ myFilter[5, ] <- c( 0, 0, 0, 0, 1)
 # I have added the option to read such filters (or others that you could define on your own) as a parameter of the function.
-dotPlot2(MBP1_SACCE, MBP1_YFO, xlab = "SACCE", ylab = "YFO", f = myFilter)
+dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE", f = myFilter)
 # I think the result shows quite nicely how the two sequences are globally
 # related and where the regions of sequence similarity are. Play with this a bit
--- a/BIN-ALI-Optimal_sequence_alignment.R
+++ b/BIN-ALI-Optimal_sequence_alignment.R
@@ -52,8 +52,8 @@ toString(s)      # using the Biostrings function toString()
 sel <- myDB$protein$name == "MBP1_SACCE"
 aaMBP1_SACCE <- AAString(myDB$protein$sequence[sel])
-sel <- myDB$protein$name == paste("MBP1_", biCode(YFO), sep = "")
+sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
-aaMBP1_YFO <-   AAString(myDB$protein$sequence[sel])
+aaMBP1_MYSPE <-   AAString(myDB$protein$sequence[sel])
 ?pairwiseAlignment
@@ -61,7 +61,7 @@ aaMBP1_YFO <-   AAString(myDB$protein$sequence[sel])
 # Global optimal alignment with end-gap penalties is default. (like EMBOSS needle)
 ali1 <-  pairwiseAlignment(
  aaMBP1_SACCE,
-  aaMBP1_YFO,
+  aaMBP1_MYSPE,
  substitutionMatrix = "BLOSUM62",
  gapOpening = 10,
  gapExtension = 0.5)
@@ -110,7 +110,7 @@ percentID(ali1)
 # Compare with local optimal alignment (like EMBOSS Water)
 ali2 <-  pairwiseAlignment(
  aaMBP1_SACCE,
-  aaMBP1_YFO,
+  aaMBP1_MYSPE,
  type = "local",
  substitutionMatrix = "BLOSUM62",
  gapOpening = 50,
@@ -135,7 +135,7 @@ percentID(ali2)
 #        PART FOUR: APSES Domain annotation by alignment
 # ==============================================================================
-# In this section we define the YFO APSES sequence by performing a global,
+# In this section we define the MYSPE APSES sequence by performing a global,
 # optimal sequence alignment of the yeast domain with the full length protein
 # sequence of the protein that was the most similar to the yeast APSES domain.
 #
@@ -190,11 +190,11 @@ aaMB1_SACCE_APSES <- AAString(dbGetFeatureSequence(myDB,
                                                   "MBP1_SACCE",
                                                   "APSES fold"))
-# To align, we need the YFO sequence. Here is it's definition again, just
+# To align, we need the MYSPE sequence. Here is its definition again, just
 # in case ...
-sel <- myDB$protein$name == paste("MBP1_", biCode(YFO), sep = "")
+sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
-aaMBP1_YFO <- AAString(myDB$protein$sequence[sel])
+aaMBP1_MYSPE <- AAString(myDB$protein$sequence[sel])
 # Now let's align these two sequences of very different length without end-gap
 # penalties using the "overlap" type. "overlap" turns the
@@ -203,7 +203,7 @@ aaMBP1_YFO <- AAString(myDB$protein$sequence[sel])
 aliApses <-  pairwiseAlignment(
  aaMB1_SACCE_APSES,
-  aaMBP1_YFO,
+  aaMBP1_MYSPE,
  type = "overlap",
  substitutionMatrix = "BLOSUM62",
  gapOpening = 10,
@@ -237,7 +237,7 @@ aliApses@subject@range@start + aliApses@subject@range@width - 1
 # right away and store it in myDB.  Copy the code-template below to your
 # myCode.R file, edit it to replace the placeholder items with your data:
 #
-#  - The <PROTEIN ID> is to be replaced with the ID of MBP1_YFO
+#  - The <PROTEIN ID> is to be replaced with the ID of MBP1_MYSPE
 #  - The <FEATURE ID> is to be replaced with the ID of "APSES fold"
 #  - <START> and <END> are to be replaced with the coordinates you got above
 #
@@ -277,7 +277,7 @@ myDB$proteinAnnotation[nrow(myDB$proteinAnnotation), ]
 # If this is correct, save it
 save(myDB, file = "myDB.02.RData")  # Note that it gets a new version number!
-# Done with this part. Copy the sequence of the APSES domain of MBP1_<YFO> - you
+# Done with this part. Copy the sequence of the APSES domain of MBP1_MYSPE - you
 # need it for the reverse BLAST search, and return to the course Wiki.
--- a/BIN-FUNC-Domain_annotation.R
+++ b/BIN-FUNC-Domain_annotation.R
@@ -43,7 +43,7 @@ save(myDB, file = "myDB.04.RData") # save the new version
 # from your myCode.R script. Here is again the table of feature IDs:
 myDB$feature[ , c("ID", "name", "description")]
-# Add every SMART annotated feaure for MBP1_YFO to the database. If you make
+# Add every SMART annotated feature for MBP1_MYSPE to the database. If you make
 # mistakes, just reload the latest version (probably "myDB.04.RData"), then run
 # your corrected annotation script again. Execute ...
 myDB$proteinAnnotation
--- a/BIN-MYSPE.R
+++ b/BIN-MYSPE.R
@@ -1,15 +1,15 @@
-# BIN-YFO.R
+# BIN-MYSPE.R
 #
 # Purpose: A Bioinformatics Course:
-#              R code accompanying the BIN-YFO unit
+#              R code accompanying the BIN-MYSPE unit
 #
 # Version: 1.0
 #
 # Date:    2017  09  21
 # Author:  Boris Steipe (boris.steipe@utoronto.ca)
 #
-# V 1.0    Final code, after rewriting BLAST parser and creating current YFOlist
+# V 1.0    Final code, after rewriting BLAST parser and creating current MYSPElist
-# V 0.1    First code copied from BCH441_A03_makeYFOlist.R
+# V 0.1    First code copied from BCH441_A03_makeMYSPElist.R
 #
 # TODO:
 #
@@ -29,8 +29,8 @@
 #TOC>   Section  Title                   Line
 #TOC> ---------------------------------------
 #TOC>   1        Preparations              38
-#TOC>   2        Suitable YFO Species      50
+#TOC>   2        Suitable MYSPE Species    50
-#TOC>   3        Adopt "YFO"               64
+#TOC>   3        Adopt "MYSPE"             64
 #TOC>
 #TOC> ==========================================================================
@@ -47,39 +47,39 @@ if (! exists("myStudentNumber")) {
 }
-# =    2  Suitable YFO Species  ================================================
+# =    2  Suitable MYSPE Species  ==============================================
 # In this unit we will select one species from a list of genome sequenced fungi
 # and write it into your personalized profile file. This species will be called
-# "YFO" (Your Favourite Organism) for other learning units and exercises.
+# "MYSPE" (My Species) for other learning units and exercises.
 # A detailed description of the process of compiling the list of genome
 # sequenced fungi with protein annotations and Mbp1 homologues is in the file
-# ABC-makeYFOlist.R
+# ABC-makeMYSPElist.R
-# Task: Study ABC-makeYFOlist.R, it implements a rather typical workflow of
+# Task: Study ABC-makeMYSPElist.R, it implements a rather typical workflow of
 # selecting and combining data from various public-domain data resources.
-# =    3  Adopt "YFO"  =========================================================
+# =    3  Adopt "MYSPE"  =======================================================
 # In the code below, we load the resulting vector of species name, then pick one
 # of them in a random but reproducible way, determined by your student number.
-load("data/YFOspecies.RData")  # load the species names
+load("data/MYSPEspecies.RData")     # load the species names
-set.seed(myStudentNumber)      # seed the random number generator
+set.seed(myStudentNumber)           # seed the random number generator
-YFO <- sample(YFOspecies, 1)   # pick a species at random
+MYSPE <- sample(MYSPEspecies, 1)    # pick a species at random
 # write the result to your personalized profile data so we can use the result in
 # other functions
-cat(sprintf("YFO <- \"%s\"\n", YFO), file = ".myProfile.R", append = TRUE)
+cat(sprintf("MYSPE <- \"%s\"\n", MYSPE), file = ".myProfile.R", append = TRUE)
-YFO         # so, which species is it ... ?
+MYSPE         # so, which species is it ... ?
-biCode(YFO) # and what is it's "BiCode" ... ?
+biCode(MYSPE) # and what is it's "BiCode" ... ?
 # Task: Note down the species name and its five letter label on your Student
 # Wiki user page. Use this species whenever this or future assignments refer
-# to YFO. In code, we will automatically load it from your.myProfile.R file.
+# to MYSPE. In code, we will automatically load it from your .myProfile.R file.
 # [END]
--- a/BIN-PHYLO-Data_preparation.R
+++ b/BIN-PHYLO-Data_preparation.R
@@ -41,7 +41,7 @@ list.files(pattern = "myDB.*")
 load("myDB.05.RData")
 # The database contains the ten Mbp1 orthologues from the reference species
-# and the Mbp1 RBM for YFO.
+# and the Mbp1 RBM for MYSPE.
 #
 # We will construct a phylogenetic tree from the proteins' APSES domains.
 # You have annotated their ranges as a feature.
--- a/BIN-PHYLO-Tree_analysis.R
+++ b/BIN-PHYLO-Tree_analysis.R
@@ -156,7 +156,7 @@ layout(matrix(1), widths=1.0, heights=1.0)
 # ... or we can plot the tree so it corresponds as well as possible to a
 # predefined tip ordering. Here we use the ordering that NCBI Global Tree
 # returns for the reference species - we have used it above to make the vector
-# apsMbp1Names. You inserted your YFO name into that vector - but you should
+# apsMbp1Names. You inserted your MYSPE name into that vector - but you should
 # move it to its correct position in the cladogram.
 # (Nb. we need to reverse the ordering for the plot. This is why we use the
--- a/BIN-SEQA-Comparison.R
+++ b/BIN-SEQA-Comparison.R
@@ -39,7 +39,7 @@ help(package = seqinr) # shows the available functions
 ?computePI
 # This takes as input a vector of upper-case AA codes
-# Let's retrieve the YFO sequence from our datamodel
+# Let's retrieve the MYSPE sequence from our datamodel
 # (assuming it is the last one that was added):
 db$protein[nrow(db$protein), "sequence"]
--- a/BIN-Storing_data.R
+++ b/BIN-Storing_data.R
@@ -468,8 +468,8 @@ myDB$taxonomy$species[sel]
 # =    3  Add your own data  ===================================================
-# You have chosen an organism as "YFO", and you final task will be to find the
+# You have chosen an organism as "MYSPE", and your final task will be to find the
-# protein in YFO that is most similar to yeast Mbp1 and enter its information
+# protein in MYSPE that is most similar to yeast Mbp1 and enter its information
 # into the database.
@@ -483,7 +483,7 @@ myDB$taxonomy$species[sel]
 #   Protein BLAST.
 # - Enter NP_010227 into the "Query Sequence" field.
 # - Choose "Reference proteins (refseq_protein)" as the "Database".
-# - Paste the YFO species name into the "Organism" field.
+# - Paste the MYSPE species name into the "Organism" field.
 #
 # - Click "BLAST".
@@ -493,28 +493,28 @@ myDB$taxonomy$species[sel]
 # Otherwise, look for the top-hit in the "Alignments" section. In some cases
 # there will be more than one hit with nearly similar E-values. If this is the
-# case for YFO, choose the one with the higher degree of similarity (more
+# case for MYSPE, choose the one with the higher degree of similarity (more
 # identities) with the N-terminus of the query - i.e. the Query sequence of
 # the first ~ 100 amino acids.
 # -  Follow the link to the protein data page, linked from "Sequence ID".
 # -  From there, in a separate tab, open the link to the taxonomy database page
-#      for YFO which is linked from the "ORGANISM" record.
+#      for MYSPE which is linked from the "ORGANISM" record.
 # ==   3.2  Put the information into JSON files  ===============================
 # - Next make a copy of the file "./data/MBP1_SACCE.json" in your project
-#     directory and give it a new name that corresponds to YFO - e.g. if
+#     directory and give it a new name that corresponds to MYSPE - e.g. if
-#     YFO is called "Crptycoccus neoformans", your file should be called
+#     MYSPE is called "Cryptococcus neoformans", your file should be called
 #     "MBP1_CRYNE.json"; in that case "MBP1_CRYNE" would also be the
 #     "name" of your protein. Open the file in the RStudio editor and replace
 #     all of the MBP1_SACCE data with the corresponding data of your protein.
 #
-# - Do a similar thing for the YFO taxonomy entry. Copy
+# - Do a similar thing for the MYSPE taxonomy entry. Copy
-#     "./data/refTaxonomy.json" and make a new file named "YFOtaxonomy.json".
+#     "./data/refTaxonomy.json" and make a new file named "MYSPEtaxonomy.json".
-#     Create a valid JSON file with only one single entry - that of YFO.
+#     Create a valid JSON file with only one single entry - that of MYSPE.
 #
 # - Validate your two files online at https://jsonlint.com/
@@ -529,7 +529,7 @@ myDB$taxonomy$species[sel]
 # - than add the two commands that add your protein and taxonomy data,
 #     they should look like:
 #     myDB <- dbAddProtein(    myDB, fromJSON("MBP1_<code>.json"))
-#     myDB <- dbAddTaxonomy(   myDB, fromJSON("YFOtaxonomy.json"))
+#     myDB <- dbAddTaxonomy(   myDB, fromJSON("MYSPEtaxonomy.json"))
 #
 # - save the file and source() it:
 #     source("makeProteinDB.R")
@@ -542,9 +542,9 @@ myDB$taxonomy$species[sel]
 # ===  3.3.1  Check and validate
-# Is your protein named according to the pattern "MBP1_<YFO>"? It should be.
+# Is your protein named according to the pattern "MBP1_MYSPE"? It should be.
 # And does the taxonomy table contain the systematic name? It should be the same
-# that you get when you type YFO into the console.
+# that you get when you type MYSPE into the console.
 # Let's compute sequence lengths on the fly (with the function nchar() ), and
 # open this with the table viewer function View()
@@ -562,18 +562,18 @@ View(cbind(myDB$protein[ , c("ID", "name", "RefSeqID")],
 myDB$protein$sequence[nrow(myDB$protein)]
 # If not, don't continue! Fix the problem first.
-# Let me repeat: If this does not give you the right sequence of the YFO
+# Let me repeat: If this does not give you the right sequence of the MYSPE
 #                Mbp1 homologue, DO NOT CONTINUE. Fix the problem.
-# Is that the right taxonomy ID and binomial name for YFO?
+# Is that the right taxonomy ID and binomial name for MYSPE?
-sel <- myDB$taxonomy$species == YFO
+sel <- myDB$taxonomy$species == MYSPE
 myDB$taxonomy[sel, ]
 # If not, or if the result was "<0 rows> ... " then DO NOT CONTINUE.
 # Fix the problem first.
-# Does this give you the right refseq ID for MBP1_<YFO>?
+# Does this give you the right refseq ID for MBP1_MYSPE?
-sel <- myDB$protein$name == paste0("MBP1_", biCode(YFO))
+sel <- myDB$protein$name == paste0("MBP1_", biCode(MYSPE))
 myDB$protein$RefSeqID[sel]
 # If not, or if the result was "<0 rows> ... " then DO NOT CONTINUE.
@@ -589,8 +589,8 @@ myDB$protein$RefSeqID[sel]
 #     page on the Student Wiki
 # - Execute the two commands below and show the result on your submission page
-biCode(myDB$taxonomy$species) %in% biCode(YFO)
+biCode(myDB$taxonomy$species) %in% biCode(MYSPE)
-myDB$protein$taxonomyID %in% myDB$taxonomy$ID[(myDB$taxonomy$species == YFO)]
+myDB$protein$taxonomyID %in% myDB$taxonomy$ID[(myDB$taxonomy$species == MYSPE)]
 # That is all.
--- a/data/MYSPEspecies.RData
+++ b/data/MYSPEspecies.RData
--- a/data/YFOspecies.RData
+++ b/data/YFOspecies.RData