Updated MYSPE data and entire workflow. Changed all .RData to .rds

2020-09-21 14:28:24 +10:00
parent 3d91337e70
commit 7536473c5d
30 changed files with 1509 additions and 247 deletions
--- a/.myProfile.R
+++ b/.myProfile.R
@@ -6,9 +6,6 @@
 #                            UofT eMail address.
 # myStudentNumber  numeric   Your UofT student number. Take care to have this
 #                            correct.
 # MYSPE            char      A string with the species name of a genome-
 #                            sequenced fungus. Which species this is will be
 #                            determined in the BIN-MYSPE unit.
 #
 # NOTE:
 # After you have updated this script, save the file in your "myScripts" folder.
@@ -18,6 +15,5 @@
 myEMail <- "<your-e-mail-address-here>"        # e.g. "u.franklin@utoronto.ca"
 myStudentNumber <- <your-student-number-here>  # e.g. 1003141592
 MYSPE <- "<your-adopted-species-here>"         # e.g. "Neurospora crassa OR74A"
 # [END]
--- a/.utilities.R
+++ b/.utilities.R
@@ -1,11 +1,12 @@
-# .utilities.R
+# tocID <- "./.utilities.R"
 #
 # Miscellaneous R code to suppport the project
 #
-# Version: 1.3.1
+# Version: 1.4
-# Date:    2017  09 - 2019  11
+# Date:    2017-09 - 2020-09
 # Author:  Boris Steipe
 #
 # V 1.4    Maintenance
 # V 1.3.1  prefix Biostrings:: to subseq()
 # V 1.3    load msa support functions
 # V 1.2    update database utilities to support 2017 version of JSON sources
@@ -17,15 +18,39 @@
 #
 # ==============================================================================
-# ====== SCRIPTS =============================================================
+
 #TOC> ==========================================================================
 #TOC> 
 #TOC>   Section  Title                                       Line
 #TOC> -----------------------------------------------------------
 #TOC>   1        SCRIPTS TO SOURCE                             42
 #TOC>   2        SUPPORT FUNCTIONS                             49
 #TOC>   2.1        objectInfo()                                52
 #TOC>   2.2        biCode()                                    80
 #TOC>   2.3        pBar()                                     114
 #TOC>   2.4        waitTimer()                                136
 #TOC>   2.5        fetchMSAmotif()                            164
 #TOC>   2.6        H() (Shannon entropy)                      208
 #TOC>   3        DATA                                         222
 #TOC>   3.1        REFspecies                                 224
 #TOC>   4        FUNCTIONS TO CUSTOMIZE ASSIGNMENTS           239
 #TOC>   4.1        getMYSPE()                                 242
 #TOC>   4.2        selectPDBrep()                             251
 #TOC> 
 #TOC> ==========================================================================
 # =    1  SCRIPTS TO SOURCE  ===================================================
 source("./scripts/ABC-dbUtilities.R")
 source("./scripts/ABC-writeALN.R")
 source("./scripts/ABC-writeMFA.R")
-# ====== SUPPORT FUNCTIONS =====================================================
+# =    2  SUPPORT FUNCTIONS  ===================================================
 # ==   2.1  objectInfo()  ======================================================
 objectInfo <- function(x) {
    # Function to combine various information items about R objects
    #
@@ -53,6 +78,7 @@ objectInfo <- function(x) {
 }
 # ==   2.2  biCode()  ==========================================================
 biCode <- function(s) {
  # Make a 5 character "biCode" from a binomial name by concatening
  # the uppercased first three letter of the first word and the first
@@ -86,6 +112,7 @@ biCode <- function(s) {
 }
 # ==   2.3  pBar()  ============================================================
 pBar <- function(i, l, nCh = 50) {
  # Draw a progress bar in the console
  # i: the current iteration
@@ -107,6 +134,7 @@ pBar <- function(i, l, nCh = 50) {
 }
 # ==   2.4  waitTimer()  =======================================================
 waitTimer <- function(t, nIntervals = 50) {
  # pause and wait for t seconds and display a progress bar as
  # you are waiting
@@ -134,6 +162,7 @@ waitTimer <- function(t, nIntervals = 50) {
 }
 # ==   2.5  fetchMSAmotif()  ===================================================
 fetchMSAmotif <- function(ali, mot) {
  # Retrieve a subset from ali that spans the sequence in mot.
  # Biostrings package must be installed.
@@ -177,40 +206,23 @@ fetchMSAmotif <- function(ali, mot) {
 }
-
+# ==   2.6  H() (Shannon entropy)  =============================================
-
+H <- function(x, N) {
-# ====== PDB ID selection ======================================================
+  # calculate the Shannon entropy of the vector x given N possible states
-
+  # (in bits).
-selectPDBrep <- function(n, seed = as.numeric(Sys.time())) {
+  # H(x) = - sum_i(P(x_i) * log2(P(x_i)); 0 * log(0) == 0
-  # Select n PDB IDs from a list of high-resolution, non-homologous, single
+  t <- table(x)
-  # domain, single chain structure files that represent a CATH topology
+  if (missing(N)) { N <- length(t) }
-  # group.
+  if (length(t) > N ) { stop("N can't be smaller than observed states.") }
-  # Parameters:
+  h <- sum(- (t / length(x)) * log2(t / length(x)))
-  #   n     num     number of IDs to return
+  return(h)
  #   seed  num     a seed for the RNG
  #
  # Value:          char  PDB IDs
  #
  # Note: the list is loaded from an RData file in the "./data" directory.
  # If you use this function for a course submissio, it MUST be invoked as:
  #
  #         selectPDBrep(n, seed = myStudentNumber)
  #
  # ... and myStudentNumber MUST be correctly initialized
  load("./data/pdbRep.RData")  # loads pdbRep
  if (n > length(pdbRep)) {
    stop(sprintf("There are only %d PDB IDs in the table to choose from.",
                 length(pdbRep)))
  }
  set.seed(seed)
  return(sample(pdbRep, n))
 }
 # ====== DATA ==================================================================
 # =    3  DATA  ================================================================
 # ==   3.1  REFspecies  ========================================================
 # 10 species of fungi for reference analysis.
 # http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
 REFspecies <- c("Aspergillus nidulans",
@@ -222,8 +234,52 @@ REFspecies <- c("Aspergillus nidulans",
                "Saccharomyces cerevisiae",
                "Schizosaccharomyces pombe",
                "Ustilago maydis",
-                "Wallemia mellicola"
+                "Wallemia mellicola")
-)
+
 # =    4  FUNCTIONS TO CUSTOMIZE ASSIGNMENTS  ==================================
 # ==   4.1  getMYSPE()  ========================================================
 getMYSPE <- function(x) {
  dat <- readRDS("./data/sDat.rds")
  map <- readRDS("./data/MYSPEmap.rds")
  key <- gsub(".+(....).$", "\\1", x)
  return(dat$species[map[key, "iMYSPE"]])
 }
 # ==   4.2  selectPDBrep()  ====================================================
 selectPDBrep <- function(n, seed) {
  # Select n PDB IDs from a list of high-resolution, non-homologous, single
  # domain, single chain structure files that represent a CATH topology
  # group.
  # Parameters:
  #   n     num     number of IDs to return
  #   seed  num     a seed for the RNG
  #
  # Value:          char  PDB IDs
  #
  # Note: the list is loaded from an .rds file in the "./data" directory.
  # If you use this function for a course submission, it MUST be invoked as:
  #
  #         selectPDBrep(n, seed = myStudentNumber)
  #
  # ... and myStudentNumber MUST be correctly initialized
  pdbRep <- readRDS("./data/pdbRep.rds")  # loads pdbRep
  if (n > length(pdbRep)) {
    stop(sprintf("There are only %d PDB IDs in the table to choose from.",
                 length(pdbRep)))
  }
  oldSeed <- .Random.seed
  set.seed(seed)
  PDBset <- sample(pdbRep, n)
  .Random.seed <- oldSeed
  return(PDBset)
 }
 # [END]
--- a/BIN-MYSPE.R
+++ b/BIN-MYSPE.R
@@ -9,11 +9,12 @@
 # Purpose: A Bioinformatics Course:
 #              R code accompanying the BIN-MYSPE unit
 #
-# Version: 1.0.1
+# Version: 1.1
 #
-# Date:    2017  09  21
+# Date:    2020-09-18
 # Author:  Boris Steipe (boris.steipe@utoronto.ca)
 #
 # V 1.1    2020 Workflow changes
 # V 1.0.1  Move ABC-makeMYSPElist.R to ./scripts directory
 # V 1.0    Final code, after rewriting BLAST parser and updating MYSPElist
 # V 0.1    First code copied from BCH441_A03_makeMYSPElist.R
@@ -36,9 +37,9 @@
 #TOC> 
 #TOC>   Section  Title                           Line
 #TOC> -----------------------------------------------
-#TOC>   1        Preparations                      39
+#TOC>   1        Preparations                      47
-#TOC>   2        Suitable MYSPE Species            51
+#TOC>   2        Suitable MYSPE Species            59
-#TOC>   3        Adopt "MYSPE"                     65
+#TOC>   3        Adopt "MYSPE"                     83
 #TOC> 
 #TOC> ==========================================================================
@@ -47,7 +48,7 @@
 #
 # Execute the two conditionals below:
-if (! file.exists(".myProfile.R")) {
+if (! file.exists("scripts/.myProfile.R")) {
  stop("PANIC: profile file does not exist. Fix problem or ask for help.")
 }
 if (! exists("myStudentNumber")) {
@@ -64,31 +65,80 @@ if (! exists("myStudentNumber")) {
 # A detailed description of the process of compiling the list of genome
 # sequenced fungi with protein annotations and Mbp1 homologues is in the file
-# ./scripts/ABC-makeMYSPElist.R
+# ./scripts/ABC-makeMYSPElist.R  In brief, data for genome-sequenced fungi
 # was retrieved from https://fungi.ensembl.org; a search for homologues to
 # yeast Mbp1 was performed with BLAST at the NCBI, and the data was merged.
 # A representative organism at each genus-level was chosen from those hits
 # that actual;ly have a homologue. Finally, a mapping table was constructed to
 # asymmetrically retrieve unique species: a student number will retrieve
 # a species, but (public) knowledge of the species cannot reconstruct the
 # student number.
 # Task: Study ./scripts/ABC-makeMYSPElist.R, it implements a typical workflow
-#       of selecting and combining data from  public-domain data resources.
+#       of selecting and combining data from various data resources. Studying
 #       it will give you a better sense of how such workflows can be
 #       implemented in practice.
 # =    3  Adopt "MYSPE"  =======================================================
 # Execute:
 ( MYSPE <- getMYSPE(myStudentNumber) )
-# In the code below, we load the resulting vector of species name, then pick one
+# If this produced an error, this session has not been properly set up. You
-# of them in a random but reproducible way, determined by your student number.
+# may not yet have run  init()  and edited  .myProfile.R , or that file is not
 # in your  myScripts/  folder. Fix this, and execute  source(".Rprofile") .
-load("data/MYSPEspecies.RData")     # load the species names
+# If this produced NA, your Student Number may not be correct, or you are not
-set.seed(myStudentNumber)           # seed the random number generator
+# in my class-list. Contact me.
-MYSPE <- sample(MYSPEspecies, 1)    # pick a species at random
+# Otherwise, this should have printed a species name. Your unique species
-set.seed(NULL)                      # reset the random number generator
+# for this course.
 # write the result to your personalized profile data so we can use the result in
 # other functions
 cat(sprintf("MYSPE <- \"%s\"\n", MYSPE), file = ".myProfile.R", append = TRUE)
 MYSPE         # so, which species is it ... ?
 biCode(MYSPE) # and what is it's "BiCode" ... ?
-# Task: Note down the species name and its five letter label on your Student
+# Task: Note down the species name and its five letter BiCode on your Student
 # Wiki user page. Use this species whenever this or future assignments refer
-# to MYSPE. In code, we will automatically load it from your .myProfile.R file.
+# to MYSPE. Whenever you start a session, it will automatically be loaded
 # from  myScripts/.myProfile.R  and is available as  MYSPE .
 # Here is some more information:
 fungiDat <- read.csv("data/Species.csv")
 # number of sequenced fungal genomes:
 nrow(fungiDat)
 # sequenced genomes of species:
 sel <- MYSPE == gsub("^(\\S+\\s\\S+).*$", "\\1", fungiDat$Name)
 ( x <- fungiDat[sel, "Name"] )
 Nspc <- length(x)  # save this for later ...
 # sequenced genomes of genus:
 sel <- gsub("\\s.*", "", MYSPE) == gsub("\\s.*", "", fungiDat$Name)
 ( x <- fungiDat[sel, "Name"] )
 Ngen <- length(x) - Nspc
 # order:
 ( x <- unique(fungiDat[sel, "Classification"]) )
 Nord <- sum(fungiDat$Classification == x) - Ngen - Nspc
 Nfng <- nrow(fungiDat) - Nord - Ngen - Nspc
 # proportions
 pCol <- c("#ed394e", "#ff9582", "#ffd5c4", "#f2f2f0")
 pie(c(Nspc, Ngen, Nord, Nfng),
    labels = "",
    radius = 1,
    main = "MYSPE in genome-sequenced fungi",
    sub = MYSPE,
    col = pCol,
    clockwise = TRUE,
    init.angle = 90)
 legend(x = 1.3, y = 0.8,     # position
       legend = c("Species", "Genus", "Order", "Fungi"),
       y.intersp = 1.5,      # line spacing for labels
       cex = 0.9,            # character size for labels
       bty = "n",            # "no" box around the legend
       pt.cex = 2,           # size of colour boxes
       pch = 15,
       col = pCol)
 # [END]
--- a/BIN-PHYLO-Tree_analysis.R
+++ b/BIN-PHYLO-Tree_analysis.R
@@ -110,7 +110,7 @@ ape::nodelabels(text = fungiTree$node.label,
 # We load the APSES sequence tree that you produced in the
 # BIN-PHYLO-Tree_building unit:
-load(file = "APSEStreeRproml.RData")
+apsTree <- readRDS(file = "APSEStreeRproml.rds")
 plot(apsTree) # default type is "phylogram"
 plot(apsTree, type = "unrooted")
--- a/BIN-PHYLO-Tree_building.R
+++ b/BIN-PHYLO-Tree_building.R
@@ -9,12 +9,13 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-PHYLO-Tree_building unit.
 #
-# Version:  1.1
+# Version:  1.2
 #
-# Date:     2017  10.  31
+# Date:     2017-10   2020-09
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
 #           1.2    deprecate save()/load() for saveRDS()/readRDS()
 #           1.1    Change from require() to requireNamespace(),
 #                      use <package>::<function>() idiom throughout,
 #           1.0    First 2017 version
@@ -139,7 +140,7 @@ apsTree <- Rphylip::Rproml(apsIn, path=PROMLPATH)
 plot(apsTree)
 # save your tree:
-save(apsTree, file = "APSEStreeRproml.RData")
+saveRDS(apsTree, file = "APSEStreeRproml.rds")
 # If this did not work, ask for advice.
--- a/BIN-PPI-Analysis.R
+++ b/BIN-PPI-Analysis.R
@@ -9,12 +9,13 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-PPI-Analysis unit.
 #
-# Version:   1.1
+# Version:   1.2
 #
-# Date:     2017  08  -  2019  01
+# Date:     2017-08  -  2020-09
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
 #           1.2    Deprecate save()/load() for saveRDS()/readRDS()
 #           1.1    Change from require() to requireNamespace(),
 #                      use <package>::<function>() idiom throughout,
 #                      use Biocmanager:: not biocLite()
@@ -72,7 +73,7 @@ if (! requireNamespace("igraph", quietly = TRUE)) {
 # a fungal proteome. You can load the saved dataframe here (To read more about
 # what the numbers mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).
-load("./data/STRINGedges.RData")
+STRINGedges <- readRDS("./data/STRINGedges.rds")
 head(STRINGedges)
--- a/RPR-Biostrings.R
+++ b/RPR-Biostrings.R
@@ -198,7 +198,7 @@ Biostrings::translate(biosDNAseq[4:15])
 Biostrings::toString(biosDNAseq[4:15])
-# save() and load() works like on all other R objects.
+# saveRDS() and readRDS() works like on all other R objects.
 # =    5  More  ================================================================
--- a/RPR-GEO2R.R
+++ b/RPR-GEO2R.R
@@ -9,12 +9,13 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the RPR_GEO2R unit.
 #
-# Version:  1.2
+# Version:  1.3
 #
-# Date:     2017 09  -  2019 01
+# Date:     2017-09  -  2020-09
 # Author:   Boris Steipe <boris.steipe@utoronto.ca>
 #
 # Versions:
 #           1.3    use saveRDS()/readRDS() rather than save()/load()
 #           1.2    Change from require() to requireNamespace(),
 #                      use <package>::<function>() idiom throughout,
 #                      use Biocmanager:: not biocLite()
@@ -122,7 +123,9 @@ GSE3635 <- GSE3635[[idx]]
 # FALLBACK
 # ... in case the GEO server is not working, load the "GSE3635" object from
 # the data directory:
-# load(file="./data/GSE3635.RData")
+#
 # GSE3635 <- readRDS(file="./data/GSE3635.rds")
 # Checkpoint ...
 if (! exists("GSE3635")) {
--- a/data/ABC-INT-Mutation_impact.RData
+++ b/data/ABC-INT-Mutation_impact.RData
--- a/data/BLASThits.RData
+++ b/data/BLASThits.RData
--- a/data/BLASThits.rds
+++ b/data/BLASThits.rds
--- a/data/GOLDdata.RData
+++ b/data/GOLDdata.RData
--- a/data/GSE3635.RData
+++ b/data/GSE3635.RData
--- a/data/GSE3635.rds
+++ b/data/GSE3635.rds
--- a/data/MYSPEmap.rds
+++ b/data/MYSPEmap.rds
--- a/data/MYSPEspecies.RData
+++ b/data/MYSPEspecies.RData
--- a/data/REFdat.rds
+++ b/data/REFdat.rds
--- a/data/STRINGedges.RData
+++ b/data/STRINGedges.RData
--- a/data/STRINGedges.rds
+++ b/data/STRINGedges.rds
--- a/data/Species.csv
+++ b/data/Species.csv
--- a/data/myCodons.rds
+++ b/data/myCodons.rds
--- a/data/pdbRep.RData
+++ b/data/pdbRep.RData
--- a/data/pdbRep.rds
+++ b/data/pdbRep.rds
--- a/data/refDB.rds
+++ b/data/refDB.rds
--- a/data/sDat.rds
+++ b/data/sDat.rds
--- a/data/scCCnet.RData
+++ b/data/scCCnet.RData
--- a/data/scCCnet.rds
+++ b/data/scCCnet.rds
--- a/scripts/ABC-makeMYSPElist.R
+++ b/scripts/ABC-makeMYSPElist.R
@@ -1,14 +1,16 @@
-# ABC_makeMYSPElist.R
+# tocID <- "scripts/ABC-makeMYSPElist.R"
 #
 # Purpose:  Create a list of genome sequenced fungi with protein annotations and
 #               Mbp1 homologues.
 #
-# Version: 1.2
+# Version: 1.3
 #
-# Date:    2016  09  -  2019  01
+# Date:    2016  09  -  2020  09
 # Author:  Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions
 #          1.3    Rewrite to change datasource. NCBI has not been updated
 #                   since 2012. Use ensembl fungi as initial source.
 #          1.2    Change from require() to requireNamespace()
 #          1.1.2  Moved BLAST.R to ./scripts directory
 #          1.1    Update 2017
@@ -37,17 +39,16 @@
 #TOC> ==========================================================================
 #TOC> 
 #TOC>   Section  Title                                    Line
-#TOC> ---------------------------------------------------------
+#TOC> --------------------------------------------------------
-#TOC>   1        The strategy                                55
+#TOC>   1        The strategy                               56
-#TOC>   2        GOLD species                                67
+#TOC>   2        PACKAGES AND INITIALIZATIONS               68
-#TOC>   2.1        Initialize                                72
+#TOC>   3        ENSEMBL FUNGI                              76
-#TOC>   2.2        Import                                    79
+#TOC>   3.1        Import                                   79
-#TOC>   2.3        Unique species                           131
+#TOC>   4        BLAST SEARCH                              156
-#TOC>   3        BLAST species                              173
+#TOC>   4.1        find homologous proteins                162
-#TOC>   3.1        find homologous proteins                 180
+#TOC>   4.2        Identify species in "hits"              193
-#TOC>   3.2        Identify species in "hits"               204
+#TOC>   5        MERGE ENSEMBL AND BLAST RESULTS           283
-#TOC>   4        Intersect GOLD and BLAST species           249
+#TOC>   6        STUDENT NUMBERS                           366
 #TOC>   5        Cleanup and finish                         267
 #TOC> 
 #TOC> ==========================================================================
@@ -55,129 +56,110 @@
 # =    1  The strategy  ========================================================
 # This script will create a list of "MYSPE" species and save it in an R object
-# MYSPEspecies that is stored in the data subdirectory of this project from where
+# MYSPEspecies that is stored in the data subdirectory of this project from
-# it can be loaded. The strategy is as follows: we download a list of all
+# where it can be loaded. The strategy is as follows: we download a list of
-# genome projects and then select species for which protein annotations are
+# annotated fungal genomes from ensembl.fungi. All these are genome-sequenced
-# available - i.e. these are all genome-sequenced species that have been
+# species that have been annotated.
-# annotated. Then we search for fungal species that have homologues to MBP1.
+# Next we perform a BLAST search, to identify fungal species that have
-# Then we intersect the two lists to give us genome-sequenced species that
+# genes that are homologous to yeast MBP1.
-# also have Mbp1 homologues ...
+#
 # ...
-
+# =    2  PACKAGES AND INITIALIZATIONS  ========================================
 # =    2  GOLD species  ========================================================
 #  Fetch and parse the Genomes OnLine Database of the Joint Genome Institute
 #  (https://gold.jgi.doe.gov/). Use the data that is hosted at the NCBI.
 # ==   2.1  Initialize  ========================================================
 # httr provides interfaces to Webservers on the Internet
 if (! requireNamespace("httr", quietly = TRUE)) {
  install.packages("httr")
 }
 # ==   2.2  Import  ============================================================
-# The URL of the genome data directory at the NCBI:
+# =    3  ENSEMBL FUNGI  =======================================================
 # is https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS
 # Note the relative size of the prokaryotes and the eukaryotes data.
 # What's in this directory?
 URL <- "ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/README"
 GOLDreadme <- readLines(URL) # read the file into a vector
 cat(GOLDreadme, sep = "\n")  # display the contents
 # Retrieve the file "eukaryotes" via ftp from the NCBI ftp server and put it
 # into a dataframe. This will take a few moments.
 # URL <- "ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt"
 # GOLDdata <- read.csv(URL,
 #                      header = TRUE,
 #                      sep = "\t",
 #                      stringsAsFactors = FALSE)
 # save(GOLDdata, file="data/GOLDdata.RData")
 # or ...
 load(file="data/GOLDdata.RData")
-# What columns does the table have, how is it structured?
+# ==   3.1  Import  ============================================================
 str(GOLDdata)
-# What groups of organisms are in the table? How many of each?
+# Navigate to https://fungi.ensembl.org and click on the link to the full
-table(GOLDdata$Group)
+# list of all species: https://fungi.ensembl.org/species.html
 # On the page, click on the spreadsheet symbol top right and choose
 # "download whole table". The file will be named  "Species.csv", in your
 # usual downloads folder. Move it to the data folder, and read it.
-# What subgroups of fungi do we have?
+sDat <- read.csv("./data/Species.csv")
-table(GOLDdata$SubGroup[GOLDdata$Group == "Fungi"])
+str(sDat)
-# How many of the fungi have protein annotations? The README file told us that
+# The most obvious way to partition these is according to Classification ...
-# the column "Proteins" contains "Number of Proteins annotated in the assembly".
+# (poking around a bit in the UniProt taxonomy database shows that the
-# Looking at a few ...
+#  classification used here is the taxonomic rank of "order").
-head(GOLDdata$Proteins, 30)
+# how many classifications do we have?
-# ... we see that the number varies, and some have a hyphen, i.e. no
+length(unique(sDat$Classification))  # 66
 # annotations. The hyphens make this a char type column (as per: all elements
 # of a vector must have the same type). Therefore we can't read this as numbers
 # and filter by some value > 0. But we can filter for all genomes that don't
 # have the hyphen:
 sum(GOLDdata$Proteins[GOLDdata$Group == "Fungi"] != "-")
-# Subset the data, with fungi that have protein annotations
+# To have a good set for the class, we should have about 100.
-GOLDfungi <- GOLDdata[GOLDdata$Group == "Fungi" &
+# Let's see for which of these we can find Mbp1 homologues.
-                          GOLDdata$Proteins != "-" , ]
+# First, we'll keep only the colums for name, classification, and taxID, and
 # drop the rest ...
 sDat <- sDat[ , c("Name", "Classification", "Taxon.ID")]
 colnames(sDat) <- c("name", "order", "taxID")
-# check what we have in the table
+# Next, we make an extra column: genus - the first part of the binomial name.
-nrow(GOLDfungi)
+# We'll use the gsub() function, and for that we need a "regular expression"
-head(GOLDfungi)
+# that matches to all characters from the first blank to the end of the string:
 myPatt <- "\\s.*$"  # one whitespace (\\s) ...
                    # followed by any character (.) 0..n times (*) ...
                    # until the end of the string
 # using gsub() we substitue all matching characters with the empty string "" -
 # this deletes the matching characters
 # Test this:
 gsub(myPatt, "", "Genus")                      # one word: unchanged
 gsub(myPatt, "", "gEnus species")              # two words: return only first
 gsub(myPatt, "", "geNus species strain 123")   # many words: return only first
-# ==   2.3  Unique species  ====================================================
+# apply this to the "name" column and add the result as a separate column
 # called "genus"
 sDat$genus <- gsub(myPatt, "", sDat$name)
 # what do we get?
 c(head(unique(sDat$genus)),
  tail(unique(sDat$genus)))  # inspect the first and last few. Note that there
                             # is a problem that we have to keep in mind.
                             # (Always inspect your results!)
 # Drop all rows for which the genus contains special chracters -
 # like "[Candida]"
 sDat <- sDat[ ! grepl("[^a-zA-Z]", sDat$genus) , ]
 length(table(sDat$genus))    # how many genus?
 hist(table(sDat$genus), col = "#E9F4FF")      # Distribution ...
                                              # most genus have very few, but
                                              # some have very many species.
 sort(table(sDat$genus), decreasing = TRUE)[1:10]  # Top ten...
 # We should have at least one species from each taxonomic order, but we can
 # add a few genus until we have about 100 validated species.
 # Let's add a column for species, by changing our regular expression a bit,
 # using ^ (start of string), \\S (NOT a whitespace),
 # and + (one or more matches), capturing the match (...), and returning
 # it as the substitution (\\1) ...
 myPatt <- "^(\\S+\\s\\S+)\\s.*$"
 sDat$species <- gsub(myPatt, "\\1", sDat$name)
 # And we reorder the columns, just for aesthetics:
 sDat <- sDat[ , c("name", "species", "genus", "order", "taxID")]
 # Final check:
 any(grepl("[^a-zA-Z -]", sDat$species)) # FALSE means no special characters
 # For our purpose of defining species, we will select only species, not strains
 # from this list. To do this, we pick the first two words i.e. the systematic
 # binomial name from the "X.Organism.Name" column, and then we remove redundant
 # species. Here is a function:
 #
 # Now we check which of these have Mbp1 homologues ...
-getBinom <- function(s) {
+# =    4  BLAST SEARCH  ========================================================
    # Fetch the first two words from a string.
    # Parameters:
    #   s: char  a string which is expected to contain a binomial species name
    #            as the first two words, possibly followed by other text.
    # Value: char  the first two words separated by a single blank
    #
    x <- unlist(strsplit(s, "\\s+"))     # split s on one or more whitespace
    return(paste(x[1:2], collapse=" "))  # return first two elements
 }
 # iterate through GOLDdata and extract species names
 GOLDspecies <- character()
 for (i in 1:nrow(GOLDfungi)) {
    GOLDspecies[i] <- getBinom(GOLDfungi$X.Organism.Name[i])
 }
 head(GOLDspecies)
 length(GOLDspecies)
 # N.b. this would be more efficiently (but perhaps less explicitly) coded with
 # one of the apply() functions, instead of a for-loop.
 # GOLDspecies <- unlist(lapply(GOLDfungi$X.Organism.Name, getBinom))
 # Species of great interest may appear more than once, one for each sequenced
 # strain: e.g. brewer's yeast:
 sum(GOLDspecies == "Saccharomyces cerevisiae")
 # Therefore we use the function unique() to throw out duplicates. Simple:
 GOLDspecies <- unique(GOLDspecies)
 length(GOLDspecies)
 # i.e. we got rid of about 40% of the species by removing duplicates.
-# =    3  BLAST species  =======================================================
+# We run a BLAST search to find all proteins related to yeast Mbp1 in any
-#
+# fungus. With the results, we'll annotate our sDat table.
-# Next, we filter our list by species that have homologues to the yeast Mbp1
+
-# gene. To do this we run a BLAST search to find all related proteins in any
+# ==   4.1  find homologous proteins  ==========================================
 # fungus. We list the species that appear in that list, and then we select those
 # that appear in our GOLD table as well.
 #
 # ==   3.1  find homologous proteins  ==========================================
 #
 # Use BLAST to fetch proteins related to Mbp1 and identify the species that
 # contain them.
@@ -190,18 +172,25 @@ length(GOLDspecies)
 # standard task of communicating with servers and parsing responses - everyday
 # fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST
 # parser in currently available packages.
-
+#
-# source("./scripts/BLAST.R")   # load the function and its utilities
+# DON'T use this for BLAST searches unless you have read the NCBI policy
 # for automated tasks. If you indicriminately pound on the NCBI's BLAST
 # server, they will blacklist your IP-address. See:
 # https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
 #
 # Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
 # BLASThits <- BLAST("NP_010227",                  # Yeast Mbp1 RefSeq ID
 #                    db = "refseq_protein",        # database to search in
-#                    nHits = 3000,                 # 720 hits in 2017
+#                    nHits = 3000,                 # 945 hits in 2020
 #                    E = 0.01,                     #
 #                    limits = "txid4751[ORGN]")    # = fungi
-# save(BLASThits, file="data/BLASThits.RData")
+# saveRDS(BLASThits, file="data/BLASThits.rds")
-load(file="data/BLASThits.RData")
+#
 # NO NEED TO ACTUALLY RUN THIS:you can load the results from the data directory
 #
 BLASThits <- readRDS(file = "data/BLASThits.rds")
-# ==   3.2  Identify species in "hits"  ========================================
+# ==   4.2  Identify species in "hits"  ========================================
 # This is a very big list that can't be usefully analyzed manually. Here
 # we are only interested in the species names that it contains.
@@ -230,55 +219,202 @@ for (i in seq_along(BLASThits$hits)) {
 # You can confirm that BLASTspecies has the expected size.
 length(BLASTspecies)
 # if we delete some of these later on, we still want to remember which hit
 # they came from. Thus we name() the elements with their index, which is the
 # same as the index of the hit in BLASThits
 names(BLASTspecies) <- 1:length(BLASTspecies)
 # let's plot the distribution of E-values
 eVals <- numeric()
 for (i in seq_along(BLASThits$hits)) {
  eVals[i] <- BLASThits$hits[[i]]$E
 }
 range(eVals)
 sum(eVals == 0)
 # let's plot the log of all values > 0 to see how they are distributed
 # plotting only one vectyor of numbers plots their index as x, and
 # their value as y ...
 plot(log(eVals[eVals > 0]), col = "#CC0000")
 # This is very informative: I would suspect that the first ten or so are
 # virtually identical to the yeast protein, then we have about 700 hits with
 # decreasing similarity, and then about 200 more that may actually be false
 # positives. Also - we plotted them by index, that means the table is SORTED:
 # Lower E-values strictly come before higher E-values.
 # Again, some species appear more than once, e.g. ...
 sum(BLASTspecies == "Saccharomyces cerevisiae")
 # ... corresponding to the five homologous gene sequences (paralogues) of yeast.
-# Therefore we use unique() to throw out duplicates:
+# Therefore we remove duplicates. Removing duplicates will leave the FIRST
-BLASTspecies <- unique(BLASTspecies)
+# in a list alone, and only remove the SUBSEQUENT ones. Which means, from each
 # species, we will retain only the protein that has the highest similarity
 # to yeast Mbp1, not any of its more distant paralogues.
 sel <- ! duplicated(BLASTspecies)
 BLASTspecies <- BLASTspecies[sel]
 length(BLASTspecies)
 # i.e. we got rid of about two thirds of the hits.
 tail(BLASTspecies)  # see how the names are useful!
                    # again - there are some special characters ...
                    # what are they?
 BLASTspecies[grep("[^a-zA-Z ]", BLASTspecies)]
-# You should think about this: what is the biological interpretation of the
+# remove the brackets ...
-# finding that on average we have three sequences that are similar to Mbp1 in
+BLASTspecies <- gsub("\\[|\\]", "", BLASTspecies)
-# other species?
+# drop any new duplicates ...
 BLASTspecies <- BLASTspecies[ ! duplicated(BLASTspecies)]
 # check the number again:
 length(BLASTspecies)
 # Think a bit about this: what may be the biological reason to find that
 # on average, in 300 fungi across the entire phylogenetic tree, we have
 # three sequences that are homologous to yeast Mbp1?
 # Let's look at the distribution of E-values in this selection (Subsetting FTW):
 # we plot all values that are TRUE in the vector "sel" that we created above,
 # AND greater than 0
 plot(log(eVals[sel & eVals > 0]), col = "#00CC00")
-# =    4  Intersect GOLD and BLAST species  ====================================
+# =    5  MERGE ENSEMBL AND BLAST RESULTS  =====================================
-# Now we can compare the two lists for species that appear in both sources: the
+# Next we add the blast result to our sDat dataframe. We'll store the index,
-# simplest way is to use the set operation functions union(), intersection()
+# the E-value, and the Query-bounds from which we can estimate which domains
-# etc. See here:
+# of Mbp1 are actually covered by the hit. (True orthologues MUST align with
-?union
+# Mbp1's N-terminal APSES domain.)
 MYSPEspecies <- intersect(GOLDspecies, BLASTspecies)
 # Again: interpret this:
 #  - what is the number of GOLDspecies?
 #  - what is the number of BLAST species?
 #  - how many species are present in both lists?
 #  - what does it mean if a species is in GOLD but not in the BLAST list?
 #  - what does it mean if a species has been found during BLAST, but it
 #    is not in GOLD?
 # =    5  Cleanup and finish  ==================================================
 # One final thing: some of the species will be our so-called "reference" species
 # which we use for model solutions and examples in the course. They are defined
 # in the .utilities.R file of this project. We remove them from the list so that
 # we don't inadvertently assign them.
 #
 # First we pull the hits we wanted from the BLASTspecies:
 iHits <- as.numeric(names(BLASTspecies))
 length(iHits)     # one index for each TRUE in sel
-REFspecies
+# add columns to sDat
 l <- nrow(sDat)
 sDat$iHit   <- numeric(l)  # index of the hit in the BLAST results
 sDat$eVal   <- numeric(l)  # E-value of the hit
 sDat$lAli   <- numeric(l)  # length of the aligned region
-MYSPEspecies <- sort(setdiff(MYSPEspecies, REFspecies))
+# extract and merge
 for (iHit in iHits) {
  thisSp <- BLASThits$hits[[iHit]]$species
  sel <- sDat$species == thisSp
-# save(MYSPEspecies, file = "data/MYSPEspecies.RData")
+  sDat$iHit[sel]   <- iHit
  sDat$eVal[sel]   <- BLASThits$hits[[iHit]]$E
  sDat$lAli[sel]   <- BLASThits$hits[[iHit]]$lengthAli
 }
 # Are all reference species accounted for?
 selA <- sDat$iHit != 0                 # all rows which matched to a BLAST hit
 REFspecies %in% sDat$species[selA]     # yes, all there
 selB <- sDat$species %in% REFspecies   # all rows which have one of REF species
 sum(selA & selB)   # How many rows?
 # sDat of course includes all duplicates. Some may be multiply sequenced, some
 # may be different strains. We'll use the same strategy as before and keep
 # only the best hit: order the rows by E-value, then drop all rows which
 # are duplicated.
 # drop all rows without BLAST hits ...
 sDat <- sDat[ ! (sDat$iHit == 0) , ]
 # order sDat by E-value ...
 sDat <- sDat[order(sDat$eVal, decreasing = FALSE) , ]
 # drop all rows with duplicated species ...
 sDat <- sDat[ ! duplicated(sDat$species) , ]
 # Lets look at the E-values ...
 plot(log(sDat$eVal[sDat$eVal > 0]), col = "#00CC00")
 # and alignment lengths ...
 plot(sDat$lAli, col = "#00DDAA")
 # How many ...
 length(unique(sDat$name))
 length(unique(sDat$species))
 length(unique(sDat$genus))
 length(unique(sDat$order))
 # To get the final dataset, we remove the reference species with their
 # entire orders ...
 REForders <- unique(sDat$order[sDat$species %in% REFspecies])
 sel <- sDat$order %in% REForders
 REFdat <- sDat[sel , ]
 sDat   <- sDat[ ! sel , ]
 # REFdat should now contain only the REFspecies ...
 ( REFdat <- REFdat[REFdat$species %in% REFspecies , ] )
 # ... but all of them
 sum(REFspecies %in% REFdat$species)
 # ... and we have enough left in sDat to prune sDat to unique genus ...
 sDat <- sDat[ ! duplicated(sDat$genus) , ]
 # saveRDS(sDat, file = "data/sDat.rds")
 # saveRDS(REFdat, file = "data/REFdat.rds")
 # =    6  STUDENT NUMBERS  =====================================================
 #
 # An asymmetric function to retrieve a MYSPE species
 students <- read.csv("../BCH441-2020-students.csv")
 sN <- students$Student.Number
 range(sN)
 any(duplicated(gsub(".+(.......)$", "\\1", sN)))
 N <- 7
 x <- numeric(N)
 for (i in 1:N) {
  x[i] <- H(substr(gsub(".+(.......)$", "\\1", sN), i, i))
 }
 plot(x, col = "#BB0000", type = "b")
 keys <- as.numeric(gsub(".+(....).$", "\\1", sN))
 any(duplicated(keys))
 # =====
 set.seed(112358)
 names(sN) <- sample(1:nrow(sDat), length(sN))
 MYSPEmap <- data.frame(keys = sprintf("%04d", 0:9999),
                       iMYSPE = sample(1:nrow(sDat), 10000, replace = TRUE))
 rownames(MYSPEmap) <- MYSPEmap$keys
 for (i in 1:length(sN)) {
  rMap <- gsub(".+(....).$", "\\1", sN[i])
  MYSPEmap[rMap, "iMYSPE"] <- as.integer(names(sN)[i])
 }
 # saveRDS(MYSPEmap, "./data/MYSPEmap.rds")
 getMYSPE <- function(x) {
  dat <- readRDS("./data/sDat.rds")
  map <- readRDS("./data/MYSPEmap.rds")
  key <- gsub(".+(....).$", "\\1", x)
  return(dat$species[map[key, "iMYSPE"]])
 }
 # === validate
 l <- length(sN)
 sp <- character(l)
 for(i in 1:l) {
  sp[i] <- getMYSPE(sN[i])
 }
 any(duplicated(sp))
 length(unique(sp))
 which(! sDat$species %in% sp)  # these can be assigned to late-comers
 # Done.
 # [END]
--- a/scripts/ABC-makeScCCnet.R
+++ b/scripts/ABC-makeScCCnet.R
@@ -119,10 +119,10 @@ scCCnet <- scCCnet[! duplicated(x), ]
 # length(unique(c(mySubnet$protein1, mySubnet$protein2)))  # 261, no change
 # Network has 261 nodes, 1280 edges
-save(scCCnet, file = "./data/scCCnet.RData")
+saveRDS(scCCnet, file = "./data/scCCnet.rds")
-# load("./data/scCCnet.RData")   # <<<- use this to load the object when
+# scCCnet <- readRDS("./data/scCCnet.rds")   # <<<- use this to restore the
-                                 # needed
+                                             #      object when needed
 # [END]
--- a/scripts/BLAST.R
+++ b/scripts/BLAST.R
@@ -4,23 +4,23 @@
 #          This script uses the BLAST URL-API
 #          (Application Programming Interface) at the NCBI.
 #          Read about the constraints here:
-#          https://ncbi.github.io/blast-cloud/dev/api.html
+#          https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
 #
 #
-# Version: 3.1
+# Version: 3.2
-# Date:    2016 09 - 2019 01
+# Date:    2016 09 - 2020 09
 # Author:  Boris Steipe
 #
 # Versions:
 #    3.2   2020 updates
 #    3.1   Change from require() to requireNamespace(),
 #          use <package>::<function>() idiom throughout
-#    3     parsing logic had not been fully implemented; Fixed.
+#    3.0   parsing logic had not been fully implemented; Fixed.
 #    2.1   bugfix in BLAST(), bug was blanking non-split deflines;
 #          refactored parseBLASTalignment() to handle lists with multiple hits.
 #    2.0   Completely rewritten because the interface completely changed.
 #          Code adpated in part from NCBI Perl sample code:
 #          $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
 #
 #    1.0   first version posted for BCH441 2016, based on BLAST - API
 #
 # ToDo:
@@ -31,47 +31,50 @@
 # ==============================================================================
-if (! requireNamespace(httr, quietly = TRUE)) {
+if (! requireNamespace("httr", quietly = TRUE)) {
  install.packages("httr")
 }
-BLAST <- function(q,
+BLAST <- function(Q,
                  db = "refseq_protein",
                  nHits = 30,
                  E = 0.1,
                  limits = "",
                  rid = "",
                  query = "",
                  quietly = FALSE,
                  myTimeout = 120) {
    # Purpose:
    #     Basic BLAST search
    #
    # Parameters:
-    #     q: query - either a valid ID or a sequence
+    #     Q: query - either a valid ID or a sequence
    #     db: "refseq_protein" by default,
-    #         other legal valuses include: "nr", "pdb", "swissprot" ...
+    #         other legal values include: "nr", "pdb", "swissprot" ...
    #     nHits: number of hits to maximally return
    #     E: E-value cutoff. Do not return hits whose score would be expected
    #        to occur E or more times in a database of random sequence.
    #     limits: a valid ENTREZ filter
-    #     rid: a request ID - to retrieve earleir search results
+    #     rid: a request ID - to retrieve earlier search results
    #     query: the actual query string (needed when retrieving results
    #            with an rid)
    #     quietly: controls printing of wait-time progress bar
    #     timeout: how much longer _after_ rtoe to wait for a result
    #              before giving up (seconds)
    # Value:
-    #     result: list of resulting hits and some metadata
+    #     result: list of process status or resulting hits, and some metadata
    EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done
    results <- list()
    results$query = query
    results$rid <- rid
    results$rtoe <- 0
-    if (rid == "") {  # if rid is not the empty string we skip the
+    if (rid == "") {  # If no rid is available, spawn a search.
-                      # initial search and and proceed directly to retrieval
+                      # Else, proceed directly to retrieval.
      # prepare query, GET(), and parse rid and rtoe from BLAST server response
      results$query <- paste0("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
@@ -141,7 +144,8 @@ BLAST <- function(q,
        if (myTimeout <= 0) { # abort
          cat("BLAST search not concluded before timeout. Aborting.\n")
-          cat(sprintf("You could check back later with rid \"%s\"\n",
+          cat(sprintf("%s  BLASThits <- BLAST(rid=\"%s\")\n",
                      "Trying checking back later with >",
                      results$rid))
          return(results)
        }
@@ -370,7 +374,7 @@ if (FALSE) {
                nHits = 100,
                E = 0.001,
                rid = "",
-                limits = "txid4751[ORGN]")
+                limits = "txid4751[ORGN]")  # Fungi
  str(test)
  length(test$hits)
 }