Updated MYSPE data and entire workflow. Changed all .RData to .rds

2020-09-21 14:28:24 +10:00
parent 3d91337e70
commit 7536473c5d
30 changed files with 1509 additions and 247 deletions
--- a/.myProfile.R
+++ b/.myProfile.R
@@ -6,9 +6,6 @@
 #                            UofT eMail address.
 # myStudentNumber  numeric   Your UofT student number. Take care to have this
 #                            correct.
-# MYSPE            char      A string with the species name of a genome-
-#                            sequenced fungus. Which species this is will be
-#                            determined in the BIN-MYSPE unit.
 #
 # NOTE:
 # After you have updated this script, save the file in your "myScripts" folder.
@@ -18,6 +15,5 @@

 myEMail <- "<your-e-mail-address-here>"        # e.g. "u.franklin@utoronto.ca"
 myStudentNumber <- <your-student-number-here>  # e.g. 1003141592
-MYSPE <- "<your-adopted-species-here>"         # e.g. "Neurospora crassa OR74A"

 # [END]
--- a/.utilities.R
+++ b/.utilities.R
@@ -1,11 +1,12 @@
-# .utilities.R
+# tocID <- "./.utilities.R"
 #
 # Miscellaneous R code to suppport the project
 #
-# Version: 1.3.1
-# Date:    2017  09 - 2019  11
+# Version: 1.4
+# Date:    2017-09 - 2020-09
 # Author:  Boris Steipe
 #
+# V 1.4    Maintenance
 # V 1.3.1  prefix Biostrings:: to subseq()
 # V 1.3    load msa support functions
 # V 1.2    update database utilities to support 2017 version of JSON sources
@@ -17,15 +18,39 @@
 #
 # ==============================================================================

-# ====== SCRIPTS =============================================================
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                                       Line
+#TOC> -----------------------------------------------------------
+#TOC>   1        SCRIPTS TO SOURCE                             42
+#TOC>   2        SUPPORT FUNCTIONS                             49
+#TOC>   2.1        objectInfo()                                52
+#TOC>   2.2        biCode()                                    80
+#TOC>   2.3        pBar()                                     114
+#TOC>   2.4        waitTimer()                                136
+#TOC>   2.5        fetchMSAmotif()                            164
+#TOC>   2.6        H() (Shannon entropy)                      208
+#TOC>   3        DATA                                         222
+#TOC>   3.1        REFspecies                                 224
+#TOC>   4        FUNCTIONS TO CUSTOMIZE ASSIGNMENTS           239
+#TOC>   4.1        getMYSPE()                                 242
+#TOC>   4.2        selectPDBrep()                             251
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  SCRIPTS TO SOURCE  ===================================================

 source("./scripts/ABC-dbUtilities.R")
 source("./scripts/ABC-writeALN.R")
 source("./scripts/ABC-writeMFA.R")


-# ====== SUPPORT FUNCTIONS =====================================================
+# =    2  SUPPORT FUNCTIONS  ===================================================

+
+# ==   2.1  objectInfo()  ======================================================
 objectInfo <- function(x) {
    # Function to combine various information items about R objects
    #
@@ -53,6 +78,7 @@ objectInfo <- function(x) {
 }


+# ==   2.2  biCode()  ==========================================================
 biCode <- function(s) {
  # Make a 5 character "biCode" from a binomial name by concatening
  # the uppercased first three letter of the first word and the first
@@ -86,6 +112,7 @@ biCode <- function(s) {
 }


+# ==   2.3  pBar()  ============================================================
 pBar <- function(i, l, nCh = 50) {
  # Draw a progress bar in the console
  # i: the current iteration
@@ -107,6 +134,7 @@ pBar <- function(i, l, nCh = 50) {
 }


+# ==   2.4  waitTimer()  =======================================================
 waitTimer <- function(t, nIntervals = 50) {
  # pause and wait for t seconds and display a progress bar as
  # you are waiting
@@ -134,6 +162,7 @@ waitTimer <- function(t, nIntervals = 50) {
 }


+# ==   2.5  fetchMSAmotif()  ===================================================
 fetchMSAmotif <- function(ali, mot) {
  # Retrieve a subset from ali that spans the sequence in mot.
  # Biostrings package must be installed.
@@ -177,40 +206,23 @@ fetchMSAmotif <- function(ali, mot) {
 }


-
-
-# ====== PDB ID selection ======================================================
-
-selectPDBrep <- function(n, seed = as.numeric(Sys.time())) {
-  # Select n PDB IDs from a list of high-resolution, non-homologous, single
-  # domain, single chain structure files that represent a CATH topology
-  # group.
-  # Parameters:
-  #   n     num     number of IDs to return
-  #   seed  num     a seed for the RNG
-  #
-  # Value:          char  PDB IDs
-  #
-  # Note: the list is loaded from an RData file in the "./data" directory.
-  # If you use this function for a course submissio, it MUST be invoked as:
-  #
-  #         selectPDBrep(n, seed = myStudentNumber)
-  #
-  # ... and myStudentNumber MUST be correctly initialized
-
-  load("./data/pdbRep.RData")  # loads pdbRep
-  if (n > length(pdbRep)) {
-    stop(sprintf("There are only %d PDB IDs in the table to choose from.",
-                 length(pdbRep)))
-  }
-  set.seed(seed)
-  return(sample(pdbRep, n))
+# ==   2.6  H() (Shannon entropy)  =============================================
+H <- function(x, N) {
+  # calculate the Shannon entropy of the vector x given N possible states
+  # (in bits).
+  # H(x) = - sum_i(P(x_i) * log2(P(x_i)); 0 * log(0) == 0
+  t <- table(x)
+  if (missing(N)) { N <- length(t) }
+  if (length(t) > N ) { stop("N can't be smaller than observed states.") }
+  h <- sum(- (t / length(x)) * log2(t / length(x)))
+  return(h)
 }


-# ====== DATA ==================================================================

+# =    3  DATA  ================================================================

+# ==   3.1  REFspecies  ========================================================
 # 10 species of fungi for reference analysis.
 # http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
 REFspecies <- c("Aspergillus nidulans",
@@ -222,8 +234,52 @@ REFspecies <- c("Aspergillus nidulans",
                "Saccharomyces cerevisiae",
                "Schizosaccharomyces pombe",
                "Ustilago maydis",
-                "Wallemia mellicola"
-)
+                "Wallemia mellicola")
+
+
+# =    4  FUNCTIONS TO CUSTOMIZE ASSIGNMENTS  ==================================
+
+
+# ==   4.1  getMYSPE()  ========================================================
+getMYSPE <- function(x) {
+  dat <- readRDS("./data/sDat.rds")
+  map <- readRDS("./data/MYSPEmap.rds")
+  key <- gsub(".+(....).$", "\\1", x)
+  return(dat$species[map[key, "iMYSPE"]])
+}
+
+
+# ==   4.2  selectPDBrep()  ====================================================
+selectPDBrep <- function(n, seed) {
+  # Select n PDB IDs from a list of high-resolution, non-homologous, single
+  # domain, single chain structure files that represent a CATH topology
+  # group.
+  # Parameters:
+  #   n     num     number of IDs to return
+  #   seed  num     a seed for the RNG
+  #
+  # Value:          char  PDB IDs
+  #
+  # Note: the list is loaded from an .rds file in the "./data" directory.
+  # If you use this function for a course submission, it MUST be invoked as:
+  #
+  #         selectPDBrep(n, seed = myStudentNumber)
+  #
+  # ... and myStudentNumber MUST be correctly initialized
+
+  pdbRep <- readRDS("./data/pdbRep.rds")  # loads pdbRep
+
+  if (n > length(pdbRep)) {
+    stop(sprintf("There are only %d PDB IDs in the table to choose from.",
+                 length(pdbRep)))
+  }
+  oldSeed <- .Random.seed
+  set.seed(seed)
+  PDBset <- sample(pdbRep, n)
+  .Random.seed <- oldSeed
+  return(PDBset)
+}
+


 # [END]
--- a/BIN-MYSPE.R
+++ b/BIN-MYSPE.R
@@ -9,11 +9,12 @@
 # Purpose: A Bioinformatics Course:
 #              R code accompanying the BIN-MYSPE unit
 #
-# Version: 1.0.1
+# Version: 1.1
 #
-# Date:    2017  09  21
+# Date:    2020-09-18
 # Author:  Boris Steipe (boris.steipe@utoronto.ca)
 #
+# V 1.1    2020 Workflow changes
 # V 1.0.1  Move ABC-makeMYSPElist.R to ./scripts directory
 # V 1.0    Final code, after rewriting BLAST parser and updating MYSPElist
 # V 0.1    First code copied from BCH441_A03_makeMYSPElist.R
@@ -36,9 +37,9 @@
 #TOC> 
 #TOC>   Section  Title                           Line
 #TOC> -----------------------------------------------
-#TOC>   1        Preparations                      39
-#TOC>   2        Suitable MYSPE Species            51
-#TOC>   3        Adopt "MYSPE"                     65
+#TOC>   1        Preparations                      47
+#TOC>   2        Suitable MYSPE Species            59
+#TOC>   3        Adopt "MYSPE"                     83
 #TOC> 
 #TOC> ==========================================================================

@@ -47,7 +48,7 @@
 #

 # Execute the two conditionals below:
-if (! file.exists(".myProfile.R")) {
+if (! file.exists("scripts/.myProfile.R")) {
  stop("PANIC: profile file does not exist. Fix problem or ask for help.")
 }
 if (! exists("myStudentNumber")) {
@@ -64,31 +65,80 @@ if (! exists("myStudentNumber")) {

 # A detailed description of the process of compiling the list of genome
 # sequenced fungi with protein annotations and Mbp1 homologues is in the file
-# ./scripts/ABC-makeMYSPElist.R
+# ./scripts/ABC-makeMYSPElist.R  In brief, data for genome-sequenced fungi
+# was retrieved from https://fungi.ensembl.org; a search for homologues to
+# yeast Mbp1 was performed with BLAST at the NCBI, and the data was merged.
+# A representative organism at each genus-level was chosen from those hits
+# that actual;ly have a homologue. Finally, a mapping table was constructed to
+# asymmetrically retrieve unique species: a student number will retrieve
+# a species, but (public) knowledge of the species cannot reconstruct the
+# student number.

 # Task: Study ./scripts/ABC-makeMYSPElist.R, it implements a typical workflow
-#       of selecting and combining data from  public-domain data resources.
+#       of selecting and combining data from various data resources. Studying
+#       it will give you a better sense of how such workflows can be
+#       implemented in practice.
+

 # =    3  Adopt "MYSPE"  =======================================================

+# Execute:
+( MYSPE <- getMYSPE(myStudentNumber) )

-# In the code below, we load the resulting vector of species name, then pick one
-# of them in a random but reproducible way, determined by your student number.
+# If this produced an error, this session has not been properly set up. You
+# may not yet have run  init()  and edited  .myProfile.R , or that file is not
+# in your  myScripts/  folder. Fix this, and execute  source(".Rprofile") .

-load("data/MYSPEspecies.RData")     # load the species names
-set.seed(myStudentNumber)           # seed the random number generator
-MYSPE <- sample(MYSPEspecies, 1)    # pick a species at random
-set.seed(NULL)                      # reset the random number generator
-# write the result to your personalized profile data so we can use the result in
-# other functions
-cat(sprintf("MYSPE <- \"%s\"\n", MYSPE), file = ".myProfile.R", append = TRUE)
+# If this produced NA, your Student Number may not be correct, or you are not
+# in my class-list. Contact me.
+# Otherwise, this should have printed a species name. Your unique species
+# for this course.

-MYSPE         # so, which species is it ... ?
 biCode(MYSPE) # and what is it's "BiCode" ... ?

-# Task: Note down the species name and its five letter label on your Student
+# Task: Note down the species name and its five letter BiCode on your Student
 # Wiki user page. Use this species whenever this or future assignments refer
-# to MYSPE. In code, we will automatically load it from your .myProfile.R file.
+# to MYSPE. Whenever you start a session, it will automatically be loaded
+# from  myScripts/.myProfile.R  and is available as  MYSPE .

+# Here is some more information:
+fungiDat <- read.csv("data/Species.csv")
+
+# number of sequenced fungal genomes:
+nrow(fungiDat)
+
+# sequenced genomes of species:
+sel <- MYSPE == gsub("^(\\S+\\s\\S+).*$", "\\1", fungiDat$Name)
+( x <- fungiDat[sel, "Name"] )
+Nspc <- length(x)  # save this for later ...
+
+# sequenced genomes of genus:
+sel <- gsub("\\s.*", "", MYSPE) == gsub("\\s.*", "", fungiDat$Name)
+( x <- fungiDat[sel, "Name"] )
+Ngen <- length(x) - Nspc
+
+# order:
+( x <- unique(fungiDat[sel, "Classification"]) )
+Nord <- sum(fungiDat$Classification == x) - Ngen - Nspc
+Nfng <- nrow(fungiDat) - Nord - Ngen - Nspc
+
+# proportions
+pCol <- c("#ed394e", "#ff9582", "#ffd5c4", "#f2f2f0")
+pie(c(Nspc, Ngen, Nord, Nfng),
+    labels = "",
+    radius = 1,
+    main = "MYSPE in genome-sequenced fungi",
+    sub = MYSPE,
+    col = pCol,
+    clockwise = TRUE,
+    init.angle = 90)
+legend(x = 1.3, y = 0.8,     # position
+       legend = c("Species", "Genus", "Order", "Fungi"),
+       y.intersp = 1.5,      # line spacing for labels
+       cex = 0.9,            # character size for labels
+       bty = "n",            # "no" box around the legend
+       pt.cex = 2,           # size of colour boxes
+       pch = 15,
+       col = pCol)

 # [END]
--- a/BIN-PHYLO-Tree_analysis.R
+++ b/BIN-PHYLO-Tree_analysis.R
@@ -110,7 +110,7 @@ ape::nodelabels(text = fungiTree$node.label,

 # We load the APSES sequence tree that you produced in the
 # BIN-PHYLO-Tree_building unit:
-load(file = "APSEStreeRproml.RData")
+apsTree <- readRDS(file = "APSEStreeRproml.rds")

 plot(apsTree) # default type is "phylogram"
 plot(apsTree, type = "unrooted")
--- a/BIN-PHYLO-Tree_building.R
+++ b/BIN-PHYLO-Tree_building.R
@@ -9,12 +9,13 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-PHYLO-Tree_building unit.
 #
-# Version:  1.1
+# Version:  1.2
 #
-# Date:     2017  10.  31
+# Date:     2017-10   2020-09
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
+#           1.2    deprecate save()/load() for saveRDS()/readRDS()
 #           1.1    Change from require() to requireNamespace(),
 #                      use <package>::<function>() idiom throughout,
 #           1.0    First 2017 version
@@ -139,7 +140,7 @@ apsTree <- Rphylip::Rproml(apsIn, path=PROMLPATH)
 plot(apsTree)

 # save your tree:
-save(apsTree, file = "APSEStreeRproml.RData")
+saveRDS(apsTree, file = "APSEStreeRproml.rds")

 # If this did not work, ask for advice.

--- a/BIN-PPI-Analysis.R
+++ b/BIN-PPI-Analysis.R
@@ -9,12 +9,13 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-PPI-Analysis unit.
 #
-# Version:   1.1
+# Version:   1.2
 #
-# Date:     2017  08  -  2019  01
+# Date:     2017-08  -  2020-09
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
+#           1.2    Deprecate save()/load() for saveRDS()/readRDS()
 #           1.1    Change from require() to requireNamespace(),
 #                      use <package>::<function>() idiom throughout,
 #                      use Biocmanager:: not biocLite()
@@ -72,7 +73,7 @@ if (! requireNamespace("igraph", quietly = TRUE)) {
 # a fungal proteome. You can load the saved dataframe here (To read more about
 # what the numbers mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).

-load("./data/STRINGedges.RData")
+STRINGedges <- readRDS("./data/STRINGedges.rds")

 head(STRINGedges)

--- a/RPR-Biostrings.R
+++ b/RPR-Biostrings.R
@@ -198,7 +198,7 @@ Biostrings::translate(biosDNAseq[4:15])

 Biostrings::toString(biosDNAseq[4:15])

-# save() and load() works like on all other R objects.
+# saveRDS() and readRDS() works like on all other R objects.


 # =    5  More  ================================================================
--- a/RPR-GEO2R.R
+++ b/RPR-GEO2R.R
@@ -9,12 +9,13 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the RPR_GEO2R unit.
 #
-# Version:  1.2
+# Version:  1.3
 #
-# Date:     2017 09  -  2019 01
+# Date:     2017-09  -  2020-09
 # Author:   Boris Steipe <boris.steipe@utoronto.ca>
 #
 # Versions:
+#           1.3    use saveRDS()/readRDS() rather than save()/load()
 #           1.2    Change from require() to requireNamespace(),
 #                      use <package>::<function>() idiom throughout,
 #                      use Biocmanager:: not biocLite()
@@ -122,7 +123,9 @@ GSE3635 <- GSE3635[[idx]]
 # FALLBACK
 # ... in case the GEO server is not working, load the "GSE3635" object from
 # the data directory:
-# load(file="./data/GSE3635.RData")
+#
+# GSE3635 <- readRDS(file="./data/GSE3635.rds")
+

 # Checkpoint ...
 if (! exists("GSE3635")) {
--- a/data/ABC-INT-Mutation_impact.RData
+++ b/data/ABC-INT-Mutation_impact.RData
--- a/data/BLASThits.RData
+++ b/data/BLASThits.RData
--- a/data/BLASThits.rds
+++ b/data/BLASThits.rds
--- a/data/GOLDdata.RData
+++ b/data/GOLDdata.RData
--- a/data/GSE3635.RData
+++ b/data/GSE3635.RData
--- a/data/GSE3635.rds
+++ b/data/GSE3635.rds
--- a/data/MYSPEmap.rds
+++ b/data/MYSPEmap.rds
--- a/data/MYSPEspecies.RData
+++ b/data/MYSPEspecies.RData
--- a/data/REFdat.rds
+++ b/data/REFdat.rds
--- a/data/STRINGedges.RData
+++ b/data/STRINGedges.RData
--- a/data/STRINGedges.rds
+++ b/data/STRINGedges.rds
--- a/data/Species.csv
+++ b/data/Species.csv
--- a/data/myCodons.rds
+++ b/data/myCodons.rds
--- a/data/pdbRep.RData
+++ b/data/pdbRep.RData
--- a/data/pdbRep.rds
+++ b/data/pdbRep.rds
--- a/data/refDB.rds
+++ b/data/refDB.rds
--- a/data/sDat.rds
+++ b/data/sDat.rds
--- a/data/scCCnet.RData
+++ b/data/scCCnet.RData
--- a/data/scCCnet.rds
+++ b/data/scCCnet.rds
--- a/scripts/ABC-makeMYSPElist.R
+++ b/scripts/ABC-makeMYSPElist.R
@@ -1,14 +1,16 @@
-# ABC_makeMYSPElist.R
+# tocID <- "scripts/ABC-makeMYSPElist.R"
 #
 # Purpose:  Create a list of genome sequenced fungi with protein annotations and
 #               Mbp1 homologues.
 #
-# Version: 1.2
+# Version: 1.3
 #
-# Date:    2016  09  -  2019  01
+# Date:    2016  09  -  2020  09
 # Author:  Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions
+#          1.3    Rewrite to change datasource. NCBI has not been updated
+#                   since 2012. Use ensembl fungi as initial source.
 #          1.2    Change from require() to requireNamespace()
 #          1.1.2  Moved BLAST.R to ./scripts directory
 #          1.1    Update 2017
@@ -36,18 +38,17 @@

 #TOC> ==========================================================================
 #TOC> 
-#TOC>   Section  Title                                     Line
-#TOC> ---------------------------------------------------------
-#TOC>   1        The strategy                                55
-#TOC>   2        GOLD species                                67
-#TOC>   2.1        Initialize                                72
-#TOC>   2.2        Import                                    79
-#TOC>   2.3        Unique species                           131
-#TOC>   3        BLAST species                              173
-#TOC>   3.1        find homologous proteins                 180
-#TOC>   3.2        Identify species in "hits"               204
-#TOC>   4        Intersect GOLD and BLAST species           249
-#TOC>   5        Cleanup and finish                         267
+#TOC>   Section  Title                                    Line
+#TOC> --------------------------------------------------------
+#TOC>   1        The strategy                               56
+#TOC>   2        PACKAGES AND INITIALIZATIONS               68
+#TOC>   3        ENSEMBL FUNGI                              76
+#TOC>   3.1        Import                                   79
+#TOC>   4        BLAST SEARCH                              156
+#TOC>   4.1        find homologous proteins                162
+#TOC>   4.2        Identify species in "hits"              193
+#TOC>   5        MERGE ENSEMBL AND BLAST RESULTS           283
+#TOC>   6        STUDENT NUMBERS                           366
 #TOC> 
 #TOC> ==========================================================================

@@ -55,129 +56,110 @@
 # =    1  The strategy  ========================================================

 # This script will create a list of "MYSPE" species and save it in an R object
-# MYSPEspecies that is stored in the data subdirectory of this project from where
-# it can be loaded. The strategy is as follows: we download a list of all
-# genome projects and then select species for which protein annotations are
-# available - i.e. these are all genome-sequenced species that have been
-# annotated. Then we search for fungal species that have homologues to MBP1.
-# Then we intersect the two lists to give us genome-sequenced species that
-# also have Mbp1 homologues ...
+# MYSPEspecies that is stored in the data subdirectory of this project from
+# where it can be loaded. The strategy is as follows: we download a list of
+# annotated fungal genomes from ensembl.fungi. All these are genome-sequenced
+# species that have been annotated.
+# Next we perform a BLAST search, to identify fungal species that have
+# genes that are homologous to yeast MBP1.
+#
+# ...

-
-# =    2  GOLD species  ========================================================
-
-#  Fetch and parse the Genomes OnLine Database of the Joint Genome Institute
-#  (https://gold.jgi.doe.gov/). Use the data that is hosted at the NCBI.
-
-# ==   2.1  Initialize  ========================================================
+# =    2  PACKAGES AND INITIALIZATIONS  ========================================

 # httr provides interfaces to Webservers on the Internet
 if (! requireNamespace("httr", quietly = TRUE)) {
  install.packages("httr")
 }

-# ==   2.2  Import  ============================================================

-# The URL of the genome data directory at the NCBI:
-# is https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS
-# Note the relative size of the prokaryotes and the eukaryotes data.
-
-# What's in this directory?
-URL <- "ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/README"
-GOLDreadme <- readLines(URL) # read the file into a vector
-cat(GOLDreadme, sep = "\n")  # display the contents
-
-# Retrieve the file "eukaryotes" via ftp from the NCBI ftp server and put it
-# into a dataframe. This will take a few moments.
-# URL <- "ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt"
-# GOLDdata <- read.csv(URL,
-#                      header = TRUE,
-#                      sep = "\t",
-#                      stringsAsFactors = FALSE)
-# save(GOLDdata, file="data/GOLDdata.RData")
-# or ...
-load(file="data/GOLDdata.RData")
+# =    3  ENSEMBL FUNGI  =======================================================


-# What columns does the table have, how is it structured?
-str(GOLDdata)
+# ==   3.1  Import  ============================================================

-# What groups of organisms are in the table? How many of each?
-table(GOLDdata$Group)
+# Navigate to https://fungi.ensembl.org and click on the link to the full
+# list of all species: https://fungi.ensembl.org/species.html
+# On the page, click on the spreadsheet symbol top right and choose
+# "download whole table". The file will be named  "Species.csv", in your
+# usual downloads folder. Move it to the data folder, and read it.

-# What subgroups of fungi do we have?
-table(GOLDdata$SubGroup[GOLDdata$Group == "Fungi"])
+sDat <- read.csv("./data/Species.csv")
+str(sDat)

-# How many of the fungi have protein annotations? The README file told us that
-# the column "Proteins" contains "Number of Proteins annotated in the assembly".
-# Looking at a few ...
-head(GOLDdata$Proteins, 30)
-# ... we see that the number varies, and some have a hyphen, i.e. no
-# annotations. The hyphens make this a char type column (as per: all elements
-# of a vector must have the same type). Therefore we can't read this as numbers
-# and filter by some value > 0. But we can filter for all genomes that don't
-# have the hyphen:
-sum(GOLDdata$Proteins[GOLDdata$Group == "Fungi"] != "-")
+# The most obvious way to partition these is according to Classification ...
+# (poking around a bit in the UniProt taxonomy database shows that the
+#  classification used here is the taxonomic rank of "order").
+# how many classifications do we have?
+length(unique(sDat$Classification))  # 66

-# Subset the data, with fungi that have protein annotations
-GOLDfungi <- GOLDdata[GOLDdata$Group == "Fungi" &
-                          GOLDdata$Proteins != "-" , ]
+# To have a good set for the class, we should have about 100.
+# Let's see for which of these we can find Mbp1 homologues.
+# First, we'll keep only the colums for name, classification, and taxID, and
+# drop the rest ...
+sDat <- sDat[ , c("Name", "Classification", "Taxon.ID")]
+colnames(sDat) <- c("name", "order", "taxID")

-# check what we have in the table
-nrow(GOLDfungi)
-head(GOLDfungi)
+# Next, we make an extra column: genus - the first part of the binomial name.
+# We'll use the gsub() function, and for that we need a "regular expression"
+# that matches to all characters from the first blank to the end of the string:
+myPatt <- "\\s.*$"  # one whitespace (\\s) ...
+                    # followed by any character (.) 0..n times (*) ...
+                    # until the end of the string

+# using gsub() we substitue all matching characters with the empty string "" -
+# this deletes the matching characters
+# Test this:
+gsub(myPatt, "", "Genus")                      # one word: unchanged
+gsub(myPatt, "", "gEnus species")              # two words: return only first
+gsub(myPatt, "", "geNus species strain 123")   # many words: return only first

-# ==   2.3  Unique species  ====================================================
+# apply this to the "name" column and add the result as a separate column
+# called "genus"
+sDat$genus <- gsub(myPatt, "", sDat$name)
+
+# what do we get?
+c(head(unique(sDat$genus)),
+  tail(unique(sDat$genus)))  # inspect the first and last few. Note that there
+                             # is a problem that we have to keep in mind.
+                             # (Always inspect your results!)
+# Drop all rows for which the genus contains special chracters -
+# like "[Candida]"
+sDat <- sDat[ ! grepl("[^a-zA-Z]", sDat$genus) , ]
+
+length(table(sDat$genus))    # how many genus?
+hist(table(sDat$genus), col = "#E9F4FF")      # Distribution ...
+                                              # most genus have very few, but
+                                              # some have very many species.
+sort(table(sDat$genus), decreasing = TRUE)[1:10]  # Top ten...
+
+# We should have at least one species from each taxonomic order, but we can
+# add a few genus until we have about 100 validated species.
+
+# Let's add a column for species, by changing our regular expression a bit,
+# using ^ (start of string), \\S (NOT a whitespace),
+# and + (one or more matches), capturing the match (...), and returning
+# it as the substitution (\\1) ...
+
+myPatt <- "^(\\S+\\s\\S+)\\s.*$"
+sDat$species <- gsub(myPatt, "\\1", sDat$name)
+
+# And we reorder the columns, just for aesthetics:
+sDat <- sDat[ , c("name", "species", "genus", "order", "taxID")]
+
+# Final check:
+any(grepl("[^a-zA-Z -]", sDat$species)) # FALSE means no special characters

-# For our purpose of defining species, we will select only species, not strains
-# from this list. To do this, we pick the first two words i.e. the systematic
-# binomial name from the "X.Organism.Name" column, and then we remove redundant
-# species. Here is a function:
 #
+# Now we check which of these have Mbp1 homologues ...

-getBinom <- function(s) {
-    # Fetch the first two words from a string.
-    # Parameters:
-    #   s: char  a string which is expected to contain a binomial species name
-    #            as the first two words, possibly followed by other text.
-    # Value: char  the first two words separated by a single blank
-    #
-    x <- unlist(strsplit(s, "\\s+"))     # split s on one or more whitespace
-    return(paste(x[1:2], collapse=" "))  # return first two elements
-}
-
-# iterate through GOLDdata and extract species names
-GOLDspecies <- character()
-for (i in 1:nrow(GOLDfungi)) {
-    GOLDspecies[i] <- getBinom(GOLDfungi$X.Organism.Name[i])
-}
-head(GOLDspecies)
-length(GOLDspecies)
-
-# N.b. this would be more efficiently (but perhaps less explicitly) coded with
-# one of the apply() functions, instead of a for-loop.
-# GOLDspecies <- unlist(lapply(GOLDfungi$X.Organism.Name, getBinom))
-
-# Species of great interest may appear more than once, one for each sequenced
-# strain: e.g. brewer's yeast:
-sum(GOLDspecies == "Saccharomyces cerevisiae")
-
-# Therefore we use the function unique() to throw out duplicates. Simple:
-GOLDspecies <- unique(GOLDspecies)
-
-length(GOLDspecies)
-# i.e. we got rid of about 40% of the species by removing duplicates.
+# =    4  BLAST SEARCH  ========================================================


-# =    3  BLAST species  =======================================================
-#
-# Next, we filter our list by species that have homologues to the yeast Mbp1
-# gene. To do this we run a BLAST search to find all related proteins in any
-# fungus. We list the species that appear in that list, and then we select those
-# that appear in our GOLD table as well.
-#
-# ==   3.1  find homologous proteins  ==========================================
+# We run a BLAST search to find all proteins related to yeast Mbp1 in any
+# fungus. With the results, we'll annotate our sDat table.
+
+# ==   4.1  find homologous proteins  ==========================================
 #
 # Use BLAST to fetch proteins related to Mbp1 and identify the species that
 # contain them.
@@ -188,20 +170,27 @@ length(GOLDspecies)
 # to make a BLAST interface (demo-quality, not research-quality) is in the file
 # ./scripts/BLAST.R Feel encouraged to study how this works. It's a pretty
 # standard task of communicating with servers and parsing responses - everyday
-# fare in thebioinformatics lab. Surprisingly, there seems to be no good BLAST
+# fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST
 # parser in currently available packages.
-
-# source("./scripts/BLAST.R")   # load the function and its utilities
+#
+# DON'T use this for BLAST searches unless you have read the NCBI policy
+# for automated tasks. If you indicriminately pound on the NCBI's BLAST
+# server, they will blacklist your IP-address. See:
+# https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
+#
 # Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
 # BLASThits <- BLAST("NP_010227",                  # Yeast Mbp1 RefSeq ID
 #                    db = "refseq_protein",        # database to search in
-#                    nHits = 3000,                 # 720 hits in 2017
+#                    nHits = 3000,                 # 945 hits in 2020
 #                    E = 0.01,                     #
 #                    limits = "txid4751[ORGN]")    # = fungi
-# save(BLASThits, file="data/BLASThits.RData")
-load(file="data/BLASThits.RData")
+# saveRDS(BLASThits, file="data/BLASThits.rds")
+#
+# NO NEED TO ACTUALLY RUN THIS:you can load the results from the data directory
+#
+BLASThits <- readRDS(file = "data/BLASThits.rds")

-# ==   3.2  Identify species in "hits"  ========================================
+# ==   4.2  Identify species in "hits"  ========================================

 # This is a very big list that can't be usefully analyzed manually. Here
 # we are only interested in the species names that it contains.
@@ -224,61 +213,208 @@ str(BLASThits$hit[[277]])

 BLASTspecies <- character()
 for (i in seq_along(BLASThits$hits)) {
-    BLASTspecies[i] <-BLASThits$hits[[i]]$species
+    BLASTspecies[i] <- BLASThits$hits[[i]]$species
 }

 # You can confirm that BLASTspecies has the expected size.
 length(BLASTspecies)

+# if we delete some of these later on, we still want to remember which hit
+# they came from. Thus we name() the elements with their index, which is the
+# same as the index of the hit in BLASThits
+names(BLASTspecies) <- 1:length(BLASTspecies)
+
+
+# let's plot the distribution of E-values
+eVals <- numeric()
+for (i in seq_along(BLASThits$hits)) {
+  eVals[i] <- BLASThits$hits[[i]]$E
+}
+range(eVals)
+sum(eVals == 0)
+
+# let's plot the log of all values > 0 to see how they are distributed
+# plotting only one vectyor of numbers plots their index as x, and
+# their value as y ...
+plot(log(eVals[eVals > 0]), col = "#CC0000")
+
+# This is very informative: I would suspect that the first ten or so are
+# virtually identical to the yeast protein, then we have about 700 hits with
+# decreasing similarity, and then about 200 more that may actually be false
+# positives. Also - we plotted them by index, that means the table is SORTED:
+# Lower E-values strictly come before higher E-values.
+
 # Again, some species appear more than once, e.g. ...
 sum(BLASTspecies == "Saccharomyces cerevisiae")

 # ... corresponding to the five homologous gene sequences (paralogues) of yeast.

-# Therefore we use unique() to throw out duplicates:
-BLASTspecies <- unique(BLASTspecies)
+# Therefore we remove duplicates. Removing duplicates will leave the FIRST
+# in a list alone, and only remove the SUBSEQUENT ones. Which means, from each
+# species, we will retain only the protein that has the highest similarity
+# to yeast Mbp1, not any of its more distant paralogues.
+sel <- ! duplicated(BLASTspecies)
+BLASTspecies <- BLASTspecies[sel]

 length(BLASTspecies)
 # i.e. we got rid of about two thirds of the hits.
+tail(BLASTspecies)  # see how the names are useful!
+                    # again - there are some special characters ...
+                    # what are they?
+BLASTspecies[grep("[^a-zA-Z ]", BLASTspecies)]

-# You should think about this: what is the biological interpretation of the
-# finding that on average we have three sequences that are similar to Mbp1 in
-# other species?
+# remove the brackets ...
+BLASTspecies <- gsub("\\[|\\]", "", BLASTspecies)
+# drop any new duplicates ...
+BLASTspecies <- BLASTspecies[ ! duplicated(BLASTspecies)]
+
+# check the number again:
+length(BLASTspecies)
+# Think a bit about this: what may be the biological reason to find that
+# on average, in 300 fungi across the entire phylogenetic tree, we have
+# three sequences that are homologous to yeast Mbp1?
+
+# Let's look at the distribution of E-values in this selection (Subsetting FTW):
+# we plot all values that are TRUE in the vector "sel" that we created above,
+# AND greater than 0
+plot(log(eVals[sel & eVals > 0]), col = "#00CC00")


-# =    4  Intersect GOLD and BLAST species  ====================================
+# =    5  MERGE ENSEMBL AND BLAST RESULTS  =====================================

-# Now we can compare the two lists for species that appear in both sources: the
-# simplest way is to use the set operation functions union(), intersection()
-# etc. See here:
-?union
-
-MYSPEspecies <- intersect(GOLDspecies, BLASTspecies)
-
-# Again: interpret this:
-#  - what is the number of GOLDspecies?
-#  - what is the number of BLAST species?
-#  - how many species are present in both lists?
-#  - what does it mean if a species is in GOLD but not in the BLAST list?
-#  - what does it mean if a species has been found during BLAST, but it
-#    is not in GOLD?
-
-
-# =    5  Cleanup and finish  ==================================================
-
-# One final thing: some of the species will be our so-called "reference" species
-# which we use for model solutions and examples in the course. They are defined
-# in the .utilities.R file of this project. We remove them from the list so that
-# we don't inadvertently assign them.
+# Next we add the blast result to our sDat dataframe. We'll store the index,
+# the E-value, and the Query-bounds from which we can estimate which domains
+# of Mbp1 are actually covered by the hit. (True orthologues MUST align with
+# Mbp1's N-terminal APSES domain.)
 #
+# First we pull the hits we wanted from the BLASTspecies:
+iHits <- as.numeric(names(BLASTspecies))
+length(iHits)     # one index for each TRUE in sel

-REFspecies
+# add columns to sDat
+l <- nrow(sDat)
+sDat$iHit   <- numeric(l)  # index of the hit in the BLAST results
+sDat$eVal   <- numeric(l)  # E-value of the hit
+sDat$lAli   <- numeric(l)  # length of the aligned region

-MYSPEspecies <- sort(setdiff(MYSPEspecies, REFspecies))
+# extract and merge
+for (iHit in iHits) {
+  thisSp <- BLASThits$hits[[iHit]]$species
+  sel <- sDat$species == thisSp

-# save(MYSPEspecies, file = "data/MYSPEspecies.RData")
+  sDat$iHit[sel]   <- iHit
+  sDat$eVal[sel]   <- BLASThits$hits[[iHit]]$E
+  sDat$lAli[sel]   <- BLASThits$hits[[iHit]]$lengthAli
+}
+
+# Are all reference species accounted for?
+selA <- sDat$iHit != 0                 # all rows which matched to a BLAST hit
+REFspecies %in% sDat$species[selA]     # yes, all there
+
+selB <- sDat$species %in% REFspecies   # all rows which have one of REF species
+
+sum(selA & selB)   # How many rows?
+
+# sDat of course includes all duplicates. Some may be multiply sequenced, some
+# may be different strains. We'll use the same strategy as before and keep
+# only the best hit: order the rows by E-value, then drop all rows which
+# are duplicated.
+
+
+# drop all rows without BLAST hits ...
+sDat <- sDat[ ! (sDat$iHit == 0) , ]
+
+# order sDat by E-value ...
+sDat <- sDat[order(sDat$eVal, decreasing = FALSE) , ]
+
+# drop all rows with duplicated species ...
+sDat <- sDat[ ! duplicated(sDat$species) , ]
+
+# Lets look at the E-values ...
+plot(log(sDat$eVal[sDat$eVal > 0]), col = "#00CC00")
+
+# and alignment lengths ...
+plot(sDat$lAli, col = "#00DDAA")
+
+# How many ...
+length(unique(sDat$name))
+length(unique(sDat$species))
+length(unique(sDat$genus))
+length(unique(sDat$order))
+
+# To get the final dataset, we remove the reference species with their
+# entire orders ...
+REForders <- unique(sDat$order[sDat$species %in% REFspecies])
+sel <- sDat$order %in% REForders
+REFdat <- sDat[sel , ]
+sDat   <- sDat[ ! sel , ]
+
+# REFdat should now contain only the REFspecies ...
+( REFdat <- REFdat[REFdat$species %in% REFspecies , ] )
+
+# ... but all of them
+sum(REFspecies %in% REFdat$species)
+
+# ... and we have enough left in sDat to prune sDat to unique genus ...
+sDat <- sDat[ ! duplicated(sDat$genus) , ]
+
+# saveRDS(sDat, file = "data/sDat.rds")
+# saveRDS(REFdat, file = "data/REFdat.rds")



+# =    6  STUDENT NUMBERS  =====================================================
+#
+# An asymmetric function to retrieve a MYSPE species
+
+students <- read.csv("../BCH441-2020-students.csv")
+
+sN <- students$Student.Number
+range(sN)
+any(duplicated(gsub(".+(.......)$", "\\1", sN)))
+
+N <- 7
+x <- numeric(N)
+for (i in 1:N) {
+  x[i] <- H(substr(gsub(".+(.......)$", "\\1", sN), i, i))
+}
+plot(x, col = "#BB0000", type = "b")
+
+keys <- as.numeric(gsub(".+(....).$", "\\1", sN))
+any(duplicated(keys))
+
+# =====
+set.seed(112358)
+names(sN) <- sample(1:nrow(sDat), length(sN))
+
+MYSPEmap <- data.frame(keys = sprintf("%04d", 0:9999),
+                       iMYSPE = sample(1:nrow(sDat), 10000, replace = TRUE))
+rownames(MYSPEmap) <- MYSPEmap$keys
+
+for (i in 1:length(sN)) {
+  rMap <- gsub(".+(....).$", "\\1", sN[i])
+  MYSPEmap[rMap, "iMYSPE"] <- as.integer(names(sN)[i])
+}
+
+# saveRDS(MYSPEmap, "./data/MYSPEmap.rds")
+
+getMYSPE <- function(x) {
+  dat <- readRDS("./data/sDat.rds")
+  map <- readRDS("./data/MYSPEmap.rds")
+  key <- gsub(".+(....).$", "\\1", x)
+  return(dat$species[map[key, "iMYSPE"]])
+}
+
+# === validate
+l <- length(sN)
+sp <- character(l)
+for(i in 1:l) {
+  sp[i] <- getMYSPE(sN[i])
+}
+any(duplicated(sp))
+length(unique(sp))
+which(! sDat$species %in% sp)  # these can be assigned to late-comers
+
+# Done.

 # [END]
--- a/scripts/ABC-makeScCCnet.R
+++ b/scripts/ABC-makeScCCnet.R
@@ -119,10 +119,10 @@ scCCnet <- scCCnet[! duplicated(x), ]
 # length(unique(c(mySubnet$protein1, mySubnet$protein2)))  # 261, no change
 # Network has 261 nodes, 1280 edges

-save(scCCnet, file = "./data/scCCnet.RData")
+saveRDS(scCCnet, file = "./data/scCCnet.rds")

-# load("./data/scCCnet.RData")   # <<<- use this to load the object when
-                                 # needed
+# scCCnet <- readRDS("./data/scCCnet.rds")   # <<<- use this to restore the
+                                             #      object when needed


 # [END]
--- a/scripts/BLAST.R
+++ b/scripts/BLAST.R
@@ -4,23 +4,23 @@
 #          This script uses the BLAST URL-API
 #          (Application Programming Interface) at the NCBI.
 #          Read about the constraints here:
-#          https://ncbi.github.io/blast-cloud/dev/api.html
+#          https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
 #
 #
-# Version: 3.1
-# Date:    2016 09 - 2019 01
+# Version: 3.2
+# Date:    2016 09 - 2020 09
 # Author:  Boris Steipe
 #
 # Versions:
+#    3.2   2020 updates
 #    3.1   Change from require() to requireNamespace(),
 #          use <package>::<function>() idiom throughout
-#    3     parsing logic had not been fully implemented; Fixed.
+#    3.0   parsing logic had not been fully implemented; Fixed.
 #    2.1   bugfix in BLAST(), bug was blanking non-split deflines;
 #          refactored parseBLASTalignment() to handle lists with multiple hits.
 #    2.0   Completely rewritten because the interface completely changed.
 #          Code adpated in part from NCBI Perl sample code:
 #          $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
-#
 #    1.0   first version posted for BCH441 2016, based on BLAST - API
 #
 # ToDo:
@@ -31,47 +31,50 @@
 # ==============================================================================


-if (! requireNamespace(httr, quietly = TRUE)) {
+if (! requireNamespace("httr", quietly = TRUE)) {
  install.packages("httr")
 }


-BLAST <- function(q,
+BLAST <- function(Q,
                  db = "refseq_protein",
                  nHits = 30,
                  E = 0.1,
                  limits = "",
                  rid = "",
+                  query = "",
                  quietly = FALSE,
                  myTimeout = 120) {
    # Purpose:
    #     Basic BLAST search
    #
    # Parameters:
-    #     q: query - either a valid ID or a sequence
+    #     Q: query - either a valid ID or a sequence
    #     db: "refseq_protein" by default,
-    #         other legal valuses include: "nr", "pdb", "swissprot" ...
+    #         other legal values include: "nr", "pdb", "swissprot" ...
    #     nHits: number of hits to maximally return
    #     E: E-value cutoff. Do not return hits whose score would be expected
    #        to occur E or more times in a database of random sequence.
    #     limits: a valid ENTREZ filter
-    #     rid: a request ID - to retrieve earleir search results
+    #     rid: a request ID - to retrieve earlier search results
+    #     query: the actual query string (needed when retrieving results
+    #            with an rid)
    #     quietly: controls printing of wait-time progress bar
    #     timeout: how much longer _after_ rtoe to wait for a result
    #              before giving up (seconds)
    # Value:
-    #     result: list of resulting hits and some metadata
+    #     result: list of process status or resulting hits, and some metadata


    EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done

    results <- list()
+    results$query = query
    results$rid <- rid
    results$rtoe <- 0

-    if (rid == "") {  # if rid is not the empty string we skip the
-                      # initial search and and proceed directly to retrieval
-
+    if (rid == "") {  # If no rid is available, spawn a search.
+                      # Else, proceed directly to retrieval.

      # prepare query, GET(), and parse rid and rtoe from BLAST server response
      results$query <- paste0("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
@@ -141,7 +144,8 @@ BLAST <- function(q,

        if (myTimeout <= 0) { # abort
          cat("BLAST search not concluded before timeout. Aborting.\n")
-          cat(sprintf("You could check back later with rid \"%s\"\n",
+          cat(sprintf("%s  BLASThits <- BLAST(rid=\"%s\")\n",
+                      "Trying checking back later with >",
                      results$rid))
          return(results)
        }
@@ -370,7 +374,7 @@ if (FALSE) {
                nHits = 100,
                E = 0.001,
                rid = "",
-                limits = "txid4751[ORGN]")
+                limits = "txid4751[ORGN]")  # Fungi
  str(test)
  length(test$hits)
 }