2020 updates - deactivate for maintenance

2020-09-18 21:56:30 +10:00 · 2020-09-18 21:56:30 +10:00 · 37ef655d47
commit 37ef655d47
parent 89bdd14d1c
42 changed files with 447 additions and 243 deletions
--- a/.init.R
+++ b/.init.R
@ -1,42 +0,0 @@
-# .init.R
-# Functions to initialize this collection of learning units
-# Boris Steipe
-# ====================================================================
-
-# Create a local copy of myScript.R if required, and not been done yet.
-if (! file.exists("myScript.R") && file.exists(".tmp.R")) {
-    file.copy(".tmp.R", "myScript.R")
-}
-
-# If it doesn't exist yet, set up a profile:
-if (! file.exists(".myProfile.R")) {
-  # setup profile data
-  cat("\nPlease enter the requested values correctly, no spaces, and\n")
-  cat("press <enter>.\n")
-  e <- readline("Please enter your UofT eMail address: ")
-  n <- readline("Please enter your Student Number: ")
-
-  conn <- file(".myProfile.R")
-  writeLines(c(sprintf("myEMail <- \"%s\"", e),
-               sprintf("myStudentNumber <- %d", as.numeric(n))),
-             conn)
-  close(conn)
-  rm(e, n, conn)
-}
-
-# Patch YFO -> MYSPE if necessary:
-tmp <- readLines(".myProfile.R")
-if (length(grep("^YFO", tmp)) > 0) {
-  idx <- grep("^YFO", tmp)
-  tmp[idx] <- gsub("^YFO", "MYSPE", tmp[idx])
-  writeLines(tmp, ".myProfile.R")
-}
-rm(tmp)
-
-source(".myProfile.R")
-
-source(".utilities.R")
-
-file.edit("ABC-units.R")
-
-# [End]
--- a/.utilities.R
+++ b/.utilities.R
@ -181,19 +181,29 @@ fetchMSAmotif <- function(ali, mot) {

 # ====== PDB ID selection ======================================================

-selectPDBrep <- function(n) {
+selectPDBrep <- function(n, seed = as.numeric(Sys.time())) {
  # Select n PDB IDs from a list of high-resolution, non-homologous, single
  # domain, single chain structure files that represent a CATH topology
  # group.
-  # Parameters   n  num   number of IDs to return.
+  # Parameters:
+  #   n     num     number of IDs to return
+  #   seed  num     a seed for the RNG
+  #
  # Value:          char  PDB IDs
-  # Note: the list is loaded from an RData file in the data directory
+  #
+  # Note: the list is loaded from an RData file in the "./data" directory.
+  # If you use this function for a course submissio, it MUST be invoked as:
+  #
+  #         selectPDBrep(n, seed = myStudentNumber)
+  #
+  # ... and myStudentNumber MUST be correctly initialized

  load("./data/pdbRep.RData")  # loads pdbRep
  if (n > length(pdbRep)) {
-    stop(sprintf("You can select no more than %d IDs.", length(pdbRep)))
+    stop(sprintf("There are only %d PDB IDs in the table to choose from.",
+                 length(pdbRep)))
  }
-  set.seed(as.numeric(Sys.time()))
+  set.seed(seed)
  return(sample(pdbRep, n))
 }

--- a/ABC-units.R
+++ b/ABC-units.R
@ -2,11 +2,16 @@
 #
 # Purpose: A Bioinformatics Course: R code for learning units
 #
-# Version: 0.1
+# Version: 4.0
 #
-# Date:    2017  08  18
+# Date:    2020  09  16
 # Author:  Boris Steipe (boris.steipe@utoronto.ca)
 #
+# Versions:
+# V 4.0    2020 version
+# V 3.0    2019 version
+# V 2.0    2018 version
+# V 1.0    2017 version
 # V 0.1    First code
 #
 # TODO:
@ -14,23 +19,19 @@
 #
 # == HOW TO WORK WITH LEARNING UNIT FILES ======================================
 #
-# Expect that the learning unit files will be continuously updated.
-#
+# The R-scripts and datasets in this project will be continuously updated,
+# and updates will be posted on GitHub. To bring your version into the latest
+# state use the Git-pane (top left) and "pull" (blue downward arrow) from the
+# repository. However, this will overwrite locally edited version of files.

-# If you wish to edit any of the code, for example to add your own comments and
-# examples, save any edited version under a different name. Otherwise you will
-# have problems with git when you update the project to a new version.
+# To edit code and experiment with it, for example to add your own comments and
+# examples, save your edited version into the "myScripts" folder. Otherwise you
+# may have problems with git when you update the project to a new version. It's
+# good practice to change the filename, for example by prepending your initials.
+# This helps distinguish the files you are working with e.g. in a list of
+# recent files. For example if your name is Honjo Tasuku, your edited
+# BIN-Sequence.R might be named HT-BIN-Sequence.R

-# DO NOT SIMPLY  source()  THESE FILES!
-
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-#  going on. That's not how it works ...
-#
-# While this file itself should not be edited by you this is YOUR project
-# directory, and files that you create (notes etc.) will not be harmed when you
-# pull updated version of the master, or other new files, from github.
-#
 # If you pull from github and get the following type of error ...
 #     ---------------
 #     error: Your local changes to the following files would be
@ -41,8 +42,11 @@
 # ... then, you need to bring the offending file into its original state.
 # Open the Commit window, select the file, and click on the Revert button.
 #
-# Of course, you can save a local copy under a different name before you revert,
-# in case you want to keep your changes.
+# When working with these script DO NOT SIMPLY  source()  THESE FILES!
+
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+#  going on. That's not how it works ...
 #
 #
 # ==============================================================================
--- a/BIN-ALI-BLAST.R
+++ b/BIN-ALI-BLAST.R
@ -1,4 +1,10 @@
-# BIN-ALI-BLAST.R
+# tocID <- "BIN-ALI-BLAST.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-ALI-BLAST unit.
@ -29,13 +35,13 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                               Line
 #TOC> ---------------------------------------------------
 #TOC>   1        Defining the APSES domain             42
 #TOC>   2        Executing the BLAST search            64
 #TOC>   3        Analysing results                     86
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


--- a/BIN-ALI-Dotplot.R
+++ b/BIN-ALI-Dotplot.R
@ -1,4 +1,10 @@
-# BIN-ALI-Dotplot.R
+# tocID <- "BIN-ALI-Dotplot.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-ALI-Dotplot unit.
@ -27,12 +33,12 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                  Line
 #TOC> --------------------------------------
 #TOC>   1        ___Section___            39
 #TOC>   2        Tasks                   187
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


--- a/BIN-ALI-MSA.R
+++ b/BIN-ALI-MSA.R
@ -1,4 +1,10 @@
-# BIN-ALI-MSA.R
+# tocID <- "BIN-ALI-MSA.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-ALI-MSA unit.
@ -29,7 +35,7 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                                              Line
 #TOC> ------------------------------------------------------------------
 #TOC>   1        Preparations                                         54
@ -47,7 +53,7 @@
 #TOC>   6        Sequence Logos                                      546
 #TOC>   6.1        Subsetting an alignment by motif                  555
 #TOC>   6.2        Plot a Sequence Logo                              604
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


@ -239,7 +245,7 @@ for (i in seq_along(highScoringRanges$lengths)) {
 #   -  adjust the sequence names
 #   -  convert to msaAAMultipleAlignment object

-# ===   4.1.1  importing an .aln file                   
+# ===   4.1.1  importing an .aln file

 # The seqinr package has a function to read CLUSTAL W formatted .aln files ...
 if (! requireNamespace("seqinr", quietly=TRUE)) {
--- a/BIN-ALI-Optimal_sequence_alignment.R
+++ b/BIN-ALI-Optimal_sequence_alignment.R
@ -1,4 +1,10 @@
-# BIN-ALI-Optimal_sequence_alignment.R
+# tocID <- "BIN-ALI-Optimal_sequence_alignment.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-ALI-Optimal_sequence_alignment unit.
--- a/BIN-ALI-Similarity.R
+++ b/BIN-ALI-Similarity.R
@ -1,4 +1,10 @@
-# BIN-ALI-Similarity.R
+# tocID <- "BIN-ALI-Similarity.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-ALI-Similarity unit.
@ -28,13 +34,13 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                          Line
 #TOC> ----------------------------------------------
 #TOC>   1        Amino Acid Properties            41
 #TOC>   2        Mutation Data matrix            158
 #TOC>   3        Background score                199
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


--- a/BIN-Data_integration.R
+++ b/BIN-Data_integration.R
@ -1,4 +1,10 @@
-# BIN-Data_integration.R
+# tocID <- "BIN-Data_integration.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-Data_integration unit.
@ -30,12 +36,12 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                             Line
 #TOC> -------------------------------------------------
 #TOC>   1        Identifier mapping                  42
 #TOC>   2        Cross-referencing tables           165
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


--- a/BIN-FUNC-Domain_annotation.R
+++ b/BIN-FUNC-Domain_annotation.R
@ -1,4 +1,10 @@
-# BIN-FUNC-Domain_annotation.R
+# tocID <- "BIN-FUNC-Domain_annotation.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-FUNC-Domain_annotation unit.
@ -25,7 +31,7 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                                                               Line
 #TOC> -----------------------------------------------------------------------------------
 #TOC>   1        Update your database script                                           41
@ -34,7 +40,7 @@
 #TOC>   1.1.2    If you HAVE done the BIN-ALI-Optimal_sequence_alignment               93
 #TOC>   1.2      Execute and Validate                                                 119
 #TOC>   2        Plot Annotations                                                     144
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


@ -90,7 +96,7 @@
 # Then SKIP the next section.
 #
 #
-# ===  1.1.2  If you HAVE done the BIN-ALI-Optimal_sequence_alignment         
+# ===  1.1.2  If you HAVE done the BIN-ALI-Optimal_sequence_alignment
 #
 #
 #   You DO already have a file called "<MYSPE>-Annotations.json" in the
--- a/BIN-FUNC-Semantic_similarity.R
+++ b/BIN-FUNC-Semantic_similarity.R
@ -1,4 +1,10 @@
-# BIN-FUNC_Semantic_similarity.R
+# tocID <- "BIN-FUNC_Semantic_similarity.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-FUNC_Semantic_similarity unit.
@ -28,14 +34,14 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                                                Line
 #TOC> --------------------------------------------------------------------
 #TOC>   1        Preparations: Packages, AnnotationDB, Setup            42
 #TOC>   2        Fetch GO Annotations                                   98
 #TOC>   3        Semantic Similarities                                 107
 #TOC>   4        GO Term Enrichment in Gene Sets                       125
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


@ -158,27 +164,9 @@ myEnr <- GOenrichment(mySet, allGenes)

 sort(myEnr$p.values)  # Any significantly enriched terms? All of these are ...

-#Yes: most significantly enriched is GO:0071931. What is this?
-getGOTerm("GO:0071931")  # ... makes sense.
+#Most significantly enriched is GO:0071931. What is this?
+annotate::getGOTerm("GO:0071931")  # ... makes sense.

-(fullSet <- myEnr$genes$`GO:0071931`)  # What genes are annotated to this term?
-
-intersect(mySet, fullSet) # These are in both sets
-setdiff(mySet, fullSet)   # These mySet members are not annotated to that term
-setdiff(fullSet, mySet)   # These are annotated to that term but not in mySet.
-                          # ... that's the most interesting set. From a set of
-                          # genes we have identified a function that they
-                          # share, and that shared function has allowed us
-                          # to identify
-
-# What are these genes?
-# Select annotations from the annotation database:
-AnnotationDbi::select(org.Sc.sgd.db,
-                      keys = setdiff(fullSet, mySet),
-                      columns = c("COMMON", "DESCRIPTION"))
-
-# Note that these annotations are partially redundant to several different
-# aliases of the same three genes.



--- a/BIN-MYSPE.R
+++ b/BIN-MYSPE.R
@ -1,4 +1,10 @@
-# BIN-MYSPE.R
+# tocID <- "BIN-MYSPE.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose: A Bioinformatics Course:
 #              R code accompanying the BIN-MYSPE unit
--- a/BIN-PHYLO-Data_preparation.R
+++ b/BIN-PHYLO-Data_preparation.R
@ -1,4 +1,10 @@
-# BIN-PHYLO-Data_preparation.R
+# tocID <- "BIN-PHYLO-Data_preparation.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-PHYLO-Data_preparation unit.
@ -29,7 +35,7 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                                     Line
 #TOC> ---------------------------------------------------------
 #TOC>   1        Preparations                                44
@ -37,7 +43,7 @@
 #TOC>   3        Multiple Sequence Alignment                117
 #TOC>   4        Reviewing and Editing Alignments           136
 #TOC>   4.1        Masking workflow                         152
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


--- a/BIN-PHYLO-Tree_analysis.R
+++ b/BIN-PHYLO-Tree_analysis.R
@ -1,4 +1,10 @@
-# BIN-PHYLO-Tree_analysis.R
+# tocID <- "BIN-PHYLO-Tree_analysis.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-PHYLO-Tree_analysis unit.
@ -31,7 +37,7 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                              Line
 #TOC> --------------------------------------------------
 #TOC>   1        Preparation and Tree Plot            46
@ -39,7 +45,7 @@
 #TOC>   2.1        Rooting Trees                     145
 #TOC>   2.2        Rotating Clades                   190
 #TOC>   2.3        Computing tree distances          241
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


--- a/BIN-PHYLO-Tree_building.R
+++ b/BIN-PHYLO-Tree_building.R
@ -1,4 +1,10 @@
-# BIN-PHYLO-Tree_building.R
+# tocID <- "BIN-PHYLO-Tree_building.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-PHYLO-Tree_building unit.
@ -29,7 +35,7 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                                       Line
 #TOC> -----------------------------------------------------------
 #TOC>   1        Calculating Trees                             46
@ -39,7 +45,7 @@
 #TOC>   1.1.3          ... on Linux                            96
 #TOC>   1.1.4          Confirming PROMLPATH                   101
 #TOC>   1.2        Building a maximum likelihood tree         110
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


@ -68,7 +74,7 @@ if (! requireNamespace("Rphylip", quietly = TRUE)) {
 # on your computer Phylip has been installed and define the path
 # to the proml program that calculates a maximum-likelihood tree.

-# ===   1.1.1  ... on the Mac                    
+# ===   1.1.1  ... on the Mac
 # On the Mac, the standard installation places a phylip folder
 # in the /Applications directory. That folder contains all the
 # individual phylip programs as <name>.app files. These are not
@ -79,7 +85,7 @@ if (! requireNamespace("Rphylip", quietly = TRUE)) {
 # directly to that subdirectory to find the program it needs:
 # PROMLPATH <- "/Applications/phylip-3.695/exe/proml.app/Contents/MacOS"

-# ===   1.1.2  ... on Windows                    
+# ===   1.1.2  ... on Windows
 # On Windows you need to know where the programs have been installed, and you
 # need to specify a path that is correct for the Windows OS. Find the folder
 # that is named "exe", and right-click to inspect its properties. The path
@ -93,12 +99,12 @@ if (! requireNamespace("Rphylip", quietly = TRUE)) {
 # I have heard that your path must not contain spaces, and it is prudent to
 # avoid other special characters as well.

-# ===   1.1.3  ... on Linux                      
+# ===   1.1.3  ... on Linux
 # If you are running Linux I trust you know what to do. It's probably
 # something like
 # PROMLPATH <- "/usr/local/phylip-3.695/bin"

-# ===   1.1.4  Confirming PROMLPATH              
+# ===   1.1.4  Confirming PROMLPATH
 # Confirm that the settings are right.
 PROMLPATH                # returns the path
 list.dirs(PROMLPATH)     # returns the directories in that path
--- a/BIN-PPI-Analysis.R
+++ b/BIN-PPI-Analysis.R
@ -1,4 +1,10 @@
-# BIN-PPI-Analysis.R
+# tocID <- "BIN-PPI-Analysis.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-PPI-Analysis unit.
@ -29,7 +35,7 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                                           Line
 #TOC> ---------------------------------------------------------------
 #TOC>   1        Setup and data                                    46
@ -39,7 +45,7 @@
 #TOC>   2.3        Betweenness Centrality                         180
 #TOC>   3        biomaRt                                          226
 #TOC>   4        Task for submission                              296
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


--- a/BIN-SEQA-Composition.R
+++ b/BIN-SEQA-Composition.R
@ -1,4 +1,10 @@
-# BIN-SEQA-Composition.R
+# tocID <- "BIN-SEQA-Composition.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose: A Bioinformatics Course:
 #              R code accompanying the BIN-SEQA-Comparison unit
@ -29,7 +35,7 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                                      Line
 #TOC> ----------------------------------------------------------
 #TOC>   1        Preparation                                  47
@ -40,7 +46,7 @@
 #TOC>   3.3        Plotting log ratios                       185
 #TOC>   3.4        Sort by frequency                         200
 #TOC>   3.5        Color by amino acid type                  215
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


--- a/BIN-Sequence.R
+++ b/BIN-Sequence.R
@ -1,4 +1,10 @@
-# BIN-Sequence.R
+# tocID <- "BIN-Sequence.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-Sequence unit.
@ -30,7 +36,7 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                                Line
 #TOC> ----------------------------------------------------
 #TOC>   1        Prepare                                63
@ -50,7 +56,7 @@
 #TOC>   7.2        Sampling                            306
 #TOC>   7.2.1          Equiprobable characters         308
 #TOC>   7.2.2          Defined probability vector      350
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


@ -171,16 +177,16 @@ cat(sprintf("\n%s fish", c("one", "two", "red", "blue")))

 # =    6  Changing strings  ====================================================

-# ===   6.1.1  Changing case              
+# ===   6.1.1  Changing case
 tolower(s)
 toupper(tolower(s))


-# ===   6.1.2  Reverse                    
+# ===   6.1.2  Reverse
 reverse(s)


-# ===   6.1.3  Change characters          
+# ===   6.1.3  Change characters
 # chartr(old, new, x) maps all characters in x that appear in "old" to the
 # correpsonding character in "new."

@ -208,7 +214,7 @@ chartr(myCypher, lett, x)
 # (Nb. substitution cyphers are easy to crack!)


-# ===   6.1.4  Substitute characters      
+# ===   6.1.4  Substitute characters
 (s <- gsub("IV", "i-v", s))  # gsub can change length, first argument is
                             # a "regular expression"!

@ -305,7 +311,7 @@ sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the

 # ==   7.2  Sampling  ==========================================================

-# ===   7.2.1  Equiprobable characters    
+# ===   7.2.1  Equiprobable characters

 # Assume you need a large random-nucleotide string for some statistical model.
 # How to create such a string? sample() can easily create it:
@ -347,7 +353,7 @@ length(unlist(x))
 # of the smaller number of Cs and Gs - before biology even comes into play. How
 # do we account for that?

-# ===   7.2.2  Defined probability vector 
+# ===   7.2.2  Defined probability vector

 # This is where we need to know how to create samples with specific probability
 # distributions. A crude hack would be to create a sampling source vector with
--- a/BIN-Storing_data.R
+++ b/BIN-Storing_data.R
@ -1,4 +1,10 @@
-# BIN-Storing_data.R
+# tocID <- "BIN-Storing_data.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose: A Bioinformatics Course:
 #              R code accompanying the BIN-Storing_data unit
@ -27,7 +33,7 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                                                   Line
 #TOC> -----------------------------------------------------------------------
 #TOC>   1        A Relational Datamodel in R: review                       57
@ -50,7 +56,7 @@
 #TOC>   3.3        Create an R script to create your own database         535
 #TOC>   3.3.1          Check and validate                                 555
 #TOC>   3.4        Task: submit for credit (part 2/2)                     596
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


@ -205,7 +211,7 @@ str(philDB)
 # go back, re-read, play with it, and ask for help. This is essential.


-# ===   1.1.1  completing the database                       
+# ===   1.1.1  completing the database


 # Next I'll add one more person, and create the other two tables:
@ -369,7 +375,7 @@ dbSanitizeSequence(x)

 # ==   2.3  Create a protein table for our data model  =========================

-# ===   2.3.1  Initialize the database                       
+# ===   2.3.1  Initialize the database


 # The function dbInit contains all the code to return a list of empty
@ -381,7 +387,7 @@ myDB <- dbInit()
 str(myDB)


-# ===   2.3.2  Add data                                      
+# ===   2.3.2  Add data


 # fromJSON() returns a dataframe that we can readily process to add data
@ -428,7 +434,7 @@ source("./scripts/ABC-createRefDB.R")
 str(myDB)


-# ===   2.4.1  Examples of navigating the database           
+# ===   2.4.1  Examples of navigating the database


 # You can look at the contents of the tables in the usual way we access
@ -552,7 +558,7 @@ myDB$taxonomy$species[sel]
 # in any of the JSON files. Later you will add more information ...


-# ===   3.3.1  Check and validate                            
+# ===   3.3.1  Check and validate


 # Is your protein named according to the pattern "MBP1_MYSPE"? It should be.
--- a/FND-Genetic_code.R
+++ b/FND-Genetic_code.R
@ -1,4 +1,10 @@
-# FND-Genetic_code.R
+# tocID <- "FND-Genetic_code.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the FND-Genetic_code unit.
@ -28,7 +34,7 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                                            Line
 #TOC> ----------------------------------------------------------------
 #TOC>   1        Storing the genetic code                           45
@ -38,7 +44,7 @@
 #TOC>   3        An alternative representation: 3D array           212
 #TOC>   3.1        Print a Genetic code table                      246
 #TOC>   4        Tasks                                             272
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


--- a/FND-MAT-Graphs_and_networks.R
+++ b/FND-MAT-Graphs_and_networks.R
@ -1,4 +1,10 @@
-# FND-MAT-Graphs_and_networks.R
+# tocID <- "FND-MAT-Graphs_and_networks.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the FND-MAT-Graphs_and_networks unit.
@ -29,7 +35,7 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                                        Line
 #TOC> ------------------------------------------------------------
 #TOC>   1        Review                                         50
@ -43,7 +49,7 @@
 #TOC>   4        RANDOM GRAPHS AND GRAPH METRICS               539
 #TOC>   4.1        Diameter                                    576
 #TOC>   5        GRAPH CLUSTERING                              645
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


@ -280,7 +286,7 @@ plot(GBA,
     vertex.color=heat.colors(max(igraph::degree(GBA)+1))[igraph::degree(GBA)+1],
     vertex.size = 200 + (30 * igraph::degree(GBA)),
     vertex.label = NA)
-par(oPar)                              # restore grphics state
+par(oPar)                              # restore graphics state

 # This is a very obviously different graph! Some biological networks have
 # features that look like that - but in my experience the hub nodes are usually
--- a/FND-STA-Information_theory.R
+++ b/FND-STA-Information_theory.R
@ -1,14 +1,21 @@
-# FND-STA-Information_theory.R
+# tocID <- "FND-STA-Information_theory.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the FND-STA-Information_theory unit.
 #
-# Version:  0.2
+# Version:  0.2.1
 #
-# Date:     2017  MM  DD
+# Date:     2017 - 2019
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
+#           0.2.1  Maintenance
 #           0.2    Under development
 #           0.1    First code copied from 2016 material.
 #
@ -58,11 +65,33 @@ AAref["Y"] <- 0.0294
 sum(AAref)

 # Function to calculate Shannon entropy
-H <- function(v) {
-  # Shannon entropy (bits)
-  return(-sum(v * (log(v) / log(2))))
+H <- function(pmf) {
+  # Calculate Shannon entropy
+  # Parameters:
+  #   pmf (numeric) probability mass function: a vector of states and
+  #                 associated probabilities. Each element of
+  #                 pmf must be in (0, 1] and sum(pmf) must be 1.
+  # Value:
+  #   Shannon entropy in bits.
+  # Examples:
+  #   H(c(A=0.25, C=0.25, G=0.25, T=0.25))  # 2 bits entropy in a random
+  #                                         # nucleotide sequence
+  #   H(1)     # If all elements are the same, entropy is zero
+  #
+  if (any(pmf <= 0 | pmf > 1) || isFALSE(all.equal(1.0, sum(pmf)))) {
+    stop("Input is not a discrete probability distribution.")
+  }
+  H <- -sum(pmf * (log(pmf) / log(2)))
+  return(H)
 }

+# Why use all.equal()? Exact comparisons with floating point numbers are
+# brittle. Consider for example:
+1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6 == 1
+print(1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6, digits = 22) # 0.9999999999999998889777
+# all.equal() tests for _near_ equality with tolerance of ~ 1.5e-8
+
+

 # Entropy of the database frequencies (in bits):
 (Href <- H(AAref))
--- a/FND-STA-Probability_distribution.R
+++ b/FND-STA-Probability_distribution.R
@ -1,4 +1,10 @@
-# FND-STA-Probability_distribution.R
+# tocID <- "FND-STA-Probability_distribution.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the FND-STA-Probability_distribution unit.
@ -28,7 +34,7 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                                                         Line
 #TOC> -----------------------------------------------------------------------------
 #TOC>   1        Introduction                                                    52
@ -45,7 +51,7 @@
 #TOC>   4.2.1          An example from tossing dice                             463
 #TOC>   4.2.2          An example from lognormal distributions                  586
 #TOC>   4.3        Kolmogorov-Smirnov test for continuous distributions         629
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


@ -460,7 +466,7 @@ chisq.test(countsL1, countsG1.9, simulate.p.value = TRUE, B = 10000)
 # be applied to discrete distributions. But we need to talk a bit about
 # converting counts to p.m.f.'s.

-# ===   4.2.1  An example from tossing dice                        
+# ===   4.2.1  An example from tossing dice

 #  The p.m.f of an honest die is (1:1/6, 2:1/6, 3:1/6, 4:1/6, 5:1/6, 6:1/6). But
 #  there is an issue when we convert sampled counts to frequencies, and estimate
@ -583,7 +589,7 @@ abline(v = KLdiv(rep(1/6, 6), pmfPC(counts, 1:6)), col="firebrick")
 # somewhat but not drastically atypical.


-# ===   4.2.2  An example from lognormal distributions             
+# ===   4.2.2  An example from lognormal distributions

 # We had compared a set of lognormal and gamma distributions above, now we
 # can use KL-divergence to quantify their similarity:
--- a/FND-STA-Significance.R
+++ b/FND-STA-Significance.R
@ -1,4 +1,10 @@
-# FND-STA-Significance.R
+# tocID <- "FND-STA-Significance.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the FND-STA-Significance unit.
@ -25,7 +31,7 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                                              Line
 #TOC> ------------------------------------------------------------------
 #TOC>   1        Significance and p-value                             43
@ -36,7 +42,7 @@
 #TOC>   3        Significance by integration                         198
 #TOC>   4        Significance by simulation or permutation           204
 #TOC>   5        Final tasks                                         312
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


@ -100,7 +106,7 @@ print(x, digits = 22)
 # curve, as a fraction of the whole.


-# ===   1.2.1  p-value illustrated                      
+# ===   1.2.1  p-value illustrated

 # Let's illustrate. First we draw a million random values from our
 # standard, normal distribution:
--- a/README.md
+++ b/README.md
@ -1,2 +1,4 @@
 # ABC-units
 A Bioinformatics Course: R modules for learning units
+
+Follow the instructions in the learning unit to install your local copy of this R-project.
--- a/RPR-Biostrings.R
+++ b/RPR-Biostrings.R
@ -1,4 +1,10 @@
-# RPR-Biostrings.R
+# tocID <- "RPR-Biostrings.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the RPR-Biostrings unit.
--- a/RPR-FASTA.R
+++ b/RPR-FASTA.R
@ -1,4 +1,10 @@
-# RPR-FASTA.R
+# tocID <- "RPR-FASTA.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the RPR-FASTA unit.
--- a/RPR-GEO2R.R
+++ b/RPR-GEO2R.R
@ -1,4 +1,10 @@
-# RPR_GEO2R.R
+# tocID <- "RPR_GEO2R.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the RPR_GEO2R unit.
@ -34,7 +40,7 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                                                      Line
 #TOC> --------------------------------------------------------------------------
 #TOC>   1        Preparations                                                 56
@ -49,7 +55,7 @@
 #TOC>   5.1        Final task: Gene descriptions                             504
 #TOC>   6        Improving on Discovery by Differential Expression           510
 #TOC>   7        Annotation data                                             594
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


--- a/RPR-Genetic_code_optimality.R
+++ b/RPR-Genetic_code_optimality.R
@ -1,4 +1,10 @@
-# RPR-Genetic_code_optimality.R
+# tocID <- "RPR-Genetic_code_optimality.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the RPR-Genetic_code_optimality unit.
@ -30,7 +36,7 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                                          Line
 #TOC> --------------------------------------------------------------
 #TOC>   1        Designing a computational experiment             57
@ -43,7 +49,7 @@
 #TOC>   2.2.4          measure effect                            213
 #TOC>   3        Run the experiment                              260
 #TOC>   4        Task solutions                                  356
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


@ -142,7 +148,7 @@ swappedGC <- function(GC) {
 #   - we count the number of mutations and evaluate their severity.


-# ===   2.2.1  reverse-translate                    
+# ===   2.2.1  reverse-translate

 # To reverse-translate an amino acid vector, we randomly pick one of its
 # codons from a genetic code, and assemble all codons to a sequence.
@ -167,7 +173,7 @@ traRev <- function(s, GC) {
 }


-# ===   2.2.2  Randomly mutate                      
+# ===   2.2.2  Randomly mutate

 # To mutate, we split a codon into it's three nucleotides, then randomly replace
 # one of the three with another nucleotide.
@ -192,7 +198,7 @@ randMut <- function(vC) {



-# ===   2.2.3  Forward- translate                   
+# ===   2.2.3  Forward- translate

 traFor <- function(vC, GC) {
  # Parameters:
@ -210,7 +216,7 @@ traFor <- function(vC, GC) {
  }


-# ===   2.2.4  measure effect                       
+# ===   2.2.4  measure effect

 # How do we evaluate the effect of the mutation? We'll take a simple ad hoc
 # approach: we divide amino acids into hydrophobic, hydrophilic, and neutral
--- a/RPR-Introduction.R
+++ b/RPR-Introduction.R
@ -1,4 +1,10 @@
-# RPR-Introduction.R
+# tocID <- "RPR-Introduction.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose: A Bioinformatics Course:
 #              R code accompanying the RPR-Introduction unit
--- a/RPR-PROSITE_POST.R
+++ b/RPR-PROSITE_POST.R
@ -1,4 +1,10 @@
-# RPR-PROSITE_POST.R
+# tocID <- "RPR-PROSITE_POST.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the RPR-Scripting_data_downloads unit.
@ -29,13 +35,13 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                                                 Line
 #TOC> ---------------------------------------------------------------------
 #TOC>   1        Constructing a POST command from a Web query            42
 #TOC>   1.1        Task - fetchPrositeFeatures() function               142
 #TOC>   2        Task solutions                                         150
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


--- a/RPR-RegEx.R
+++ b/RPR-RegEx.R
@ -1,4 +1,10 @@
-# RPR-RegEx.R
+# tocID <- "RPR-RegEx.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose: A Bioinformatics Course:
 #              R code accompanying the RPR-RegEx unit
--- a/RPR-SX-PDB.R
+++ b/RPR-SX-PDB.R
@ -1,4 +1,10 @@
-# RPR-SX-PDB.R
+# tocID <- "RPR-SX-PDB.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the RPR-SX-PDB unit.
--- a/RPR-UniProt_GET.R
+++ b/RPR-UniProt_GET.R
@ -1,4 +1,10 @@
-# RPR-UniProt_GET.R
+# tocID <- "RPR-UniProt_GET.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the RPR-Scripting_data_downloads unit.
@ -28,13 +34,13 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                                      Line
 #TOC> ----------------------------------------------------------
 #TOC>   1        UniProt files via GET                        41
 #TOC>   1.1        Task - fetchUniProtSeq() function         103
 #TOC>   2        Task solutions                              110
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


--- a/RPR-Unit_testing.R
+++ b/RPR-Unit_testing.R
@ -1,4 +1,10 @@
-# RPR-Unit_testing.R
+# tocID <- "RPR-Unit_testing.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the RPR-Unit_testing unit.
@ -29,10 +35,10 @@
 #TOC> 
 #TOC>   Section  Title                             Line
 #TOC> -------------------------------------------------
-#TOC>   1        Unit Tests with testthat            40
-#TOC>   2        Organizing your tests              159
-#TOC>   2.1        Testing scripts                  183
-#TOC>   3        Task solutions                     198
+#TOC>   1        Unit Tests with testthat            46
+#TOC>   2        Organizing your tests              165
+#TOC>   2.1        Testing scripts                  189
+#TOC>   3        Task solutions                     204
 #TOC> 
 #TOC> ==========================================================================

--- a/RPR-eUtils_XML.R
+++ b/RPR-eUtils_XML.R
@ -1,4 +1,10 @@
-# RPR-eUtils_and_XML.R
+# tocID <- "RPR-eUtils_and_XML.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work wih this code. Updates in progress. Thank you.            #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the RPR-Scripting_data_downloads unit.
@ -28,13 +34,13 @@


 #TOC> ==========================================================================
-#TOC> 
+#TOC>
 #TOC>   Section  Title                                       Line
 #TOC> -----------------------------------------------------------
 #TOC>   1        Working with NCBI eUtils                      41
 #TOC>   1.1        Task - fetchNCBItaxData() function         144
 #TOC>   2        Task solutions                               151
-#TOC> 
+#TOC>
 #TOC> ==========================================================================


--- a/scripts/ABC-createRefDB.R
+++ b/scripts/ABC-createRefDB.R
@ -8,7 +8,7 @@
 # http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
 #
 # For the data model, see
-# https://docs.google.com/drawings/d/1uupNvz18_FYFwyyVPebTM0CUxcJCPDQuxuIJGpjWQWg
+# https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0
 # For the schema, see dbInit() in ./scripts/ABC-dbUtilities.R
 #
 # ==============================================================================
--- a/scripts/ABC-dbUtilities.R
+++ b/scripts/ABC-dbUtilities.R
@ -1,12 +1,35 @@
-# ABC-dbUtilities.R
-
+# tocID <- "scripts/ABC-dbUtilities.R"
+#
 # database utilities for ABC learning units
 #
 # ==============================================================================
-#


-# ====== PACKAGES ==============================================================
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                             Line
+#TOC> -------------------------------------------------
+#TOC>   1        PACKAGES                            32
+#TOC>   2        FUNCTIONS                           50
+#TOC>   2.01       dbSanitizeSequence()              53
+#TOC>   2.02       dbConfirmUnique()                 88
+#TOC>   2.03       dbInit()                         106
+#TOC>   2.04       dbAutoincrement()                147
+#TOC>   2.05       dbAddProtein()                   160
+#TOC>   2.06       dbAddFeature()                   180
+#TOC>   2.07       dbAddTaxonomy()                  199
+#TOC>   2.08       dbAddAnnotation()                215
+#TOC>   2.09       dbFetchUniProtSeq()              243
+#TOC>   2.10       dbFetchPrositeFeatures()         267
+#TOC>   2.11       node2text()                      311
+#TOC>   2.12       dbFetchNCBItaxData()             323
+#TOC>   2.13       UniProtIDmap()                   362
+#TOC>   3        TESTS                              399
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  PACKAGES  ============================================================


 if (! requireNamespace("jsonlite", quietly = TRUE)) {
@ -24,9 +47,10 @@ if (! requireNamespace("xml2", quietly = TRUE)) {
 }


-# ====== FUNCTIONS =============================================================
+# =    2  FUNCTIONS  ===========================================================


+# ==   2.01  dbSanitizeSequence()  =============================================
 dbSanitizeSequence <- function(s, unambiguous = TRUE) {
  # Remove FASTA header lines, if any,
  # flatten any structure that s has,
@ -61,6 +85,7 @@ dbSanitizeSequence <- function(s, unambiguous = TRUE) {
 }


+# ==   2.02  dbConfirmUnique()  ================================================
 dbConfirmUnique <- function(x) {
  # x is a vector of logicals.
  # returns x if x has exactly one TRUE element.
@ -78,24 +103,27 @@ dbConfirmUnique <- function(x) {
 }


+# ==   2.03  dbInit()  =========================================================
 dbInit <- function() {
  # Return an empty instance of the protein database
+  # Open the link and study the schema:
+  # https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0

  db <- list()

+  db$version <- "1.0"
+
  db$protein <- data.frame(
    ID = numeric(),
    name = character(),
    RefSeqID = character(),
    UniProtID = character(),
    taxonomyID = numeric(),
-    sequence = character(),
-    stringsAsFactors = FALSE)
+    sequence = character())

  db$taxonomy <- data.frame(
    ID = numeric(),
-    species = character(),
-    stringsAsFactors = FALSE)
+    species = character())


  db$annotation <- data.frame(
@ -103,21 +131,20 @@ dbInit <- function() {
    proteinID = numeric(),
    featureID = numeric(),
    start = numeric(),
-    end = numeric(),
-    stringsAsFactors = FALSE)
+    end = numeric())

  db$feature <- data.frame(
    ID = numeric(),
    name = character(),
    description = character(),
    sourceDB = character(),
-    accession = character(),
-    stringsAsFactors = FALSE)
+    accession = character())

  return(db)
 }


+# ==   2.04  dbAutoincrement()  ================================================
 dbAutoincrement <- function(tb) {
  # Return a unique integer that can be used as a primary key
  # Value:
@ -130,6 +157,7 @@ dbAutoincrement <- function(tb) {
 }


+# ==   2.05  dbAddProtein()  ===================================================
 dbAddProtein <- function(db, jsonDF) {
  # Add one or more protein entries to the database db.
  # Parameters:
@ -142,14 +170,14 @@ dbAddProtein <- function(db, jsonDF) {
                    RefSeqID    = jsonDF$RefSeqID[i],
                    UniProtID   = jsonDF$UniProtID[i],
                    taxonomyID  = jsonDF$taxonomyID[i],
-                    sequence    = dbSanitizeSequence(jsonDF$sequence[i]),
-                    stringsAsFactors = FALSE)
+                    sequence    = dbSanitizeSequence(jsonDF$sequence[i]))
    db$protein <- rbind(db$protein, x)
  }
  return(db)
 }


+# ==   2.06  dbAddFeature()  ===================================================
 dbAddFeature <- function(db, jsonDF) {
  # Add one or more feature entries to the database db.
  # Parameters:
@ -161,14 +189,14 @@ dbAddFeature <- function(db, jsonDF) {
                    name        = jsonDF$name[i],
                    description = jsonDF$description[i],
                    sourceDB    = jsonDF$sourceDB[i],
-                    accession   = jsonDF$accession[i],
-                    stringsAsFactors = FALSE)
+                    accession   = jsonDF$accession[i])
    db$feature <- rbind(db$feature, x)
  }
  return(db)
 }


+# ==   2.07  dbAddTaxonomy()  ==================================================
 dbAddTaxonomy <- function(db, jsonDF) {
  # Add one or more taxonomy entries to the database db.
  # Parameters:
@ -178,13 +206,13 @@ dbAddTaxonomy <- function(db, jsonDF) {
  for (i in seq_len(nrow(jsonDF))) {
    x <- data.frame(
      ID =  jsonDF$ID[i],
-      species = jsonDF$species[i],
-      stringsAsFactors = FALSE)
+      species = jsonDF$species[i])
    db$taxonomy <- rbind(db$taxonomy, x)
  }
  return(db)
 }

+# ==   2.08  dbAddAnnotation()  ================================================
 dbAddAnnotation <- function(db, jsonDF) {
  # Add one or more annotation entries to the database db.
  # Parameters:
@ -205,14 +233,14 @@ dbAddAnnotation <- function(db, jsonDF) {
                    proteinID = pID,
                    featureID = fID,
                    start     = as.integer(jsonDF$start[i]),
-                    end       = as.integer(jsonDF$end[i]),
-                    stringsAsFactors = FALSE)
+                    end       = as.integer(jsonDF$end[i]))
    db$annotation <- rbind(db$annotation, x)
  }
  return(db)
 }


+# ==   2.09  dbFetchUniProtSeq()  ==============================================
 dbFetchUniProtSeq <- function(ID) {
  # Fetch a protein sequence from UniProt.
  # Parameters:
@ -236,6 +264,7 @@ dbFetchUniProtSeq <- function(ID) {
 }


+# ==   2.10  dbFetchPrositeFeatures()  =========================================
 dbFetchPrositeFeatures <- function(ID) {
  # Fetch feature annotations from ScanProsite.
  # Parameters:
@ -272,14 +301,14 @@ dbFetchPrositeFeatures <- function(ID) {
                                     start =  as.numeric(tokens[4]),
                                     end   =  as.numeric(tokens[5]),
                                     psID  =  tokens[6],
-                                     psName = tokens[7],
-                                     stringsAsFactors = FALSE))
+                                     psName = tokens[7]))
    }
  }
  return(myFeatures)
 }


+# ==   2.11  node2text()  ======================================================
 node2text <- function(doc, tag) {
  # an extractor function for the contents of elements
  # between given tags in an XML response.
@ -291,6 +320,7 @@ node2text <- function(doc, tag) {
 }


+# ==   2.12  dbFetchNCBItaxData()  =============================================
 dbFetchNCBItaxData <- function(ID) {
  # Fetch feature taxID and Organism from the NCBI.
  # Parameters:
@ -329,6 +359,7 @@ dbFetchNCBItaxData <- function(ID) {



+# ==   2.13  UniProtIDmap()  ===================================================
 UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
  # Use UniProt ID mapping service to map one or more IDs
  # Parameters:
@ -351,8 +382,7 @@ UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {

  if (httr::status_code(response) == 200) { # 200: oK
    myMap <- read.delim(file = textConnection(httr::content(response)),
-                        sep = "\t",
-                        stringsAsFactors = FALSE)
+                        sep = "\t")
    myMap <- myMap[ , c(1,3)]
    colnames(myMap) <- c("From", "To")
  } else {
@ -366,7 +396,7 @@ UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
 }


-# ====== TESTS =================================================================
+# =    3  TESTS  ===============================================================

 if (FALSE) {
  if (! requireNamespace("testthat", quietly = TRUE)) {
--- a/scripts/ABC-makeScCCnet.R
+++ b/scripts/ABC-makeScCCnet.R
@ -1,4 +1,4 @@
-# ABC-makeScCCnet.R
+# tocID <- "scripts/ABC-makeScCCnet.R"
 #
 # Create a subnetwork of high-confidence yeast genes with a "mitotic cell cycle"
 # GOSlim annotation.
--- a/scripts/ABC-writeALN.R
+++ b/scripts/ABC-writeALN.R
@ -1,4 +1,4 @@
-# ABC-writeALN.R
+# tocID <- "scripts/ABC-writeALN.R"
 #
 # ToDo:    calculate consensus line
 #          append sequence numbers
--- a/scripts/ABC-writeMFA.R
+++ b/scripts/ABC-writeMFA.R
@ -40,7 +40,7 @@ writeMFA <- function(ali,
  if (is.na(blockWidth)) {
    stop("PANIC: parameter \"blockWidth\" must be numeric.")
  }
-  if (blockWidth < 1){
+  if (! blockWidth > 0){
    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
  }

@ -105,7 +105,7 @@ writeMFA <- function(ali,
    txt <- c(txt, "")  # append an empty line for readability
  }

-  writeLines(txt, con= myCon)
+  writeLines(txt, con = myCon)

 }

--- a/scripts/BLAST.R
+++ b/scripts/BLAST.R
@ -357,20 +357,23 @@ parseBLASTalignment <- function(hit) {

 # ==== TESTS ===================================================================

-# define query:
-# q   <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain
-#              "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
-#              "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
-#              sep="")
-# or ...
-# q <- "NP_010227" # refseq ID
-#
-# test <- BLAST(q,
-#               nHits = 100,
-#               E = 0.001,
-#               rid = "",
-#               limits = "txid4751[ORGN]")
-# length(test$hits)
+if (FALSE) {
+  # define query:
+  q   <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain
+               "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
+               "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
+               sep="")
+  # or ...
+  q <- "NP_010227" # refseq ID
+
+  test <- BLAST(q,
+                nHits = 100,
+                E = 0.001,
+                rid = "",
+                limits = "txid4751[ORGN]")
+  str(test)
+  length(test$hits)
+}

 # [END]