2020 updates - deactivate for maintenance

2020-09-18 21:56:30 +10:00 · 2020-09-18 21:56:30 +10:00 · 37ef655d47
commit 37ef655d47
parent 89bdd14d1c
42 changed files with 447 additions and 243 deletions
--- a/.init.R
+++ b/.init.R
@ -1,42 +0,0 @@
-# .init.R
-# Functions to initialize this collection of learning units
-# Boris Steipe
-# ====================================================================
-
-# Create a local copy of myScript.R if required, and not been done yet.
-if (! file.exists("myScript.R") && file.exists(".tmp.R")) {
-    file.copy(".tmp.R", "myScript.R")
-}
-
-# If it doesn't exist yet, set up a profile:
-if (! file.exists(".myProfile.R")) {
-  # setup profile data
-  cat("\nPlease enter the requested values correctly, no spaces, and\n")
-  cat("press <enter>.\n")
-  e <- readline("Please enter your UofT eMail address: ")
-  n <- readline("Please enter your Student Number: ")
-
-  conn <- file(".myProfile.R")
-  writeLines(c(sprintf("myEMail <- \"%s\"", e),
-               sprintf("myStudentNumber <- %d", as.numeric(n))),
-             conn)
-  close(conn)
-  rm(e, n, conn)
-}
-
-# Patch YFO -> MYSPE if necessary:
-tmp <- readLines(".myProfile.R")
-if (length(grep("^YFO", tmp)) > 0) {
-  idx <- grep("^YFO", tmp)
-  tmp[idx] <- gsub("^YFO", "MYSPE", tmp[idx])
-  writeLines(tmp, ".myProfile.R")
-}
-rm(tmp)
-
-source(".myProfile.R")
-
-source(".utilities.R")
-
-file.edit("ABC-units.R")
-
-# [End]
--- a/.utilities.R
+++ b/.utilities.R
@ -181,19 +181,29 @@ fetchMSAmotif <- function(ali, mot) {

 # ====== PDB ID selection ======================================================

-selectPDBrep <- function(n) {
+selectPDBrep <- function(n, seed = as.numeric(Sys.time())) {
  # Select n PDB IDs from a list of high-resolution, non-homologous, single
  # domain, single chain structure files that represent a CATH topology
  # group.
-  # Parameters   n  num   number of IDs to return.
+  # Parameters:
+  #   n     num     number of IDs to return
+  #   seed  num     a seed for the RNG
+  #
  # Value:          char  PDB IDs
-  # Note: the list is loaded from an RData file in the data directory
+  #
+  # Note: the list is loaded from an RData file in the "./data" directory.
+  # If you use this function for a course submission, it MUST be invoked as:
+  #
+  #         selectPDBrep(n, seed = myStudentNumber)
+  #
+  # ... and myStudentNumber MUST be correctly initialized

  load("./data/pdbRep.RData")  # loads pdbRep
  if (n > length(pdbRep)) {
-    stop(sprintf("You can select no more than %d IDs.", length(pdbRep)))
+    stop(sprintf("There are only %d PDB IDs in the table to choose from.",
+                 length(pdbRep)))
  }
-  set.seed(as.numeric(Sys.time()))
+  set.seed(seed)
  return(sample(pdbRep, n))
 }

--- a/ABC-units.R
+++ b/ABC-units.R
@ -2,11 +2,16 @@
 #
 # Purpose: A Bioinformatics Course: R code for learning units
 #
-# Version: 0.1
+# Version: 4.0
 #
-# Date:    2017  08  18
+# Date:    2020  09  16
 # Author:  Boris Steipe (boris.steipe@utoronto.ca)
 #
+# Versions:
+# V 4.0    2020 version
+# V 3.0    2019 version
+# V 2.0    2018 version
+# V 1.0    2017 version
 # V 0.1    First code
 #
 # TODO:
@ -14,23 +19,19 @@
 #
 # == HOW TO WORK WITH LEARNING UNIT FILES ======================================
 #
-# Expect that the learning unit files will be continuously updated.
-#
+# The R-scripts and datasets in this project will be continuously updated,
+# and updates will be posted on GitHub. To bring your version into the latest
+# state use the Git-pane (top left) and "pull" (blue downward arrow) from the
+# repository. However, this will overwrite locally edited versions of files.

-# If you wish to edit any of the code, for example to add your own comments and
-# examples, save any edited version under a different name. Otherwise you will
-# have problems with git when you update the project to a new version.
+# To edit code and experiment with it, for example to add your own comments and
+# examples, save your edited version into the "myScripts" folder. Otherwise you
+# may have problems with git when you update the project to a new version. It's
+# good practice to change the filename, for example by prepending your initials.
+# This helps distinguish the files you are working with e.g. in a list of
+# recent files. For example if your name is Honjo Tasuku, your edited
+# BIN-Sequence.R might be named HT-BIN-Sequence.R

-# DO NOT SIMPLY  source()  THESE FILES!
-
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-#  going on. That's not how it works ...
-#
-# While this file itself should not be edited by you this is YOUR project
-# directory, and files that you create (notes etc.) will not be harmed when you
-# pull updated version of the master, or other new files, from github.
-#
 # If you pull from github and get the following type of error ...
 #     ---------------
 #     error: Your local changes to the following files would be
@ -41,8 +42,11 @@
 # ... then, you need to bring the offending file into its original state.
 # Open the Commit window, select the file, and click on the Revert button.
 #
-# Of course, you can save a local copy under a different name before you revert,
-# in case you want to keep your changes.
+# When working with these scripts, DO NOT SIMPLY  source()  THESE FILES!
+
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+#  going on. That's not how it works ...
 #
 #
 # ==============================================================================
--- a/BIN-ALI-BLAST.R
+++ b/BIN-ALI-BLAST.R
@ -1,4 +1,10 @@
-# BIN-ALI-BLAST.R
+# tocID <- "BIN-ALI-BLAST.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-ALI-BLAST unit.
--- a/BIN-ALI-Dotplot.R
+++ b/BIN-ALI-Dotplot.R
@ -1,4 +1,10 @@
-# BIN-ALI-Dotplot.R
+# tocID <- "BIN-ALI-Dotplot.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-ALI-Dotplot unit.
--- a/BIN-ALI-MSA.R
+++ b/BIN-ALI-MSA.R
@ -1,4 +1,10 @@
-# BIN-ALI-MSA.R
+# tocID <- "BIN-ALI-MSA.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-ALI-MSA unit.
--- a/BIN-ALI-Optimal_sequence_alignment.R
+++ b/BIN-ALI-Optimal_sequence_alignment.R
@ -1,4 +1,10 @@
-# BIN-ALI-Optimal_sequence_alignment.R
+# tocID <- "BIN-ALI-Optimal_sequence_alignment.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-ALI-Optimal_sequence_alignment unit.
--- a/BIN-ALI-Similarity.R
+++ b/BIN-ALI-Similarity.R
@ -1,4 +1,10 @@
-# BIN-ALI-Similarity.R
+# tocID <- "BIN-ALI-Similarity.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-ALI-Similarity unit.
--- a/BIN-Data_integration.R
+++ b/BIN-Data_integration.R
@ -1,4 +1,10 @@
-# BIN-Data_integration.R
+# tocID <- "BIN-Data_integration.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-Data_integration unit.
--- a/BIN-FUNC-Domain_annotation.R
+++ b/BIN-FUNC-Domain_annotation.R
@ -1,4 +1,10 @@
-# BIN-FUNC-Domain_annotation.R
+# tocID <- "BIN-FUNC-Domain_annotation.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-FUNC-Domain_annotation unit.
--- a/BIN-FUNC-Semantic_similarity.R
+++ b/BIN-FUNC-Semantic_similarity.R
@ -1,4 +1,10 @@
-# BIN-FUNC_Semantic_similarity.R
+# tocID <- "BIN-FUNC_Semantic_similarity.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-FUNC_Semantic_similarity unit.
@ -158,27 +164,9 @@ myEnr <- GOenrichment(mySet, allGenes)

 sort(myEnr$p.values)  # Any significantly enriched terms? All of these are ...

-#Yes: most significantly enriched is GO:0071931. What is this?
-getGOTerm("GO:0071931")  # ... makes sense.
+#Most significantly enriched is GO:0071931. What is this?
+annotate::getGOTerm("GO:0071931")  # ... makes sense.

-(fullSet <- myEnr$genes$`GO:0071931`)  # What genes are annotated to this term?
-
-intersect(mySet, fullSet) # These are in both sets
-setdiff(mySet, fullSet)   # These mySet members are not annotated to that term
-setdiff(fullSet, mySet)   # These are annotated to that term but not in mySet.
-                          # ... that's the most interesting set. From a set of
-                          # genes we have identified a function that they
-                          # share, and that shared function has allowed us
-                          # to identify
-
-# What are these genes?
-# Select annotations from the annotation database:
-AnnotationDbi::select(org.Sc.sgd.db,
-                      keys = setdiff(fullSet, mySet),
-                      columns = c("COMMON", "DESCRIPTION"))
-
-# Note that these annotations are partially redundant to several different
-# aliases of the same three genes.



--- a/BIN-MYSPE.R
+++ b/BIN-MYSPE.R
@ -1,4 +1,10 @@
-# BIN-MYSPE.R
+# tocID <- "BIN-MYSPE.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose: A Bioinformatics Course:
 #              R code accompanying the BIN-MYSPE unit
--- a/BIN-PHYLO-Data_preparation.R
+++ b/BIN-PHYLO-Data_preparation.R
@ -1,4 +1,10 @@
-# BIN-PHYLO-Data_preparation.R
+# tocID <- "BIN-PHYLO-Data_preparation.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-PHYLO-Data_preparation unit.
--- a/BIN-PHYLO-Tree_analysis.R
+++ b/BIN-PHYLO-Tree_analysis.R
@ -1,4 +1,10 @@
-# BIN-PHYLO-Tree_analysis.R
+# tocID <- "BIN-PHYLO-Tree_analysis.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-PHYLO-Tree_analysis unit.
--- a/BIN-PHYLO-Tree_building.R
+++ b/BIN-PHYLO-Tree_building.R
@ -1,4 +1,10 @@
-# BIN-PHYLO-Tree_building.R
+# tocID <- "BIN-PHYLO-Tree_building.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-PHYLO-Tree_building unit.
--- a/BIN-PPI-Analysis.R
+++ b/BIN-PPI-Analysis.R
@ -1,4 +1,10 @@
-# BIN-PPI-Analysis.R
+# tocID <- "BIN-PPI-Analysis.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-PPI-Analysis unit.
--- a/BIN-SEQA-Composition.R
+++ b/BIN-SEQA-Composition.R
@ -1,4 +1,10 @@
-# BIN-SEQA-Composition.R
+# tocID <- "BIN-SEQA-Composition.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose: A Bioinformatics Course:
 #              R code accompanying the BIN-SEQA-Comparison unit
--- a/BIN-Sequence.R
+++ b/BIN-Sequence.R
@ -1,4 +1,10 @@
-# BIN-Sequence.R
+# tocID <- "BIN-Sequence.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-Sequence unit.
--- a/BIN-Storing_data.R
+++ b/BIN-Storing_data.R
@ -1,4 +1,10 @@
-# BIN-Storing_data.R
+# tocID <- "BIN-Storing_data.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose: A Bioinformatics Course:
 #              R code accompanying the BIN-Storing_data unit
--- a/FND-Genetic_code.R
+++ b/FND-Genetic_code.R
@ -1,4 +1,10 @@
-# FND-Genetic_code.R
+# tocID <- "FND-Genetic_code.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the FND-Genetic_code unit.
--- a/FND-MAT-Graphs_and_networks.R
+++ b/FND-MAT-Graphs_and_networks.R
@ -1,4 +1,10 @@
-# FND-MAT-Graphs_and_networks.R
+# tocID <- "FND-MAT-Graphs_and_networks.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the FND-MAT-Graphs_and_networks unit.
@ -280,7 +286,7 @@ plot(GBA,
     vertex.color=heat.colors(max(igraph::degree(GBA)+1))[igraph::degree(GBA)+1],
     vertex.size = 200 + (30 * igraph::degree(GBA)),
     vertex.label = NA)
-par(oPar)                              # restore grphics state
+par(oPar)                              # restore graphics state

 # This is a very obviously different graph! Some biological networks have
 # features that look like that - but in my experience the hub nodes are usually
--- a/FND-STA-Information_theory.R
+++ b/FND-STA-Information_theory.R
@ -1,14 +1,21 @@
-# FND-STA-Information_theory.R
+# tocID <- "FND-STA-Information_theory.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the FND-STA-Information_theory unit.
 #
-# Version:  0.2
+# Version:  0.2.1
 #
-# Date:     2017  MM  DD
+# Date:     2017 - 2019
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
+#           0.2.1  Maintenance
 #           0.2    Under development
 #           0.1    First code copied from 2016 material.
 #
@ -58,11 +65,33 @@ AAref["Y"] <- 0.0294
 sum(AAref)

 # Function to calculate Shannon entropy
-H <- function(v) {
-  # Shannon entropy (bits)
-  return(-sum(v * (log(v) / log(2))))
+H <- function(pmf) {
+  # Calculate Shannon entropy
+  # Parameters:
+  #   pmf (numeric) probability mass function: a vector of states and
+  #                 associated probabilities. Each element of
+  #                 pmf must be in (0, 1] and sum(pmf) must be 1.
+  # Value:
+  #   Shannon entropy in bits.
+  # Examples:
+  #   H(c(A=0.25, C=0.25, G=0.25, T=0.25))  # 2 bits entropy in a random
+  #                                         # nucleotide sequence
+  #   H(1)     # A single state with probability 1 has zero entropy
+  # all.equal() returns TRUE or a character vector, never FALSE: use isTRUE().
+  if (any(pmf <= 0 | pmf > 1) || !isTRUE(all.equal(1.0, sum(pmf)))) {
+    stop("Input is not a discrete probability distribution.")
+  }
+  H <- -sum(pmf * (log(pmf) / log(2)))
+  return(H)
 }

+# Why use all.equal()? Exact comparisons with floating point numbers are
+# brittle. Consider for example:
+1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6 == 1
+print(1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6, digits = 22) # 0.9999999999999998889777
+# all.equal() tests for _near_ equality with tolerance of ~ 1.5e-8
+
+

 # Entropy of the database frequencies (in bits):
 (Href <- H(AAref))
--- a/FND-STA-Probability_distribution.R
+++ b/FND-STA-Probability_distribution.R
@ -1,4 +1,10 @@
-# FND-STA-Probability_distribution.R
+# tocID <- "FND-STA-Probability_distribution.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the FND-STA-Probability_distribution unit.
--- a/FND-STA-Significance.R
+++ b/FND-STA-Significance.R
@ -1,4 +1,10 @@
-# FND-STA-Significance.R
+# tocID <- "FND-STA-Significance.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the FND-STA-Significance unit.
--- a/README.md
+++ b/README.md
@ -1,2 +1,4 @@
 # ABC-units
 A Bioinformatics Course: R modules for learning units
+
+Follow the instructions in the learning unit to install your local copy of this R-project.
--- a/RPR-Biostrings.R
+++ b/RPR-Biostrings.R
@ -1,4 +1,10 @@
-# RPR-Biostrings.R
+# tocID <- "RPR-Biostrings.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the RPR-Biostrings unit.
--- a/RPR-FASTA.R
+++ b/RPR-FASTA.R
@ -1,4 +1,10 @@
-# RPR-FASTA.R
+# tocID <- "RPR-FASTA.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the RPR-FASTA unit.
--- a/RPR-GEO2R.R
+++ b/RPR-GEO2R.R
@ -1,4 +1,10 @@
-# RPR_GEO2R.R
+# tocID <- "RPR_GEO2R.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the RPR_GEO2R unit.
--- a/RPR-Genetic_code_optimality.R
+++ b/RPR-Genetic_code_optimality.R
@ -1,4 +1,10 @@
-# RPR-Genetic_code_optimality.R
+# tocID <- "RPR-Genetic_code_optimality.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the RPR-Genetic_code_optimality unit.
--- a/RPR-Introduction.R
+++ b/RPR-Introduction.R
@ -1,4 +1,10 @@
-# RPR-Introduction.R
+# tocID <- "RPR-Introduction.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose: A Bioinformatics Course:
 #              R code accompanying the RPR-Introduction unit
--- a/RPR-PROSITE_POST.R
+++ b/RPR-PROSITE_POST.R
@ -1,4 +1,10 @@
-# RPR-PROSITE_POST.R
+# tocID <- "RPR-PROSITE_POST.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the RPR-Scripting_data_downloads unit.
--- a/RPR-RegEx.R
+++ b/RPR-RegEx.R
@ -1,4 +1,10 @@
-# RPR-RegEx.R
+# tocID <- "RPR-RegEx.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose: A Bioinformatics Course:
 #              R code accompanying the RPR-RegEx unit
--- a/RPR-SX-PDB.R
+++ b/RPR-SX-PDB.R
@ -1,4 +1,10 @@
-# RPR-SX-PDB.R
+# tocID <- "RPR-SX-PDB.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the RPR-SX-PDB unit.
--- a/RPR-UniProt_GET.R
+++ b/RPR-UniProt_GET.R
@ -1,4 +1,10 @@
-# RPR-UniProt_GET.R
+# tocID <- "RPR-UniProt_GET.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the RPR-Scripting_data_downloads unit.
--- a/RPR-Unit_testing.R
+++ b/RPR-Unit_testing.R
@ -1,4 +1,10 @@
-# RPR-Unit_testing.R
+# tocID <- "RPR-Unit_testing.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the RPR-Unit_testing unit.
@ -29,10 +35,10 @@
 #TOC> 
 #TOC>   Section  Title                             Line
 #TOC> -------------------------------------------------
-#TOC>   1        Unit Tests with testthat            40
-#TOC>   2        Organizing your tests              159
-#TOC>   2.1        Testing scripts                  183
-#TOC>   3        Task solutions                     198
+#TOC>   1        Unit Tests with testthat            46
+#TOC>   2        Organizing your tests              165
+#TOC>   2.1        Testing scripts                  189
+#TOC>   3        Task solutions                     204
 #TOC> 
 #TOC> ==========================================================================

--- a/RPR-eUtils_XML.R
+++ b/RPR-eUtils_XML.R
@ -1,4 +1,10 @@
-# RPR-eUtils_and_XML.R
+# tocID <- "RPR-eUtils_and_XML.R"
+#
+# ---------------------------------------------------------------------------- #
+#  PATIENCE  ...                                                               #
+#    Do not yet work with this code. Updates in progress. Thank you.           #
+#    boris.steipe@utoronto.ca                                                  #
+# ---------------------------------------------------------------------------- #
 #
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the RPR-Scripting_data_downloads unit.
--- a/scripts/ABC-createRefDB.R
+++ b/scripts/ABC-createRefDB.R
@ -8,7 +8,7 @@
 # http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
 #
 # For the data model, see
-# https://docs.google.com/drawings/d/1uupNvz18_FYFwyyVPebTM0CUxcJCPDQuxuIJGpjWQWg
+# https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0
 # For the schema, see dbInit() in ./scripts/ABC-dbUtilities.R
 #
 # ==============================================================================
--- a/scripts/ABC-dbUtilities.R
+++ b/scripts/ABC-dbUtilities.R
@ -1,12 +1,35 @@
-# ABC-dbUtilities.R
-
+# tocID <- "scripts/ABC-dbUtilities.R"
+#
 # database utilities for ABC learning units
 #
 # ==============================================================================
-#


-# ====== PACKAGES ==============================================================
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                             Line
+#TOC> -------------------------------------------------
+#TOC>   1        PACKAGES                            32
+#TOC>   2        FUNCTIONS                           50
+#TOC>   2.01       dbSanitizeSequence()              53
+#TOC>   2.02       dbConfirmUnique()                 88
+#TOC>   2.03       dbInit()                         106
+#TOC>   2.04       dbAutoincrement()                147
+#TOC>   2.05       dbAddProtein()                   160
+#TOC>   2.06       dbAddFeature()                   180
+#TOC>   2.07       dbAddTaxonomy()                  199
+#TOC>   2.08       dbAddAnnotation()                215
+#TOC>   2.09       dbFetchUniProtSeq()              243
+#TOC>   2.10       dbFetchPrositeFeatures()         267
+#TOC>   2.11       node2text()                      311
+#TOC>   2.12       dbFetchNCBItaxData()             323
+#TOC>   2.13       UniProtIDmap()                   362
+#TOC>   3        TESTS                              399
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  PACKAGES  ============================================================


 if (! requireNamespace("jsonlite", quietly = TRUE)) {
@ -24,9 +47,10 @@ if (! requireNamespace("xml2", quietly = TRUE)) {
 }


-# ====== FUNCTIONS =============================================================
+# =    2  FUNCTIONS  ===========================================================


+# ==   2.01  dbSanitizeSequence()  =============================================
 dbSanitizeSequence <- function(s, unambiguous = TRUE) {
  # Remove FASTA header lines, if any,
  # flatten any structure that s has,
@ -61,6 +85,7 @@ dbSanitizeSequence <- function(s, unambiguous = TRUE) {
 }


+# ==   2.02  dbConfirmUnique()  ================================================
 dbConfirmUnique <- function(x) {
  # x is a vector of logicals.
  # returns x if x has exactly one TRUE element.
@ -78,24 +103,27 @@ dbConfirmUnique <- function(x) {
 }


+# ==   2.03  dbInit()  =========================================================
 dbInit <- function() {
  # Return an empty instance of the protein database
+  # Open the link and study the schema:
+  # https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0

  db <- list()

+  db$version <- "1.0"
+
  db$protein <- data.frame(
    ID = numeric(),
    name = character(),
    RefSeqID = character(),
    UniProtID = character(),
    taxonomyID = numeric(),
-    sequence = character(),
-    stringsAsFactors = FALSE)
+    sequence = character())

  db$taxonomy <- data.frame(
    ID = numeric(),
-    species = character(),
-    stringsAsFactors = FALSE)
+    species = character())


  db$annotation <- data.frame(
@ -103,21 +131,20 @@ dbInit <- function() {
    proteinID = numeric(),
    featureID = numeric(),
    start = numeric(),
-    end = numeric(),
-    stringsAsFactors = FALSE)
+    end = numeric())

  db$feature <- data.frame(
    ID = numeric(),
    name = character(),
    description = character(),
    sourceDB = character(),
-    accession = character(),
-    stringsAsFactors = FALSE)
+    accession = character())

  return(db)
 }


+# ==   2.04  dbAutoincrement()  ================================================
 dbAutoincrement <- function(tb) {
  # Return a unique integer that can be used as a primary key
  # Value:
@ -130,6 +157,7 @@ dbAutoincrement <- function(tb) {
 }


+# ==   2.05  dbAddProtein()  ===================================================
 dbAddProtein <- function(db, jsonDF) {
  # Add one or more protein entries to the database db.
  # Parameters:
@ -142,14 +170,14 @@ dbAddProtein <- function(db, jsonDF) {
                    RefSeqID    = jsonDF$RefSeqID[i],
                    UniProtID   = jsonDF$UniProtID[i],
                    taxonomyID  = jsonDF$taxonomyID[i],
-                    sequence    = dbSanitizeSequence(jsonDF$sequence[i]),
-                    stringsAsFactors = FALSE)
+                    sequence    = dbSanitizeSequence(jsonDF$sequence[i]))
    db$protein <- rbind(db$protein, x)
  }
  return(db)
 }


+# ==   2.06  dbAddFeature()  ===================================================
 dbAddFeature <- function(db, jsonDF) {
  # Add one or more feature entries to the database db.
  # Parameters:
@ -161,14 +189,14 @@ dbAddFeature <- function(db, jsonDF) {
                    name        = jsonDF$name[i],
                    description = jsonDF$description[i],
                    sourceDB    = jsonDF$sourceDB[i],
-                    accession   = jsonDF$accession[i],
-                    stringsAsFactors = FALSE)
+                    accession   = jsonDF$accession[i])
    db$feature <- rbind(db$feature, x)
  }
  return(db)
 }


+# ==   2.07  dbAddTaxonomy()  ==================================================
 dbAddTaxonomy <- function(db, jsonDF) {
  # Add one or more taxonomy entries to the database db.
  # Parameters:
@ -178,13 +206,13 @@ dbAddTaxonomy <- function(db, jsonDF) {
  for (i in seq_len(nrow(jsonDF))) {
    x <- data.frame(
      ID =  jsonDF$ID[i],
-      species = jsonDF$species[i],
-      stringsAsFactors = FALSE)
+      species = jsonDF$species[i])
    db$taxonomy <- rbind(db$taxonomy, x)
  }
  return(db)
 }

+# ==   2.08  dbAddAnnotation()  ================================================
 dbAddAnnotation <- function(db, jsonDF) {
  # Add one or more annotation entries to the database db.
  # Parameters:
@ -205,14 +233,14 @@ dbAddAnnotation <- function(db, jsonDF) {
                    proteinID = pID,
                    featureID = fID,
                    start     = as.integer(jsonDF$start[i]),
-                    end       = as.integer(jsonDF$end[i]),
-                    stringsAsFactors = FALSE)
+                    end       = as.integer(jsonDF$end[i]))
    db$annotation <- rbind(db$annotation, x)
  }
  return(db)
 }


+# ==   2.09  dbFetchUniProtSeq()  ==============================================
 dbFetchUniProtSeq <- function(ID) {
  # Fetch a protein sequence from UniProt.
  # Parameters:
@ -236,6 +264,7 @@ dbFetchUniProtSeq <- function(ID) {
 }


+# ==   2.10  dbFetchPrositeFeatures()  =========================================
 dbFetchPrositeFeatures <- function(ID) {
  # Fetch feature annotations from ScanProsite.
  # Parameters:
@ -272,14 +301,14 @@ dbFetchPrositeFeatures <- function(ID) {
                                     start =  as.numeric(tokens[4]),
                                     end   =  as.numeric(tokens[5]),
                                     psID  =  tokens[6],
-                                     psName = tokens[7],
-                                     stringsAsFactors = FALSE))
+                                     psName = tokens[7]))
    }
  }
  return(myFeatures)
 }


+# ==   2.11  node2text()  ======================================================
 node2text <- function(doc, tag) {
  # an extractor function for the contents of elements
  # between given tags in an XML response.
@ -291,6 +320,7 @@ node2text <- function(doc, tag) {
 }


+# ==   2.12  dbFetchNCBItaxData()  =============================================
 dbFetchNCBItaxData <- function(ID) {
  # Fetch feature taxID and Organism from the NCBI.
  # Parameters:
@ -329,6 +359,7 @@ dbFetchNCBItaxData <- function(ID) {



+# ==   2.13  UniProtIDmap()  ===================================================
 UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
  # Use UniProt ID mapping service to map one or more IDs
  # Parameters:
@ -351,8 +382,7 @@ UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {

  if (httr::status_code(response) == 200) { # 200: oK
    myMap <- read.delim(file = textConnection(httr::content(response)),
-                        sep = "\t",
-                        stringsAsFactors = FALSE)
+                        sep = "\t")
    myMap <- myMap[ , c(1,3)]
    colnames(myMap) <- c("From", "To")
  } else {
@ -366,7 +396,7 @@ UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
 }


-# ====== TESTS =================================================================
+# =    3  TESTS  ===============================================================

 if (FALSE) {
  if (! requireNamespace("testthat", quietly = TRUE)) {
--- a/scripts/ABC-makeScCCnet.R
+++ b/scripts/ABC-makeScCCnet.R
@ -1,4 +1,4 @@
-# ABC-makeScCCnet.R
+# tocID <- "scripts/ABC-makeScCCnet.R"
 #
 # Create a subnetwork of high-confidence yeast genes with a "mitotic cell cycle"
 # GOSlim annotation.
--- a/scripts/ABC-writeALN.R
+++ b/scripts/ABC-writeALN.R
@ -1,4 +1,4 @@
-# ABC-writeALN.R
+# tocID <- "scripts/ABC-writeALN.R"
 #
 # ToDo:    calculate consensus line
 #          append sequence numbers
--- a/scripts/ABC-writeMFA.R
+++ b/scripts/ABC-writeMFA.R
@ -40,7 +40,7 @@ writeMFA <- function(ali,
  if (is.na(blockWidth)) {
    stop("PANIC: parameter \"blockWidth\" must be numeric.")
  }
-  if (blockWidth < 1){
+  if (! blockWidth > 0){
    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
  }

@ -105,7 +105,7 @@ writeMFA <- function(ali,
    txt <- c(txt, "")  # append an empty line for readability
  }

-  writeLines(txt, con= myCon)
+  writeLines(txt, con = myCon)

 }

--- a/scripts/BLAST.R
+++ b/scripts/BLAST.R
@ -357,20 +357,23 @@ parseBLASTalignment <- function(hit) {

 # ==== TESTS ===================================================================

-# define query:
-# q   <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain
-#              "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
-#              "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
-#              sep="")
-# or ...
-# q <- "NP_010227" # refseq ID
-#
-# test <- BLAST(q,
-#               nHits = 100,
-#               E = 0.001,
-#               rid = "",
-#               limits = "txid4751[ORGN]")
-# length(test$hits)
+if (FALSE) {
+  # define query:
+  q   <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain
+               "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
+               "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
+               sep="")
+  # or ...
+  q <- "NP_010227" # refseq ID
+
+  test <- BLAST(q,
+                nHits = 100,
+                E = 0.001,
+                rid = "",
+                limits = "txid4751[ORGN]")
+  str(test)
+  length(test$hits)
+}

 # [END]