Line termination change and old code.

2021-11-16 00:31:48 -05:00 · 2021-11-16 00:31:48 -05:00 · affe00f6fb
commit affe00f6fb
parent b1e00f52f7
86 changed files with 37873 additions and 37876 deletions
--- a/.Rprofile
+++ b/.Rprofile
@ -1,129 +1,129 @@
-# .Rprofile
-#
-# This script is automatically executed on startup
-# ==============================================================================
-
-init <- function() {
-
-  # Create a local copy of myScript.R if not done yet.
-  if (! file.exists("myScript.R") && file.exists(".tmp.R")) {
-    file.copy(".tmp.R", "myScript.R")
-    cat("A new file \"myScript.R\" was created. You can use it for\n")
-    cat("notes and code experiments.\n\n")
-  }
-
-  cat("\n\n")
-  cat("Please open the file \".myProfile.R\" (click on the file-name in the\n")
-  cat("\"files\" pane), edit it and save it.\n")
-  cat("Then click the checkbox, and use the More -> Move... dialogue\n")
-  cat("to move it into the \"myScripts\" folder.\n\n")
-
-  file.edit("ABC-units.R")
-  return(invisible(NULL))
-}
-
-if (! file.exists("./myScripts/.myProfile.R")) {
-  cat("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")
-  cat("    =================")
-  cat("\n\n")
-  cat("        WELCOME !\n")
-  cat("\n")
-  cat("  Type  'init()'  to begin\n\n")
-  cat("\n")
-  cat("    =================")
-  cat("\n\n")
-
-} else {  # local profile exists ... validate state:
-  cat("\n\nLoading local functions ...")
-
-  source(".utilities.R")  # local profile appears sane, source utilities
-  source("./myScripts/.myProfile.R")
-
-  if (! exists("myEMail")) {  # ... has eMail been defined?
-    cat("ERROR !\n")
-    cat("=======\n")
-    cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
-    cat("the variable \"myEMail\" was not loaded.\n")
-    cat("Please contact your instructor to continue.\n\n")
-  }
-  if (! exists("myStudentNumber")) {  # ... has the Student Number been defined?
-    cat("ERROR !\n")
-    cat("=======\n")
-    cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
-    cat("the variable \"myStudentNumber\" was not loaded.\n")
-    cat("Please contact your instructor to continue.\n\n")
-  }
-  if (! grepl("^(100.{7})|(99.{7})$", as.character(myStudentNumber))) {
-    cat("ERROR !\n")                 # is the Student Number valid?
-    cat("=======\n")
-    cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
-    cat("your Student Number could not be validated.\n")
-    cat("Please examine the file \"./myScripts/.myProfile.R\"\n")
-    cat(" and fix the problem or contact your instructor to continue.\n\n")
-  }
-
-
-  if (! exists("MYSPE")) {  # if MYSPE has not yet been defined, define it now
-                            # ... and write it into the profile.
-       prf <- readLines("./myScripts/.myProfile.R")
-       iEmail <- grep("^\\s*myStudentNumber\\s*<-", prf)
-       out <- prf[1:iEmail]
-       out <- c(out, sprintf("MYSPE <- \"%s\" ",
-                             getMYSPE(myStudentNumber)))
-       out <- c(out, prf[(iEmail+1):length(prf)])
-       writeLines(out, "./myScripts/.myProfile.R")
-
-       cat("\n")
-       cat(sprintf("MYSPE (%s) was added to \"./myScripts/.myProfile.R\"\n\n",
-                   getMYSPE(myStudentNumber)))
-       MYSPE <- getMYSPE(myStudentNumber)  # ... define it for immediate use
-       rm(prf, iEmail, out)                # cleanup
-  }
-  cat("... done.\n\n")
-}
-
-if (default.stringsAsFactors()) {
-  cat("WARNING.\n")
-  cat("========\n")
-  cat("Your default \"stringsAsFactors\" parameter is set to \"TRUE\".\n")
-  cat("This will break some of the code.\n")
-  cat("Please contact your instructor to troubleshoot and fix this issue.\n")
-  cat("\n")
-}
-
-errText <- list()
-errText[["noProfileFile"]] <- '
-Your PROFILE FILE does not exist. This problem must be fixed to continue.
-
-  The code expects the file "./myScripts/.myProfile.R" to exist and to
-  contain your correct eMail address and student number. Detailed
-  instructions were given when you first ran the init() command.
-
-  Try running init() again and follow the instructions. Reload youR RStudio
-  session and start over with this file.
-
-  If this does not fix the problem, ask for help.
-'
-
-errText[["noStudentNumber"]] <- '
-Your STUDENT NUMBER has not been defined. This problem must be fixed to continue.
-
-  The code expects the file "./myScripts/.myProfile.R" to exist and to
-  contain your correct eMail address and student number. This file gets
-  sourced when you start a new R-session, but since you see this error
-  message there was a problem.
-
-  Perhaps you need to restart your R-session. Try closing the RStudio
-  project and reopening it from the File > Recent Projects menu.
-
-  Perhaps there was a syntax error in your file. Then not all the
-  instructions in the file are executed. Check the file: is your
-  email perhpas not defined? Or did you type it without qwuoataion
-  marks?
-
-  Try fixing problems, and then restart R as described above.
-
-  If none of this fixes the problem, ask for help.
-'
-
-# [END]
+# .Rprofile
+#
+# This script is automatically executed on startup
+# ==============================================================================
+
+init <- function() {
+
+  # Create a local copy of myScript.R if not done yet.
+  if (! file.exists("myScript.R") && file.exists(".tmp.R")) {
+    file.copy(".tmp.R", "myScript.R")
+    cat("A new file \"myScript.R\" was created. You can use it for\n")
+    cat("notes and code experiments.\n\n")
+  }
+
+  cat("\n\n")
+  cat("Please open the file \".myProfile.R\" (click on the file-name in the\n")
+  cat("\"files\" pane), edit it and save it.\n")
+  cat("Then click the checkbox, and use the More -> Move... dialogue\n")
+  cat("to move it into the \"myScripts\" folder.\n\n")
+
+  file.edit("ABC-units.R")
+  return(invisible(NULL))
+}
+
+if (! file.exists("./myScripts/.myProfile.R")) {
+  cat("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")
+  cat("    =================")
+  cat("\n\n")
+  cat("        WELCOME !\n")
+  cat("\n")
+  cat("  Type  'init()'  to begin\n\n")
+  cat("\n")
+  cat("    =================")
+  cat("\n\n")
+
+} else {  # local profile exists ... validate state:
+  cat("\n\nLoading local functions ...")
+
+  source(".utilities.R")  # local profile appears sane, source utilities
+  source("./myScripts/.myProfile.R")
+
+  if (! exists("myEMail")) {  # ... has eMail been defined?
+    cat("ERROR !\n")
+    cat("=======\n")
+    cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
+    cat("the variable \"myEMail\" was not loaded.\n")
+    cat("Please contact your instructor to continue.\n\n")
+  }
+  if (! exists("myStudentNumber")) {  # ... has the Student Number been defined?
+    cat("ERROR !\n")
+    cat("=======\n")
+    cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
+    cat("the variable \"myStudentNumber\" was not loaded.\n")
+    cat("Please contact your instructor to continue.\n\n")
+  }
+  # Is the Student Number valid? The whole string must be either "100" plus
+  # seven more characters or "99" plus seven more characters. The outer group
+  # makes "^" and "$" apply to both alternatives; without it "^" anchors only
+  # the first alternative and "$" only the second, so over-long or prefixed
+  # numbers would pass validation.
+  if (! grepl("^((100.{7})|(99.{7}))$", as.character(myStudentNumber))) {
+    cat("ERROR !\n")
+    cat("=======\n")
+    cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
+    cat("your Student Number could not be validated.\n")
+    cat("Please examine the file \"./myScripts/.myProfile.R\"\n")
+    cat(" and fix the problem or contact your instructor to continue.\n\n")
+  }
+
+
+  if (! exists("MYSPE")) {  # if MYSPE has not yet been defined, define it now
+                            # ... and write it into the profile.
+       prf <- readLines("./myScripts/.myProfile.R")
+       iEmail <- grep("^\\s*myStudentNumber\\s*<-", prf)
+       out <- prf[1:iEmail]
+       out <- c(out, sprintf("MYSPE <- \"%s\" ",
+                             getMYSPE(myStudentNumber)))
+       out <- c(out, prf[(iEmail+1):length(prf)])
+       writeLines(out, "./myScripts/.myProfile.R")
+
+       cat("\n")
+       cat(sprintf("MYSPE (%s) was added to \"./myScripts/.myProfile.R\"\n\n",
+                   getMYSPE(myStudentNumber)))
+       MYSPE <- getMYSPE(myStudentNumber)  # ... define it for immediate use
+       rm(prf, iEmail, out)                # cleanup
+  }
+  cat("... done.\n\n")
+}
+
+if (default.stringsAsFactors()) {
+  cat("WARNING.\n")
+  cat("========\n")
+  cat("Your default \"stringsAsFactors\" parameter is set to \"TRUE\".\n")
+  cat("This will break some of the code.\n")
+  cat("Please contact your instructor to troubleshoot and fix this issue.\n")
+  cat("\n")
+}
+
+errText <- list()
+errText[["noProfileFile"]] <- '
+Your PROFILE FILE does not exist. This problem must be fixed to continue.
+
+  The code expects the file "./myScripts/.myProfile.R" to exist and to
+  contain your correct eMail address and student number. Detailed
+  instructions were given when you first ran the init() command.
+
+  Try running init() again and follow the instructions. Reload youR RStudio
+  session and start over with this file.
+
+  If this does not fix the problem, ask for help.
+'
+
+errText[["noStudentNumber"]] <- '
+Your STUDENT NUMBER has not been defined. This problem must be fixed to continue.
+
+  The code expects the file "./myScripts/.myProfile.R" to exist and to
+  contain your correct eMail address and student number. This file gets
+  sourced when you start a new R-session, but since you see this error
+  message there was a problem.
+
+  Perhaps you need to restart your R-session. Try closing the RStudio
+  project and reopening it from the File > Recent Projects menu.
+
+  Perhaps there was a syntax error in your file. Then not all the
+  instructions in the file are executed. Check the file: is your
+  email perhpas not defined? Or did you type it without qwuoataion
+  marks?
+
+  Try fixing problems, and then restart R as described above.
+
+  If none of this fixes the problem, ask for help.
+'
+
+# [END]
--- a/.gitignore
+++ b/.gitignore
@ -1,44 +1,44 @@
-# Miscellaneous
-.Ds_store
-instructor/
-dev/
-# myScripts/ # We don't want to ignore this so we can save our work to our own fork.
-
-# History files
-.Rhistory
-.Rapp.history
-
-# Session Data files
-# .RData
-
-# Files produced in assingments
-data/APSESphyloSet.mfa
-data/APSEStreeRproml.rds
-
-# Example code in package build process
-*-Ex.R
-
-# Output files from R CMD build
-/*.tar.gz
-
-# Output files from R CMD check
-/*.Rcheck/
-
-# RStudio files
-.Rproj.user/
-
-# produced vignettes
-vignettes/*.html
-vignettes/*.pdf
-
-# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
-.httr-oauth
-
-# knitr and R markdown default cache directories
-/*_cache/
-/cache/
-
-# Temporary files created by R markdown
-*.utf8.md
-*.knit.md
-.Rproj.user
+# Miscellaneous
+.Ds_store
+instructor/
+dev/
+# myScripts/ # We don't want to ignore this so we can save our work to our own fork.
+
+# History files
+.Rhistory
+.Rapp.history
+
+# Session Data files
+# .RData
+
+# Files produced in assignments
+data/APSESphyloSet.mfa
+data/APSEStreeRproml.rds
+
+# Example code in package build process
+*-Ex.R
+
+# Output files from R CMD build
+/*.tar.gz
+
+# Output files from R CMD check
+/*.Rcheck/
+
+# RStudio files
+.Rproj.user/
+
+# produced vignettes
+vignettes/*.html
+vignettes/*.pdf
+
+# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
+.httr-oauth
+
+# knitr and R markdown default cache directories
+/*_cache/
+/cache/
+
+# Temporary files created by R markdown
+*.utf8.md
+*.knit.md
+.Rproj.user
--- a/.tmp.R
+++ b/.tmp.R
@ -1,38 +1,38 @@
-# myScript.R
-#
-# --- As you work with this file, you can delete the instructions below --------
-# Write your notes and code experiments into this document. Save it
-# from time to time - however I recommend that you do not _commit_
-# your saved version.
-#
-# As long as you do not _commit_ this script to version control,
-# you can _pull_ updated versions of the entire project from GitHub
-# by using the RStudio version control interface. However, once
-# you _commit_ any file in your local version, RStudio will require
-# you to resolve conflicts before you can _pull_ updates.
-# --- As you work with this file, you can delete the instructions above --------
-#
-## Purpose: <...>
-#
-# Version: <...>
-#
-# Date:    <...>
-# Author:  <Name> (<namee@mail.utoronto.ca>)
-#
-# Versions:
-#
-#   <number>    <Features>
-#
-# TODO:
-#   <...>
-#
-# ====================================================================
-
-
-
-
-
-
-
-# [END]
-
+# myScript.R
+#
+# --- As you work with this file, you can delete the instructions below --------
+# Write your notes and code experiments into this document. Save it
+# from time to time - however I recommend that you do not _commit_
+# your saved version.
+#
+# As long as you do not _commit_ this script to version control,
+# you can _pull_ updated versions of the entire project from GitHub
+# by using the RStudio version control interface. However, once
+# you _commit_ any file in your local version, RStudio will require
+# you to resolve conflicts before you can _pull_ updates.
+# --- As you work with this file, you can delete the instructions above --------
+#
+## Purpose: <...>
+#
+# Version: <...>
+#
+# Date:    <...>
+# Author:  <Name> (<namee@mail.utoronto.ca>)
+#
+# Versions:
+#
+#   <number>    <Features>
+#
+# TODO:
+#   <...>
+#
+# ====================================================================
+
+
+
+
+
+
+
+# [END]
+
--- a/.utilities.R
+++ b/.utilities.R
--- a/2021-10-12_In-Class_exploration.R
+++ b/2021-10-12_In-Class_exploration.R
@ -1,257 +1,257 @@
-# 2021-10-12_In-Class_exploration.R
-#
-#         =====  T H E   E V E N   B E T T E R   A M I N O   A C I D =====
-#
-# Code and comments for BCH441 in-class exploration, Tuesday, 2021-10-12
-# Explorers:  Jocelyn Nurtanto, Yuzi Li, and  Jerry Gu
-# Scribe:     boris.steipe@utoronto.ca
-#
-# ==============================================================================
-#
-# In our last session we explored some properties of amino acids and noted that
-# we can arrange them in a scatter-plot according to some properties. But can
-# we also arrange them according to generic properties, i.e. taking all
-# published property scales into account? We will try to use all tables from
-# the seqinr package.
-
-# First we load the package - this makes all datasets immediately available and
-# we don't have to load them one by one.
-
-library(seqinr)
-
-# Determine what datasets are available
-#
-# Using "find in topic" ... "amino acid"
-data(aacost)
-data(aaindex)
-data(pK)
-
-# We note that datasets may be sorted in different ways: for example
-# alphabetically by one letter code (A, C, D, E, ...) or three-letter code (Ala,
-# Arg, Asn, Asp, ...) - this means we need to ensure and validate that amino
-# acids are sorted in the same way.
-
-# Build a datastructure ...
-# rows: amino acids
-# columns: properties
-
-# Are all lists in aaindex organized in the same way?
-
-refNames <- names(aaindex[[1]]$I) # Take the rownames of the first list item
-                                  # index as a reference list
-
-# Loop over each list in aaindex
-for (i in 1:length(aaindex)) {
-#   get the I-vector
-  x <- aaindex[[i]]$I
-#   get the names
-  x <- names(x)
-#   compare with the names of our reference list
-#   the == and != operators are vectorized. Applying them to two vectors
-#   gives TRUE or FALSE for each pair of elements. any() or all() can be
-#   applied to logical vectors to anylise them and return a soingle result.
-#   if (...) conditions evaluate only a single value and will throw a warning if
-#   there is more than one.
-
-  if (any(x != refNames)) {
-    # There was at least one not-equal pair - so: complain
-    print(sprintf("Problem in list %d: names don't match", i))
-  }
-}
-
-# If we get here without identifying problems, it means all pairs of
-# rownames match throughout the aainfex list.
-
-
-# Next: what is the cvorrect syntax to add one vector (the "I" vector of
-# one of the list elements) to our dataframe?
-aaData <- as.data.frame(aaindex[[1]]$I) # Make a dataframe from the first index
-aaData[,2] <- aaindex[[2]]$I            # ... add the secondf index
-
-str(aaData)  # Confirm: we now have a two-column dataframe
-
-# Next: add the rest ...
-for (i in 3:length(aaindex)) {
-  #   get the I-vector and write it into our dataframe
-  aaData[,i] <- aaindex[[i]]$I
-}
-
-# Sanity check
-plot(aaData[,37], aaData[,544])  # plot two arbitray inices against each other
-
-# Looks good.
-
-# We finished building our data structure ... but let's add the aacost table
-# aacost is ordered differently:
-rownames(aaData)
-aacost[ , 1]
-
-# using order(), applied to aacost - ordering the column with column-name
-# "aaa"
-sel <- order(aacost[ , "aaa"])  # alphebetic ordering of three-letter codes
-aacost[sel, "aaa"] # applying the order vector sorts the column
-
-# Is this the same order as refNames?
-refNames == aacost[sel, "aaa"]  # Yes!
-
-# add the data from column "tot" (i.e. total metabolic cost) after the
-# last column of aaData
-aaData[ , length(aaindex) + 1] <- aacost[sel, "tot"]
-
-# Done.
-str(aaData)  # A dataframe with 20 rows and 545 columns
-
-# To answer the question "Which amino acids are similar to each other?" we
-# need to reduce this 545-dimensional dataset to fewer dimensions, otherwise
-# we will succumb to the "Curse of Dimensionality":
-#
-#    "in high dimensional data, however, all objects appear
-#     to be sparse and dissimilar in many ways..."
-#                   https://en.wikipedia.org/wiki/Curse_of_dimensionality
-#
-# A classic way to do this is Principal Component Analysis (PCA) ...
-# (Principal components analysis)
-#
-# PCA expects objects in columns, properties in rows. Therefore we need to
-# transpose our dataset:
-
-aaPCA <- prcomp(t(aaData))
-
-# This creates an error, because some of our indicews contain NA values!
-# Which indices are this?
-
-# We create a vector "sel" for which we check whether any element in each
-# column is NA, and write FALSE if we encounter an NA, TRUE otherwise. We can
-# then use this vector to subset ourt dataframe.
-
-sel <- logical()
-
-for (i in 1:ncol(aaData)) {         # for each index
-  if (any(is.na(aaData[,i]))) {     #   if there is any NA value ...
-    sel <- c(sel, FALSE)            #     add a FALSE element to the vector
-  } else {                          #   else
-    sel <- c(sel, TRUE)             #     add a TRUE element
-  }
-}
-
-# Done. sel now subsets only the NA-free columns
-545 - sum(sel)                      # 13 columns excluded
-
-# Do the PCA ... use the prcomp() function
-aaPCA <- prcomp(t(aaData[ ,sel]))   # PCA of the transposed, selected data set
-
-str(aaPCA)   # structure of the result
-
-plot(aaPCA)                         # plot the contributions of the
-                                    # components to the variance
-
-plot(aaPCA$rotation[ , 1],          # plot the first PC against the second PC
-     aaPCA$rotation[ , 2],          # in a scatterplot, in an empty frame
-     type ="n")                     # just to set up the coordinate system
-
-text(aaPCA$rotation[ , 1],          # plot the names of the amino acids into
-     aaPCA$rotation[ , 2],          # their respective (PC1, PC2) positions
-     labels = rownames(aaPCA$rotation))
-
-# PCA results are sensitive to the absolute numeric value of the features that
-# we are comparing. The prcomp() function has an option scale. = TRUE that
-# scales each row of features so that the variance of the value is 1.0  This
-# ensures that each feature is given approximately equal weight
-
-aaPCA <- prcomp(t(aaData[ ,sel]), scale. = TRUE)
-
-plot(aaPCA)
-
-plot(aaPCA$rotation[ , 1],
-     aaPCA$rotation[ , 2],
-     type ="n")
-text(aaPCA$rotation[ , 1],
-     aaPCA$rotation[ , 2],
-     labels = rownames(aaPCA$rotation))
-
-
-# Next we try to identify what the PCs correspond to. We see whether there are
-# specific features that are highly correlated with the PCs
-
-# ==== Rotation 1 ===================
-#
-
-(PC1 <- aaPCA$rotation[ , 1])  # Assign PC1
-
-# The function cor() calculates Pearson coefficients of correlation
-cor(PC1, aaData[ , 37]) # e.g. correlate PC1 against index 37
-
-
-# Iterate over all columns and calculate correlations
-cors <- numeric()
-
-for (i in 1:ncol(aaData)) {
-  cors[i] <- cor(PC1, aaData[ , i])
-}
-
-summary(cors)
-#    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's
-# -0.54072 -0.13703  0.05654  0.03729  0.21349  0.59589       13
-#
-#  The max correlation is ~0.6. That is not very high. Which ijndex is it?
-
-which(cors == max(cors, na.rm = TRUE))
-
-aaindex[[504]]   # Linker propensity ???
-
-cor(PC1, aaindex[[504]]$I) # Did we get the right index?
-
-# Plot this ...
-plot(aaPCA$rotation[ , 1],
-     aaindex[[504]]$I,
-     type ="n")
-text(aaPCA$rotation[ , 1],
-     aaindex[[504]]$I,
-     labels = rownames(aaPCA$rotation))
-
-# This is essentially a random correlation but for Cysteine ...
-
-
-# ==== Rotation 2 ===================
-#
-# same process
-PC2 <- aaPCA$rotation[ , 2]
-
-cors2 <- numeric()
-
-for (i in 1:ncol(aaData)) {
-  cors2[i] <- cor(PC2, aaData[ , i])
-}
-
-summary(cors2)
-#     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's
-# -0.95214 -0.56067 -0.12817 -0.05787  0.43046  0.94346       13
-
-# Here we have quite strong correlations
-
-which(cors2 == max(cors2, na.rm = TRUE))
-
-aaindex[[148]]
-
-# this index itself is correlated with many other indices
-
-cor(PC2, aaindex[[148]]$I)   # confirmn that we have the right index
-
-# Plot this too...
-plot(aaPCA$rotation[ , 2],
-     aaindex[[148]]$I,
-     type ="n")
-text(aaPCA$rotation[ , 2],
-     aaindex[[148]]$I,
-     labels = rownames(aaPCA$rotation))
-
-# This correlates well with hydrophobicity measures. In this case the
-# PC is to a certain degree interpretable - but this is not always the case
-# with PCA (see the example of the first PC).
-
-
-
-
-
-
-# [END]
+# 2021-10-12_In-Class_exploration.R
+#
+#         =====  T H E   E V E N   B E T T E R   A M I N O   A C I D =====
+#
+# Code and comments for BCH441 in-class exploration, Tuesday, 2021-10-12
+# Explorers:  Jocelyn Nurtanto, Yuzi Li, and  Jerry Gu
+# Scribe:     boris.steipe@utoronto.ca
+#
+# ==============================================================================
+#
+# In our last session we explored some properties of amino acids and noted that
+# we can arrange them in a scatter-plot according to some properties. But can
+# we also arrange them according to generic properties, i.e. taking all
+# published property scales into account? We will try to use all tables from
+# the seqinr package.
+
+# First we load the package - this makes all datasets immediately available and
+# we don't have to load them one by one.
+
+library(seqinr)
+
+# Determine what datasets are available
+#
+# Using "find in topic" ... "amino acid"
+data(aacost)
+data(aaindex)
+data(pK)
+
+# We note that datasets may be sorted in different ways: for example
+# alphabetically by one letter code (A, C, D, E, ...) or three-letter code (Ala,
+# Arg, Asn, Asp, ...) - this means we need to ensure and validate that amino
+# acids are sorted in the same way.
+
+# Build a datastructure ...
+# rows: amino acids
+# columns: properties
+
+# Are all lists in aaindex organized in the same way?
+
+refNames <- names(aaindex[[1]]$I) # Take the rownames of the first list item
+                                  # index as a reference list
+
+# Loop over each list in aaindex
+for (i in 1:length(aaindex)) {
+#   get the I-vector
+  x <- aaindex[[i]]$I
+#   get the names
+  x <- names(x)
+#   compare with the names of our reference list
+#   the == and != operators are vectorized. Applying them to two vectors
+#   gives TRUE or FALSE for each pair of elements. any() or all() can be
+#   applied to logical vectors to analyse them and return a single result.
+#   if (...) conditions evaluate only a single value and will throw a warning if
+#   there is more than one.
+
+  if (any(x != refNames)) {
+    # There was at least one not-equal pair - so: complain
+    print(sprintf("Problem in list %d: names don't match", i))
+  }
+}
+
+# If we get here without identifying problems, it means all pairs of
+# rownames match throughout the aaindex list.
+
+
+# Next: what is the correct syntax to add one vector (the "I" vector of
+# one of the list elements) to our dataframe?
+aaData <- as.data.frame(aaindex[[1]]$I) # Make a dataframe from the first index
+aaData[,2] <- aaindex[[2]]$I            # ... add the secondf index
+
+str(aaData)  # Confirm: we now have a two-column dataframe
+
+# Next: add the rest ...
+for (i in 3:length(aaindex)) {
+  #   get the I-vector and write it into our dataframe
+  aaData[,i] <- aaindex[[i]]$I
+}
+
+# Sanity check
+plot(aaData[,37], aaData[,544])  # plot two arbitray inices against each other
+
+# Looks good.
+
+# We finished building our data structure ... but let's add the aacost table
+# aacost is ordered differently:
+rownames(aaData)
+aacost[ , 1]
+
+# using order(), applied to aacost - ordering the column with column-name
+# "aaa"
+sel <- order(aacost[ , "aaa"])  # alphebetic ordering of three-letter codes
+aacost[sel, "aaa"] # applying the order vector sorts the column
+
+# Is this the same order as refNames?
+refNames == aacost[sel, "aaa"]  # Yes!
+
+# add the data from column "tot" (i.e. total metabolic cost) after the
+# last column of aaData
+aaData[ , length(aaindex) + 1] <- aacost[sel, "tot"]
+
+# Done.
+str(aaData)  # A dataframe with 20 rows and 545 columns
+
+# To answer the question "Which amino acids are similar to each other?" we
+# need to reduce this 545-dimensional dataset to fewer dimensions, otherwise
+# we will succumb to the "Curse of Dimensionality":
+#
+#    "in high dimensional data, however, all objects appear
+#     to be sparse and dissimilar in many ways..."
+#                   https://en.wikipedia.org/wiki/Curse_of_dimensionality
+#
+# A classic way to do this is Principal Component Analysis (PCA) ...
+# (Principal components analysis)
+#
+# prcomp() treats rows as observations and columns as variables. We transpose
+# our dataset so that the amino acids become the variables:
+
+aaPCA <- prcomp(t(aaData))
+
+# This creates an error, because some of our indices contain NA values!
+# Which indices are these?
+
+# We create a vector "sel" for which we check whether any element in each
+# column is NA, and write FALSE if we encounter an NA, TRUE otherwise. We can
+# then use this vector to subset our dataframe.
+
+sel <- logical()
+
+for (i in 1:ncol(aaData)) {         # for each index
+  if (any(is.na(aaData[,i]))) {     #   if there is any NA value ...
+    sel <- c(sel, FALSE)            #     add a FALSE element to the vector
+  } else {                          #   else
+    sel <- c(sel, TRUE)             #     add a TRUE element
+  }
+}
+
+# Done. sel now subsets only the NA-free columns
+545 - sum(sel)                      # 13 columns excluded
+
+# Do the PCA ... use the prcomp() function
+aaPCA <- prcomp(t(aaData[ ,sel]))   # PCA of the transposed, selected data set
+
+str(aaPCA)   # structure of the result
+
+plot(aaPCA)                         # plot the contributions of the
+                                    # components to the variance
+
+plot(aaPCA$rotation[ , 1],          # plot the first PC against the second PC
+     aaPCA$rotation[ , 2],          # in a scatterplot, in an empty frame
+     type ="n")                     # just to set up the coordinate system
+
+text(aaPCA$rotation[ , 1],          # plot the names of the amino acids into
+     aaPCA$rotation[ , 2],          # their respective (PC1, PC2) positions
+     labels = rownames(aaPCA$rotation))
+
+# PCA results are sensitive to the absolute numeric value of the features that
+# we are comparing. The prcomp() function has an option scale. = TRUE that
+# scales each row of features so that the variance of the value is 1.0  This
+# ensures that each feature is given approximately equal weight
+
+aaPCA <- prcomp(t(aaData[ ,sel]), scale. = TRUE)
+
+plot(aaPCA)
+
+plot(aaPCA$rotation[ , 1],
+     aaPCA$rotation[ , 2],
+     type ="n")
+text(aaPCA$rotation[ , 1],
+     aaPCA$rotation[ , 2],
+     labels = rownames(aaPCA$rotation))
+
+
+# Next we try to identify what the PCs correspond to. We see whether there are
+# specific features that are highly correlated with the PCs
+
+# ==== Rotation 1 ===================
+#
+
+(PC1 <- aaPCA$rotation[ , 1])  # Assign PC1
+
+# The function cor() calculates Pearson coefficients of correlation
+cor(PC1, aaData[ , 37]) # e.g. correlate PC1 against index 37
+
+
+# Iterate over all columns and calculate correlations
+cors <- numeric()
+
+for (i in 1:ncol(aaData)) {
+  cors[i] <- cor(PC1, aaData[ , i])
+}
+
+summary(cors)
+#    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's
+# -0.54072 -0.13703  0.05654  0.03729  0.21349  0.59589       13
+#
+#  The max correlation is ~0.6. That is not very high. Which index is it?
+
+which(cors == max(cors, na.rm = TRUE))
+
+aaindex[[504]]   # Linker propensity ???
+
+cor(PC1, aaindex[[504]]$I) # Did we get the right index?
+
+# Plot this ...
+plot(aaPCA$rotation[ , 1],
+     aaindex[[504]]$I,
+     type ="n")
+text(aaPCA$rotation[ , 1],
+     aaindex[[504]]$I,
+     labels = rownames(aaPCA$rotation))
+
+# This is essentially a random correlation but for Cysteine ...
+
+
+# ==== Rotation 2 ===================
+#
+# same process
+PC2 <- aaPCA$rotation[ , 2]
+
+cors2 <- numeric()
+
+for (i in 1:ncol(aaData)) {
+  cors2[i] <- cor(PC2, aaData[ , i])
+}
+
+summary(cors2)
+#     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's
+# -0.95214 -0.56067 -0.12817 -0.05787  0.43046  0.94346       13
+
+# Here we have quite strong correlations
+
+which(cors2 == max(cors2, na.rm = TRUE))
+
+aaindex[[148]]
+
+# this index itself is correlated with many other indices
+
+cor(PC2, aaindex[[148]]$I)   # confirmn that we have the right index
+
+# Plot this too...
+plot(aaPCA$rotation[ , 2],
+     aaindex[[148]]$I,
+     type ="n")
+text(aaPCA$rotation[ , 2],
+     aaindex[[148]]$I,
+     labels = rownames(aaPCA$rotation))
+
+# This correlates well with hydrophobicity measures. In this case the
+# PC is to a certain degree interpretable - but this is not always the case
+# with PCA (see the example of the first PC).
+
+
+
+
+
+
+# [END]
--- a/ABC-Install_all_packages.R
+++ b/ABC-Install_all_packages.R
@ -1,161 +1,161 @@
-# tocID <- "ABC-Install_all_packages.R"
-#
-# Purpose:  A Bioinformatics Course:
-#              Installing all packages in this course
-#
-# Version:  1.0
-#
-# Date:     2021  10
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           1.0    New code
-#
-#
-# TODO:
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC>
-#TOC>   Section  Title                          Line
-#TOC> ----------------------------------------------
-#TOC>   1        Packages                         33
-#TOC>   2        CRAN packages                    98
-#TOC>   3        Bioconductor packages           127
-#TOC>   4        Other package sources           142
-#TOC>   5        Updating packages               148
-#TOC>
-#TOC> ==========================================================================
-
-
-# =    1  Packages  ============================================================
-
-# Much of R's functionality is contributed in packages: bundles of R scripts
-# or code in other languages, pre-configured objects, and datasets. Making this
-# functionality available is often done by issuing a library(<package-name>)
-# command, however this is not the preferred way, since it may override other
-# R functions and it makes it harder to understand where the source code of
-# a particular function is located. In this course we call the function name
-# prefixed with the package name and two colons:
-#   <package-name>::<function-name>()
-# This is the preferred way, since it is explicit.
-#
-# Regardless of which idiom one uses to call the actual function, the package
-#  needs to be "installed" first, i.e. the code must have been downloaded
-# from CRAN, or using the BiocManager::install() function.
-#
-# This script contains download commands for all packages that are used in the
-# course. You can execute the script line by line (or even source the entire
-# script) to make sure all packages can be installed on your computer. Just
-# one reminder: if you are ever asked to install from source, the correct
-# answer is usually "no" - except if you really know what you are doing and why.
-#
-# Once packages are installed you can get additional information about
-# the contents of a package with the commands:
-#  library(help=<package-name>)       # basic information
-#  browseVignettes("<package-name>")  # available vignettes
-#  data(package = "<package-name>")   # available datasets
-#
-#  ... and you can load data sets with:
-#  data(<data-set-name>, package = "<package-name>")
-#
-#  All packages here are installed only when they have not been installed
-#  before, using the following idiom:
-#
-#     if (! requireNamespace("<package-name>", quietly=TRUE)) {
-#       install.packages("<package-name>")
-#     }
-#
-#  ... or its BiocManager::install() equivalent:
-#
-# if (! requireNamespace("<bioconductor-package-name>", quietly=TRUE)) {
-#   BiocManager::install("<bioconductor-package-name>")
-# }
-#
-#  If you want to _force_ a re-installation of the package, simply issue
-#  the install.packages("<package-name>") command on its own. For compactness
-#  we wrap the idiom into a function, which can also switch between CRAN
-#  and BIOconductor sources:
-
-installIfNeeded <- function(package, s = "CRAN") {
-  # s: "CRAN" or "BIO"
-  if (s == "CRAN") {
-    if (! requireNamespace(package, quietly=TRUE)) {
-      install.packages(package)
-    }
-  } else if (s == "BIO") {
-    if (! requireNamespace("BiocManager", quietly=TRUE)) {
-      install.packages("BiocManager")
-    }
-    if (! requireNamespace(package, quietly=TRUE)) {
-      BiocManager::install(package)
-    }
-  } else {
-    stop(sprintf("Unknown source \"%s\".", s))
-  }
-}
-
-
-# =    2  CRAN packages  =======================================================
-
-installIfNeeded("ape")
-installIfNeeded("BiocManager")
-installIfNeeded("bio3d")
-installIfNeeded("evd")
-installIfNeeded("ggseqlogo")
-installIfNeeded("ggtern")
-installIfNeeded("hexbin")
-installIfNeeded("httr")
-installIfNeeded("igraph")
-installIfNeeded("jsonlite")
-installIfNeeded("magrittr")
-installIfNeeded("MASS")
-installIfNeeded("microbenchmark")
-installIfNeeded("phangorn")
-installIfNeeded("plotly")
-installIfNeeded("plotrix")
-installIfNeeded("profvis")
-installIfNeeded("robustbase")
-installIfNeeded("RColorBrewer")
-installIfNeeded("Rphylip")
-installIfNeeded("rvest")
-installIfNeeded("seqinr")
-installIfNeeded("stringi")
-installIfNeeded("taxize")
-installIfNeeded("testthat")
-installIfNeeded("xml2")
-
-# =    3  Bioconductor packages  ===============================================
-
-installIfNeeded("Biobase",       s = "BIO")
-installIfNeeded("biomaRt",       s = "BIO")
-installIfNeeded("Biostrings",    s = "BIO")
-installIfNeeded("DECIPHER",      s = "BIO")
-installIfNeeded("GEOquery",      s = "BIO")
-installIfNeeded("GOSim",         s = "BIO")
-installIfNeeded("limma",         s = "BIO")
-installIfNeeded("msa",           s = "BIO")
-installIfNeeded("org.Sc.sgd.db", s = "BIO")
-installIfNeeded("prada",         s = "BIO")
-installIfNeeded("topGO",         s = "BIO")
-
-
-# =    4  Other package sources  ===============================================
-
-# Using sources other than CRAN or Bioconductor to download general-purpose
-# programs that run on your computer is not generally recommended.
-
-
-# =    5  Updating packages  ===================================================
-
-# From time to time, update CRAN packages with the following command ...
-
-update.packages()
-
-# ... and also update Bioconductor packages as follows:
-
-BiocManager::install()
-
-# [END]
+# tocID <- "ABC-Install_all_packages.R"
+#
+# Purpose:  A Bioinformatics Course:
+#              Installing all packages in this course
+#
+# Version:  1.0
+#
+# Date:     2021  10
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.0    New code
+#
+#
+# TODO:
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC>
+#TOC>   Section  Title                          Line
+#TOC> ----------------------------------------------
+#TOC>   1        Packages                         33
+#TOC>   2        CRAN packages                   101
+#TOC>   3        Bioconductor packages           130
+#TOC>   4        Other package sources           145
+#TOC>   5        Updating packages               151
+#TOC>
+#TOC> ==========================================================================
+
+
+# =    1  Packages  ============================================================
+
+# Much of R's functionality is contributed in packages: bundles of R scripts
+# or code in other languages, pre-configured objects, and datasets. Making this
+# functionality available is often done by issuing a library(<package-name>)
+# command, however this is not the preferred way, since it may override other
+# R functions and it makes it harder to understand where the source code of
+# a particular function is located. In this course we call the function name
+# prefixed with the package name and two colons:
+#   <package-name>::<function-name>()
+# This is the preferred way, since it is explicit.
+#
+# Regardless of which idiom one uses to call the actual function, the package
+#  needs to be "installed" first, i.e. the code must have been downloaded
+# from CRAN, or using the BiocManager::install() function.
+#
+# This script contains download commands for all packages that are used in the
+# course. You can execute the script line by line (or even source the entire
+# script) to make sure all packages can be installed on your computer. Just
+# one reminder: if you are ever asked to install from source, the correct
+# answer is usually "no" - except if you really know what you are doing and why.
+#
+# Once packages are installed you can get additional information about
+# the contents of a package with the commands:
+#  library(help=<package-name>)       # basic information
+#  browseVignettes("<package-name>")  # available vignettes
+#  data(package = "<package-name>")   # available datasets
+#
+#  ... and you can load data sets with:
+#  data(<data-set-name>, package = "<package-name>")
+#
+#  All packages here are installed only when they have not been installed
+#  before, using the following idiom:
+#
+#     if (! requireNamespace("<package-name>", quietly=TRUE)) {
+#       install.packages("<package-name>")
+#     }
+#
+#  ... or its BiocManager::install() equivalent:
+#
+# if (! requireNamespace("<bioconductor-package-name>", quietly=TRUE)) {
+#   BiocManager::install("<bioconductor-package-name>")
+# }
+#
+#  If you want to _force_ a re-installation of the package, simply issue
+#  the install.packages("<package-name>") command on its own. For compactness
+#  we wrap the idiom into a function, which can also switch between CRAN
+#  and BIOconductor sources:
+
+installIfNeeded <- function(package, s = "CRAN") {
+  # Install <package> unless its namespace can already be loaded.
+  #   package: name of the package (character)
+  #   s:       source to install from, "CRAN" or "BIO" (Bioconductor)
+  # Any other value of s stops with an "Unknown source" error.
+  isAbsent <- function(pkg) { ! requireNamespace(pkg, quietly = TRUE) }
+
+  if (s == "CRAN") {
+    if (isAbsent(package)) { install.packages(package) }
+  } else if (s == "BIO") {
+    # BiocManager is itself installed from CRAN when it is absent.
+    if (isAbsent("BiocManager")) { install.packages("BiocManager") }
+    if (isAbsent(package)) { BiocManager::install(package) }
+  } else {
+    stop(sprintf("Unknown source \"%s\".", s))
+  }
+}
+
+
+# =    2  CRAN packages  =======================================================
+
+installIfNeeded("ape")
+installIfNeeded("BiocManager")
+installIfNeeded("bio3d")
+installIfNeeded("evd")
+installIfNeeded("ggseqlogo")
+installIfNeeded("ggtern")
+installIfNeeded("hexbin")
+installIfNeeded("httr")
+installIfNeeded("igraph")
+installIfNeeded("jsonlite")
+installIfNeeded("magrittr")
+installIfNeeded("MASS")
+installIfNeeded("microbenchmark")
+installIfNeeded("phangorn")
+installIfNeeded("plotly")
+installIfNeeded("plotrix")
+installIfNeeded("profvis")
+installIfNeeded("robustbase")
+installIfNeeded("RColorBrewer")
+installIfNeeded("Rphylip")
+installIfNeeded("rvest")
+installIfNeeded("seqinr")
+installIfNeeded("stringi")
+installIfNeeded("taxize")
+installIfNeeded("testthat")
+installIfNeeded("xml2")
+
+# =    3  Bioconductor packages  ===============================================
+
+installIfNeeded("Biobase",       s = "BIO")
+installIfNeeded("biomaRt",       s = "BIO")
+installIfNeeded("Biostrings",    s = "BIO")
+installIfNeeded("DECIPHER",      s = "BIO")
+installIfNeeded("GEOquery",      s = "BIO")
+installIfNeeded("GOSim",         s = "BIO")
+installIfNeeded("limma",         s = "BIO")
+installIfNeeded("msa",           s = "BIO")
+installIfNeeded("org.Sc.sgd.db", s = "BIO")
+installIfNeeded("prada",         s = "BIO")
+installIfNeeded("topGO",         s = "BIO")
+
+
+# =    4  Other package sources  ===============================================
+
+# Using sources other than CRAN or Bioconductor to download general-purpose
+# programs that run on your computer is not generally recommended.
+
+
+# =    5  Updating packages  ===================================================
+
+# From time to time, update CRAN packages with the following command ...
+
+update.packages()
+
+# ... and also update Bioconductor packages as follows:
+
+BiocManager::install()
+
+# [END]
--- a/ABC-addSACCE_APSESproteins.R
+++ b/ABC-addSACCE_APSESproteins.R
@ -1,100 +1,100 @@
-# addSACCE_APSESproteins.R
-# Adds the Saccharomyces cerevisiae APSES proteins to myDB
-#
-
-myDB$protein <-
-    rbind(myDB$protein,
-          data.frame(
-              ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
-              name = "SWI4_SACCE",
-              RefSeqID = "NP_011036",
-              UniProtID = "P25302",
-              taxonomy.ID = as.integer(4932),
-              sequence = dbSanitizeSequence("
-        1 mpfdvlisnq kdntnhqnit pisksvllap hsnhpvieia tysetdvyec yirgfetkiv
-       61 mrrtkddwin itqvfkiaqf sktkrtkile kesndmqhek vqggygrfqg twipldsakf
-       121 lvnkyeiidp vvnsiltfqf dpnnpppkrs knsilrktsp gtkitspssy nktprkknss
-       181 sstsatttaa nkkgkknasi nqpnpsplqn lvfqtpqqfq vnssmnimnn ndnhttmnfn
-       241 ndtrhnlinn isnnsnqsti iqqqksihen sfnnnysatq kplqffpipt nlqnknvaln
-       301 npnnndsnsy shnidnvins snnnnngnnn nliivpdgpm qsqqqqqhhh eyltnnfnhs
-       361 mmdsitngns kkrrkklnqs neqqfynqqe kiqrhfklmk qpllwqsfqn pndhhneycd
-       421 sngsnnnnnt vasngssiev fssnendnsm nmssrsmtpf sagntssqnk lenkmtdqey
-       481 kqtiltilss erssdvdqal latlypapkn fninfeiddq ghtplhwata maniplikml
-       541 itlnanalqc nklgfncitk sifynncyke nafdeiisil kiclitpdvn grlpfhylie
-       601 lsvnksknpm iiksymdsii lslgqqdynl lkiclnyqdn igntplhlsa lnlnfevynr
-       661 lvylgastdi lnldnespas imnkfntpag gsnsrnnntk adrklarnlp qknyyqqqqq
-       721 qqqpqnnvki pkiiktqhpd kedstadvni aktdsevnes qylhsnqpns tnmntimedl
-       781 sninsfvtss vikdikstps kilenspily rrrsqsisde kekakdnenq vekkkdplns
-       841 vktampsles pssllpiqms plgkyskpls qqinklntkv sslqrimgee iknldnevve
-       901 tessisnnkk rlitiahqie dafdsvsnkt pinsisdlqs riketsskln sekqnfiqsl
-       961 eksqalklat ivqdeeskvd mntnssshpe kqedeepipk stsetsspkn tkadakfsnt
-       1021 vqesydvnet lrlateltil qfkrrmttlk iseakskins svkldkyrnl igitienids
-       1081 klddiekdlr ana"),
-              stringsAsFactors = FALSE))
-
-myDB$protein <-
-    rbind(myDB$protein,
-          data.frame(
-              ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
-              name = "PHD1_SACCE",
-              RefSeqID = "NP_012881",
-              UniProtID = "P36093",
-              taxonomy.ID = as.integer(4932),
-              sequence = dbSanitizeSequence("
-        1 myhvpemrlh yplvntqsna aitptrsydn tlpsfnelsh qstinlpfvq retpnayanv
-       61 aqlatsptqa ksgyycryya vpfptypqqp qspyqqavlp yatipnsnfq pssfpvmavm
-      121 ppevqfdgsf lntlhphtel ppiiqntndt svarpnnlks iaaasptvta ttrtpgvsst
-      181 svlkprvitt mwedenticy qveangisvv rradnnming tkllnvtkmt rgrrdgilrs
-      241 ekvrevvkig smhlkgvwip ferayilaqr eqildhlypl fvkdiesivd arkpsnkasl
-      301 tpksspapik qepsdnkhei ateikpksid alsngastqg agelphlkin hidteaqtsr
-      361 aknels"),
-              stringsAsFactors = FALSE))
-
-myDB$protein <-
-    rbind(myDB$protein,
-          data.frame(
-              ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
-              name = "SOK2_SACCE",
-              RefSeqID = "NP_013729",
-              UniProtID = "P53438",
-              taxonomy.ID = as.integer(4932),
-              sequence = dbSanitizeSequence("
-        1 mpignpintn diksnrmrqe snmsavsnse stigqstqqq qqqqqylgqs vqplmpvsyq
-       61 yvvpeqwpyp qyyqqpqsqs qqqlqsqpqm yqvqesfqss gsdsnasnpp stsvgvpsna
-      121 tatalpngsa ittkksnnst nisnnvpyyy yfpqmqaqqs maysypqayy yypangdgtt
-      181 ngatpsvtsn qvqnpnlekt ystfeqqqqh qqqqqlqaqt ypaqppkign afskfsksgp
-      241 psdsssgsms pnsnrtsrns nsisslaqqp pmsnypqpst yqypgfhkts sipnshspip
-      301 prslttptqg ptsqngplsy nlpqvgllpp qqqqqvsply dgnsitppvk pstdqetylt
-      361 anrhgvsdqq ydsmaktmns fqtttirhpm pliattnatg sntsgtsasi irprvtttmw
-      421 edektlcyqv eangisvvrr adndmvngtk llnvtkmtrg rrdgilkaek irhvvkigsm
-      481 hlkgvwipfe ralaiaqrek iadylyplfi rdiqsvlkqn npsndsssss sstgiksisp
-      541 rtyyqpinny qnpngpsnis aaqltyssmn lnnkiipnns ipavstiaag ekplkkctmp
-      601 nsnqleghti tnlqtlsatm pmkqqlmgni asplsyprna tmnsastlgi tpadskpltp
-      661 sptttntnqs sesnvgsiht gitlprvese sashskwske adsgntvpdn qtlkeprssq
-      721 lpisaltstd tdkiktstsd eatqpnepse aepvkesess ksqvdgagdv sneeiaaddt
-      781 kkqek"),
-              stringsAsFactors = FALSE))
-
-myDB$protein <-
-    rbind(myDB$protein,
-          data.frame(
-              ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
-              name = "XBP1_SACCE",
-              RefSeqID = "NP_012165",
-              UniProtID = "P40489",
-              taxonomy.ID = as.integer(4932),
-              sequence = dbSanitizeSequence("
-        1 mkypafsins dtvhltdnpl ddyqrlylvs vldrdsppas fsaglnirkv nykssiaaqf
-       61 thpnfiisar dagngeeaaa qnvlncfeyq fpnlqtiqsl vheqtllsql assatphsal
-      121 hlhdknilmg kiilpsrsnk tpvsasptkq ekkalstasr enatssltkn qqfkltkmdh
-      181 nlindklinp nncviwshds gyvfmtgiwr lyqdvmkgli nlprgdsvst sqqqffckae
-      241 fekilsfcfy nhssftsees ssvllsssts sppkrrtstg stfldanass sstsstqann
-      301 yidfhwnnik pelrdlicqs ykdflinelg pdqidlpnln panftkrirg gyikiqgtwl
-      361 pmeisrllcl rfcfpiryfl vpifgpdfpk dceswylahq nvtfassttg agaataataa
-      421 antstnftst avarprqkpr prprqrstsm shskaqklvi edalpsfdsf venlglssnd
-      481 knfikknskr qksstytsqt sspigprdpt vqilsnlasf ynthghrysy pgniyipqqr
-      541 yslpppnqls spqrqlnyty dhihpvpsqy qsprhynvps spiapapptf pqpygddhyh
-      601 flkyasevyk qqnqrpahnt ntnmdtsfsp rannslnnfk fktnskq"),
-              stringsAsFactors = FALSE))
-
-# [END]
+# ABC-addSACCE_APSESproteins.R
+# Adds the Saccharomyces cerevisiae APSES proteins to myDB
+#
+
+myDB$protein <-
+    rbind(myDB$protein,
+          data.frame(
+              ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
+              name = "SWI4_SACCE",
+              RefSeqID = "NP_011036",
+              UniProtID = "P25302",
+              taxonomy.ID = as.integer(4932),
+              sequence = dbSanitizeSequence("
+        1 mpfdvlisnq kdntnhqnit pisksvllap hsnhpvieia tysetdvyec yirgfetkiv
+       61 mrrtkddwin itqvfkiaqf sktkrtkile kesndmqhek vqggygrfqg twipldsakf
+       121 lvnkyeiidp vvnsiltfqf dpnnpppkrs knsilrktsp gtkitspssy nktprkknss
+       181 sstsatttaa nkkgkknasi nqpnpsplqn lvfqtpqqfq vnssmnimnn ndnhttmnfn
+       241 ndtrhnlinn isnnsnqsti iqqqksihen sfnnnysatq kplqffpipt nlqnknvaln
+       301 npnnndsnsy shnidnvins snnnnngnnn nliivpdgpm qsqqqqqhhh eyltnnfnhs
+       361 mmdsitngns kkrrkklnqs neqqfynqqe kiqrhfklmk qpllwqsfqn pndhhneycd
+       421 sngsnnnnnt vasngssiev fssnendnsm nmssrsmtpf sagntssqnk lenkmtdqey
+       481 kqtiltilss erssdvdqal latlypapkn fninfeiddq ghtplhwata maniplikml
+       541 itlnanalqc nklgfncitk sifynncyke nafdeiisil kiclitpdvn grlpfhylie
+       601 lsvnksknpm iiksymdsii lslgqqdynl lkiclnyqdn igntplhlsa lnlnfevynr
+       661 lvylgastdi lnldnespas imnkfntpag gsnsrnnntk adrklarnlp qknyyqqqqq
+       721 qqqpqnnvki pkiiktqhpd kedstadvni aktdsevnes qylhsnqpns tnmntimedl
+       781 sninsfvtss vikdikstps kilenspily rrrsqsisde kekakdnenq vekkkdplns
+       841 vktampsles pssllpiqms plgkyskpls qqinklntkv sslqrimgee iknldnevve
+       901 tessisnnkk rlitiahqie dafdsvsnkt pinsisdlqs riketsskln sekqnfiqsl
+       961 eksqalklat ivqdeeskvd mntnssshpe kqedeepipk stsetsspkn tkadakfsnt
+       1021 vqesydvnet lrlateltil qfkrrmttlk iseakskins svkldkyrnl igitienids
+       1081 klddiekdlr ana"),
+              stringsAsFactors = FALSE))
+
+myDB$protein <-
+    rbind(myDB$protein,
+          data.frame(
+              ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
+              name = "PHD1_SACCE",
+              RefSeqID = "NP_012881",
+              UniProtID = "P36093",
+              taxonomy.ID = as.integer(4932),
+              sequence = dbSanitizeSequence("
+        1 myhvpemrlh yplvntqsna aitptrsydn tlpsfnelsh qstinlpfvq retpnayanv
+       61 aqlatsptqa ksgyycryya vpfptypqqp qspyqqavlp yatipnsnfq pssfpvmavm
+      121 ppevqfdgsf lntlhphtel ppiiqntndt svarpnnlks iaaasptvta ttrtpgvsst
+      181 svlkprvitt mwedenticy qveangisvv rradnnming tkllnvtkmt rgrrdgilrs
+      241 ekvrevvkig smhlkgvwip ferayilaqr eqildhlypl fvkdiesivd arkpsnkasl
+      301 tpksspapik qepsdnkhei ateikpksid alsngastqg agelphlkin hidteaqtsr
+      361 aknels"),
+              stringsAsFactors = FALSE))
+
+myDB$protein <-
+    rbind(myDB$protein,
+          data.frame(
+              ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
+              name = "SOK2_SACCE",
+              RefSeqID = "NP_013729",
+              UniProtID = "P53438",
+              taxonomy.ID = as.integer(4932),
+              sequence = dbSanitizeSequence("
+        1 mpignpintn diksnrmrqe snmsavsnse stigqstqqq qqqqqylgqs vqplmpvsyq
+       61 yvvpeqwpyp qyyqqpqsqs qqqlqsqpqm yqvqesfqss gsdsnasnpp stsvgvpsna
+      121 tatalpngsa ittkksnnst nisnnvpyyy yfpqmqaqqs maysypqayy yypangdgtt
+      181 ngatpsvtsn qvqnpnlekt ystfeqqqqh qqqqqlqaqt ypaqppkign afskfsksgp
+      241 psdsssgsms pnsnrtsrns nsisslaqqp pmsnypqpst yqypgfhkts sipnshspip
+      301 prslttptqg ptsqngplsy nlpqvgllpp qqqqqvsply dgnsitppvk pstdqetylt
+      361 anrhgvsdqq ydsmaktmns fqtttirhpm pliattnatg sntsgtsasi irprvtttmw
+      421 edektlcyqv eangisvvrr adndmvngtk llnvtkmtrg rrdgilkaek irhvvkigsm
+      481 hlkgvwipfe ralaiaqrek iadylyplfi rdiqsvlkqn npsndsssss sstgiksisp
+      541 rtyyqpinny qnpngpsnis aaqltyssmn lnnkiipnns ipavstiaag ekplkkctmp
+      601 nsnqleghti tnlqtlsatm pmkqqlmgni asplsyprna tmnsastlgi tpadskpltp
+      661 sptttntnqs sesnvgsiht gitlprvese sashskwske adsgntvpdn qtlkeprssq
+      721 lpisaltstd tdkiktstsd eatqpnepse aepvkesess ksqvdgagdv sneeiaaddt
+      781 kkqek"),
+              stringsAsFactors = FALSE))
+
+myDB$protein <-
+    rbind(myDB$protein,
+          data.frame(
+              ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
+              name = "XBP1_SACCE",
+              RefSeqID = "NP_012165",
+              UniProtID = "P40489",
+              taxonomy.ID = as.integer(4932),
+              sequence = dbSanitizeSequence("
+        1 mkypafsins dtvhltdnpl ddyqrlylvs vldrdsppas fsaglnirkv nykssiaaqf
+       61 thpnfiisar dagngeeaaa qnvlncfeyq fpnlqtiqsl vheqtllsql assatphsal
+      121 hlhdknilmg kiilpsrsnk tpvsasptkq ekkalstasr enatssltkn qqfkltkmdh
+      181 nlindklinp nncviwshds gyvfmtgiwr lyqdvmkgli nlprgdsvst sqqqffckae
+      241 fekilsfcfy nhssftsees ssvllsssts sppkrrtstg stfldanass sstsstqann
+      301 yidfhwnnik pelrdlicqs ykdflinelg pdqidlpnln panftkrirg gyikiqgtwl
+      361 pmeisrllcl rfcfpiryfl vpifgpdfpk dceswylahq nvtfassttg agaataataa
+      421 antstnftst avarprqkpr prprqrstsm shskaqklvi edalpsfdsf venlglssnd
+      481 knfikknskr qksstytsqt sspigprdpt vqilsnlasf ynthghrysy pgniyipqqr
+      541 yslpppnqls spqrqlnyty dhihpvpsqy qsprhynvps spiapapptf pqpygddhyh
+      601 flkyasevyk qqnqrpahnt ntnmdtsfsp rannslnnfk fktnskq"),
+              stringsAsFactors = FALSE))
+
+# [END]
--- a/ABC-units.R
+++ b/ABC-units.R
@ -1,69 +1,69 @@
-# ABC-units.R
-#
-# Purpose: A Bioinformatics Course: R code for learning units
-#
-# Version: 4.0
-#
-# Date:    2020  09  16
-# Author:  Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-# V 4.0    2020 version
-# V 3.0    2019 version
-# V 2.0    2018 version
-# V 1.0    2017 version
-# V 0.1    First code
-#
-# TODO:
-#
-#
-# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
-#
-# The R-scripts and datasets in this project will be continuously updated,
-# and updates will be posted on GitHub. To bring your version into the latest
-# state use the Git-pane (top left) and "pull" (blue downward arrow) from the
-# repository. However, this will overwrite locally edited version of files.
-
-# To edit code and experiment with it, for example to add your own comments and
-# examples, save your edited version into the "myScripts" folder. Otherwise you
-# may have problems with git when you update the project to a new version. It's
-# good practice to change the filename, for example by prepending your initials.
-# This helps distinguish the files you are working with e.g. in a list of
-# recent files. For example if your name is Honjo Tasuku, your edited
-# BIN-Sequence.R might be named HT-BIN-Sequence.R
-
-# If you pull from github and get the following type of error ...
-#     ---------------
-#     error: Your local changes to the following files would be
-#     overwritten by merge
-#     ...
-#     Please commit your changes or stash them before you can merge.
-#     ---------------
-# ... then, you need to bring the offending file into its original state.
-# Open the Commit window, select the file, and click on the Revert button.
-#
-# When working with these script DO NOT SIMPLY  source()  THESE FILES!
-
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-#  going on. That's not how it works ...
-#
-#
-# ==============================================================================
-
-# Once you have typed and executed the function init(), you will find a file
-# called myScript.R in the project directory.
-#
-# Open it, you can place all of your code-experiments and notes into that
-# file. This will complement your "Course Journal". If you keep all contents in
-# this one file, you can find everything by using the <cmd>-F find function. To
-# cross-reference code in your journal, create section headings.
-#
-# ==============================================================================
-
-# The individual learning units' files can be opened by simply clicking on them
-# in the File pane.
-
-
-
-# [END]
+# ABC-units.R
+#
+# Purpose: A Bioinformatics Course: R code for learning units
+#
+# Version: 4.0
+#
+# Date:    2020  09  16
+# Author:  Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+# V 4.0    2020 version
+# V 3.0    2019 version
+# V 2.0    2018 version
+# V 1.0    2017 version
+# V 0.1    First code
+#
+# TODO:
+#
+#
+# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
+#
+# The R-scripts and datasets in this project will be continuously updated,
+# and updates will be posted on GitHub. To bring your version into the latest
+# state use the Git-pane (top left) and "pull" (blue downward arrow) from the
+# repository. However, this will overwrite locally edited versions of files.
+
+# To edit code and experiment with it, for example to add your own comments and
+# examples, save your edited version into the "myScripts" folder. Otherwise you
+# may have problems with git when you update the project to a new version. It's
+# good practice to change the filename, for example by prepending your initials.
+# This helps distinguish the files you are working with e.g. in a list of
+# recent files. For example if your name is Honjo Tasuku, your edited
+# BIN-Sequence.R might be named HT-BIN-Sequence.R
+
+# If you pull from github and get the following type of error ...
+#     ---------------
+#     error: Your local changes to the following files would be
+#     overwritten by merge
+#     ...
+#     Please commit your changes or stash them before you can merge.
+#     ---------------
+# ... then, you need to bring the offending file into its original state.
+# Open the Commit window, select the file, and click on the Revert button.
+#
+# When working with these scripts DO NOT SIMPLY  source()  THESE FILES!
+
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+#  going on. That's not how it works ...
+#
+#
+# ==============================================================================
+
+# Once you have typed and executed the function init(), you will find a file
+# called myScript.R in the project directory.
+#
+# Open it, you can place all of your code-experiments and notes into that
+# file. This will complement your "Course Journal". If you keep all contents in
+# this one file, you can find everything by using the <cmd>-F find function. To
+# cross-reference code in your journal, create section headings.
+#
+# ==============================================================================
+
+# The individual learning units' files can be opened by simply clicking on them
+# in the File pane.
+
+
+
+# [END]
--- a/ABC-units.Rproj
+++ b/ABC-units.Rproj
@ -1,16 +1,16 @@
-Version: 1.0
-
-RestoreWorkspace: No
-SaveWorkspace: No
-AlwaysSaveHistory: No
-
-EnableCodeIndexing: Yes
-UseSpacesForTab: Yes
-NumSpacesForTab: 2
-Encoding: UTF-8
-
-RnwWeave: knitr
-LaTeX: XeLaTeX
-
-AutoAppendNewline: Yes
-StripTrailingWhitespace: Yes
+Version: 1.0
+
+RestoreWorkspace: No
+SaveWorkspace: No
+AlwaysSaveHistory: No
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: knitr
+LaTeX: XeLaTeX
+
+AutoAppendNewline: Yes
+StripTrailingWhitespace: Yes
--- a/BIN-ALI-BLAST.R
+++ b/BIN-ALI-BLAST.R
@ -1,111 +1,111 @@
-# tocID <- "BIN-ALI-BLAST.R"
-#
-# Purpose:  A Bioinformatics Course:
-#              R code accompanying the BIN-ALI-BLAST unit.
-#
-# ==============================================================================
-#
-# Version:  1.3
-#
-# Date:     2017-10  -  2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           1.3    2020 Maintenance
-#           1.2    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout
-#           1.1    Fixed parsing logic.
-#           1.0    First live version 2017.
-#           0.1    First code copied from 2016 material.
-#
-#
-# TODO:
-#
-#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC> 
-#TOC>   Section  Title                               Line
-#TOC> ---------------------------------------------------
-#TOC>   1        Defining the APSES domain             45
-#TOC>   2        Executing the BLAST search            75
-#TOC>   3        Analysing results                     97
-#TOC> 
-#TOC> ==========================================================================
-
-
-# =    1  Defining the APSES domain  ===========================================
-
-# Load your protein database
-source("makeProteinDB.R")
-
-# Get the APSES domain sequence via your MBP1_MYSPE feature annotation. (You
-# have entered this data into your database in the
-# BIN-ALI-Optimal_sequence_alignment unit.)
-
-( myOrth <- sprintf("MBP1_%s", biCode(MYSPE)) ) # If this is not the correct
-                                                # name of the Mbp1 orthologue
-                                                # of Mbp1 in your protein
-                                                # database, DON'T continue. We
-                                                # need to fix this problem.
-                                                # Get in touch.
-
-(proID <- myDB$protein$ID[myDB$protein$name == myOrth])
-(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
-(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
-                               myDB$annotation$featureID == ftrID])
-(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
-(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
-(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
-                 start,
-                 end))
-
-# The MYSPE "apses" sequence is the sequence that we will use for our reverse
-# BLAST search.
-
-
-# =    2  Executing the BLAST search  ==========================================
-
-# The ./scripts/BLAST.R code defines two functions to access the BLAST interface
-# through its Web API, and to parse results. Have a look at the script, then
-# source it:
-
-source("./scripts/BLAST.R")
-
-# Use BLAST() to find the best match to the MYSPE APSES domain in Saccharomyces
-# cerevisiae:
-
-BLASTresults <- BLAST(apses,                       # MYSPE APSES domain sequence
-                     db = "refseq_protein",        # database to search in
-                     nHits = 10,                   #
-                     E = 0.01,                     #
-                     limits = "txid559292[ORGN]")  # S. cerevisiae S288c
-
-
-length(BLASTresults$hits)  # There should be at least one hit there. Ask for
-                           # advice in case this step fails.
-
-
-# =    3  Analysing results  ===================================================
-
-(topHit <- BLASTresults$hits[[1]])   # Get the top hit
-
-# What is the refseq ID of the top hit
-topHit$accession
-
-# If this is "NP_010227.1" you have confirmed the RBM of the MYSPE apses
-# domain. If it is not, ask me for advice.
-
-
-
-
-
-# [END]
+# tocID <- "BIN-ALI-BLAST.R"
+#
+# Purpose:  A Bioinformatics Course:
+#              R code accompanying the BIN-ALI-BLAST unit.
+#
+# ==============================================================================
+#
+# Version:  1.3
+#
+# Date:     2017-10  -  2020-09
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.3    2020 Maintenance
+#           1.2    Change from require() to requireNamespace(),
+#                      use <package>::<function>() idiom throughout
+#           1.1    Fixed parsing logic.
+#           1.0    First live version 2017.
+#           0.1    First code copied from 2016 material.
+#
+#
+# TODO:
+#
+#
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+# going on. That's not how it works ...
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                               Line
+#TOC> ---------------------------------------------------
+#TOC>   1        Defining the APSES domain             45
+#TOC>   2        Executing the BLAST search            75
+#TOC>   3        Analysing results                     97
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  Defining the APSES domain  ===========================================
+
+# Load your protein database (the script lives in your ./myScripts/ folder)
+source("./myScripts/makeProteinDB.R")
+
+# Get the APSES domain sequence via your MBP1_MYSPE feature annotation. (You
+# have entered this data into your database in the
+# BIN-ALI-Optimal_sequence_alignment unit.)
+
+( myOrth <- sprintf("MBP1_%s", biCode(MYSPE)) ) # If this is not the correct
+                                                # name of the Mbp1 orthologue
+                                                # in your protein database,
+                                                # DON'T continue. We need to
+                                                # fix this problem first.
+                                                # Get in touch.
+
+(proID <- myDB$protein$ID[myDB$protein$name == myOrth])
+(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
+(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
+                               myDB$annotation$featureID == ftrID])
+(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
+(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
+(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
+                 start,
+                 end))
+
+# The MYSPE "apses" sequence is the sequence that we will use for our reverse
+# BLAST search.
+
+
+# =    2  Executing the BLAST search  ==========================================
+
+# The ./scripts/BLAST.R code defines two functions to access the BLAST interface
+# through its Web API, and to parse results. Have a look at the script, then
+# source it:
+
+source("./scripts/BLAST.R")
+
+# Use BLAST() to find the best match to the MYSPE APSES domain in Saccharomyces
+# cerevisiae:
+
+BLASTresults <- BLAST(apses,                       # MYSPE APSES domain sequence
+                     db = "refseq_protein",        # database to search in
+                     nHits = 10,                   # max. hits (TODO confirm)
+                     E = 0.01,                     # E-value limit (TODO confirm)
+                     limits = "txid559292[ORGN]")  # S. cerevisiae S288c
+
+
+length(BLASTresults$hits)  # There should be at least one hit there. Ask for
+                           # advice in case this step fails.
+
+
+# =    3  Analysing results  ===================================================
+
+(topHit <- BLASTresults$hits[[1]])   # Get the top hit
+
+# What is the RefSeq ID of the top hit?
+topHit$accession
+
+# If this is "NP_010227.1" you have confirmed the RBM of the MYSPE apses
+# domain. If it is not, ask me for advice.
+
+
+
+
+
+# [END]
--- a/BIN-ALI-Dotplot.R
+++ b/BIN-ALI-Dotplot.R
@ -1,195 +1,195 @@
-# tocID <- "BIN-ALI-Dotplot.R"
-#
-#
-# ==============================================================================
-#
-# Purpose:  A Bioinformatics Course:
-#              R code accompanying the BIN-ALI-Dotplot unit.
-#
-# Version:  0.2
-#
-# Date:     2019  01  07
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           0.2    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout
-#           0.1    First code copied from 2016 material.
-#
-#
-# TODO:
-#
-#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC> 
-#TOC>   Section  Title                  Line
-#TOC> --------------------------------------
-#TOC>   1        ___Section___            42
-#TOC>   2        Tasks                   190
-#TOC> 
-#TOC> ==========================================================================
-
-
-# =    1  ___Section___  =======================================================
-
-if (!requireNamespace("BiocManager", quietly=TRUE)) {
-  install.packages("BiocManager")
-}
-if (!requireNamespace("Biostrings", quietly=TRUE)) {
-  BiocManager::install("Biostrings")
-}
-# Package information:
-#  library(help = Biostrings)       # basic information
-#  browseVignettes("Biostrings")    # available vignettes
-#  data(package = "Biostrings")     # available datasets
-
-if (!requireNamespace("seqinr", quietly=TRUE)) {
-  install.packages("seqinr")
-}
-
-
-# Let's load BLOSUM62
-data(BLOSUM62, package = "Biostrings")
-
-# Now let's craft code for a dotplot. That's surprisingly simple. We build a
-# matrix that has as many rows as one sequence, as many columns as another. Then
-# we go through every cell of the matrix and enter the pairscore we encounter
-# for the amino acid pair whose position corresponds to the row and column
-# index. Finally we visualize the matrix in a plot.
-#
-
-# First we fetch our sequences and split them into single characters.
-sel <- myDB$protein$name == "MBP1_SACCE"
-MBP1_SACCE <- seqinr::s2c(myDB$protein$sequence[sel])
-
-sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
-MBP1_MYSPE <- seqinr::s2c(myDB$protein$sequence[sel])
-
-# Check that we have two character vectors of the expected length.
-str(MBP1_SACCE)
-str(MBP1_MYSPE)
-
-# How do we get the pairscore values? Consider: a single pair of amino acids can
-# be obtained from sequence SACCE and MYSPE eg. from position 13 and 21 ...
-MBP1_SACCE[13]
-MBP1_MYSPE[21]
-
-# ... using these as subsetting expressions, we can pull the pairscore
-# from the MDM
-BLOSUM62[MBP1_SACCE[13], MBP1_MYSPE[21]]
-
-# First we build an empty matrix that will hold all pairscores ...
-dotMat <- matrix(numeric(length(MBP1_SACCE) * length(MBP1_MYSPE)),
-                 nrow = length(MBP1_SACCE), ncol = length(MBP1_MYSPE))
-
-# ... then we loop over the sequences and store the scores in the matrix.
-#
-for (i in 1:length(MBP1_SACCE)) {
-  for (j in 1:length(MBP1_MYSPE)) {
-    dotMat[i, j] <- BLOSUM62[MBP1_SACCE[i], MBP1_MYSPE[j]]
-  }
-}
-
-# Even though this is a large matrix, this does not take much time ...
-# Let's have a look at a small block of the values:
-
-dotMat[1:10, 1:10]
-
-# Rows in this matrix correspond to an amino acid from MBP1_SACCE, columns in
-# the matrix correspond to an amino acid from MBP1_MYSPE.
-
-# To plot this, we use the image() function. Here, with default parameters.
-
-image(dotMat)
-
-# Be patient, this takes a few moments to render: more than 500,000 values.
-# Nice.
-# What do you expect?
-# What would similar sequences look like?
-# What do you see?
-
-#You migh notice a thin line of yellow along the diagonal, moving approximately
-# from bottom left to top right, fading in and out of existence. This is the
-# signature of extended sequence similarity.
-
-# Let's magnify this a bit by looking at only the first 200 amino acids ...
-image(dotMat[1:200, 1:200])
-
-# ... and, according to our normal writing convention, we would like the
-# diagonal to run from top-left to bottom-right since we write from left to
-# right and from top to bottom...
-image(dotMat[1:200, 1:200], ylim = 1.0:0.0)
-
-# ... and we would like the range of the x- and y- axis to correspond to the
-# sequence position ...
-image(x = 1:200, y = 1:200,  dotMat[1:200, 1:200], ylim=c(200,1))
-
-# ... and labels! Axis labels would be nice ...
-image(x = 1:200, y = 1:200,  dotMat[1:200, 1:200], ylim=c(200,1),
-      xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE" )
-
-# ... and why don't we have axis-numbers on all four sides? Go, make that right
-# too ...
-len <- 200
-image(x = 1:len, y = 1:len,  dotMat[1:len, 1:len], ylim=c(len,1),
-      xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE", axes = FALSE)
-box()
-axis(1, at = c(1, seq(10, len, by=10)))
-axis(2, at = c(1, seq(10, len, by=10)))
-axis(3, at = c(1, seq(10, len, by=10)))
-axis(4, at = c(1, seq(10, len, by=10)))
-
-# ... you get the idea, we can infinitely customize our plot. However a good way
-# to do this is to develop a particular view for, say, a report or publication
-# in a script and then put it into a function. I have put a function into the
-# utilities file and called it dotPlot2(). Why not dotPlot() ... that's because
-# there already is a dotplot function in the seqinr package:
-
-seqinr::dotPlot(MBP1_SACCE, MBP1_MYSPE)                           # seqinr
-dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE")  # Our's
-
-# Which one do you prefer? You can probably see the block patterns that arise
-# from segments of repetitive, low complexity sequence. But you probably have to
-# look very closely to discern the faint diagonals that correspond to similar
-# sequence.
-
-
-# Let's see if we can enhance the contrast between distributed noise and the
-# actual alignment of conserved residues. We can filter the dot matrix with a
-# pattern that enhances diagonally repeated values. Every value in the matrix
-# will be replaced by a weighted average of its neighborhood. Here is  a
-# diagonal-filter:
-
-myFilter <- matrix(numeric(25), nrow = 5)
-myFilter[1, ] <- c( 1, 0, 0, 0, 0)
-myFilter[2, ] <- c( 0, 1, 0, 0, 0)
-myFilter[3, ] <- c( 0, 0, 1, 0, 0)
-myFilter[4, ] <- c( 0, 0, 0, 1, 0)
-myFilter[5, ] <- c( 0, 0, 0, 0, 1)
-
-# I have added the option to read such filters (or others that you could define on your own) as a parameter of the function.
-
-dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE", f = myFilter)
-
-# I think the result shows quite nicely how the two sequences are globally
-# related and where the regions of sequence similarity are. Play with this a bit
-# ...  Can you come up with a better filter? If so, eMail us.
-
-
-
-
-# =    2  Tasks  ===============================================================
-
-
-
-
-# [END]
+# tocID <- "BIN-ALI-Dotplot.R"
+#
+#
+# ==============================================================================
+#
+# Purpose:  A Bioinformatics Course:
+#              R code accompanying the BIN-ALI-Dotplot unit.
+#
+# Version:  0.2
+#
+# Date:     2019  01  07
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           0.2    Change from require() to requireNamespace(),
+#                      use <package>::<function>() idiom throughout
+#           0.1    First code copied from 2016 material.
+#
+#
+# TODO:
+#
+#
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+# going on. That's not how it works ...
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                  Line
+#TOC> --------------------------------------
+#TOC>   1        ___Section___            42
+#TOC>   2        Tasks                   190
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  ___Section___  =======================================================
+
+if (!requireNamespace("BiocManager", quietly=TRUE)) {
+  install.packages("BiocManager")
+}
+if (!requireNamespace("Biostrings", quietly=TRUE)) {
+  BiocManager::install("Biostrings")
+}
+# Package information:
+#  library(help = Biostrings)       # basic information
+#  browseVignettes("Biostrings")    # available vignettes
+#  data(package = "Biostrings")     # available datasets
+
+if (!requireNamespace("seqinr", quietly=TRUE)) {
+  install.packages("seqinr")
+}
+
+
+# Let's load BLOSUM62
+data(BLOSUM62, package = "Biostrings")
+
+# Now let's craft code for a dotplot. That's surprisingly simple. We build a
+# matrix that has as many rows as one sequence, as many columns as another. Then
+# we go through every cell of the matrix and enter the pairscore we encounter
+# for the amino acid pair whose position corresponds to the row and column
+# index. Finally we visualize the matrix in a plot.
+#
+
+# First we fetch our sequences and split them into single characters.
+sel <- myDB$protein$name == "MBP1_SACCE"
+MBP1_SACCE <- seqinr::s2c(myDB$protein$sequence[sel])
+
+sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
+MBP1_MYSPE <- seqinr::s2c(myDB$protein$sequence[sel])
+
+# Check that we have two character vectors of the expected length.
+str(MBP1_SACCE)
+str(MBP1_MYSPE)
+
+# How do we get the pairscore values? Consider: a single pair of amino acids can
+# be obtained from sequence SACCE and MYSPE eg. from position 13 and 21 ...
+MBP1_SACCE[13]
+MBP1_MYSPE[21]
+
+# ... using these as subsetting expressions, we can pull the pairscore
+# from the MDM
+BLOSUM62[MBP1_SACCE[13], MBP1_MYSPE[21]]
+
+# First we build an empty matrix that will hold all pairscores ...
+dotMat <- matrix(numeric(length(MBP1_SACCE) * length(MBP1_MYSPE)),
+                 nrow = length(MBP1_SACCE), ncol = length(MBP1_MYSPE))
+
+# ... then we loop over the sequences and store the scores in the matrix.
+#
+for (i in seq_along(MBP1_SACCE)) {        # rows:    positions in MBP1_SACCE
+  for (j in seq_along(MBP1_MYSPE)) {      # columns: positions in MBP1_MYSPE
+    dotMat[i, j] <- BLOSUM62[MBP1_SACCE[i], MBP1_MYSPE[j]]  # pairscore
+  }
+}
+
+# Even though this is a large matrix, this does not take much time ...
+# Let's have a look at a small block of the values:
+
+dotMat[1:10, 1:10]
+
+# Rows in this matrix correspond to an amino acid from MBP1_SACCE, columns in
+# the matrix correspond to an amino acid from MBP1_MYSPE.
+
+# To plot this, we use the image() function. Here, with default parameters.
+
+image(dotMat)
+
+# Be patient, this takes a few moments to render: more than 500,000 values.
+# Nice.
+# What do you expect?
+# What would similar sequences look like?
+# What do you see?
+
+# You might notice a thin line of yellow along the diagonal, moving
+# approximately from bottom left to top right, fading in and out of existence.
+# This is the signature of extended sequence similarity.
+
+# Let's magnify this a bit by looking at only the first 200 amino acids ...
+image(dotMat[1:200, 1:200])
+
+# ... and, according to our normal writing convention, we would like the
+# diagonal to run from top-left to bottom-right since we write from left to
+# right and from top to bottom...
+image(dotMat[1:200, 1:200], ylim = 1.0:0.0)
+
+# ... and we would like the range of the x- and y- axis to correspond to the
+# sequence position ...
+image(x = 1:200, y = 1:200,  dotMat[1:200, 1:200], ylim=c(200,1))
+
+# ... and labels! Axis labels would be nice ...
+image(x = 1:200, y = 1:200,  dotMat[1:200, 1:200], ylim=c(200,1),
+      xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE" )
+
+# ... and why don't we have axis-numbers on all four sides? Go, make that right
+# too ...
+len <- 200
+image(x = 1:len, y = 1:len,  dotMat[1:len, 1:len], ylim=c(len,1),
+      xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE", axes = FALSE)
+box()
+axis(1, at = c(1, seq(10, len, by=10)))
+axis(2, at = c(1, seq(10, len, by=10)))
+axis(3, at = c(1, seq(10, len, by=10)))
+axis(4, at = c(1, seq(10, len, by=10)))
+
+# ... you get the idea, we can infinitely customize our plot. However a good way
+# to do this is to develop a particular view for, say, a report or publication
+# in a script and then put it into a function. I have put a function into the
+# utilities file and called it dotPlot2(). Why not dotPlot() ... that's because
+# there already is a dotplot function in the seqinr package:
+
+seqinr::dotPlot(MBP1_SACCE, MBP1_MYSPE)                           # seqinr
+dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE")  # Ours
+
+# Which one do you prefer? You can probably see the block patterns that arise
+# from segments of repetitive, low complexity sequence. But you probably have to
+# look very closely to discern the faint diagonals that correspond to similar
+# sequence.
+
+
+# Let's see if we can enhance the contrast between distributed noise and the
+# actual alignment of conserved residues. We can filter the dot matrix with a
+# pattern that enhances diagonally repeated values. Every value in the matrix
+# will be replaced by a weighted average of its neighborhood. Here is  a
+# diagonal-filter:
+
+myFilter <- matrix(numeric(25), nrow = 5)
+myFilter[1, ] <- c( 1, 0, 0, 0, 0)
+myFilter[2, ] <- c( 0, 1, 0, 0, 0)
+myFilter[3, ] <- c( 0, 0, 1, 0, 0)
+myFilter[4, ] <- c( 0, 0, 0, 1, 0)
+myFilter[5, ] <- c( 0, 0, 0, 0, 1)
+
+# dotPlot2() takes such filters (or others you define yourself) as parameter f.
+
+dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE", f = myFilter)
+
+# I think the result shows quite nicely how the two sequences are globally
+# related and where the regions of sequence similarity are. Play with this a bit
+# ...  Can you come up with a better filter? If so, eMail us.
+
+
+
+
+# =    2  Tasks  ===============================================================
+
+
+
+
+# [END]
--- a/BIN-ALI-MSA.R
+++ b/BIN-ALI-MSA.R
--- a/BIN-ALI-Optimal_sequence_alignment.R
+++ b/BIN-ALI-Optimal_sequence_alignment.R
@ -1,365 +1,365 @@
-# tocID <- "BIN-ALI-Optimal_sequence_alignment.R"
-#
-# Purpose:  A Bioinformatics Course:
-#              R code accompanying the BIN-ALI-Optimal_sequence_alignment unit.
-#
-# ==============================================================================
-# Version:  1.7.1
-#
-# Date:     2017-09   -   2020-10
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           1.7.1  add jsonlite:: to fromJSON() in code sample and ./myScripts/
-#           1.7    2020 updates
-#           1.6    Maintenance
-#           1.5    Change from require() to requireNamespace(),
-#                    use <package>::<function>() idiom throughout
-#           1.4    Pull s2c() from seqinr package, rather than loading the
-#                    entire library.
-#           1.3    Updated confirmation task with correct logic
-#           1.2    Added missing load of seqinr package
-#           1.1    Update annotation file logic - it could already have been
-#                    prepared in the BIN-FUNC-Annotation unit.
-#           1.0.1  bugfix
-#           1.0    First 2017 live version.
-#           0.1    First code copied from 2016 material.
-#
-# TODO:
-#
-#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC> 
-#TOC>   Section  Title                                                      Line
-#TOC> --------------------------------------------------------------------------
-#TOC>   1        Prepare                                                      58
-#TOC>   2        Biostrings Pairwise Alignment                                75
-#TOC>   2.1        Optimal global alignment                                   93
-#TOC>   2.2        Optimal local alignment                                   156
-#TOC>   3        APSES Domain annotation by alignment                        180
-#TOC>   4        Update your database script                                 261
-#TOC>   4.1        Preparing an annotation file ...                          267
-#TOC>   4.1.1          If you HAVE NOT done the BIN-FUNC-Annotation unit     269
-#TOC>   4.1.2          If you HAVE done the BIN-FUNC-Annotation unit         314
-#TOC>   4.2        Execute and Validate                                      338
-#TOC> 
-#TOC> ==========================================================================
-
-
-# =    1  Prepare  =============================================================
-
-if (! requireNamespace("seqinr", quietly=TRUE)) {
-  install.packages("seqinr")
-}
-# You can get package information with the following commands:
-# library(help = seqinr)       # basic information
-# browseVignettes("seqinr")    # available vignettes
-# data(package = "seqinr")     # available datasets
-
-
-# You need to recreate the protein database that you have constructed in the
-# BIN-Storing_data unit.
-
-source("./myScripts/makeProteinDB.R")
-
-
-# =    2  Biostrings Pairwise Alignment  =======================================
-
-
-if (!requireNamespace("BiocManager", quietly=TRUE)) {
-  install.packages("BiocManager")
-}
-if (!requireNamespace("Biostrings", quietly=TRUE)) {
-  BiocManager::install("Biostrings")
-}
-# Package information:
-#  library(help = Biostrings)       # basic information
-#  browseVignettes("Biostrings")    # available vignettes
-#  data(package = "Biostrings")     # available datasets
-
-
-# Biostrings stores sequences in "XString" objects. Once we have converted our
-# target sequences to AAString objects, the alignment itself is straightforward.
-
-# ==   2.1  Optimal global alignment  ==========================================
-
-# The pairwiseAlignment() function was written to behave
-# exactly like the functions you encountered on the EMBOSS server.
-
-# First: make AAString objects ...
-sel <- myDB$protein$name == "MBP1_SACCE"
-aaMBP1_SACCE <- Biostrings::AAString(myDB$protein$sequence[sel])
-
-sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
-aaMBP1_MYSPE <-   Biostrings::AAString(myDB$protein$sequence[sel])
-
-?Biostrings::pairwiseAlignment   # qualify the topic: Biostrings is not attached
-# ... and align.
-# Global optimal alignment with end-gap penalties is default.
-ali1 <-  Biostrings::pairwiseAlignment(
-  aaMBP1_SACCE,
-  aaMBP1_MYSPE,
-  substitutionMatrix = "BLOSUM62",
-  gapOpening = 10,
-  gapExtension = 0.5)
-
-str(ali1)  # ... it's complicated
-
-# This is a Biostrings alignment object. But we can use Biostrings functions to
-# tame it:
-ali1
-Biostrings::writePairwiseAlignments(ali1)   # That should look familiar
-
-# And we can make the internal structure work for us  (@ is for classes as
-# $ is for lists ...)
-str(ali1@pattern)
-ali1@pattern
-ali1@pattern@range
-ali1@pattern@indel
-ali1@pattern@mismatch
-
-# or work with "normal" R functions
-# the alignment length
-nchar(as.character(ali1@pattern))
-
-# the number of identities
-sum(seqinr::s2c(as.character(ali1@pattern)) ==
-    seqinr::s2c(as.character(ali1@subject)))
-
-# ... e.g. to calculate the percentage of identities
-100 *
-  sum(seqinr::s2c(as.character(ali1@pattern)) ==
-      seqinr::s2c(as.character(ali1@subject))) /
-  nchar(as.character(ali1@pattern))
-# ... which should be the same as reported in the writePairwiseAlignments()
-# output. Awkward to type? Then it calls for a function:
-#
-percentID <- function(al) {
-  # Percent identity of a Biostrings alignment: 100 * matches / aligned length
-  patChars <- seqinr::s2c(as.character(al@pattern))   # aligned pattern residues
-  subChars <- seqinr::s2c(as.character(al@subject))   # aligned subject residues
-  nMatch <- sum(patChars == subChars)                 # identical positions
-  return(100 * nMatch / nchar(as.character(al@pattern)))
-}
-
-percentID(ali1)
-
-# ==   2.2  Optimal local alignment  ===========================================
-
-# Compare with local optimal alignment (like EMBOSS Water)
-ali2 <-  Biostrings::pairwiseAlignment(
-  aaMBP1_SACCE,
-  aaMBP1_MYSPE,
-  type = "local",
-  substitutionMatrix = "BLOSUM62",
-  gapOpening = 50,
-  gapExtension = 10)
-
-Biostrings::writePairwiseAlignments(ali2)
-# This has probably only aligned the N-terminal DNA binding domain - but that
-# one has quite high sequence identity:
-percentID(ali2)
-
-# == TASK: ==
-
-# Compare the two alignments. I have weighted the local alignment heavily
-# towards an ungapped alignment by setting very high gap penalties. Try changing
-# the gap penalties and see what happens: how does the number of indels change,
-# how does the length of indels change...
-
-
-# =    3  APSES Domain annotation by alignment  ================================
-
-# In this section we define the MYSPE APSES sequence by performing a global,
-# optimal sequence alignment of the yeast APSES domain with the full length
-# protein sequence of the protein that was the most similar to the yeast APSES
-# domain.
-#
-
-# I have annotated the yeast APSES domain as a feature in the
-# database. To view the annotation, we can retrieve it via the proteinID and
-# featureID. Here is the yeast protein ID:
-(proID <- myDB$protein$ID[myDB$protein$name == "MBP1_SACCE"])
-
-
-# ... and if you look at the feature table, you can identify the feature ID
-(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
-
-# ... and with the two annotations we can get the corresponding ID from the
-# annotation table
-(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
-                             myDB$annotation$featureID == ftrID])
-
-myDB$annotation[myDB$annotation$proteinID == proID &   # show the full record
-                myDB$annotation$featureID == ftrID, ]  # (same row as fanID)
-
-# The annotation record contains the start and end coordinates which we can use
-# to define the APSES domain sequence with a substr() expression.
-
-(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
-(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
-(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
-                 start,
-                 end))
-
-# Lots of code. But don't get lost. Let's recapitulate what we have done: we
-# have selected from the sequence column of the protein table the sequence whose
-# name is "MBP1_SACCE", and selected from the annotation table the start
-# and end coordinates of the annotation that joins an "APSES fold" feature with
-# the sequence, and used the start and end coordinates to extract a substring.
-
-# Let's convert this to an AAstring and assign it:
-aaMB1_SACCE_APSES <- Biostrings::AAString(apses)
-
-# Now let's align these two sequences of very different length without end-gap
-# penalties using the "overlap" type. "overlap" turns the
-# end-gap penalties off and that is crucially important since
-# the sequences have very different length.
-
-aliApses <-  Biostrings::pairwiseAlignment(
-  aaMB1_SACCE_APSES,
-  aaMBP1_MYSPE,
-  type = "overlap",
-  substitutionMatrix = "BLOSUM62",
-  gapOpening = 10,
-  gapExtension = 0.5)
-
-# Inspect the result. The aligned sequences should be clearly
-# homologous, and have (almost) no indels. The entire "pattern"
-# sequence from QIYSAR ... to ... KPLFDF should be matched
-# with the "subject" (the MYSPE sequence). Is this correct?
-Biostrings::writePairwiseAlignments(aliApses)
-
-# If this is correct, you can extract the matched sequence from
-# the alignment object. The syntax is a bit different from what
-# you have seen before: this is an "S4 object", not a list. No
-# worries: as.character() returns a normal string.
-as.character(aliApses@subject)
-
-# Now, what are the aligned start and end coordinates? You can read them from
-# the output of writePairwiseAlignments(), or you can get them from the range of
-# the match.
-
-str(aliApses@subject@range)
-
-# start is:
-aliApses@subject@range@start
-
-# ... and end is:
-aliApses@subject@range@start + aliApses@subject@range@width - 1
-
-
-# =    4  Update your database script  =========================================
-
-
-# Since we have this feature defined now, we can create a feature annotation
-# right away and store it in myDB.
-
-# ==   4.1  Preparing an annotation file ...  ==================================
-#
-# ===   4.1.1  If you HAVE NOT done the BIN-FUNC-Annotation unit
-#
-#
-#   You DON'T already have a file called "<MYSPE>-Annotations.json" in the
-#   ./myScripts/ directory:
-#
-#   - Make a copy of the file "./data/refAnnotations.json" and put it in your
-#     myScripts/ directory.
-#
-#   - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.
-#     if MYSPE is called "Crptycoccus neoformans", your file should be called
-#     "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is
-#     "MBP1_CRYNE").
-#
-#   - Open the file in the RStudio editor and delete all blocks for
-#     the Mbp1 protein annotations except the first one.
-#
-#   - From that block, delete all lines except for the line that says:
-#
-# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"},
-#
-#   - Then delete the comma at the end of the line (your file will just have
-#     this one annotation).
-#
-#   - Edit that annotation: change MBP1_SACCE  to MBP1_<MYSPE> and change the
-#     "start" and "end" features to the coordinates you just discovered for the
-#     APSES domain in your sequence.
-#
-#   - Save the file in your myScripts/ directory
-#
-##   - Validate your file online at https://jsonlint.com/
-#
-#   - Update your "./myScripts/makeProteinDB.R" script to load your new
-#     annotation when you recreate the database. Open the script in the
-#     RStudio editor, and add the following command at the end:
-#
-#     myDB <- dbAddAnnotation(myDB,
-#                 jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json"))
-#                                                 ^^^^^^^
-#                                                edit this!
-#   - save and close the file.
-#
-# Then SKIP the next section.
-#
-#
-# ===   4.1.2  If you HAVE done the BIN-FUNC-Annotation unit    
-#
-#
-#   You DO already have a file called "<MYSPE>-Annotations.json" in the
-#   ./myScripts/ directory:
-#
-#   - Open the file in the RStudio editor.
-#
-#   - Below the last feature lines (but before the closing "]") add the
-#     following feature line (without the "#")
-#
-# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"}
-#
-#   - Edit that annotation: change MBP1_SACCE  to MBP1_<MYSPE> and change the
-#     "start" and "end" features to the coordinates you just discovered for the
-#     APSES domain in your sequence.
-#
-#   - Add a comma after the preceding feature line.
-#
-#   - Save your file.
-#
-#   - Validate your file online at https://jsonlint.com/
-#
-#
-# ==   4.2  Execute and Validate  ==============================================
-#
-#   - source() your database creation script:
-#
-#  source("./myScripts/makeProteinDB.R")
-#
-#     This should run without errors or warnings. If it doesn't work and you
-#     can't figure out quickly what's happening, ask on the mailing list for
-#     help.
-#
-#   - Confirm
-#     The following commands should retrieve the correct start and end
-#     coordinates and sequence of the MBP1_MYSPE APSES domain:
-
-sel <- which(myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = ""))
-
-(proID <- myDB$protein$ID[sel])
-(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
-(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
-                             myDB$annotation$featureID == ftrID])
-(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
-(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
-(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
-                 start,
-                 end))
-
-
-# [END]
+# tocID <- "BIN-ALI-Optimal_sequence_alignment.R"
+#
+# Purpose:  A Bioinformatics Course:
+#              R code accompanying the BIN-ALI-Optimal_sequence_alignment unit.
+#
+# ==============================================================================
+# Version:  1.7.1
+#
+# Date:     2017-09   -   2020-10
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.7.1  add jsonlite:: to fromJSON() in code sample and ./myScripts/
+#           1.7    2020 updates
+#           1.6    Maintenance
+#           1.5    Change from require() to requireNamespace(),
+#                    use <package>::<function>() idiom throughout
+#           1.4    Pull s2c() from seqinr package, rather than loading the
+#                    entire library.
+#           1.3    Updated confirmation task with correct logic
+#           1.2    Added missing load of seqinr package
+#           1.1    Update annotation file logic - it could already have been
+#                    prepared in the BIN-FUNC-Annotation unit.
+#           1.0.1  bugfix
+#           1.0    First 2017 live version.
+#           0.1    First code copied from 2016 material.
+#
+# TODO:
+#
+#
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+# going on. That's not how it works ...
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                                                      Line
+#TOC> --------------------------------------------------------------------------
+#TOC>   1        Prepare                                                      58
+#TOC>   2        Biostrings Pairwise Alignment                                75
+#TOC>   2.1        Optimal global alignment                                   93
+#TOC>   2.2        Optimal local alignment                                   156
+#TOC>   3        APSES Domain annotation by alignment                        180
+#TOC>   4        Update your database script                                 261
+#TOC>   4.1        Preparing an annotation file ...                          267
+#TOC>   4.1.1          If you HAVE NOT done the BIN-FUNC-Annotation unit     269
+#TOC>   4.1.2          If you HAVE done the BIN-FUNC-Annotation unit         314
+#TOC>   4.2        Execute and Validate                                      338
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  Prepare  =============================================================
+
+if (! requireNamespace("seqinr", quietly=TRUE)) {
+  install.packages("seqinr")
+}
+# You can get package information with the following commands:
+# library(help = seqinr)       # basic information
+# browseVignettes("seqinr")    # available vignettes
+# data(package = "seqinr")     # available datasets
+
+
+# You need to recreate the protein database that you have constructed in the
+# BIN-Storing_data unit.
+
+source("./myScripts/makeProteinDB.R")
+
+
+# =    2  Biostrings Pairwise Alignment  =======================================
+
+
+if (!requireNamespace("BiocManager", quietly=TRUE)) {
+  install.packages("BiocManager")
+}
+if (!requireNamespace("Biostrings", quietly=TRUE)) {
+  BiocManager::install("Biostrings")
+}
+# Package information:
+#  library(help = Biostrings)       # basic information
+#  browseVignettes("Biostrings")    # available vignettes
+#  data(package = "Biostrings")     # available datasets
+
+
+# Biostrings stores sequences in "XString" objects. Once we have converted our
+# target sequences to AAString objects, the alignment itself is straightforward.
+
+# ==   2.1  Optimal global alignment  ==========================================
+
+# The pairwiseAlignment() function was written to behave
+# exactly like the functions you encountered on the EMBOSS server.
+
+# First: make AAString objects ...
+sel <- myDB$protein$name == "MBP1_SACCE"
+aaMBP1_SACCE <- Biostrings::AAString(myDB$protein$sequence[sel])
+
+sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
+aaMBP1_MYSPE <-   Biostrings::AAString(myDB$protein$sequence[sel])
+
+?pairwiseAlignment
+# ... and align.
+# Global optimal alignment with end-gap penalties is default.
+ali1 <-  Biostrings::pairwiseAlignment(
+  aaMBP1_SACCE,
+  aaMBP1_MYSPE,
+  substitutionMatrix = "BLOSUM62",
+  gapOpening = 10,
+  gapExtension = 0.5)
+
+str(ali1)  # ... it's complicated
+
+# This is a Biostrings alignment object. But we can use Biostrings functions to
+# tame it:
+ali1
+Biostrings::writePairwiseAlignments(ali1)   # That should look familiar
+
+# And we can make the internal structure work for us  (@ is for classes as
+# $ is for lists ...)
+str(ali1@pattern)
+ali1@pattern
+ali1@pattern@range
+ali1@pattern@indel
+ali1@pattern@mismatch
+
+# or work with "normal" R functions
+# the alignment length
+nchar(as.character(ali1@pattern))
+
+# the number of identities
+sum(seqinr::s2c(as.character(ali1@pattern)) ==
+    seqinr::s2c(as.character(ali1@subject)))
+
+# ... e.g. to calculate the percentage of identities
+100 *
+  sum(seqinr::s2c(as.character(ali1@pattern)) ==
+      seqinr::s2c(as.character(ali1@subject))) /
+  nchar(as.character(ali1@pattern))
+# ... which should be the same as reported in the writePairwiseAlignments()
+# output. Awkward to type? Then it calls for a function:
+#
+percentID <- function(al) {
+  # Percent identity of a Biostrings alignment object: the number of identical
+  # aligned characters as a percentage of the alignment length.
+  patt <- seqinr::s2c(as.character(al@pattern))
+  subj <- seqinr::s2c(as.character(al@subject))
+  return(100 * sum(patt == subj) / nchar(as.character(al@pattern)))
+}
+
+percentID(ali1)
+
+# ==   2.2  Optimal local alignment  ===========================================
+
+# Compare with local optimal alignment (like EMBOSS Water)
+ali2 <-  Biostrings::pairwiseAlignment(
+  aaMBP1_SACCE,
+  aaMBP1_MYSPE,
+  type = "local",
+  substitutionMatrix = "BLOSUM62",
+  gapOpening = 50,
+  gapExtension = 10)
+
+Biostrings::writePairwiseAlignments(ali2)
+# This has probably only aligned the N-terminal DNA binding domain - but that
+# one has quite high sequence identity:
+percentID(ali2)
+
+# == TASK: ==
+
+# Compare the two alignments. I have weighted the local alignment heavily
+# towards an ungapped alignment by setting very high gap penalties. Try changing
+# the gap penalties and see what happens: how does the number of indels change,
+# how does the length of indels change...
+
+
+# =    3  APSES Domain annotation by alignment  ================================
+
+# In this section we define the MYSPE APSES sequence by performing an optimal
+# "overlap" alignment (no end-gap penalties) of the yeast APSES domain with the
+# full length sequence of the protein that was the most similar to the yeast
+# APSES domain.
+#
+
+# I have annotated the yeast APSES domain as a feature in the
+# database. To view the annotation, we can retrieve it via the proteinID and
+# featureID. Here is the yeast protein ID:
+(proID <- myDB$protein$ID[myDB$protein$name == "MBP1_SACCE"])
+
+
+# ... and if you look at the feature table, you can identify the feature ID
+(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
+
+# ... and with the two IDs we can get the corresponding ID from the
+# annotation table ...
+(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
+                             myDB$annotation$featureID == ftrID])
+# ... or display the complete annotation record for this pair:
+myDB$annotation[myDB$annotation$proteinID == proID &
+                myDB$annotation$featureID == ftrID, ]
+
+# The annotation record contains the start and end coordinates which we can use
+# to define the APSES domain sequence with a substr() expression.
+
+(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
+(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
+(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
+                 start,
+                 end))
+
+# Lots of code. But don't get lost. Let's recapitulate what we have done: we
+# have selected from the sequence column of the protein table the sequence whose
+# name is "MBP1_SACCE", and selected from the annotation table the start
+# and end coordinates of the annotation that joins an "APSES fold" feature with
+# the sequence, and used the start and end coordinates to extract a substring.
+
+# Let's convert this to an AAString object and assign it:
+aaMB1_SACCE_APSES <- Biostrings::AAString(apses)
+
+# Now let's align these two sequences of very different length without end-gap
+# penalties using the "overlap" type. "overlap" turns the
+# end-gap penalties off and that is crucially important since
+# the sequences have very different length.
+
+aliApses <-  Biostrings::pairwiseAlignment(
+  aaMB1_SACCE_APSES,
+  aaMBP1_MYSPE,
+  type = "overlap",
+  substitutionMatrix = "BLOSUM62",
+  gapOpening = 10,
+  gapExtension = 0.5)
+
+# Inspect the result. The aligned sequences should be clearly
+# homologous, and have (almost) no indels. The entire "pattern"
+# sequence from QIYSAR ... to ... KPLFDF  should be matched
+# with the "subject". Is this correct?
+Biostrings::writePairwiseAlignments(aliApses)
+
+# If this is correct, you can extract the matched sequence from
+# the alignment object. The syntax is a bit different from what
+# you have seen before: this is an "S4 object", not a list. No
+# worries: as.character() returns a normal string.
+as.character(aliApses@subject)
+
+# Now, what are the aligned start and end coordinates? You can read them from
+# the output of writePairwiseAlignments(), or you can get them from the range of
+# the match.
+
+str(aliApses@subject@range)
+
+# start is:
+aliApses@subject@range@start
+
+# ... and end is:
+aliApses@subject@range@start + aliApses@subject@range@width - 1
+
+
+# =    4  Update your database script  =========================================
+
+
+# Since we have this feature defined now, we can create a feature annotation
+# right away and store it in myDB.
+
+# ==   4.1  Preparing an annotation file ...  ==================================
+#
+# ===   4.1.1  If you HAVE NOT done the BIN-FUNC-Annotation unit
+#
+#
+#   You DON'T already have a file called "<MYSPE>-Annotations.json" in the
+#   ./myScripts/ directory:
+#
+#   - Make a copy of the file "./data/refAnnotations.json" and put it in your
+#     myScripts/ directory.
+#
+#   - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.
+#     if MYSPE is called "Cryptococcus neoformans", your file should be called
+#     "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is
+#     "MBP1_CRYNE").
+#
+#   - Open the file in the RStudio editor and delete all blocks for
+#     the Mbp1 protein annotations except the first one.
+#
+#   - From that block, delete all lines except for the line that says:
+#
+# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"},
+#
+#   - Then delete the comma at the end of the line (your file will just have
+#     this one annotation).
+#
+#   - Edit that annotation: change MBP1_SACCE  to MBP1_<MYSPE> and change the
+#     "start" and "end" features to the coordinates you just discovered for the
+#     APSES domain in your sequence.
+#
+#   - Save the file in your myScripts/ directory
+#
+#   - Validate your file online at https://jsonlint.com/
+#
+#   - Update your "./myScripts/makeProteinDB.R" script to load your new
+#     annotation when you recreate the database. Open the script in the
+#     RStudio editor, and add the following command at the end:
+#
+#     myDB <- dbAddAnnotation(myDB,
+#                 jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json"))
+#                                                 ^^^^^^^
+#                                                edit this!
+#   - save and close the file.
+#
+# Then SKIP the next section.
+#
+#
+# ===   4.1.2  If you HAVE done the BIN-FUNC-Annotation unit    
+#
+#
+#   You DO already have a file called "<MYSPE>-Annotations.json" in the
+#   ./myScripts/ directory:
+#
+#   - Open the file in the RStudio editor.
+#
+#   - Below the last feature lines (but before the closing "]") add the
+#     following feature line (without the "#")
+#
+# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"}
+#
+#   - Edit that annotation: change MBP1_SACCE  to MBP1_<MYSPE> and change the
+#     "start" and "end" features to the coordinates you just discovered for the
+#     APSES domain in your sequence.
+#
+#   - Add a comma after the preceding feature line.
+#
+#   - Save your file.
+#
+#   - Validate your file online at https://jsonlint.com/
+#
+#
+# ==   4.2  Execute and Validate  ==============================================
+#
+#   - source() your database creation script:
+#
+#  source("./myScripts/makeProteinDB.R")
+#
+#     This should run without errors or warnings. If it doesn't work and you
+#     can't figure out quickly what's happening, ask on the mailing list for
+#     help.
+#
+#   - Confirm
+#     The following commands should retrieve the correct start and end
+#     coordinates and sequence of the MBP1_MYSPE APSES domain:
+
+sel <- which(myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = ""))
+
+(proID <- myDB$protein$ID[sel])
+(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
+(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
+                             myDB$annotation$featureID == ftrID])
+(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
+(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
+(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
+                 start,
+                 end))
+
+
+# [END]
--- a/BIN-ALI-Similarity.R
+++ b/BIN-ALI-Similarity.R
@ -1,313 +1,313 @@
-# tocID <- "BIN-ALI-Similarity.R"
-#
-# Purpose:  A Bioinformatics Course:
-#              R code accompanying the BIN-ALI-Similarity unit.
-#
-# Version:  1.2
-#
-# Date:     2017-10  -  2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           1.2    2020 Updates
-#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout
-#           1.0    Refactored for 2017; add aaindex, ternary plot.
-#           0.1    First code copied from 2016 material.
-#
-#
-# TODO:
-#   Update ggtern:: ternary plot to use aacol dots under text
-#
-#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC> 
-#TOC>   Section  Title                          Line
-#TOC> ----------------------------------------------
-#TOC>   1        Amino Acid Properties            43
-#TOC>   2        Mutation Data matrix            189
-#TOC>   3        Background score                230
-#TOC> 
-#TOC> ==========================================================================
-
-
-# =    1  Amino Acid Properties  ===============================================
-
-# A large collection of amino acid property tables is available via the seqinr
-# package:
-
-if (! requireNamespace("seqinr", quietly=TRUE)) {
-  install.packages("seqinr")
-}
-# Package information:
-#  library(help = seqinr)       # basic information
-#  browseVignettes("seqinr")    # available vignettes
-#  data(package = "seqinr")     # available datasets
-
-# A true Labor of Love has gone into the compilation of the seqinr "aaindex"
-#  data:
-
-?aaindex
-data(aaindex, package = "seqinr")  # load the aaindex list from the package
-
-length(aaindex)
-
-# Here are all the index descriptions
-for (i in 1:length(aaindex)) {
-  cat(paste(i, ": ", aaindex[[i]]$D, "\n", sep=""))
-}
-
-# It's a bit cumbersome to search through the descriptions ... here is a
-# function to make this easier:
-
-searchAAindex <- function(patt) {
-  # Searches the aaindex descriptions for regular expression "patt"
-  # and prints index number and description.
-  hits <- which(sapply(aaindex, function(x) length(grep(patt, x$D)) > 0))
-  for (i in seq_along(hits)) {
-    cat(sprintf("%3d\t%s\n", hits[i], aaindex[[ hits[i] ]]$D))
-  }
-}
-
-
-searchAAindex("free energy")          # Search for "free energy"
-searchAAindex("(size)|(volume)")      # Search for "size" or "volume":
-
-
-
-
-# Let's examine ...
-# ... a hydrophobicity index
-(Y <- aaindex[[528]][c("D", "I")])
-
-# ... a volume index
-(V <- aaindex[[150]][c("D", "I")])
-
-# ... and one of our own: side-chain pK values as reported by
-# Pace et al. (2009) JBC 284:13285-13289, with non-ionizable pKs set
-# to 7.4 (physiological pH)
-K <- list(I = c( 7.4,   # Ala
-                12.3,   # Arg
-                 7.4,   # Asn
-                 3.9,   # Asp
-                 8.6,   # Cys
-                 7.4,   # Gln
-                 4.3,   # Glu
-                 7.4,   # Gly
-                 6.5,   # His
-                 7.4,   # Ile
-                 7.4,   # Leu
-                10.4,   # Lys
-                 7.4,   # Met
-                 7.4,   # Phe
-                 7.4,   # Pro
-                 7.4,   # Ser
-                 7.4,   # Thr
-                 7.4,   # Trp
-                 9.8,   # Tyr
-                 7.4))  # Val
-names(K$I) <- c("Ala","Arg","Asn","Asp","Cys","Gln","Glu","Gly","His","Ile",
-                "Leu","Lys","Met","Phe","Pro","Ser","Thr","Trp","Tyr","Val")
-
-
-# Given these biophysical indices, how similar are the amino acids? We have three-dimensions of measures here. Scatterplots can only display two dimensions ...
-
-# pull the names from Y$I, convert them to single letter code, and reorder the
-# AACOLS palette accordingly ...
-aac <- AACOLS[toupper(seqinr::a(names(Y$I)))]
-
-plot(Y$I, V$I,
-     xlab = "hydrophobicity", ylab = "volume",
-     pch = 21,
-     cex = 6,
-     col = aac,
-     bg  = aac)
-text(Y$I, V$I, names(Y$I), cex = 0.8)
-
-plot(Y$I, K$I,
-     xlab = "hydrophobicity", ylab = "pK",
-     pch = 21,
-     cex = 6,
-     col = aac,
-     bg  = aac)
-text(Y$I, K$I, names(Y$I), cex = 0.8)
-
-# ... but how do we plot 3D data? Plotting into a 3D cube is possible, but such
-# plots are in general unintuitive and hard to interpret. One alternative is a
-# so-called "ternary plot":
-
-if (! requireNamespace("ggtern", quietly=TRUE)) {
-  install.packages("ggtern")
-}
-# Package information:
-#  library(help = ggtern)       # basic information
-#  browseVignettes("ggtern")    # available vignettes
-#  data(package = "ggtern")     # available datasets
-
-
-
-# collect into data frame, normalize to (0.05, 0.95)
-myDat <- data.frame("phi" = 0.9*(((Y$I-min(Y$I))/(max(Y$I)-min(Y$I))))+0.05,
-                    "vol" = 0.9*(((V$I-min(V$I))/(max(V$I)-min(V$I))))+0.05,
-                    "pK"  = 0.9*(((K$I-min(K$I))/(max(K$I)-min(K$I))))+0.05,
-                    stringsAsFactors = FALSE)
-rownames(myDat) <- names(Y$I)
-
-ggtern::ggtern(data = myDat,
-               ggplot2::aes(x = vol,
-                   y = phi,
-                   z = pK,
-                   label = rownames(myDat))) + ggplot2::geom_text()
-
-# This results in a mapping of amino acids relative to each other that is
-# similar to the Venn diagram you have seen in the notes.
-
-# ... or we could use principal components analysis, to pull out the
-# best projection of the three feature dimensions into two. (Done here without delving
-# into the theory ...)
-prc <- prcomp(myDat)
-plot(prc$x[,1], prc$x[,2], xlab="", ylab="", xaxt="n", yaxt="n",
-     pch=19, cex=6, col=aad, cex.main=0.7,
-     main="Principal Component Analysis of Amino Acid Features")
-text(prc$x[,1], prc$x[,2], names(Y$I), cex = 0.8, col="#00000088")
-
-# This matches the intuition rather well in that "similar" amino acids are close
-# on the plot. But we can't interpret the distances in terms of just one of the
-# parameters. Whatever - nature has a different way to define similarity:
-# mutations to similar amino acids are less likely to break the protein.
-
-
-# =    2  Mutation Data matrix  ================================================
-
-# A mutation data matrix encodes all amino acid pairscores in a matrix.
-
-# The Biostrings package contains the most common mutation data matrices.
-
-if (! requireNamespace("BiocManager", quietly=TRUE)) {
-  install.packages("BiocManager")
-}
-if (! requireNamespace("Biostrings", quietly=TRUE)) {
-  BiocManager::install("Biostrings")
-}
-# Package information:
-#  library(help=Biostrings)       # basic information
-#  browseVignettes("Biostrings")  # available vignettes
-#  data(package = "Biostrings")   # available datasets
-
-# Let's attach the BLOSUM62 mutation data matrix from the package
-data(BLOSUM62, package = "Biostrings")
-
-# ... and see what it contains. (You've seen this matrix before.)
-BLOSUM62
-
-# We can simply access values via the row/column names.
-# Identical amino acids have high scores ...
-BLOSUM62["H", "H"]   # Score for a pair of two histidines
-BLOSUM62["S", "S"]   # Score for a pair of two serines
-
-# Similar amino acids have low positive scores ...
-BLOSUM62["L", "I"]   # Score for a leucine / lysine pair
-BLOSUM62["F", "Y"]   # etc.
-
-# Dissimilar amino acids have negative scores ...
-BLOSUM62["L", "K"]   # Score for a leucine / lysine pair
-BLOSUM62["Q", "P"]   # etc.
-
-
-BLOSUM62["R", "W"]   # the matrix is symmetric!
-BLOSUM62["W", "R"]
-
-
-# =    3  Background score  ====================================================
-
-# The mutation data matrix is designed to give high scores to homologous
-# sequences, low scores to non-homologous sequences. What score on average
-# should we expect for a random sequence?
-
-# If we sample amino acid pairs at random, we will get a score that is the
-# average of the individual pairscores in the matrix. Omitting the ambiguity
-# codes and the gap character:
-
-sum(BLOSUM62[1:20, 1:20])/400
-
-# But that score could be higher for real sequences, for which the amino acid
-# distribution is not random. For example membrane proteins have a large number
-# of hydrophobic residues - an alignment of unrelated proteins might produce
-# positive scores. And there are other proteins with biased amino acid
-# compositions, in particular poteins that interact with multiple other
-# proteins. Let's test how this impacts the background score by comparing a
-# sequence with shuffled sequences. These have the same composition, but are
-# obvioulsy not homologous. The data directory contains the FASTA file for the
-# PDB ID 3FG7 - a villin headpiece structure with a large amount of
-# low-complexity amino acid sequence ...
-
-aa3FG7 <- Biostrings::readAAStringSet("./data/3FG7.fa")[[1]]
-
-# ... and the FASTA file for the E. coli OmpG outer membrane porin (PDB: 2F1C)
-# with an exceptionally high percentage of hydrophobic residues.
-
-aa2F1C <- Biostrings::readAAStringSet("./data/2F1C.fa")[[1]]
-
-# Here is a function that takes two sequences and
-# returns their average pairscore.
-
-averagePairScore <- function(a, b, MDM = BLOSUM62) {
-  # Returns average pairscore of two sequences.
-  # Parameters:
-  #    a, b   chr   amino acid sequence string
-  #    MDM          mutation data matrix. Default is BLOSUM62
-  # Value:    num   average pairscore.
-  a <- unlist(strsplit(a, ""))
-  b <- unlist(strsplit(b, ""))
-  v <- 0
-  for (i in seq_along(a)) {
-    v <- v + MDM[ a[i], b[i] ]
-  }
-  return(v / length(a))
-}
-
-orig3FG7 <- toString(aa3FG7)
-orig2F1C <- toString(aa2F1C)
-N <- 1000
-scores3FG7 <- numeric(N)
-scores2F1C <- numeric(N)
-for (i in 1:N) {
-  scores3FG7[i] <- averagePairScore(orig3FG7, toString(sample(aa3FG7)))
-  scores2F1C[i] <- averagePairScore(orig2F1C, toString(sample(aa2F1C)))
-}
-
-# Plot the distributions
-hist(scores3FG7,
-     col="#5599EE33",
-     breaks = seq(-1.5, 0, by=0.1),
-     main = "Pairscores for randomly shuffled sequences",
-     xlab = "Average pairscore from BLOSUM 62")
-hist(scores2F1C,
-     col="#55EE9933",
-     breaks = seq(-1.5, 0, by=0.1),
-     add = TRUE)
-abline(v = sum(BLOSUM62[1:20, 1:20])/400, col = "firebrick", lwd = 2)
-legend('topright',
-       c("3FG7 (villin)", "2F1C (OmpG)"),
-       fill = c("#5599EE33", "#55EE9933"), bty = 'n',
-       inset = 0.1)
-
-# This is an important result: even though we have shuffled significantly biased
-# sequences, and the average scores trend above the average of the mutation data
-# matrix, the average scores still remain comfortably below zero. This means
-# that we can't (in general) improve a high-scoring alignment by simply
-# extending it with randomly matched residues. We will only improve the score if
-# the similarity of newly added residues is larger than what we expect to get by
-# random chance!
-
-
-# [END]
+# tocID <- "BIN-ALI-Similarity.R"
+#
+# Purpose:  A Bioinformatics Course:
+#              R code accompanying the BIN-ALI-Similarity unit.
+#
+# Version:  1.2
+#
+# Date:     2017-10  -  2020-09
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.2    2020 Updates
+#           1.1    Change from require() to requireNamespace(),
+#                      use <package>::<function>() idiom throughout
+#           1.0    Refactored for 2017; add aaindex, ternary plot.
+#           0.1    First code copied from 2016 material.
+#
+#
+# TODO:
+#   Update ggtern:: ternary plot to use aacol dots under text
+#
+#
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+# going on. That's not how it works ...
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                          Line
+#TOC> ----------------------------------------------
+#TOC>   1        Amino Acid Properties            43
+#TOC>   2        Mutation Data matrix            189
+#TOC>   3        Background score                230
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  Amino Acid Properties  ===============================================
+
+# A large collection of amino acid property tables is available via the seqinr
+# package:
+
+if (! requireNamespace("seqinr", quietly=TRUE)) {
+  install.packages("seqinr")
+}
+# Package information:
+#  library(help = seqinr)       # basic information
+#  browseVignettes("seqinr")    # available vignettes
+#  data(package = "seqinr")     # available datasets
+
+# A true Labor of Love has gone into the compilation of the seqinr "aaindex"
+#  data:
+
+?aaindex
+data(aaindex, package = "seqinr")  # load the aaindex list from the package
+
+length(aaindex)
+
+# Here are all the index descriptions. (seq_along() is used rather than
+# 1:length(), which would count 1, 0 if the list were empty.)
+for (i in seq_along(aaindex)) {
+  cat(paste0(i, ": ", aaindex[[i]]$D, "\n"))
+}
+
+# It's a bit cumbersome to search through the descriptions ... here is a
+# function to make this easier:
+
+searchAAindex <- function(patt, index = aaindex) {
+  # Searches the descriptions of an aaindex-style list for the regular
+  # expression "patt" and prints index number and description of each hit.
+  # Parameters:
+  #    patt    chr   regular expression, matched against each entry's $D
+  #    index   list  list whose elements each have a description element "D".
+  #                    Defaults to the object "aaindex", which must have been
+  #                    loaded before the function is called.
+  # Value:     int   positions of the matching entries in "index" (invisible)
+
+  # vapply() guarantees a logical vector, even for an empty list, so which()
+  # always gets a valid argument.
+  isHit <- vapply(index,
+                  function(x) { any(grepl(patt, x$D)) },
+                  logical(1))
+  hits <- which(isHit)
+  for (hit in hits) {
+    cat(sprintf("%3d\t%s\n", hit, index[[hit]]$D))
+  }
+  return(invisible(hits))
+}
+
+
+searchAAindex("free energy")          # Search for "free energy"
+searchAAindex("(size)|(volume)")      # Search for "size" or "volume":
+
+
+
+
+# Let's examine ...
+# ... a hydrophobicity index
+(Y <- aaindex[[528]][c("D", "I")])
+
+# ... a volume index
+(V <- aaindex[[150]][c("D", "I")])
+
+# ... and one of our own: side-chain pK values as reported by
+# Pace et al. (2009) JBC 284:13285-13289, with non-ionizable pKs set
+# to 7.4 (physiological pH)
+# Named numeric vector: each pK is keyed directly by the three-letter code of
+# its residue, in the same alphabetical order as the names.
+K <- list(I = c(Ala =  7.4,
+                Arg = 12.3,
+                Asn =  7.4,
+                Asp =  3.9,
+                Cys =  8.6,
+                Gln =  7.4,
+                Glu =  4.3,
+                Gly =  7.4,
+                His =  6.5,
+                Ile =  7.4,
+                Leu =  7.4,
+                Lys = 10.4,
+                Met =  7.4,
+                Phe =  7.4,
+                Pro =  7.4,
+                Ser =  7.4,
+                Thr =  7.4,
+                Trp =  7.4,
+                Tyr =  9.8,
+                Val =  7.4))
+
+
+# Given these biophysical indices, how similar are the amino acids? We have
+# three dimensions of measures here. Scatterplots can only display two
+# dimensions ...
+
+# pull the names from Y$I, convert them to single letter code, and reorder the
+# AACOLS palette accordingly ...
+aac <- AACOLS[toupper(seqinr::a(names(Y$I)))]
+
+plot(Y$I, V$I,
+     xlab = "hydrophobicity", ylab = "volume",
+     pch = 21,
+     cex = 6,
+     col = aac,
+     bg  = aac)
+text(Y$I, V$I, names(Y$I), cex = 0.8)
+
+plot(Y$I, K$I,
+     xlab = "hydrophobicity", ylab = "pK",
+     pch = 21,
+     cex = 6,
+     col = aac,
+     bg  = aac)
+text(Y$I, K$I, names(Y$I), cex = 0.8)
+
+# ... but how do we plot 3D data? Plotting into a 3D cube is possible, but such
+# plots are in general unintuitive and hard to interpret. One alternative is a
+# so-called "ternary plot":
+
+if (! requireNamespace("ggtern", quietly=TRUE)) {
+  install.packages("ggtern")
+}
+# Package information:
+#  library(help = ggtern)       # basic information
+#  browseVignettes("ggtern")    # available vignettes
+#  data(package = "ggtern")     # available datasets
+
+
+
+# collect into data frame, normalize to (0.05, 0.95)
+myDat <- data.frame("phi" = 0.9*(((Y$I-min(Y$I))/(max(Y$I)-min(Y$I))))+0.05,
+                    "vol" = 0.9*(((V$I-min(V$I))/(max(V$I)-min(V$I))))+0.05,
+                    "pK"  = 0.9*(((K$I-min(K$I))/(max(K$I)-min(K$I))))+0.05,
+                    stringsAsFactors = FALSE)
+rownames(myDat) <- names(Y$I)
+
+ggtern::ggtern(data = myDat,
+               ggplot2::aes(x = vol,
+                   y = phi,
+                   z = pK,
+                   label = rownames(myDat))) + ggplot2::geom_text()
+
+# This results in a mapping of amino acids relative to each other that is
+# similar to the Venn diagram you have seen in the notes.
+
+# ... or we could use principal components analysis to pull out the best
+# projection of the three feature dimensions into two. (Done here without
+# delving into the theory ...)
+# Principal components of the three normalized features
+prc <- prcomp(myDat)
+# Plot the first two components, coloured with the same per-residue palette
+# (aac) that was built for the scatterplots. Axes are suppressed, since the
+# components are not interpretable in terms of a single feature.
+plot(prc$x[,1], prc$x[,2], xlab="", ylab="", xaxt="n", yaxt="n",
+     pch=19, cex=6, col=aac, cex.main=0.7,
+     main="Principal Component Analysis of Amino Acid Features")
+text(prc$x[,1], prc$x[,2], names(Y$I), cex = 0.8, col="#00000088")
+
+# This matches the intuition rather well in that "similar" amino acids are close
+# on the plot. But we can't interpret the distances in terms of just one of the
+# parameters. Whatever - nature has a different way to define similarity:
+# mutations to similar amino acids are less likely to break the protein.
+
+
+# =    2  Mutation Data matrix  ================================================
+
+# A mutation data matrix encodes all amino acid pairscores in a matrix.
+
+# The Biostrings package contains the most common mutation data matrices.
+
+if (! requireNamespace("BiocManager", quietly=TRUE)) {
+  install.packages("BiocManager")
+}
+if (! requireNamespace("Biostrings", quietly=TRUE)) {
+  BiocManager::install("Biostrings")
+}
+# Package information:
+#  library(help=Biostrings)       # basic information
+#  browseVignettes("Biostrings")  # available vignettes
+#  data(package = "Biostrings")   # available datasets
+
+# Let's attach the BLOSUM62 mutation data matrix from the package
+data(BLOSUM62, package = "Biostrings")
+
+# ... and see what it contains. (You've seen this matrix before.)
+BLOSUM62
+
+# We can simply access values via the row/column names.
+# Identical amino acids have high scores ...
+BLOSUM62["H", "H"]   # Score for a pair of two histidines
+BLOSUM62["S", "S"]   # Score for a pair of two serines
+
+# Similar amino acids have low positive scores ...
+BLOSUM62["L", "I"]   # Score for a leucine / isoleucine pair
+BLOSUM62["F", "Y"]   # etc.
+
+# Dissimilar amino acids have negative scores ...
+BLOSUM62["L", "K"]   # Score for a leucine / lysine pair
+BLOSUM62["Q", "P"]   # etc.
+
+
+BLOSUM62["R", "W"]   # the matrix is symmetric!
+BLOSUM62["W", "R"]
+
+
+# =    3  Background score  ====================================================
+
+# The mutation data matrix is designed to give high scores to homologous
+# sequences, low scores to non-homologous sequences. What score on average
+# should we expect for a random sequence?
+
+# If we sample amino acid pairs at random, we will get a score that is the
+# average of the individual pairscores in the matrix. Omitting the ambiguity
+# codes and the gap character:
+
+sum(BLOSUM62[1:20, 1:20])/400
+
+# But that score could be higher for real sequences, for which the amino acid
+# distribution is not random. For example membrane proteins have a large number
+# of hydrophobic residues - an alignment of unrelated proteins might produce
+# positive scores. And there are other proteins with biased amino acid
+# compositions, in particular proteins that interact with multiple other
+# proteins. Let's test how this impacts the background score by comparing a
+# sequence with shuffled sequences. These have the same composition, but are
+# obviously not homologous. The data directory contains the FASTA file for the
+# PDB ID 3FG7 - a villin headpiece structure with a large amount of
+# low-complexity amino acid sequence ...
+
+aa3FG7 <- Biostrings::readAAStringSet("./data/3FG7.fa")[[1]]
+
+# ... and the FASTA file for the E. coli OmpG outer membrane porin (PDB: 2F1C)
+# with an exceptionally high percentage of hydrophobic residues.
+
+aa2F1C <- Biostrings::readAAStringSet("./data/2F1C.fa")[[1]]
+
+# Here is a function that takes two sequences and
+# returns their average pairscore.
+
+averagePairScore <- function(a, b, MDM = BLOSUM62) {
+  # Returns average pairscore of two sequences.
+  # Parameters:
+  #    a, b   chr   amino acid sequence strings of equal length; every
+  #                   character must be a row/column name of MDM
+  #    MDM          mutation data matrix with row and column names.
+  #                   Default is BLOSUM62
+  # Value:    num   average pairscore over all positions; NaN if the
+  #                   sequences are empty.
+  a <- unlist(strsplit(a, ""))
+  b <- unlist(strsplit(b, ""))
+  if (length(a) != length(b)) {
+    stop("Sequences a and b must have the same length.")
+  }
+  if (length(a) == 0) {
+    return(NaN)   # no positions to average over
+  }
+  # A two-column character matrix indexes MDM by (row name, column name)
+  # pairs: this looks up the scores of all positions in one vectorized step.
+  pairScores <- MDM[cbind(a, b)]
+  return(sum(pairScores) / length(a))
+}
+
+orig3FG7 <- toString(aa3FG7)
+orig2F1C <- toString(aa2F1C)
+N <- 1000
+scores3FG7 <- numeric(N)    # preallocate the result vectors
+scores2F1C <- numeric(N)
+# Score each original sequence against N shuffled versions of itself.
+# seq_len(N) is safe even for N == 0, where 1:N would count 1, 0.
+for (i in seq_len(N)) {
+  scores3FG7[i] <- averagePairScore(orig3FG7, toString(sample(aa3FG7)))
+  scores2F1C[i] <- averagePairScore(orig2F1C, toString(sample(aa2F1C)))
+}
+
+# Plot the distributions
+hist(scores3FG7,
+     col="#5599EE33",
+     breaks = seq(-1.5, 0, by=0.1),
+     main = "Pairscores for randomly shuffled sequences",
+     xlab = "Average pairscore from BLOSUM 62")
+hist(scores2F1C,
+     col="#55EE9933",
+     breaks = seq(-1.5, 0, by=0.1),
+     add = TRUE)
+abline(v = sum(BLOSUM62[1:20, 1:20])/400, col = "firebrick", lwd = 2)
+legend('topright',
+       c("3FG7 (villin)", "2F1C (OmpG)"),
+       fill = c("#5599EE33", "#55EE9933"), bty = 'n',
+       inset = 0.1)
+
+# This is an important result: even though we have shuffled significantly biased
+# sequences, and the average scores trend above the average of the mutation data
+# matrix, the average scores still remain comfortably below zero. This means
+# that we can't (in general) improve a high-scoring alignment by simply
+# extending it with randomly matched residues. We will only improve the score if
+# the similarity of newly added residues is larger than what we expect to get by
+# random chance!
+
+
+# [END]
--- a/BIN-Data_integration.R
+++ b/BIN-Data_integration.R
@ -1,216 +1,216 @@
-# tocID <- "BIN-Data_integration.R"
-#
-# Purpose:  A Bioinformatics Course:
-#              R code accompanying the BIN-Data_integration unit.
-#
-# Version:  1.2
-#
-# Date:     2018-10  -  2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           1.2    2020 Maintenance and updates
-#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout
-#           1.0.1  Bugfix: UniProt ID Mapping service API change
-#           1.0    First live version
-#
-#
-# TODO:
-#           Develop a fungi-specific BioMart example.
-#           (cf.
-# https://cran.r-project.org/web/packages/biomartr/vignettes/Functional_Annotation.html )
-#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC>
-#TOC>   Section  Title                             Line
-#TOC> -------------------------------------------------
-#TOC>   1        Identifier mapping                  42
-#TOC>   2        Cross-referencing tables           165
-#TOC>
-#TOC> ==========================================================================
-
-
-# =    1  Identifier mapping  ==================================================
-
-# UniProt provides a well-designed ID mapping tool that can be accessed
-# online at     http://www.uniprot.org/mapping/
-#
-# Here we will use the UniProt Web API for this tool to map identifiers. The
-# UniProt ID mapping service supports a "RESTful API": responses can be obtained
-# simply via a Web- browsers request. Such requests are commonly sent via the
-# GET or POST verbs that a Webserver responds to, when a client asks for data.
-# GET requests are visible in the URL of the request; POST requests are not
-# directly visible, they are commonly used to send the contents of forms, or
-# when transmitting larger, complex data items. The UniProt ID mapping sevice
-# can accept long lists of IDs, thus using the POST mechanism makes sense. GET()
-# and  POST() functions are part of the httr package.
-
-# To begin, we load  httr, which supports sending and receiving data via the
-# http protocol, just like a Web browser.
-if (! requireNamespace("httr", quietly=TRUE)) {
-  install.packages("httr")
-}
-# Package information:
-#  library(help = httr)       # basic information
-#  browseVignettes("httr")    # available vignettes
-#  data(package = "httr")     # available datasets
-
-
-# We will walk through the process with the refSeqID
-# of yeast Mbp1 and Swi4, and we will also enter a dummy ID to check what
-# happens if the ID can't be mapped:
-myQueryIDs <- "NP_010227 NP_00000 NP_011036"
-
-
-# The UniProt ID mapping service API is very straightforward to use: just define
-# the URL of the server and send a list of items labelled as "query" in the body
-# of the request. GET() and POST() are functions from httr.
-
-# Note. A recent bug in the interaction between the server expectations and the
-# curl client libraries requires the following initialization
-httr::set_config(httr::config(http_version = 0))
-# cf. https://stackoverflow.com/questions/44610845/stream-error-in-the-http-2-framing-layer-bigrquery-commands-error-in-r-studio-b
-
-
-URL <- "https://www.uniprot.org/mapping/"
-response <- httr::POST(URL,
-                       body = list(from = "P_REFSEQ_AC",   # Refseq Protein
-                                   to = "ACC",             # UniProt ID
-                                   format = "tab",
-                                   query = myQueryIDs))
-
-cat(httr::content(response))
-
-# We need to check the status code - if it is not 200, an error ocurred and we
-# can't process the result:
-httr::status_code(response)
-
-# If the query is successful, tabbed text is returned. We can assign that to a
-# data frame. Note that we use textConnection() to read data directly from a char object, which can go in the spot where read.delim() expects a file-name argument.
-
-myMappedIDs <- read.delim(file = textConnection(httr::content(response)),
-                          sep = "\t",
-                          stringsAsFactors = FALSE)
-myMappedIDs
-
-# If this works as expected, you should see:
-#        From     To
-# 1 NP_010227 P39678
-# 2 NP_011036 P25302
-#
-# ... and note that there are only two entries, because nothing was returned
-# for the dummy "RefSeq ID" NP_00000
-
-# If the query can't be fulfilled because of a problem with the server, a
-# WebPage is returned. But the server status is also returned and we can check
-# the status code. I have lately gotten many "503" status codes: Server Not
-# Available...
-
-# We wrap this into a function:
-
-myIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
-  # Use UniProt ID mapping service to map one or more IDs
-  # Parameters:
-  #    s  char  A string of separated IDs
-  #    mapFrom  char  the database in which the IDs in s are valid. Default
-  #                     is RefSeq protein
-  #    mapTo    char  the database in which the target IDs are valid. Default
-  #                     is UniProtKB
-  # Value
-  #    a data frame of mapped IDs, with column names From and To, or an
-  #    empty data frame if the mapping was unsuccessful. No rows are returned
-  #    for IDs that are not mapped.
-
-  # Initialize curl
-  httr::set_config(httr::config(http_version = 0))
-
-  URL <- "https://www.uniprot.org/uploadlists/"
-  response <- httr::POST(URL,
-                         body = list(from = mapFrom,
-                                     to = mapTo,
-                                     format = "tab",
-                                     query = s))
-
-  if (httr::status_code(response) == 200) { # 200: oK
-    myMap <- read.delim(file = textConnection(httr::content(response)),
-                        sep = "\t",
-                        stringsAsFactors = FALSE)
-    colnames(myMap) <- c("From", "To")
-  } else {
-    myMap <- data.frame()
-    warning(paste("No uniProt ID mapping returned:",
-                  "server sent status",
-                  httr::status_code(response)))
-  }
-
-  return(myMap)
-}
-
-# Try it out ...
-myIDmap("NP_010227 NP_011036 NP_012881 NP_013729 NP_012165")
-
-# A function UniProtIDmap() is in the ABC-dbUtilities.R script and it is loaded
-# into your workspace on startup.
-
-
-# =    2  Cross-referencing tables  ============================================
-
-# Sometimes we get the IDs we need to map in a large table, e.g. from a list of
-# genes in a model organism database such as SGD, or from the Human Genen
-# Nomenclature commission. How do we map one set of identifiers to another one?
-
-# The function to use is match().
-# Here is a tiny set of identifiers taken from a much larger table to
-# illustrate the principle:
-#
-
-myIDs <- data.frame(uID =   c("P38903", "P31383", "P47177", "P47096", "Q07747",
-                              "Q08641", "P47129", "P52910", "P00330", "P81450"),
-                    name =  c("2A5D", "2AAA", "2NDP", "3HAO", "AAD4",
-                              "AB140", "ACF4", "ACS2", "ADH1", "ATP18"),
-                    refID = c("NP_014657", "NP_009386",
-                              "NP_012683", "NP_012559",
-                              "NP_010038", "NP_014882",
-                              "NP_012616", "NP_013254",
-                              "NP_014555", "NP_013629"))
-
-myIDs
-
-# Say we want to map "NP_010038", "NP_012559", and "NP_013629", in that order to
-# their gene names.
-myQuery <- c("NP_010038", "NP_999999", "NP_013629")
-
-# %in% will only tell us if these IDs are present in the table:
-myQuery %in% myIDs$refID
-
-# ... but not where they are located. But match() does what we need here:
-match(myQuery, myIDs$refID)
-
-# ... and we can use the result to subset the column that we want to map to:
-myIDs$name[match(myQuery, myIDs$refID)]
-
-# Note that the output preserves the NA - i.e. the length of the mapped
-# values is exactly the same as the length of the query.
-
-# task: map the three genes to their UniProt Identifier.
-
-
-#
-# Note: if you want to do very many queries in very large tables, use the
-# fmatch() function in the "fastmatch" package for a considerable
-# speedup.
-
-
-
-
-# [END]
+# tocID <- "BIN-Data_integration.R"
+#
+# Purpose:  A Bioinformatics Course:
+#              R code accompanying the BIN-Data_integration unit.
+#
+# Version:  1.2
+#
+# Date:     2018-10  -  2020-09
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.2    2020 Maintenance and updates
+#           1.1    Change from require() to requireNamespace(),
+#                      use <package>::<function>() idiom throughout
+#           1.0.1  Bugfix: UniProt ID Mapping service API change
+#           1.0    First live version
+#
+#
+# TODO:
+#           Develop a fungi-specific BioMart example.
+#           (cf.
+# https://cran.r-project.org/web/packages/biomartr/vignettes/Functional_Annotation.html )
+#
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+# going on. That's not how it works ...
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC>
+#TOC>   Section  Title                             Line
+#TOC> -------------------------------------------------
+#TOC>   1        Identifier mapping                  42
+#TOC>   2        Cross-referencing tables           165
+#TOC>
+#TOC> ==========================================================================
+
+
+# =    1  Identifier mapping  ==================================================
+
+# UniProt provides a well-designed ID mapping tool that can be accessed
+# online at     http://www.uniprot.org/mapping/
+#
+# Here we will use the UniProt Web API for this tool to map identifiers. The
+# UniProt ID mapping service supports a "RESTful API": responses can be obtained
+# simply via a Web browser's request. Such requests are commonly sent via the
+# GET or POST verbs that a Webserver responds to, when a client asks for data.
+# GET requests are visible in the URL of the request; POST requests are not
+# directly visible, they are commonly used to send the contents of forms, or
+# when transmitting larger, complex data items. The UniProt ID mapping service
+# can accept long lists of IDs, thus using the POST mechanism makes sense. GET()
+# and  POST() functions are part of the httr package.
+
+# To begin, we load  httr, which supports sending and receiving data via the
+# http protocol, just like a Web browser.
+if (! requireNamespace("httr", quietly=TRUE)) {
+  install.packages("httr")
+}
+# Package information:
+#  library(help = httr)       # basic information
+#  browseVignettes("httr")    # available vignettes
+#  data(package = "httr")     # available datasets
+
+
+# We will walk through the process with the refSeqID
+# of yeast Mbp1 and Swi4, and we will also enter a dummy ID to check what
+# happens if the ID can't be mapped:
+myQueryIDs <- "NP_010227 NP_00000 NP_011036"
+
+
+# The UniProt ID mapping service API is very straightforward to use: just define
+# the URL of the server and send a list of items labelled as "query" in the body
+# of the request. GET() and POST() are functions from httr.
+
+# Note. A recent bug in the interaction between the server expectations and the
+# curl client libraries requires the following initialization
+httr::set_config(httr::config(http_version = 0))
+# cf. https://stackoverflow.com/questions/44610845/stream-error-in-the-http-2-framing-layer-bigrquery-commands-error-in-r-studio-b
+
+
+URL <- "https://www.uniprot.org/mapping/"
+response <- httr::POST(URL,
+                       body = list(from = "P_REFSEQ_AC",   # Refseq Protein
+                                   to = "ACC",             # UniProt ID
+                                   format = "tab",
+                                   query = myQueryIDs))
+
+cat(httr::content(response))
+
+# We need to check the status code - if it is not 200, an error occurred and we
+# can't process the result:
+httr::status_code(response)
+
+# If the query is successful, tabbed text is returned. We can assign that to a
+# data frame. Note that we use textConnection() to read data directly from a
+# char object, which can go in the spot where read.delim() expects a file-name
+# argument.
+
+myMappedIDs <- read.delim(file = textConnection(httr::content(response)),
+                          sep = "\t",
+                          stringsAsFactors = FALSE)
+myMappedIDs
+
+# If this works as expected, you should see:
+#        From     To
+# 1 NP_010227 P39678
+# 2 NP_011036 P25302
+#
+# ... and note that there are only two entries, because nothing was returned
+# for the dummy "RefSeq ID" NP_00000
+
+# If the query can't be fulfilled because of a problem with the server, a
+# WebPage is returned. But the server status is also returned and we can check
+# the status code. I have lately gotten many "503" status codes: Server Not
+# Available...
+
+# We wrap this into a function:
+
+myIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
+  # Use UniProt ID mapping service to map one or more IDs
+  # Parameters:
+  #    s  char  A string of separated IDs
+  #    mapFrom  char  the database in which the IDs in s are valid. Default
+  #                     is RefSeq protein
+  #    mapTo    char  the database in which the target IDs are valid. Default
+  #                     is UniProtKB
+  # Value
+  #    a data frame of mapped IDs, with column names From and To, or an
+  #    empty data frame if the mapping was unsuccessful. No rows are returned
+  #    for IDs that are not mapped.
+  #
+  # NOTE(review): UniProt has since introduced a newer ID mapping REST API;
+  # confirm that this "uploadlists" endpoint and the database codes used as
+  # defaults are still served before relying on this function.
+
+  # Initialize curl. Note that this changes the httr configuration for the
+  # whole session, not only for this request.
+  httr::set_config(httr::config(http_version = 0))
+
+  URL <- "https://www.uniprot.org/uploadlists/"
+  response <- httr::POST(URL,
+                         body = list(from = mapFrom,
+                                     to = mapTo,
+                                     format = "tab",
+                                     query = s))
+
+  myMap <- data.frame()     # returned as-is whenever the mapping fails
+  status <- httr::status_code(response)
+
+  if (status != 200) {      # 200: oK
+    warning(paste("No uniProt ID mapping returned:",
+                  "server sent status",
+                  status))
+    return(myMap)
+  }
+
+  # Request the body explicitly as text: otherwise httr may parse it
+  # according to the content type and return something other than a string.
+  txt <- httr::content(response, as = "text", encoding = "UTF-8")
+  if (length(txt) != 1 || is.na(txt) || ! nzchar(trimws(txt))) {
+    warning("No uniProt ID mapping returned: server sent an empty response")
+    return(myMap)
+  }
+
+  # read.delim(text = ...) opens and closes its own connection; a bare
+  # textConnection() passed as "file" would be left open.
+  mapped <- read.delim(text = txt,
+                       sep = "\t",
+                       stringsAsFactors = FALSE)
+
+  if (ncol(mapped) != 2) {  # e.g. an HTML page sent with status 200
+    warning("No uniProt ID mapping returned: response is not a two-column table")
+    return(myMap)
+  }
+  colnames(mapped) <- c("From", "To")
+
+  return(mapped)
+}
+
+# Try it out ...
+myIDmap("NP_010227 NP_011036 NP_012881 NP_013729 NP_012165")
+
+# A function UniProtIDmap() is in the ABC-dbUtilities.R script and it is loaded
+# into your workspace on startup.
+
+
+# =    2  Cross-referencing tables  ============================================
+
+# Sometimes we get the IDs we need to map in a large table, e.g. from a list of
+# genes in a model organism database such as SGD, or from the HUGO Gene
+# Nomenclature Committee. How do we map one set of identifiers to another one?
+
+# The function to use is match().
+# Here is a tiny set of identifiers taken from a much larger table to
+# illustrate the principle:
+#
+
+myIDs <- data.frame(uID =   c("P38903", "P31383", "P47177", "P47096", "Q07747",
+                              "Q08641", "P47129", "P52910", "P00330", "P81450"),
+                    name =  c("2A5D", "2AAA", "2NDP", "3HAO", "AAD4",
+                              "AB140", "ACF4", "ACS2", "ADH1", "ATP18"),
+                    refID = c("NP_014657", "NP_009386",
+                              "NP_012683", "NP_012559",
+                              "NP_010038", "NP_014882",
+                              "NP_012616", "NP_013254",
+                              "NP_014555", "NP_013629"))
+
+myIDs
+
+# Say we want to map "NP_010038", "NP_999999" (a dummy ID that is not in the
+# table), and "NP_013629", in that order to their gene names.
+myQuery <- c("NP_010038", "NP_999999", "NP_013629")
+
+# %in% will only tell us if these IDs are present in the table:
+myQuery %in% myIDs$refID
+
+# ... but not where they are located. But match() does what we need here:
+match(myQuery, myIDs$refID)
+
+# ... and we can use the result to subset the column that we want to map to:
+myIDs$name[match(myQuery, myIDs$refID)]
+
+# Note that the output preserves the NA - i.e. the length of the mapped
+# values is exactly the same as the length of the query.
+
+# task: map the three genes to their UniProt Identifier.
+
+
+#
+# Note: if you want to do very many queries in very large tables, use the
+# fmatch() function in the "fastmatch" package for a considerable
+# speedup.
+
+
+
+
+# [END]
--- a/BIN-FUNC-Domain_annotation.R
+++ b/BIN-FUNC-Domain_annotation.R
@ -1,435 +1,435 @@
-# tocID <- "BIN-FUNC-Domain_annotation.R"
-#
-# Purpose:  A Bioinformatics Course:
-#              R code accompanying the BIN-FUNC-Domain_annotation unit.
-#
-# ==============================================================================
-# Version:  1.4
-#
-# Date:     2017-11  -  2020-10
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           1.4    Add code for shared data import from the Wiki
-#           1.3    Add code for database export to JSON and instructions
-#                  for uploading annotations to the Public Student Wiki page
-#           1.2    Consistently: data in ./myScripts/ ;
-#                    begin SHARING DATA section
-#           1.1    2020 Updates
-#           1.0    Live version 2017
-#           0.1    First code copied from 2016 material.
-#
-# TODO:
-#           Put the domain plot into a function
-#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC> 
-#TOC>   Section  Title                                                 Line
-#TOC> ---------------------------------------------------------------------
-#TOC>   1        Update your database script                             51
-#TOC>   1.1        Preparing an annotation file ...                      58
-#TOC>   1.1.1          BEFORE  "BIN-ALI-Optimal_sequence_alignment"      61
-#TOC>   1.1.2          AFTER "BIN-ALI-Optimal_sequence_alignment"       109
-#TOC>   1.2        Execute and Validate                                 136
-#TOC>   2        Plot Annotations                                       161
-#TOC>   3        SHARING DATA                                           287
-#TOC>   3.1        Post MBP1_MYSPE as JSON data                         303
-#TOC>   3.2        Import shared MBP1_MYSPE from the Wiki               326
-#TOC> 
-#TOC> ==========================================================================
-
-
-# =    1  Update your database script  =========================================
-
-
-# Since you have recorded domain features at the SMART database, we can store
-# the feature annotations in myDB ...
-
-
-# ==   1.1  Preparing an annotation file ...  ==================================
-
-
-# ===   1.1.1  BEFORE  "BIN-ALI-Optimal_sequence_alignment"
-#
-#   IF YOU HAVE NOT YET COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT:
-#
-#   You DON'T already have a file called "<MYSPE>-Annotations.json" in the
-#   ./myScripts/ directory:
-#
-#   - Make a copy of the file "./data/refAnnotations.json" and put it in your
-#     myScripts/ directory.
-#
-#   - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.
-#     if MYSPE is called "Crptycoccus neoformans", your file should be called
-#     "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is
-#     "MBP1_CRYNE").
-#
-#   - Open the file in the RStudio editor and delete all blocks for
-#     the Mbp1 protein annotations except the first one.
-#
-#   - From that block, delete all lines that have annotations you did not
-#     find in SMART for MBP1_MYSPE.
-#
-#   - Make enough copies of the "Ankyrin fold" and "low complexity" region
-#     lines to have a line for each feature you found.
-#
-#   - Then delete the comma at the end of the last line.
-#
-#   - Edit the annotations: change MBP1_SACCE  to MBP1_<MYSPE> everywhere
-#     and change the "start" and "end" features to the coordinates you
-#     recorded in the SMART database.
-#
-#   - Save your file in the ./myScripts/ folder.
-#
-#   - Validate your file online at https://jsonlint.com/
-#
-#   - Update your "./myScripts/makeProteinDB.R" script to load your new
-#     annotation when you recreate the database. Open the script in the
-#     RStudio editor, and add the following command at the end:
-#
-#     myDB <- dbAddAnnotation(myDB,
-#         jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json"))
-#                                         ^^^^^^^
-#                                        edit this!
-#
-#   - save and close the file.
-#
-# Then SKIP the next section.
-#
-#
-# ===   1.1.2  AFTER "BIN-ALI-Optimal_sequence_alignment"  
-#
-#   IF YOU HAVE ALREADY COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT:
-#
-#   You SHOULD have a file called "<MYSPE>-Annotations.json" in the
-#  ./myScripts/ directory:
-#
-#   - Open the file in the RStudio editor.
-#
-#   - Make as many copies of the "APSES fold" line as you have found
-#     features in SMART.
-#
-#   - Add a comma after every line except for the last one
-#
-#   - Edit the annotations but include only features that are in the
-#     myDB$feature table. Check which features are in the database by executing
-#
-#        myDB$feature$name
-#
-#   - Update the "start" and "end" coordinates for each feature to the
-#     values you found.
-#
-#   - Save your file.
-#
-#   - Validate your file online at https://jsonlint.com/
-#
-#
-# ==   1.2  Execute and Validate  ==============================================
-#
-#   - source() your database creation script:
-#
-#     source("./myScripts/makeProteinDB.R")
-#
-#     This should run without errors or warnings. If it doesn't work and you
-#     can't figure out quickly what's happening, ask for help on the
-#     Discussion Board.
-#
-#   - Confirm
-#     The following commands should retrieve all of the features that have been
-#     annotated for MBP1_MYSPE
-
-sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
-
-(proID  <- myDB$protein$ID[sel])
-(fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID == proID])
-(ftrIDs <- unique(myDB$annotation$featureID[fanIDs]))
-myDB$feature$name[ftrIDs] # This should list ALL of your annotated features
-                          # (once). If not, consider what could have gone wrong
-                          # and ask on the list if you have difficulties fixing
-                          # it.
-
-
-# =    2  Plot Annotations  ====================================================
-
-# In this section we will plot domain annotations as colored rectangles on a
-# sequence, as an example of using the R plotting system for generic, data
-# driven images.
-
-# We need a small utility function that draws the annotation boxes on a
-# representation of sequence. It should accept the start and end coordinates,
-# the y value where it should be plotted and the color of the box, and plot a
-# rectangle using R's rect() function.
-
-drawBox <- function(xStart, xEnd, y, myCol, DELTA = 0.2) {
-  # Draw a box from xStart to xEnd at y, filled with colour myCol
-  # The height of the box is y +- DELTA
-  rect(xStart, (y - DELTA), xEnd, (y + DELTA),
-       border = "black", col = myCol)
-}
-
-# test this:
-plot(c(-1.5, 1.5), c(0, 0), type = "l")
-drawBox(-1, 1, 0.0, "peachpuff")
-
-# Next, we define a function to plot annotations for one protein: the name of
-# the protein, a horizontal grey line for its length, and all of its features.
-
-plotProtein <- function(DB, name, y) {
-  # DB: protein database
-  # name: the name of the protein in the database.
-  # y: height where to draw the plot
-  #
-  # Define colors: we create a vector of color values, one for
-  # each feature, and we give it names of the feature ID. Then we
-  # can easily get the color value from the feature name.
-  # A: make a vector of color values. The syntax may appear unusual -
-  #    colorRampPalette() returns a function, and we simply append
-  #    the parameter (number-of-features) without assigning the function
-  #    to its own variable name.
-  ftrCol <- colorRampPalette(c("#f2003c", "#F0A200", "#f0ea00",
-                               "#62C923", "#0A9A9B", "#1958C3",
-                               "#8000D3", "#D0007F"),
-                             space="Lab",
-                             interpolate="linear")(nrow(DB$feature))
-  # B: Features may overlap, so we make the colors transparent by setting
-  #    their "alpha channel" to 1/3  (hex: 55)
-  ftrCol <- paste0(ftrCol, "55")
-  # C: we asssign names
-  names(ftrCol) <- DB$feature$ID
-  # E.g. color for the third feature: ftrCol[ DB$feature$ID[3] ]
-
-  # find the row-index of the protein ID in the protein table of DB
-  iProtein <- which(DB$protein$name == name)
-
-  # write the name of the protein
-  text(-30, y, adj=1, labels=name, cex=0.75 )
-
-  #draw a line from 0 to nchar(sequence-of-the-protein)
-  lines(c(0, nchar(DB$protein$sequence[iProtein])), c(y, y),
-        lwd=3, col="#999999")
-
-  # get the rows of feature annotations for the protein
-  iFtr <- which(DB$annotation$proteinID == DB$protein$ID[iProtein])
-
-  # draw a colored box for each feature
-  for (i in iFtr) {
-    drawBox(DB$annotation$start[i],
-            DB$annotation$end[i],
-            y,
-            ftrCol[ DB$annotation$featureID[i] ])
-  }
-}
-
-# Plot each annotated protein:
-# Get the rows of all unique annotated Mbp1 proteins in myDB
-
-iRows <- grep("^MBP1_", myDB$protein$name)
-
-# define the size of the plot-frame to accomodate all proteins
-yMax <- length(iRows) * 1.1
-xMax <- max(nchar(myDB$protein$sequence[iRows])) * 1.1  # longest sequence
-
-# plot an empty frame
-oPar <- par(mar = c(4.2, 0.1, 3, 0.1))  # save the current plot parameters and
-                                        # decrease margins
-plot(1, 1,
-     xlim = c(-200, xMax + 100),
-     ylim = c(0, yMax),
-     type = "n",
-     axes = FALSE,
-     bty = "n",
-     main = "Mbp1 orthologue domain annotations",
-     xlab = "sequence position",
-     cex.axis = 0.8,
-     ylab="")
-axis(1, at = seq(0, xMax, by = 100))
-myCol <- colorRampPalette(c("#f2003c", "#F0A200",
-                            "#f0ea00", "#62C923",
-                            "#0A9A9B", "#1958C3",
-                            "#8000D3", "#D0007F"),
-                          space="Lab",
-                          interpolate="linear")(nrow(myDB$feature))
-myCol <- paste0(myCol, "55")
-legend(xMax - 150, 7,
-       legend = myDB$feature$name,
-       cex = 0.7,
-       fill = myCol,
-       bty = "n")
-
-# Finally, iterate over all proteins and call plotProtein()
-for (i in seq_along(iRows)) {
-  plotProtein(myDB, myDB$protein$name[iRows[i]], i)
-}
-par(oPar)  # reset the plot parameters
-
-
-# The plot shows what is variable and what is constant about the annotations in
-# a group of related proteins. Your MBP1_MYSPE annotations should appear at the
-# top.
-
-# Task:
-#    Put a copy of the plot into your journal and interpret it with respect
-#    to MBP1_MYSPE, i.e. and note what you learn about MBP1_MYSPE from the plot.
-
-# Task:
-#    It would be better to align the motif borders, at least approximately (not
-#    all proteins have all motifs). How would you go about doing that?
-
-# =    3  SHARING DATA  ========================================================
-
-# It's particularly interesting to compare such annotations across many
-# homologous proteins. I have created a page on the Student Wiki () that you can
-# edit, and then download the data from the entire class directly to your
-# RStudio project.
-#
-
-# I have provided a function that extracts all information that refers to a
-# single protein from the database, and prints it out as well-formatted JSON,
-# suitable to be pasted into our shareable Wiki-page. There is a fair amount of
-# bookkeeping involved, but the code is not otherwise very enlightening so I
-# will spare you the details - it's in "./scripts/ABC-dbUtilities.R" if you
-# would want to have a look.
-
-
-# ==   3.1  Post MBP1_MYSPE as JSON data  ======================================
-
-# Task:
-# =====
-# 1: Run the following code:
-
-cat("{{Vspace}}",
-    "<!-- ==== BEGIN  PROTEIN ==== -->",
-    "<pre class=\"protein-data\">",
-    dbProt2JSON(sprintf("MBP1_%s", biCode(MYSPE))),
-    "</pre>",
-    "<!-- ===== END PROTEIN ====== -->",
-    "", sep = "\n"
-)
-
-# 2: Copy the entire output from the console.
-# 3: Navigate to
-#      http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public
-#    ... edit the page, and paste your output at the top.
-# 4: Save your edits.
-
-
-
-# ==   3.2  Import shared MBP1_MYSPE from the Wiki  ============================
-
-# Once we have collected a number of protein annotations, we can access the
-# Wiki-page and import the data into our database. The Wiki page is  an html
-# document with lots of MediaWiki specific stuff - but the contents we are
-# interested in is enclosed in <pre class="protein-data"> ... </pre> tags. These
-# work like normal HTML <pre> tags, but we have defined a special class for them
-# to make it easy to parse out the contents we want. The rvest:: package in
-# combination with xml2:: provides us with all the tools we need for such
-# "Webscraping" of data....
-
-if (! requireNamespace("rvest", quietly=TRUE)) {
-  install.packages("rvest")
-}
-
-if (! requireNamespace("xml2", quietly=TRUE)) {
-  install.packages("xml2")
-}
-
-# Here's the process:
-# The URL is an "open" page on the student Wiki. Users that are not logged in
-# can view the contents, but you can only edit if you are logged in.
-myURL <- "http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public"
-
-# First thing is to retrieve the HTML from the url...
-x <- xml2::read_html(myURL)
-
-# This retrieves the page source, but that still needs to be parsed into its
-# logical elements. HTML is a subset of XML and such documents are structured as
-# trees, that have "nodes" which are demarcated with "tags". rvest::html_nodes()
-# parses out the document structure and then uses a so-called "xpath" expression
-# to select nodes we are interested in. Now, xpath is one of those specialized
-# languages of which there are a few more to learn than one would care for. You
-# MUST know how to format sprintf() expressions, and you SHOULD be competent
-# with regular expressions. But if you want to be really competent in your work,
-# basic HTML and CSS is required ... and enough knowledge about xpath to be able
-# to search on Stackoverflow for what you need for parsing data out of Web
-# documents...
-
-# The expression we use below is:
-#   - get any node anywhere in the tree ("//*") ...
-#   - that has a particular attribute("[@ ... ]").
-#   - The attribute we want is that the class of the node is "protein-data";
-#      that is the class we have defined for our <pre> tags.
-# As a result of this selection, we get a list of pointers to the document tree.
-y <- rvest::html_nodes(x, xpath ='//*[@class="protein-data"]')
-
-# Next we fetch the actual payload - the text - from the tree:
-# rvest::html_text() gets the text from the list of pointers. The result is a
-# normal list of character strings.
-z <- rvest::html_text(y)
-
-# Finally we can iterate over the list, and add all proteins we don't already
-# have to our database. There may well be items that are rejected because they
-# are already present in the database - for example, unless somebody has
-# annotated new features, all of the features are already there. Don't worry -
-# that is intended; we don't want duplicate entries.
-
-for (thisJSON in z) {
-  thisData <- jsonlite::fromJSON(thisJSON)
-  if (! thisData$protein$name %in% myDB$protein$name) {
-    myDB <- dbAddProtein(myDB, thisData$protein)
-    myDB <- dbAddTaxonomy(myDB, thisData$taxonomy)
-    myDB <- dbAddFeature(myDB, thisData$feature)
-    myDB <- dbAddAnnotation(myDB, thisData$annotation)
-  }
-}
-
-# Finally, we can repeat our domain plot with the results - which now includes the shared proteins:
-
-iRows <- grep("^MBP1_", myDB$protein$name)
-yMax <- length(iRows) * 1.1
-xMax <- max(nchar(myDB$protein$sequence[iRows])) * 1.1  # longest sequence
-
-# plot an empty frame
-oPar <- par(mar = c(4.2, 0.1, 3, 0.1))
-plot(1, 1,
-     xlim = c(-200, xMax + 100),
-     ylim = c(0, yMax),
-     type = "n",
-     axes = FALSE,
-     bty = "n",
-     main = "Mbp1 orthologue domain annotations",
-     xlab = "sequence position",
-     cex.axis = 0.8,
-     ylab="")
-axis(1, at = seq(0, xMax, by = 100))
-myCol <- colorRampPalette(c("#f2003c", "#F0A200",
-                            "#f0ea00", "#62C923",
-                            "#0A9A9B", "#1958C3",
-                            "#8000D3", "#D0007F"),
-                          space="Lab",
-                          interpolate="linear")(nrow(myDB$feature))
-myCol <- paste0(myCol, "55")
-legend(xMax - 150, 7,
-       legend = myDB$feature$name,
-       cex = 0.7,
-       fill = myCol,
-       bty = "n")
-
-for (i in seq_along(iRows)) {
-  plotProtein(myDB, myDB$protein$name[iRows[i]], i)
-}
-par(oPar)  # reset the plot parameters
-
-# ... the more proteins we can compare, the more we learn about the
-# architectural principles of this family's domains.
-
-
-# [END]
+# tocID <- "BIN-FUNC-Domain_annotation.R"
+#
+# Purpose:  A Bioinformatics Course:
+#              R code accompanying the BIN-FUNC-Domain_annotation unit.
+#
+# ==============================================================================
+# Version:  1.4
+#
+# Date:     2017-11  -  2020-10
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.4    Add code for shared data import from the Wiki
+#           1.3    Add code for database export to JSON and instructions
+#                  for uploading annotations to the Public Student Wiki page
+#           1.2    Consistently: data in ./myScripts/ ;
+#                    begin SHARING DATA section
+#           1.1    2020 Updates
+#           1.0    Live version 2017
+#           0.1    First code copied from 2016 material.
+#
+# TODO:
+#           Put the domain plot into a function
+#
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+# going on. That's not how it works ...
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                                                 Line
+#TOC> ---------------------------------------------------------------------
+#TOC>   1        Update your database script                             51
+#TOC>   1.1        Preparing an annotation file ...                      58
+#TOC>   1.1.1          BEFORE  "BIN-ALI-Optimal_sequence_alignment"      61
+#TOC>   1.1.2          AFTER "BIN-ALI-Optimal_sequence_alignment"       109
+#TOC>   1.2        Execute and Validate                                 136
+#TOC>   2        Plot Annotations                                       161
+#TOC>   3        SHARING DATA                                           287
+#TOC>   3.1        Post MBP1_MYSPE as JSON data                         303
+#TOC>   3.2        Import shared MBP1_MYSPE from the Wiki               326
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  Update your database script  =========================================
+
+
+# Since you have recorded domain features at the SMART database, we can store
+# the feature annotations in myDB ...
+
+
+# ==   1.1  Preparing an annotation file ...  ==================================
+
+
+# ===   1.1.1  BEFORE  "BIN-ALI-Optimal_sequence_alignment"
+#
+#   IF YOU HAVE NOT YET COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT:
+#
+#   You DON'T already have a file called "<MYSPE>-Annotations.json" in the
+#   ./myScripts/ directory:
+#
+#   - Make a copy of the file "./data/refAnnotations.json" and put it in your
+#     myScripts/ directory.
+#
+#   - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.
+#     if MYSPE is called "Cryptococcus neoformans", your file should be called
+#     "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is
+#     "MBP1_CRYNE").
+#
+#   - Open the file in the RStudio editor and delete all blocks for
+#     the Mbp1 protein annotations except the first one.
+#
+#   - From that block, delete all lines that have annotations you did not
+#     find in SMART for MBP1_MYSPE.
+#
+#   - Make enough copies of the "Ankyrin fold" and "low complexity" region
+#     lines to have a line for each feature you found.
+#
+#   - Then delete the comma at the end of the last line.
+#
+#   - Edit the annotations: change MBP1_SACCE  to MBP1_<MYSPE> everywhere
+#     and change the "start" and "end" features to the coordinates you
+#     recorded in the SMART database.
+#
+#   - Save your file in the ./myScripts/ folder.
+#
+#   - Validate your file online at https://jsonlint.com/
+#
+#   - Update your "./myScripts/makeProteinDB.R" script to load your new
+#     annotation when you recreate the database. Open the script in the
+#     RStudio editor, and add the following command at the end:
+#
+#     myDB <- dbAddAnnotation(myDB,
+#         jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json"))
+#                                         ^^^^^^^
+#                                        edit this!
+#
+#   - save and close the file.
+#
+# Then SKIP the next section.
+#
+#
+# ===   1.1.2  AFTER "BIN-ALI-Optimal_sequence_alignment"  
+#
+#   IF YOU HAVE ALREADY COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT:
+#
+#   You SHOULD have a file called "<MYSPE>-Annotations.json" in the
+#  ./myScripts/ directory:
+#
+#   - Open the file in the RStudio editor.
+#
+#   - Make as many copies of the "APSES fold" line as you have found
+#     features in SMART.
+#
+#   - Add a comma after every line except for the last one
+#
+#   - Edit the annotations but include only features that are in the
+#     myDB$feature table. Check which features are in the database by executing
+#
+#        myDB$feature$name
+#
+#   - Update the "start" and "end" coordinates for each feature to the
+#     values you found.
+#
+#   - Save your file.
+#
+#   - Validate your file online at https://jsonlint.com/
+#
+#
+# ==   1.2  Execute and Validate  ==============================================
+#
+#   - source() your database creation script:
+#
+#     source("./myScripts/makeProteinDB.R")
+#
+#     This should run without errors or warnings. If it doesn't work and you
+#     can't figure out quickly what's happening, ask for help on the
+#     Discussion Board.
+#
+#   - Confirm
+#     The following commands should retrieve all of the features that have been
+#     annotated for MBP1_MYSPE
+
+sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
+
+(proID  <- myDB$protein$ID[sel])
+(fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID == proID])
+(ftrIDs <- unique(myDB$annotation$featureID[fanIDs]))
+myDB$feature$name[ftrIDs] # This should list ALL of your annotated features
+                          # (once). If not, consider what could have gone wrong
+                          # and ask on the list if you have difficulties fixing
+                          # it.
+
+
+# =    2  Plot Annotations  ====================================================
+
+# In this section we will plot domain annotations as colored rectangles on a
+# sequence, as an example of using the R plotting system for generic, data
+# driven images.
+
+# We need a small utility function that draws the annotation boxes on a
+# representation of sequence. It should accept the start and end coordinates,
+# the y value where it should be plotted and the color of the box, and plot a
+# rectangle using R's rect() function.
+
+drawBox <- function(xStart, xEnd, y, myCol, DELTA = 0.2) {
+  # Draw a box from xStart to xEnd at y, filled with colour myCol
+  # The height of the box is y +- DELTA
+  rect(xStart, (y - DELTA), xEnd, (y + DELTA),
+       border = "black", col = myCol)
+}
+
+# test this:
+plot(c(-1.5, 1.5), c(0, 0), type = "l")
+drawBox(-1, 1, 0.0, "peachpuff")
+
+# Next, we define a function to plot annotations for one protein: the name of
+# the protein, a horizontal grey line for its length, and all of its features.
+
+plotProtein <- function(DB, name, y) {
+  # DB: protein database
+  # name: the name of the protein in the database.
+  # y: height where to draw the plot
+  #
+  # Define colors: we create a vector of color values, one for
+  # each feature, and we give it names of the feature ID. Then we
+  # can easily get the color value from the feature name.
+  # A: make a vector of color values. The syntax may appear unusual -
+  #    colorRampPalette() returns a function, and we simply append
+  #    the parameter (number-of-features) without assigning the function
+  #    to its own variable name.
+  ftrCol <- colorRampPalette(c("#f2003c", "#F0A200", "#f0ea00",
+                               "#62C923", "#0A9A9B", "#1958C3",
+                               "#8000D3", "#D0007F"),
+                             space="Lab",
+                             interpolate="linear")(nrow(DB$feature))
+  # B: Features may overlap, so we make the colors transparent by setting
+  #    their "alpha channel" to 1/3  (hex: 55)
+  ftrCol <- paste0(ftrCol, "55")
+  # C: we assign names
+  names(ftrCol) <- DB$feature$ID
+  # E.g. color for the third feature: ftrCol[ DB$feature$ID[3] ]
+
+  # find the row-index of the protein ID in the protein table of DB
+  iProtein <- which(DB$protein$name == name)
+
+  # write the name of the protein
+  text(-30, y, adj=1, labels=name, cex=0.75 )
+
+  #draw a line from 0 to nchar(sequence-of-the-protein)
+  lines(c(0, nchar(DB$protein$sequence[iProtein])), c(y, y),
+        lwd=3, col="#999999")
+
+  # get the rows of feature annotations for the protein
+  iFtr <- which(DB$annotation$proteinID == DB$protein$ID[iProtein])
+
+  # draw a colored box for each feature
+  for (i in iFtr) {
+    drawBox(DB$annotation$start[i],
+            DB$annotation$end[i],
+            y,
+            ftrCol[ DB$annotation$featureID[i] ])
+  }
+}
+
+# Plot each annotated protein:
+# Get the rows of all unique annotated Mbp1 proteins in myDB
+
+iRows <- grep("^MBP1_", myDB$protein$name)
+
+# define the size of the plot-frame to accommodate all proteins
+yMax <- length(iRows) * 1.1
+xMax <- max(nchar(myDB$protein$sequence[iRows])) * 1.1  # longest sequence
+
+# plot an empty frame
+oPar <- par(mar = c(4.2, 0.1, 3, 0.1))  # save the current plot parameters and
+                                        # decrease margins
+plot(1, 1,
+     xlim = c(-200, xMax + 100),
+     ylim = c(0, yMax),
+     type = "n",
+     axes = FALSE,
+     bty = "n",
+     main = "Mbp1 orthologue domain annotations",
+     xlab = "sequence position",
+     cex.axis = 0.8,
+     ylab="")
+axis(1, at = seq(0, xMax, by = 100))
+myCol <- colorRampPalette(c("#f2003c", "#F0A200",
+                            "#f0ea00", "#62C923",
+                            "#0A9A9B", "#1958C3",
+                            "#8000D3", "#D0007F"),
+                          space="Lab",
+                          interpolate="linear")(nrow(myDB$feature))
+myCol <- paste0(myCol, "55")
+legend(xMax - 150, 7,
+       legend = myDB$feature$name,
+       cex = 0.7,
+       fill = myCol,
+       bty = "n")
+
+# Finally, iterate over all proteins and call plotProtein()
+for (i in seq_along(iRows)) {
+  plotProtein(myDB, myDB$protein$name[iRows[i]], i)
+}
+par(oPar)  # reset the plot parameters
+
+
+# The plot shows what is variable and what is constant about the annotations in
+# a group of related proteins. Your MBP1_MYSPE annotations should appear at the
+# top.
+
+# Task:
+#    Put a copy of the plot into your journal and interpret it with respect
+#    to MBP1_MYSPE, i.e. note what you learn about MBP1_MYSPE from the plot.
+
+# Task:
+#    It would be better to align the motif borders, at least approximately (not
+#    all proteins have all motifs). How would you go about doing that?
+
+# =    3  SHARING DATA  ========================================================
+
+# It's particularly interesting to compare such annotations across many
+# homologous proteins. I have created a page on the Student Wiki (the "Public"
+# page; its URL is given below) that you can edit, and then download the data
+# from the entire class directly to your RStudio project.
+#
+
+# I have provided a function that extracts all information that refers to a
+# single protein from the database, and prints it out as well-formatted JSON,
+# suitable to be pasted into our shareable Wiki-page. There is a fair amount of
+# bookkeeping involved, but the code is not otherwise very enlightening so I
+# will spare you the details - it's in "./scripts/ABC-dbUtilities.R" if you
+# would want to have a look.
+
+
+# ==   3.1  Post MBP1_MYSPE as JSON data  ======================================
+
+# Task:
+# =====
+# 1: Run the following code:
+
+cat("{{Vspace}}",
+    "<!-- ==== BEGIN  PROTEIN ==== -->",
+    "<pre class=\"protein-data\">",
+    dbProt2JSON(sprintf("MBP1_%s", biCode(MYSPE))),
+    "</pre>",
+    "<!-- ===== END PROTEIN ====== -->",
+    "", sep = "\n"
+)
+
+# 2: Copy the entire output from the console.
+# 3: Navigate to
+#      http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public
+#    ... edit the page, and paste your output at the top.
+# 4: Save your edits.
+
+
+
+# ==   3.2  Import shared MBP1_MYSPE from the Wiki  ============================
+
+# Once we have collected a number of protein annotations, we can access the
+# Wiki-page and import the data into our database. The Wiki page is an HTML
+# document with lots of MediaWiki specific stuff - but the contents we are
+# interested in is enclosed in <pre class="protein-data"> ... </pre> tags. These
+# work like normal HTML <pre> tags, but we have defined a special class for them
+# to make it easy to parse out the contents we want. The rvest:: package in
+# combination with xml2:: provides us with all the tools we need for such
+# "Webscraping" of data....
+
+if (! requireNamespace("rvest", quietly=TRUE)) {
+  install.packages("rvest")
+}
+
+if (! requireNamespace("xml2", quietly=TRUE)) {
+  install.packages("xml2")
+}
+
+# Here's the process:
+# The URL is an "open" page on the student Wiki. Users that are not logged in
+# can view the contents, but you can only edit if you are logged in.
+myURL <- "http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public"
+
+# First thing is to retrieve the HTML from the url...
+x <- xml2::read_html(myURL)
+
+# This retrieves the page source, but that still needs to be parsed into its
+# logical elements. HTML resembles XML, and such documents are structured as
+# trees, that have "nodes" which are demarcated with "tags". rvest::html_nodes()
+# parses out the document structure and then uses a so-called "xpath" expression
+# to select nodes we are interested in. Now, xpath is one of those specialized
+# languages of which there are a few more to learn than one would care for. You
+# MUST know how to format sprintf() expressions, and you SHOULD be competent
+# with regular expressions. But if you want to be really competent in your work,
+# basic HTML and CSS is required ... and enough knowledge about xpath to be able
+# to search on Stackoverflow for what you need for parsing data out of Web
+# documents...
+
+# The expression we use below is:
+#   - get any node anywhere in the tree ("//*") ...
+#   - that has a particular attribute("[@ ... ]").
+#   - The attribute we want is that the class of the node is "protein-data";
+#      that is the class we have defined for our <pre> tags.
+# As a result of this selection, we get a list of pointers to the document tree.
+y <- rvest::html_nodes(x, xpath ='//*[@class="protein-data"]')
+
+# Next we fetch the actual payload - the text - from the tree:
+# rvest::html_text() gets the text from the list of pointers. The result is a
+# character vector, one string per selected node.
+z <- rvest::html_text(y)
+
+# Finally we can iterate over the list, and add all proteins we don't already
+# have to our database. There may well be items that are rejected because they
+# are already present in the database - for example, unless somebody has
+# annotated new features, all of the features are already there. Don't worry -
+# that is intended; we don't want duplicate entries.
+
+for (thisJSON in z) {
+  thisData <- jsonlite::fromJSON(thisJSON)
+  if (! thisData$protein$name %in% myDB$protein$name) {
+    myDB <- dbAddProtein(myDB, thisData$protein)
+    myDB <- dbAddTaxonomy(myDB, thisData$taxonomy)
+    myDB <- dbAddFeature(myDB, thisData$feature)
+    myDB <- dbAddAnnotation(myDB, thisData$annotation)
+  }
+}
+
+# Finally, we repeat our domain plot - it now includes the shared proteins:
+
+iRows <- grep("^MBP1_", myDB$protein$name)
+yMax <- length(iRows) * 1.1
+xMax <- max(nchar(myDB$protein$sequence[iRows])) * 1.1  # longest sequence
+
+# plot an empty frame
+oPar <- par(mar = c(4.2, 0.1, 3, 0.1))
+plot(1, 1,
+     xlim = c(-200, xMax + 100),
+     ylim = c(0, yMax),
+     type = "n",
+     axes = FALSE,
+     bty = "n",
+     main = "Mbp1 orthologue domain annotations",
+     xlab = "sequence position",
+     cex.axis = 0.8,
+     ylab="")
+axis(1, at = seq(0, xMax, by = 100))
+myCol <- colorRampPalette(c("#f2003c", "#F0A200",
+                            "#f0ea00", "#62C923",
+                            "#0A9A9B", "#1958C3",
+                            "#8000D3", "#D0007F"),
+                          space="Lab",
+                          interpolate="linear")(nrow(myDB$feature))
+myCol <- paste0(myCol, "55")
+legend(xMax - 150, 7,
+       legend = myDB$feature$name,
+       cex = 0.7,
+       fill = myCol,
+       bty = "n")
+
+for (i in seq_along(iRows)) {
+  plotProtein(myDB, myDB$protein$name[iRows[i]], i)
+}
+par(oPar)  # reset the plot parameters
+
+# ... the more proteins we can compare, the more we learn about the
+# architectural principles of this family's domains.
+
+
+# [END]
--- a/BIN-FUNC-Semantic_similarity.R
+++ b/BIN-FUNC-Semantic_similarity.R
@ -1,169 +1,169 @@
-# tocID <- "BIN-FUNC-Semantic_similarity.R"
-#
-# Purpose:  A Bioinformatics Course:
-#              R code accompanying the BIN-FUNC_Semantic_similarity unit.
-#
-# Version:  1.2
-#
-# Date:     2017-11  -  2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           1.2    2020 Maintenance
-#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout,
-#                      use Biocmanager:: not biocLite()
-#           1.0    New code.
-#
-#
-# TODO:
-#
-#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC> 
-#TOC>   Section  Title                                                Line
-#TOC> --------------------------------------------------------------------
-#TOC>   1        Preparations: Packages, AnnotationDB, Setup            43
-#TOC>   2        Fetch GO Annotations                                  100
-#TOC>   3        Semantic Similarities                                 109
-#TOC>   4        GO Term Enrichment in Gene Sets                       127
-#TOC> 
-#TOC> ==========================================================================
-
-
-# =    1  Preparations: Packages, AnnotationDB, Setup  =========================
-
-if (! requireNamespace("BiocManager", quietly = TRUE)) {
-  install.packages("BiocManager")
-}
-
-# GOSim is an R-package in the Bioconductor project.
-if (! requireNamespace("GOSim", quietly = TRUE)) {
-  BiocManager::install("GOSim")
-}
-# Package information:
-#  library(help = GOSim)       # basic information
-#  browseVignettes("GOSim")    # available vignettes
-#  data(package = "GOSim")     # available datasets
-
-# GOSim makes extensive assumptions about loaded packages, and many base
-# methods are masked. We will thus use library(GOSim) to load it
-# in its entirety and with all packages it depends on. We will still use
-# the <package>::<function>() syntax in the code below, but this now serves
-# more of a didactic purpose, rather than actual syntax requirements.
-
-library(GOSim)
-
-# GOSim loads human annotations in  org.Hs.eg.db  by default. We load yeast
-# annotations instead...
-if (! requireNamespace("org.Sc.sgd.db", quietly = TRUE)) {
-  BiocManager::install("org.Sc.sgd.db")
-}
-
-# Bioconductor annotation packages won't work stably unless we actually load
-# them:
-library(org.Sc.sgd.db)
-
-# org.Sc.sgd.db is a Bioconductor annotation database curated by SGD. Such
-# databases exist for all model organisms. It's a kind of a fancy data frame
-# from which we can get annotations by rows (genes) with the keys() funtion ...
-AnnotationDbi::keys(org.Sc.sgd.db)[1500:1510]
-
-# ... and the types of available annotations with the columns() function
-AnnotationDbi::columns(org.Sc.sgd.db)
-
-# Note that one of the columns is "GO" ... and we load that into the
-# datastructures used by GOSim:
-
-# Choose GOterms to use
-GOSim::setEvidenceLevel(evidences = "all",
-                        organism = org.Sc.sgdORGANISM,
-                        gomap = org.Sc.sgdGO)
-
-# Use Biological Process ontology
-GOSim::setOntology("BP", loadIC = FALSE)
-
-# confirm that we loaded the correct ontology
-head(get("gomap", envir = GOSimEnv))
-
-
-
-# =    2  Fetch GO Annotations  ================================================
-
-
-# All keys being used here are yeast systematic names.
-
-# Get one set of annotations
-GOSim::getGOInfo(c("YDL056W"))  # Mbp1
-
-
-# =    3  Semantic Similarities  ===============================================
-
-
-# Get semantic similarities between genes
-?getGeneSim
-
-# There are _many_ different metrics of term similarity implemented
-# in this package.
-
-                                                         # Mbp1 and...
-GOSim::getGeneSim("YDL056W","YLR182W",similarity = "OA") # Swi6 - MCB complex
-GOSim::getGeneSim("YDL056W","YER111C",similarity = "OA") # Swi4 - collaborators
-GOSim::getGeneSim("YDL056W","YBR160W",similarity = "OA") # Cdc28 - mediator
-GOSim::getGeneSim("YDL056W","YGR108W",similarity = "OA") # Clb1 - antagonist
-GOSim::getGeneSim("YDL056W","YLR079W",similarity = "OA") # Sic1 - antagonist
-GOSim::getGeneSim("YDL056W","YJL130C",similarity = "OA") # Pgk1 - Gluconeogenesis
-
-
-# =    4  GO Term Enrichment in Gene Sets  =====================================
-
-
-# Calculating GO term enrichment in gene sets is done with the Bioconductor
-# topGO package.
-if (! requireNamespace("topGO", quietly = TRUE)) {
-  BiocManager::install("topGO")
-}
-# Package information:
-#  library(help = topGO)       # basic information
-#  browseVignettes("topGO")    # available vignettes
-#  data(package = "topGO")     # available datasets
-
-# Once again - assumptions are made by GOsim that require us to load the
-# topGO package wholesale:
-library(topGO)
-
-# Let's define a gene set: GOterm enrichment for G1/S switch activators:
-mySet <- c("YFR028C", # Cdc14
-           "YDL056W", # Mbp1
-           "YLR182W", # Swi6
-           "YER111C", # Swi4
-           "YOR083W", # Whi5
-           "YBR160W", # Cdc28
-           "YMR199W", # Cln1
-           "YPL256C", # Cln2
-           "YAL040C") # Cln3
-
-allGenes <- AnnotationDbi::keys(org.Sc.sgd.db)
-allGenes <- allGenes[grep("^Y", allGenes)]  # This is the context against which
-                                            # we define enrichment
-
-myEnr <- GOenrichment(mySet, allGenes)
-
-sort(myEnr$p.values)  # Any significantly enriched terms? All of these are ...
-
-#Most significantly enriched is GO:0071931. What is this?
-annotate::getGOTerm("GO:0071931")  # ... makes sense.
-
-
-
-
-# [END]
+# tocID <- "BIN-FUNC-Semantic_similarity.R"
+#
+# Purpose:  A Bioinformatics Course:
+#              R code accompanying the BIN-FUNC_Semantic_similarity unit.
+#
+# Version:  1.2
+#
+# Date:     2017-11  -  2020-09
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.2    2020 Maintenance
+#           1.1    Change from require() to requireNamespace(),
+#                      use <package>::<function>() idiom throughout,
+#                      use BiocManager:: not biocLite()
+#           1.0    New code.
+#
+#
+# TODO:
+#
+#
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+# going on. That's not how it works ...
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                                                Line
+#TOC> --------------------------------------------------------------------
+#TOC>   1        Preparations: Packages, AnnotationDB, Setup            43
+#TOC>   2        Fetch GO Annotations                                  100
+#TOC>   3        Semantic Similarities                                 109
+#TOC>   4        GO Term Enrichment in Gene Sets                       127
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  Preparations: Packages, AnnotationDB, Setup  =========================
+
+if (! requireNamespace("BiocManager", quietly = TRUE)) {
+  install.packages("BiocManager")
+}
+
+# GOSim is an R-package in the Bioconductor project.
+if (! requireNamespace("GOSim", quietly = TRUE)) {
+  BiocManager::install("GOSim")
+}
+# Package information:
+#  library(help = GOSim)       # basic information
+#  browseVignettes("GOSim")    # available vignettes
+#  data(package = "GOSim")     # available datasets
+
+# GOSim makes extensive assumptions about loaded packages, and many base
+# methods are masked. We will thus use library(GOSim) to load it
+# in its entirety and with all packages it depends on. We will still use
+# the <package>::<function>() syntax in the code below, but this now serves
+# more of a didactic purpose, rather than actual syntax requirements.
+
+library(GOSim)
+
+# GOSim loads human annotations in  org.Hs.eg.db  by default. We load yeast
+# annotations instead...
+if (! requireNamespace("org.Sc.sgd.db", quietly = TRUE)) {
+  BiocManager::install("org.Sc.sgd.db")
+}
+
+# Bioconductor annotation packages won't work stably unless we actually load
+# them:
+library(org.Sc.sgd.db)
+
+# org.Sc.sgd.db is a Bioconductor annotation database curated by SGD. Such
+# databases exist for all model organisms. It's a kind of a fancy data frame
+# from which we can get annotations by rows (genes) with the keys() function ...
+AnnotationDbi::keys(org.Sc.sgd.db)[1500:1510]
+
+# ... and the types of available annotations with the columns() function
+AnnotationDbi::columns(org.Sc.sgd.db)
+
+# Note that one of the columns is "GO" ... and we load that into the
+# data structures used by GOSim:
+
+# Choose GOterms to use
+GOSim::setEvidenceLevel(evidences = "all",
+                        organism = org.Sc.sgdORGANISM,
+                        gomap = org.Sc.sgdGO)
+
+# Use Biological Process ontology
+GOSim::setOntology("BP", loadIC = FALSE)
+
+# confirm what was loaded: inspect the first entries of "gomap" in GOSimEnv
+head(get("gomap", envir = GOSimEnv))
+
+
+
+# =    2  Fetch GO Annotations  ================================================
+
+
+# All keys being used here are yeast systematic names.
+
+# Get one set of annotations
+GOSim::getGOInfo(c("YDL056W"))  # Mbp1
+
+
+# =    3  Semantic Similarities  ===============================================
+
+
+# Get semantic similarities between genes
+?getGeneSim
+
+# There are _many_ different metrics of term similarity implemented in this
+# package; similarity = "OA" selects the optimal-assignment gene similarity.
+
+                                                         # Mbp1 and...
+GOSim::getGeneSim("YDL056W","YLR182W",similarity = "OA") # Swi6 - MCB complex
+GOSim::getGeneSim("YDL056W","YER111C",similarity = "OA") # Swi4 - collaborators
+GOSim::getGeneSim("YDL056W","YBR160W",similarity = "OA") # Cdc28 - mediator
+GOSim::getGeneSim("YDL056W","YGR108W",similarity = "OA") # Clb1 - antagonist
+GOSim::getGeneSim("YDL056W","YLR079W",similarity = "OA") # Sic1 - antagonist
+GOSim::getGeneSim("YDL056W","YJL130C",similarity = "OA") # Pgk1 - Gluconeogenesis
+# NOTE(review): SGD seems to list YJL130C as URA2 (PGK1 is YCR012W) - confirm.
+
+# =    4  GO Term Enrichment in Gene Sets  =====================================
+
+
+# Calculating GO term enrichment in gene sets is done with the Bioconductor
+# topGO package.
+if (! requireNamespace("topGO", quietly = TRUE)) {
+  BiocManager::install("topGO")
+}
+# Package information:
+#  library(help = topGO)       # basic information
+#  browseVignettes("topGO")    # available vignettes
+#  data(package = "topGO")     # available datasets
+
+# Once again - assumptions are made by GOSim that require us to load the
+# topGO package wholesale:
+library(topGO)
+
+# Let's define a gene set: GOterm enrichment for G1/S switch activators:
+mySet <- c("YFR028C", # Cdc14
+           "YDL056W", # Mbp1
+           "YLR182W", # Swi6
+           "YER111C", # Swi4
+           "YOR083W", # Whi5
+           "YBR160W", # Cdc28
+           "YMR199W", # Cln1
+           "YPL256C", # Cln2
+           "YAL040C") # Cln3
+
+allGenes <- AnnotationDbi::keys(org.Sc.sgd.db)
+allGenes <- allGenes[grep("^Y", allGenes)]  # This is the context against which
+                                            # we define enrichment
+
+myEnr <- GOSim::GOenrichment(mySet, allGenes)
+
+sort(myEnr$p.values)  # Any significantly enriched terms? All of these are ...
+
+# Most significantly enriched is GO:0071931. What is this?
+annotate::getGOTerm("GO:0071931")  # ... makes sense.
+
+
+
+
+# [END]
--- a/BIN-MYSPE.R
+++ b/BIN-MYSPE.R
@ -1,351 +1,351 @@
-# tocID <- "BIN-MYSPE.R"
-#
-# Purpose: A Bioinformatics Course:
-#              R code accompanying the BIN-MYSPE unit
-#
-#
-# Version: 1.4
-#
-# Date:    2017-09 - 2021-10
-# Author:  Boris Steipe (boris.steipe@utoronto.ca)
-#
-# V 1.4    Add troubleshooting hints via errText[[...]]
-# V 1.3    2021 update of MYSPE mechanics; fix a bug no one had complained about
-# V 1.2    Reorganized proportional plot section into a "further reading"
-#          section, added nested-box, and sankey plot visualization of
-#          proportions. Introduced plotly.
-# V 1.1    2020 Workflow changes
-# V 1.0.1  Move ABC-makeMYSPElist.R to ./scripts directory
-# V 1.0    Final code, after rewriting BLAST parser and updating MYSPElist
-# V 0.1    First code copied from BCH441_A03_makeMYSPElist.R
-#
-# TODO:    Sample solution for sankey plot function.
-#
-#
-# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
-#
-# DO NOT SIMPLY  source()  THESE FILES!
-#
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-#  going on. That's not how it works ...
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC> 
-#TOC>   Section  Title                                             Line
-#TOC> -----------------------------------------------------------------
-#TOC>   1        PREPARATIONS                                        52
-#TOC>   2        SUITABLE MYSPE SPECIES                              65
-#TOC>   3        ADOPT "MYSPE"                                       89
-#TOC>   4        FURTHER READING: PLOTTING PROPORTIONS              128
-#TOC>   4.1        Percentages                                      146
-#TOC>   4.2        Visualizing proportions: Pie chart               165
-#TOC>   4.3        Visualizing proportions: Nested squares          243
-#TOC>   4.4        Visualizing proportions: Sankey diagrams         280
-#TOC> 
-#TOC> ==========================================================================
-
-
-# =    1  PREPARATIONS  ========================================================
-#
-
-# Execute the two conditionals below:
-if (! file.exists("./myScripts/.myProfile.R")) {
-  stop(errText[["noProfileFile"]])     # message defined in .Rprofile
-}
-
-if (! exists("myStudentNumber")) {
-  stop(errText[["noStudentNumber"]])   # message defined in .Rprofile
-}
-
-
-# =    2  SUITABLE MYSPE SPECIES  ==============================================
-
-
-# In this unit we will select one species from a list of genome sequenced fungi
-# and write it into your personalized profile file. This species will be called
-# "MYSPE" (My Species) for other learning units and exercises.
-
-# A detailed description of the process of compiling the list of genome
-# sequenced fungi with protein annotations and Mbp1 homologues is in the file
-# ./scripts/ABC-makeMYSPElist.R  In brief, data for genome-sequenced fungi
-# was retrieved from https://fungi.ensembl.org; a search for homologues to
-# yeast Mbp1 was performed with BLAST at the NCBI, and the data was merged.
-# A representative organism at each genus-level was chosen from those hits
-# that actual;ly have a homologue. Finally, a mapping table was constructed to
-# asymmetrically retrieve unique species: a student number will retrieve
-# a species, but (public) knowledge of the species cannot reconstruct the
-# student number.
-
-# Task: Study ./scripts/ABC-makeMYSPElist.R, it implements a typical workflow
-#       of selecting and combining data from various data resources. Studying
-#       it will give you a better sense of how such workflows can be
-#       implemented in practice.
-
-
-# =    3  ADOPT "MYSPE"  =======================================================
-
-# Execute:
-( MYSPE <- getMYSPE(myStudentNumber) )
-
-# If this produced an error, this session has not been properly set up. You
-# may not yet have run  init()  and edited  .myProfile.R , or that file is not
-# in your  myScripts/  folder. Fix this, and execute:
-#
-#    source(".Rprofile") .
-
-# If this produced NA, your Student Number may not be correct, or you are not in
-# my class-list. Contact me. Otherwise, this should have printed a species name,
-# and the taxonomy ID of its genome-sequenced strain. This is your unique
-# speciesfor this course. Note it in your journal ...
-
-biCode(MYSPE) # and also note it's "BiCode" ...
-( myTaxID <- names(MYSPE) )  # and its taxID
-
-
-# Task:
-# =====
-#   Note down the species name and its five letter BiCode on your Student
-#   Wiki user page. Use this species whenever this or future assignments refer
-#   to MYSPE. Whenever you start a session, it will automatically be loaded
-#   from  myScripts/.myProfile.R  and is available as  MYSPE .
-
-# Here is some more information about MYSPE, taken from the table of genome-
-# sequenced fungi that is in your ./data folder.
-fungiDat <- read.csv("data/Species.csv")
-iMs <- which(fungiDat$Taxon.ID == myTaxID)
-
-( myOr <- fungiDat$Classification[iMs] )  # Taxonomic order
-( myGn <- gsub("\\s.*", "", MYSPE))       # Taxonomic genus
-( mySt <- fungiDat$Name[iMs] )            # Taxonomic strain
-
-# That's all.
-
-
-# =    4  FURTHER READING: PLOTTING PROPORTIONS  ===============================
-
-# The material below is an exploration of data-preparation and plotting
-# techniques; you can treat this as additional practice and further reading and
-# I expect that some of the code and plotting examples may be useful in a
-# different context.
-
-# A frequent task is to visualize the proportion of elements with given
-# categories in a sample. For example, we might ask what the proportion of the
-# different orders of fungi is the order of MYSPE? Let's first collect the
-# numbers.
-
-( nFungi <- nrow(fungiDat) )                            # sequenced fungi
-( nOrder <- sum(grepl(myOr, fungiDat$Classification)) ) # same order as MYSPE
-( nGenus <- sum(grepl(myGn, fungiDat$Name)) )           # same genus as MYSPE
-( nSpecies <- sum(grepl(MYSPE, fungiDat$Name)) )        # same species as MYSPE
-
-
-# ==   4.1  Percentages  =======================================================
-
-# The zeroth-order approach to visualization is simply to print percentages:
-
-cat(sprintf("\n%s comprise %5.2f%% of fungi.",
-        myOr,
-        (nOrder * 100) / nFungi))
-
-# ... or, adding the actual numbers:
-
-cat(sprintf("\n%s comprise %5.2f%% of fungi (%d of %d).",
-            myOr,
-            (nOrder * 100) / nFungi,
-            nOrder,
-            nFungi))
-
-# But that's hard to visualize for most of us, and anyway, we don't know how
-# that relates to other orders.
-
-# ==   4.2  Visualizing proportions: Pie chart  ================================
-
-# Often, we will use a pie chart instead. Pie charts are rather informal types
-# of plots, not well suited for analysis. But easy to do:
-
-# Define four colors to identify the four categories
-pCol <- c("#ed394e", "#ff9582", "#ffd5c4", "#f2f2f0")
-
-oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))   # set margins to ~ 0
-                                           # and remember the
-                                           # previous setting
-
-pie(c(nSpecies,                            # subtract numbers since these
-      nGenus - nSpecies,                   # categories are mutually contained
-      nOrder - nGenus - nSpecies,          # in each other
-      nFungi - nOrder - nGenus - nSpecies),
-      labels = "",
-      radius = 0.9,
-      main = "MYSPE in genome-sequenced fungi",
-      lty = 0,                             # turn borders for wedges off
-      col = pCol,
-      clockwise = TRUE,
-      init.angle = 90)
-
-title(main=MYSPE, line=0, cex.main=0.7)    # add a title to the plot
-
-legend(x = 0.95, y = 0.8,    # place at legend here
-       legend = c("Species", "Genus", "Order", "Fungi"),
-       y.intersp = 2,                      # line spacing for labels
-       cex = 0.8,                          # character size for labels
-       bty = "n",                          # "no" box around the legend
-       pt.cex = 2,                         # size of colour boxes
-       pch = 15,                           # a filled square
-       col = pCol)
-
-par(oPar)                                  # reset graphics state
-
-# Unless MYSPE is one of the frequently sequenced species, there will only be a
-# very thin wedge visible. Pie charts are not well suited to visualize small
-# proportions.
-
-# It is a little more useful if we have non-nested proportions - like the
-# number of species in the same order overall:
-
-myTbl <- sort(table(fungiDat$Classification), decreasing = TRUE)
-head(myTbl)
-
-# pie() does a reasonable job out of the box to interpret table() data:
-pie(myTbl)
-
-# ... we can improve this quickly with a bit of tweaking:
-
-N <- length(myTbl)
-sel <- myOr == names(myTbl) # TRUE for the MYSPE order, FALSE elsewhere
-
-myCol <- rep(pCol[4], N)       # N elements of pCol[1]
-myCol[sel] <- pCol[1]          # replace this one color
-
-myLbl <- rep("", N)            # N labels of ""
-myLbl[sel] <- myOr             # replace this one label with the MYSPE order
-
-
-oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))   # set margins to ~ 0
-
-pie(myTbl,
-    labels = myLbl,
-    radius = 0.9,
-    main = "MYSPE order",
-    border = "#DDDDDD",
-    col = myCol,
-    clockwise = TRUE,
-    init.angle = 90)
-
-par(oPar)                                  # reset graphics state
-
-# But the overall problem remains.
-
-
-# ==   4.3  Visualizing proportions: Nested squares  ===========================
-
-# A simple alternative is to draw such proportions as nested squares:
-
-x <- sqrt(nFungi)
-
-# set margins to ~ 0 and type to square
-oPar <- par(mar = c(0.1, 0.1, 0.1, 0.1), pty = "s")
-
-# empty, square plot
-plot(c(0, x), c(0, x), xlim = c(0, x), ylim = c(0, x),
-     type="n", axes=FALSE, xlab="", ylab="")
-
-# basic square for all genomes
-rect(0, 0, x,              x,              col = pCol[4])
-
-# grid
-u <- 0:floor(x)
-N <- length(u)
-segments(rep(0, N), u, rep(x, N), u, col = "#0000FF18")
-segments(u, rep(0, N), u, rep(x, N), col = "#0000FF18")
-# each square on this grid is one genome
-
-# colored squares
-rect(0, 0, sqrt(nOrder),   sqrt(nOrder),   col = pCol[3])
-rect(0, 0, sqrt(nGenus),   sqrt(nGenus),   col = pCol[2])
-rect(0, 0, sqrt(nSpecies), sqrt(nSpecies), col = pCol[1])
-
-# labels
-text(x/2, x/2,      "Fungi")
-text(x * 0.08, x * 0.11, myOr,   pos = 4, cex = 0.9)
-text(x * 0.08, x * 0.06, myGn,   pos = 4, cex = 0.8)
-text(x * 0.08, x * 0.02, MYSPE, pos = 4, cex = 0.7)
-
-par(oPar)                                  # reset graphics state
-
-
-# ==   4.4  Visualizing proportions: Sankey diagrams  ==========================
-
-# Sankey diagrams are an excellent way to visualize complicated nested
-# proportions and their changes (see here for example:
-# https://www.r-graph-gallery.com/sankey-diagram.html). Here is a very simple
-# example with the MYSPE proportions, as an illustration of the plotting
-# principle.
-
-if (! requireNamespace("plotly")) {
-  install.packages("plotly")
-}
-# Package information:
-#  library(help   = plotly)     # basic information
-#  browseVignettes("plotly")    # available vignettes
-#  data(package  = "plotly")    # available datasets
-
-# Here, we use the plotly package that wraps a very well developed javascript
-# library with many options for interactive plots. I am producing this plot
-# hard-coded for the sample organism "Sporothrix schenkii"; you would need
-# to change the code to adapt it to your own MYSPE - or even build a function
-# for this. Do try this if you have a bit of coding experience, sankey diagrams
-# are a good way to show hierarchical data relations - and if you get this
-# working for your own organism you can be proud that you have understood
-# how preparing the data works.
-
-
-myNodes <- list(label = c("Fungi (1014)",              # 0 <- node ID
-                          "Ophiostomatales (6)",       # 1
-                          "Other...",                  # 2
-                          "Sporothrix (4)",            # 3
-                          "Other...",                  # 4
-                          "Sporothrix schenckii (2)",  # 5
-                          "Other..."                   # 6
-                          ),
-                x = c(0.1, 0.4, 0.4, 0.7, 0.7, 1.0, 1.0),
-                y = c(0.3, 0.1, 0.7, 0.2, 0.7, 0.3, 0.7),
-                color = c("#f2f2f0", #
-                          "#ffd5c4",
-                          "#CCCCCC",
-                          "#ff9582",
-                          "#CCCCCC",
-                          "#ed394e",
-                          "#CCCCCC"
-                          ),
-                pad = 15,
-                thickness = 20,
-                line = list(color = "black",
-                            width = 0.5))
-
-myLinks <- list(source = c(0, 0, 1, 1, 3, 3),   # i.e. there is a link of
-                target = c(1, 2, 3, 4, 5, 6),   # weight 6 between node 0
-                value =  c(6, 18, 4, 2, 2, 2))  # and node 1
-
-# Setting up the actual plot ...
-fig  <-  plotly::plot_ly(type = "sankey",
-                         arrangement = "snap",
-                         orientation = "h",
-                         node = myNodes,
-                         link = myLinks)
-
-# Adding and adjusting a few layout parameters
-fig <- plotly::layout(fig,
-              title = "Fungi Genomes - Classification",
-              font = list(size = 10))
-
-fig     # plot the diagram
-
-# Note that the plot appears in the Viewer window, not the Plot window, and that
-# it is interactive: you can hover over nodes and links, and drag the nodes
-# around.
-
-# [END]
+# tocID <- "BIN-MYSPE.R"
+#
+# Purpose: A Bioinformatics Course:
+#              R code accompanying the BIN-MYSPE unit
+#
+#
+# Version: 1.4
+#
+# Date:    2017-09 - 2021-10
+# Author:  Boris Steipe (boris.steipe@utoronto.ca)
+#
+# V 1.4    Add troubleshooting hints via errText[[...]]
+# V 1.3    2021 update of MYSPE mechanics; fix a bug no one had complained about
+# V 1.2    Reorganized proportional plot section into a "further reading"
+#          section, added nested-box, and sankey plot visualization of
+#          proportions. Introduced plotly.
+# V 1.1    2020 Workflow changes
+# V 1.0.1  Move ABC-makeMYSPElist.R to ./scripts directory
+# V 1.0    Final code, after rewriting BLAST parser and updating MYSPElist
+# V 0.1    First code copied from BCH441_A03_makeMYSPElist.R
+#
+# TODO:    Sample solution for sankey plot function.
+#
+#
+# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
+#
+# DO NOT SIMPLY  source()  THESE FILES!
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+#  going on. That's not how it works ...
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                                             Line
+#TOC> -----------------------------------------------------------------
+#TOC>   1        PREPARATIONS                                        52
+#TOC>   2        SUITABLE MYSPE SPECIES                              65
+#TOC>   3        ADOPT "MYSPE"                                       89
+#TOC>   4        FURTHER READING: PLOTTING PROPORTIONS              128
+#TOC>   4.1        Percentages                                      146
+#TOC>   4.2        Visualizing proportions: Pie chart               165
+#TOC>   4.3        Visualizing proportions: Nested squares          243
+#TOC>   4.4        Visualizing proportions: Sankey diagrams         280
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  PREPARATIONS  ========================================================
+#
+
+# Execute the two conditionals below:
+if (! file.exists("./myScripts/.myProfile.R")) {
+  stop(errText[["noProfileFile"]])     # message defined in .Rprofile
+}
+
+if (! exists("myStudentNumber")) {
+  stop(errText[["noStudentNumber"]])   # message defined in .Rprofile
+}
+
+
+# =    2  SUITABLE MYSPE SPECIES  ==============================================
+
+
+# In this unit we will select one species from a list of genome sequenced fungi
+# and write it into your personalized profile file. This species will be called
+# "MYSPE" (My Species) for other learning units and exercises.
+
+# A detailed description of the process of compiling the list of genome
+# sequenced fungi with protein annotations and Mbp1 homologues is in the file
+# ./scripts/ABC-makeMYSPElist.R  In brief, data for genome-sequenced fungi
+# was retrieved from https://fungi.ensembl.org; a search for homologues to
+# yeast Mbp1 was performed with BLAST at the NCBI, and the data was merged.
+# A representative organism at each genus-level was chosen from those hits
+# that actually have a homologue. Finally, a mapping table was constructed to
+# asymmetrically retrieve unique species: a student number will retrieve
+# a species, but (public) knowledge of the species cannot reconstruct the
+# student number.
+
+# Task: Study ./scripts/ABC-makeMYSPElist.R, it implements a typical workflow
+#       of selecting and combining data from various data resources. Studying
+#       it will give you a better sense of how such workflows can be
+#       implemented in practice.
+
+
+# =    3  ADOPT "MYSPE"  =======================================================
+
+# Execute:
+( MYSPE <- getMYSPE(myStudentNumber) )
+
+# If this produced an error, this session has not been properly set up. You
+# may not yet have run  init()  and edited  .myProfile.R , or that file is not
+# in your  myScripts/  folder. Fix this, and execute:
+#
+#    source(".Rprofile") .
+
+# If this produced NA, your Student Number may not be correct, or you are not in
+# my class-list. Contact me. Otherwise, this should have printed a species name,
+# and the taxonomy ID of its genome-sequenced strain. This is your unique
+# species for this course. Note it in your journal ...
+
+biCode(MYSPE) # and also note its "BiCode" ...
+( myTaxID <- names(MYSPE) )  # and its taxID
+
+
+# Task:
+# =====
+#   Note down the species name and its five letter BiCode on your Student
+#   Wiki user page. Use this species whenever this or future assignments refer
+#   to MYSPE. Whenever you start a session, it will automatically be loaded
+#   from  myScripts/.myProfile.R  and is available as  MYSPE .
+
+# Here is some more information about MYSPE, taken from the table of genome-
+# sequenced fungi that is in your ./data folder.
+fungiDat <- read.csv("data/Species.csv")
+iMs <- which(fungiDat$Taxon.ID == myTaxID)
+
+( myOr <- fungiDat$Classification[iMs] )  # Taxonomic order
+( myGn <- gsub("\\s.*", "", MYSPE))       # Taxonomic genus
+( mySt <- fungiDat$Name[iMs] )            # Taxonomic strain
+
+# That's all.
+
+
+# =    4  FURTHER READING: PLOTTING PROPORTIONS  ===============================
+
+# The material below is an exploration of data-preparation and plotting
+# techniques; you can treat this as additional practice and further reading and
+# I expect that some of the code and plotting examples may be useful in a
+# different context.
+
+# A frequent task is to visualize the proportion of elements with given
+# categories in a sample. For example, we might ask what proportion of the
+# genome-sequenced fungi belongs to the same order as MYSPE. Let's first
+# collect the numbers.
+
+# Count the table rows that contain the order, genus and species names of
+# MYSPE. The names are matched as literal substrings (fixed = TRUE), not as
+# regular expressions: taxonomic names can contain regex metacharacters, e.g.
+# the brackets in "[Candida] auris" or the dot in "sp.".
+
+# sequenced fungi
+( nFungi <- nrow(fungiDat) )
+# same order as MYSPE
+( nOrder <- sum(grepl(myOr, fungiDat$Classification, fixed = TRUE)) )
+# same genus as MYSPE
+( nGenus <- sum(grepl(myGn, fungiDat$Name, fixed = TRUE)) )
+# same species as MYSPE
+( nSpecies <- sum(grepl(MYSPE, fungiDat$Name, fixed = TRUE)) )
+
+
+# ==   4.1  Percentages  =======================================================
+
+# The zeroth-order approach to visualization is simply to print percentages:
+
+# Share of MYSPE's order among all sequenced fungi, as a percentage with two
+# decimals ...
+cat(sprintf("\n%s comprise %5.2f%% of fungi.", myOr, 100 * nOrder / nFungi))
+
+# ... or, adding the actual numbers:
+
+cat(
+  sprintf(
+    "\n%s comprise %5.2f%% of fungi (%d of %d).",
+    myOr, 100 * nOrder / nFungi, nOrder, nFungi
+  )
+)
+
+# But that's hard to visualize for most of us, and anyway, we don't know how
+# that relates to other orders.
+
+# ==   4.2  Visualizing proportions: Pie chart  ================================
+
+# Often, we will use a pie chart instead. Pie charts are rather informal types
+# of plots, not well suited for analysis, but they are easy to produce:
+
+# Define four colors to identify the four categories
+pCol <- c("#ed394e", "#ff9582", "#ffd5c4", "#f2f2f0")
+
+oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))   # set margins to ~ 0
+                                           # and remember the
+                                           # previous setting
+
+# The categories are nested: the species count is contained in the genus
+# count, which is contained in the order count, which is contained in the
+# count of all fungi. Each wedge is therefore the count of a category minus
+# the count of the next smaller category ONLY - subtracting more than that
+# would remove the inner categories twice, the wedges would no longer add up
+# to nFungi, and a wedge could even become negative (pie() then fails).
+pie(c(nSpecies,                            # MYSPE itself
+      nGenus - nSpecies,                   # rest of the genus
+      nOrder - nGenus,                     # rest of the order
+      nFungi - nOrder),                    # all other fungi
+      labels = "",
+      radius = 0.9,
+      main = "MYSPE in genome-sequenced fungi",
+      lty = 0,                             # turn borders for wedges off
+      col = pCol,
+      clockwise = TRUE,
+      init.angle = 90)
+
+title(main=MYSPE, line=0, cex.main=0.7)    # add a title to the plot
+
+legend(x = 0.95, y = 0.8,                  # place the legend here
+       legend = c("Species", "Genus", "Order", "Fungi"),
+       y.intersp = 2,                      # line spacing for labels
+       cex = 0.8,                          # character size for labels
+       bty = "n",                          # "no" box around the legend
+       pt.cex = 2,                         # size of colour boxes
+       pch = 15,                           # a filled square
+       col = pCol)
+
+par(oPar)                                  # reset graphics state
+
+# Unless MYSPE is one of the frequently sequenced species, there will only be a
+# very thin wedge visible. Pie charts are not well suited to visualize small
+# proportions.
+
+# It is a little more useful if we have non-nested proportions - like the
+# number of species in the same order overall:
+
+# Count how often each classification occurs, most frequent first
+myTbl <- sort(table(fungiDat$Classification), decreasing = TRUE)
+head(myTbl)
+
+# pie() does a reasonable job out of the box to interpret table() data:
+pie(myTbl)
+
+# ... we can improve this quickly with a bit of tweaking:
+
+N <- length(myTbl)
+sel <- myOr == names(myTbl) # TRUE for the MYSPE order, FALSE elsewhere
+
+myCol <- rep(pCol[4], N)       # N elements of pCol[4] (the background color)
+myCol[sel] <- pCol[1]          # highlight the MYSPE order with pCol[1]
+
+myLbl <- rep("", N)            # N labels of ""
+myLbl[sel] <- myOr             # replace this one label with the MYSPE order
+
+
+oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))   # set margins to ~ 0 and remember
+                                           # the previous setting
+
+pie(myTbl,
+    labels = myLbl,
+    radius = 0.9,
+    main = "MYSPE order",
+    border = "#DDDDDD",
+    col = myCol,
+    clockwise = TRUE,
+    init.angle = 90)
+
+par(oPar)                                  # reset graphics state
+
+# But the overall problem remains.
+
+
+# ==   4.3  Visualizing proportions: Nested squares  ===========================
+
+# A simple alternative is to draw such proportions as nested squares:
+
+# Side length of a square whose AREA equals the number of sequenced fungi
+x <- sqrt(nFungi)
+
+# set margins to ~ 0 and type to square
+oPar <- par(mar = c(0.1, 0.1, 0.1, 0.1), pty = "s")
+
+# empty, square plot
+plot(c(0, x), c(0, x), xlim = c(0, x), ylim = c(0, x),
+     type="n", axes=FALSE, xlab="", ylab="")
+
+# basic square for all genomes
+rect(0, 0, x,              x,              col = pCol[4])
+
+# grid
+u <- 0:floor(x)                 # integer grid positions along one side
+N <- length(u)
+segments(rep(0, N), u, rep(x, N), u, col = "#0000FF18")   # horizontal lines
+segments(u, rep(0, N), u, rep(x, N), col = "#0000FF18")   # vertical lines
+# each square on this grid is one genome
+
+# colored squares: the side length is the square root of the count, so that
+# the area of each square is proportional to the count. Larger squares are
+# drawn first and the smaller ones are painted on top of them.
+rect(0, 0, sqrt(nOrder),   sqrt(nOrder),   col = pCol[3])
+rect(0, 0, sqrt(nGenus),   sqrt(nGenus),   col = pCol[2])
+rect(0, 0, sqrt(nSpecies), sqrt(nSpecies), col = pCol[1])
+
+# labels (positions are given as fractions of the side length)
+text(x/2, x/2,      "Fungi")
+text(x * 0.08, x * 0.11, myOr,   pos = 4, cex = 0.9)
+text(x * 0.08, x * 0.06, myGn,   pos = 4, cex = 0.8)
+text(x * 0.08, x * 0.02, MYSPE, pos = 4, cex = 0.7)
+
+par(oPar)                                  # reset graphics state
+
+
+# ==   4.4  Visualizing proportions: Sankey diagrams  ==========================
+
+# Sankey diagrams are an excellent way to visualize complicated nested
+# proportions and their changes (see here for example:
+# https://www.r-graph-gallery.com/sankey-diagram.html). Here is a very simple
+# example with the MYSPE proportions, as an illustration of the plotting
+# principle.
+
+# Install plotly only if it is not yet available. quietly = TRUE suppresses
+# the load messages, like the other package checks in the course scripts do.
+if (! requireNamespace("plotly", quietly = TRUE)) {
+  install.packages("plotly")
+}
+# Package information:
+#  library(help   = plotly)     # basic information
+#  browseVignettes("plotly")    # available vignettes
+#  data(package  = "plotly")    # available datasets
+
+# Here, we use the plotly package that wraps a very well developed javascript
+# library with many options for interactive plots. I am producing this plot
+# hard-coded for the sample organism "Sporothrix schenckii"; you would need
+# to change the code to adapt it to your own MYSPE - or even build a function
+# for this. Do try this if you have a bit of coding experience, sankey diagrams
+# are a good way to show hierarchical data relations - and if you get this
+# working for your own organism you can be proud that you have understood
+# how preparing the data works.
+
+
+# Node definitions. The position of a node in these vectors is its ID,
+# counted from 0; the links below refer to nodes by these IDs.
+myNodes <- list(label = c("Fungi (1014)",              # 0 <- node ID
+                          "Ophiostomatales (6)",       # 1
+                          "Other...",                  # 2
+                          "Sporothrix (4)",            # 3
+                          "Other...",                  # 4
+                          "Sporothrix schenckii (2)",  # 5
+                          "Other..."                   # 6
+                          ),
+                x = c(0.1, 0.4, 0.4, 0.7, 0.7, 1.0, 1.0),
+                y = c(0.3, 0.1, 0.7, 0.2, 0.7, 0.3, 0.7),
+                color = c("#f2f2f0", # same colors as pCol; grey for "Other..."
+                          "#ffd5c4",
+                          "#CCCCCC",
+                          "#ff9582",
+                          "#CCCCCC",
+                          "#ed394e",
+                          "#CCCCCC"
+                          ),
+                pad = 15,
+                thickness = 20,
+                line = list(color = "black",
+                            width = 0.5))
+
+# Link definitions: element i of source, target and value describes one link.
+# NOTE(review): the value 18 for "Fungi" -> "Other..." is not 1014 - 6;
+# presumably it was scaled down to keep the small links visible - confirm.
+myLinks <- list(source = c(0, 0, 1, 1, 3, 3),   # i.e. there is a link of
+                target = c(1, 2, 3, 4, 5, 6),   # weight 6 between node 0
+                value =  c(6, 18, 4, 2, 2, 2))  # and node 1
+
+# Setting up the actual plot ...
+fig  <-  plotly::plot_ly(type = "sankey",
+                         arrangement = "snap",
+                         orientation = "h",
+                         node = myNodes,
+                         link = myLinks)
+
+# Adding and adjusting a few layout parameters
+fig <- plotly::layout(fig,
+              title = "Fungi Genomes - Classification",
+              font = list(size = 10))
+
+fig     # plot the diagram
+
+# Note that the plot appears in the Viewer window, not the Plot window, and that
+# it is interactive: you can hover over nodes and links, and drag the nodes
+# around.
+
+# [END]
--- a/BIN-PHYLO-Data_preparation.R
+++ b/BIN-PHYLO-Data_preparation.R
@ -1,234 +1,234 @@
-# tocID <- "BIN-PHYLO-Data_preparation.R"
-#
-# Purpose:  A Bioinformatics Course:
-#              R code accompanying the BIN-PHYLO-Data_preparation unit.
-#
-# Version:  1.2
-#
-# Date:     2017-10  -  2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           1.2    2020 Maintenance
-#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout,
-#                      use Biocmanager:: not biocLite()
-#           1.0    First 2017 version
-#           0.1    First code copied from 2016 material.
-#
-#
-# TODO:
-#
-#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC> 
-#TOC>   Section  Title                                     Line
-#TOC> ---------------------------------------------------------
-#TOC>   1        Preparations                                45
-#TOC>   2        Fetching sequences                          77
-#TOC>   3        Multiple Sequence Alignment                118
-#TOC>   4        Reviewing and Editing Alignments           137
-#TOC>   4.1        Masking workflow                         153
-#TOC> 
-#TOC> ==========================================================================
-
-
-# =    1  Preparations  ========================================================
-
-
-# You need to reload your protein database, including changes that might have
-# been made to the reference files. If you have worked with the prerequiste
-# units, you should have a script named "makeProteinDB.R" that will create the
-# myDB object with a protein and feature database. Ask for advice if not.
-source("myScripts/makeProteinDB.R")
-
-# Load packages we need
-
-if (! requireNamespace("BiocManager", quietly = TRUE)) {
-  install.packages("BiocManager")
-}
-if (! requireNamespace("Biostrings", quietly = TRUE)) {
-  BiocManager::install("Biostrings")
-}
-# Package information:
-#  library(help = Biostrings)       # basic information
-#  browseVignettes("Biostrings")    # available vignettes
-#  data(package = "Biostrings")     # available datasets
-
-
-if (! requireNamespace("msa", quietly = TRUE)) {
-  BiocManager::install("msa")
-}
-# Package information:
-#  library(help = msa)       # basic information
-#  browseVignettes("msa")  # available vignettes
-#  data(package = "msa")   # available datasets
-
-
-# =    2  Fetching sequences  ==================================================
-
-
-# myDB contains the ten Mbp1 orthologues from the reference species and the Mbp1
-# RBM for MYSPE. We will construct a phylogenetic tree from the proteins' APSES
-# domains. You have annotated their ranges as a feature. The following code
-# retrieves the sequences from myDB. You have seen similar code in other units.
-
-sel <- grep("^MBP1_", myDB$protein$name)
-(proNames <- myDB$protein$name[sel])
-(proIDs <- myDB$protein$ID[sel])
-
-(sel <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
-(fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID %in% proIDs & # %in% !
-                              myDB$annotation$featureID == sel])      #  ==  !
-                                                                      # Why?
-APSI <- character(length(fanIDs))
-
-for (i in seq_along(fanIDs)) {
-  sel   <- myDB$annotation$ID == fanIDs[i]  # get the feature row index
-  proID <- myDB$annotation$proteinID[sel]   # get its protein ID
-  start <- myDB$annotation$start[sel]       # get start ...
-  end   <- myDB$annotation$end[sel]         # ... and end
-
-  sel <- myDB$protein$ID == proID           # get the protein row index ...
-                                            # ... and the sequence
-  APSI[i] <- substring(myDB$protein$sequence[sel], start, end)
-  names(APSI)[i] <- (myDB$protein$name[sel])
-}
-
-head(APSI)
-
-# Let's add the E.coli Kila-N domain sequence as an outgroup, for rooting our
-# phylogenetic tree (see the unit's Wiki page for details on the sequence).
-
-APSI <- c(APSI,
-"IDGEIIHLRAKDGYINATSMCRTAGKLLSDYTRLKTTQEFFDELSRDMGIPISELIQSFKGGRPENQGTWVHPDIAINLAQ")
-names(APSI)[length(APSI)] <- "KILA_ESCCO"
-tail(APSI)
-
-
-# =    3  Multiple Sequence Alignment  =========================================
-
-# This vector of sequences with named elements fulfills the requirements to be
-# imported as a Biostrings object - an AAStringSet - which we need as input for
-# the MSA algorithms in Biostrings.
-#
-
-APSESSet <- Biostrings::AAStringSet(APSI)
-APSESMsa <- msa::msaMuscle(APSESSet, order = "aligned")
-
-# Nb. msaMuscle() sometimes fails - reproducibly, but I am not sure why. If
-# that happens in your case, just use msaClustalOmega() instead.
-
-# inspect the alignment.
-writeALN(APSESMsa)
-
-# What do you think? Is this a good alignment for phylogenetic inference?
-
-
-# =    4  Reviewing and Editing Alignments  ====================================
-
-
-# Head back to the Wiki page for this unit and read up on the background
-# first.
-
-# Let's mask out all columns that have observations for
-# less than 1/3 of the sequences in the dataset. This
-# means they have more than round(nrow(msaSet) * (2/3))
-# hyphens in a column.
-#
-# We take all sequences, split them into single
-# characters, and put them into a matrix. Then we
-# go through the matrix, column by column and decide
-# whether we want to include that column.
-
-# ==   4.1  Masking workflow  ==================================================
-
-# get the length of the alignment
-(lenAli <- APSESMsa@unmasked@ranges@width[1])
-
-# initialize a matrix that can hold all characters
-# individually
-msaMatrix <- matrix(character(nrow(APSESMsa) * lenAli),
-                    ncol = lenAli)
-
-# assign the correct rownames
-rownames(msaMatrix) <- APSESMsa@unmasked@ranges@NAMES
-for (i in 1:nrow(APSESMsa)) {
-  msaMatrix[i, ] <- unlist(strsplit(as.character(APSESMsa@unmasked[i]), ""))
-}
-
-# inspect the result
-msaMatrix[1:7, 30:40]
-
-# Now let's make a logical vector with an element for each column that selects
-# which columns should be masked out.
-
-# The number of hyphens in a column is easy to count. Consider:
-
-    msaMatrix[ , 20]             # column 20
-    msaMatrix[ , 20] == "-"      # TRUE for all gap characters
-sum(msaMatrix[ , 20] == "-")     # adds 1 for each TRUE
-
-# Thus filling our logical vector is simple:
-
-# initialize a mask
-colMask <- logical(ncol(msaMatrix))
-
-# define the threshold for rejecting a column
-limit <- round(nrow(APSESMsa) * (2/3))
-
-# iterate over all columns, and write TRUE if there are less-or-equal to "limit"
-# hyphens, FALSE if there are more - i.e. TRUE columns will be used for analysis
-# and FALSE columns will be rejected.
-for (i in 1:ncol(msaMatrix)) {
-  count <- sum(msaMatrix[ , i] == "-")
-  colMask[i] <- count <= limit # TRUE if less-or-equal to limit, FALSE if not
-}
-
-# Inspect the mask
-colMask
-
-# How many positions are being kept?
-sum(colMask)
-
-cat(sprintf("We are masking %4.2f %% of alignment columns.\n",
-            100 * (1 - (sum(colMask) / length(colMask)))))
-
-
-# Next, we use colMask to remove the masked columns from the matrix
-# in one step:
-maskedMatrix <- msaMatrix[ , colMask]
-
-# check:
-ncol(maskedMatrix)
-
-# ... then collapse each row of single characters back into a string ...
-APSESphyloSet <- character()
-for (i in 1:nrow(maskedMatrix)) {
-  APSESphyloSet[i] <- paste(maskedMatrix[i, ], collapse="")
-}
-names(APSESphyloSet) <- rownames(maskedMatrix)
-
-# inspect ...
-writeALN(APSESphyloSet)
-
-# As you see, we have removed a three residue insertion from MBP1_NEUCR, and
-# several indels from the KILA_ESCCO outgroup sequence.
-
-
-# We save the aligned, masked domains to a file in the data/ directory,
-# in multi-FASTA format.
-writeMFA(APSESphyloSet, myCon = "data/APSESphyloSet.mfa")
-
-
-
-# [END]
+# tocID <- "BIN-PHYLO-Data_preparation.R"
+#
+# Purpose:  A Bioinformatics Course:
+#              R code accompanying the BIN-PHYLO-Data_preparation unit.
+#
+# Version:  1.2
+#
+# Date:     2017-10  -  2020-09
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.2    2020 Maintenance
+#           1.1    Change from require() to requireNamespace(),
+#                      use <package>::<function>() idiom throughout,
+#                      use Biocmanager:: not biocLite()
+#           1.0    First 2017 version
+#           0.1    First code copied from 2016 material.
+#
+#
+# TODO:
+#
+#
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+# going on. That's not how it works ...
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                                     Line
+#TOC> ---------------------------------------------------------
+#TOC>   1        Preparations                                45
+#TOC>   2        Fetching sequences                          77
+#TOC>   3        Multiple Sequence Alignment                118
+#TOC>   4        Reviewing and Editing Alignments           137
+#TOC>   4.1        Masking workflow                         153
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  Preparations  ========================================================
+
+
+# You need to reload your protein database, including changes that might have
+# been made to the reference files. If you have worked with the prerequisite
+# units, you should have a script named "makeProteinDB.R" that will create the
+# myDB object with a protein and feature database. Ask for advice if not.
+source("myScripts/makeProteinDB.R")
+
+# Load packages we need
+
+# BiocManager is installed first because it is used below to install the
+# Biostrings and msa packages.
+if (! requireNamespace("BiocManager", quietly = TRUE)) {
+  install.packages("BiocManager")
+}
+if (! requireNamespace("Biostrings", quietly = TRUE)) {
+  BiocManager::install("Biostrings")
+}
+# Package information:
+#  library(help = Biostrings)       # basic information
+#  browseVignettes("Biostrings")    # available vignettes
+#  data(package = "Biostrings")     # available datasets
+
+
+if (! requireNamespace("msa", quietly = TRUE)) {
+  BiocManager::install("msa")
+}
+# Package information:
+#  library(help = msa)       # basic information
+#  browseVignettes("msa")  # available vignettes
+#  data(package = "msa")   # available datasets
+
+
+# =    2  Fetching sequences  ==================================================
+
+
+# myDB contains the ten Mbp1 orthologues from the reference species and the Mbp1
+# RBM for MYSPE. We will construct a phylogenetic tree from the proteins' APSES
+# domains. You have annotated their ranges as a feature. The following code
+# retrieves the sequences from myDB. You have seen similar code in other units.
+
+# Select all proteins whose name begins with "MBP1_"
+sel <- grep("^MBP1_", myDB$protein$name)
+(proNames <- myDB$protein$name[sel])
+(proIDs <- myDB$protein$ID[sel])
+
+# Note: sel is reused - from here on it holds the ID of the "APSES fold"
+# feature, and inside the loop below it holds logical row selections.
+(sel <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
+(fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID %in% proIDs & # %in% !
+                              myDB$annotation$featureID == sel])      #  ==  !
+                                                                      # Why?
+APSI <- character(length(fanIDs))   # one (empty) string per annotation
+
+# For each annotation: look up its protein and its start and end position, then
+# cut the annotated range out of the protein sequence and name the result with
+# the protein name.
+for (i in seq_along(fanIDs)) {
+  sel   <- myDB$annotation$ID == fanIDs[i]  # get the feature row index
+  proID <- myDB$annotation$proteinID[sel]   # get its protein ID
+  start <- myDB$annotation$start[sel]       # get start ...
+  end   <- myDB$annotation$end[sel]         # ... and end
+
+  sel <- myDB$protein$ID == proID           # get the protein row index ...
+                                            # ... and the sequence
+  APSI[i] <- substring(myDB$protein$sequence[sel], start, end)
+  names(APSI)[i] <- (myDB$protein$name[sel])
+}
+
+head(APSI)
+
+# Let's add the E.coli Kila-N domain sequence as an outgroup, for rooting our
+# phylogenetic tree (see the unit's Wiki page for details on the sequence).
+
+# Append the outgroup sequence as a new element named "KILA_ESCCO"
+APSI <- c(
+  APSI,
+  KILA_ESCCO =
+"IDGEIIHLRAKDGYINATSMCRTAGKLLSDYTRLKTTQEFFDELSRDMGIPISELIQSFKGGRPENQGTWVHPDIAINLAQ"
+)
+tail(APSI)
+
+
+# =    3  Multiple Sequence Alignment  =========================================
+
+# This vector of sequences with named elements fulfills the requirements to be
+# imported as a Biostrings object - an AAStringSet - which we need as input for
+# the MSA algorithms in the msa package.
+#
+
+# Convert the named character vector to an AAStringSet and align it with MUSCLE
+APSESSet <- Biostrings::AAStringSet(APSI)
+APSESMsa <- msa::msaMuscle(APSESSet, order = "aligned")
+
+# Nb. msaMuscle() sometimes fails - reproducibly, but I am not sure why. If
+# that happens in your case, just use msaClustalOmega() instead.
+
+# inspect the alignment.
+writeALN(APSESMsa)
+
+# What do you think? Is this a good alignment for phylogenetic inference?
+
+
+# =    4  Reviewing and Editing Alignments  ====================================
+
+
+# Head back to the Wiki page for this unit and read up on the background
+# first.
+
+# Let's mask out all columns that have observations for
+# less than 1/3 of the sequences in the dataset. This
+# means they have more than round(nrow(APSESMsa) * (2/3))
+# hyphens in a column.
+#
+# We take all sequences, split them into single
+# characters, and put them into a matrix. Then we
+# go through the matrix, column by column and decide
+# whether we want to include that column.
+
+# ==   4.1  Masking workflow  ==================================================
+
+# get the length of the alignment
+(lenAli <- APSESMsa@unmasked@ranges@width[1])
+
+# initialize a matrix that can hold all characters
+# individually: one row per sequence, one column per alignment position
+msaMatrix <- matrix(character(nrow(APSESMsa) * lenAli),
+                    ncol = lenAli)
+
+# assign the correct rownames
+rownames(msaMatrix) <- APSESMsa@unmasked@ranges@NAMES
+
+# split each aligned sequence into single characters and store them in its
+# row. seq_len() is used rather than 1:nrow(), because 1:0 would count DOWN
+# (1, 0) and fail if the alignment were empty.
+for (i in seq_len(nrow(APSESMsa))) {
+  msaMatrix[i, ] <- unlist(strsplit(as.character(APSESMsa@unmasked[i]), ""))
+}
+
+# inspect the result
+msaMatrix[1:7, 30:40]
+
+# Now let's make a logical vector with an element for each column that records
+# which columns to keep (TRUE) and which to mask out (FALSE).
+
+# The number of hyphens in a column is easy to count. Consider:
+
+    msaMatrix[ , 20]             # column 20
+    msaMatrix[ , 20] == "-"      # TRUE for all gap characters
+sum(msaMatrix[ , 20] == "-")     # adds 1 for each TRUE
+
+# Thus filling our logical vector is simple:
+
+# initialize a mask
+colMask <- logical(ncol(msaMatrix))
+
+# define the threshold for rejecting a column
+limit <- round(nrow(APSESMsa) * (2/3))
+
+# iterate over all columns, and write TRUE if there are less-or-equal to "limit"
+# hyphens, FALSE if there are more - i.e. TRUE columns will be used for analysis
+# and FALSE columns will be rejected. (seq_len() is safe for zero columns,
+# whereas 1:0 would iterate over 1 and 0.)
+for (i in seq_len(ncol(msaMatrix))) {
+  count <- sum(msaMatrix[ , i] == "-")
+  colMask[i] <- count <= limit # TRUE if less-or-equal to limit, FALSE if not
+}
+# The same mask can be computed without a loop:
+#   colMask <- colSums(msaMatrix == "-") <= limit
+
+# Inspect the mask
+colMask
+
+# How many positions are being kept?
+sum(colMask)
+
+cat(sprintf("We are masking %4.2f %% of alignment columns.\n",
+            100 * (1 - (sum(colMask) / length(colMask)))))
+
+
+# Next, we use colMask to remove the masked columns from the matrix
+# in one step:
+# (drop = FALSE keeps the result a matrix even if only a single column
+# survives the mask; without it the result would collapse to a vector, and
+# nrow() and rownames() below would return NULL.)
+maskedMatrix <- msaMatrix[ , colMask, drop = FALSE]
+
+# check:
+ncol(maskedMatrix)
+
+# ... then collapse each row of single characters back into a string ...
+APSESphyloSet <- character(nrow(maskedMatrix))   # preallocate one per row
+for (i in seq_len(nrow(maskedMatrix))) {
+  APSESphyloSet[i] <- paste(maskedMatrix[i, ], collapse="")
+}
+names(APSESphyloSet) <- rownames(maskedMatrix)
+
+# inspect ...
+writeALN(APSESphyloSet)
+
+# As you see, we have removed a three residue insertion from MBP1_NEUCR, and
+# several indels from the KILA_ESCCO outgroup sequence.
+
+
+# We save the aligned, masked domains to a file in the data/ directory,
+# in multi-FASTA format.
+writeMFA(APSESphyloSet, myCon = "data/APSESphyloSet.mfa")
+
+
+
+# [END]
--- a/BIN-PHYLO-Tree_analysis.R
+++ b/BIN-PHYLO-Tree_analysis.R
@ -1,406 +1,406 @@
-# tocID <- "BIN-PHYLO-Tree_analysis.R"
-#
-# Purpose:  A Bioinformatics Course:
-#              R code accompanying the BIN-PHYLO-Tree_analysis unit.
-#
-# Version:  1.2
-#
-# Date:     2017-10  -  2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           1.2    2020 updates. Deprecate iTol and use taxize:: instead.
-#                  Rewrite of tip re-ordering. Better handling of
-#                  messages. pBar() for randomization.
-#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout,
-#                      use Biocmanager:: not biocLite()
-#           1.0.2  Typo in variable name, style changes
-#           1.0.1  Wrong section heading
-#           1.0    First 2017 version
-#           0.1    First code copied from 2016 material.
-#
-#
-# TODO:
-#
-#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC> 
-#TOC>   Section  Title                              Line
-#TOC> --------------------------------------------------
-#TOC>   1        Preparation and Tree Plot            50
-#TOC>   2        SPECIES REFERENCE TREE               66
-#TOC>   3        Tree Analysis                       117
-#TOC>   3.1        Rooting Trees                     177
-#TOC>   3.2        Rotating Clades                   222
-#TOC>   3.3        Computing tree distances          309
-#TOC> 
-#TOC> ==========================================================================
-
-
-# =    1  Preparation and Tree Plot  ===========================================
-
-
-if (! requireNamespace("ape", quietly = TRUE)) {
-  install.packages("ape")
-}
-# Package information:
-#  library(help = ape)       # basic information
-#  browseVignettes("ape")    # available vignettes
-#  data(package = "ape")     # available datasets
-
-# We change the graphics parameters from time to time, let's define the
-# default so we can recreate a sane state:
-dev.off()
-PAR <- par()
-
-# =    2  SPECIES REFERENCE TREE  ==============================================
-
-# Before we do any kind of phylogenetic analysis of genes from several species,
-# we MUST have a reference tree of the taxonomic relationships in hand. This
-# context is absolutely required for the interpretation of our tree.
-
-# We have the tax-ids in our database, and the NCBI has the species tree - we just need some way to extract the subtree that corresponds to our taxons of interest. Here's how to use the taxize:: package.
-
-if (! requireNamespace("taxize", quietly = TRUE)) {
-  install.packages("taxize")
-}
-# Package information:
-#  library(help   = taxize)       # basic information
-#  browseVignettes("taxize")    # available vignettes
-#  data(package  = "taxize")     # available datasets
-
-( mySOI <- c(myDB$taxonomy$ID, "83333") )
-myClass <- taxize::classification(mySOI, db = "ncbi")
-str(myClass)
-
-myClass[[1]]
-
-fungiTree <- taxize::class2tree(myClass, check = TRUE)
-plot(fungiTree)
-
-# The tree produced by taxize:: contains full length species names,
-# but it would be more convenient if it had bicodes instead. Also, the actual
-# tree is only part of the list(), which will cause problems later:
-str(fungiTree)
-
-# we therefor simplify
-fungiTree <- fungiTree$phylo
-str(fungiTree)
-
-# The species names are in a vector $phylo$tip.label of this list.
-# We can use biCode() to shorten them.
-fungiTree$tip.label <- biCode(fungiTree$tip.label)
-
-# Plot the tree
-nSP <- length(fungiTree$tip.label)
-plot(fungiTree, cex = 0.8, root.edge = TRUE, no.margin = TRUE)
-text(-1, nSP - 0.5, "Species Tree:\nFungi", pos = 4)
-ape::nodelabels(text = fungiTree$node.label,
-                cex = 0.6,
-                adj = 0.2,
-                bg = "#D4F2DA")
-# Note that you can use the arrow buttons in the menu above the plot pane to
-# scroll back to plots you have created earlier - so you can reference back to
-# this species tree in your later analysis.
-
-
-# =    3  Tree Analysis  =======================================================
-
-
-# 1.1  Visualizing your tree
-# The trees that are produced by Rphylip are stored as an object of class
-# "phylo". This is a class for phylogenetic trees that is widely used in the
-# community, practically all R phylogenetics packages will options to read and
-# manipulate such trees. Outside of R, a popular interchange format is the
-# Newick_format that you have seen above. It's easy to output your calculated
-# trees in Newick format and visualize them elsewhere.
-
-# The "phylo" class object is one of R's "S3" objects and methods to plot and
-# print it have been defined with the Rphylip package, and in ape. You can
-# simply call plot(<your-tree>) and R knows what to do with <your-tree> and how
-# to plot it. The underlying function is plot.phylo(), and documentation for its
-# many options can by found by typing:
-
-?plot.phylo
-
-# We load the APSES sequence tree that you produced in the
-# BIN-PHYLO-Tree_building unit:
-apsTree <- readRDS(file = "data/APSEStreeRproml.rds")
-
-plot(apsTree) # default type is "phylogram"
-plot(apsTree, type = "unrooted")
-plot(apsTree, type = "fan", no.margin = TRUE)
-
-# rescale to show all of the labels:
-# record the current plot parameters by assigning them to a variable ...
-(tmp <- plot(apsTree, type="fan", no.margin = TRUE, plot=FALSE))
-# ... and adjust the plot limits for a new plot:
-plot(apsTree,
-     type = "fan",
-     x.lim = tmp$x.lim * 1.8,
-     y.lim = tmp$y.lim * 1.8,
-     cex = 0.8,
-     no.margin = TRUE)
-
-# Inspect the tree object
-str(apsTree)
-apsTree$tip.label
-apsTree$edge
-apsTree$edge.length
-
-# show the node / edge and tip labels on a plot
-plot(apsTree)
-ape::nodelabels()
-ape::edgelabels()
-ape::tiplabels()
-
-# show the number of nodes, edges and tips
-ape::Nnode(apsTree)
-ape::Nedge(apsTree)
-ape::Ntip(apsTree)
-
-par(PAR)   # reset graphics state
-
-# Finally, write the tree to console in Newick format
-ape::write.tree(apsTree)
-
-# ==   3.1  Rooting Trees  =====================================================
-
-# In order to analyse the tree, it is helpful to root it first and reorder its
-# clades. Contrary to documentation, Rproml() returns an unrooted tree.
-
-ape::is.rooted(apsTree)
-
-# You can root the tree with the command root() from the "ape" package.
-
-plot(apsTree)
-
-# add labels for internal nodes and tips
-ape::nodelabels(cex = 0.5, frame = "circle")
-ape::tiplabels(cex = 0.5, frame = "rect")
-
-# The outgroup of the tree (KILA ESCCO) is tip "11" in my sample tree, it may be a different
-# number in yours. Substitute the correct node number below for "outgroup".
-apsTree <- ape::root(apsTree, outgroup = 11, resolve.root = TRUE)
-plot(apsTree)
-ape::is.rooted(apsTree)
-
-# This tree _looks_ unchanged, beacuse when the root trifurcation was resolved,
-# an edge of length zero was added to connect the MRCA (Most Recent Common
-# Ancestor) of the ingroup.
-
-# The edge lengths are stored in the phylo object:
-apsTree$edge.length
-
-# ... and you can assign a small arbitrary value to the edge
-# to show how it connects to the tree without having an
-# overlap.
-apsTree$edge.length[1] <- 0.1
-plot(apsTree, cex = 0.7)
-ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866")
-
-
-# This procedure does however not assign an actual length to a root edge, and
-# therefore no root edge is visible on the plot. Why? , you might ask. I ask
-# myself that too. We'll just add a length by hand.
-
-apsTree$root.edge <- mean(apsTree$edge.length) * 1.5
-plot(apsTree, cex = 0.7, root.edge = TRUE)
-ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866")
-
-
-# ==   3.2  Rotating Clades  ===================================================
-
-# To interpret the tree, it is useful to rotate the clades so that they appear
-# in the order expected from the cladogram of species.
-
-# We can either rotate around individual internal nodes ...
-layout(matrix(1:2, 1, 2))
-plot(apsTree, no.margin = TRUE, root.edge = TRUE)
-ape::nodelabels(node = 13, cex = 0.7, bg = "#ff8866")
-plot(ape::rotate(apsTree, node = 13), no.margin = TRUE, root.edge = TRUE)
-ape::nodelabels(node = 13, cex = 0.7, bg = "#88ff66")
-# Note that the species at the bottom of the clade descending from node
-# 17 is now plotted at the top.
-
-par(PAR)   # reset graphics state
-
-# ... or we can rearrange the tree so it corresponds as well as possible to a
-# predefined tip ordering. Here we use the ordering that taxize:: has inferred
-# from the NCBI taxonomic classification.
-
-nOrg <- length(apsTree$tip.label)
-
-plot(fungiTree,
-     no.margin = FALSE, root.edge = TRUE)
-ape::nodelabels(text = fungiTree$node.label,
-                cex = 0.5,
-                adj = 0.2,
-                bg = "#D4F2DA")
-
-# These are the fungi tree tips ...
-fungiTree$tip.label
-# ... and their order is determined by the edge-list that is stored in
-fungiTree$edge
-# which edges join the tips?
-ape::tiplabels(cex = 0.5, frame = "rect")
-# as you can see, the tips (range [1:nOrg] ) are in column 2 and they are
-# ordered from bottom to top.
-# And each tip number is the index of the species in the tip.label vector. So we can take column 2, subset it, and use it to get a list of species in the order of the tree ...
-
-sel <- fungiTree$edge[ , 2 ] <= nOrg
-( oSp <- fungiTree$tip.label[fungiTree$edge[sel , 2 ]] )
-
-# Now, here are the genes of the apsTree tips ...
-apsTree$tip.label
-
-# ... and the "constraint"  we need for reordering, according to the help page
-# of ape::rotateConstr(), is "a vector specifying the order of the tips as they
-# should appear (from bottom to top)". Thus we need to add the "MBP1_" prefix to our vector
-oSp <- gsub("^", "MBP1_", oSp)
-( oSp <- gsub("MBP1_ESSCO", "KILA_ESCCO", oSp) )
-
-# Then we can plot the two trees to compare: the fungi- tree
-par(PAR)   # reset graphics state
-layout(matrix(1:2, 1, 2))
-plot(fungiTree,
-    no.margin = TRUE,
-     root.edge = TRUE)
-ape::nodelabels(text = fungiTree$node.label,
-                cex = 0.5,
-                adj = 0.2,
-                bg = "#D4F2DA")
-
-# and the re-organized apsesTree ...
-plot(ape::rotateConstr(apsTree, constraint = oSp[]),
-     no.margin = TRUE,
-     root.edge = TRUE)
-
-par(PAR)   # reset graphics state
-
-# As you can see, the reordering is not perfect, since the topologies are
-# different, mostly due to the unresolved nodes in the reference tree. One
-# could play with that ...
-
-
-# Task: Study the two trees and consider their similarities and differences.
-#         What do you expect? What do you find? Note that this is not a "mixed"
-#         gene tree yet, since it contains only a single gene for the species
-#         we considered. All of the branch points in this tree are speciation
-#         events. Thus the gene tree should have the same topology as the
-#         species tree. Does it? Are the differences important? How many
-#         branches would you need to remove and reinsert elsewhere to get the
-#         same topology as the species tree?
-
-# In order to quantify how different these two trees are, we need to compute
-# tree distances.
-
-
-# ==   3.3  Computing tree distances  ==========================================
-
-
-# Many superb phylogeny tools are contributed by the phangorn package.
-
-if (! requireNamespace("phangorn", quietly = TRUE)) {
-  install.packages("phangorn")
-}
-# Package information:
-#  library(help = phangorn)       # basic information
-#  browseVignettes("phangorn")    # available vignettes
-#  data(package = "phangorn")     # available datasets
-
-# To compare two trees, they must have the same tip labels. We delete "MBP1_" or
-# "KILA_" from the existing tip labels in a copy of our APSES domain tree.
-apsTree2 <- apsTree
-apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label)
-
-
-# phangorn provides several functions to compute tree-differences (and there
-# is a _whole_ lot of theory on how to compare trees). treedist() returns the
-# "symmetric difference"
-phangorn::treedist(fungiTree, apsTree2, check.labels = TRUE)
-
-# Numbers. What do they mean? How much more similar is our apsTree to the
-# (presumably) ground truth of fungiTree than a random tree would be?
-# The ape package provides the function rtree()
-# to compute random trees.
-
-ape::rtree(n = length(apsTree2$tip.label), # number of tips
-          rooted = TRUE,                   # we rooted the tree above,
-                                           #  and fungiTree is rooted anyway
-          tip.label = apsTree2$tip.label,  # use the apsTree2 labels
-          br = NULL)                       # don't generate branch lengths since
-                                           #   fungiTree has none, so we can't
-                                           #   compare them anyway.
-
-# (Note the warning message about non-binary trees; we'll suppress that later
-#  by wrapping the function call in supressMessages(); we don't want to
-#  print it 10,000 times :-)
-
-
-# Let's compute some random trees this way, calculate the distances to
-# fungiTree, and then compare the values we get for apsTree2. The random
-# trees are provided by ape::rtree().
-
-N <- 10000  # takes about 15 seconds, and we'll use the pBar function,
-            # defined in .utilities.R  to keep track of where we are at:
-myTreeDistances <- matrix(numeric(N * 2), ncol = 2)
-colnames(myTreeDistances) <- c("symm", "path")
-
-set.seed(112358)
-for (i in 1:N) {
-  pBar(i, N)
-  xTree <- ape::rtree(n = length(apsTree2$tip.label),
-                      rooted = TRUE,
-                      tip.label = apsTree2$tip.label,
-                      br = NULL)
-  myTreeDistances[i, ] <- suppressMessages(phangorn::treedist(fungiTree, xTree))
-}
-set.seed(NULL)                      # reset the random number generator
-
-table(myTreeDistances[, "symm"])
-
-( symmObs <- phangorn::treedist(fungiTree, apsTree2)[1] )
-
-# Random events less-or-equal to observation, divided by total number of
-# events gives us the empirical p-value.
-cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n",
-            (sum(myTreeDistances[ , "symm"] <= symmObs) + 1) / (N + 1)))
-
-par(PAR)   # reset graphics state
-hist(myTreeDistances[, "path"],
-     col = "aliceblue",
-     main = "Distances of random Trees to fungiTree")
-(pathObs <- phangorn::treedist(fungiTree, apsTree2)[2])
-abline(v = pathObs, col = "chartreuse")
-
-# Random events less-or-equal to observation, divided by total number of
-# events gives us the empirical p-value.
-cat(sprintf("\nEmpirical p-value for path diff. of observed tree is %1.4f\n",
-            (sum(myTreeDistances[ , "path"] <= symmObs) + 1) / (N + 1)))
-
-# Indeed, our apsTree is _very_ much more similar to the species tree than
-# we would expect by random chance.
-
-# What do we gain from that analysis? Analyzing the tree we get from a single
-# gene of orthologous sequences is a positive control in our computational
-# experiment. If these genes are indeed orthologues, a correct tree-building
-# program ought to give us a tree that exactly matches the species tree.
-# Evaluating how far off we are from the known correct result gives us a way to
-# validate our workflow and our algorithm. If we can't get that right, we can't
-# expect to get "real" data right either. Employing such positive controls in
-# every computational experiment is essential for research. Not doing so is
-# Cargo Cult Bioinformatics.
-
-
-# [END]
+# tocID <- "BIN-PHYLO-Tree_analysis.R"
+#
+# Purpose:  A Bioinformatics Course:
+#              R code accompanying the BIN-PHYLO-Tree_analysis unit.
+#
+# Version:  1.2
+#
+# Date:     2017-10  -  2020-09
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.2    2020 updates. Deprecate iTol and use taxize:: instead.
+#                  Rewrite of tip re-ordering. Better handling of
+#                  messages. pBar() for randomization.
+#           1.1    Change from require() to requireNamespace(),
+#                      use <package>::<function>() idiom throughout,
+#                      use Biocmanager:: not biocLite()
+#           1.0.2  Typo in variable name, style changes
+#           1.0.1  Wrong section heading
+#           1.0    First 2017 version
+#           0.1    First code copied from 2016 material.
+#
+#
+# TODO:
+#
+#
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+# going on. That's not how it works ...
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                              Line
+#TOC> --------------------------------------------------
+#TOC>   1        Preparation and Tree Plot            50
+#TOC>   2        SPECIES REFERENCE TREE               66
+#TOC>   3        Tree Analysis                       117
+#TOC>   3.1        Rooting Trees                     177
+#TOC>   3.2        Rotating Clades                   222
+#TOC>   3.3        Computing tree distances          309
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  Preparation and Tree Plot  ===========================================
+
+
+if (! requireNamespace("ape", quietly = TRUE)) {
+  install.packages("ape")
+}
+# Package information:
+#  library(help = ape)       # basic information
+#  browseVignettes("ape")    # available vignettes
+#  data(package = "ape")     # available datasets
+
+# We change the graphics parameters from time to time, so we record the settable
+# defaults (read-only ones can't be restored) to recreate a sane state later:
+if (dev.cur() > 1) { dev.off() }   # close the current device only if one is open
+PAR <- par(no.readonly = TRUE)
+
+# =    2  SPECIES REFERENCE TREE  ==============================================
+
+# Before we do any kind of phylogenetic analysis of genes from several species,
+# we MUST have a reference tree of the taxonomic relationships in hand. This
+# context is absolutely required for the interpretation of our tree.
+
+# We have the tax-ids in our database, and the NCBI has the species tree - we just need some way to extract the subtree that corresponds to our taxons of interest. Here's how to use the taxize:: package.
+
+if (! requireNamespace("taxize", quietly = TRUE)) {
+  install.packages("taxize")
+}
+# Package information:
+#  library(help   = taxize)       # basic information
+#  browseVignettes("taxize")    # available vignettes
+#  data(package  = "taxize")     # available datasets
+
+( mySOI <- c(myDB$taxonomy$ID, "83333") )
+myClass <- taxize::classification(mySOI, db = "ncbi")
+str(myClass)
+
+myClass[[1]]
+
+fungiTree <- taxize::class2tree(myClass, check = TRUE)
+plot(fungiTree)
+
+# The tree produced by taxize:: contains full length species names,
+# but it would be more convenient if it had bicodes instead. Also, the actual
+# tree is only part of the list(), which will cause problems later:
+str(fungiTree)
+
+# we therefore simplify
+fungiTree <- fungiTree$phylo
+str(fungiTree)
+
+# The species names are in a vector $phylo$tip.label of this list.
+# We can use biCode() to shorten them.
+fungiTree$tip.label <- biCode(fungiTree$tip.label)
+
+# Plot the tree
+nSP <- length(fungiTree$tip.label)
+plot(fungiTree, cex = 0.8, root.edge = TRUE, no.margin = TRUE)
+text(-1, nSP - 0.5, "Species Tree:\nFungi", pos = 4)
+ape::nodelabels(text = fungiTree$node.label,
+                cex = 0.6,
+                adj = 0.2,
+                bg = "#D4F2DA")
+# Note that you can use the arrow buttons in the menu above the plot pane to
+# scroll back to plots you have created earlier - so you can reference back to
+# this species tree in your later analysis.
+
+
+# =    3  Tree Analysis  =======================================================
+
+
+# 1.1  Visualizing your tree
+# The trees that are produced by Rphylip are stored as an object of class
+# "phylo". This is a class for phylogenetic trees that is widely used in the
+# community, practically all R phylogenetics packages will have options to read and
+# manipulate such trees. Outside of R, a popular interchange format is the
+# Newick_format that you have seen above. It's easy to output your calculated
+# trees in Newick format and visualize them elsewhere.
+
+# The "phylo" class object is one of R's "S3" objects and methods to plot and
+# print it have been defined with the Rphylip package, and in ape. You can
+# simply call plot(<your-tree>) and R knows what to do with <your-tree> and how
+# to plot it. The underlying function is plot.phylo(), and documentation for its
+# many options can be found by typing:
+
+?plot.phylo
+
+# We load the APSES sequence tree that you produced in the
+# BIN-PHYLO-Tree_building unit:
+apsTree <- readRDS(file = "data/APSEStreeRproml.rds")
+
+plot(apsTree) # default type is "phylogram"
+plot(apsTree, type = "unrooted")
+plot(apsTree, type = "fan", no.margin = TRUE)
+
+# rescale to show all of the labels:
+# record the current plot parameters by assigning them to a variable ...
+(tmp <- plot(apsTree, type="fan", no.margin = TRUE, plot=FALSE))
+# ... and adjust the plot limits for a new plot:
+plot(apsTree,
+     type = "fan",
+     x.lim = tmp$x.lim * 1.8,
+     y.lim = tmp$y.lim * 1.8,
+     cex = 0.8,
+     no.margin = TRUE)
+
+# Inspect the tree object
+str(apsTree)
+apsTree$tip.label
+apsTree$edge
+apsTree$edge.length
+
+# show the node / edge and tip labels on a plot
+plot(apsTree)
+ape::nodelabels()
+ape::edgelabels()
+ape::tiplabels()
+
+# show the number of nodes, edges and tips
+ape::Nnode(apsTree)
+ape::Nedge(apsTree)
+ape::Ntip(apsTree)
+
+par(PAR)   # reset graphics state
+
+# Finally, write the tree to console in Newick format
+ape::write.tree(apsTree)
+
+# ==   3.1  Rooting Trees  =====================================================
+
+# In order to analyse the tree, it is helpful to root it first and reorder its
+# clades. Contrary to documentation, Rproml() returns an unrooted tree.
+
+ape::is.rooted(apsTree)
+
+# You can root the tree with the command root() from the "ape" package.
+
+plot(apsTree)
+
+# add labels for internal nodes and tips
+ape::nodelabels(cex = 0.5, frame = "circle")
+ape::tiplabels(cex = 0.5, frame = "rect")
+
+# The outgroup of the tree (KILA ESCCO) is tip "11" in my sample tree, it may be a different
+# number in yours. Substitute the correct node number below for "outgroup".
+apsTree <- ape::root(apsTree, outgroup = 11, resolve.root = TRUE)
+plot(apsTree)
+ape::is.rooted(apsTree)
+
+# This tree _looks_ unchanged, because when the root trifurcation was resolved,
+# an edge of length zero was added to connect the MRCA (Most Recent Common
+# Ancestor) of the ingroup.
+
+# The edge lengths are stored in the phylo object:
+apsTree$edge.length
+
+# ... and you can assign a small arbitrary value to the edge
+# to show how it connects to the tree without having an
+# overlap.
+apsTree$edge.length[1] <- 0.1
+plot(apsTree, cex = 0.7)
+ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866")
+
+
+# This procedure does however not assign an actual length to a root edge, and
+# therefore no root edge is visible on the plot. Why? , you might ask. I ask
+# myself that too. We'll just add a length by hand.
+
+apsTree$root.edge <- mean(apsTree$edge.length) * 1.5
+plot(apsTree, cex = 0.7, root.edge = TRUE)
+ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866")
+
+
+# ==   3.2  Rotating Clades  ===================================================
+
+# To interpret the tree, it is useful to rotate the clades so that they appear
+# in the order expected from the cladogram of species.
+
+# We can either rotate around individual internal nodes ...
+layout(matrix(1:2, 1, 2))
+plot(apsTree, no.margin = TRUE, root.edge = TRUE)
+ape::nodelabels(node = 13, cex = 0.7, bg = "#ff8866")
+plot(ape::rotate(apsTree, node = 13), no.margin = TRUE, root.edge = TRUE)
+ape::nodelabels(node = 13, cex = 0.7, bg = "#88ff66")
+# Note that the species at the bottom of the clade descending from node
+# 17 is now plotted at the top.
+
+par(PAR)   # reset graphics state
+
+# ... or we can rearrange the tree so it corresponds as well as possible to a
+# predefined tip ordering. Here we use the ordering that taxize:: has inferred
+# from the NCBI taxonomic classification.
+
+nOrg <- length(apsTree$tip.label)
+
+plot(fungiTree,
+     no.margin = FALSE, root.edge = TRUE)
+ape::nodelabels(text = fungiTree$node.label,
+                cex = 0.5,
+                adj = 0.2,
+                bg = "#D4F2DA")
+
+# These are the fungi tree tips ...
+fungiTree$tip.label
+# ... and their order is determined by the edge-list that is stored in
+fungiTree$edge
+# which edges join the tips?
+ape::tiplabels(cex = 0.5, frame = "rect")
+# as you can see, the tips (range [1:nOrg] ) are in column 2 and they are
+# ordered from bottom to top.
+# And each tip number is the index of the species in the tip.label vector. So we can take column 2, subset it, and use it to get a list of species in the order of the tree ...
+
+sel <- fungiTree$edge[ , 2 ] <= nOrg
+( oSp <- fungiTree$tip.label[fungiTree$edge[sel , 2 ]] )
+
+# Now, here are the genes of the apsTree tips ...
+apsTree$tip.label
+
+# The "constraint" for ape::rotateConstr() is, per its help page, "a vector
+# specifying the order of the tips as they should appear (from bottom to top)".
+# Add the "MBP1_" prefix, then rename the outgroup "MBP1_ESCCO" to "KILA_ESCCO".
+oSp <- paste0("MBP1_", oSp)
+( oSp <- sub("MBP1_ESCCO", "KILA_ESCCO", oSp, fixed = TRUE) )
+
+# Then we can plot the two trees to compare: the fungi- tree
+par(PAR)   # reset graphics state
+layout(matrix(1:2, 1, 2))
+plot(fungiTree,
+    no.margin = TRUE,
+     root.edge = TRUE)
+ape::nodelabels(text = fungiTree$node.label,
+                cex = 0.5,
+                adj = 0.2,
+                bg = "#D4F2DA")
+
+# and the re-organized apsesTree ...
+plot(ape::rotateConstr(apsTree, constraint = oSp[]),
+     no.margin = TRUE,
+     root.edge = TRUE)
+
+par(PAR)   # reset graphics state
+
+# As you can see, the reordering is not perfect, since the topologies are
+# different, mostly due to the unresolved nodes in the reference tree. One
+# could play with that ...
+
+
+# Task: Study the two trees and consider their similarities and differences.
+#         What do you expect? What do you find? Note that this is not a "mixed"
+#         gene tree yet, since it contains only a single gene for the species
+#         we considered. All of the branch points in this tree are speciation
+#         events. Thus the gene tree should have the same topology as the
+#         species tree. Does it? Are the differences important? How many
+#         branches would you need to remove and reinsert elsewhere to get the
+#         same topology as the species tree?
+
+# In order to quantify how different these two trees are, we need to compute
+# tree distances.
+
+
+# ==   3.3  Computing tree distances  ==========================================
+
+
+# Many superb phylogeny tools are contributed by the phangorn package.
+
+if (! requireNamespace("phangorn", quietly = TRUE)) {
+  install.packages("phangorn")
+}
+# Package information:
+#  library(help = phangorn)       # basic information
+#  browseVignettes("phangorn")    # available vignettes
+#  data(package = "phangorn")     # available datasets
+
+# To compare two trees, they must have the same tip labels. We delete "MBP1_" or
+# "KILA_" from the existing tip labels in a copy of our APSES domain tree.
+apsTree2 <- apsTree
+apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label)
+
+
+# phangorn provides several functions to compute tree-differences (and there
+# is a _whole_ lot of theory on how to compare trees). treedist() returns the
+# "symmetric difference" and the "path difference" of the two trees.
+phangorn::treedist(fungiTree, apsTree2, check.labels = TRUE)
+
+# Numbers. What do they mean? How much more similar is our apsTree to the
+# (presumably) ground truth of fungiTree than a random tree would be?
+# The ape package provides the function rtree()
+# to compute random trees.
+
+ape::rtree(n = length(apsTree2$tip.label), # number of tips
+          rooted = TRUE,                   # we rooted the tree above,
+                                           #  and fungiTree is rooted anyway
+          tip.label = apsTree2$tip.label,  # use the apsTree2 labels
+          br = NULL)                       # don't generate branch lengths since
+                                           #   fungiTree has none, so we can't
+                                           #   compare them anyway.
+
+# (Note the warning message about non-binary trees; we'll suppress that later
+#  by wrapping the function call in suppressMessages(); we don't want to
+#  print it 10,000 times :-)
+
+
+# Let's compute some random trees this way, calculate the distances to
+# fungiTree, and then compare the values we get for apsTree2. The random
+# trees are provided by ape::rtree().
+
+N <- 10000  # takes about 15 seconds, and we'll use the pBar function,
+            # defined in .utilities.R  to keep track of where we are at:
+myTreeDistances <- matrix(numeric(N * 2), ncol = 2)
+colnames(myTreeDistances) <- c("symm", "path")
+
+set.seed(112358)
+for (i in 1:N) {
+  pBar(i, N)
+  xTree <- ape::rtree(n = length(apsTree2$tip.label),
+                      rooted = TRUE,
+                      tip.label = apsTree2$tip.label,
+                      br = NULL)
+  myTreeDistances[i, ] <- suppressMessages(phangorn::treedist(fungiTree, xTree))
+}
+set.seed(NULL)                      # reset the random number generator
+
+table(myTreeDistances[, "symm"])
+
+( symmObs <- phangorn::treedist(fungiTree, apsTree2)[1] )
+
+# Random events less-or-equal to observation, divided by total number of
+# events gives us the empirical p-value.
+cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n",
+            (sum(myTreeDistances[ , "symm"] <= symmObs) + 1) / (N + 1)))
+
+par(PAR)   # reset graphics state
+hist(myTreeDistances[, "path"],
+     col = "aliceblue",
+     main = "Distances of random Trees to fungiTree")
+(pathObs <- phangorn::treedist(fungiTree, apsTree2)[2])
+abline(v = pathObs, col = "chartreuse")
+
+# Empirical p-value: random path distances less-or-equal to the observed path
+# distance (pathObs, not symmObs), plus one, divided by (N + 1).
+cat(sprintf("\nEmpirical p-value for path diff. of observed tree is %1.4f\n",
+            (sum(myTreeDistances[ , "path"] <= pathObs) + 1) / (N + 1)))
+
+# Indeed, our apsTree is _very_ much more similar to the species tree than
+# we would expect by random chance.
+
+# What do we gain from that analysis? Analyzing the tree we get from a single
+# gene of orthologous sequences is a positive control in our computational
+# experiment. If these genes are indeed orthologues, a correct tree-building
+# program ought to give us a tree that exactly matches the species tree.
+# Evaluating how far off we are from the known correct result gives us a way to
+# validate our workflow and our algorithm. If we can't get that right, we can't
+# expect to get "real" data right either. Employing such positive controls in
+# every computational experiment is essential for research. Not doing so is
+# Cargo Cult Bioinformatics.
+
+
+# [END]
--- a/BIN-PHYLO-Tree_building.R
+++ b/BIN-PHYLO-Tree_building.R
@ -1,168 +1,168 @@
-# tocID <- "BIN-PHYLO-Tree_building.R"
-#
-# Purpose:  A Bioinformatics Course:
-#              R code accompanying the BIN-PHYLO-Tree_building unit.
-#
-# Version:  1.2
-#
-# Date:     2017-10   2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           1.2    deprecate save()/load() for saveRDS()/readRDS(); Mac:
-#                  instructions to authorize proml.app
-#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout,
-#           1.0    First 2017 version
-#           0.1    First code copied from 2016 material.
-#
-#
-# TODO:
-#           Add MrBayes
-# https://cran.r-project.org/web/packages/phangorn/vignettes/IntertwiningTreesAndNetworks.html
-#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC> 
-#TOC>   Section  Title                                       Line
-#TOC> -----------------------------------------------------------
-#TOC>   1        Calculating Trees                             48
-#TOC>   1.1        PROMLPATH ...                               68
-#TOC>   1.1.1          ... on the Mac                          73
-#TOC>   1.1.2          ... on Windows                         101
-#TOC>   1.1.3          ... on Linux                           115
-#TOC>   1.1.4          Confirming PROMLPATH                   120
-#TOC>   1.2        Building a maximum likelihood tree         134
-#TOC> 
-#TOC> ==========================================================================
-
-
-# =    1  Calculating Trees  ===================================================
-
-
-# Follow the instructions found at phylip's home on the Web to install. If you
-# are on a Windows computer, take note of the installation directory.
-
-# After you have installed Phylip on your computer, install the R package that
-# provides an interface to the Phylip functions.
-
-if (! requireNamespace("Rphylip", quietly = TRUE)) {
-  install.packages("Rphylip")
-}
-# Package information:
-#  library(help = Rphylip)       # basic information
-#  browseVignettes("Rphylip")    # available vignettes
-#  data(package = "Rphylip")     # available datasets
-
-# This will install RPhylip, as well as its dependency, the package "ape".
-
-
-# ==   1.1  PROMLPATH ...  =====================================================
-# The next part may be tricky. You will need to figure out where
-# on your computer Phylip has been installed and define the path
-# to the proml program that calculates a maximum-likelihood tree.
-
-# ===   1.1.1  ... on the Mac                    
-# On the Mac, the standard installation places a phylip folder
-# in the /Applications directory. That folder contains all the
-# individual phylip programs as <name>.app files. These are not
-# the actual executables, but "app" files are actually directories
-# that contain the required resources for a program to run.
-
-# The executable is in a subdirectory and you can point Rphylip
-# directly to that subdirectory to find the program it needs:
-# PROMLPATH <- "/Applications/phylip-3.695/exe/proml.app/Contents/MacOS"
-
-# However, RPHYLIP will not be able to run PHYLIP applications immediately,
-# because they have not been "signed" by the PHYLIP developers. The process
-# will terminate by your system, with a warning.
-
-#   -  Navigate to the phylip folder in your ~/Applications directory
-#   -  Descend into the "exe" folder and find  proml.app
-#   -  Ctrl-click  proml.app  and choose "Open". A dialogue will show that
-#      says: "macOS cannot verify the developer of “proml.app”.
-#             Are you sure you want to open it?"
-#   -  Click open to continue. You may need to allow access to the terminal
-#      as well. When the proml terminal session open, you can type
-#      Ctrl-c to abort the program and close the window.
-#
-#   This adds proml.app to the list of known-good programs and you will not
-#   need to repeat this process.
-#
-
-# ===   1.1.2  ... on Windows                    
-# On Windows you need to know where the programs have been installed, and you
-# need to specify a path that is correct for the Windows OS. Find the folder
-# that is named "exe", and right-click to inspect its properties. The path
-# should be listed among them.
-
-# If the path looks like "C:\Users\Meng\Programs\phylip-3.695\exe", then your
-# assignment has to be
-# PROMLPATH <- "C:/Users/Meng/Programs/phylip-3.695/exe"
-# (Note: "/", not "\")
-
-# I have heard that your path must not contain spaces, and it is prudent to
-# avoid other special characters as well.
-
-# ===   1.1.3  ... on Linux                      
-# If you are running Linux I trust you know what to do. It's probably
-# something like
-# PROMLPATH <- "/usr/local/phylip-3.695/bin"
-
-# ===   1.1.4  Confirming PROMLPATH              
-# Confirm that the settings are right.
-PROMLPATH                # returns the path
-list.dirs(PROMLPATH)     # returns the directories in that path
-list.files(PROMLPATH)    # lists the files [1] "proml"   "proml.command"
-
-# If "proml" is NOT among the files that the last command returns, you
-# can't continue. Ask on the mailing list for advice.
-
-# If everything is good, you can add the line that defines PROMLPATH to
-# myScripts/.myProfile.R - the path will then be automatically set when
-# you quit RStudio and return.
-
-
-# ==   1.2  Building a maximum likelihood tree  ================================
-# Now read the mfa file you have saved in the BIB-PHYLO-Data_preparation unit,
-# as a "proseq" object with the read.protein() function of the RPhylip package:
-
-apsIn <- Rphylip::read.protein("data/APSESphyloSet.mfa")
-str(apsIn)
-
-# ... and you are ready to build a tree.
-
-# There are many fast options in PHYLIP - we will use the most _accurate_ one
-# that it has: proml, a maximum-likelihood tree building program for protein
-# data.
-
-# Building maximum-likelihood trees can eat as much computer time
-# as you can throw at it. Calculating a tree of 48 APSES domains
-# with default parameters of Rproml() runs for more than half a day
-# on my computer. But we have only twelve sequences here, so the
-# process will take us about 5 to 15 minutes. Run this, and anjoy a good cup
-# of coffee while you are waiting.
-
-apsTree <- Rphylip::Rproml(apsIn, path=PROMLPATH)
-
-# A quick first look:
-
-plot(apsTree)
-
-# save your tree:
-saveRDS(apsTree, file = "data/APSEStreeRproml.rds")
-
-# If this did not work, ask for advice.
-
-
-
-
-# [END]
+# tocID <- "BIN-PHYLO-Tree_building.R"
+#
+# Purpose:  A Bioinformatics Course:
+#              R code accompanying the BIN-PHYLO-Tree_building unit.
+#
+# Version:  1.2
+#
+# Date:     2017-10  -  2020-09
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.2    deprecate save()/load() for saveRDS()/readRDS(); Mac:
+#                  instructions to authorize proml.app
+#           1.1    Change from require() to requireNamespace(),
+#                      use <package>::<function>() idiom throughout,
+#           1.0    First 2017 version
+#           0.1    First code copied from 2016 material.
+#
+#
+# TODO:
+#           Add MrBayes
+# https://cran.r-project.org/web/packages/phangorn/vignettes/IntertwiningTreesAndNetworks.html
+#
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+# going on. That's not how it works ...
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                                       Line
+#TOC> -----------------------------------------------------------
+#TOC>   1        Calculating Trees                             48
+#TOC>   1.1        PROMLPATH ...                               68
+#TOC>   1.1.1          ... on the Mac                          73
+#TOC>   1.1.2          ... on Windows                         101
+#TOC>   1.1.3          ... on Linux                           115
+#TOC>   1.1.4          Confirming PROMLPATH                   120
+#TOC>   1.2        Building a maximum likelihood tree         134
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  Calculating Trees  ===================================================
+
+
+# Follow the instructions found at phylip's home on the Web to install. If you
+# are on a Windows computer, take note of the installation directory.
+
+# After you have installed Phylip on your computer, install the R package that
+# provides an interface to the Phylip functions.
+
+if (! requireNamespace("Rphylip", quietly = TRUE)) {
+  install.packages("Rphylip")
+}
+# Package information:
+#  library(help = Rphylip)       # basic information
+#  browseVignettes("Rphylip")    # available vignettes
+#  data(package = "Rphylip")     # available datasets
+
+# This will install Rphylip, as well as its dependency, the package "ape".
+
+
+# ==   1.1  PROMLPATH ...  =====================================================
+# The next part may be tricky. You will need to figure out where
+# on your computer Phylip has been installed and define the path
+# to the proml program that calculates a maximum-likelihood tree.
+
+# ===   1.1.1  ... on the Mac                    
+# On the Mac, the standard installation places a phylip folder
+# in the /Applications directory. That folder contains all the
+# individual phylip programs as <name>.app files. These are not
+# the actual executables, but "app" files are actually directories
+# that contain the required resources for a program to run.
+
+# The executable is in a subdirectory and you can point Rphylip
+# directly to that subdirectory to find the program it needs:
+# PROMLPATH <- "/Applications/phylip-3.695/exe/proml.app/Contents/MacOS"
+
+# However, RPHYLIP will not be able to run PHYLIP applications immediately,
+# because they have not been "signed" by the PHYLIP developers. The process
+# will be terminated by your system, with a warning.
+
+#   -  Navigate to the phylip folder in your /Applications directory
+#   -  Descend into the "exe" folder and find  proml.app
+#   -  Ctrl-click  proml.app  and choose "Open". A dialogue will show that
+#      says: "macOS cannot verify the developer of “proml.app”.
+#             Are you sure you want to open it?"
+#   -  Click open to continue. You may need to allow access to the terminal
+#      as well. When the proml terminal session opens, you can type
+#      Ctrl-c to abort the program and close the window.
+#
+#   This adds proml.app to the list of known-good programs and you will not
+#   need to repeat this process.
+#
+
+# ===   1.1.2  ... on Windows                    
+# On Windows you need to know where the programs have been installed, and you
+# need to specify a path that is correct for the Windows OS. Find the folder
+# that is named "exe", and right-click to inspect its properties. The path
+# should be listed among them.
+
+# If the path looks like "C:\Users\Meng\Programs\phylip-3.695\exe", then your
+# assignment has to be
+# PROMLPATH <- "C:/Users/Meng/Programs/phylip-3.695/exe"
+# (Note: "/", not "\")
+
+# I have heard that your path must not contain spaces, and it is prudent to
+# avoid other special characters as well.
+
+# ===   1.1.3  ... on Linux                      
+# If you are running Linux I trust you know what to do. It's probably
+# something like
+# PROMLPATH <- "/usr/local/phylip-3.695/bin"
+
+# ===   1.1.4  Confirming PROMLPATH              
+# Confirm that the settings are right.
+PROMLPATH                # returns the path
+list.dirs(PROMLPATH)     # returns the directories in that path
+list.files(PROMLPATH)    # lists the files [1] "proml"   "proml.command"
+
+# If "proml" is NOT among the files that the last command returns, you
+# can't continue. Ask on the mailing list for advice.
+
+# If everything is good, you can add the line that defines PROMLPATH to
+# myScripts/.myProfile.R - the path will then be automatically set when
+# you quit RStudio and return.
+
+
+# ==   1.2  Building a maximum likelihood tree  ================================
+# Now read the mfa file you have saved in the BIN-PHYLO-Data_preparation unit,
+# as a "proseq" object with the read.protein() function of the Rphylip package:
+
+apsIn <- Rphylip::read.protein("data/APSESphyloSet.mfa")
+str(apsIn)
+
+# ... and you are ready to build a tree.
+
+# There are many fast options in PHYLIP - we will use the most _accurate_ one
+# that it has: proml, a maximum-likelihood tree building program for protein
+# data.
+
+# Building maximum-likelihood trees can eat as much computer time
+# as you can throw at it. Calculating a tree of 48 APSES domains
+# with default parameters of Rproml() runs for more than half a day
+# on my computer. But we have only twelve sequences here, so the
+# process will take us about 5 to 15 minutes. Run this, and enjoy a good cup
+# of coffee while you are waiting.
+
+apsTree <- Rphylip::Rproml(apsIn, path=PROMLPATH)
+
+# A quick first look:
+
+plot(apsTree)
+
+# save your tree:
+saveRDS(apsTree, file = "data/APSEStreeRproml.rds")
+
+# If this did not work, ask for advice.
+
+
+
+
+# [END]
--- a/BIN-PPI-Analysis.R
+++ b/BIN-PPI-Analysis.R
@ -1,323 +1,323 @@
-# tocID <- "BIN-PPI-Analysis.R"
-#
-#
-# Purpose:  A Bioinformatics Course:
-#              R code accompanying the BIN-PPI-Analysis unit.
-#
-# Version:   1.4
-#
-# Date:     2017-08  -  2020-10
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           1.4    Update vector ID's for betweenness centrality.
-#           1.3    Bugfix: called the wrong function on ENSPsel in l. 220
-#           1.2    2020 Updates; Rewrite for new STRINg V11;
-#                  Deprecate save()/load() for saveRDS()/readRDS()
-#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout,
-#                      use Biocmanager:: not biocLite()
-#           1.0    First live version
-#           0.1    First code copied from 2016 material.
-#
-# TODO:
-#
-#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC>
-#TOC>   Section  Title                                           Line
-#TOC> ---------------------------------------------------------------
-#TOC>   1        Setup and data                                    50
-#TOC>   2        Functional Edges in the Human Proteome            86
-#TOC>   2.1        Cliques                                        129
-#TOC>   2.2        Communities                                    170
-#TOC>   2.3        Betweenness Centrality                         184
-#TOC>   3        biomaRt                                          231
-#TOC>   4        Task for submission                              302
-#TOC>
-#TOC> ==========================================================================
-
-
-# =    1  Setup and data  ======================================================
-
-
-# Not surprisingly, the analysis of PPI networks needs iGraph:
-
-if (! requireNamespace("igraph", quietly = TRUE)) {
-  install.packages("igraph")
-}
-# Package information:
-#  library(help = igraph)       # basic information
-#  browseVignettes("igraph")    # available vignettes
-#  data(package = "igraph")     # available datasets
-
-# In order for you to explore some real, biological networks, I give you a
-# dataframe of functional relationships of human proteins that I have downloaded
-# from the STRING database. The full table has 8.5 million records, here is a
-# subset of records with combined confidence scores > 980
-
-# The selected set of edges with a confidence of > 964 is a dataframe with about
-# 50,000 edges and 8,400 unique proteins. Incidentaly, that's about the size of
-# a fungal proteome. You can load the saved dataframe here (To read more about
-# what the scores mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).
-
-STRINGedges <- readRDS("./data/STRINGedges.rds")
-
-head(STRINGedges)
-
-# Note that STRING has appended the tax-ID for Homo sapiens - 9606 - to the
-# Ensemble transcript identifiers that start with ENSP. We'll remove them:
-
-STRINGedges$a <- gsub("^9606\\.", "", STRINGedges$a)
-STRINGedges$b <- gsub("^9606\\.", "", STRINGedges$b)
-
-head(STRINGedges)
-
-
-# =    2  Functional Edges in the Human Proteome  ==============================
-
-
-# There are many possibilities to explore interesting aspects of biological
-# networks, we will keep with some very simple procedures here but you have
-# to be aware that this is barely scratching the surface of possibilities.
-# However, once the network exists in your computer, it is comparatively
-# easy to find information online about the many, many options to analyze.
-
-
-# Make a graph from this dataframe
-?igraph::graph_from_data_frame
-
-gSTR <- igraph::graph_from_data_frame(STRINGedges, directed = FALSE)
-
-# CAUTION you DON'T want to plot a graph with 8,000 nodes and 50,000 edges -
-# layout of such large graphs is possible, but requires specialized code. Google
-# for <layout large graphs> if you are curious. Also, consider what one can
-# really learn from plotting such a graph ...
-
-# Of course simple computations on this graph are reasonably fast:
-
-compSTR <- igraph::components(gSTR)
-summary(compSTR) # our graph is fully connected!
-
-hist(log(igraph::degree(gSTR)), col="#FEE0AF")
-# this actually does look rather scale-free
-
-(freqRank <- table(igraph::degree(gSTR)))
-plot(log10(as.numeric(names(freqRank)) + 1),
-     log10(as.numeric(freqRank)), type = "b",
-     pch = 21, bg = "#FEE0AF",
-     xlab = "log(Rank)", ylab = "log(frequency)",
-     main = "8,400 nodes from the human functional interaction network")
-
-# This looks very scale-free indeed.
-
-(regressionLine <- lm(log10(as.numeric(freqRank)) ~
-                      log10(as.numeric(names(freqRank)) + 1)))
-abline(regressionLine, col = "firebrick")
-
-# Now explore some more:
-
-# ==   2.1  Cliques  ===========================================================
-
-# Let's find the largest cliques. Remember: a clique is a fully connected
-# subgraph, i.e. a subgraph in which every node is connected to every other.
-# Biological complexes often appear as cliques in interaction graphs.
-
-igraph::clique_num(gSTR)
-# The largest clique has 81 members.
-
-(C <- igraph::largest_cliques(gSTR)[[1]])
-
-# Pick one of the proteins and find out what this fully connected cluster of 81
-# proteins is (you can simply Google for any of the IDs). Is this expected?
-
-# Plot this ...
-R <- igraph::induced_subgraph(gSTR, C) # a graph from a selected set of vertices
-
-# color the vertices along a color spectrum
-vCol <- rainbow(igraph::gorder(R)) # "order" of a graph == number of nodes
-
-# color the edges to have the same color as the originating node
-eCol <- character()
-for (i in seq_along(vCol)) {
-  eCol <- c(eCol, rep(vCol[i], igraph::gorder(R)))
-}
-
-oPar <- par(mar= rep(0,4)) # Turn margins off
-plot(R,
-     layout = igraph::layout_in_circle(R),
-     vertex.size = 3,
-     vertex.color = vCol,
-     edge.color = eCol,
-     edge.width = 0.1,
-     vertex.label = NA)
-par(oPar)
-
-# ... well: remember: a clique means every node is connected to every other
-# node. We have 81 * 81 = 6,561 edges. This is what a matrix model of PPI
-# networks looks like for large complexes.
-
-
-# ==   2.2  Communities  =======================================================
-
-set.seed(112358)                       # set RNG seed for repeatable randomness
-gSTRclusters <- igraph::cluster_infomap(gSTR)
-set.seed(NULL)                         # reset the RNG
-
-igraph::modularity(gSTRclusters) # ... measures how separated the different
-                                 # membership types are from each other
-tMem <- table(igraph::membership(gSTRclusters))
-length(tMem)  # About 700 communities identified
-hist(tMem, breaks = 50, col = "skyblue")  # most clusters are small ...
-range(tMem) # ... but one has > 200 members
-
-
-# ==   2.3  Betweenness Centrality  ============================================
-
-# Let's find the nodes with the 10 - highest betweenness centralities.
-#
-BC <- igraph::centr_betw(gSTR)
-
-# remember: BC$res contains the results
-head(BC$res)
-
-BC$res[1]   # betweenness centrality of node 1 in the graph ...
-# ... which one is node 1?
-igraph::V(gSTR)[1]
-
-# to get the ten-highest nodes, we simply label the elements of BC with their
-# index ...
-names(BC$res) <- as.character(1:length(BC$res))
-
-# ... and then we sort:
-sBC <- sort(BC$res, decreasing = TRUE)
-head(sBC)
-
-# This ordered vector means: node 3 has the highest betweenness centrality,
-# node 721 has the second highest, etc.
-
-(BCsel <- as.numeric(names(sBC)[1:10]))
-
-# We can use the first ten labels to subset the nodes in gSTR and fetch the
-# IDs...
-(ENSPsel <- names(igraph::V(gSTR)[BCsel]))
-
-# Task:
-# =====
-# IMPORTANT, IF YOU INTEND TO SUBMIT YOUR ANALYSIS FOR CREDIT
-# We are going to use these IDs to produce some output for a submitted task:
-# therefore I need you to execute the following line, note the "seal" that this
-# returns, and not change myENSPsel later:
-
-myENSPsel <- selectENSP(ENSPsel)
-
-#  Next, to find what these proteins are...
-
-# We could now Google for all of these IDs to learn more about them. But really,
-# googling for IDs one after the other, that would be lame. Let's instead use
-# the very, very useful biomaRt package to translate these Ensemble IDs into
-# gene symbols.
-
-
-# =    3  biomaRt  =============================================================
-
-
-# IDs are just labels, but for _bio_informatics we need to learn more about the
-# biological function of the genes or proteins that we retrieve via graph data
-# mining. biomaRt is the tool of choice. It's a package distributed by the
-# bioconductor project. This here is not a biomaRt tutorial (that's for another
-# day), simply a few lines of sample code to get you started on the specific use
-# case of retrieving descriptions for ensembl protein IDs.
-
-if (! requireNamespace("BiocManager", quietly = TRUE)) {
-  install.packages("BiocManager")
-}
-if (! requireNamespace("biomaRt", quietly = TRUE)) {
-  BiocManager::install("biomaRt")
-}
-# Package information:
-#  library(help = biomaRt)       # basic information
-#  browseVignettes("biomaRt")    # available vignettes
-#  data(package = "biomaRt")     # available datasets
-
-# define which dataset to use ... this takes a while for download
-myMart <- biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl")
-
-# what filters are defined?
-( filters <- biomaRt::listFilters(myMart) )
-
-
-# and what attributes can we filter for?
-( attributes <- biomaRt::listAttributes(myMart) )
-
-
-# Soooo many options - let's look for the correct name of filters that are
-# useful for ENSP IDs ...
-filters[grep("ENSP", filters$description), ]
-
-# ... and the correct attribute names for gene symbols and descriptions ...
-attributes[grep("symbol", attributes$description, ignore.case = TRUE), ]
-attributes[grep("description", attributes$description, ignore.case = TRUE), ]
-
-
-# ... so we can put this together: here is a syntax example:
-biomaRt::getBM(filters = "ensembl_peptide_id",
-               attributes = c("hgnc_symbol",
-                              "wikigene_description",
-                              "interpro_description",
-                              "phenotype_description"),
-               values = "ENSP00000000442",
-               mart = myMart)
-
-# A simple loop will now get us the information for our 10 most central genes
-# from the human subset of STRING.
-
-CPdefs <- list()  # Since we don't know how many matches one of our queries
-# will return, we'll put the result dataframes into a list.
-
-for (ID in myENSPsel) {
-  CPdefs[[ID]] <- biomaRt::getBM(filters = "ensembl_peptide_id",
-                                 attributes = c("hgnc_symbol",
-                                                "wikigene_description",
-                                                "interpro_description",
-                                                "phenotype_description"),
-                                 values = ID,
-                                 mart = myMart)
-}
-
-
-# So what are the proteins with the ten highest betweenness centralities?
-#  ... are you surprised? (I am! Really.)
-
-
-# =    4  Task for submission  =================================================
-
-# Write a loop that will go through your personalized list of Ensemble IDs and
-#    for each ID:
-#    --  print the ID,
-#    --  print the first row's HGNC symbol,
-#    --  print the first row's wikigene description.
-#    --  print the first row's phenotype.
-#
-# Write your thoughts about this group of genes.
-#
-# (Hint, you can structure your loop in the same way as the loop that
-# created CPdefs. )
-
-# Submit the "seal" for your ENSP vector, the ENSP vector itself, the R code
-# for this loop and its output into your report if you are submitting
-# anything for credit for this unit. Please read the requirements carefully.
-
-
-
-
-# [END]
+# tocID <- "BIN-PPI-Analysis.R"
+#
+#
+# Purpose:  A Bioinformatics Course:
+#              R code accompanying the BIN-PPI-Analysis unit.
+#
+# Version:   1.4
+#
+# Date:     2017-08  -  2020-10
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.4    Update vector ID's for betweenness centrality.
+#           1.3    Bugfix: called the wrong function on ENSPsel in l. 220
+#           1.2    2020 Updates; Rewrite for new STRING V11;
+#                  Deprecate save()/load() for saveRDS()/readRDS()
+#           1.1    Change from require() to requireNamespace(),
+#                      use <package>::<function>() idiom throughout,
+#                      use BiocManager:: not biocLite()
+#           1.0    First live version
+#           0.1    First code copied from 2016 material.
+#
+# TODO:
+#
+#
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+# going on. That's not how it works ...
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC>
+#TOC>   Section  Title                                           Line
+#TOC> ---------------------------------------------------------------
+#TOC>   1        Setup and data                                    50
+#TOC>   2        Functional Edges in the Human Proteome            86
+#TOC>   2.1        Cliques                                        129
+#TOC>   2.2        Communities                                    170
+#TOC>   2.3        Betweenness Centrality                         184
+#TOC>   3        biomaRt                                          231
+#TOC>   4        Task for submission                              302
+#TOC>
+#TOC> ==========================================================================
+
+
+# =    1  Setup and data  ======================================================
+
+
+# Not surprisingly, the analysis of PPI networks needs iGraph:
+
+if (! requireNamespace("igraph", quietly = TRUE)) {
+  install.packages("igraph")
+}
+# Package information:
+#  library(help = igraph)       # basic information
+#  browseVignettes("igraph")    # available vignettes
+#  data(package = "igraph")     # available datasets
+
+# In order for you to explore some real, biological networks, I give you a
+# dataframe of functional relationships of human proteins that I have downloaded
+# from the STRING database. The full table has 8.5 million records, here is a
+# subset of records with combined confidence scores > 980
+
+# The selected set of edges with a confidence of > 964 is a dataframe with about
+# 50,000 edges and 8,400 unique proteins. Incidentally, that's about the size of
+# a fungal proteome. You can load the saved dataframe here (To read more about
+# what the scores mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).
+
+STRINGedges <- readRDS("./data/STRINGedges.rds")
+
+head(STRINGedges)
+
+# Note that STRING has prepended the tax-ID for Homo sapiens - 9606 - to the
+# Ensembl protein identifiers that start with ENSP. We'll remove them:
+
+STRINGedges$a <- gsub("^9606\\.", "", STRINGedges$a)
+STRINGedges$b <- gsub("^9606\\.", "", STRINGedges$b)
+
+head(STRINGedges)
+
+
+# =    2  Functional Edges in the Human Proteome  ==============================
+
+
+# There are many possibilities to explore interesting aspects of biological
+# networks, we will keep with some very simple procedures here but you have
+# to be aware that this is barely scratching the surface of possibilities.
+# However, once the network exists in your computer, it is comparatively
+# easy to find information online about the many, many options to analyze.
+
+
+# Make a graph from this dataframe
+?igraph::graph_from_data_frame
+
+gSTR <- igraph::graph_from_data_frame(STRINGedges, directed = FALSE)
+
+# CAUTION you DON'T want to plot a graph with 8,000 nodes and 50,000 edges -
+# layout of such large graphs is possible, but requires specialized code. Google
+# for <layout large graphs> if you are curious. Also, consider what one can
+# really learn from plotting such a graph ...
+
+# Of course simple computations on this graph are reasonably fast:
+
+compSTR <- igraph::components(gSTR)
+summary(compSTR) # our graph is fully connected!
+
+hist(log(igraph::degree(gSTR)), col="#FEE0AF")
+# this actually does look rather scale-free
+
+(freqRank <- table(igraph::degree(gSTR)))
+plot(log10(as.numeric(names(freqRank)) + 1),
+     log10(as.numeric(freqRank)), type = "b",
+     pch = 21, bg = "#FEE0AF",
+     xlab = "log(Rank)", ylab = "log(frequency)",
+     main = "8,400 nodes from the human functional interaction network")
+
+# This looks very scale-free indeed.
+
+(regressionLine <- lm(log10(as.numeric(freqRank)) ~
+                      log10(as.numeric(names(freqRank)) + 1)))
+abline(regressionLine, col = "firebrick")
+
+# Now explore some more:
+
+# ==   2.1  Cliques  ===========================================================
+
+# Let's find the largest cliques. Remember: a clique is a fully connected
+# subgraph, i.e. a subgraph in which every node is connected to every other.
+# Biological complexes often appear as cliques in interaction graphs.
+
+igraph::clique_num(gSTR)
+# The largest clique has 81 members.
+
+(C <- igraph::largest_cliques(gSTR)[[1]])
+
+# Pick one of the proteins and find out what this fully connected cluster of 81
+# proteins is (you can simply Google for any of the IDs). Is this expected?
+
+# Plot this ...
+R <- igraph::induced_subgraph(gSTR, C) # a graph from a selected set of vertices
+
+# color the vertices along a color spectrum
+vCol <- rainbow(igraph::gorder(R)) # "order" of a graph == number of nodes
+
+# color the edges to have the same color as the originating node
+eCol <- character()
+for (i in seq_along(vCol)) {
+  eCol <- c(eCol, rep(vCol[i], igraph::gorder(R)))
+}
+
+oPar <- par(mar= rep(0,4)) # Turn margins off
+plot(R,
+     layout = igraph::layout_in_circle(R),
+     vertex.size = 3,
+     vertex.color = vCol,
+     edge.color = eCol,
+     edge.width = 0.1,
+     vertex.label = NA)
+par(oPar)
+
+# ... well: remember: a clique means every node is connected to every other
+# node. We have 81 * 81 = 6,561 edges. This is what a matrix model of PPI
+# networks looks like for large complexes.
+
+
+# ==   2.2  Communities  =======================================================
+
+set.seed(112358)                       # set RNG seed for repeatable randomness
+gSTRclusters <- igraph::cluster_infomap(gSTR)
+set.seed(NULL)                         # reset the RNG
+
+igraph::modularity(gSTRclusters) # ... measures how separated the different
+                                 # membership types are from each other
+tMem <- table(igraph::membership(gSTRclusters))
+length(tMem)  # About 700 communities identified
+hist(tMem, breaks = 50, col = "skyblue")  # most clusters are small ...
+range(tMem) # ... but one has > 200 members
+
+
+# ==   2.3  Betweenness Centrality  ============================================
+
+# Let's find the nodes with the ten highest betweenness centralities.
+#
+BC <- igraph::centr_betw(gSTR)
+
+# remember: BC$res contains the results
+head(BC$res)
+
+BC$res[1]   # betweenness centrality of node 1 in the graph ...
+# ... which one is node 1?
+igraph::V(gSTR)[1]
+
+# to get the ten-highest nodes, we simply label the elements of BC with their
+# index ...
+names(BC$res) <- as.character(1:length(BC$res))
+
+# ... and then we sort:
+sBC <- sort(BC$res, decreasing = TRUE)
+head(sBC)
+
+# This ordered vector means: node 3 has the highest betweenness centrality,
+# node 721 has the second highest, etc.
+
+(BCsel <- as.numeric(names(sBC)[1:10]))
+
+# We can use the first ten labels to subset the nodes in gSTR and fetch the
+# IDs...
+(ENSPsel <- names(igraph::V(gSTR)[BCsel]))
+
+# Task:
+# =====
+# IMPORTANT, IF YOU INTEND TO SUBMIT YOUR ANALYSIS FOR CREDIT
+# We are going to use these IDs to produce some output for a submitted task:
+# therefore I need you to execute the following line, note the "seal" that this
+# returns, and not change myENSPsel later:
+
+myENSPsel <- selectENSP(ENSPsel)
+
+#  Next, to find what these proteins are...
+
+# We could now Google for all of these IDs to learn more about them. But really,
+# googling for IDs one after the other, that would be lame. Let's instead use
+# the very, very useful biomaRt package to translate these Ensembl IDs into
+# gene symbols.
+
+
+# =    3  biomaRt  =============================================================
+
+
+# IDs are just labels, but for _bio_informatics we need to learn more about the
+# biological function of the genes or proteins that we retrieve via graph data
+# mining. biomaRt is the tool of choice. It's a package distributed by the
+# bioconductor project. This here is not a biomaRt tutorial (that's for another
+# day), simply a few lines of sample code to get you started on the specific use
+# case of retrieving descriptions for ensembl protein IDs.
+
+if (! requireNamespace("BiocManager", quietly = TRUE)) {
+  install.packages("BiocManager")
+}
+if (! requireNamespace("biomaRt", quietly = TRUE)) {
+  BiocManager::install("biomaRt")
+}
+# Package information:
+#  library(help = biomaRt)       # basic information
+#  browseVignettes("biomaRt")    # available vignettes
+#  data(package = "biomaRt")     # available datasets
+
+# define which dataset to use ... this takes a while for download
+myMart <- biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl")
+
+# what filters are defined?
+( filters <- biomaRt::listFilters(myMart) )
+
+
+# and what attributes can we filter for?
+( attributes <- biomaRt::listAttributes(myMart) )
+
+
+# Soooo many options - let's look for the correct name of filters that are
+# useful for ENSP IDs ...
+filters[grep("ENSP", filters$description), ]
+
+# ... and the correct attribute names for gene symbols and descriptions ...
+attributes[grep("symbol", attributes$description, ignore.case = TRUE), ]
+attributes[grep("description", attributes$description, ignore.case = TRUE), ]
+
+
+# ... so we can put this together: here is a syntax example:
+biomaRt::getBM(filters = "ensembl_peptide_id",
+               attributes = c("hgnc_symbol",
+                              "wikigene_description",
+                              "interpro_description",
+                              "phenotype_description"),
+               values = "ENSP00000000442",
+               mart = myMart)
+
+# A simple loop will now get us the information for our 10 most central genes
+# from the human subset of STRING.
+
+CPdefs <- list()  # Since we don't know how many matches one of our queries
+# will return, we'll put the result dataframes into a list.
+
+for (ID in myENSPsel) {
+  CPdefs[[ID]] <- biomaRt::getBM(filters = "ensembl_peptide_id",
+                                 attributes = c("hgnc_symbol",
+                                                "wikigene_description",
+                                                "interpro_description",
+                                                "phenotype_description"),
+                                 values = ID,
+                                 mart = myMart)
+}
+
+
+# So what are the proteins with the ten highest betweenness centralities?
+#  ... are you surprised? (I am! Really.)
+
+
+# =    4  Task for submission  =================================================
+
+# Write a loop that will go through your personalized list of Ensembl IDs and
+#    for each ID:
+#    --  print the ID,
+#    --  print the first row's HGNC symbol,
+#    --  print the first row's wikigene description.
+#    --  print the first row's phenotype.
+#
+# Write your thoughts about this group of genes.
+#
+# (Hint, you can structure your loop in the same way as the loop that
+# created CPdefs. )
+
+# Submit the "seal" for your ENSP vector, the ENSP vector itself, the R code
+# for this loop and its output into your report if you are submitting
+# anything for credit for this unit. Please read the requirements carefully.
+
+
+
+
+# [END]
--- a/BIN-SEQA-Composition.R
+++ b/BIN-SEQA-Composition.R
@ -1,252 +1,252 @@
-# tocID <- "BIN-SEQA-Composition.R"
-#
-# Purpose: A Bioinformatics Course:
-#              R code accompanying the BIN-SEQA-Comparison unit
-#
-# Version: 1.2
-#
-# Date:    2017-11  -  2020-09
-# Author:  Boris Steipe (boris.steipe@utoronto.ca)
-#
-#           1.2    2020 Maintenance
-#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout,
-#                      use Biocmanager:: not biocLite()
-# Versions:
-#           1.0    First live version 2017
-#           0.1    First code copied from BCH441_A03_makeYFOlist.R
-#
-# TODO:
-#
-#
-# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
-#
-# DO NOT SIMPLY  source()  THESE FILES!
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-#  going on. That's not how it works ...
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC> 
-#TOC>   Section  Title                                      Line
-#TOC> ----------------------------------------------------------
-#TOC>   1        Preparation                                  48
-#TOC>   2        Aggregate properties                         69
-#TOC>   3        Sequence Composition Enrichment             113
-#TOC>   3.1        Barplot, and side-by-side barplot         136
-#TOC>   3.2        Plotting ratios                           171
-#TOC>   3.3        Plotting log ratios                       188
-#TOC>   3.4        Sort by frequency                         204
-#TOC>   3.5        Color by amino acid type                  221
-#TOC> 
-#TOC> ==========================================================================
-
-
-# =    1  Preparation  =========================================================
-
-if (! requireNamespace("seqinr", quietly = TRUE)) {
-  install.packages("seqinr")
-}
-# Package information:
-#  library(help = seqinr)       # basic information
-#  browseVignettes("seqinr")    # available vignettes
-#  data(package = "seqinr")     # available datasets
-
-# Load a reference sequence to work with:
-
-# If you have done the BIN-Storing_data unit:
-   source("makeProteinDB.R")
-   sel <- which(myDB$protein$name == sprintf("MBP1_%s", biCode(MYSPE)))
-   mySeq <- myDB$protein$sequence[sel]
-
-# If not, use the yeast Mbp1 sequence:
-   mySeq <- dbSanitizeSequence(fromJSON("./data/MBP1_SACCE.json")$sequence)
-
-
-# =    2  Aggregate properties  ================================================
-
-
-# Let's try a simple function from seqinr: computing the pI of the sequence
-?seqinr::computePI
-
-# This takes as input a vector of upper-case AA codes
-
-# We can use the function strsplit() to split the string
-# into single characters
-
-(s <- strsplit(mySeq, "")) # splitting on the empty spring
-                           # splits into single characters
-s <- unlist(s)             # strsplit() returns a list! Why?
-                           # (But we don't need a list now...)
-
-# Alternatively, seqinr provides
-# the function s2c() to convert strings into
-# character vectors (and c2s to convert them back).
-
-seqinr::s2c(mySeq)
-
-
-seqinr::computePI(seqinr::s2c(mySeq))  # isoelectric point
-seqinr::pmw(seqinr::s2c(mySeq))        # molecular weight
-seqinr::AAstat(seqinr::s2c(mySeq))     # This also plots the distribution of
-                                       # values along the sequence
-
-# A true Labor of Love has gone into the
-# compilation of the "aaindex" data:
-
-?seqinr::aaindex
-data(aaindex, package = "seqinr")  # "attach" the dataset - i.e. make it
-                                   # accessible as an R object
-
-length(aaindex)  # no seqinr:: needed for the dataset since we just
-                 # "attached" it with data()
-
-# Here are all the index descriptions
-for (i in 1:length(aaindex)) {
-  cat(paste(i, ": ", aaindex[[i]]$D, "\n", sep=""))
-}
-
-
-# =    3  Sequence Composition Enrichment  =====================================
-
-
-# Lets use one of the indices to calculate and plot amino-acid
-# composition enrichment:
-aaindex[[459]]$D
-
-#
-# Let's construct an enrichment plot to compare average frequencies
-# with the amino acid counts in our sequence.
-
-(refData <- aaindex[[459]]$I)                # reference frequencies in %
-names(refData) <- seqinr::a(names(refData))  # change names to single-letter
-                                             # code using seqinr's "a()" function
-sum(refData)
-refData        # ... in %
-
-
-# tabulate the amino acid counts in mySeq
-(obsData <- table(seqinr::s2c(mySeq)))        # counts
-(obsData <- 100 * (obsData / sum(obsData)))   # frequencies
-
-
-# ==   3.1  Barplot, and side-by-side barplot  =================================
-
-barplot(obsData, col = "#CCCCCC", cex.names = 0.7)
-abline(h = 100/20, col="#BB0000")
-
-barplot(refData, col = "#BB0000", cex.names = 0.7)
-abline(h = 100/20, col="#555555")
-
-# Ok: first problem - the values in obsData are in alphabetical order. But the
-# values in refData are in alphabetical order of amino acid name: alanine,
-# arginine, asparagine, aspartic acid ... A, R, N, D, E ... you will see this
-# order a lot - one of the old biochemistry tropes in the field. So we need to
-# re-order one of the vectors to match the other. That's easy though:
-refData
-(refData <- refData[names(obsData)])
-
-barplot(refData, col = "#BB0000", cex.names = 0.7)
-abline(h = 100/20, col="#555555")
-
-# To compare the values, we want to see them in a barplot, side-by-side ...
-barplot(rbind(obsData, refData),
-        ylim = c(0, 12),
-        beside = TRUE,
-        col = c("#CCCCCC", "#BB0000"),
-        cex.names = 0.7)
-abline(h = 100/20, col="#00000044")
-
-# ... and add a legend
-legend (x = 1, y = 12,
-        legend = c("mySeq", "Average composition"),
-        fill = c("#CCCCCC", "#BB0000"),
-        cex = 0.7,
-        bty = "n")
-
-
-# ==   3.2  Plotting ratios  ===================================================
-
-# To better compare the values, we'll calculate ratios between
-# obsData and refData
-
-barplot(obsData / refData,
-        col = "#CCCCCC",
-        ylab = "Sequence / Average",
-        ylim = c(0, 2.5),
-        cex.names = 0.7)
-abline(h = 1, col="#BB0000")
-abline(h = c(1/2, 2), lty = 2, col="#BB000055")
-
-# ... but  ratios are not very good here, since the difference in height on the
-# plot now depends on the order we compare in: ratios of 1/2 and 2 (dotted
-# lines) are exactly the same fold-difference !
-
-# ==   3.3  Plotting log ratios  ===============================================
-
-# A better way to display this
-# is to plot log(ratios).
-
-barplot(log(obsData / refData),
-        col = "#CCCCCC",
-        ylab = "log(Sequence / Average)",
-        ylim = log(c(1/3, 3)),
-        cex.names = 0.7)
-abline(h = log(1), col="#BB0000")
-abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055")
-
-# Note how the two-fold difference lines are now the same distance from the
-# line of equal ratio.
-
-# ==   3.4  Sort by frequency  =================================================
-
-barplot(sort(log(obsData / refData), decreasing = TRUE),
-        ylim = log(c(1/3, 3)),
-        col = "#CCCCCC",
-        ylab = "log(Sequence / Average)",
-        cex.names = 0.7)
-abline(h = log(1), col="#BB0000")
-abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055")
-
-yTxt <- log(0.9)
-arrows(4, yTxt, 0, yTxt, length = 0.07)
-text(5.5, yTxt, "Enriched", cex = 0.7)
-yTxt <- log(1.1)
-arrows(20, yTxt, 24, yTxt, length = 0.07)
-text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7)
-
-# ==   3.5  Color by amino acid type  ==========================================
-
-# Color the bars by amino acid type. Use AACOLS , defined in the .utilities.R
-# script, or define your own.
-
-barplot(rep(1, 20), names.arg = names(AACOLS), col = AACOLS, cex.names = 0.5)
-
-lR <- sort(log(obsData / refData), decreasing = TRUE)
-barplot(lR,
-        ylim = log(c(1/3, 3)),
-        col = AACOLS[names(lR)],
-        ylab = "log(Sequence / Average)",
-        cex.names = 0.7)
-abline(h = log(1), col="#00000055")
-abline(h = log(c(1/2, 2)), lty = 2, col="#00000033")
-
-yTxt <- log(0.9)
-arrows(4, yTxt, 0, yTxt, length = 0.07)
-text(5.5, yTxt, "Enriched", cex = 0.7)
-yTxt <- log(1.1)
-arrows(20, yTxt, 24, yTxt, length = 0.07)
-text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7)
-
-
-# Task:
-#   Interpret this plot. (Can you?) Which types of amino acids are enriched?
-#   Depleted?
-
-
-
-
-# [END]
+# tocID <- "BIN-SEQA-Composition.R"
+#
+# Purpose: A Bioinformatics Course:
+#              R code accompanying the BIN-SEQA-Composition unit
+#
+# Version: 1.2
+#
+# Date:    2017-11  -  2020-09
+# Author:  Boris Steipe (boris.steipe@utoronto.ca)
+#
+#           1.2    2020 Maintenance
+#           1.1    Change from require() to requireNamespace(),
+#                      use <package>::<function>() idiom throughout,
+#                      use BiocManager:: not biocLite()
+# Versions:
+#           1.0    First live version 2017
+#           0.1    First code copied from BCH441_A03_makeYFOlist.R
+#
+# TODO:
+#
+#
+# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
+#
+# DO NOT SIMPLY  source()  THESE FILES!
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+#  going on. That's not how it works ...
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                                      Line
+#TOC> ----------------------------------------------------------
+#TOC>   1        Preparation                                  48
+#TOC>   2        Aggregate properties                         69
+#TOC>   3        Sequence Composition Enrichment             113
+#TOC>   3.1        Barplot, and side-by-side barplot         136
+#TOC>   3.2        Plotting ratios                           171
+#TOC>   3.3        Plotting log ratios                       188
+#TOC>   3.4        Sort by frequency                         204
+#TOC>   3.5        Color by amino acid type                  221
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  Preparation  =========================================================
+
+if (! requireNamespace("seqinr", quietly = TRUE)) {
+  install.packages("seqinr")
+}
+# Package information:
+#  library(help = seqinr)       # basic information
+#  browseVignettes("seqinr")    # available vignettes
+#  data(package = "seqinr")     # available datasets
+
+# Load a reference sequence to work with:
+
+# If you have done the BIN-Storing_data unit:
+   source("makeProteinDB.R")
+   sel <- which(myDB$protein$name == sprintf("MBP1_%s", biCode(MYSPE)))
+   mySeq <- myDB$protein$sequence[sel]
+
+# If not, use the yeast Mbp1 sequence:
+   mySeq <- dbSanitizeSequence(fromJSON("./data/MBP1_SACCE.json")$sequence)
+
+
+# =    2  Aggregate properties  ================================================
+
+
+# Let's try a simple function from seqinr: computing the pI of the sequence
+?seqinr::computePI
+
+# This takes as input a vector of upper-case AA codes
+
+# We can use the function strsplit() to split the string
+# into single characters
+
+(s <- strsplit(mySeq, "")) # splitting on the empty string
+                           # splits into single characters
+s <- unlist(s)             # strsplit() returns a list! Why?
+                           # (But we don't need a list now...)
+
+# Alternatively, seqinr provides
+# the function s2c() to convert strings into
+# character vectors (and c2s to convert them back).
+
+seqinr::s2c(mySeq)
+
+
+seqinr::computePI(seqinr::s2c(mySeq))  # isoelectric point
+seqinr::pmw(seqinr::s2c(mySeq))        # molecular weight
+seqinr::AAstat(seqinr::s2c(mySeq))     # This also plots the distribution of
+                                       # values along the sequence
+
+# A true Labor of Love has gone into the
+# compilation of the "aaindex" data:
+
+?seqinr::aaindex
+data(aaindex, package = "seqinr")  # "attach" the dataset - i.e. make it
+                                   # accessible as an R object
+
+length(aaindex)  # no seqinr:: needed for the dataset since we just
+                 # "attached" it with data()
+
+# Print every index description, prefixed by its position in aaindex
+for (i in seq_along(aaindex)) {  # seq_along() never yields c(1, 0) on empty input
+  cat(paste0(i, ": ", aaindex[[i]]$D, "\n"))
+}
+
+
+# =    3  Sequence Composition Enrichment  =====================================
+
+
+# Let's use one of the indices to calculate and plot amino-acid
+# composition enrichment:
+aaindex[[459]]$D
+
+#
+# Let's construct an enrichment plot to compare average frequencies
+# with the amino acid counts in our sequence.
+
+(refData <- aaindex[[459]]$I)                # reference frequencies in %
+names(refData) <- seqinr::a(names(refData))  # change names to single-letter
+                                             # code using seqinr's "a()" function
+sum(refData)
+refData        # ... in %
+
+
+# tabulate the amino acid counts in mySeq
+(obsData <- table(seqinr::s2c(mySeq)))        # counts
+(obsData <- 100 * (obsData / sum(obsData)))   # frequencies
+
+
+# ==   3.1  Barplot, and side-by-side barplot  =================================
+
+barplot(obsData, col = "#CCCCCC", cex.names = 0.7)
+abline(h = 100/20, col="#BB0000")
+
+barplot(refData, col = "#BB0000", cex.names = 0.7)
+abline(h = 100/20, col="#555555")
+
+# Ok: first problem - the values in obsData are in alphabetical order. But the
+# values in refData are in alphabetical order of amino acid name: alanine,
+# arginine, asparagine, aspartic acid ... A, R, N, D, E ... you will see this
+# order a lot - one of the old biochemistry tropes in the field. So we need to
+# re-order one of the vectors to match the other. That's easy though:
+refData
+(refData <- refData[names(obsData)])
+
+barplot(refData, col = "#BB0000", cex.names = 0.7)
+abline(h = 100/20, col="#555555")
+
+# To compare the values, we want to see them in a barplot, side-by-side ...
+barplot(rbind(obsData, refData),
+        ylim = c(0, 12),
+        beside = TRUE,
+        col = c("#CCCCCC", "#BB0000"),
+        cex.names = 0.7)
+abline(h = 100/20, col="#00000044")
+
+# ... and add a legend
+legend (x = 1, y = 12,
+        legend = c("mySeq", "Average composition"),
+        fill = c("#CCCCCC", "#BB0000"),
+        cex = 0.7,
+        bty = "n")
+
+
+# ==   3.2  Plotting ratios  ===================================================
+
+# To better compare the values, we'll calculate ratios between
+# obsData and refData
+
+barplot(obsData / refData,
+        col = "#CCCCCC",
+        ylab = "Sequence / Average",
+        ylim = c(0, 2.5),
+        cex.names = 0.7)
+abline(h = 1, col="#BB0000")
+abline(h = c(1/2, 2), lty = 2, col="#BB000055")
+
+# ... but ratios are not very good here, since the height difference on the
+# plot depends on the order we compare in: ratios of 1/2 and 2 (dotted lines)
+# are the same fold-difference, but lie at unequal distances from the line at 1.
+
+# ==   3.3  Plotting log ratios  ===============================================
+
+# A better way to display this
+# is to plot log(ratios).
+
+barplot(log(obsData / refData),
+        col = "#CCCCCC",
+        ylab = "log(Sequence / Average)",
+        ylim = log(c(1/3, 3)),
+        cex.names = 0.7)
+abline(h = log(1), col="#BB0000")
+abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055")
+
+# Note how the two-fold difference lines are now the same distance from the
+# line of equal ratio.
+
+# ==   3.4  Sort by frequency  =================================================
+
+barplot(sort(log(obsData / refData), decreasing = TRUE),
+        ylim = log(c(1/3, 3)),
+        col = "#CCCCCC",
+        ylab = "log(Sequence / Average)",
+        cex.names = 0.7)
+abline(h = log(1), col="#BB0000")
+abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055")
+
+yTxt <- log(0.9)
+arrows(4, yTxt, 0, yTxt, length = 0.07)
+text(5.5, yTxt, "Enriched", cex = 0.7)
+yTxt <- log(1.1)
+arrows(20, yTxt, 24, yTxt, length = 0.07)
+text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7)
+
+# ==   3.5  Color by amino acid type  ==========================================
+
+# Color the bars by amino acid type. Use AACOLS, defined in the .utilities.R
+# script, or define your own.
+
+barplot(rep(1, 20), names.arg = names(AACOLS), col = AACOLS, cex.names = 0.5)
+
+lR <- sort(log(obsData / refData), decreasing = TRUE)
+barplot(lR,
+        ylim = log(c(1/3, 3)),
+        col = AACOLS[names(lR)],
+        ylab = "log(Sequence / Average)",
+        cex.names = 0.7)
+abline(h = log(1), col="#00000055")
+abline(h = log(c(1/2, 2)), lty = 2, col="#00000033")
+
+yTxt <- log(0.9)
+arrows(4, yTxt, 0, yTxt, length = 0.07)
+text(5.5, yTxt, "Enriched", cex = 0.7)
+yTxt <- log(1.1)
+arrows(20, yTxt, 24, yTxt, length = 0.07)
+text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7)
+
+
+# Task:
+#   Interpret this plot. (Can you?) Which types of amino acids are enriched?
+#   Depleted?
+
+
+
+
+# [END]
--- a/BIN-Sequence.R
+++ b/BIN-Sequence.R
@ -1,394 +1,394 @@
-# tocID <- "BIN-Sequence.R"
-#
-# Purpose:  A Bioinformatics Course:
-#              R code accompanying the BIN-Sequence unit.
-#
-# Version:  1.5
-#
-# Date:     2017-09  - 2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           1.5    2020 Updates
-#           1.4    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout,
-#                      use Biocmanager:: not biocLite()
-#           1.3    Update set.seed() usage
-#           1.2    Removed irrelevant task. How did that even get in there? smh
-#           1.1    Add chartr()
-#           1.0    First live version 2017.
-#
-# TODO:
-#
-#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC>
-#TOC>   Section  Title                                Line
-#TOC> ----------------------------------------------------
-#TOC>   1        Prepare                                63
-#TOC>   2        Storing Sequence                       80
-#TOC>   3        String properties                     109
-#TOC>   4        Substrings                            116
-#TOC>   5        Creating strings: sprintf()           137
-#TOC>   6        Changing strings                      172
-#TOC>   6.1.1          Changing case                   174
-#TOC>   6.1.2          Reverse                         179
-#TOC>   6.1.3          Change characters               183
-#TOC>   6.1.4          Substitute characters           211
-#TOC>   6.2        stringi and stringr                 231
-#TOC>   6.3        dbSanitizeSequence()                241
-#TOC>   7        Permuting and sampling                253
-#TOC>   7.1        Permutations                        260
-#TOC>   7.2        Sampling                            306
-#TOC>   7.2.1          Equiprobable characters         308
-#TOC>   7.2.2          Defined probability vector      350
-#TOC>
-#TOC> ==========================================================================
-
-
-# =    1  Prepare  =============================================================
-
-# Much basic sequence handling is supported by the Bioconductor package
-# Biostrings.
-
-if (! requireNamespace("BiocManager", quietly = TRUE)) {
-  install.packages("BiocManager")
-}
-if (! requireNamespace("Biostrings", quietly = TRUE)) {
-  BiocManager::install("Biostrings")
-}
-# Package information:
-#  library(help = Biostrings)       # basic information
-#  browseVignettes("Biostrings")    # available vignettes
-#  data(package = "Biostrings")     # available datasets
-
-
-# =    2  Storing Sequence  ====================================================
-
-
-# Sequences can be represented and stored as vectors of single characters ...
-(v <- c("D", "I", "V", "M", "T", "Q"))
-
-# ... as strings ...
-(s <- "DIVMTQ")
-
-# ... or as more complex objects with rich metadata e.g. as a Biostrings
-# DNAstring, RNAstring, AAString, etc.
-(a <- Biostrings::AAString("DIVMTQ"))
-
-# ... and all of these representations can be interconverted:
-
-# string to vector ...
-unlist(strsplit(s, ""))
-
-# vector to string ...
-paste(v, sep = "", collapse = "")
-
-# ... and AAstring to plain string.
-as.character(a)
-
-# Since operations with character vectors trivially follow all other vector
-# conventions and syntax, and we will look at Biostrings methods in more
-# detail in a later unit, we will focus on basic strings in the following.
-
-
-# =    3  String properties  ===================================================
-
-
-length(s) # why ???
-nchar(s)  # Aha!
-
-
-# =    4  Substrings  ==========================================================
-
-# Use the substr() function
-substr(s, 2, 4)
-
-# or the similar substring()
-substring(s, 2, 4)
-
-# Note: both functions are vectorized (i.e. they operate on vectors
-# of arguments, you don't need to loop over input)...
-myBiCodes <- c("HOMSA", "MUSMU", "FUGRU", "XENLA")
-substr(   myBiCodes, 1, 3)
-substring(myBiCodes, 1, 3)
-
-# ... however only substring() will also use vectors for start and stop
-s <- "gatattgtgatgacccagtaa"       # a DNA sequence
-(vI <- seq(1, nchar(s), by = 3))   # an index vector
-substr(   s, vI, vI+2)             # ... returns only the first nucleotide triplet
-substring(s, vI, vI+2)             # ... returns all triplets
-
-
-# =    5  Creating strings: sprintf()  =========================================
-
-
-# Sprintf is a very smart, very powerful function and has cognates in all
-# other programming languages. It has a bit of a  learning curve, but this is
-# totally worth it:
-# the function takes a format string, and a list of other arguments. It returns
-# a formatted string. Here are some examples - watch carefully for sprintf()
-# calls elsewhere in the code.
-
-sprintf("Just a string.")
-sprintf("A string and the number %d.", 5)
-sprintf("More numbers: %d ate %d.", 7, 9) # Sorry
-sprintf("Pi is ~ %1.2f ...", pi)
-sprintf("or more accurately ~ %1.11f.", pi)
-x <- "bottles of beer"
-N <- 99
-sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.",
-        N, x, N, x, "one down, and pass it around", N - 1, x)
-
-# Note that in the last example, the value of the string was displayed with
-# R's usual print-formatting function and therefore the line-break "\n" did
-# not actually break the line. To have line breaks, tabs etc, you need to use
-# cat() to display the string:
-
-for (i in N:(N-4)) {
-  cat(sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.\n\n",
-              i, x, i, x, "one down, and pass it around", i - 1, x))
-}
-
-# sprintf() is vectorized: if one of its parameters is a vector, it
-# will generate one output string for each of the vector's elements:
-cat(sprintf("\n%s fish", c("one", "two", "red", "blue")))
-
-
-# =    6  Changing strings  ====================================================
-
-# ===   6.1.1  Changing case
-tolower(s)
-toupper(tolower(s))
-
-
-# ===   6.1.2  Reverse
-# (This used to work in Biostrings, apparently it doesn't work anymore. Why?)
-# Biostrings::str_rev(s)
-# The following works, of course, but awkward:
-s
-paste0(rev(unlist(strsplit(s, ""))), collapse = "")
-
-# reverse complement
-COMP <- c("t", "g", "c", "a")
-names(COMP) <- c("a", "c", "g", "t")     # mapping the complement via names
-s
-paste0(COMP[rev(unlist(strsplit(s, "")))], collapse = "")
-
-
-# ===   6.1.3  Change characters
-# chartr(old, new, x) maps all characters in x that appear in "old" to the
-# correpsonding character in "new." Kind of like the COMP vector above ...
-
-chartr("aeio", "uuuu", "We hold these truths to be self-evident ...")
-
-# One could implement toupper() and tolower() with this - remember that R has
-# character vectors of uppercase and lowercase letters as language constants.
-chartr(paste0(letters, collapse = ""),
-       paste0(LETTERS, collapse = ""),
-       "Twinkle, twinkle little star, how I wonder what you are.")
-
-# One amusing way to use the function  is for a reversible substitution
-# cypher.
-alBet <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .,;:?0123456789"
-set.seed(112358)                       # set RNG seed for repeatable randomness
-( myCypher <- paste0(sample(unlist(strsplit(alBet, ""))), collapse = "") )
-set.seed(NULL)                         # reset the RNG
-
-# encode ...
-(x <- chartr(alBet, myCypher, "... seven for a secret, never to be told."))
-
-# decode ...
-chartr(myCypher, alBet, x)
-# (Nb. substitution cyphers are easy to crack!)
-
-
-# ===   6.1.4  Substitute characters
-# gsub can change lengths.
-#   Example: implementing the binary Fibonacci sequence:
-#   0 -> 1; 1 -> 10 , in three nested gsub() statements
-( s <- 1 )
-( s <- gsub("2", "10", gsub("0", "1", gsub("1", "2", s))) )
-
-# Iterate this line a few times ...
-#
-# cf. http://www.maths.surrey.ac.uk/hosted-sites/R.Knott/Fibonacci/fibrab.html
-# for the features of the sequence.
-
-# I use gsub() often to delete unwanted characters ...
-# ... select something, and substitute the empty string for it.
-(s <- gsub("-", "", s))
-
-# For example: clean up a sequence
-# copy/paste from UniProt
-(s <- "        10         20         30         40         50
-MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")
-
-
-# remove numbers
-(s <- gsub("[0-9]", "", s))
-
-# remove "whitespace" (spaces, tabs, line breaks)...
-(s <- gsub("\\s", "", s))
-
-# ==   6.2  stringi and stringr  ===============================================
-
-# But there are also specialized functions eg. to remove leading/trailing
-# whitespace which may be important to sanitize user input etc. Have a look at
-# the function descriptions for the stringr and the stringi package. stringr is
-# part of the tidyverse, and for the most part a wrapper for stringi functions.
-# https://github.com/tidyverse/stringr
-
-
-
-# ==   6.3  dbSanitizeSequence()  ==============================================
-
-# In our learning units, we use a function dbSanitizeSequence() to clean up
-# sequences that may be copy/pasted from Web-sources
-
-cat( s <- ">FASTA header will be removed
-10         20         30         40         50
-MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR " )
-
-dbSanitizeSequence(s)
-
-
-# =    7  Permuting and sampling  ==============================================
-
-
-# An important aspect of working with strings is generating random strings
-# with given statistical properties: reference items to evaluate significance.
-
-
-# ==   7.1  Permutations  ======================================================
-
-
-# One way to produce such reference items is to permute a string. A permuted
-# string has the same composition as the original, but all positional
-# information is lost. The sample() function can be used to permute:
-
-# This is the sequence of the ompA secretion signal
-(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
-
-(x <- sample(s, length(s)))  # permuted
-
-# Here's a small example how such permuted strings may be useful. As you look
-# at the ompA sequence, you suspect that the two lysines near the +-charged
-# N-terminus may not be accidental, but selected for a positively charged
-# N-terminus. What is the chance that such a sequence has two lysines close to
-# the N-terminus simply by chance? Or put differently: what is the average
-# distance of two lysines in such a sequence to the N-terminus. First, we
-# need an expression that measures the distance. A simple use of the which()
-# function will do just fine.
-
-which(s == "K")        # shows they are in position 2 and 3, so ...
-mean(which(s == "K"))  # ... gives us the average, and ...
-mean(which(x == "K"))  # ... gives us the average of the permuted sequence.
-
-# So what does the distribution look like? Lets do 10,000 trials.
-
-(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
-N <- 10000
-d <- numeric(N)
-
-set.seed(112358)                       # set RNG seed for repeatable randomness
-for (i in 1:N) {
-  d[i] <- mean(which(sample(s, length(s)) == "K"))
-}
-set.seed(NULL)                         # reset the RNG
-
-hist(d, breaks = 20)
-abline(v = 2.5, lwd = 2, col = "firebrick")
-sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the
-              # N-terminus or more. That's just below the signifcance
-              # threshold of 5 %. It's a trend, but to be sure we are looking
-              # at a biological effect we would need to see more
-              # sequences.
-
-
-# ==   7.2  Sampling  ==========================================================
-
-# ===   7.2.1  Equiprobable characters
-
-# Assume you need a large random-nucleotide string for some statistical model.
-# How to create such a string? sample() can easily create it:
-
-nuc <- c("A", "C", "G", "T")
-N <- 100
-
-set.seed(16818)                        # set RNG seed for repeatable randomness
-v <- sample(nuc, N, replace = TRUE)
-set.seed(NULL)                         # reset the RNG
-
-(mySeq <- paste(v, collapse = ""))
-
-# What's the GC content?
-table(v)
-sum(table(v)[c("G", "C")]) # 51 is close to expected
-
-# What's the number of CpG motifs? Easy to check with the stringi
-# stri_match_all() function
-
-if (! requireNamespace("stringi", quietly = TRUE)) {
-  install.packages("stringi")
-}
-# Package information:
-#  library(help = stringi)       # basic information
-#  browseVignettes("stringi")    # available vignettes
-#  data(package = "stringi")     # available datasets
-
-
-(x <- stringi::stri_match_all(mySeq, regex = "CG"))
-length(unlist(x))
-
-# Now you could compare that number with yeast DNA sequences, and determine
-# whether there are more or less CpG motifs than expected by chance.
-# (cf. https://en.wikipedia.org/wiki/CpG_site)
-# But hold on: is that a fair comparison? sample() gives us all four nucleotides
-# with the same probability. But the yeast genomic DNA GC content is only
-# 38%. So you would expect fewer CpG motifs based on the statistical properties
-# of the smaller number of Cs and Gs - before biology even comes into play. How
-# do we account for that?
-
-# ===   7.2.2  Defined probability vector
-
-# This is where we need to know how to create samples with specific probability
-# distributions. A crude hack would be to create a sampling source vector with
-# 19 C, 19 G, 31 A and 31 T
-c(rep("C", 19), rep("G", 19), rep(c("A"), 31), rep(c("T"), 31))
-# ... but that doesn't scale if the numeric accuracy needs to be higher.
-#
-# However sample() has an argument that takes care of that: you can explicitly
-# specify the probabilities with which each element of the the sampling vector
-# should be chosen:
-
-nuc <- c("A", "C", "G", "T")
-N <- 100
-myProb <- c(0.31, 0.19, 0.19, 0.31)    # sampling probabilities
-
-set.seed(16818)                       # set RNG seed for repeatable randomness
-v <- sample(nuc, N, prob = myProb, replace = TRUE)
-set.seed(NULL)                         # reset the RNG
-
-(mySeq <- paste(v, collapse = ""))
-
-# What's the GC content?
-table(v)
-sum(table(v)[c("G", "C")]) # Close to expected
-
-# What's the number of CpG motifs?
-(x <- stringi::stri_match_all(mySeq, regex = "CG"))
-# ... not a single one in this case.
-
-
-
-# [END]
+# tocID <- "BIN-Sequence.R"
+#
+# Purpose:  A Bioinformatics Course:
+#              R code accompanying the BIN-Sequence unit.
+#
+# Version:  1.5
+#
+# Date:     2017-09  - 2020-09
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.5    2020 Updates
+#           1.4    Change from require() to requireNamespace(),
+#                      use <package>::<function>() idiom throughout,
+#                      use BiocManager:: not biocLite()
+#           1.3    Update set.seed() usage
+#           1.2    Removed irrelevant task. How did that even get in there? smh
+#           1.1    Add chartr()
+#           1.0    First live version 2017.
+#
+# TODO:
+#
+#
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+# going on. That's not how it works ...
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC>
+#TOC>   Section  Title                                Line
+#TOC> ----------------------------------------------------
+#TOC>   1        Prepare                                63
+#TOC>   2        Storing Sequence                       80
+#TOC>   3        String properties                     109
+#TOC>   4        Substrings                            116
+#TOC>   5        Creating strings: sprintf()           137
+#TOC>   6        Changing strings                      172
+#TOC>   6.1.1          Changing case                   174
+#TOC>   6.1.2          Reverse                         179
+#TOC>   6.1.3          Change characters               183
+#TOC>   6.1.4          Substitute characters           211
+#TOC>   6.2        stringi and stringr                 231
+#TOC>   6.3        dbSanitizeSequence()                241
+#TOC>   7        Permuting and sampling                253
+#TOC>   7.1        Permutations                        260
+#TOC>   7.2        Sampling                            306
+#TOC>   7.2.1          Equiprobable characters         308
+#TOC>   7.2.2          Defined probability vector      350
+#TOC>
+#TOC> ==========================================================================
+
+
+# =    1  Prepare  =============================================================
+
+# Much basic sequence handling is supported by the Bioconductor package
+# Biostrings.
+
+if (! requireNamespace("BiocManager", quietly = TRUE)) {
+  install.packages("BiocManager")
+}
+if (! requireNamespace("Biostrings", quietly = TRUE)) {
+  BiocManager::install("Biostrings")
+}
+# Package information:
+#  library(help = Biostrings)       # basic information
+#  browseVignettes("Biostrings")    # available vignettes
+#  data(package = "Biostrings")     # available datasets
+
+
+# =    2  Storing Sequence  ====================================================
+
+
+# Sequences can be represented and stored as vectors of single characters ...
+(v <- c("D", "I", "V", "M", "T", "Q"))
+
+# ... as strings ...
+(s <- "DIVMTQ")
+
+# ... or as more complex objects with rich metadata, e.g. as a Biostrings
+# DNAString, RNAString, AAString, etc.
+(a <- Biostrings::AAString("DIVMTQ"))
+
+# ... and all of these representations can be interconverted:
+
+# string to vector ...
+unlist(strsplit(s, ""))
+
+# vector to string ...
+paste(v, sep = "", collapse = "")
+
+# ... and AAString to plain string.
+as.character(a)
+
+# Since operations with character vectors trivially follow all other vector
+# conventions and syntax, and we will look at Biostrings methods in more
+# detail in a later unit, we will focus on basic strings in the following.
+
+
+# =    3  String properties  ===================================================
+
+
+length(s) # why ???
+nchar(s)  # Aha!
+
+
+# =    4  Substrings  ==========================================================
+
+# Use the substr() function
+substr(s, 2, 4)
+
+# or the similar substring()
+substring(s, 2, 4)
+
+# Note: both functions are vectorized (i.e. they operate on vectors
+# of arguments, you don't need to loop over input)...
+myBiCodes <- c("HOMSA", "MUSMU", "FUGRU", "XENLA")
+substr(   myBiCodes, 1, 3)
+substring(myBiCodes, 1, 3)
+
+# ... however only substring() will also use vectors for start and stop
+s <- "gatattgtgatgacccagtaa"       # a DNA sequence
+(vI <- seq(1, nchar(s), by = 3))   # an index vector
+substr(   s, vI, vI+2)             # ... returns only the first nucleotide triplet
+substring(s, vI, vI+2)             # ... returns all triplets
+
+
+# =    5  Creating strings: sprintf()  =========================================
+
+
+# Sprintf is a very smart, very powerful function and has cognates in all
+# other programming languages. It has a bit of a  learning curve, but this is
+# totally worth it:
+# the function takes a format string, and a list of other arguments. It returns
+# a formatted string. Here are some examples - watch carefully for sprintf()
+# calls elsewhere in the code.
+
+sprintf("Just a string.")
+sprintf("A string and the number %d.", 5)
+sprintf("More numbers: %d ate %d.", 7, 9) # Sorry
+sprintf("Pi is ~ %1.2f ...", pi)
+sprintf("or more accurately ~ %1.11f.", pi)
+x <- "bottles of beer"
+N <- 99
+sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.",
+        N, x, N, x, "one down, and pass it around", N - 1, x)
+
+# Note that in the last example, the value of the string was displayed with
+# R's usual print-formatting function and therefore the line-break "\n" did
+# not actually break the line. To have line breaks, tabs etc, you need to use
+# cat() to display the string:
+
+for (i in N:(N-4)) {
+  cat(sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.\n\n",
+              i, x, i, x, "one down, and pass it around", i - 1, x))
+}
+
+# sprintf() is vectorized: if one of its parameters is a vector, it
+# will generate one output string for each of the vector's elements:
+cat(sprintf("\n%s fish", c("one", "two", "red", "blue")))
+
+
+# =    6  Changing strings  ====================================================
+
+# ===   6.1.1  Changing case
+tolower(s)
+toupper(tolower(s))
+
+
+# ===   6.1.2  Reverse
+# (This used to work in Biostrings, apparently it doesn't work anymore. Why?)
+# Biostrings::str_rev(s)
+# The following works, of course, but awkward:
+s
+paste0(rev(unlist(strsplit(s, ""))), collapse = "")
+
+# reverse complement
+COMP <- c("t", "g", "c", "a")
+names(COMP) <- c("a", "c", "g", "t")     # mapping the complement via names
+s
+paste0(COMP[rev(unlist(strsplit(s, "")))], collapse = "")
+
+
+# ===   6.1.3  Change characters
+# chartr(old, new, x) maps all characters in x that appear in "old" to the
+# corresponding character in "new." Kind of like the COMP vector above ...
+
+chartr("aeio", "uuuu", "We hold these truths to be self-evident ...")
+
+# One could implement toupper() and tolower() with this - remember that R has
+# character vectors of uppercase and lowercase letters as language constants.
+chartr(paste0(letters, collapse = ""),
+       paste0(LETTERS, collapse = ""),
+       "Twinkle, twinkle little star, how I wonder what you are.")
+
+# One amusing way to use the function  is for a reversible substitution
+# cypher.
+alBet <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .,;:?0123456789"
+set.seed(112358)                       # set RNG seed for repeatable randomness
+( myCypher <- paste0(sample(unlist(strsplit(alBet, ""))), collapse = "") )
+set.seed(NULL)                         # reset the RNG
+
+# encode ...
+(x <- chartr(alBet, myCypher, "... seven for a secret, never to be told."))
+
+# decode ...
+chartr(myCypher, alBet, x)
+# (Nb. substitution cyphers are easy to crack!)
+
+
+# ===   6.1.4  Substitute characters
+# gsub can change lengths.
+#   Example: implementing the binary Fibonacci sequence:
+#   0 -> 1; 1 -> 10 , in three nested gsub() statements
+( s <- 1 )
+( s <- gsub("2", "10", gsub("0", "1", gsub("1", "2", s))) )
+
+# Iterate this line a few times ...
+#
+# cf. http://www.maths.surrey.ac.uk/hosted-sites/R.Knott/Fibonacci/fibrab.html
+# for the features of the sequence.
+
+# I use gsub() often to delete unwanted characters ...
+# ... select something, and substitute the empty string for it.
+(s <- gsub("-", "", s))
+
+# For example: clean up a sequence
+# copy/paste from UniProt
+(s <- "        10         20         30         40         50
+MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")
+
+
+# remove numbers
+(s <- gsub("[0-9]", "", s))
+
+# remove "whitespace" (spaces, tabs, line breaks)...
+(s <- gsub("\\s", "", s))
+
+# ==   6.2  stringi and stringr  ===============================================
+
+# But there are also specialized functions eg. to remove leading/trailing
+# whitespace which may be important to sanitize user input etc. Have a look at
+# the function descriptions for the stringr and the stringi package. stringr is
+# part of the tidyverse, and for the most part a wrapper for stringi functions.
+# https://github.com/tidyverse/stringr
+
+
+
+# ==   6.3  dbSanitizeSequence()  ==============================================
+
+# In our learning units, we use a function dbSanitizeSequence() to clean up
+# sequences that may be copy/pasted from Web-sources
+
+cat( s <- ">FASTA header will be removed
+10         20         30         40         50
+MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR " )
+
+dbSanitizeSequence(s)
+
+
+# =    7  Permuting and sampling  ==============================================
+
+
+# An important aspect of working with strings is generating random strings
+# with given statistical properties: reference items to evaluate significance.
+
+
+# ==   7.1  Permutations  ======================================================
+
+
+# One way to produce such reference items is to permute a string. A permuted
+# string has the same composition as the original, but all positional
+# information is lost. The sample() function can be used to permute:
+
+# This is the sequence of the ompA secretion signal
+(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
+
+(x <- sample(s, length(s)))  # permuted
+
+# Here's a small example how such permuted strings may be useful. As you look
+# at the ompA sequence, you suspect that the two lysines near the +-charged
+# N-terminus may not be accidental, but selected for a positively charged
+# N-terminus. What is the chance that such a sequence has two lysines close to
+# the N-terminus simply by chance? Or put differently: what is the average
+# distance of two lysines in such a sequence to the N-terminus? First, we
+# need an expression that measures the distance. A simple use of the which()
+# function will do just fine.
+
+which(s == "K")        # shows they are in position 2 and 3, so ...
+mean(which(s == "K"))  # ... gives us the average, and ...
+mean(which(x == "K"))  # ... gives us the average of the permuted sequence.
+
+# So what does the distribution look like? Let's do 10,000 trials.
+
+(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
+N <- 10000
+d <- numeric(N)
+
+set.seed(112358)                       # set RNG seed for repeatable randomness
+for (i in 1:N) {
+  d[i] <- mean(which(sample(s, length(s)) == "K"))
+}
+set.seed(NULL)                         # reset the RNG
+
+hist(d, breaks = 20)
+abline(v = 2.5, lwd = 2, col = "firebrick")
+sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the
+              # N-terminus or more. That's just below the significance
+              # threshold of 5 %. It's a trend, but to be sure we are looking
+              # at a biological effect we would need to see more
+              # sequences.
+
+
+# ==   7.2  Sampling  ==========================================================
+
+# ===   7.2.1  Equiprobable characters
+
+# Assume you need a large random-nucleotide string for some statistical model.
+# How to create such a string? sample() can easily create it:
+
+nuc <- c("A", "C", "G", "T")
+N <- 100
+
+set.seed(16818)                        # set RNG seed for repeatable randomness
+v <- sample(nuc, N, replace = TRUE)
+set.seed(NULL)                         # reset the RNG
+
+(mySeq <- paste(v, collapse = ""))
+
+# What's the GC content?
+table(v)
+sum(table(v)[c("G", "C")]) # 51 is close to expected
+
+# What's the number of CpG motifs? Easy to check with the stringi
+# stri_match_all() function
+
+if (! requireNamespace("stringi", quietly = TRUE)) {
+  install.packages("stringi")
+}
+# Package information:
+#  library(help = stringi)       # basic information
+#  browseVignettes("stringi")    # available vignettes
+#  data(package = "stringi")     # available datasets
+
+
+(x <- stringi::stri_match_all(mySeq, regex = "CG"))
+sum(! is.na(unlist(x)))  # number of matches; "no match" is a single NA, not 0
+
+# Now you could compare that number with yeast DNA sequences, and determine
+# whether there are more or less CpG motifs than expected by chance.
+# (cf. https://en.wikipedia.org/wiki/CpG_site)
+# But hold on: is that a fair comparison? sample() gives us all four nucleotides
+# with the same probability. But the yeast genomic DNA GC content is only
+# 38%. So you would expect fewer CpG motifs based on the statistical properties
+# of the smaller number of Cs and Gs - before biology even comes into play. How
+# do we account for that?
+
+# ===   7.2.2  Defined probability vector
+
+# This is where we need to know how to create samples with specific probability
+# distributions. A crude hack would be to create a sampling source vector with
+# 19 C, 19 G, 31 A and 31 T
+c(rep("C", 19), rep("G", 19), rep(c("A"), 31), rep(c("T"), 31))
+# ... but that doesn't scale if the numeric accuracy needs to be higher.
+#
+# However sample() has an argument that takes care of that: you can explicitly
+# specify the probabilities with which each element of the sampling vector
+# should be chosen:
+
+nuc <- c("A", "C", "G", "T")
+N <- 100
+myProb <- c(0.31, 0.19, 0.19, 0.31)    # sampling probabilities
+
+set.seed(16818)                       # set RNG seed for repeatable randomness
+v <- sample(nuc, N, prob = myProb, replace = TRUE)
+set.seed(NULL)                         # reset the RNG
+
+(mySeq <- paste(v, collapse = ""))
+
+# What's the GC content?
+table(v)
+sum(table(v)[c("G", "C")]) # Close to expected
+
+# What's the number of CpG motifs?
+(x <- stringi::stri_match_all(mySeq, regex = "CG"))
+# ... not a single one in this case.
+
+
+
+# [END]
--- a/BIN-Storing_data.R
+++ b/BIN-Storing_data.R
--- a/FND-Genetic_code.R
+++ b/FND-Genetic_code.R
@ -1,349 +1,349 @@
-# tocID <- "FND-Genetic_code.R"
-#
-# Purpose:  A Bioinformatics Course:
-#              R code accompanying the FND-Genetic_code unit.
-#
-# Version:  1.2
-#
-# Date:     2017  10  -  2019  01
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           1.2    2020 Maintenance
-#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout,
-#                      use Biocmanager:: not biocLite()
-#           1.0.1  Comment on "incomplete final line" warning in FASTA
-#           1.0    First live version
-#
-# TODO:
-#
-#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC>
-#TOC>   Section  Title                                            Line
-#TOC> ----------------------------------------------------------------
-#TOC>   1        Storing the genetic code                           45
-#TOC>   1.1        Genetic code in Biostrings                       63
-#TOC>   2        Working with the genetic code                      94
-#TOC>   2.1        Translate a sequence.                           129
-#TOC>   3        An alternative representation: 3D array           212
-#TOC>   3.1        Print a Genetic code table                      246
-#TOC>   4        Tasks                                             272
-#TOC>
-#TOC> ==========================================================================
-
-
-# =    1  Storing the genetic code  ============================================
-
-# The genetic code maps trinucleotide codons to amino acids. To store it, we
-# need some mechanism to associate the two representations. The most
-# convenient way to do that is a "named vector" which holds the amino acid
-# code and assigns the codons as names to its elements.
-
-x <- c("M", "H", "H", "*", "*", "*")
-names(x) <- c("ATG", "CAC", "CAT", "TAA", "TAG", "TGA")
-x
-
-# Then we can access the vector by the codon as name, and retrieve the
-# amino acid ...
-
-x["ATG"]
-x["CAC"]
-x["TAA"]
-
-# ... or the names of elements, to retrieve the codon(s)
-names(x)[x == "M"]
-names(x)[x == "H"]
-names(x)[x == "*"]
-
-
-# ==   1.1  Genetic code in Biostrings  ========================================
-
-# Coveniently, the standard genetic code as well as its alternatives are
-# available in the Bioconductor "Biostrings" package:
-
-
-if (! requireNamespace("BiocManager", quietly = TRUE)) {
-  install.packages("BiocManager")
-}
-if (! requireNamespace("Biostrings", quietly = TRUE)) {
-  BiocManager::install("Biostrings")
-}
-# Package information:
-#  library(help = Biostrings)       # basic information
-#  browseVignettes("Biostrings")    # available vignettes
-#  data(package = "Biostrings")     # available datasets
-
-
-# The standard genetic code vector
-Biostrings::GENETIC_CODE
-
-# The table of genetic codes. This information corresponds to this page
-# at the NCBI:
-# https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes
-Biostrings::GENETIC_CODE_TABLE
-
-# Most of the alternative codes are mitochondrial codes. The id of the
-# Alternative Yeast Nuclear code is "12"
-Biostrings::getGeneticCode("12")  # Alternative Yeast Nuclear
-
-
-# =    2  Working with the genetic code  =======================================
-
-# We'll use Biostrings::GENETIC_CODE a lot in this script, so we'll assign it
-# to a "local" variable, rather than retrieving it from the package all the
-# time.
-
-GC <- Biostrings::GENETIC_CODE
-
-# This is a named vector of characters ...
-
-str(GC)
-
-# ... which also stores the alternative initiation codons TTG and CTG in
-# an attribute of the vector. (Alternative initiation codons sometimes are
-# used instead of ATG to intiate translation, if translation is not initiated
-# at ATG thses are still translated with fMet.)
-
-attr(GC, "alt_init_codons")
-
-# But the key to use this vector is in the "names" which we use for subsetting
-# the list of amino acids in whatever way we need.
-names(GC)
-
-# The translation of "TGG" ...
-GC["TGG"]
-
-# All stop codons
-names(GC)[GC == "*"]
-
-# All start codons
-names(GC)[GC == "M"] # ... or
-c(names(GC)[GC == "M"],
-  attr(GC, "alt_init_codons"))
-
-
-# ==   2.1  Translate a sequence.  =============================================
-
-
-# I have provided a gene sequence in the data directory:
-# S288C_YDL056W_MBP1_coding.fsa is the yeast Mbp1 FASTA sequence.
-
-# read it
-mbp1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
-
-# You will notice that this generates a Warning message:
-#      Warning message:
-#        In readLines("./data/S288C_YDL056W_MBP1_coding.fsa") :
-#        incomplete final line found on './data/S288C_YDL056W_MBP1_coding.fsa'
-
-# The reason for this is that the last character of the file is the letter "A"
-# and not a "\n" line break. This file is exactly how it was sent from the
-# NCBI server; I think good, defensive programming practice would have been to
-# include some kind of an end-marker in the file, like a final "\n". This helps
-# us recognize an incomplete transmission. Let's parse the actual sequence from
-# the file, and then check for completeness.
-
-
-head(mbp1)
-
-# drop the first line (header)
-mbp1 <- mbp1[-1]
-head(mbp1)
-
-# concatenate it all to a single string
-mbp1 <- paste(mbp1, sep = "", collapse = "")
-
-# how long is it?
-nchar(mbp1)
-
-# how many codons?
-nchar(mbp1)/3
-
-# That looks correct for the 833 aa sequence plus 1 stop codon. This gives us a
-# first verification that the file we read is complete, the nucleotides of a
-# complete ORF should be divisible by 3.
-
-# Extract the codons. There are many ways to split a long string into chunks
-# of three characters. Here we use the Biostrings  codons()  function. codons()
-# requires an object of type DNAstring - a special kind of string with
-# attributes that are useful for Biostrings. Thus we convert the sequence first
-# with DNAstring(), then split it up, then convert it into a plain
-# character vector.
-mbp1Codons <- as.character(Biostrings::codons(Biostrings::DNAString(mbp1)))
-
-head(mbp1Codons)
-
-# now translate each codon
-
-mbp1AA <- character(834)
-for (i in seq_along(mbp1Codons)) {
-  mbp1AA[i] <- GC[mbp1Codons[i]]
-}
-
-head(mbp1Codons)
-head(mbp1AA)
-
-tail(mbp1Codons)
-tail(mbp1AA) # Note the stop!
-
-# The TAA "ochre" stop codon is our second verification that the nucleotide
-# sequence is complete: a stop codon can't appear internally in an ORF.
-
-# We can work with the mbp1AA vector, for example to tabulate the
-# amino acid frequencies:
-table(mbp1AA)
-sort(table(mbp1AA), decreasing = TRUE)
-
-# Or we can paste all elements together into a single string. But let's remove
-# the stop, it's not actually a part of the sequence. To remove the last element
-# of a vector, re-assign it with a vector minus the index of the last element:
-mbp1AA <- mbp1AA[-(length(mbp1AA))]
-tail(mbp1AA) # Note the stop is gone!
-
-# paste it together, collapsing the elements using an empty string as the
-# separation-character (i.e.: nothing)
-(Mbp1 <- paste(mbp1AA, sep = "", collapse = ""))
-
-
-# =    3  An alternative representation: 3D array  =============================
-
-
-# We don't use 3D arrays often - usually just 2D tables and data frames, so
-# here is a good opportunity to review the syntax of 3D arrays with a
-# genetic code cube:
-
-# Initialize, using A G C T as the names of the elements in each dimension
-cCube <- array(data     = character(64),
-               dim      = c(4, 4, 4),
-               dimnames = list(c("A", "G", "C", "T"),
-                               c("A", "G", "C", "T"),
-                               c("A", "G", "C", "T")))
-
-# fill it with amino acid codes using three nested loops
-for (i in 1:4) {
-  for (j in 1:4) {
-    for (k in 1:4) {
-      myCodon <- paste(dimnames(cCube)[[1]][i],
-                       dimnames(cCube)[[2]][j],
-                       dimnames(cCube)[[3]][k],
-                       sep = "",
-                       collapse = "")
-      cCube[i, j, k] <- GC[myCodon]
-    }
-  }
-}
-
-# confirm
-cCube["A", "T", "G"] # methionine
-cCube["T", "T", "T"] # phenylalanine
-cCube["T", "A", "G"] # stop (amber)
-
-
-
-# ==   3.1  Print a Genetic code table  ========================================
-
-
-# The data structure of our cCube is well suited to print a table. In the
-# "standard" way to print the genetic code, we write codons with the same
-# second nucleotide in columns, and arrange rows in blocks of same
-# first nucleotide, varying the third nucleotide fastest. This maximizes the
-# similarity of adjacent amino acids in the table if we print the
-# nucleotides in the order T C A G. It's immidiately obvious that the code
-# is not random: the universal genetic code is exceptionally error tolerant in
-# the sense that mutations (or single-nucleotide translation errors) are likely
-# to result in an amino acid with similar biophysical properties as the
-# original.
-
-nuc <- c("T", "C", "A", "G")
-
-# (calling variables f, s, t to indicate first, second, and third position ...)
-for (f in nuc) {      # first varies in blocks
-  for (t in nuc) {    # third varies in columns
-    for (s in nuc) {  # second varies in rows
-      cat(sprintf("%s%s%s: %s   ", f, s, t, cCube[f, s, t]))
-    }
-    cat("\n")
-  }
-  cat("\n")
-}
-
-
-# =    4  Tasks  ===============================================================
-
-
-# Task: What do you need to change to print the table with U instead
-#         of T? Try it.
-
-
-# Task: Point mutations are more often transitions (purine -> purine;
-#         pyrimidine -> pyrimidine) than transversions (purine -> pyrimidine;
-#         pyrimidine -> purine), even though twice as many transversions
-#         are possible in the code. This is most likely due a deamination /
-#         tautomerization process that favours C -> T changes. If the code
-#         indeed minimizes the effect of mutations, you would expect that
-#         codons that differ by a transition code for more similar amino acids
-#         than codons that differ by a transversion. Is that true? List the set
-#         of all amino acid pairs that are encoded by codons with a C -> T
-#         transition. Then list the set of amino acid pairs with a C -> A
-#         transversion. Which set of pairs is more similar?
-
-
-# Task: How many stop codons do the two mbp1-gene derived amino acid sequences
-#         have if you translate them in the 2. or the 3. frame?
-
-
-# Task: How does the amino acid composition change if you translate the mbp1
-#         gene with the Alternative Yeast Nuclear code that is used by the
-#         "GTC clade" of fungi?
-#         (cf. https://en.wikipedia.org/wiki/Alternative_yeast_nuclear_code )
-
-# Solution:
-
-    # Fetch the code
-    Biostrings::GENETIC_CODE_TABLE
-    Biostrings::GENETIC_CODE_TABLE$name[Biostrings::GENETIC_CODE_TABLE$id=="12"]
-    altYcode <- Biostrings::getGeneticCode("12")
-
-    # what's the difference?
-    (delta <- which(Biostrings::GENETIC_CODE != altYcode))
-
-    Biostrings::GENETIC_CODE[delta]
-    altYcode[delta]
-
-    # translate
-    altYAA <- character(834)
-    for (i in seq_along(mbp1Codons)) {
-      altYAA[i] <- altYcode[mbp1Codons[i]]
-    }
-
-    table(mbp1AA)
-    table(altYAA)
-
-# Task: The genetic code has significant redundacy, i.e. there are up to six
-#         codons that code for the same amino acid. Write code that lists how
-#         many amino acids are present how often i.e. it should tell you that
-#         two amino acids are encoded only with a single codon, three amino
-#         acids have six codons, etc. Solution below, but don't peek. There
-#         are many possible ways to do this.
-#
-#
-# Solution:
-( x <- table(table(Biostrings::GENETIC_CODE)) )
-
-# confirm
-sum(x * as.numeric(names(x)))
-
-
-
-# [END]
+# tocID <- "FND-Genetic_code.R"
+#
+# Purpose:  A Bioinformatics Course:
+#              R code accompanying the FND-Genetic_code unit.
+#
+# Version:  1.2
+#
+# Date:     2017  10  -  2019  01
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.2    2020 Maintenance
+#           1.1    Change from require() to requireNamespace(),
+#                      use <package>::<function>() idiom throughout,
+#                      use BiocManager:: not biocLite()
+#           1.0.1  Comment on "incomplete final line" warning in FASTA
+#           1.0    First live version
+#
+# TODO:
+#
+#
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+# going on. That's not how it works ...
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC>
+#TOC>   Section  Title                                            Line
+#TOC> ----------------------------------------------------------------
+#TOC>   1        Storing the genetic code                           45
+#TOC>   1.1        Genetic code in Biostrings                       63
+#TOC>   2        Working with the genetic code                      94
+#TOC>   2.1        Translate a sequence.                           129
+#TOC>   3        An alternative representation: 3D array           212
+#TOC>   3.1        Print a Genetic code table                      246
+#TOC>   4        Tasks                                             272
+#TOC>
+#TOC> ==========================================================================
+
+
+# =    1  Storing the genetic code  ============================================
+
+# The genetic code maps trinucleotide codons to amino acids. To store it, we
+# need some mechanism to associate the two representations. The most
+# convenient way to do that is a "named vector" which holds the amino acid
+# code and assigns the codons as names to its elements.
+
+x <- c("M", "H", "H", "*", "*", "*")
+names(x) <- c("ATG", "CAC", "CAT", "TAA", "TAG", "TGA")
+x
+
+# Then we can access the vector by the codon as name, and retrieve the
+# amino acid ...
+
+x["ATG"]
+x["CAC"]
+x["TAA"]
+
+# ... or the names of elements, to retrieve the codon(s)
+names(x)[x == "M"]
+names(x)[x == "H"]
+names(x)[x == "*"]
+
+
+# ==   1.1  Genetic code in Biostrings  ========================================
+
+# Conveniently, the standard genetic code as well as its alternatives are
+# available in the Bioconductor "Biostrings" package:
+
+
+if (! requireNamespace("BiocManager", quietly = TRUE)) {
+  install.packages("BiocManager")
+}
+if (! requireNamespace("Biostrings", quietly = TRUE)) {
+  BiocManager::install("Biostrings")
+}
+# Package information:
+#  library(help = Biostrings)       # basic information
+#  browseVignettes("Biostrings")    # available vignettes
+#  data(package = "Biostrings")     # available datasets
+
+
+# The standard genetic code vector
+Biostrings::GENETIC_CODE
+
+# The table of genetic codes. This information corresponds to this page
+# at the NCBI:
+# https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes
+Biostrings::GENETIC_CODE_TABLE
+
+# Most of the alternative codes are mitochondrial codes. The id of the
+# Alternative Yeast Nuclear code is "12"
+Biostrings::getGeneticCode("12")  # Alternative Yeast Nuclear
+
+
+# =    2  Working with the genetic code  =======================================
+
+# We'll use Biostrings::GENETIC_CODE a lot in this script, so we'll assign it
+# to a "local" variable, rather than retrieving it from the package all the
+# time.
+
+GC <- Biostrings::GENETIC_CODE
+
+# This is a named vector of characters ...
+
+str(GC)
+
+# ... which also stores the alternative initiation codons TTG and CTG in
+# an attribute of the vector. (Alternative initiation codons sometimes are
+# used instead of ATG to initiate translation; if translation is not initiated
+# at ATG these are still translated with fMet.)
+
+attr(GC, "alt_init_codons")
+
+# But the key to use this vector is in the "names" which we use for subsetting
+# the list of amino acids in whatever way we need.
+names(GC)
+
+# The translation of "TGG" ...
+GC["TGG"]
+
+# All stop codons
+names(GC)[GC == "*"]
+
+# All start codons
+names(GC)[GC == "M"] # ... or
+c(names(GC)[GC == "M"],
+  attr(GC, "alt_init_codons"))
+
+
+# ==   2.1  Translate a sequence.  =============================================
+
+
+# I have provided a gene sequence in the data directory:
+# S288C_YDL056W_MBP1_coding.fsa is the yeast Mbp1 FASTA sequence.
+
+# read it
+mbp1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
+
+# You will notice that this generates a Warning message:
+#      Warning message:
+#        In readLines("./data/S288C_YDL056W_MBP1_coding.fsa") :
+#        incomplete final line found on './data/S288C_YDL056W_MBP1_coding.fsa'
+
+# The reason for this is that the last character of the file is the letter "A"
+# and not a "\n" line break. This file is exactly how it was sent from the
+# NCBI server; I think good, defensive programming practice would have been to
+# include some kind of an end-marker in the file, like a final "\n". This helps
+# us recognize an incomplete transmission. Let's parse the actual sequence from
+# the file, and then check for completeness.
+
+
+head(mbp1)
+
+# drop the first line (header)
+mbp1 <- mbp1[-1]
+head(mbp1)
+
+# concatenate it all to a single string
+mbp1 <- paste(mbp1, sep = "", collapse = "")
+
+# how long is it?
+nchar(mbp1)
+
+# how many codons?
+nchar(mbp1)/3
+
+# That looks correct for the 833 aa sequence plus 1 stop codon. This gives us a
+# first verification that the file we read is complete: the number of
+# nucleotides in a complete ORF must be divisible by 3.
+
+# Extract the codons. There are many ways to split a long string into chunks
+# of three characters. Here we use the Biostrings  codons()  function. codons()
+# requires an object of type DNAString - a special kind of string with
+# attributes that are useful for Biostrings. Thus we convert the sequence first
+# with DNAString(), then split it up, then convert it into a plain
+# character vector.
+mbp1Codons <- as.character(Biostrings::codons(Biostrings::DNAString(mbp1)))
+
+head(mbp1Codons)
+
+# now translate each codon
+
+# Pre-allocate the result with one element per codon. Deriving the length
+# from mbp1Codons (instead of hard-coding 834) keeps the vector the right
+# size for any input sequence.
+mbp1AA <- character(length(mbp1Codons))
+for (i in seq_along(mbp1Codons)) {
+  mbp1AA[i] <- GC[mbp1Codons[i]]   # look up the amino acid by codon name
+}
+
+head(mbp1Codons)
+head(mbp1AA)
+
+tail(mbp1Codons)
+tail(mbp1AA) # Note the stop!
+
+# The TAA "ochre" stop codon is our second verification that the nucleotide
+# sequence is complete: a stop codon can't appear internally in an ORF.
+
+# We can work with the mbp1AA vector, for example to tabulate the
+# amino acid frequencies:
+table(mbp1AA)
+sort(table(mbp1AA), decreasing = TRUE)
+
+# Or we can paste all elements together into a single string. But let's remove
+# the stop, it's not actually a part of the sequence. To remove the last element
+# of a vector, re-assign it with a vector minus the index of the last element:
+mbp1AA <- mbp1AA[-(length(mbp1AA))]
+tail(mbp1AA) # Note the stop is gone!
+
+# paste it together, collapsing the elements using an empty string as the
+# separation-character (i.e.: nothing)
+(Mbp1 <- paste(mbp1AA, sep = "", collapse = ""))
+
+
+# =    3  An alternative representation: 3D array  =============================
+
+
+# We don't use 3D arrays often - usually just 2D tables and data frames, so
+# here is a good opportunity to review the syntax of 3D arrays with a
+# genetic code cube:
+
+# Initialize, using A G C T as the names of the elements in each dimension
+cCube <- array(data     = character(64),
+               dim      = c(4, 4, 4),
+               dimnames = list(c("A", "G", "C", "T"),
+                               c("A", "G", "C", "T"),
+                               c("A", "G", "C", "T")))
+
+# fill it with amino acid codes using three nested loops: i, j, and k index
+# the first, second, and third codon position respectively
+for (i in seq_len(4)) {
+  for (j in seq_len(4)) {
+    for (k in seq_len(4)) {
+      # assemble the codon from the dimension names at the current indices
+      myCodon <- paste0(dimnames(cCube)[[1]][i],
+                        dimnames(cCube)[[2]][j],
+                        dimnames(cCube)[[3]][k])
+      cCube[i, j, k] <- GC[myCodon]   # look up the amino acid by codon name
+    }
+  }
+}
+
+# confirm
+cCube["A", "T", "G"] # methionine
+cCube["T", "T", "T"] # phenylalanine
+cCube["T", "A", "G"] # stop (amber)
+
+
+
+# ==   3.1  Print a Genetic code table  ========================================
+
+
+# The data structure of our cCube is well suited to print a table. In the
+# "standard" way to print the genetic code, we write codons with the same
+# second nucleotide in columns, and arrange rows in blocks of same
+# first nucleotide, varying the third nucleotide fastest. This maximizes the
+# similarity of adjacent amino acids in the table if we print the
+# nucleotides in the order T C A G. It's immediately obvious that the code
+# is not random: the universal genetic code is exceptionally error tolerant in
+# the sense that mutations (or single-nucleotide translation errors) are likely
+# to result in an amino acid with similar biophysical properties as the
+# original.
+
+nuc <- c("T", "C", "A", "G")
+
+# (calling variables f, s, t to indicate first, second, and third position ...)
+for (f in nuc) {      # first varies in blocks
+  for (t in nuc) {    # third varies in columns
+    for (s in nuc) {  # second varies in rows
+      cat(sprintf("%s%s%s: %s   ", f, s, t, cCube[f, s, t]))
+    }
+    cat("\n")
+  }
+  cat("\n")
+}
+
+
+# =    4  Tasks  ===============================================================
+
+
+# Task: What do you need to change to print the table with U instead
+#         of T? Try it.
+
+
+# Task: Point mutations are more often transitions (purine -> purine;
+#         pyrimidine -> pyrimidine) than transversions (purine -> pyrimidine;
+#         pyrimidine -> purine), even though twice as many transversions
+#         are possible in the code. This is most likely due to a deamination /
+#         tautomerization process that favours C -> T changes. If the code
+#         indeed minimizes the effect of mutations, you would expect that
+#         codons that differ by a transition code for more similar amino acids
+#         than codons that differ by a transversion. Is that true? List the set
+#         of all amino acid pairs that are encoded by codons with a C -> T
+#         transition. Then list the set of amino acid pairs with a C -> A
+#         transversion. Which set of pairs is more similar?
+
+
+# Task: How many stop codons do the two mbp1-gene derived amino acid sequences
+#         have if you translate them in the 2. or the 3. frame?
+
+
+# Task: How does the amino acid composition change if you translate the mbp1
+#         gene with the Alternative Yeast Nuclear code that is used by the
+#         "CTG clade" of fungi?
+#         (cf. https://en.wikipedia.org/wiki/Alternative_yeast_nuclear_code )
+
+# Solution:
+
+    # Fetch the code
+    Biostrings::GENETIC_CODE_TABLE
+    Biostrings::GENETIC_CODE_TABLE$name[Biostrings::GENETIC_CODE_TABLE$id=="12"]
+    altYcode <- Biostrings::getGeneticCode("12")
+
+    # what's the difference?
+    (delta <- which(Biostrings::GENETIC_CODE != altYcode))
+
+    Biostrings::GENETIC_CODE[delta]
+    altYcode[delta]
+
+    # translate: pre-allocate one element per codon (derived from the data,
+    # not hard-coded), then look up each codon in the alternative code
+    altYAA <- character(length(mbp1Codons))
+    for (i in seq_along(mbp1Codons)) {
+      altYAA[i] <- altYcode[mbp1Codons[i]]
+    }
+
+    # Compare the compositions. Note: altYAA is translated from all codons
+    # and therefore includes the terminal stop "*"; if the stop was removed
+    # from mbp1AA earlier in the script, expect one extra "*" in altYAA.
+    table(mbp1AA)
+    table(altYAA)
+
+# Task: The genetic code has significant redundancy, i.e. there are up to six
+#         codons that code for the same amino acid. Write code that lists how
+#         many amino acids are present how often i.e. it should tell you that
+#         two amino acids are encoded only with a single codon, three amino
+#         acids have six codons, etc. Solution below, but don't peek. There
+#         are many possible ways to do this.
+#
+#
+# Solution:
+( x <- table(table(Biostrings::GENETIC_CODE)) )
+
+# confirm
+sum(x * as.numeric(names(x)))
+
+
+
+# [END]
--- a/FND-MAT-Graphs_and_networks.R
+++ b/FND-MAT-Graphs_and_networks.R
--- a/FND-STA-Information_theory.R
+++ b/FND-STA-Information_theory.R
@ -1,224 +1,224 @@
-# tocID <- "FND-STA-Information_theory.R"
-#
-# ==============================================================================
-#
-# Purpose:  A Bioinformatics Course:
-#              R code accompanying the FND-STA-Information_theory unit.
-#
-# Version:  0.2.1
-#
-# Date:     2017 - 2021
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           0.2.1  Maintenance
-#           0.2    Under development
-#           0.1    First code copied from 2016 material.
-#
-#
-# TODO:
-#
-#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC> 
-#TOC>   Section  Title                  Line
-#TOC> --------------------------------------
-#TOC>   1        ___Section___            39
-#TOC> 
-#TOC> ==========================================================================
-
-
-# =    1  ___Section___  =======================================================
-
-# What level of information is "significant"
-
-# Assume the background distribution is the database frequencies of
-# amino acids:
-
-AAref <- numeric()  # Uniprot frequencies October 2017, slightly adjusted to
-# sum to 1.0
-AAref["A"] <- 0.0904
-AAref["C"] <- 0.0123
-AAref["D"] <- 0.0545
-AAref["E"] <- 0.0617
-AAref["F"] <- 0.0394
-AAref["G"] <- 0.0724
-AAref["H"] <- 0.0221
-AAref["I"] <- 0.0573
-AAref["K"] <- 0.0504
-AAref["L"] <- 0.0986
-AAref["M"] <- 0.0240
-AAref["N"] <- 0.0392
-AAref["P"] <- 0.0486
-AAref["Q"] <- 0.0381
-AAref["R"] <- 0.0570
-AAref["S"] <- 0.0673
-AAref["T"] <- 0.0558
-AAref["V"] <- 0.0686
-AAref["W"] <- 0.0129
-AAref["Y"] <- 0.0294
-sum(AAref)
-
-# Function to calculate Shannon entropy
-H <- function(pmf) {
-  # Calculate Shannon entropy
-  # Parameters:
-  #   pmf (numeric) probability mass function: a vector of states and
-  #                 associated probabilities. Each element of
-  #                 pmf must be in (0, 1] and sum(pmf) must be 1.
-  # Value:
-  #   Shannon entropy in bits.
-  # Examples:
-  #   H(c(A=0.25, C=0.25, G=0.25, T=0.25))  # 2 bits entropy in a random
-  #                                         # nucleotide sequence
-  #   H(1)     # If all elements are the same, entropy is zero
-  #
-  if (any(pmf <= 0 | pmf > 1) || isFALSE(all.equal(1.0, sum(pmf)))) {
-    stop("Input is not a discrete probability distribution.")
-  }
-  H <- -sum(pmf * (log(pmf) / log(2)))
-  return(H)
-}
-
-# Why use all.equal()? Exact comparisons with floating point numbers are
-# brittle. Consider for example:
-1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6 == 1
-print(1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6, digits = 22) # 0.9999999999999998889777
-# all.equal() tests for _near_ equality with tolerance of ~ 1.5e-8
-
-
-
-# Entropy of the database frequencies (in bits):
-(Href <- H(AAref))
-
-# for comparison: entropy if all amino acids are equiprobable
-H(rep(0.05, 20))
-
-
-# Set up a simulation to estimate the distribution of Information values
-# from random sequences drawn from AAref. This is the distribution for the
-# statistical null hypothesis:
-nObs <- 15                      # number of observations (e.g aligned sequences)
-# nObs <- 80
-nTrials <- 10000                # number of trials
-IObs <- numeric(nTrials)        # vector to store Information in each trial
-simCounts <- numeric(20)        # vector to tabulate our information ...
-names(simCounts) <- names(AAref)# ... with the names of AAref
-
-
-for (i in 1:nTrials) {  # simulate ...
-
-  # sample AAref letters, nObs times, with the probabilities of AAref:
-  AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE)
-
-  x <- table(AAobs)                            # table simulated observations
-  simCounts[1:20] <- rep(0, length(simCounts)) # initialize simCounts to 0
-  simCounts[names(x)] <- x                     # overwrite with observed counts
-  simCounts <- simCounts + 0.5                 # add Jeffreys' pseudocounts
-  Hobs <- H(simCounts/sum(simCounts))          # counts to frequency, calc. H
-  IObs[i] <- Href - Hobs                       # store information
-}
-
-# evaluate
-hist(IObs, col = "#C9F4E3", xlim = c(-0.2, 1.0), breaks = 25)
-abline(v = quantile(IObs, c(0.05, 0.95)), col = "#AA00CC")
-
-# The purple lines are drawn at the 5% quantiles of the Iobs distributions -
-# i.e. an actual observation that lies outside the purple lines is deemed
-# "significant"(1)(2). Of course, this is only true to the degree that the
-# database frequencies are a valid model for the null-hypothesis on the
-# sequence position we are considering here.
-
-#  (1) If we use 5% quantiles, this means a value is significantly larger
-#      than expected, and we ignore cases when the value is < 0; if we
-#      consider both smaller and larger values, we need to use 2.5% quantiles,
-#      since 5% of all observations lie outside the 0.025 and 0.975
-#      quantiles.
-#
-#  (2) For an actual observation of counts, we calculate its observed
-#      _empirical_p_Value_ as (nCounts + 1)/(nTotal + 1).
-
-
-# You can probably now appreciate that information is a bit of a shortcut for
-# biological sequences, and does not really take the different inherent
-# frequencies based on the character of the amino acids into account. For
-# example, L is the most frequent and C is the least frequent, but if we have an
-# alignment of 1000 sequences and we see that the frequencies for L and C are
-# swapped, that would be _very_ surprising - nevertheless, the information would
-# be 0. In order to take that into account, we should actually compute
-# Kullback-Leibler divergences.
-
-
-# Swap C and L frequencies
-p <- AAref
-q <- AAref
-q["L"] <- AAref["C"]
-q["C"] <- AAref["L"]
-H(p)
-H(q)
-
-KLdiv <- function(p, q) {
-  # p and q are two pmfs of discrete probability distributions
-  # with the same outcomes, which are nowhere 0.
-  # Value:  Kullback-Leibler divergence  sum(p * log( p / q))).
-
-  if (length(p) != length(q)) {
-    stop("PANIC: input vector lengths differ!")
-  }
-  if (any(c((p == 0), (q == 0)))) {
-    stop("PANIC: 0's found in input vectors!")
-  }
-
-  return(sum(p * log( p / q )))
-}
-
-KLdiv(p, p)
-KLdiv(p, q)
-
-
-nObs <- 15                      # number of observations (e.g aligned sequences)
-# nObs <- 80
-nTrials <- 10000                # number of trials
-KLdivObs <- numeric(nTrials)        # vector to store Information in each trial
-simCounts <- numeric(20)        # vector to tabulate our information ...
-names(simCounts) <- names(AAref)# ... with the names of AAref
-
-
-for (i in 1:nTrials) {  # simulate ...
-
-  # sample AAref letters, nObs times, with the probabilities of AAref:
-  AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE)
-
-  x <- table(AAobs)                            # table simulated observations
-  simCounts[1:20] <- rep(0, length(simCounts)) # initialize simCounts to 0
-  simCounts[names(x)] <- x                     # overwrite with observed counts
-  simCounts <- simCounts + 0.5                 # add Jeffreys' pseudocounts
-  simCounts <- simCounts/sum(simCounts)        # counts to frequency
-  KLdivObs[i] <- sum(simCounts * log( simCounts / AAref )) # store KLdiv
-}
-
-# evaluate
-hist(KLdivObs, col = "#C9F4E3", breaks = 25)
-abline(v = quantile(KLdivObs, c(0.05, 0.95)), col = "#AA00CC")
-quantile(KLdivObs, 0.992)
-
-# Running the simulation with KL does not give a fundamentally
-# different behaviour - since we are just randomly sampling. But KL would be
-# more sensitive in case there is biological selection, where the sampling is no
-# longer random. If I run the same simulation, with nObs <- 80 but calculating
-# KLdiv instead of information, I get a 5% quantile at 0.15 - but the C/L
-# frequency swap gives me a KL divergence of 0.18 - this is significant at p =
-# 0.008 - (remember, Information is 0 in this case). So that's actually quite a
-# nice addition to the toolbox.
-
-
-# [END]
+# tocID <- "FND-STA-Information_theory.R"
+#
+# ==============================================================================
+#
+# Purpose:  A Bioinformatics Course:
+#              R code accompanying the FND-STA-Information_theory unit.
+#
+# Version:  0.2.1
+#
+# Date:     2017 - 2021
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           0.2.1  Maintenance
+#           0.2    Under development
+#           0.1    First code copied from 2016 material.
+#
+#
+# TODO:
+#
+#
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+# going on. That's not how it works ...
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                  Line
+#TOC> --------------------------------------
+#TOC>   1        ___Section___            39
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  ___Section___  =======================================================
+
+# What level of information is "significant"
+
+# Assume the background distribution is the database frequencies of
+# amino acids:
+
+AAref <- numeric()  # Uniprot frequencies October 2017, slightly adjusted to
+# sum to 1.0
+AAref["A"] <- 0.0904
+AAref["C"] <- 0.0123
+AAref["D"] <- 0.0545
+AAref["E"] <- 0.0617
+AAref["F"] <- 0.0394
+AAref["G"] <- 0.0724
+AAref["H"] <- 0.0221
+AAref["I"] <- 0.0573
+AAref["K"] <- 0.0504
+AAref["L"] <- 0.0986
+AAref["M"] <- 0.0240
+AAref["N"] <- 0.0392
+AAref["P"] <- 0.0486
+AAref["Q"] <- 0.0381
+AAref["R"] <- 0.0570
+AAref["S"] <- 0.0673
+AAref["T"] <- 0.0558
+AAref["V"] <- 0.0686
+AAref["W"] <- 0.0129
+AAref["Y"] <- 0.0294
+sum(AAref)
+
+# Function to calculate Shannon entropy
+H <- function(pmf) {
+  # Calculate Shannon entropy
+  # Parameters:
+  #   pmf (numeric) probability mass function: a vector of states and
+  #                 associated probabilities. Each element of
+  #                 pmf must be in (0, 1] and sum(pmf) must be 1.
+  # Value:
+  #   Shannon entropy in bits.
+  # Errors:
+  #   stop()s if pmf is not numeric, contains NA, has an element outside
+  #   (0, 1], or does not sum to 1 (within the tolerance of all.equal()).
+  # Examples:
+  #   H(c(A=0.25, C=0.25, G=0.25, T=0.25))  # 2 bits entropy in a random
+  #                                         # nucleotide sequence
+  #   H(1)     # If all elements are the same, entropy is zero
+  #
+
+  # all.equal() returns either TRUE or a character description of the
+  # difference - never FALSE. The sum must therefore be tested with
+  # isTRUE(); isFALSE(all.equal(...)) would never detect a bad sum.
+  isPMF <- is.numeric(pmf) &&
+           !anyNA(pmf) &&
+           all(pmf > 0 & pmf <= 1) &&
+           isTRUE(all.equal(1.0, sum(pmf)))
+  if (!isPMF) {
+    stop("Input is not a discrete probability distribution.")
+  }
+
+  -sum(pmf * log2(pmf))   # log2() gives the result in bits
+}
+
+# Why use all.equal()? Exact comparisons with floating point numbers are
+# brittle. Consider for example:
+1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6 == 1
+print(1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6, digits = 22) # 0.9999999999999998889777
+# all.equal() tests for _near_ equality with tolerance of ~ 1.5e-8
+
+
+
+# Entropy of the database frequencies (in bits):
+(Href <- H(AAref))
+
+# for comparison: entropy if all amino acids are equiprobable
+H(rep(0.05, 20))
+
+
+# Set up a simulation to estimate the distribution of Information values
+# from random sequences drawn from AAref. This is the distribution for the
+# statistical null hypothesis:
+nObs <- 15                      # number of observations (e.g aligned sequences)
+# nObs <- 80
+nTrials <- 10000                # number of trials
+IObs <- numeric(nTrials)        # vector to store Information in each trial
+simCounts <- numeric(length(AAref))  # vector to tabulate our counts ...
+names(simCounts) <- names(AAref)     # ... with the names of AAref
+
+
+for (i in seq_len(nTrials)) {  # simulate ...
+
+  # sample AAref letters, nObs times, with the probabilities of AAref:
+  AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE)
+
+  x <- table(AAobs)                            # table simulated observations
+  simCounts[] <- 0                             # reset all counts, keep names
+  simCounts[names(x)] <- x                     # overwrite with observed counts
+  simCounts <- simCounts + 0.5                 # add Jeffreys' pseudocounts
+  Hobs <- H(simCounts/sum(simCounts))          # counts to frequency, calc. H
+  IObs[i] <- Href - Hobs                       # store information
+}
+
+# evaluate
+hist(IObs, col = "#C9F4E3", xlim = c(-0.2, 1.0), breaks = 25)
+abline(v = quantile(IObs, c(0.05, 0.95)), col = "#AA00CC")
+
+# The purple lines are drawn at the 5% and 95% quantiles of the IObs
+# distribution - i.e. an actual observation that lies outside the purple lines
+# is deemed "significant"(1)(2). Of course, this is only true to the degree
+# that the database frequencies are a valid model for the null-hypothesis on
+# the sequence position we are considering here.
+
+#  (1) If we use 5% quantiles, this means a value is significantly larger
+#      than expected, and we ignore cases when the value is < 0; if we
+#      consider both smaller and larger values, we need to use 2.5% quantiles,
+#      since 5% of all observations lie outside the 0.025 and 0.975
+#      quantiles.
+#
+#  (2) For an actual observation of counts, we calculate its observed
+#      _empirical_p_Value_ as (nCounts + 1)/(nTotal + 1).
+
+
+# You can probably now appreciate that information is a bit of a shortcut for
+# biological sequences, and does not really take the different inherent
+# frequencies based on the character of the amino acids into account. For
+# example, L is the most frequent and C is the least frequent, but if we have an
+# alignment of 1000 sequences and we see that the frequencies for L and C are
+# swapped, that would be _very_ surprising - nevertheless, the information would
+# be 0. In order to take that into account, we should actually compute
+# Kullback-Leibler divergences.
+
+
+# Swap C and L frequencies
+p <- AAref
+q <- AAref
+q["L"] <- AAref["C"]
+q["C"] <- AAref["L"]
+H(p)
+H(q)
+
+KLdiv <- function(p, q) {
+  # Kullback-Leibler divergence between two discrete distributions.
+  # Parameters:
+  #   p, q (numeric) pmfs of two discrete probability distributions over
+  #                  the same outcomes; neither may contain a 0.
+  # Value:
+  #   sum(p * log(p / q)), computed with the natural logarithm.
+
+  sameLength <- length(p) == length(q)
+  if (! sameLength) {
+    stop("PANIC: input vector lengths differ!")
+  }
+
+  hasZero <- any(c(p == 0, q == 0))
+  if (hasZero) {
+    stop("PANIC: 0's found in input vectors!")
+  }
+
+  logRatio <- log(p / q)
+  sum(p * logRatio)
+}
+
+KLdiv(p, p)
+KLdiv(p, q)
+
+
+nObs <- 15                      # number of observations (e.g aligned sequences)
+# nObs <- 80
+nTrials <- 10000                # number of trials
+KLdivObs <- numeric(nTrials)    # vector to store the KL divergence of each trial
+simCounts <- numeric(length(AAref))  # vector to tabulate our counts ...
+names(simCounts) <- names(AAref)     # ... with the names of AAref
+
+
+for (i in seq_len(nTrials)) {  # simulate ...
+
+  # sample AAref letters, nObs times, with the probabilities of AAref:
+  AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE)
+
+  x <- table(AAobs)                            # table simulated observations
+  simCounts[] <- 0                             # reset all counts, keep names
+  simCounts[names(x)] <- x                     # overwrite with observed counts
+  simCounts <- simCounts + 0.5                 # add Jeffreys' pseudocounts
+  simCounts <- simCounts/sum(simCounts)        # counts to frequency
+  KLdivObs[i] <- KLdiv(simCounts, AAref)       # store KLdiv (simulated vs. ref)
+}
+
+# evaluate
+hist(KLdivObs, col = "#C9F4E3", breaks = 25)
+abline(v = quantile(KLdivObs, c(0.05, 0.95)), col = "#AA00CC")
+quantile(KLdivObs, 0.992)
+
+# Running the simulation with KL does not give a fundamentally
+# different behaviour - since we are just randomly sampling. But KL would be
+# more sensitive in case there is biological selection, where the sampling is no
+# longer random. If I run the same simulation, with nObs <- 80 but calculating
+# KLdiv instead of information, I get a 5% quantile at 0.15 - but the C/L
+# frequency swap gives me a KL divergence of 0.18 - this is significant at p =
+# 0.008 - (remember, Information is 0 in this case). So that's actually quite a
+# nice addition to the toolbox.
+
+
+# [END]
--- a/FND-STA-Probability_distribution.R
+++ b/FND-STA-Probability_distribution.R
--- a/FND-STA-Significance.R
+++ b/FND-STA-Significance.R
@ -1,351 +1,351 @@
-# tocID <- "FND-STA-Significance.R"
-#
-#
-# Purpose:  A Bioinformatics Course:
-#              R code accompanying the FND-STA-Significance unit.
-#
-# Version:  1.3
-#
-# Date:     2017-09  - 2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           1.3    2020 Maintenance. Add sample solution.
-#           1.2    Update set.seed() usage
-#           1.1    Corrected treatment of empirical p-value
-#           1.0    First contents
-#
-# TODO:
-#
-#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC> 
-#TOC>   Section  Title                                              Line
-#TOC> ------------------------------------------------------------------
-#TOC>   1        Significance and p-value                             49
-#TOC>   1.1        Significance levels                                60
-#TOC>   1.2        probability and p-value                            77
-#TOC>   1.2.1          p-value illustrated                           109
-#TOC>   2        One- or two-sided                                   165
-#TOC>   3        Significance by integration                         209
-#TOC>   4        Significance by simulation or permutation           215
-#TOC>   5        Final tasks                                         327
-#TOC>   6        Sample solutions                                    336
-#TOC>   6.1                                                          338
-#TOC>   6.2                                                          342
-#TOC>   6.3                                                          346
-#TOC> 
-#TOC> ==========================================================================
-
-
-# =    1  Significance and p-value  ============================================
-
-# The idea of the probability of an event has a precise mathematical
-# interpretation, but how is it useful to know the probability? Usually we are
-# interested in whether we should accept or reject a hypothesis based on the
-# observations we have. A rational way to do this is to say: if the probability
-# of observing the data is very small under the null-hypothesis, then we will
-# assume the observation is due to something other than the null-hypothesis. But
-# what do we mean by the "probability of our observation"? And what is "very
-# small"?
-
-# ==   1.1  Significance levels  ===============================================
-
-# A "very small" probability is purely a matter of convention - a cultural
-# convention. In the biomedical field we usually call probabilities of less then
-# 0.05 (5%) small enough to reject the null-hypothesis. Thus we call
-# observations with a probability of less than 0.05 "significant" and if we want
-# to highlight this in text or in a graph, we often mark them with an asterisk
-# (*). Also we often call observations with a probability of less than 0.01
-# "highly significant" and mark them with two asterisks (**). But there is no
-# special significance in these numbers, the cutoff point for significance could
-# also be 0.0498631, or 0.03, or 1/(pi^3). 0.05 is just the value that the
-# British statistician Ronald Fisher happened to propose for this purpose in
-# 1925. Incidentally, Fisher later recommended to use different cutoffs for
-# different purposes (cf.
-# https://en.wikipedia.org/wiki/Statistical_significance).
-
-
-# ==   1.2  probability and p-value  ===========================================
-
-# But what do we even mean by the probability of an observation?
-# Assume I am drawing samples from a normal distribution with a mean of 0 and a
-# standard deviation of 1. The sample I get is ...
-
-set.seed(sqrt(5))
-x <- rnorm(1)
-set.seed(NULL)
-
-print(x, digits = 22)
-# [1] -0.8969145466249813791748
-
-# So what's the probability of that number? Obviously, the probability of
-# getting exactly this number is very, very, very small. But also obviously,
-# this does not mean that observing this number is in any way significant - we
-# always observe some number. That's not what we mean in this case. There are
-# several implicit assumptions when we speak of the probability of an
-# observation:
-
-# 1: the observation can be compared to a probability distribution;
-# 2: that distribution can be integrated between any specific value
-#      and its upper and lower bounds (or +- infinity).
-
-# Then what we really mean by the probability of an observation in the context
-# of that distribution is: the probability of observing that value, or a value
-# more extreme than the one we have. We call this the p-value. Note that we are
-# not talking about an individual number anymore, we are talking about the area
-# under the curve between our observation and the upper (or lower) bound of the
-# curve, as a fraction of the whole.
-
-
-# ===   1.2.1  p-value illustrated                      
-
-# Let's illustrate. First we draw a million random values from our
-# standard, normal distribution:
-
-N <- 1e6                             # one million
-set.seed(112358)                     # set RNG seed for repeatable randomness
-r <- rnorm(N)                        # N values from a normal distribution
-set.seed(NULL)                       # reset the RNG
-
-# Let's see what the distribution looks like:
-
-(h <- hist(r))
-
-# The histogram details are now available in the list h -  e.g. h$counts
-
-# Where is the value we have drawn previously?
-abline(v = x, col = "#EE0000")
-
-# How many values are smaller?
-sum(r < x)
-
-# Let's color the bars:
-#    first, make a vector of red and green colors for the bars with breaks
-#    smaller and larger then x, white for the bar that contains x ...
-hCol <- rep("#EE000044", sum(h$breaks < x) - 1)
-hCol <- c(hCol, "#FFFFFFFF")
-hCol <- c(hCol, rep("#00EE0044", sum(h$breaks > x) - 1))
-# ... then plot the histogram, with colored bars ...
-hist(r, col = hCol)
-# ... add two colored rectangles into the white bar ...
-idx <- sum(h$breaks < x)
-xMin <- h$breaks[idx]
-xMax <- h$breaks[idx + 1]
-y <- h$counts[idx]
-rect(xMin, 0, x, y, col = "#EE000044", border = TRUE)
-rect(x, 0, xMax, y, col = "#00EE0044", border = TRUE)
-# ... and a red line for our observation.
-abline(v = x, col = "#EE0000", lwd = 2)
-
-# The p-value of our observation is the red area as a fraction of the
-# whole histogram (red + green).
-
-
-# Task:
-#    Explain how the expression sum(r < x) works to give us a count of values
-#    with the property we are looking for. E.g., examine -4:4 < x
-
-# Task:
-#    Write an expression to estimate the probability that a value
-#    drawn from the vector r is less-or-equal to x. The result you get
-#    will depend on the exact values that went into the vector r but it should
-#    be close to 0.185  That expression is the p-value associated with x.
-#    (Sample solution 6.1)
-
-
-# =    2  One- or two-sided  ===================================================
-
-# The shape of our histogram confirms that the rnorm() function has returned
-# values that appear distributed according to a normal distribution. In a normal
-# distribution, readily available tables tell us that 5% of the values (i.e. our
-# significance level) lie 1.96 (or approximately 2) standard deviations away
-# from the mean. Is this the case here? How many values in our vector r are
-# larger than 1.96?
-
-sum(r > 1.96)
-# [1] 24589
-
-# Wait - that's about 2.5% of 1,000,000, not 5% as expected. Why?
-
-# The answer is: we have to be careful with two-sided distributions. 2 standard
-# deviations away from the mean means either larger or smaller than 1.96 . This
-# can give rise to errors. If we are simply are interested in outliers, no
-# matter larger or smaller, then the 1.96 SD cutoff for significance is correct.
-# But if we are specifically interested in, say, larger values, because a
-# smaller value is not meaningful, then the significance cutoff, expressed as
-# standard deviations, is relaxed. We can use the quantile function to see what
-# the cutoff values are:
-
-quantile(r)
-quantile(r, probs = c(0.025, 0.975)) # for the symmetric 2.5% boundaries
-# close to ± 1.96, as expected
-quantile(r, probs = 0.95) # for the single 5% boundary
-# close to 1.64 . Check counts to confirm:
-sum(r > quantile(r, probs = 0.95))
-# [1] 50000
-# which is 5%, as expected.
-
-# Task:
-# Use abline() to add the p = 0.05 boundary for smaller values to the histogram.
-# (Sample solution 6.2)
-
-# To summarize: when we evaluate the significance of an event, we divide a
-# probability distribution into two parts at the point where the event was
-# observed. We then ask whether the integral over the more extreme part is less
-# or more than 5% of the whole. If it is less, we deem the event to be
-# significant.
-#
-
-
-# =    3  Significance by integration  =========================================
-
-# If the underlying probability distribution can be analytically or numerically
-# integrated, the siginificance of an observation can be directly computed.
-
-
-# =    4  Significance by simulation or permutation  ===========================
-
-# But whether the integration is correct, or relies on assumptions that may not
-# be warranted for biological data, can be a highly technical question.
-# Fortunately, we can often simply run a simulation, a random resampling, or a
-# permutation and then count the number of outcomes, just as we did with our
-# rnorm() samples. We call this an empirical p-value. (Actually, the "empirical
-# p-value" is defined as (Nobs + 1) / (N + 1).  )
-
-# Here is an example. Assume you have a protein sequence and
-# you speculate that positively charged residues are close to negatively charged
-# residues to balance charge locally. A statistic that would capture this is the
-# mean minimum distance between all D,E residues and the closest R,K,H
-# residue. Let's compute this for the sequence of yeast Mbp1.
-
-MBP1 <- paste0("MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK",
-               "ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA",
-               "SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR",
-               "KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ",
-               "QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS",
-               "PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY",
-               "FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS",
-               "IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP",
-               "SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT",
-               "ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP",
-               "VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK",
-               "IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR",
-               "QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK",
-               "IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA")
-
-# first we split this string into individual characters:
-v <- unlist(strsplit(MBP1, ""))
-
-# and find the positions of our charged residues
-
-ED  <- grep("[ED]", v)
-RKH <- grep("[RKH]", v)
-
-sep <- numeric(length(ED)) # this vector will hold the distances
-for (i in seq_along(ED)) {
-  sep[i] <- min(abs(RKH - ED[i]))
-}
-
-# Task: read and explain this bit of code
-
-# Now that sep is computed, what does it look like?
-
-table(sep)  # these are the minimum distances
-# 24 of D,E residues are adjacent to R,K,H;
-# the longest separation is 28 residues.
-
-# What is the mean separation?
-mean(sep)
-
-# The value is 4.1 . Is this significant? Honestly, I would be hard pressed
-# to solve this analytically. But by permutation it's soooo easy.
-
-# First, we combine what we have done above into a function:
-
-chSep <- function(v) {
-  # computes the mean minimum separation of oppositely charged residues
-  # Parameter: v (char) a vector of amino acids in the one-letter code
-  # Value: msep (numeric) mean minimum separation
-
-  ED  <- grep("[EDed]", v)
-  RKH <- grep("[RKHrkh]", v)
-
-  sep <- numeric(length(ED))
-  for (i in seq_along(ED)) {
-    sep[i] <- min(abs(RKH - ED[i]))
-  }
-  return(mean(sep))
-}
-
-# Execute the function to define it.
-
-# Confirm that the function gives the same result as the number we
-# calculated above:
-chSep(v)
-
-# Now we can produce a random permutation of v, and recalculate
-
-set.seed(pi)                       # set RNG seed for repeatable randomness
-w <- sample(v, length(v))          # This shuffles the vector v. Memorize this
-                                   # code paradigm. It is very useful.
-set.seed(NULL)                     # reset the RNG
-
-
-
-chSep(w)
-# 3.773 ... that's actually less than what we had before.
-
-# Let's do this 10000 times and record the results (takes a few seconds):
-
-N <- 10000
-chs <- numeric(N)
-for (i in 1:N) {
-  chs[i] <- chSep(sample(v, length(v))) # charge
-}
-
-hist(chs, breaks = 50)
-abline(v = chSep(v), col = "#EE0000")
-
-# Contrary to our expectations, the actual observed mean minimum charge
-# separation seems to be larger than what we observe in randomly permuted
-# sequences. But is this significant? Your task to find out.
-
-# Task:
-# Calculate the empirical p-value for chsep(v)
-# (Sample solution 6.3)
-
-
-# =    5  Final tasks  =========================================================
-
-# From chs, compute the empirical p-value of a mean minimum charge separation to
-#   be larger or equal to the value observed for the yeast MBP1 sequence. Note
-#   the result in your journal. Is it significant? Also note the result of
-#   the following expression for validation:
-seal(sum(chs))
-
-
-# =    6  Sample solutions  ====================================================
-
-# ==   6.1    ==================================================================
-#
-sum(r <= x) / length(r)
-
-# ==   6.2    ==================================================================
-#
-abline(v = quantile(r, probs = c(0.05)))
-
-# ==   6.3    ==================================================================
-#
-( x <- (sum(chs >= chSep(v)) + 1) / (length(chs) + 1) )
-
-
-# [END]
+# tocID <- "FND-STA-Significance.R"
+#
+#
+# Purpose:  A Bioinformatics Course:
+#              R code accompanying the FND-STA-Significance unit.
+#
+# Version:  1.3
+#
+# Date:     2017-09  - 2020-09
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.3    2020 Maintenance. Add sample solution.
+#           1.2    Update set.seed() usage
+#           1.1    Corrected treatment of empirical p-value
+#           1.0    First contents
+#
+# TODO:
+#
+#
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+# going on. That's not how it works ...
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                                              Line
+#TOC> ------------------------------------------------------------------
+#TOC>   1        Significance and p-value                             49
+#TOC>   1.1        Significance levels                                60
+#TOC>   1.2        probability and p-value                            77
+#TOC>   1.2.1          p-value illustrated                           109
+#TOC>   2        One- or two-sided                                   165
+#TOC>   3        Significance by integration                         209
+#TOC>   4        Significance by simulation or permutation           215
+#TOC>   5        Final tasks                                         327
+#TOC>   6        Sample solutions                                    336
+#TOC>   6.1                                                          338
+#TOC>   6.2                                                          342
+#TOC>   6.3                                                          346
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  Significance and p-value  ============================================
+
+# The idea of the probability of an event has a precise mathematical
+# interpretation, but how is it useful to know the probability? Usually we are
+# interested in whether we should accept or reject a hypothesis based on the
+# observations we have. A rational way to do this is to say: if the probability
+# of observing the data is very small under the null-hypothesis, then we will
+# assume the observation is due to something other than the null-hypothesis. But
+# what do we mean by the "probability of our observation"? And what is "very
+# small"?
+
+# ==   1.1  Significance levels  ===============================================
+
+# A "very small" probability is purely a matter of convention - a cultural
+# convention. In the biomedical field we usually call probabilities of less than
+# 0.05 (5%) small enough to reject the null-hypothesis. Thus we call
+# observations with a probability of less than 0.05 "significant" and if we want
+# to highlight this in text or in a graph, we often mark them with an asterisk
+# (*). Also we often call observations with a probability of less than 0.01
+# "highly significant" and mark them with two asterisks (**). But there is no
+# special significance in these numbers, the cutoff point for significance could
+# also be 0.0498631, or 0.03, or 1/(pi^3). 0.05 is just the value that the
+# British statistician Ronald Fisher happened to propose for this purpose in
+# 1925. Incidentally, Fisher later recommended to use different cutoffs for
+# different purposes (cf.
+# https://en.wikipedia.org/wiki/Statistical_significance).
+
+
+# ==   1.2  probability and p-value  ===========================================
+
+# But what do we even mean by the probability of an observation?
+# Assume I am drawing samples from a normal distribution with a mean of 0 and a
+# standard deviation of 1. The sample I get is ...
+
+set.seed(sqrt(5))
+x <- rnorm(1)
+set.seed(NULL)
+
+print(x, digits = 22)
+# [1] -0.8969145466249813791748
+
+# So what's the probability of that number? Obviously, the probability of
+# getting exactly this number is very, very, very small. But also obviously,
+# this does not mean that observing this number is in any way significant - we
+# always observe some number. That's not what we mean in this case. There are
+# several implicit assumptions when we speak of the probability of an
+# observation:
+
+# 1: the observation can be compared to a probability distribution;
+# 2: that distribution can be integrated between any specific value
+#      and its upper and lower bounds (or +- infinity).
+
+# Then what we really mean by the probability of an observation in the context
+# of that distribution is: the probability of observing that value, or a value
+# more extreme than the one we have. We call this the p-value. Note that we are
+# not talking about an individual number anymore, we are talking about the area
+# under the curve between our observation and the upper (or lower) bound of the
+# curve, as a fraction of the whole.
+
+
+# ===   1.2.1  p-value illustrated                      
+
+# Let's illustrate. First we draw a million random values from our
+# standard, normal distribution:
+
+N <- 1e6                             # one million
+set.seed(112358)                     # set RNG seed for repeatable randomness
+r <- rnorm(N)                        # N values from a normal distribution
+set.seed(NULL)                       # reset the RNG
+
+# Let's see what the distribution looks like:
+
+(h <- hist(r))
+
+# The histogram details are now available in the list h -  e.g. h$counts
+
+# Where is the value we have drawn previously?
+abline(v = x, col = "#EE0000")
+
+# How many values are smaller?
+sum(r < x)
+
+# Let's color the bars:
+#    first, make a vector of red and green colors for the bars with breaks
+#    smaller and larger than x, white for the bar that contains x ...
+hCol <- rep("#EE000044", sum(h$breaks < x) - 1)
+hCol <- c(hCol, "#FFFFFFFF")
+hCol <- c(hCol, rep("#00EE0044", sum(h$breaks > x) - 1))
+# ... then plot the histogram, with colored bars ...
+hist(r, col = hCol)
+# ... add two colored rectangles into the white bar ...
+idx <- sum(h$breaks < x)
+xMin <- h$breaks[idx]
+xMax <- h$breaks[idx + 1]
+y <- h$counts[idx]
+rect(xMin, 0, x, y, col = "#EE000044", border = TRUE)
+rect(x, 0, xMax, y, col = "#00EE0044", border = TRUE)
+# ... and a red line for our observation.
+abline(v = x, col = "#EE0000", lwd = 2)
+
+# The p-value of our observation is the red area as a fraction of the
+# whole histogram (red + green).
+
+
+# Task:
+#    Explain how the expression sum(r < x) works to give us a count of values
+#    with the property we are looking for. E.g., examine -4:4 < x
+
+# Task:
+#    Write an expression to estimate the probability that a value
+#    drawn from the vector r is less-or-equal to x. The result you get
+#    will depend on the exact values that went into the vector r but it should
+#    be close to 0.185. That expression is the p-value associated with x.
+#    (Sample solution 6.1)
+
+
+# =    2  One- or two-sided  ===================================================
+
+# The shape of our histogram confirms that the rnorm() function has returned
+# values that appear distributed according to a normal distribution. In a normal
+# distribution, readily available tables tell us that 5% of the values (i.e. our
+# significance level) lie more than 1.96 (or approximately 2) standard
+# deviations away from the mean. Is this the case here? How many values in our
+# vector r are larger than 1.96?
+
+sum(r > 1.96)
+# [1] 24589
+
+# Wait - that's about 2.5% of 1,000,000, not 5% as expected. Why?
+
+# The answer is: we have to be careful with two-sided distributions. 2 standard
+# deviations away from the mean means larger than 1.96 or smaller than -1.96.
+# This can give rise to errors. If we are simply interested in outliers, no
+# matter larger or smaller, then the 1.96 SD cutoff for significance is correct.
+# But if we are specifically interested in, say, larger values, because a
+# smaller value is not meaningful, then the significance cutoff, expressed as
+# standard deviations, is relaxed. We can use the quantile function to see what
+# the cutoff values are:
+
+quantile(r)
+quantile(r, probs = c(0.025, 0.975)) # for the symmetric 2.5% boundaries
+# close to ± 1.96, as expected
+quantile(r, probs = 0.95) # for the single 5% boundary
+# close to 1.64 . Check counts to confirm:
+sum(r > quantile(r, probs = 0.95))
+# [1] 50000
+# which is 5%, as expected.
+
+# Task:
+# Use abline() to add the p = 0.05 boundary for smaller values to the histogram.
+# (Sample solution 6.2)
+
+# To summarize: when we evaluate the significance of an event, we divide a
+# probability distribution into two parts at the point where the event was
+# observed. We then ask whether the integral over the more extreme part is less
+# or more than 5% of the whole. If it is less, we deem the event to be
+# significant.
+#
+
+
+# =    3  Significance by integration  =========================================
+
+# If the underlying probability distribution can be analytically or numerically
+# integrated, the significance of an observation can be directly computed.
+
+
+# =    4  Significance by simulation or permutation  ===========================
+
+# But whether the integration is correct, or relies on assumptions that may not
+# be warranted for biological data, can be a highly technical question.
+# Fortunately, we can often simply run a simulation, a random resampling, or a
+# permutation and then count the number of outcomes, just as we did with our
+# rnorm() samples. We call this an empirical p-value. (Actually, the "empirical
+# p-value" is defined as (Nobs + 1) / (N + 1).  )
+
+# Here is an example. Assume you have a protein sequence and
+# you speculate that positively charged residues are close to negatively charged
+# residues to balance charge locally. A statistic that would capture this is the
+# mean minimum distance between all D,E residues and the closest R,K,H
+# residue. Let's compute this for the sequence of yeast Mbp1.
+
+MBP1 <- paste0("MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK",
+               "ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA",
+               "SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR",
+               "KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ",
+               "QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS",
+               "PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY",
+               "FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS",
+               "IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP",
+               "SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT",
+               "ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP",
+               "VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK",
+               "IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR",
+               "QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK",
+               "IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA")
+
+# first we split this string into individual characters:
+v <- unlist(strsplit(MBP1, ""))
+
+# and find the positions of our charged residues
+
+ED  <- grep("[ED]", v)
+RKH <- grep("[RKH]", v)
+
+sep <- numeric(length(ED)) # this vector will hold the distances
+for (i in seq_along(ED)) {
+  sep[i] <- min(abs(RKH - ED[i]))
+}
+
+# Task: read and explain this bit of code
+
+# Now that sep is computed, what does it look like?
+
+table(sep)  # these are the minimum distances
+# 24 of D,E residues are adjacent to R,K,H;
+# the longest separation is 28 residues.
+
+# What is the mean separation?
+mean(sep)
+
+# The value is 4.1 . Is this significant? Honestly, I would be hard pressed
+# to solve this analytically. But by permutation it's soooo easy.
+
+# First, we combine what we have done above into a function:
+
+chSep <- function(v) {
+  # computes the mean minimum separation of oppositely charged residues
+  # Parameter: v (char) a vector of amino acids in the one-letter code
+  # Value: msep (numeric) mean minimum separation
+
+  ED  <- grep("[EDed]", v)
+  RKH <- grep("[RKHrkh]", v)
+
+  sep <- numeric(length(ED))
+  for (i in seq_along(ED)) {
+    sep[i] <- min(abs(RKH - ED[i]))
+  }
+  return(mean(sep))
+}
+
+# Execute the function to define it.
+
+# Confirm that the function gives the same result as the number we
+# calculated above:
+chSep(v)
+
+# Now we can produce a random permutation of v, and recalculate
+
+set.seed(pi)                       # set RNG seed for repeatable randomness
+w <- sample(v, length(v))          # This shuffles the vector v. Memorize this
+                                   # code paradigm. It is very useful.
+set.seed(NULL)                     # reset the RNG
+
+
+
+chSep(w)
+# 3.773 ... that's actually less than what we had before.
+
+# Let's do this 10000 times and record the results (takes a few seconds):
+
+N <- 10000
+chs <- numeric(N)
+for (i in 1:N) {
+  chs[i] <- chSep(sample(v, length(v))) # charge
+}
+
+hist(chs, breaks = 50)
+abline(v = chSep(v), col = "#EE0000")
+
+# Contrary to our expectations, the actual observed mean minimum charge
+# separation seems to be larger than what we observe in randomly permuted
+# sequences. But is this significant? Your task is to find out.
+
+# Task:
+# Calculate the empirical p-value for chSep(v)
+# (Sample solution 6.3)
+
+
+# =    5  Final tasks  =========================================================
+
+# From chs, compute the empirical p-value of a mean minimum charge separation to
+#   be larger or equal to the value observed for the yeast MBP1 sequence. Note
+#   the result in your journal. Is it significant? Also note the result of
+#   the following expression for validation:
+seal(sum(chs))
+
+
+# =    6  Sample solutions  ====================================================
+
+# ==   6.1    ==================================================================
+#
+sum(r <= x) / length(r)
+
+# ==   6.2    ==================================================================
+#
+abline(v = quantile(r, probs = c(0.05)))
+
+# ==   6.3    ==================================================================
+#
+( x <- (sum(chs >= chSep(v)) + 1) / (length(chs) + 1) )
+
+
+# [END]
--- a/README.md
+++ b/README.md
@ -1,3 +1,3 @@
-# BCH441-WORK-ABC-units
-
+# BCH441-WORK-ABC-units
+
 This is a fork of the project [ABC-units](https://github.com/hyginn/ABC-units) designed for BCH441. This setup allows changes to be committed here but updates pushed to the original repository can be fetched and pulled to keep up to date.
--- a/RPR-Biostrings.R
+++ b/RPR-Biostrings.R
@ -1,245 +1,245 @@
-# tocID <- "RPR-Biostrings.R"
-#
-# Purpose:  A Bioinformatics Course:
-#              R code accompanying the RPR-Biostrings unit.
-#
-# Version:  1.2
-#
-# Date:     2017-10  -  2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           1.2    2020 Updates
-#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout,
-#                      use Biocmanager:: not biocLite()
-#           1.0    2017 Revisions
-#           0.1    First code copied from 2016 material.
-#
-#
-# TODO:
-#
-#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC> 
-#TOC>   Section  Title                                             Line
-#TOC> -----------------------------------------------------------------
-#TOC>   1        The Biostrings:: Package                            56
-#TOC>   2        Getting Data into Biostrings:: Objects              88
-#TOC>   3        Working with Biostrings:: Objects                  110
-#TOC>   3.1        Properties                                       127
-#TOC>   3.2        Subsetting                                       168
-#TOC>   3.3        Operators                                        180
-#TOC>   3.4        Transformations                                  187
-#TOC>   4        Getting Data out of Biostrings:: Objects           194
-#TOC>   5        More                                               203
-#TOC>   5.1        Views                                            205
-#TOC>   5.2        Iranges                                          219
-#TOC>   5.3        StringSets                                       225
-#TOC> 
-#TOC> ==========================================================================
-
-
-# This is a very brief introduction to the Biostrings:: package, other units will
-# be using more of the Biostrings:: functions.
-
-
-# =    1  The Biostrings:: Package  ============================================
-
-
-# First, we install and load the Biostrings:: package from bioconductor (if we
-# haven't done so already).
-
-if (! requireNamespace("BiocManager", quietly = TRUE)) {
-  install.packages("BiocManager")
-}
-if (! requireNamespace("Biostrings", quietly = TRUE)) {
-  BiocManager::install("Biostrings")
-}
-# Examine the package information:
-library(help = Biostrings)       # basic information
-browseVignettes("Biostrings")    # available vignettes
-data(package = "Biostrings")     # available datasets
-
-
-# At its core, Biostrings:: objects are "classes" of type XString (you can think
-# of a "class" in R as a special kind of list), that can take on particular
-# flavours for RNA, DNA or amino acid sequence information.
-
-class(Biostrings::RNAString("AUG"))
-class(Biostrings::DNAString("ATG"))
-class(Biostrings::AAString("M"))
-
-# An essential property of Biostrings:: objects is that they only allow letters
-# from the applicable IUPAC alphabet:
-Biostrings::RNAString("AUG")
-Biostrings::DNAString("AUG")  # Error! No "U" in IUPAC DNA codes
-
-
-# =    2  Getting Data into Biostrings:: Objects  ==============================
-
-
-# Example: read FASTA. Extract sequence. Convert to DNAString object.
-rawSeq <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
-rawSeq <- dbSanitizeSequence(rawSeq)
-biosDNAseq <- Biostrings::DNAString(rawSeq) # converts the nucleotide sequence
-                                            # into an object of class DNAstring
-
-# Multi FASTA files can be read directly as a "XStringSet) ...
-rawMFAfile <- "./data/S288C_YDL056W_MBP1_coding.fsa"
-(biosDNASet <- Biostrings::readDNAStringSet(rawMFAfile))
-
-# ... and if you subset one sequence from the set, you get an XString object
-# back again.
-(Xseq <- biosDNASet[[1]])
-
-biosDNAseq == Xseq           # the comparison evaluates to TRUE ...
-identical(biosDNAseq, Xseq)  # ... and indeed the objects are deemed identical.
-
-
-
-# =    3  Working with Biostrings:: Objects  ===================================
-
-# Biostrings:: is a highly engineered package that is tightly integrated into
-# the Bioconductor world - unfortunately that brings with it a somewhat
-# undesirable level of computational overhead and dependencies. Using the
-# package as we normally do - i.e. calling required functions with their
-# explicit package prefix is therefore not advisable. There are generics
-# that won't be propery dispatched. If you only need a small number of
-# functions for a very specific context, you will probably get away with
-# Biostrings::<function>() - but even in the demonstration code of this script
-# not everything works out of the box. We'll therefore load the library,
-# but we'll (redundantly) use the prefix anyway so as to emphasize where
-# the functions come from.
-
-library(Biostrings)
-
-
-# ==   3.1  Properties  ========================================================
-str(rawSeq)
-str(biosDNAseq)
-
-length(rawSeq)       # ... is 1: one string only. To get the number of
-                     # characters in a string, you need nchar().
-length(biosDNAseq)   # but the length of a "Bstring" is the number of elements
-nchar(rawSeq)
-nchar(biosDNAseq)    # ... but nchar() works too.
-
-(uL <- Biostrings::uniqueLetters(biosDNAseq))
-
-# Count frequencies - with strings, you would strsplit() into a character
-# vector and then use table(). biost
-Biostrings::alphabetFrequency(biosDNAseq)
-
-# letterFrequency() works with a defined alphabet - such as what uniqueLetters()
-# returns.
-Biostrings::letterFrequency(biosDNAseq, uL)
-sum(Biostrings::letterFrequency(biosDNAseq, c("G", "C"))) /
-  length(biosDNAseq) # GC contents
-
-Biostrings::dinucleotideFrequency(biosDNAseq)
-barplot(sort(Biostrings::dinucleotideFrequency(biosDNAseq)), cex.names = 0.5)
-
-(triNuc <- Biostrings::trinucleotideFrequency(biosDNAseq))
-barplot(sort(triNuc), col="#4499EE33")
-triNuc[triNuc == max(triNuc)]
-triNuc[triNuc == min(triNuc)]
-max(triNuc) / min(triNuc)  # AAA is more than 13 times as frequent as CGT
-
-# compare to a shuffled sequence:
-(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))
-barplot(sort(triNuc), col="#EEEE4433", add = TRUE)
-max(triNuc)
-# Interpret this plot.
-(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))
-barplot(sort(triNuc), col="#EEEE4433")
-max(triNuc)
-
-
-# ==   3.2  Subsetting  ========================================================
-
-# Subsetting any XString object works as expected:
-biosDNAseq[4:15]
-
-# ... well - maybe not expected, because rawSeq[4:15] would not work.
-
-# Alternatively to the "[" operator, use the subseq() function - especially for
-# long sequences. This is far more efficient.
-Biostrings::subseq(biosDNAseq, start = 1, end = 30)
-
-
-# ==   3.3  Operators  =========================================================
-
-# RNAstring() and DNAstring() objects compare U and T as equals!
-  Biostrings::RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") ==
-  Biostrings::DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT")
-
-
-# ==   3.4  Transformations  ===================================================
-
-biosDNAseq[4:15]
-Biostrings::reverseComplement(biosDNAseq[4:15])
-Biostrings::translate(biosDNAseq[4:15])
-
-
-# =    4  Getting Data out of Biostrings:: Objects  ============================
-
-# If you need a character object, use toString():
-
-Biostrings::toString(biosDNAseq[4:15])
-
-# saveRDS() and readRDS() works like on all other R objects.
-
-
-# =    5  More  ================================================================
-
-# ==   5.1  Views  =============================================================
-
-# Biostring "Views" are objects that store multiple substrings of one
-# Biostring object.
-
-(myView <- Biostrings::Views(biosDNAseq,
-                             start = c(1, 19, 37),
-                             end = c(15, 30, 45)))
-
-# Views are convenient to store feature annotations
-names(myView) <- c("Feature-A", "Feature-B", "Feature-C")
-cat(sprintf("\n%s\t(%d)\t%s", names(myView), width(myView), myView ))
-
-
-# ==   5.2  Iranges  ===========================================================
-
-# Biostrings:: Iranges are like Views with a common start point. These can be
-# useful for feature annotations. Instead of start/end you store start/width.
-
-
-# ==   5.3  StringSets  ========================================================
-
-# Biostring "StringSets" store multiple sequences.
-#
-ompA <- Biostrings::AAString("MKKTAIAIAVALAGFATVAQA")
-sample(ompA) # sample can work directly on a Biostring object to shuffle it
-
-x <- Biostrings::toString(ompA)
-for (i in 2:10) {
-  x[i] <- Biostrings::toString(sample(ompA))
-}
-shuffledPeptideSet <- Biostrings::AAStringSet(x)
-names(shuffledPeptideSet) <- c("ompA", paste("shuffle.", 1:9, sep=""))
-shuffledPeptideSet
-
-length(shuffledPeptideSet)
-Biostrings::width(shuffledPeptideSet)
-Biostrings::alphabetFrequency(shuffledPeptideSet)
-
-
-# [END]
+# tocID <- "RPR-Biostrings.R"
+#
+# Purpose:  A Bioinformatics Course:
+#              R code accompanying the RPR-Biostrings unit.
+#
+# Version:  1.2
+#
+# Date:     2017-10  -  2020-09
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.2    2020 Updates
+#           1.1    Change from require() to requireNamespace(),
+#                      use <package>::<function>() idiom throughout,
+#                      use BiocManager:: not biocLite()
+#           1.0    2017 Revisions
+#           0.1    First code copied from 2016 material.
+#
+#
+# TODO:
+#
+#
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+# going on. That's not how it works ...
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                                             Line
+#TOC> -----------------------------------------------------------------
+#TOC>   1        The Biostrings:: Package                            56
+#TOC>   2        Getting Data into Biostrings:: Objects              88
+#TOC>   3        Working with Biostrings:: Objects                  110
+#TOC>   3.1        Properties                                       127
+#TOC>   3.2        Subsetting                                       168
+#TOC>   3.3        Operators                                        180
+#TOC>   3.4        Transformations                                  187
+#TOC>   4        Getting Data out of Biostrings:: Objects           194
+#TOC>   5        More                                               203
+#TOC>   5.1        Views                                            205
+#TOC>   5.2        Iranges                                          219
+#TOC>   5.3        StringSets                                       225
+#TOC> 
+#TOC> ==========================================================================
+
+
+# This is a very brief introduction to the Biostrings:: package, other units will
+# be using more of the Biostrings:: functions.
+
+
+# =    1  The Biostrings:: Package  ============================================
+
+
+# First, we install and load the Biostrings:: package from bioconductor (if we
+# haven't done so already).
+
+if (! requireNamespace("BiocManager", quietly = TRUE)) {
+  install.packages("BiocManager")
+}
+if (! requireNamespace("Biostrings", quietly = TRUE)) {
+  BiocManager::install("Biostrings")
+}
+# Examine the package information:
+library(help = Biostrings)       # basic information
+browseVignettes("Biostrings")    # available vignettes
+data(package = "Biostrings")     # available datasets
+
+
+# At its core, Biostrings:: objects are "classes" of type XString (you can think
+# of a "class" in R as a special kind of list), that can take on particular
+# flavours for RNA, DNA or amino acid sequence information.
+
+class(Biostrings::RNAString("AUG"))
+class(Biostrings::DNAString("ATG"))
+class(Biostrings::AAString("M"))
+
+# An essential property of Biostrings:: objects is that they only allow letters
+# from the applicable IUPAC alphabet:
+Biostrings::RNAString("AUG")
+Biostrings::DNAString("AUG")  # Error! No "U" in IUPAC DNA codes
+
+
+# =    2  Getting Data into Biostrings:: Objects  ==============================
+
+
+# Example: read FASTA. Extract sequence. Convert to DNAString object.
+rawSeq <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
+rawSeq <- dbSanitizeSequence(rawSeq)
+biosDNAseq <- Biostrings::DNAString(rawSeq) # converts the nucleotide sequence
+                                            # into an object of class DNAString
+
+# Multi FASTA files can be read directly as an "XStringSet" ...
+rawMFAfile <- "./data/S288C_YDL056W_MBP1_coding.fsa"
+(biosDNASet <- Biostrings::readDNAStringSet(rawMFAfile))
+
+# ... and if you subset one sequence from the set, you get an XString object
+# back again.
+(Xseq <- biosDNASet[[1]])
+
+biosDNAseq == Xseq           # the comparison evaluates to TRUE ...
+identical(biosDNAseq, Xseq)  # ... and indeed the objects are deemed identical.
+
+
+
+# =    3  Working with Biostrings:: Objects  ===================================
+
+# Biostrings:: is a highly engineered package that is tightly integrated into
+# the Bioconductor world - unfortunately that brings with it a somewhat
+# undesirable level of computational overhead and dependencies. Using the
+# package as we normally do - i.e. calling required functions with their
+# explicit package prefix is therefore not advisable. There are generics
+# that won't be properly dispatched. If you only need a small number of
+# functions for a very specific context, you will probably get away with
+# Biostrings::<function>() - but even in the demonstration code of this script
+# not everything works out of the box. We'll therefore load the library,
+# but we'll (redundantly) use the prefix anyway so as to emphasize where
+# the functions come from.
+
+library(Biostrings)
+
+
+# ==   3.1  Properties  ========================================================
+str(rawSeq)
+str(biosDNAseq)
+
+length(rawSeq)       # ... is 1: one string only. To get the number of
+                     # characters in a string, you need nchar().
+length(biosDNAseq)   # but the length of a "Bstring" is the number of elements
+nchar(rawSeq)
+nchar(biosDNAseq)    # ... but nchar() works too.
+
+(uL <- Biostrings::uniqueLetters(biosDNAseq))
+
+# Count frequencies - with strings, you would strsplit() into a character
+# vector and then use table(). With Biostrings:: use alphabetFrequency():
+Biostrings::alphabetFrequency(biosDNAseq)
+
+# letterFrequency() works with a defined alphabet - such as what uniqueLetters()
+# returns.
+Biostrings::letterFrequency(biosDNAseq, uL)
+sum(Biostrings::letterFrequency(biosDNAseq, c("G", "C"))) /
+  length(biosDNAseq) # GC contents
+
+Biostrings::dinucleotideFrequency(biosDNAseq)
+barplot(sort(Biostrings::dinucleotideFrequency(biosDNAseq)), cex.names = 0.5)
+
+(triNuc <- Biostrings::trinucleotideFrequency(biosDNAseq))
+barplot(sort(triNuc), col="#4499EE33")
+triNuc[triNuc == max(triNuc)]
+triNuc[triNuc == min(triNuc)]
+max(triNuc) / min(triNuc)  # AAA is more than 13 times as frequent as CGT
+
+# compare to a shuffled sequence:
+(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))
+barplot(sort(triNuc), col="#EEEE4433", add = TRUE)
+max(triNuc)
+# Interpret this plot.
+(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))
+barplot(sort(triNuc), col="#EEEE4433")
+max(triNuc)
+
+
+# ==   3.2  Subsetting  ========================================================
+
+# Subsetting any XString object works as expected:
+biosDNAseq[4:15]
+
+# ... well - maybe not expected, because rawSeq[4:15] would not work.
+
+# Alternatively to the "[" operator, use the subseq() function - especially for
+# long sequences. This is far more efficient.
+Biostrings::subseq(biosDNAseq, start = 1, end = 30)
+
+
+# ==   3.3  Operators  =========================================================
+
+# RNAString() and DNAString() objects compare U and T as equals!
+  Biostrings::RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") ==
+  Biostrings::DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT")
+
+
+# ==   3.4  Transformations  ===================================================
+
+biosDNAseq[4:15]
+Biostrings::reverseComplement(biosDNAseq[4:15])
+Biostrings::translate(biosDNAseq[4:15])
+
+
+# =    4  Getting Data out of Biostrings:: Objects  ============================
+
+# If you need a character object, use toString():
+
+Biostrings::toString(biosDNAseq[4:15])
+
+# saveRDS() and readRDS() work as they do on all other R objects.
+
+
+# =    5  More  ================================================================
+
+# ==   5.1  Views  =============================================================
+
+# Biostring "Views" are objects that store multiple substrings of one
+# Biostring object.
+
+(myView <- Biostrings::Views(biosDNAseq,
+                             start = c(1, 19, 37),
+                             end = c(15, 30, 45)))
+
+# Views are convenient to store feature annotations
+names(myView) <- c("Feature-A", "Feature-B", "Feature-C")
+cat(sprintf("\n%s\t(%d)\t%s", names(myView), width(myView), myView ))
+
+
+# ==   5.2  Iranges  ===========================================================
+
+# Biostrings:: Iranges are like Views with a common start point. These can be
+# useful for feature annotations. Instead of start/end you store start/width.
+
+
+# ==   5.3  StringSets  ========================================================
+
+# Biostring "StringSets" store multiple sequences.
+#
+ompA <- Biostrings::AAString("MKKTAIAIAVALAGFATVAQA")
+sample(ompA) # sample can work directly on a Biostring object to shuffle it
+
+x <- Biostrings::toString(ompA)
+for (i in 2:10) {
+  x[i] <- Biostrings::toString(sample(ompA))
+}
+shuffledPeptideSet <- Biostrings::AAStringSet(x)
+names(shuffledPeptideSet) <- c("ompA", paste("shuffle.", 1:9, sep=""))
+shuffledPeptideSet
+
+length(shuffledPeptideSet)
+Biostrings::width(shuffledPeptideSet)
+Biostrings::alphabetFrequency(shuffledPeptideSet)
+
+
+# [END]
--- a/RPR-ChimeraX_remote.R
+++ b/RPR-ChimeraX_remote.R
@ -1,165 +1,165 @@
-# tocID <- "RPR-ChimeraX_remote.R"
-#
-# Purpose:  A Bioinformatics Course:
-#              R code demonstrating remote scripting of ChimeraX.
-#
-# Version:  1.0.1
-#
-# Date:     2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           1.0.1  2021 Minimal updates
-#           1.0    First ABC units version
-#
-#
-# TODO:
-#    %-encode and escape quotes, or just pass-through?
-#
-#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC> 
-#TOC>   Section  Title                                  Line
-#TOC> ------------------------------------------------------
-#TOC>   1        ChimeraX REMOTE SCRIPTING                41
-#TOC>   1.1        Defining a Port                        59
-#TOC>   1.2        Open ChimeraX                          81
-#TOC>   2        WORKED EXAMPLE: SUPERPOSITION           113
-#TOC> 
-#TOC> ==========================================================================
-
-
-# =    1  ChimeraX REMOTE SCRIPTING  ===========================================
-
-
-# One of the cool features of ChimeraX is that it can be driven by Python code,
-# both within a running session and through Python scripts. What I find even
-# cooler though is that ChimeraX can be driven from any programming language via
-# its remote control function that can listen to commands sent from any other
-# application. The interface that is used here is the standard REST (method) -
-# the GET and POST verbs that ubiquitously underly the communication of clients
-# and servers on the Web.
-
-# In order to establish the communication between this script and ChimeraX, all
-# we need to do is:
-#  - open ChimeraX;
-#  - tell it to listen on a specific "port";
-#  - send commands to that port via httr::
-
-
-# ==   1.1  Defining a Port  ===================================================
-
-# The httr:: package needs to be available
-
-if (! requireNamespace("httr", quietly = TRUE)) {
-  install.packages("httr")
-}
-# Package information:
-#  library(help = httr)       # basic information
-#  browseVignettes("httr")    # available vignettes
-#  data(package = "httr")     # available datasets
-
-# We need to think od a port. Any available port number between 49152-65535 is
-# fine. We'll choose 61803 because that's the fractional part of the golden
-# ratio. But one could choose another.
-
-CXPORT <- 61803
-
-# Check that our current version of R supports sockets (default since V 3.3)
-capabilities("sockets")   # MUST be TRUE. If not, don't continue.
-
-
-# ==   1.2  Open ChimeraX  =====================================================
-
-#  - Open a fresh, new session of recently updated version of ChimeraX
-#  - type:
-#
-#       remotecontrol rest start port 61803
-#
-#    ... or whatever the value of CXPORT is.
-
-# Now watch what happens in ChimeraX when you execute the following line:
-( x <- httr::GET("http://127.0.0.1:61803/run?command=open+1BM8") )
-
-# The .utilities.R script includes the function CX(), based on this principle,
-# through which you can send commands to ChimeraX
-
-CX("camera sbs")
-CX("lighting soft")
-CX("color sequential #1 & protein target abc palette powderblue:orchid:white")
-
-# The command echos Chimera's response if the parameter "quietly" is
-# FALSE (default), and we can silence output with quietly = TRUE :
-CX("info models #1 attribute num_residues")
-CX("info models #1 attribute num_residues", quietly = TRUE)
-
-# Either way, the command also returns Chimera's responses "invisibly";
-# i.e. we can use the results by assigning the output to a variable:
-hBonds <- CX("hbonds #1 & protein makePseudobonds false log true", quietly=TRUE)
-x <- read.table(file = textConnection(hBonds), skip = 9,
-                blank.lines.skip = TRUE, fill = TRUE)
-hist(x[,13], main="H-bonds", xlab="D···A (Å)", ylab="counts", col="#c9dcff")
-
-
-# =    2  WORKED EXAMPLE: SUPERPOSITION  =======================================
-
-# We superimpose the 1BM8 structure with the 1DUX crystal structure to be able
-# to explore possible DNA binding regions in 1BM8
-
-# The model for 1BM8 is already open as model 1  (#1)
-CX("hide #1 cartoons")        # hide model 1 cartoon representation
-CX("open 1DUX")               # assume this is opened as model #2
-CX("hide #2")                 # hide everything ...
-CX("select #2/C")             # chain c (protein)
-CX("show sel cartoons")       # ... and show cartoons of chain c (protein)
-CX("color sequential sel target c palette steelblue:darkmagenta")
-CX("view #2/C")               # re-center the display
-CX("cofr #2/C:62@CA")         # set pivot to an interface residue
-CX("select #2/A,B & nucleic-acid") # chains A, B are the cognate DNA
-CX("style sel stick")
-CX("show sel target ab")      # show atoms/bonds
-CX("color sequential #2/A & nucleic-acid target ab palette teal:lightcyan")
-CX("color sequential #2/B & nucleic-acid target ab palette teal:lightcyan")
-CX("surface sel enclose sel") # compute joint accessible surface of both chains
-CX("transparency 50")
-CX("select clear")
-
-# Now superimpose the 1BM8 chain onto 1DUX chain C
-CX("show #1 cartoons")
-CX("matchmaker #1/A to #2/C pairing ss")  # the actual superposition
-
-# study the general layout, and the position of the 1mb8 secondary structure
-# elements relative to 1DUX
-
-# Let's examine side chain orientations in more detail
-CX("hide #2/C cartoons")  # hide the 1DUX protein
-
-# select all residues in 1BM8 that are within 3.5 A of the DNA chains (a, b)
-CX("select zone #2/A,B 3.5 #1 & protein residues true")
-CX("~select sel & H")  # de-select H atoms
-CX("show sel target ab")
-CX("size stickRadius 0.4")
-CX("select clear")
-
-# The overall architecture of the Mbp1 APSES domain is a good match for the Elk
-# transcription factor binding mode; the detailed conformations of side chains
-# would need to change only to a minor degree. There is a very significant
-# degree of structural similarity; remarkable, given that the DNA is not the
-# target sequence of the Mbp1 transcription factor, AND the 1MB8 structure was
-# determined without a DNA ligand.
-
-CX("remotecontrol rest stop")  # release the socket
-# Done.
-
-
-
-# [END]
+# tocID <- "RPR-ChimeraX_remote.R"
+#
+# Purpose:  A Bioinformatics Course:
+#              R code demonstrating remote scripting of ChimeraX.
+#
+# Version:  1.0.1
+#
+# Date:     2020-09
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.0.1  2021 Minimal updates
+#           1.0    First ABC units version
+#
+#
+# TODO:
+#    %-encode and escape quotes, or just pass-through?
+#
+#
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+# going on. That's not how it works ...
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                                  Line
+#TOC> ------------------------------------------------------
+#TOC>   1        ChimeraX REMOTE SCRIPTING                41
+#TOC>   1.1        Defining a Port                        59
+#TOC>   1.2        Open ChimeraX                          81
+#TOC>   2        WORKED EXAMPLE: SUPERPOSITION           113
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  ChimeraX REMOTE SCRIPTING  ===========================================
+
+
+# One of the cool features of ChimeraX is that it can be driven by Python code,
+# both within a running session and through Python scripts. What I find even
+# cooler though is that ChimeraX can be driven from any programming language via
+# its remote control function that can listen to commands sent from any other
+# application. The interface that is used here is the standard REST (method) -
+# the GET and POST verbs that ubiquitously underlie the communication of clients
+# and servers on the Web.
+
+# In order to establish the communication between this script and ChimeraX, all
+# we need to do is:
+#  - open ChimeraX;
+#  - tell it to listen on a specific "port";
+#  - send commands to that port via httr::
+
+
+# ==   1.1  Defining a Port  ===================================================
+
+# The httr:: package needs to be available
+
+if (! requireNamespace("httr", quietly = TRUE)) {
+  install.packages("httr")
+}
+# Package information:
+#  library(help = httr)       # basic information
+#  browseVignettes("httr")    # available vignettes
+#  data(package = "httr")     # available datasets
+
+# We need to think of a port. Any available port number between 49152-65535 is
+# fine. We'll choose 61803 because that's the fractional part of the golden
+# ratio. But one could choose another.
+
+CXPORT <- 61803
+
+# Check that our current version of R supports sockets (default since V 3.3)
+capabilities("sockets")   # MUST be TRUE. If not, don't continue.
+
+
+# ==   1.2  Open ChimeraX  =====================================================
+
+#  - Open a fresh, new session of a recently updated version of ChimeraX
+#  - type:
+#
+#       remotecontrol rest start port 61803
+#
+#    ... or whatever the value of CXPORT is.
+
+# Now watch what happens in ChimeraX when you execute the following line:
+( x <- httr::GET("http://127.0.0.1:61803/run?command=open+1BM8") )
+
+# The .utilities.R script includes the function CX(), based on this principle,
+# through which you can send commands to ChimeraX
+
+CX("camera sbs")
+CX("lighting soft")
+CX("color sequential #1 & protein target abc palette powderblue:orchid:white")
+
+# The command echoes Chimera's response if the parameter "quietly" is
+# FALSE (default), and we can silence output with quietly = TRUE :
+CX("info models #1 attribute num_residues")
+CX("info models #1 attribute num_residues", quietly = TRUE)
+
+# Either way, the command also returns Chimera's responses "invisibly";
+# i.e. we can use the results by assigning the output to a variable:
+hBonds <- CX("hbonds #1 & protein makePseudobonds false log true", quietly=TRUE)
+x <- read.table(file = textConnection(hBonds), skip = 9,
+                blank.lines.skip = TRUE, fill = TRUE)
+hist(x[,13], main="H-bonds", xlab="D···A (Å)", ylab="counts", col="#c9dcff")
+
+
+# =    2  WORKED EXAMPLE: SUPERPOSITION  =======================================
+
+# We superimpose the 1BM8 structure with the 1DUX crystal structure to be able
+# to explore possible DNA binding regions in 1BM8
+
+# The model for 1BM8 is already open as model 1  (#1)
+CX("hide #1 cartoons")        # hide model 1 cartoon representation
+CX("open 1DUX")               # assume this is opened as model #2
+CX("hide #2")                 # hide everything ...
+CX("select #2/C")             # chain c (protein)
+CX("show sel cartoons")       # ... and show cartoons of chain c (protein)
+CX("color sequential sel target c palette steelblue:darkmagenta")
+CX("view #2/C")               # re-center the display
+CX("cofr #2/C:62@CA")         # set pivot to an interface residue
+CX("select #2/A,B & nucleic-acid") # chains A, B are the cognate DNA
+CX("style sel stick")
+CX("show sel target ab")      # show atoms/bonds
+CX("color sequential #2/A & nucleic-acid target ab palette teal:lightcyan")
+CX("color sequential #2/B & nucleic-acid target ab palette teal:lightcyan")
+CX("surface sel enclose sel") # compute joint accessible surface of both chains
+CX("transparency 50")
+CX("select clear")
+
+# Now superimpose the 1BM8 chain onto 1DUX chain C
+CX("show #1 cartoons")
+CX("matchmaker #1/A to #2/C pairing ss")  # the actual superposition
+
+# study the general layout, and the position of the 1BM8 secondary structure
+# elements relative to 1DUX
+
+# Let's examine side chain orientations in more detail
+CX("hide #2/C cartoons")  # hide the 1DUX protein
+
+# select all residues in 1BM8 that are within 3.5 A of the DNA chains (a, b)
+CX("select zone #2/A,B 3.5 #1 & protein residues true")
+CX("~select sel & H")  # de-select H atoms
+CX("show sel target ab")
+CX("size stickRadius 0.4")
+CX("select clear")
+
+# The overall architecture of the Mbp1 APSES domain is a good match for the Elk
+# transcription factor binding mode; the detailed conformations of side chains
+# would need to change only to a minor degree. There is a very significant
+# degree of structural similarity; remarkable, given that the DNA is not the
+# target sequence of the Mbp1 transcription factor, AND the 1BM8 structure was
+# determined without a DNA ligand.
+
+CX("remotecontrol rest stop")  # release the socket
+# Done.
+
+
+
+# [END]
--- a/RPR-FASTA.R
+++ b/RPR-FASTA.R
@ -1,322 +1,322 @@
-# tocID <- "RPR-FASTA.R"
-#
-#
-# Purpose:  A Bioinformatics Course:
-#              R code accompanying the RPR-FASTA unit.
-#
-# Version:  1.1.2
-#
-# Date:     2017-10  -  2021-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           1.1.2  style update
-#           1.1.1  bugfix - wrong function name
-#           1.1    2020 Maintenance. Rewrite validation logic. Add data
-#                  to utilities. Define AACOLS
-#           1.0    New unit.
-#
-#
-# TODO: Make a simple solution first, then extend it to error checking, and
-#       to handle .mfa files.
-#
-#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC>
-#TOC>   Section  Title                                 Line
-#TOC> -----------------------------------------------------
-#TOC>   1        Reading and validating FASTA            45
-#TOC>   1.1        Validating FASTA                      81
-#TOC>   2        Parsing FASTA                          227
-#TOC>   3        Interpreting FASTA                     247
-#TOC>   4        Writing FASTA                          274
-#TOC>
-#TOC> ==========================================================================
-
-
-# =    1  Reading and validating FASTA  ========================================
-
-# FASTA is a text based format, structured in lines that are separated by
-# line-feed or paragraph-break characters. Which one of these is used, depends
-# on your operating system. But R's readLines() function knows how to handle
-# these correctly, accross platforms. Don't try to read such files "by hand".
-# Here is the yeast Mbp1 gene, via SGD.
-
-file.show("./data/S288C_YDL056W_MBP1_coding.fsa")
-faMBP1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
-
-# The warning is generated because the programmer at the NCBI who implemented
-# the code to write this FASTA file neglected to place a line-break character
-# after the last sequence character. While this is not technically incorrect,
-# it is poor practice: the resulting file can't be distinguished from one that
-# has been truncated in transmission.
-
-head(faMBP1)
-
-# Note that there are NO line-break characters ("\n") at the end of these
-# strings, even though they were present in the original file. readLines()
-# has "consumed" these characters while reading - but every single line is in
-# a vector of its own.
-
-tail(faMBP1)
-
-# Also note that the last line has fewer characters - this means readLines()
-# imported the whole line, despite it not being terminated by "\n".
-
-# It's very straightforward to work with such data, for example by collapsing
-# everything except the first line into a single string ...
-
-f <- c(faMBP1[1], paste(faMBP1[-1], sep = "", collapse = ""))
-
-f[1]
-nchar(f[2])
-
-# ==   1.1  Validating FASTA  ==================================================
-
-# The code above is making the assumption that everything from line 2 until
-#  the end IS sequence, the whole sequence and nothing but sequence.
-#  That assumption can break down in many ways:
-#
-#  - there could be more than one header line. The specification says otherwise,
-#       but some older files use multiple, consecutive header lines. You don't
-#       want that to end up in your sequence.
-#  - this could be not a FASTA file at all. It could be raw sequence, a
-#       different sequence file format, or a wholly different file altogether.
-#       If you look at the file, you can immediately tell, but if you are
-#       reading the file in a complex workflow, your could easily import wrong
-#       data into your analysis.
-#  - there could be more than one sequence in the file. Such Multi-FASTA files
-#       occur commonly, as downloads of ORFs from genome regions or other
-#       sets of genes or proteins, or as the input / output for multiple
-#       sequence alignment programs.
-#
-# Data "from the wild" can (and usually does) have the most unexpected
-# variations and it is really, really important to be clear about the
-# assumptions that you are making. It is possible to "fix" things, according
-# to the "Robustness Principle" :
-#      "Be conservative in what you send,
-#       be liberal in what you accept".
-#       (cf. https://en.wikipedia.org/wiki/Robustness_principle )
-# ... but if you think about this, that's actually a really poor idea,
-# which is much more likely to dilute standards, make unwarranted
-# assumptions, and allow errors to pass silently and corrupt data.
-#
-# Let's discard this principle on the trash-heap of
-# things-that-sound-like-a-good-idea-but-aren't. What we do instead is test,
-# identify problems, and follow the principle: "crash early, crash often". Of
-# course I can write code that would reformat any possible input as a FASTA
-# file - but what good will it do me if it parses the file I receive
-# from a server into FASTA format like:
-#
-#   >404- Page Not Found</title</head>
-#   dyh-PagentfndhpThepageyreqesteddesnteistnthisserverCheckthe
-#   spellingrcntacttheadministratrsdyhtml
-#
-# Therefore, we write ourselves a FASTA checker that will enforce the following:
-#   (1) a FASTA file contains one or more sequences separated by zero or
-#       more empty lines
-#   (2) a sequence contains one header line followed by
-#       one or more sequence lines
-#   (3) a sequence line contains one or more uppercase or lowercase single
-#       letter amino acid codes, hyphens (gap character), or * (stop).
-#
-#   Anything else should generate an error.
-
-#   (Case 1): Header(s) exist
-fX <- c("ABC",
-        "defghi",
-        "klmnpq")
-sel <- grepl("^>", fX)  # "^>" is a regular expression that
-                        # means: the exact character ">" at the
-                        # beginning ("^") of the line.
-if ( ! any(sel) ) { stop("no header lines in input.") }
-
-
-#   (Case 2) No adjacent header lines
-fX <- c(">ABC",
-        ">123",
-        "defghi",
-        "klmnpq")
-sel <- grepl("^>", fX)
-sel <- sel[- length(sel)] & sel[-1] # comparing shifted vectors
-if ( any(sel)) { stop("adjacent header lines in input.") }
-
-#   (Case 3.1) all sequence lines contain only valid characters
-#              (constants for valid characters AAVALID, NUCVALID, and NUCAMBIG
-#               are defined with the .utilities.R script)
-AAVALID
-fX <- c(">ABC",
-        "def ;-) ghi",
-        "klmnpq")
-myRegex <- sprintf("[^%s]", AAVALID)  # NOT a valid character
-sel <- ! grepl("^>", fX)              # NOT headers
-if (any(grepl(myRegex, fX[sel]))) {
-  stop("invalid chracter(s) outside of header lines.")
-}
-
-#   (Case 3.2) all headers are followed directly by
-#              at least one letter of sequence
-fX <- c(">ABC",
-        "",
-        ">123",
-        "defghi",
-        "klmnpq")
-sel <- grep("^>", fX) + 1             # indexes of headers + 1
-myRegex <- sprintf("[%s]+", AAVALID)  # at least one valid character
-if (! all(grepl(myRegex, fX[sel]))) {
-  stop("a header has no adjacent sequence.")
-}
-# Ah, you might ask - couldn't we just have dropped all empty lines, and
-# then caught this in Case 2? No - for two reasons: we would still miss headers
-# at the end of file, and, we would have changed the line numbering - and
-# ideally our "production" function will create information about where the
-# error is to be found.
-
-
-# Now combine this into a function ...
-
-val <- function(fa) {
-
-  if ( ! any(grepl("^>", fa)) ) {
-    stop("no header lines in input.")
-  }
-
-  sel <- grepl("^>", fa)
-  if ( any(sel[- length(sel)] & sel[-1])) {
-    stop("adjacent header lines in input.")
-  }
-
-  sel <- ! grepl("^>", fa)
-  if ( any(grepl(sprintf("[^%s]", AAVALID), fa[sel]))) {
-    stop("invalid chracter(s) outside of header lines.")
-  }
-
-  sel <- grep("^>", fa) + 1
-  if (! all(grepl(sprintf("[%s]+", AAVALID), fa[sel]))) {
-    stop("a header has no adjacent sequence.")
-  }
-
-  return(invisible(NULL))
-}
-
-# Here is an example
-FA <- c(">head1",
-        "acdef",
-        "ghi",
-        "",
-        ">head2",
-        "kl",
-        ">head3",
-        "mn",
-        "pqrs")
-val(FA)     # ... should not create an error
-
-
-# A somewhat more elaborate validateFA() function was loaded with the
-# ./utilities.R script. It needs a bit more bookkeeping, since NCBI multi-
-# fasta files have space-characters in their spacer lines. Try it ...
-validateFA(FA)
-
-# =    2  Parsing FASTA  =======================================================
-
-# Once we have validated our assumptions about our input, it's quite
-# painless to parse it. I have put this together as a function and the function
-# gets loaded from ./.utilities.R
-#
-
-# Lets try this:
-#   - the first 3 elements of faMBP1:
-readFASTA(faMBP1[1:3])
-
-#   - a multi FASTA file of aligned APSES domain sequences:
-
-refAPSES <- readFASTA("./data/refAPSES.mfa")
-
-# Subset the sequence with "P39678" in the header
-refAPSES[grep("P39678", refAPSES$head) ,]
-
-
-
-# =    3  Interpreting FASTA  ==================================================
-
-
-# FASTA files are straightforward to interpret - just one thing may be of note:
-# when working with strings, we can use substr(<string>, <start>, <stop>) to
-# extract substrings, but more often we expand the string into a vector of
-# single characters with strsplit(<string>, ""). strsplit() returns a list,
-# to accommodate that <string> could be a vector of many elements, therefore
-# we usually unlist() the result if we use it only on a single string.
-
-# Example: How many positive charged residues in "MBP1_SACCE"?
-
-s <- unlist(strsplit(refAPSES$seq[grep("MBP1_SACCE", refAPSES$head)], ""))
-s
-
-sum(grepl("[HKR]", s)) # 20 (+) charged residues. grepl() returns TRUE and FALSE
-                       # for the characters, sum() coerces to 1 and 0
-                       # respectively, and that gives us the result.
-
-100 * sum(grepl("[HKR]", s)) / length(s) # in percent: 20.2 %
-
-# residue distribution
-x <- factor(s, levels = names(AACOLS))
-pie(table(x)[names(AACOLS)], col = AACOLS)
-
-
-
-# =    4  Writing FASTA  =======================================================
-
-
-# Writing FASTA files is mostly just the reverse of reading, with one
-# twist: we need to break the long sequence string into chunks of the desired
-# width. The FASTA specification calls for a maximum of 120 characters per line,
-# but writing out much less than that is common, since it allows to comfortably
-# view lines on the console, or printing them on a sheet of paper (do we still
-# do that actually?). How do we break a string into chunks? A combination of
-# seq(<from>, <to>, <by>) with substring(<string>, <start>, <stop>) will work
-# nicely. (Note that substring() is vectorized, whereas substr() is not!) As we
-# loop through our FASTA object in memory, we can build the output by c()'ing
-# blocks of header + sequence to each other. For VERY large objects this might
-# be slow - in that case, we might want to precalculate the size of the output
-# object. But that's more of a hypothetical consideration.
-
-( s <- refAPSES$seq[2] )
-nchar(s)
-w <- 30     # width of chunk
-(starts <- seq(1, nchar(s), by = w))      # starting index of chunk
-(ends <- c((starts - 1)[-1], nchar(s)))   # ending index of chunk
-
-# Task: Is this safe? What happens if nchar(s) is shorter than w?
-#       What happens if nchar(s) is an exact multiple of w?
-
-substring(s, starts, ends)
-# confirm that the output contains the first and last residue, and both
-# residues adjacent to the breaks
-
-# As always, the function has been defined in ".utilities.R" for to use
-# any time...  type   writeFASTA  to examine it.
-
-# Let's try this...
-
-writeFASTA(refAPSES, width = 40)
-
-# roundtrip for validation: write refAPSES with a different format,
-# read it back in - the new dataframe must be identical
-# to the original dataframe.
-fname <- tempfile()
-writeFASTA(refAPSES, fn = fname, width = 30)
-identical(refAPSES, readFASTA(fname))
-
-# ...works for me  :-)
-
-
-# [END]
+# tocID <- "RPR-FASTA.R"
+#
+#
+# Purpose:  A Bioinformatics Course:
+#              R code accompanying the RPR-FASTA unit.
+#
+# Version:  1.1.2
+#
+# Date:     2017-10  -  2021-09
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.1.2  style update
+#           1.1.1  bugfix - wrong function name
+#           1.1    2020 Maintenance. Rewrite validation logic. Add data
+#                  to utilities. Define AACOLS
+#           1.0    New unit.
+#
+#
+# TODO: Make a simple solution first, then extend it to error checking, and
+#       to handle .mfa files.
+#
+#
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+# going on. That's not how it works ...
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC>
+#TOC>   Section  Title                                 Line
+#TOC> -----------------------------------------------------
+#TOC>   1        Reading and validating FASTA            45
+#TOC>   1.1        Validating FASTA                      81
+#TOC>   2        Parsing FASTA                          227
+#TOC>   3        Interpreting FASTA                     247
+#TOC>   4        Writing FASTA                          274
+#TOC>
+#TOC> ==========================================================================
+
+
+# =    1  Reading and validating FASTA  ========================================
+
+# FASTA is a text based format, structured in lines that are separated by
+# line-feed or paragraph-break characters. Which one of these is used, depends
+# on your operating system. But R's readLines() function knows how to handle
+# these correctly, across platforms. Don't try to read such files "by hand".
+# Here is the yeast Mbp1 gene, via SGD.
+
+file.show("./data/S288C_YDL056W_MBP1_coding.fsa")
+faMBP1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
+
+# The warning is generated because the programmer at the NCBI who implemented
+# the code to write this FASTA file neglected to place a line-break character
+# after the last sequence character. While this is not technically incorrect,
+# it is poor practice: the resulting file can't be distinguished from one that
+# has been truncated in transmission.
+
+head(faMBP1)
+
+# Note that there are NO line-break characters ("\n") at the end of these
+# strings, even though they were present in the original file. readLines()
+# has "consumed" these characters while reading - but every single line is
+# an element of its own in the resulting character vector.
+
+tail(faMBP1)
+
+# Also note that the last line has fewer characters - this means readLines()
+# imported the whole line, despite it not being terminated by "\n".
+
+# It's very straightforward to work with such data, for example by collapsing
+# everything except the first line into a single string ...
+
+f <- c(faMBP1[1], paste(faMBP1[-1], sep = "", collapse = ""))
+
+f[1]
+nchar(f[2])
+
+# ==   1.1  Validating FASTA  ==================================================
+
+# The code above is making the assumption that everything from line 2 until
+#  the end IS sequence, the whole sequence and nothing but sequence.
+#  That assumption can break down in many ways:
+#
+#  - there could be more than one header line. The specification says otherwise,
+#       but some older files use multiple, consecutive header lines. You don't
+#       want that to end up in your sequence.
+#  - this could be not a FASTA file at all. It could be raw sequence, a
+#       different sequence file format, or a wholly different file altogether.
+#       If you look at the file, you can immediately tell, but if you are
+#       reading the file in a complex workflow, you could easily import wrong
+#       data into your analysis.
+#  - there could be more than one sequence in the file. Such Multi-FASTA files
+#       occur commonly, as downloads of ORFs from genome regions or other
+#       sets of genes or proteins, or as the input / output for multiple
+#       sequence alignment programs.
+#
+# Data "from the wild" can (and usually does) have the most unexpected
+# variations and it is really, really important to be clear about the
+# assumptions that you are making. It is possible to "fix" things, according
+# to the "Robustness Principle" :
+#      "Be conservative in what you send,
+#       be liberal in what you accept".
+#       (cf. https://en.wikipedia.org/wiki/Robustness_principle )
+# ... but if you think about this, that's actually a really poor idea,
+# which is much more likely to dilute standards, make unwarranted
+# assumptions, and allow errors to pass silently and corrupt data.
+#
+# Let's discard this principle on the trash-heap of
+# things-that-sound-like-a-good-idea-but-aren't. What we do instead is test,
+# identify problems, and follow the principle: "crash early, crash often". Of
+# course I can write code that would reformat any possible input as a FASTA
+# file - but what good will it do me if it parses the file I receive
+# from a server into FASTA format like:
+#
+#   >404- Page Not Found</title</head>
+#   dyh-PagentfndhpThepageyreqesteddesnteistnthisserverCheckthe
+#   spellingrcntacttheadministratrsdyhtml
+#
+# Therefore, we write ourselves a FASTA checker that will enforce the following:
+#   (1) a FASTA file contains one or more sequences separated by zero or
+#       more empty lines
+#   (2) a sequence contains one header line followed by
+#       one or more sequence lines
+#   (3) a sequence line contains one or more uppercase or lowercase single
+#       letter amino acid codes, hyphens (gap character), or * (stop).
+#
+#   Anything else should generate an error.
+
+#   (Case 1): Header(s) exist
+fX <- c("ABC",
+        "defghi",
+        "klmnpq")
+sel <- grepl("^>", fX)  # "^>" is a regular expression that
+                        # means: the exact character ">" at the
+                        # beginning ("^") of the line.
+if ( ! any(sel) ) { stop("no header lines in input.") }
+
+
+#   (Case 2) No adjacent header lines
+fX <- c(">ABC",
+        ">123",
+        "defghi",
+        "klmnpq")
+sel <- grepl("^>", fX)
+sel <- sel[- length(sel)] & sel[-1] # comparing shifted vectors
+if ( any(sel)) { stop("adjacent header lines in input.") }
+
+#   (Case 3.1) all sequence lines contain only valid characters
+#              (constants for valid characters AAVALID, NUCVALID, and NUCAMBIG
+#               are defined with the .utilities.R script)
+AAVALID
+fX <- c(">ABC",
+        "def ;-) ghi",
+        "klmnpq")
+myRegex <- sprintf("[^%s]", AAVALID)  # NOT a valid character
+sel <- ! grepl("^>", fX)              # NOT headers
+if (any(grepl(myRegex, fX[sel]))) {
+  stop("invalid chracter(s) outside of header lines.")
+}
+
+#   (Case 3.2) all headers are followed directly by
+#              at least one letter of sequence
+fX <- c(">ABC",
+        "",
+        ">123",
+        "defghi",
+        "klmnpq")
+sel <- grep("^>", fX) + 1             # indexes of headers + 1
+myRegex <- sprintf("[%s]+", AAVALID)  # at least one valid character
+if (! all(grepl(myRegex, fX[sel]))) {
+  stop("a header has no adjacent sequence.")
+}
+# Ah, you might ask - couldn't we just have dropped all empty lines, and
+# then caught this in Case 2? No - for two reasons: we would still miss headers
+# at the end of file, and, we would have changed the line numbering - and
+# ideally our "production" function will create information about where the
+# error is to be found.
+
+
+# Now combine this into a function ...
+
+val <- function(fa) {
+  # Validate the FASTA lines in fa; stop() on the first rule that is violated.
+  if ( ! any(grepl("^>", fa)) ) {             # (1) no line starts with ">"
+    stop("no header lines in input.")
+  }
+  # (2) compare the header mask with itself shifted by one line
+  sel <- grepl("^>", fa)
+  if ( any(sel[- length(sel)] & sel[-1])) {   # two consecutive header lines
+    stop("adjacent header lines in input.")
+  }
+  # (3.1) AAVALID is not defined here; it is used as a regex character set
+  sel <- ! grepl("^>", fa)                    # every non-header line
+  if ( any(grepl(sprintf("[^%s]", AAVALID), fa[sel]))) {
+    stop("invalid chracter(s) outside of header lines.")
+  }
+  # (3.2) the line directly after each header must hold a valid character
+  sel <- grep("^>", fa) + 1                   # indexes of headers + 1
+  if (! all(grepl(sprintf("[%s]+", AAVALID), fa[sel]))) {
+    stop("a header has no adjacent sequence.")
+  }
+
+  return(invisible(NULL))                     # all checks passed
+}
+
+# Here is an example
+FA <- c(">head1",
+        "acdef",
+        "ghi",
+        "",
+        ">head2",
+        "kl",
+        ">head3",
+        "mn",
+        "pqrs")
+val(FA)     # ... should not create an error
+
+
+# A somewhat more elaborate validateFA() function was loaded with the
+# ./.utilities.R script. It needs a bit more bookkeeping, since NCBI multi-
+# fasta files have space-characters in their spacer lines. Try it ...
+validateFA(FA)
+
+# =    2  Parsing FASTA  =======================================================
+
+# Once we have validated our assumptions about our input, it's quite
+# painless to parse it. I have put this together as a function and the function
+# gets loaded from ./.utilities.R
+#
+
+# Let's try this:
+#   - the first 3 elements of faMBP1:
+readFASTA(faMBP1[1:3])
+
+#   - a multi FASTA file of aligned APSES domain sequences:
+
+refAPSES <- readFASTA("./data/refAPSES.mfa")
+
+# Subset the sequence with "P39678" in the header
+refAPSES[grep("P39678", refAPSES$head) ,]
+
+
+
+# =    3  Interpreting FASTA  ==================================================
+
+
+# FASTA files are straightforward to interpret - just one thing may be of note:
+# when working with strings, we can use substr(<string>, <start>, <stop>) to
+# extract substrings, but more often we expand the string into a vector of
+# single characters with strsplit(<string>, ""). strsplit() returns a list,
+# to accommodate that <string> could be a vector of many elements, therefore
+# we usually unlist() the result if we use it only on a single string.
+
+# Example: How many positively charged residues in "MBP1_SACCE"?
+
+s <- unlist(strsplit(refAPSES$seq[grep("MBP1_SACCE", refAPSES$head)], ""))
+s
+
+sum(grepl("[HKR]", s)) # 20 (+) charged residues. grepl() returns TRUE and FALSE
+                       # for the characters, sum() coerces to 1 and 0
+                       # respectively, and that gives us the result.
+
+100 * sum(grepl("[HKR]", s)) / length(s) # in percent: 20.2 %
+
+# residue distribution
+x <- factor(s, levels = names(AACOLS))
+pie(table(x)[names(AACOLS)], col = AACOLS)
+
+
+
+# =    4  Writing FASTA  =======================================================
+
+
+# Writing FASTA files is mostly just the reverse of reading, with one
+# twist: we need to break the long sequence string into chunks of the desired
+# width. The FASTA specification calls for a maximum of 120 characters per line,
+# but writing out much less than that is common, since it lets us comfortably
+# view lines on the console, or print them on a sheet of paper (do we still
+# do that actually?). How do we break a string into chunks? A combination of
+# seq(<from>, <to>, <by>) with substring(<string>, <start>, <stop>) will work
+# nicely. (Note that substring() is vectorized, whereas substr() is not!) As we
+# loop through our FASTA object in memory, we can build the output by c()'ing
+# blocks of header + sequence to each other. For VERY large objects this might
+# be slow - in that case, we might want to precalculate the size of the output
+# object. But that's more of a hypothetical consideration.
+
+( s <- refAPSES$seq[2] )
+nchar(s)
+w <- 30     # width of chunk
+(starts <- seq(1, nchar(s), by = w))      # starting index of chunk
+(ends <- c((starts - 1)[-1], nchar(s)))   # ending index of chunk
+
+# Task: Is this safe? What happens if nchar(s) is shorter than w?
+#       What happens if nchar(s) is an exact multiple of w?
+
+substring(s, starts, ends)
+# confirm that the output contains the first and last residue, and both
+# residues adjacent to the breaks
+
+# As always, the function has been defined in ".utilities.R" for you to use
+# any time...  type   writeFASTA  to examine it.
+
+# Let's try this...
+
+writeFASTA(refAPSES, width = 40)
+
+# roundtrip for validation: write refAPSES with a different format,
+# read it back in - the new dataframe must be identical
+# to the original dataframe.
+fname <- tempfile()
+writeFASTA(refAPSES, fn = fname, width = 30)
+identical(refAPSES, readFASTA(fname))
+
+# ...works for me  :-)
+
+
+# [END]
--- a/RPR-GEO2R.R
+++ b/RPR-GEO2R.R
--- a/RPR-Genetic_code_optimality.R
+++ b/RPR-Genetic_code_optimality.R
@ -1,385 +1,385 @@
-# tocID <- "RPR-Genetic_code_optimality.R"
-#
-# Purpose:  A Bioinformatics Course:
-#              R code accompanying the RPR-Genetic_code_optimality unit.
-#
-# Version:  1.3
-#
-# Date:     2017-10  -  2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           1.3    2020 Maintenance
-#           1.2    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout,
-#                      use Biocmanager:: not biocLite()
-#           1.1      Update set.seed() usage
-#           1.0.1    Fixed two bugs discovered by Suan Chin Yeo.
-#           1.0      New material.
-#
-#
-# TODO:
-#
-#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC> 
-#TOC>   Section  Title                                          Line
-#TOC> --------------------------------------------------------------
-#TOC>   1        Designing a computational experiment             58
-#TOC>   2        Setting up the tools                             74
-#TOC>   2.1        Natural and alternative genetic codes          77
-#TOC>   2.2        Effect of mutations                           135
-#TOC>   2.2.1          reverse-translate                         146
-#TOC>   2.2.2          Randomly mutate                           171
-#TOC>   2.2.3          Forward- translate                        196
-#TOC>   2.2.4          measure effect                            213
-#TOC>   3        Run the experiment                              267
-#TOC>   4        Task solutions                                  363
-#TOC> 
-#TOC> ==========================================================================
-
-
-# This unit demonstrates R code to simulate alternate genetic codes and evaluate
-# their robsustness to code changes. The approaches are quite simple and you
-# will be able to come up with obvious refinements; the point of this code is to
-# demonstrate some R programming techniques, in preparation for more
-# sophisticated questions later.
-
-
-# =    1  Designing a computational experiment  ================================
-
-# Computational experiments are conducted like wet-lab experiments. We begin
-# with a hypothesis, then define the observables that relate to the hypothesis,
-# then define the measures we apply to observations, and finally we interpret
-# our observations. If we want to learn something about the evolution of the
-# genetic code ...
-
-#  - we construct a hypothesis such as: the genetic code has evolved so as to
-#      minimize the effect of mutations;
-#  - we define the observables: the effect of mutations in
-#      sequences, given the natural and possible alternative codes;
-#  - we define the measures to quantify the effect of mutations;
-#  - then we compute alternatives and interpret the results.
-
-
-# =    2  Setting up the tools  ================================================
-
-
-# ==   2.1  Natural and alternative genetic codes  =============================
-
-# Load genetic code tables from the Biostrings package
-if (! requireNamespace("BiocManager", quietly = TRUE)) {
-  install.packages("BiocManager")
-}
-if (! requireNamespace("Biostrings", quietly = TRUE)) {
-  BiocManager::install("Biostrings")
-}
-# Package information:
-#  library(help = Biostrings)       # basic information
-#  browseVignettes("Biostrings")    # available vignettes
-#  data(package = "Biostrings")     # available datasets
-
-
-# There are many ways to generate alternative codes. The simplest way is to
-# randomly assign amino acids to codons. A more sophisticated way is to keep the
-# redundancy of codons intact, since it may reflect some form of symmetry
-# breaking that ignores the third nucleotide of a codon for the most part;
-# therefore we only replace the amino acids of the existing code with random
-# others. Here are two functions that implement these two ideas about alternate
-# codes.
-
-randomGC <- function(GC) {
-  # Return a genetic code with randomly assigned amino acids.
-  # Parameters:
-  #    GC   named chr  length-64 character vector of 20 amino acid one-letter
-  #                       codes plus "*" (stop), named with the codon triplet.
-  # Value:  named chr  same vector with random amino acid assignments in which
-  #                       every amino acid and "*" is encoded at least once.
-
-  aa <- unique(GC)                           # the amino acids in the input code
-  GC[1:64] <- sample(aa, 64, replace = TRUE) # random code
-  while(length(unique(GC)) < length(aa)) {   # We could end up with a code that
-                                             # does not contain all amino acids,
-                                             # then we sample() again.
-    GC[1:64] <- sample(aa, 64, replace = TRUE)
-  }
-  return(GC)
-}
-
-swappedGC <- function(GC) {
-  # Return a genetic code with randomly swapped amino acids.
-  # Parameters:
-  #    GC   named chr  length-64 character vector of 20 amino acid one-letter
-  #                       codes plus "*" (stop), named with the codon triplet.
-  # Value:  named chr  same vector with random amino acid assignments where the
-  #                       amino acids have been swapped.
-
-  aaOrig <- unique(GC)                       # the amino acids in the input code
-  aaSwap <- sample(aaOrig, length(aaOrig))   # shuffled
-  names(aaSwap) <- aaOrig                    # name them after the original
-  GC[1:64] <- aaSwap[GC]                     # replace original with shuffled
-
-  return(GC)
-}
-
-
-# ==   2.2  Effect of mutations  ===============================================
-
-
-# To evaluate the effects of mutations we will do the following:
-#   - we take an amino acid sequence (Mbp1 will do just nicely);
-#   - we reverse-translate it into a nucleotide sequence;
-#   - we mutate it randomly;
-#   - we translate it back to amino acids;
-#   - we count the number of mutations and evaluate their severity.
-
-
-# ===   2.2.1  reverse-translate                    
-
-# To reverse-translate an amino acid vector, we randomly pick one of its
-# codons from a genetic code, and assemble all codons to a sequence.
-
-traRev <- function(s, GC) {
-  # Parameters:
-  #      s   chr   a sequence vector
-  #      GC  chr   a genetic code
-  # Value:
-  #      A reverse-translated vector of codons
-  vC <- character(length(s))
-
-  for (i in seq_along(s)) {
-    codon <- names(GC)[GC == s[i]]   # get all codons for this AA
-    if (length(codon) > 1) {         # if there's more than one ...
-      codon <- sample(codon, 1)      # pick one at random ...
-    }
-    vC[i] <- codon                   # store it
-  }
-
-  return(vC)
-}
-
-
-# ===   2.2.2  Randomly mutate                      
-
-# To mutate, we split a codon into it's three nucleotides, then randomly replace
-# one of the three with another nucleotide.
-
-randMut <- function(vC) {
-  # Parameter:
-  #    vC   chr     a vector of codons
-  # Value:  chr     a vector of codons with a single point mutation from vC
-
-  nuc <- c("A", "C", "G", "T")
-
-  for (i in seq_along(vC)) {
-    triplet <- unlist(strsplit(vC[i], ""))         # split into three nucl.
-    iNuc <- sample(1:3, 1)                         # choose one of the three
-    mutNuc <- sample(nuc[nuc != triplet[iNuc]], 1) # chose a mutated nucleotide
-    triplet[iNuc] <- mutNuc                        # replace the original
-    vC[i] <- paste0(triplet, collapse = "")        # collapse it to a codon
-  }
-  return(vC)
-
-}
-
-
-
-# ===   2.2.3  Forward- translate                   
-
-traFor <- function(vC, GC) {
-  # Parameters:
-  #      vC   chr   a codon vector
-  #      GC   chr   a genetic code
-  # Value:
-  #      A vector of amino acids
-  vAA <- character(length(vC))
-
-  for (i in seq_along(vC)) {
-    vAA[i] <- GC[vC[i]]         # translate and store
-  }
-  return(vAA)
-}
-
-
-# ===   2.2.4  measure effect                       
-
-# How do we evaluate the effect of the mutation? We'll take a simple ad hoc
-# approach: we divide amino acids into hydrophobic, hydrophilic, and neutral
-# categories, according to their free energy of transfer from water to octanol:
-aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
-aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
-aaNeutral <- c("A", "H", "T", "S", "V", "G")
-
-# Then we will penalize as follows:
-# Changes within one category: 0.1
-# Changes from hydrophobic or hydrophilic to neutral or back: 0.3
-# Changes from hydrophobic to hydrophilic or back: 1.0
-# Changes to stop-codon: 3.0
-
-evalMut <- function(nat, mut) {
-  # Evaluate severity of mutations between amino acid sequence vectors nat and
-  # mut in an ad hoc approach based on hydrophobicity changes.
-  aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
-  aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
-  aaNeutral <- c("A", "H", "T", "S", "V", "G")
-
-  penalties <- numeric(length(nat))
-  lMut <- nat != mut    # logical TRUE for all mutated positions
-
-  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaHphobic)] <- 0.1
-  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaHphilic)] <- 1.0
-  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaNeutral)] <- 0.3
-
-  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaHphobic)] <- 1.0
-  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaHphilic)] <- 0.1
-  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaNeutral)] <- 0.3
-
-  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaHphobic)] <- 0.3
-  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaHphilic)] <- 0.3
-  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaNeutral)] <- 0.1
-
-  return(sum(penalties))
-}
-
-# A more sophisticated approach could take additional quantities into account,
-# such as charge, size, or flexibility - and it could add heuristics, such as:
-# proline is always bad in secondary structure, charged amino acids are terrible
-# in the folded core of a protein, replacing a small by a large amino acid in
-# the core is very disruptive ... etc.
-#
-# For our experiment, we should not  use a mutation data matrix however:
-# empirical mutation probabilities are superbly suited to estimate evolutionary
-# relationships. Here however, as we are trying to evaluate effects of random
-# mutations on genetic codes, our reasoning would be circular - we would
-# discover that the natural genetic code is optimal ... because it is most
-# similar to the natural genetic code. That would be Cargo Cult bioinformatics.
-
-
-# =    3  Run the experiment  ==================================================
-
-# Fetch the standard Genetic code from Biostrings::
-
-stdCode <- Biostrings::GENETIC_CODE
-
-# Fetch the nucleotide sequence for MBP1:
-
-myDNA <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")[-1]
-myDNA <- paste0(myDNA, collapse = "")
-myDNA <- as.character(Biostrings::codons(Biostrings::DNAString(myDNA)))
-myDNA <- myDNA[-length(myDNA)]  # drop the stop codon
-
-myAA <- traFor(myDNA, stdCode)
-
-# Mutate and evaluate
-set.seed(112358)
-x <- randMut(myDNA)
-set.seed(NULL)
-x <- traFor(x, stdCode)
-evalMut(myAA, x)  # 166.4
-
-# Try this 200 times, and see how the values are distributed.
-N <- 200
-valSTDC <- numeric(N)
-
-set.seed(112358)                   # set RNG seed for repeatable randomness
-for (i in 1:N) {                   # this takes a few seconds ...
-  x <- randMut(myDNA)              # mutate
-  x <- traFor(x, stdCode)     # translate
-  valSTDC[i] <- evalMut(myAA, x)    # evaluate
-}
-set.seed(NULL)                     # reset the RNG
-
-hist(valSTDC,
-     breaks = 15,
-     col = "palegoldenrod",
-     xlim = c(0, 400),
-     ylim = c(0, N/4),
-     main = "Standard vs. Synthetic Genetic Code",
-     xlab = "Mutation penalty")
-
-# This looks like a normal distribution. Let's assume the effect of mutations
-# under the standard genetic code is the mean of this distribution:
-effectSTDC <- mean(valSTDC)  # 178.1
-
-# Now we can look at the effects of alternate genetic codes:
-
-set.seed(112358)
-# choose a new code
-GC <- randomGC(stdCode)
-set.seed(NULL)
-
-# reverse translate hypothetical sequence according to the new code
-x <- traRev(myAA, GC)
-
-x <- randMut(x)        # randomly mutate hypothetical nucleotide sequence
-x <- traFor(x, GC)     # translate back, with the new code
-evalMut(myAA, x)       # evaluate mutation effects: 298.5
-
-# That seems a fair bit higher than what we saw as "effectUGC"
-# Let's try with different genetic codes. 200 trials - but this time every trial
-# is with a different, synthetic genetic code.
-
-N <- 200
-valXGC <- numeric(N)
-
-set.seed(1414214)                # set RNG seed for repeatable randomness
-for (i in 1:N) {
-  GC <- randomGC(stdCode)   # Choose code
-  x <- traRev(myAA, GC)          # reverse translate
-  x <- randMut(x)                # mutate
-  x <- traFor(x, GC)             # translate
-  valXGC[i] <- evalMut(myAA, x)  # evaluate
-}
-set.seed(NULL)                   # reset the RNG
-
-hist(valXGC,
-     col = "plum",
-     breaks = 15,
-     add = TRUE)
-
-# These two distributions are very widely separated!
-
-# Task: Perform the same experiment with the swapped genetic code.
-#       Compare the distributions. Interpret the result.
-
-
-# These are simple experiments, under assumptions that can be refined in
-# meaningful ways. Yet, even those simple computational experiments show
-# that the Universal Genetic Code has features that one would predict if
-# it has evolved under selective pressure to minimize the effects of mutations.
-# Gradual change under mutation is benificial to evolution, disruptive
-# change is not.
-
-
-# =    4  Task solutions  ======================================================
-
-N <- 200
-valSGC <- numeric(N)
-
-set.seed(2718282)                # set RNG seed for repeatable randomness
-for (i in 1:N) {
-  GC <- swappedGC(stdCode)  # Choose code
-  x <- traRev(myAA, GC)          # reverse translate
-  x <- randMut(x)                # mutate
-  x <- traFor(x, GC)             # translate
-  valSGC[i] <- evalMut(myAA, x)  # evaluate
-}
-set.seed(NULL)                   # reset the RNG
-
-hist(valSGC,
-     col = "#6688FF88",
-     breaks = 15,
-     add = TRUE)
-
-
-
-# [END]
+# tocID <- "RPR-Genetic_code_optimality.R"
+#
+# Purpose:  A Bioinformatics Course:
+#              R code accompanying the RPR-Genetic_code_optimality unit.
+#
+# Version:  1.3
+#
+# Date:     2017-10  -  2020-09
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.3    2020 Maintenance
+#           1.2    Change from require() to requireNamespace(),
+#                      use <package>::<function>() idiom throughout,
+#                      use Biocmanager:: not biocLite()
+#           1.1      Update set.seed() usage
+#           1.0.1    Fixed two bugs discovered by Suan Chin Yeo.
+#           1.0      New material.
+#
+#
+# TODO:
+#
+#
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+# going on. That's not how it works ...
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                                          Line
+#TOC> --------------------------------------------------------------
+#TOC>   1        Designing a computational experiment             58
+#TOC>   2        Setting up the tools                             74
+#TOC>   2.1        Natural and alternative genetic codes          77
+#TOC>   2.2        Effect of mutations                           135
+#TOC>   2.2.1          reverse-translate                         146
+#TOC>   2.2.2          Randomly mutate                           171
+#TOC>   2.2.3          Forward- translate                        196
+#TOC>   2.2.4          measure effect                            213
+#TOC>   3        Run the experiment                              267
+#TOC>   4        Task solutions                                  363
+#TOC> 
+#TOC> ==========================================================================
+
+
+# This unit demonstrates R code to simulate alternate genetic codes and evaluate
+# their robustness to code changes. The approaches are quite simple and you
+# will be able to come up with obvious refinements; the point of this code is to
+# demonstrate some R programming techniques, in preparation for more
+# sophisticated questions later.
+
+
+# =    1  Designing a computational experiment  ================================
+
+# Computational experiments are conducted like wet-lab experiments. We begin
+# with a hypothesis, then define the observables that relate to the hypothesis,
+# then define the measures we apply to observations, and finally we interpret
+# our observations. If we want to learn something about the evolution of the
+# genetic code ...
+
+#  - we construct a hypothesis such as: the genetic code has evolved so as to
+#      minimize the effect of mutations;
+#  - we define the observables: the effect of mutations in
+#      sequences, given the natural and possible alternative codes;
+#  - we define the measures to quantify the effect of mutations;
+#  - then we compute alternatives and interpret the results.
+
+
+# =    2  Setting up the tools  ================================================
+
+
+# ==   2.1  Natural and alternative genetic codes  =============================
+
+# Load genetic code tables from the Biostrings package
+if (! requireNamespace("BiocManager", quietly = TRUE)) {
+  install.packages("BiocManager")
+}
+if (! requireNamespace("Biostrings", quietly = TRUE)) {
+  BiocManager::install("Biostrings")
+}
+# Package information:
+#  library(help = Biostrings)       # basic information
+#  browseVignettes("Biostrings")    # available vignettes
+#  data(package = "Biostrings")     # available datasets
+
+
+# There are many ways to generate alternative codes. The simplest way is to
+# randomly assign amino acids to codons. A more sophisticated way is to keep the
+# redundancy of codons intact, since it may reflect some form of symmetry
+# breaking that ignores the third nucleotide of a codon for the most part;
+# therefore we only replace the amino acids of the existing code with random
+# others. Here are two functions that implement these two ideas about alternate
+# codes.
+
+randomGC <- function(GC) {
+  # Return a genetic code with randomly assigned amino acids.
+  # Parameters:
+  #    GC   named chr  character vector of amino acid one-letter codes plus
+  #                       "*" (stop), named with the codon triplet. Normally
+  #                       of length 64, but any length is accepted.
+  # Value:  named chr  vector with the same length and names as GC, with
+  #                       random assignments in which every symbol present in
+  #                       GC (amino acids and "*") is encoded at least once.
+
+  aa <- unique(GC)           # the amino acids (and stop) in the input code
+  nCodons <- length(GC)      # taken from the input rather than hard-coded
+
+  # A single random draw can end up with a code that does not contain all
+  # symbols; in that case we simply sample() again until it does.
+  repeat {
+    GC[] <- sample(aa, nCodons, replace = TRUE)  # GC[] keeps the codon names
+    if (length(unique(GC)) == length(aa)) {
+      break
+    }
+  }
+  return(GC)
+}
+
+swappedGC <- function(GC) {
+  # Return a genetic code with randomly swapped amino acids. The redundancy
+  # pattern of the input is kept: all codons that share one symbol in GC share
+  # one symbol in the result.
+  # Parameters:
+  #    GC   named chr  character vector of amino acid one-letter codes plus
+  #                       "*" (stop), named with the codon triplet. Normally
+  #                       of length 64, but any length is accepted.
+  # Value:  named chr  vector with the same length and names as GC, where the
+  #                       symbols have been replaced according to a random
+  #                       permutation. "*" takes part in the permutation, and
+  #                       a symbol may by chance be mapped to itself.
+
+  aaOrig <- unique(GC)                       # the symbols in the input code
+  aaSwap <- sample(aaOrig, length(aaOrig))   # shuffled
+  names(aaSwap) <- aaOrig                    # name them after the original
+  GC[] <- aaSwap[GC]                         # look up the replacement by name;
+                                             # GC[] keeps the codon names
+
+  return(GC)
+}
+
+
+# ==   2.2  Effect of mutations  ===============================================
+
+
+# To evaluate the effects of mutations we will do the following:
+#   - we take an amino acid sequence (Mbp1 will do just nicely);
+#   - we reverse-translate it into a nucleotide sequence;
+#   - we mutate it randomly;
+#   - we translate it back to amino acids;
+#   - we count the number of mutations and evaluate their severity.
+
+
+# ===   2.2.1  reverse-translate                    
+
+# To reverse-translate an amino acid vector, we randomly pick one of its
+# codons from a genetic code, and assemble all codons to a sequence.
+
+traRev <- function(s, GC) {
+  # Reverse-translate an amino acid vector: for every element, pick one of its
+  # codons in GC (at random if there is more than one).
+  # Parameters:
+  #      s   chr   a sequence vector of amino acid one-letter codes
+  #      GC  chr   a genetic code: amino acid codes, named with their codons
+  # Value:
+  #      A reverse-translated vector of codons, one per element of s.
+  #      Stops with an error if an element of s is not encoded in GC.
+  vC <- character(length(s))
+
+  for (i in seq_along(s)) {
+    codon <- names(GC)[which(GC == s[i])]   # get all codons for this AA
+    if (length(codon) == 0) {               # nothing to choose from: fail
+                                            # with a message that says why
+      stop("traRev(): no codon found for \"", s[i], "\" at position ", i,
+           " of the sequence.", call. = FALSE)
+    }
+    if (length(codon) > 1) {                # if there's more than one ...
+      codon <- sample(codon, 1)             # pick one at random ...
+    }
+    vC[i] <- codon                          # store it
+  }
+
+  return(vC)
+}
+
+
+# ===   2.2.2  Randomly mutate                      
+
+# To mutate, we split a codon into its three nucleotides, then randomly replace
+# one of the three with another nucleotide.
+
+randMut <- function(vC) {
+  # Introduce one random point mutation into every codon.
+  # Parameter:
+  #    vC   chr     a vector of codons
+  # Value:  chr     vC, in which each codon has had exactly one of its three
+  #                 positions replaced by a different nucleotide
+
+  if (length(vC) == 0) {      # nothing to mutate
+    return(vC)
+  }
+
+  bases <- c("A", "C", "G", "T")
+
+  mutateCodon <- function(codon) {
+    # Mutate one codon: pick the position first, then the replacement.
+    letters3 <- strsplit(codon, "")[[1]]               # the three nucleotides
+    pos <- sample(1:3, 1)                              # position to change
+    letters3[pos] <- sample(bases[bases != letters3[pos]], 1)  # never the same
+    paste0(letters3, collapse = "")                    # back to one string
+  }
+
+  # Codons are processed in order, one after the other.
+  vC[] <- vapply(vC, mutateCodon, character(1), USE.NAMES = FALSE)
+  return(vC)
+}
+
+
+
+# ===   2.2.3  Forward- translate                   
+
+traFor <- function(vC, GC) {
+  # Translate a vector of codons into amino acids.
+  # Parameters:
+  #      vC   chr   a codon vector
+  #      GC   chr   a genetic code: amino acid codes, named with their codons
+  # Value:
+  #      A vector of amino acids (without names), one per codon in vC.
+  #      Codons that are not among the names of GC are translated to NA.
+
+  # Indexing GC by name translates all codons in one step; unname() drops
+  # the codon names that such indexing attaches to the result.
+  vAA <- unname(GC[vC])
+
+  return(vAA)
+}
+
+
+# ===   2.2.4  measure effect                       
+
+# How do we evaluate the effect of the mutation? We'll take a simple ad hoc
+# approach: we divide amino acids into hydrophobic, hydrophilic, and neutral
+# categories, according to their free energy of transfer from water to octanol:
+aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
+aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
+aaNeutral <- c("A", "H", "T", "S", "V", "G")
+
+# Then we will penalize as follows:
+# Changes within one category: 0.1
+# Changes from hydrophobic or hydrophilic to neutral or back: 0.3
+# Changes from hydrophobic to hydrophilic or back: 1.0
+# Changes to stop-codon: 3.0
+
+evalMut <- function(nat, mut) {
+  # Evaluate severity of mutations between amino acid sequence vectors nat and
+  # mut in an ad hoc approach based on hydrophobicity changes.
+  # Parameters:
+  #    nat   chr   the original amino acid sequence vector
+  #    mut   chr   the mutated amino acid sequence vector, same length as nat
+  # Value:   num   the sum of the penalties over all positions:
+  #                   0.1   change within one category
+  #                   0.3   hydrophobic or hydrophilic <-> neutral
+  #                   1.0   hydrophobic <-> hydrophilic
+  #                   3.0   change to a stop codon ("*")
+  #                   0     unchanged positions and any other change
+  # NOTE(review): the stop codon penalty raises the totals; example values
+  # quoted in comments of this script and the plot limits may need to be
+  # re-checked against the new numbers.
+  aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
+  aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
+  aaNeutral <- c("A", "H", "T", "S", "V", "G")
+
+  penalties <- numeric(length(nat))
+  lMut <- nat != mut    # logical TRUE for all mutated positions
+
+  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaHphobic)] <- 0.1
+  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaHphilic)] <- 1.0
+  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaNeutral)] <- 0.3
+
+  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaHphobic)] <- 1.0
+  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaHphilic)] <- 0.1
+  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaNeutral)] <- 0.3
+
+  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaHphobic)] <- 0.3
+  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaHphilic)] <- 0.3
+  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaNeutral)] <- 0.1
+
+  # "*" is in none of the three categories, so none of the assignments above
+  # applies to it: a change to a stop codon gets its own penalty.
+  penalties[lMut & (mut %in% "*")] <- 3.0
+
+  return(sum(penalties))
+}
+
+# A more sophisticated approach could take additional quantities into account,
+# such as charge, size, or flexibility - and it could add heuristics, such as:
+# proline is always bad in secondary structure, charged amino acids are terrible
+# in the folded core of a protein, replacing a small by a large amino acid in
+# the core is very disruptive ... etc.
+#
+# For our experiment, we should not  use a mutation data matrix however:
+# empirical mutation probabilities are superbly suited to estimate evolutionary
+# relationships. Here however, as we are trying to evaluate effects of random
+# mutations on genetic codes, our reasoning would be circular - we would
+# discover that the natural genetic code is optimal ... because it is most
+# similar to the natural genetic code. That would be Cargo Cult bioinformatics.
+
+
+# =    3  Run the experiment  ==================================================
+
+# Fetch the standard Genetic code from Biostrings::
+
+stdCode <- Biostrings::GENETIC_CODE
+
+# Fetch the nucleotide sequence for MBP1:
+
+myDNA <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")[-1]
+myDNA <- paste0(myDNA, collapse = "")
+myDNA <- as.character(Biostrings::codons(Biostrings::DNAString(myDNA)))
+myDNA <- myDNA[-length(myDNA)]  # drop the stop codon
+
+myAA <- traFor(myDNA, stdCode)
+
+# Mutate and evaluate
+set.seed(112358)
+x <- randMut(myDNA)
+set.seed(NULL)
+x <- traFor(x, stdCode)
+evalMut(myAA, x)  # 166.4
+
+# Try this 200 times, and see how the values are distributed.
+N <- 200
+valSTDC <- numeric(N)
+
+set.seed(112358)                   # set RNG seed for repeatable randomness
+for (i in 1:N) {                   # this takes a few seconds ...
+  x <- randMut(myDNA)              # mutate
+  x <- traFor(x, stdCode)     # translate
+  valSTDC[i] <- evalMut(myAA, x)    # evaluate
+}
+set.seed(NULL)                     # reset the RNG
+
+hist(valSTDC,
+     breaks = 15,
+     col = "palegoldenrod",
+     xlim = c(0, 400),
+     ylim = c(0, N/4),
+     main = "Standard vs. Synthetic Genetic Code",
+     xlab = "Mutation penalty")
+
+# This looks like a normal distribution. Let's assume the effect of mutations
+# under the standard genetic code is the mean of this distribution:
+effectSTDC <- mean(valSTDC)  # 178.1
+
+# Now we can look at the effects of alternate genetic codes:
+
+set.seed(112358)
+# choose a new code
+GC <- randomGC(stdCode)
+set.seed(NULL)
+
+# reverse translate hypothetical sequence according to the new code
+x <- traRev(myAA, GC)
+
+x <- randMut(x)        # randomly mutate hypothetical nucleotide sequence
+x <- traFor(x, GC)     # translate back, with the new code
+evalMut(myAA, x)       # evaluate mutation effects: 298.5
+
+# That seems a fair bit higher than what we saw as "effectSTDC".
+# Let's try with different genetic codes. 200 trials - but this time every trial
+# is with a different, synthetic genetic code.
+
+N <- 200
+valXGC <- numeric(N)
+
+set.seed(1414214)                # set RNG seed for repeatable randomness
+for (i in 1:N) {
+  GC <- randomGC(stdCode)   # Choose code
+  x <- traRev(myAA, GC)          # reverse translate
+  x <- randMut(x)                # mutate
+  x <- traFor(x, GC)             # translate
+  valXGC[i] <- evalMut(myAA, x)  # evaluate
+}
+set.seed(NULL)                   # reset the RNG
+
+hist(valXGC,
+     col = "plum",
+     breaks = 15,
+     add = TRUE)
+
+# These two distributions are very widely separated!
+
+# Task: Perform the same experiment with the swapped genetic code.
+#       Compare the distributions. Interpret the result.
+
+
+# These are simple experiments, under assumptions that can be refined in
+# meaningful ways. Yet, even those simple computational experiments show
+# that the Universal Genetic Code has features that one would predict if
+# it has evolved under selective pressure to minimize the effects of mutations.
+# Gradual change under mutation is beneficial to evolution, disruptive
+# change is not.
+
+
+# =    4  Task solutions  ======================================================
+
+N <- 200
+valSGC <- numeric(N)
+
+set.seed(2718282)                # set RNG seed for repeatable randomness
+for (i in 1:N) {
+  GC <- swappedGC(stdCode)  # Choose code
+  x <- traRev(myAA, GC)          # reverse translate
+  x <- randMut(x)                # mutate
+  x <- traFor(x, GC)             # translate
+  valSGC[i] <- evalMut(myAA, x)  # evaluate
+}
+set.seed(NULL)                   # reset the RNG
+
+hist(valSGC,
+     col = "#6688FF88",
+     breaks = 15,
+     add = TRUE)
+
+
+
+# [END]
--- a/RPR-Introduction.R
+++ b/RPR-Introduction.R
@ -1,50 +1,50 @@
-# tocID <- "RPR-Introduction.R"
-#
-#
-# Purpose: A Bioinformatics Course:
-#              R code accompanying the RPR-Introduction unit
-#
-# Version: 1.0
-#
-# Date:    2020-09-18
-# Author:  Boris Steipe (boris.steipe@utoronto.ca)
-#
-# V 1.0    Updtaed workflow; live
-# V 0.1    First code
-#
-# TODO:
-#
-#
-# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
-#
-# DO NOT SIMPLY  source()  THESE FILES!
-
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-#  going on. That's not how it works ...
-#
-# ==============================================================================
-
-# === TASK: Local script
-#
-# - Open the file myScript.R
-#
-# - Create a section header with a date.
-# - Enter an R-expression that will produce the first 11 powers of 2 (starting
-#     from 0). Not a loop - a single expression. The first number you get must
-#     be 1. The last number you get must be 1024.
-#
-# - Save the file in the myScripts folder, and close it.
-#
-# - Open the file again, select the expression and type Cmd+Enter (or Cmd+R)
-#   to execute it.
-#
-# - Done
-
-# (This task is meant  to make sure that writing R expressions, saving
-#  them in scripts, opening script files and executing code in the file works
-#  for you. If there is an issue, get in touch.)
-
-
-
-# [END]
+# tocID <- "RPR-Introduction.R"
+#
+#
+# Purpose: A Bioinformatics Course:
+#              R code accompanying the RPR-Introduction unit
+#
+# Version: 1.0
+#
+# Date:    2020-09-18
+# Author:  Boris Steipe (boris.steipe@utoronto.ca)
+#
+# V 1.0    Updated workflow; live
+# V 0.1    First code
+#
+# TODO:
+#
+#
+# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
+#
+# DO NOT SIMPLY  source()  THESE FILES!
+
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+#  going on. That's not how it works ...
+#
+# ==============================================================================
+
+# === TASK: Local script
+#
+# - Open the file myScript.R
+#
+# - Create a section header with a date.
+# - Enter an R-expression that will produce the first 11 powers of 2 (starting
+#     from 0). Not a loop - a single expression. The first number you get must
+#     be 1. The last number you get must be 1024.
+#
+# - Save the file in the myScripts folder, and close it.
+#
+# - Open the file again, select the expression and type Cmd+Enter (or Cmd+R)
+#   to execute it.
+#
+# - Done
+
+# (This task is meant  to make sure that writing R expressions, saving
+#  them in scripts, opening script files and executing code in the file works
+#  for you. If there is an issue, get in touch.)
+
+
+
+# [END]
--- a/RPR-PROSITE_POST.R
+++ b/RPR-PROSITE_POST.R
@ -1,168 +1,168 @@
-# tocID <- "RPR-PROSITE_POST.R"
-#
-# Purpose:  A Bioinformatics Course:
-#              R code accompanying the RPR-Scripting_data_downloads unit.
-#
-# Version:  1.2
-#
-# Date:     2017-10  -  2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           1.2    2020 Maintenance
-#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout,
-#           1.0.1  Updates for slightly changed interfaces
-#           1.0    First ABC units version
-#           0.1    First code copied from 2016 material.
-#
-#
-# TODO:
-#
-#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC> 
-#TOC>   Section  Title                                                 Line
-#TOC> ---------------------------------------------------------------------
-#TOC>   1        Constructing a POST command from a Web query            43
-#TOC>   1.1        Task - fetchPrositeFeatures() function               148
-#TOC>   2        Task solutions                                         156
-#TOC> 
-#TOC> ==========================================================================
-
-
-# =    1  Constructing a POST command from a Web query  ========================
-
-
-if (! requireNamespace("httr", quietly = TRUE)) {
-  install.packages("httr")
-}
-# Package information:
-#  library(help = httr)       # basic information
-#  browseVignettes("httr")    # available vignettes
-#  data(package = "httr")     # available datasets
-
-
-
-
-# We have reverse engineered the Web form for a ScanProsite request, and can
-# construct a valid POST request from knowing the required field names. The POST
-# command is similar to GET(), but we need an explicit request body that
-# contains a list of key/value pairs
-
-UniProtID <- "P39678"
-
-URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
-
-response <- httr::POST(URL,
-                       body = list(meta = "opt1",
-                                   meta1_protein = "opt1",
-                                   seq = UniProtID,
-                                   skip = "on",
-                                   output = "tabular"))
-
-# Send off this request, and you should have a response in a few
-# seconds. Let's check the status first:
-
-httr::status_code(response)  # If this is not 200, something went wrong and it
-                             # makes no sense to continue. If this persists, ask
-                             # on the Discussion Board what to do.
-
-
-# The text contents of the response is available with the
-# content() function:
-httr::content(response, "text")
-
-# ... should show you the same as the page contents that you have seen in the
-# browser. Now we need to extract the data from the page. For this simple
-# example we can get away with using regular expressions, but in general we need
-# a real XML parser to parse HTML. We'll cover that in a later unit. Here, we
-# strsplit() the response into individual lines, since each of our data elements
-# is on its own line, and then capture the contents. The way Prosite has
-# formatted their HTML we can simply split on the "\\n" newline character - but
-# they could write the same valid HTML without any newline-characters at all.
-# Understand that we are working with a bit of a "hack" here: exploting
-# empirical assumptions rather than a formal specification. But sometimes quick
-# and dirty is fine, because quick.
-
-lines <- unlist(strsplit(httr::content(response, "text"), "\\n"))
-head(lines)
-
-# Now we define a query pattern for the lines we want:
-# we can use the uID, bracketed by two "|" pipe
-# characters:
-
-patt <- sprintf("\\|%s\\|", UniProtID)
-
-# ... and select only the lines that match this
-# pattern:
-
-( lines <- lines[grep(patt, lines)] )
-
-# ... captures the three lines of output.
-
-# Now we break the lines apart into tokens: this is another application of
-# strsplit(), but this time we split either on "pipe" characters, "|" OR on tabs
-# "\t". Look at the regex "\\t|\\|" in the strsplit() call:
-
-unlist(strsplit(lines[1], "\\t|\\|"))
-
-# Its parts are (\\t)=tab (|)=or (\\|)=pipe. Both "t" and "|" need to be escaped
-# with a backslash. "t" has to be escaped because we want to match a tab (\t),
-# not the literal character "t". And "|" has to be escaped because we mean the
-# literal pipe character, not its metacharacter meaning OR. Thus sometimes the
-# backslash turns a special meaning off, and sometimes it turns a special
-# meaning on. Unfortunately there's no easy way to tell - you just need to
-# remember the characters - or have a reference handy. The metacharacters are
-# (){}[]^$?*+.|&-   ... and some of them have different meanings depending on
-# where in the regex they are.
-
-# Let's put the tokens into named slots of a data frame
-
-features <- data.frame()
-for (line in lines) {
-  tokens <- unlist(strsplit(line, "\\t|\\|"))
-  features <- rbind(features,
-                    data.frame(uID   =  tokens[2],
-                               start =  as.numeric(tokens[4]),
-                               end   =  as.numeric(tokens[5]),
-                               psID  =  tokens[6],
-                               psName = tokens[7],
-                               psSeq  = tokens[11]))
-}
-features
-
-#  This forms the base of a function that collects the features automatically
-#  from a PrositeScan result. You can write this!
-
-
-# ==   1.1  Task - fetchPrositeFeatures() function  ============================
-
-
-# Task: write a function that takes as input a UniProt ID, fetches the
-# features it contains from ScanProsite and returns a data frame as given above, or
-# an empty data frame if there is an error.
-
-
-# =    2  Task solutions  ======================================================
-
-
-# I have placed such a function into the ABC-dbUtilities.R script: look it up by
-# clicking on  dbFetchPrositeFeatures() in the Environment pane.
-
-# Test:
-dbFetchPrositeFeatures("Q5KMQ9")
-
-
-
-
-# [END]
+# tocID <- "RPR-PROSITE_POST.R"
+#
+# Purpose:  A Bioinformatics Course:
+#              R code accompanying the RPR-Scripting_data_downloads unit.
+#
+# Version:  1.2
+#
+# Date:     2017-10  -  2020-09
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.2    2020 Maintenance
+#           1.1    Change from require() to requireNamespace(),
+#                      use <package>::<function>() idiom throughout,
+#           1.0.1  Updates for slightly changed interfaces
+#           1.0    First ABC units version
+#           0.1    First code copied from 2016 material.
+#
+#
+# TODO:
+#
+#
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+# going on. That's not how it works ...
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                                                 Line
+#TOC> ---------------------------------------------------------------------
+#TOC>   1        Constructing a POST command from a Web query            43
+#TOC>   1.1        Task - fetchPrositeFeatures() function               148
+#TOC>   2        Task solutions                                         156
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  Constructing a POST command from a Web query  ========================
+
+
+if (! requireNamespace("httr", quietly = TRUE)) {
+  install.packages("httr")
+}
+# Package information:
+#  library(help = httr)       # basic information
+#  browseVignettes("httr")    # available vignettes
+#  data(package = "httr")     # available datasets
+
+
+
+
+# We have reverse engineered the Web form for a ScanProsite request, and can
+# construct a valid POST request from knowing the required field names. The POST
+# command is similar to GET(), but we need an explicit request body that
+# contains a list of key/value pairs
+
+UniProtID <- "P39678"
+
+URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
+
+response <- httr::POST(URL,
+                       body = list(meta = "opt1",
+                                   meta1_protein = "opt1",
+                                   seq = UniProtID,
+                                   skip = "on",
+                                   output = "tabular"))
+
+# Send off this request, and you should have a response in a few
+# seconds. Let's check the status first:
+
+httr::status_code(response)  # If this is not 200, something went wrong and it
+                             # makes no sense to continue. If this persists, ask
+                             # on the Discussion Board what to do.
+
+
+# The text contents of the response is available with the
+# content() function:
+httr::content(response, "text")
+
+# ... should show you the same as the page contents that you have seen in the
+# browser. Now we need to extract the data from the page. For this simple
+# example we can get away with using regular expressions, but in general we need
+# a real XML parser to parse HTML. We'll cover that in a later unit. Here, we
+# strsplit() the response into individual lines, since each of our data elements
+# is on its own line, and then capture the contents. The way Prosite has
+# formatted their HTML we can simply split on the "\\n" newline character - but
+# they could write the same valid HTML without any newline-characters at all.
+# Understand that we are working with a bit of a "hack" here: exploiting
+# empirical assumptions rather than a formal specification. But sometimes quick
+# and dirty is fine, because quick.
+
+lines <- unlist(strsplit(httr::content(response, "text"), "\\n"))
+head(lines)
+
+# Now we define a query pattern for the lines we want:
+# we can use the uID, bracketed by two "|" pipe
+# characters:
+
+patt <- sprintf("\\|%s\\|", UniProtID)
+
+# ... and select only the lines that match this
+# pattern:
+
+( lines <- lines[grep(patt, lines)] )
+
+# ... captures the three lines of output.
+
+# Now we break the lines apart into tokens: this is another application of
+# strsplit(), but this time we split either on "pipe" characters, "|" OR on tabs
+# "\t". Look at the regex "\\t|\\|" in the strsplit() call:
+
+unlist(strsplit(lines[1], "\\t|\\|"))
+
+# Its parts are (\\t)=tab (|)=or (\\|)=pipe. Both "t" and "|" need to be escaped
+# with a backslash. "t" has to be escaped because we want to match a tab (\t),
+# not the literal character "t". And "|" has to be escaped because we mean the
+# literal pipe character, not its metacharacter meaning OR. Thus sometimes the
+# backslash turns a special meaning off, and sometimes it turns a special
+# meaning on. Unfortunately there's no easy way to tell - you just need to
+# remember the characters - or have a reference handy. The metacharacters are
+# (){}[]^$?*+.|&-   ... and some of them have different meanings depending on
+# where in the regex they are.
+
+# Let's put the tokens into named slots of a data frame
+
+# Parse every line into a one-row data frame first, then bind all rows in a
+# single step. (Growing a data frame with rbind() inside a loop makes a new
+# copy of everything collected so far in each iteration.)
+featureRows <- lapply(lines, function(line) {
+  tokens <- unlist(strsplit(line, "\\t|\\|"))   # split on tab or pipe
+  data.frame(uID   =  tokens[2],
+             start =  as.numeric(tokens[4]),
+             end   =  as.numeric(tokens[5]),
+             psID  =  tokens[6],
+             psName = tokens[7],
+             psSeq  = tokens[11])
+})
+
+if (length(featureRows) > 0) {
+  features <- do.call(rbind, unname(featureRows))
+} else {
+  features <- data.frame()    # no matching lines: empty data frame
+}
+features
+
+#  This forms the base of a function that collects the features automatically
+#  from a PrositeScan result. You can write this!
+
+
+# ==   1.1  Task - fetchPrositeFeatures() function  ============================
+
+
+# Task: write a function that takes as input a UniProt ID, fetches the
+# features it contains from ScanProsite and returns a data frame as given above, or
+# an empty data frame if there is an error.
+
+
+# =    2  Task solutions  ======================================================
+
+
+# I have placed such a function into the ABC-dbUtilities.R script: look it up by
+# clicking on  dbFetchPrositeFeatures() in the Environment pane.
+
+# Test:
+dbFetchPrositeFeatures("Q5KMQ9")
+
+
+
+
+# [END]
--- a/RPR-Pipe.R
+++ b/RPR-Pipe.R
@ -1,135 +1,135 @@
-# tocID <- "RPR-Pipe.R"
-#
-# Purpose:  A Bioinformatics Course:
-#              Discussing pipe operators.
-#
-# Version:  1.0
-#
-# Date:     2021  10
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           1.0    New code
-#
-#
-# TODO:
-#   - find more interesting examples
-#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC>
-#TOC>   Section  Title                            Line
-#TOC> ------------------------------------------------
-#TOC>   1        Pipe  Concept                      41
-#TOC>   2        Nested Expression                  73
-#TOC>   3        magrittr:: Pipe                    78
-#TOC>   4        Base R Pipe                        93
-#TOC>   5        Intermediate Assignment           108
-#TOC>   6        Postscript                        127
-#TOC>
-#TOC> ==========================================================================
-
-
-# =    1  Pipe  Concept  =======================================================
-
-# Pipes are actually an awesome idea for any code that implements a workflow -
-# a sequence of operations, each of which transforms data in a specialized way.
-#
-# This principle is familiar from maths: chained functions. If have a function
-# y = f(x) and want to use those results as in z = g(y), I can just write
-# z = g(f(x))
-#
-# On the unix command line, pipes were used from the very beginning, implemented
-# with the "|" pipe character.
-#
-# In R, the magrittr package provided the %>% operator, and recently the |>
-# operator has been introduced into base R.
-#
-# However there are alternatives: intermediate assignment, and nested functions
-# that have always existed in base R anyway.
-#
-# Let us look at an example. In writing this, I found out that virtually
-# ALL non-trivial examples I came up with don't translate well into this idiom
-# at all. It is actually quite limited to simple filtering operations on
-# data. A more interesting example might be added in the future, let me know if
-# you have a good idea.
-#
-# A somewhat contrived example is to sort a list of files by the
-# length of the file names:
-
-myFiles <- list.files(pattern = "\\.R$")
-
-# nchar() gives the number of characters in a string, order() produces indices
-# that map an array to its sorted form.
-#
-# =    2  Nested Expression  ===================================================
-
-myFiles[order(nchar(myFiles))]
-
-
-# =    3  magrittr:: Pipe  =====================================================
-
-if (! requireNamespace("magrittr", quietly = TRUE)) {
-  install.packages("magrittr")
-}
-# Package information:
-#  library(help = magrittr)       # basic information
-#  browseVignettes("magrittr")    # available vignettes
-#  data(package = "magrittr")     # available datasets
-
-
-library(magrittr)
-
-myFiles  %>% nchar %>% order %>% myFiles[.]
-
-# =    4  Base R Pipe  =========================================================
-
-# Since version 4.1, base R now supports a pipe operator without the need
-# to load a special package. Such an introductions of external functionality
-# into the language is very rare.
-#
-# Unfortunately it won't (yet) work with the '[' function, so we need to write
-# an intermediate function for this example
-extract <- function(x, v) {
-  return(v[x])
-}
-
-myFiles |> nchar() |> order() |> extract(myFiles)
-
-
-# =    5  Intermediate Assignment  =============================================
-
-# So what's the problem? As you can see, the piped code may be concise and
-# expressive. But there is also a large amount of implicit assignment and
-# processing going on and that is usually a bad idea because it makes code hard
-# to maintain. I am NOT a big fan of the nested syntax, but I don't think that
-# replacing it with the pipe makes things much better. My preferred idiom is
-# to use intermediate assignments. Only then is it convenient to examine
-# the code step by step and validate every single step. And that is the most
-# important objective at all: no code is good if it does not compute
-# correctly.
-
-
-x <- nchar(myFiles)
-x <- order(x)
-myFiles[x]
-
-
-
-# =    6  Postscript  ==========================================================
-
-# I tried to write an example that strips all comments from a list of files, and
-# another example that finds all files that were not yet updated this year
-# (according to the "# Date: in the header). Neither examples can be well
-# written without intermediate assignments, or at least sapply() functions
-# that are not simpler at all than the intermediate assignment.
-
-# [END]
+# tocID <- "RPR-Pipe.R"
+#
+# Purpose:  A Bioinformatics Course:
+#              Discussing pipe operators.
+#
+# Version:  1.0
+#
+# Date:     2021  10
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.0    New code
+#
+#
+# TODO:
+#   - find more interesting examples
+#
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+# going on. That's not how it works ...
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC>
+#TOC>   Section  Title                            Line
+#TOC> ------------------------------------------------
+#TOC>   1        Pipe  Concept                      41
+#TOC>   2        Nested Expression                  73
+#TOC>   3        magrittr:: Pipe                    78
+#TOC>   4        Base R Pipe                        93
+#TOC>   5        Intermediate Assignment           108
+#TOC>   6        Postscript                        127
+#TOC>
+#TOC> ==========================================================================
+
+
+# =    1  Pipe  Concept  =======================================================
+
+# Pipes are actually an awesome idea for any code that implements a workflow -
+# a sequence of operations, each of which transforms data in a specialized way.
+#
+# This principle is familiar from maths: chained functions. If I have a function
+# y = f(x) and want to use those results as in z = g(y), I can just write
+# z = g(f(x))
+#
+# On the unix command line, pipes were used from the very beginning, implemented
+# with the "|" pipe character.
+#
+# In R, the magrittr package provided the %>% operator, and recently the |>
+# operator has been introduced into base R.
+#
+# However there are alternatives: intermediate assignment, and nested functions
+# that have always existed in base R anyway.
+#
+# Let us look at an example. In writing this, I found out that virtually
+# ALL non-trivial examples I came up with don't translate well into this idiom
+# at all. It is actually quite limited to simple filtering operations on
+# data. A more interesting example might be added in the future, let me know if
+# you have a good idea.
+#
+# A somewhat contrived example is to sort a list of files by the
+# length of the file names:
+
+myFiles <- list.files(pattern = "\\.R$")
+
+# nchar() gives the number of characters in a string, order() produces indices
+# that map an array to its sorted form.
+#
+# =    2  Nested Expression  ===================================================
+
+myFiles[order(nchar(myFiles))]
+
+
+# =    3  magrittr:: Pipe  =====================================================
+
+if (! requireNamespace("magrittr", quietly = TRUE)) {
+  install.packages("magrittr")
+}
+# Package information:
+#  library(help = magrittr)       # basic information
+#  browseVignettes("magrittr")    # available vignettes
+#  data(package = "magrittr")     # available datasets
+
+
+library(magrittr)
+
+myFiles  %>% nchar %>% order %>% myFiles[.]
+
+# =    4  Base R Pipe  =========================================================
+
+# Since version 4.1, base R now supports a pipe operator without the need
+# to load a special package. Such an introduction of external functionality
+# into the language is very rare.
+#
+# Unfortunately it won't (yet) work with the '[' function, so we need to write
+# an intermediate function for this example
+extract <- function(x, v) {
+  return(v[x])
+}
+
+myFiles |> nchar() |> order() |> extract(myFiles)
+
+
+# =    5  Intermediate Assignment  =============================================
+
+# So what's the problem? As you can see, the piped code may be concise and
+# expressive. But there is also a large amount of implicit assignment and
+# processing going on and that is usually a bad idea because it makes code hard
+# to maintain. I am NOT a big fan of the nested syntax, but I don't think that
+# replacing it with the pipe makes things much better. My preferred idiom is
+# to use intermediate assignments. Only then is it convenient to examine
+# the code step by step and validate every single step. And that is the most
+# important objective of all: no code is good if it does not compute
+# correctly.
+
+
+x <- nchar(myFiles)
+x <- order(x)
+myFiles[x]
+
+
+
+# =    6  Postscript  ==========================================================
+
+# I tried to write an example that strips all comments from a list of files, and
+# another example that finds all files that were not yet updated this year
+# (according to the "# Date:" in the header). Neither example can be well
+# written without intermediate assignments, or at least sapply() functions
+# that are not simpler at all than the intermediate assignment.
+
+# [END]
--- a/RPR-RegEx.R
+++ b/RPR-RegEx.R
@ -1,180 +1,180 @@
-# tocID <- "RPR-RegEx.R"
-#
-# Purpose: A Bioinformatics Course:
-#              R code accompanying the RPR-RegEx unit
-#
-# Version: 1.0
-#
-# Date:    2017-08  -  2020-09
-# Author:  Boris Steipe (boris.steipe@utoronto.ca)
-#
-# V 0.1    Maintenance 2020
-# V 0.1    First code
-#
-# TODO:
-#
-#
-# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
-#
-# DO NOT SIMPLY  source()  THESE FILES!
-#
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-#  going on. That's not how it works ...
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC>
-#TOC>   Section  Title                                Line
-#TOC> ----------------------------------------------------
-#TOC>   1        A regex example                        41
-#TOC>   2        Counting lines                        108
-#TOC>   2.1        Counting C-alpha atoms only         126
-#TOC>   3        Code Solutions                        142
-#TOC>   3.1        Counting atoms                      144
-#TOC>   3.2        Counting C-alpha records            160
-#TOC>
-#TOC> ==========================================================================
-
-
-# =    1  A regex example  =====================================================
-
-# The canonical FASTA version of yeast Mbp1 at Uniprot
-s <- ">sp|P39678|MBP1_YEAST Transcription factor MBP1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) GN=MBP1 PE=1 SV=1
-MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK
-ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA
-SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR
-KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ
-QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS
-PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY
-FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS
-IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP
-SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT
-ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP
-VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK
-IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR
-QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK
-IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"
-
-nchar(s)
-# Must be 969
-
-# Task: Fetch the Uniprot ID by retrieving the first string that appears between
-# two vertical bars ("pipes") in the header record.
-#
-
-# Develop the regular expression:
-                      # Just five characters returned, so we know we are using
-patt <- "^>(.{5})"    # the right functions
-regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
-
-patt <- "^>(.*)|"    # everything to the pipe character
-regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
-
-# Ooops - "|" is a metacharacter - we must escape it
-
-patt <- "^>(.*)\|"    # using "\|"
-# Ooops - that's not how we escape: must double the \ to send a literal
-# "\" plus the character "|" to the regex engine.
-
-patt <- "^>(.*)\\|"    # using "\\|"
-regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
-
-# Good. Now let's first match everything that is not a "|", then match a "|"
-patt <- "^>([^|]*)\\|"
-regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
-
-# the same thing again, but capture the second match. And insist that there
-# must be at least one character captured
-
-patt <- "^>[^|]*\\|([^|]+)\\|"
-# Analyze this pattern:
-#    ^           anchor the match at the beginning of the line
-#    >           ">" must be the first character
-#    [^|]*       all-characters-except-a-vertical-bar, 0 or more times because
-#                  we don't know what other versions of the string "sp"
-#                  might appear. Note that within the brackets "|" is NOT a
-#                  metacharacter.
-#    \\|         "|" character: ouside of square brackets "|" is a metacharacter
-#                  and means "OR"; we need to escape it to match a literal "|".
-#    (           open parenthesis: capture what comes next ...
-#       [^|]+    all-characters-except-a-vertical-bar, 1 or more times
-#    )           close parenthesis: stop capturing here
-#    \\|           second "|" character, escaped
-regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
-
-
-# =    2  Counting lines  ======================================================
-
-# Task: Write a function that returns the number of atoms in a PDB file. Call it
-#       atomCount(). Sample data is here:
-myPDB <- readLines("./data/0TST.pdb")
-
-#       Specification:
-#       Read a file from its path given as the only argument.
-#       Return the number of lines in that file that begin with "ATOM  "
-#       or with "HETATM".
-
-#       Try this. Write a function. Solution code is at the end of this file.
-#       Don't peek.
-
-atomCount("./data/0TST.pdb")  # must return 6
-
-
-
-# ==   2.1  Counting C-alpha atoms only  =======================================
-
-# Task: write a function based on the previous one that matches only CA records,
-#       i.e. it can be used to count the number of amino acids. Don't get
-#       fooled by calcium atoms, or the string CA appearing elsewhere.
-#       cf. https://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM
-
-#       Specification:
-#       Read a file from its path given as the only argument.
-#       Return the number of lines in that file that have a C-alpha atom.
-
-#       Try this. Solution code is at the end of this file. Don't peek.
-
-CAcount("./data/0TST.pdb")  # must return 1
-
-
-# =    3  Code Solutions  ======================================================
-
-# ==   3.1  Counting atoms  ====================================================
-
-atomCount <- function(IN) {
-  # count the number of atoms in a PDB formatted file
-  # Parameters:
-  #     IN  chr  path of the file to read
-  # Value:
-  #         numeric  number of lines that match "^ATOM  " or "^HETATM"
-  # Note: the regex MUST be anchored to the beginning of the line, otherwise
-  # it might match somewhere in a comment!
-  x <- readLines(IN)
-  patt <- "(^ATOM  )|(^HETATM)"
-  return(length(grep(patt, x)))
-}
-
-
-# ==   3.2  Counting C-alpha records  ==========================================
-
-
-CAcount <- function(IN) {
-  # count the number of C-alpha atoms in a PDB formatted file
-  # Parameters:
-  #     IN  chr  path of the file to read
-  # Value:
-  #         numeric  number of lines that match " CA " in position 13 - 16 of
-  #                  an ATOM record.
-  # Note: the regex MUST be aligned into the right position, otherwise it
-  #       might match Calcium records!
-  x <- readLines(IN)
-  patt <- "^ATOM  ...... CA "
-  return(length(grep(patt, x)))
-}
-
-
-
-# [END]
+# tocID <- "RPR-RegEx.R"
+#
+# Purpose: A Bioinformatics Course:
+#              R code accompanying the RPR-RegEx unit
+#
+# Version: 1.0
+#
+# Date:    2017-08  -  2020-09
+# Author:  Boris Steipe (boris.steipe@utoronto.ca)
+#
+# V 1.0    Maintenance 2020
+# V 0.1    First code
+#
+# TODO:
+#
+#
+# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
+#
+# DO NOT SIMPLY  source()  THESE FILES!
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+#  going on. That's not how it works ...
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC>
+#TOC>   Section  Title                                Line
+#TOC> ----------------------------------------------------
+#TOC>   1        A regex example                        41
+#TOC>   2        Counting lines                        108
+#TOC>   2.1        Counting C-alpha atoms only         126
+#TOC>   3        Code Solutions                        142
+#TOC>   3.1        Counting atoms                      144
+#TOC>   3.2        Counting C-alpha records            160
+#TOC>
+#TOC> ==========================================================================
+
+
+# =    1  A regex example  =====================================================
+
+# The canonical FASTA version of yeast Mbp1 at Uniprot
+s <- ">sp|P39678|MBP1_YEAST Transcription factor MBP1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) GN=MBP1 PE=1 SV=1
+MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK
+ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA
+SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR
+KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ
+QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS
+PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY
+FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS
+IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP
+SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT
+ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP
+VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK
+IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR
+QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK
+IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"
+
+nchar(s)
+# Must be 969
+
+# Task: Fetch the Uniprot ID by retrieving the first string that appears between
+# two vertical bars ("pipes") in the header record.
+#
+
+# Develop the regular expression:
+                      # Just five characters returned, so we know we are using
+patt <- "^>(.{5})"    # the right functions
+regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
+
+patt <- "^>(.*)|"    # everything to the pipe character
+regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
+
+# Ooops - "|" is a metacharacter - we must escape it
+
+patt <- "^>(.*)\|"    # using "\|"
+# Ooops - that's not how we escape: must double the \ to send a literal
+# "\" plus the character "|" to the regex engine.
+
+patt <- "^>(.*)\\|"    # using "\\|"
+regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
+
+# Good. Now let's first match everything that is not a "|", then match a "|"
+patt <- "^>([^|]*)\\|"
+regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
+
+# the same thing again, but capture the second match. And insist that there
+# must be at least one character captured
+
+patt <- "^>[^|]*\\|([^|]+)\\|"
+# Analyze this pattern:
+#    ^           anchor the match at the beginning of the line
+#    >           ">" must be the first character
+#    [^|]*       all-characters-except-a-vertical-bar, 0 or more times because
+#                  we don't know what other versions of the string "sp"
+#                  might appear. Note that within the brackets "|" is NOT a
+#                  metacharacter.
+#    \\|         "|" character: outside of square brackets "|" is a metacharacter
+#                  and means "OR"; we need to escape it to match a literal "|".
+#    (           open parenthesis: capture what comes next ...
+#       [^|]+    all-characters-except-a-vertical-bar, 1 or more times
+#    )           close parenthesis: stop capturing here
+#    \\|           second "|" character, escaped
+regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
+
+
+# =    2  Counting lines  ======================================================
+
+# Task: Write a function that returns the number of atoms in a PDB file. Call it
+#       atomCount(). Sample data is here:
+myPDB <- readLines("./data/0TST.pdb")
+
+#       Specification:
+#       Read a file from its path given as the only argument.
+#       Return the number of lines in that file that begin with "ATOM  "
+#       or with "HETATM".
+
+#       Try this. Write a function. Solution code is at the end of this file.
+#       Don't peek.
+
+atomCount("./data/0TST.pdb")  # must return 6
+
+
+
+# ==   2.1  Counting C-alpha atoms only  =======================================
+
+# Task: write a function based on the previous one that matches only CA records,
+#       i.e. it can be used to count the number of amino acids. Don't get
+#       fooled by calcium atoms, or the string CA appearing elsewhere.
+#       cf. https://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM
+
+#       Specification:
+#       Read a file from its path given as the only argument.
+#       Return the number of lines in that file that have a C-alpha atom.
+
+#       Try this. Solution code is at the end of this file. Don't peek.
+
+CAcount("./data/0TST.pdb")  # must return 1
+
+
+# =    3  Code Solutions  ======================================================
+
+# ==   3.1  Counting atoms  ====================================================
+
+atomCount <- function(IN) {
+  # count the number of atoms in a PDB formatted file
+  # Parameters:
+  #     IN  chr  path of the file to read
+  # Value:
+  #         numeric  number of lines that match "^ATOM  " or "^HETATM"
+  # Note: the regex MUST be anchored to the beginning of the line, otherwise
+  # it might match somewhere in a comment!
+  x <- readLines(IN)
+  patt <- "(^ATOM  )|(^HETATM)"
+  return(length(grep(patt, x)))
+}
+
+
+# ==   3.2  Counting C-alpha records  ==========================================
+
+
+CAcount <- function(IN) {
+  # count the number of C-alpha atoms in a PDB formatted file
+  # Parameters:
+  #     IN  chr  path of the file to read
+  # Value:
+  #         numeric  number of lines that match " CA " in position 13 - 16 of
+  #                  an ATOM record.
+  # Note: the regex MUST be aligned into the right position, otherwise it
+  #       might match Calcium records!
+  x <- readLines(IN)
+  patt <- "^ATOM  ...... CA "
+  return(length(grep(patt, x)))
+}
+
+
+
+# [END]
--- a/RPR-SX-PDB.R
+++ b/RPR-SX-PDB.R
--- a/RPR-UniProt_GET.R
+++ b/RPR-UniProt_GET.R
@ -1,135 +1,135 @@
-# tocID <- "RPR-UniProt_GET.R"
-#
-# Purpose:  A Bioinformatics Course:
-#              R code accompanying the RPR-Scripting_data_downloads unit.
-#
-# Version:  1.2
-#
-# Date:     2017-10  -  2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           1.2    2020 Maintenance. Made dbFetchUniProtSeq() vector-safe and
-#                  added FASTA headers as attribute
-#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout
-#           1.0    First ABC units version
-#           0.1    First code copied from 2016 material.
-#
-#
-# TODO:
-#
-#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC> 
-#TOC>   Section  Title                                      Line
-#TOC> ----------------------------------------------------------
-#TOC>   1        UniProt files via GET                        43
-#TOC>   1.1        Task - fetchUniProtSeq() function         105
-#TOC>   2        Task solutions                              118
-#TOC> 
-#TOC> ==========================================================================
-
-
-# =    1  UniProt files via GET  ===============================================
-
-
-# Perhaps the simplest example of scripted download is to retrieve a protein
-# FASTA sequence from UniProt. All we need is to construct an URL with the
-# correct UniProt ID.
-
-# An interface between R scripts and Web servers is provided by the httr::
-# package. This sends and receives information via the http protocol, just like
-# a Web browser. Since this is a short and simple request, the GET verb is the
-# right tool:
-
-if (! requireNamespace("httr", quietly = TRUE)) {
-  install.packages("httr")
-}
-# Package information:
-#  library(help = httr)       # basic information
-#  browseVignettes("httr")    # available vignettes
-#  data(package = "httr")     # available datasets
-
-
-# The UniProt ID for Mbp1 is ...
-
-UniProtID <- "P39678"
-
-# and the base URL to retrieve data is  ...
-# http://www.uniprot.org/uniprot/ . We can construct a simple URL to
-# retrieve a FASTA sequence:
-
-(URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID))
-
-# the GET() function from httr will get the data.
-response <- httr::GET(URL)
-
-str(response) # the response object is a bit complex ...
-as.character(response) # ... but it is easy to pull out the data.
-
-# to process  ...
-x <- as.character(response)
-x <- strsplit(x, "\n")
-dbSanitizeSequence(x)
-
-# Simple.
-# But what happens if there is an error, e.g. the uniprot ID does not exist?
-
-response <- httr::GET("http://www.uniprot.org/uniprot/X000000.fasta")
-as.character(response)
-# this is a large HTML page that tells us the URL was not found. So we need to
-# check for errors.  The Right Way to do this is to evaluate the staus code that
-# every Web server returns for every transaction.
-#
-httr::status_code(response)  # 404 == Page Not Found
-
-# There are many possible codes, but the only code we will be happy with
-# is 200 - oK.
-# (cf. https://en.wikipedia.org/wiki/List_of_HTTP_status_codes )
-
-URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID)
-response <- httr::GET(URL)
-httr::status_code(response)
-
-
-# ==   1.1  Task - fetchUniProtSeq() function  =================================
-
-# Task: write a function that
-#   - takes as input a vector of UniProt IDs,
-#   - fetches the FASTA sequence for each
-#   - returns a vector of the same length as the input, where an element is:
-#   -  ...  the sequence, if the query was successful
-#   -  ...  NA if there was an error
-#   - each element has the UniProt ID as the name()
-#   - bonus: the output has an attribute "headers" that is a vector of the
-#            FASTA headers ( cf. ?attr )
-
-
-# =    2  Task solutions  ======================================================
-
-
-# I have placed such a function - dbFetchUniProtSeq() - into
-# "./scripts/ABC-dbUtilities.R": look it up by clicking on  dbFetchUniProtSeq()
-# in the Environment pane.
-
-# Test this:
-( x <- dbFetchUniProtSeq("P39678") )
-names(x)[1]
-attr(x, "headers")[1]
-x[1]
-cat(writeFASTA(data.frame(head = attr(x, "headers")[1], seq  =x[1]),
-               width = 40), sep = "\n")
-
-
-
-# [END]
+# tocID <- "RPR-UniProt_GET.R"
+#
+# Purpose:  A Bioinformatics Course:
+#              R code accompanying the RPR-Scripting_data_downloads unit.
+#
+# Version:  1.2
+#
+# Date:     2017-10  -  2020-09
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.2    2020 Maintenance. Made dbFetchUniProtSeq() vector-safe and
+#                  added FASTA headers as attribute
+#           1.1    Change from require() to requireNamespace(),
+#                      use <package>::<function>() idiom throughout
+#           1.0    First ABC units version
+#           0.1    First code copied from 2016 material.
+#
+#
+# TODO:
+#
+#
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+# going on. That's not how it works ...
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                                      Line
+#TOC> ----------------------------------------------------------
+#TOC>   1        UniProt files via GET                        43
+#TOC>   1.1        Task - fetchUniProtSeq() function         105
+#TOC>   2        Task solutions                              118
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  UniProt files via GET  ===============================================
+
+
+# Perhaps the simplest example of scripted download is to retrieve a protein
+# FASTA sequence from UniProt. All we need is to construct a URL with the
+# correct UniProt ID.
+
+# An interface between R scripts and Web servers is provided by the httr::
+# package. This sends and receives information via the http protocol, just like
+# a Web browser. Since this is a short and simple request, the GET verb is the
+# right tool:
+
+if (! requireNamespace("httr", quietly = TRUE)) {
+  install.packages("httr")
+}
+# Package information:
+#  library(help = httr)       # basic information
+#  browseVignettes("httr")    # available vignettes
+#  data(package = "httr")     # available datasets
+
+
+# The UniProt ID for Mbp1 is ...
+
+UniProtID <- "P39678"
+
+# and the base URL to retrieve data is  ...
+# http://www.uniprot.org/uniprot/ . We can construct a simple URL to
+# retrieve a FASTA sequence:
+
+(URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID))
+
+# the GET() function from httr will get the data.
+response <- httr::GET(URL)
+
+str(response) # the response object is a bit complex ...
+as.character(response) # ... but it is easy to pull out the data.
+
+# to process  ...
+x <- as.character(response)
+x <- strsplit(x, "\n")
+dbSanitizeSequence(x)
+
+# Simple.
+# But what happens if there is an error, e.g. the uniprot ID does not exist?
+
+response <- httr::GET("http://www.uniprot.org/uniprot/X000000.fasta")
+as.character(response)
+# this is a large HTML page that tells us the URL was not found. So we need to
+# check for errors.  The Right Way to do this is to evaluate the status code that
+# every Web server returns for every transaction.
+#
+httr::status_code(response)  # 404 == Page Not Found
+
+# There are many possible codes, but the only code we will be happy with
+# is 200 - OK.
+# (cf. https://en.wikipedia.org/wiki/List_of_HTTP_status_codes )
+
+URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID)
+response <- httr::GET(URL)
+httr::status_code(response)
+
+
+# ==   1.1  Task - fetchUniProtSeq() function  =================================
+
+# Task: write a function that
+#   - takes as input a vector of UniProt IDs,
+#   - fetches the FASTA sequence for each
+#   - returns a vector of the same length as the input, where an element is:
+#   -  ...  the sequence, if the query was successful
+#   -  ...  NA if there was an error
+#   - each element has the UniProt ID as the name()
+#   - bonus: the output has an attribute "headers" that is a vector of the
+#            FASTA headers ( cf. ?attr )
+
+
+# =    2  Task solutions  ======================================================
+
+
+# I have placed such a function - dbFetchUniProtSeq() - into
+# "./scripts/ABC-dbUtilities.R": look it up by clicking on  dbFetchUniProtSeq()
+# in the Environment pane.
+
+# Test this:
+( x <- dbFetchUniProtSeq("P39678") )
+names(x)[1]
+attr(x, "headers")[1]
+x[1]
+cat(writeFASTA(data.frame(head = attr(x, "headers")[1], seq  =x[1]),
+               width = 40), sep = "\n")
+
+
+
+# [END]
--- a/RPR-Unit_testing.R
+++ b/RPR-Unit_testing.R
@ -1,234 +1,234 @@
-# tocID <- "RPR-Unit_testing.R"
-#
-# Purpose:  A Bioinformatics Course:
-#              R code accompanying the RPR-Unit_testing unit.
-#
-# Version:  1.2
-#
-# Date:     2017  10  -  2019  01
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           1.2    2020 Updates. Discuss local tests.
-#           1.1    Change from require() to requireNamespace()
-#           1.0    New code
-#
-#
-# TODO:
-#
-#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC> 
-#TOC>   Section  Title                             Line
-#TOC> -------------------------------------------------
-#TOC>   1        Unit Tests with testthat            42
-#TOC>   2        Organizing your tests              165
-#TOC>   2.1        Testing scripts                  189
-#TOC>   2.2        Rethinking testing               202
-#TOC>   3        Task solutions                     220
-#TOC> 
-#TOC> ==========================================================================
-
-
-# =    1  Unit Tests with testthat  ============================================
-
-# The testthat package supports writing and executing unit tests in many ways.
-
-if (! requireNamespace("testthat", quietly = TRUE)) {
-  install.packages("testthat")
-}
-# Package information:
-#  library(help = testthat)       # basic information
-#  browseVignettes("testthat")    # available vignettes
-#  data(package = "testthat")     # available datasets
-
-# testthat is one of those packages that we either use A LOT in a script,
-# or not at all. Therefore it's more reasonable to depart from our usual
-# <package>::<function>() idiom, and load the entire library. In fact, if
-# we author packages, it is common practice to load testthat in the part
-# of the package that automates testing.
-
-library(testthat)
-
-# An atomic test consists of an expectation about the bahaviour of a function or
-# the existence of an object. testthat provides a number of useful expectations:
-
-# At the most basic level, you can use expect_true() and expect_false():
-
-expect_true(file.exists("./data/S288C_YDL056W_MBP1_coding.fsa"))
-expect_true(file.exists("NO-SUCH-FILE.txt"))
-
-expect_false(is.integer(NA))
-
-# More commonly, you will test for equality of an output with a given result.
-# But you need to consider what it means for two numbers to be "equal" on a
-# digital computer. Consider:
-
-49*(1/49) == 1      # Surprised? Read FAQ 7.31
-                    # https://cran.r-project.org/doc/FAQ/R-FAQ.html
-49*(1/49) - 1       # NOT zero (but almost)
-
-# This is really unpredictable ...
-0.1 + 0.05 == 0.15
-0.2 + 0.07 == 0.27
-
-# It's easy to be caught on the wrong foot with numeric comparisons, therefore
-# R uses the function all.equal() to test whether two numbers are equal for
-# practical puposes up to machine precision.
-49*(1/49) == 1
-all.equal(49*(1/49), 1)
-
-# The testthat function expect_equal() uses all.equal internally:
-expect_equal(49*(1/49), 1)
-
-# ... which is reasonable, or, if things MUST be exactly the same ...
-expect_identical(49*(1/49), 1)
-
-# ... but consider:
-expect_identical(2, 2L) # one is typeof() "double", the other is integer"
-
-# Some very useful expectations are expect_warning(), and expect_error(), for
-# constructing tests that check for erroneous output:
-
-as.integer(c("1", "2", "three"))
-expect_warning(as.integer(c("1", "2", "three"))) # Note that the warning is NOT
-                                                 # printed.
-1/"x"
-expect_warning(1/"x")
-expect_error(1/"x")      # Again: note that the error is NOT printed, as well
-                         # code execution will continue.
-
-# Even better, you can check if the warning or error is what you expect it
-# to be - because it could actually have occured somewhere else in your code.
-
-v <- c("1", "x")
-log(v[1:2])
-expect_error(log(v[1:2]), "non-numeric argument to mathematical function")
-expect_error(log(v[1:2]), "non-numeric") # We can abbreviate the error message.
-expect_error(log(v[1,2]))                # This appears oK, but ...
-expect_error(log(v[1,2]), "non-numeric") # ... it's actually a different error!
-
-# Producing unit tests simply means: we define a function, and then we check
-# whether all test pass. Consider a function that is loaded on startup from
-# the .utilities.R script:
-
-biCode
-
-# We could test it like so:
-
-expect_equal(biCode(""), ".....")
-expect_equal(biCode(" "), ".....")
-expect_equal(biCode("123 12"), ".....")
-expect_equal(biCode("h sapiens"), "H..SA")
-expect_equal(biCode("homo sapiens"), "HOMSA")
-expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA")
-expect_equal(biCode(c("Phascolarctos cinereus", "Macropus rufus")),
-             c("PHACI", "MACRU"))
-expect_error(biCode(), "argument \"s\" is missing, with no default")
-
-# The test_that() function allows to group related tests, include an informative
-# message which test is being executed, and run a number of tests that are
-# passed to the function inside a code block - i.e. {...}
-# test_that("<descriptive string>, {<code block>})
-
-test_that("NA values are preserved", {
-  # bicode() respects vector length: input and output must have the smae length.
-  # Therefore NA's can't be simply skipped, bust must be properly passed
-  # into output:
-  expect_true(is.na((biCode(NA))))
-  expect_equal(biCode(c("first", NA, "last")),
-               c("FIRST", NA, "LAST."))
-})
-
-
-# Task: Write a function calcGC() that calculates GC content in a sequence.
-#       Hint: you could strsplit() the sequence into a vector, and count
-#       G's and C's; or you could use gsub("[AT]", "", <sequence>) to remove
-#       A's and T's, and use nchar() before and after to calculate the content
-#       from the length difference.
-#       Then write tests that:
-#          confirm that calcGC("AATT") is 0;
-#          confirm that calcGC("ATGC") is 0.5;
-#          confirm that calcGC("AC")   is 0.5;
-#          confirm that calcGC("CGCG") is 1;
-
-
-# =    2  Organizing your tests  ===============================================
-
-
-# Tests are only useful if they are actually executed and we need to make sure
-# there are no barriers to do that. The testthat package supports automatic
-# execution of tests:
-#  - put your tests into an R-script,
-#  - save your tests in a file called "test_<my-function-name>.R"
-#  - execute the test with test_file("test_<my-function-name>.R") ...
-#  ... or, if you are working on a project ...
-#  - place the file in a test-directory (e.g. the directory "test" in this
-#      project),
-#  - execute all your tests with test_dir("<my-test-directory>")
-
-# For example I have provided a "tests" directory with this project, and
-# placed the file "test_biCode.R" inside.
-file.show("./tests/test_biCode.R")
-
-# Execute the file ...
-test_file("./tests/test_biCode.R")
-
-# .. or execute all the test files in the directory:
-test_dir("./tests")
-
-# ==   2.1  Testing scripts  ===================================================
-
-# Scripts need special consideration since we do not necessarily source() them
-# entirely. Therefore automated testing is not reasonable. What you can do
-# instead is to place a conditional block at the end of your script, that
-# never gets executed - then you can manually execute the code in the block
-# whenever you wish to test your functions. For example:
-
-if (FALSE) {
-  # ... your tests go here
-
-}
-
-# ==   2.2  Rethinking testing  ================================================
-
-# However, it is important to keep in mind that different objectives lead to
-# different ideas of what works best. There is never a "best" in and of itself,
-# the question is always: "Best for what?" While automated unit testing is a
-# great way to assure the integrity of packages and larger software artefacts as
-# they are being developed, more loosely conceived aggregates of code - like the
-# scripts for this course for example - have different objectives and in this
-# case I find the testthat approach to actually be inferior. The reason is its
-# tendency to physically separate code and tests. Keeping assets, and functions
-# that operate on those assets separated is always poor design. I have found
-# over time that a more stable approach is to move individual functions into
-# their individual scripts, all in one folder, one function (and its helpers)
-# per file, and examples, demos and tests in an if (FALSE) { ... } block, as
-# explained above.
-
-
-
-# =    3  Task solutions  ======================================================
-
-calcGC <- function(s) {
-  s <- gsub("[^agctAGCT]", "", s)
-  return(nchar(gsub("[atAT]", "", s)) / nchar(s))
-}
-
-expect_equal(calcGC("AATT"), 0)
-expect_equal(calcGC("ATGC"), 0.5)
-expect_equal(calcGC("AC"),   0.5)
-expect_equal(calcGC("CGCG"), 1)
-
-
-
-# [END]
+# tocID <- "RPR-Unit_testing.R"
+#
+# Purpose:  A Bioinformatics Course:
+#              R code accompanying the RPR-Unit_testing unit.
+#
+# Version:  1.2
+#
+# Date:     2017  10  -  2019  01
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.2    2020 Updates. Discuss local tests.
+#           1.1    Change from require() to requireNamespace()
+#           1.0    New code
+#
+#
+# TODO:
+#
+#
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+# going on. That's not how it works ...
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                             Line
+#TOC> -------------------------------------------------
+#TOC>   1        Unit Tests with testthat            42
+#TOC>   2        Organizing your tests              165
+#TOC>   2.1        Testing scripts                  189
+#TOC>   2.2        Rethinking testing               202
+#TOC>   3        Task solutions                     220
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  Unit Tests with testthat  ============================================
+
+# The testthat package supports writing and executing unit tests in many ways.
+
+if (! requireNamespace("testthat", quietly = TRUE)) {
+  install.packages("testthat")
+}
+# Package information:
+#  library(help = testthat)       # basic information
+#  browseVignettes("testthat")    # available vignettes
+#  data(package = "testthat")     # available datasets
+
+# testthat is one of those packages that we either use A LOT in a script,
+# or not at all. Therefore it's more reasonable to depart from our usual
+# <package>::<function>() idiom, and load the entire library. In fact, if
+# we author packages, it is common practice to load testthat in the part
+# of the package that automates testing.
+
+library(testthat)
+
+# An atomic test consists of an expectation about the behaviour of a function or
+# the existence of an object. testthat provides a number of useful expectations:
+
+# At the most basic level, you can use expect_true() and expect_false():
+
+expect_true(file.exists("./data/S288C_YDL056W_MBP1_coding.fsa"))
+expect_true(file.exists("NO-SUCH-FILE.txt"))
+
+expect_false(is.integer(NA))
+
+# More commonly, you will test for equality of an output with a given result.
+# But you need to consider what it means for two numbers to be "equal" on a
+# digital computer. Consider:
+
+49*(1/49) == 1      # Surprised? Read FAQ 7.31
+                    # https://cran.r-project.org/doc/FAQ/R-FAQ.html
+49*(1/49) - 1       # NOT zero (but almost)
+
+# This is really unpredictable ...
+0.1 + 0.05 == 0.15
+0.2 + 0.07 == 0.27
+
+# It's easy to be caught on the wrong foot with numeric comparisons, therefore
+# R uses the function all.equal() to test whether two numbers are equal for
+# practical purposes up to machine precision.
+49*(1/49) == 1
+all.equal(49*(1/49), 1)
+
+# The testthat function expect_equal() uses all.equal internally:
+expect_equal(49*(1/49), 1)
+
+# ... which is reasonable, or, if things MUST be exactly the same ...
+expect_identical(49*(1/49), 1)
+
+# ... but consider:
+expect_identical(2, 2L) # one is typeof() "double", the other is "integer"
+
+# Some very useful expectations are expect_warning(), and expect_error(), for
+# constructing tests that check for erroneous output:
+
+as.integer(c("1", "2", "three"))
+expect_warning(as.integer(c("1", "2", "three"))) # Note that the warning is NOT
+                                                 # printed.
+1/"x"
+expect_warning(1/"x")
+expect_error(1/"x")      # Again: note that the error is NOT printed, and
+                         # code execution will continue.
+
+# Even better, you can check if the warning or error is what you expect it
+# to be - because it could actually have occurred somewhere else in your code.
+
+v <- c("1", "x")
+log(v[1:2])
+expect_error(log(v[1:2]), "non-numeric argument to mathematical function")
+expect_error(log(v[1:2]), "non-numeric") # We can abbreviate the error message.
+expect_error(log(v[1,2]))                # This appears oK, but ...
+expect_error(log(v[1,2]), "non-numeric") # ... it's actually a different error!
+
+# Producing unit tests simply means: we define a function, and then we check
+# whether all tests pass. Consider a function that is loaded on startup from
+# the .utilities.R script:
+
+biCode
+
+# We could test it like so:
+
+expect_equal(biCode(""), ".....")
+expect_equal(biCode(" "), ".....")
+expect_equal(biCode("123 12"), ".....")
+expect_equal(biCode("h sapiens"), "H..SA")
+expect_equal(biCode("homo sapiens"), "HOMSA")
+expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA")
+expect_equal(biCode(c("Phascolarctos cinereus", "Macropus rufus")),
+             c("PHACI", "MACRU"))
+expect_error(biCode(), "argument \"s\" is missing, with no default")
+
+# The test_that() function allows us to group related tests, include an
+# informative message about which test is being executed, and run a number of
+# tests that are passed to the function inside a code block - i.e. {...}
+# test_that("<descriptive string>", {<code block>})
+
+test_that("NA values are preserved", {
+  # biCode() respects vector length: input and output must have the same length.
+  # Therefore NA's can't be simply skipped, but must be properly passed
+  # into output:
+  expect_true(is.na((biCode(NA))))
+  expect_equal(biCode(c("first", NA, "last")),
+               c("FIRST", NA, "LAST."))
+})
+
+
+# Task: Write a function calcGC() that calculates GC content in a sequence.
+#       Hint: you could strsplit() the sequence into a vector, and count
+#       G's and C's; or you could use gsub("[AT]", "", <sequence>) to remove
+#       A's and T's, and use nchar() before and after to calculate the content
+#       from the length difference.
+#       Then write tests that:
+#          confirm that calcGC("AATT") is 0;
+#          confirm that calcGC("ATGC") is 0.5;
+#          confirm that calcGC("AC")   is 0.5;
+#          confirm that calcGC("CGCG") is 1;
+
+
+# =    2  Organizing your tests  ===============================================
+
+
+# Tests are only useful if they are actually executed and we need to make sure
+# there are no barriers to do that. The testthat package supports automatic
+# execution of tests:
+#  - put your tests into an R-script,
+#  - save your tests in a file called "test_<my-function-name>.R"
+#  - execute the test with test_file("test_<my-function-name>.R") ...
+#  ... or, if you are working on a project ...
+#  - place the file in a test-directory (e.g. the directory "tests" in this
+#      project),
+#  - execute all your tests with test_dir("<my-test-directory>")
+
+# For example I have provided a "tests" directory with this project, and
+# placed the file "test_biCode.R" inside.
+file.show("./tests/test_biCode.R")
+
+# Execute the file ...
+test_file("./tests/test_biCode.R")
+
+# .. or execute all the test files in the directory:
+test_dir("./tests")
+
+# ==   2.1  Testing scripts  ===================================================
+
+# Scripts need special consideration since we do not necessarily source() them
+# entirely. Therefore automated testing is not reasonable. What you can do
+# instead is to place a conditional block at the end of your script, that
+# never gets executed - then you can manually execute the code in the block
+# whenever you wish to test your functions. For example:
+
+if (FALSE) {
+  # ... your tests go here
+
+}
+
+# ==   2.2  Rethinking testing  ================================================
+
+# However, it is important to keep in mind that different objectives lead to
+# different ideas of what works best. There is never a "best" in and of itself,
+# the question is always: "Best for what?" While automated unit testing is a
+# great way to assure the integrity of packages and larger software artefacts as
+# they are being developed, more loosely conceived aggregates of code - like the
+# scripts for this course for example - have different objectives and in this
+# case I find the testthat approach to actually be inferior. The reason is its
+# tendency to physically separate code and tests. Keeping assets, and functions
+# that operate on those assets separated is always poor design. I have found
+# over time that a more stable approach is to move individual functions into
+# their individual scripts, all in one folder, one function (and its helpers)
+# per file, and examples, demos and tests in an if (FALSE) { ... } block, as
+# explained above.
+
+
+
+# =    3  Task solutions  ======================================================
+
+calcGC <- function(s) {
+  s <- gsub("[^agctAGCT]", "", s)
+  return(nchar(gsub("[atAT]", "", s)) / nchar(s))
+}
+
+expect_equal(calcGC("AATT"), 0)
+expect_equal(calcGC("ATGC"), 0.5)
+expect_equal(calcGC("AC"),   0.5)
+expect_equal(calcGC("CGCG"), 1)
+
+
+
+# [END]
--- a/RPR-eUtils_XML.R
+++ b/RPR-eUtils_XML.R
@ -1,166 +1,166 @@
-# tocID <- "RPR-eUtils_XML.R"
-#
-# Purpose:  A Bioinformatics Course:
-#              R code accompanying the RPR-Scripting_data_downloads unit.
-#
-# Version:  1.2.1
-#
-# Date:     2017-10  -  2021-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           1.2.1  2021 Maintenance
-#           1.2    2020 Updates
-#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout
-#           1.0    First ABC units version
-#           0.1    First code copied from 2016 material.
-#
-#
-# TODO:
-#
-#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
-# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC> 
-#TOC>   Section  Title                                       Line
-#TOC> -----------------------------------------------------------
-#TOC>   1        Working with NCBI eUtils                      43
-#TOC>   1.1        Task - fetchNCBItaxData() function         145
-#TOC>   2        Task solutions                               152
-#TOC> 
-#TOC> ==========================================================================
-
-
-# =    1  Working with NCBI eUtils  ============================================
-
-
-# To begin, we load the xml2 package that contains functions
-# we need to receive and parse html data. NCBI's eUtils send information in
-# XML format so we need to be able to parse XML.
-if (! requireNamespace("xml2", quietly=TRUE)) {
-  install.packages("xml2")
-}
-# Package information:
-#  library(help = xml2)       # basic information
-#  browseVignettes("xml2")    # available vignettes
-#  data(package = "xml2")     # available datasets
-
-
-
-# We will walk through the process with the refSeqID
-# of yeast Mbp1
-refSeqID <- "NP_010227"
-
-
-# First we build a query URL...
-eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
-
-
-# Then we assemble an URL that will search for get the
-# unique, NCBI internal identifier,
-# for our refSeqID...
-URL <- paste(eUtilsBase,
-             "esearch.fcgi?",     # ...using the esearch program
-                                  # that finds an entry in an
-                                  # NCBI database
-             "db=protein",
-             "&term=", refSeqID,
-             sep="")
-# Copy the URL and paste it into your browser to see
-# what the response should look like.
-URL
-
-# To fetch a response in R, we use the function read_xml()
-# with our URL as its argument.
-( myXML <- xml2::read_xml(URL) )
-
-# This is XML. We can take the response apart into
-# its individual components with the as_list() function.
-
-xml2::as_list(myXML)
-
-# Note how the XML "tree" is represented as a list of
-# lists of lists ...
-# If we know exactly what element we are looking for,
-# we can extract it from this structure:
-xml2::as_list(myXML)[["eSearchResult"]][["IdList"]][["Id"]][[1]]
-
-# But this is not very robust, it would break with the
-# slightest change that the NCBI makes to their data format -
-# and the NCBI changes things A LOT!
-
-# Somewhat more robust is to specify the type of element
-# we want - its the text contained in an <Id>...</Id>
-# element, and use the XPath XML parsing language to
-# retrieve it.
-
-xml2::xml_find_all(myXML, "//Id") # returns a "node set"
-
-xml2::xml_text(xml2::xml_find_all(myXML, "//Id")) # returns the contents
-                                                  # of the node set
-
-# We will need to do this more than once, so we write a function
-# for it...
-node2text <- function(doc, tag) {
-  # an extractor function for the contents of elements
-  # between given tags in an XML response.
-  # Contents of all matching elements is returned in
-  # a vector of strings.
-  path <- paste0("//", tag)
-  nodes <- xml2::xml_find_all(doc, path)
-  return(xml2::xml_text(nodes))
-}
-
-# using node2text() ...
-(GID <- node2text(myXML, "Id"))
-
-# The GI is the pivot for data requests at the
-# NCBI.
-
-# Let's first get the associated data for this GI
-URL <- paste0(eUtilsBase,
-              "esummary.fcgi?",
-              "db=protein",
-              "&id=",
-              GID,
-              "&version=2.0")
-(myXML <- xml2::read_xml(URL))
-
-(taxID <- node2text(myXML, "TaxId"))
-(organism <- node2text(myXML, "Organism"))
-
-#  This forms the base of a function that gets taxonomy data
-#  from an Entrez result. You can write this!
-
-
-# ==   1.1  Task - fetchNCBItaxData() function  ================================
-
-# Task: write a function that takes as input a RefSeq ID, fetches the taxonomy
-# information, returns a list with taxID and organism, if the operation is
-# successful, or a list of length 0 if there is an error.
-
-
-# =    2  Task solutions  ======================================================
-
-# I have placed such a function into the dbUtilities script: look it up by
-# clicking on  dbFetchNCBItaxData() in the Environment pane.
-
-# Test:
-dbFetchNCBItaxData("XP_001837394")
-
-# Expected outout:
-# ----------------
-# taxID                         organism
-# 1 240176 Coprinopsis cinerea okayama7#130
-
-
-# [END]
+# tocID <- "RPR-eUtils_XML.R"
+#
+# Purpose:  A Bioinformatics Course:
+#              R code accompanying the RPR-Scripting_data_downloads unit.
+#
+# Version:  1.2.1
+#
+# Date:     2017-10  -  2021-09
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.2.1  2021 Maintenance
+#           1.2    2020 Updates
+#           1.1    Change from require() to requireNamespace(),
+#                      use <package>::<function>() idiom throughout
+#           1.0    First ABC units version
+#           0.1    First code copied from 2016 material.
+#
+#
+# TODO:
+#
+#
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+#
+# If there are portions you don't understand, use R's help system, Google for an
+# answer, or ask your instructor. Don't continue if you don't understand what's
+# going on. That's not how it works ...
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                                       Line
+#TOC> -----------------------------------------------------------
+#TOC>   1        Working with NCBI eUtils                      43
+#TOC>   1.1        Task - fetchNCBItaxData() function         145
+#TOC>   2        Task solutions                               152
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  Working with NCBI eUtils  ============================================
+
+
+# To begin, we load the xml2 package that contains functions
+# we need to receive and parse html data. NCBI's eUtils send information in
+# XML format so we need to be able to parse XML.
+if (! requireNamespace("xml2", quietly=TRUE)) {
+  install.packages("xml2")
+}
+# Package information:
+#  library(help = xml2)       # basic information
+#  browseVignettes("xml2")    # available vignettes
+#  data(package = "xml2")     # available datasets
+
+
+
+# We will walk through the process with the refSeqID
+# of yeast Mbp1
+refSeqID <- "NP_010227"
+
+
+# First we build a query URL...
+eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+
+
+# Then we assemble a URL that will search for and get the
+# unique, NCBI-internal identifier
+# for our refSeqID...
+URL <- paste(eUtilsBase,
+             "esearch.fcgi?",     # ...using the esearch program
+                                  # that finds an entry in an
+                                  # NCBI database
+             "db=protein",
+             "&term=", refSeqID,
+             sep="")
+# Copy the URL and paste it into your browser to see
+# what the response should look like.
+URL
+
+# To fetch a response in R, we use the function read_xml()
+# with our URL as its argument.
+( myXML <- xml2::read_xml(URL) )
+
+# This is XML. We can take the response apart into
+# its individual components with the as_list() function.
+
+xml2::as_list(myXML)
+
+# Note how the XML "tree" is represented as a list of
+# lists of lists ...
+# If we know exactly what element we are looking for,
+# we can extract it from this structure:
+xml2::as_list(myXML)[["eSearchResult"]][["IdList"]][["Id"]][[1]]
+
+# But this is not very robust, it would break with the
+# slightest change that the NCBI makes to their data format -
+# and the NCBI changes things A LOT!
+
+# Somewhat more robust is to specify the type of element
+# we want - it's the text contained in an <Id>...</Id>
+# element, and use the XPath XML parsing language to
+# retrieve it.
+
+xml2::xml_find_all(myXML, "//Id") # returns a "node set"
+
+xml2::xml_text(xml2::xml_find_all(myXML, "//Id")) # returns the contents
+                                                  # of the node set
+
+# We will need to do this more than once, so we write a function
+# for it...
+node2text <- function(doc, tag) {
+  # an extractor function for the contents of elements
+  # between given tags in an XML response.
+  # Contents of all matching elements are returned in
+  # a vector of strings.
+  path <- paste0("//", tag)
+  nodes <- xml2::xml_find_all(doc, path)
+  return(xml2::xml_text(nodes))
+}
+
+# using node2text() ...
+(GID <- node2text(myXML, "Id"))
+
+# The GI is the pivot for data requests at the
+# NCBI.
+
+# Let's first get the associated data for this GI
+URL <- paste0(eUtilsBase,
+              "esummary.fcgi?",
+              "db=protein",
+              "&id=",
+              GID,
+              "&version=2.0")
+(myXML <- xml2::read_xml(URL))
+
+(taxID <- node2text(myXML, "TaxId"))
+(organism <- node2text(myXML, "Organism"))
+
+#  This forms the base of a function that gets taxonomy data
+#  from an Entrez result. You can write this!
+
+
+# ==   1.1  Task - fetchNCBItaxData() function  ================================
+
+# Task: write a function that takes as input a RefSeq ID, fetches the taxonomy
+# information, returns a list with taxID and organism, if the operation is
+# successful, or a list of length 0 if there is an error.
+
+
+# =    2  Task solutions  ======================================================
+
+# I have placed such a function into the dbUtilities script: look it up by
+# clicking on  dbFetchNCBItaxData() in the Environment pane.
+
+# Test:
+dbFetchNCBItaxData("XP_001837394")
+
+# Expected output:
+# ----------------
+# taxID                         organism
+# 1 240176 Coprinopsis cinerea okayama7#130
+
+
+# [END]
--- a/data/0TST.pdb
+++ b/data/0TST.pdb
@ -1,10 +1,10 @@
-HEADER   TEST                                                 0TST      0TST   1
-REMARK     A CATALOGUE OF ATOM AND HETATM RECORDS                       0TST   2
-ATOM      1  N   GLY     1      -6.253  75.745  53.559  1.00 36.34      0TST   3
-ATOM      2  CA  GLY     1      -5.789  75.223  52.264  1.00 44.94      0TST   4
-ATOM      3  C   GLY     1      -5.592  73.702  52.294  1.00 32.28      0TST   5
-ATOM      4  O   GLY     1      -5.140  73.148  53.304  1.00 19.32      0TST   6
-TER       5      GLY     1                                              0TST   7
-HETATM    6  O   HOH     1      -4.169  60.050  40.145  1.00  3.00      0TST   8
-HETATM    7 CA   CA      1      -1.258 -71.579  50.253  1.00  3.00      0TST   9
-END                                                                     0TST  10
+HEADER   TEST                                                 0TST      0TST   1
+REMARK     A CATALOGUE OF ATOM AND HETATM RECORDS                       0TST   2
+ATOM      1  N   GLY     1      -6.253  75.745  53.559  1.00 36.34      0TST   3
+ATOM      2  CA  GLY     1      -5.789  75.223  52.264  1.00 44.94      0TST   4
+ATOM      3  C   GLY     1      -5.592  73.702  52.294  1.00 32.28      0TST   5
+ATOM      4  O   GLY     1      -5.140  73.148  53.304  1.00 19.32      0TST   6
+TER       5      GLY     1                                              0TST   7
+HETATM    6  O   HOH     1      -4.169  60.050  40.145  1.00  3.00      0TST   8
+HETATM    7 CA   CA      1      -1.258 -71.579  50.253  1.00  3.00      0TST   9
+END                                                                     0TST  10
--- a/data/1BM8.pdb
+++ b/data/1BM8.pdb
--- a/data/2F1C.fa
+++ b/data/2F1C.fa
@ -1,5 +1,5 @@
->2F1C:X|PDBID|CHAIN|SEQUENCE
-EERNDWHFNIGAMYEIENVEGYGEDMDGLAEPSVYFNAANGPWRIALAYYQEGPVDYSAGKRGTWFDRPELEVHYQFLEN
-DDFSFGLTGGFRNYGYHYVDEPGKDTANMQRWKIAPDWDVKLTDDLRFNGWLSMYKFANDLNTTGYADTRVETETGLQYT
-FNETVALRVNYYLERGFNMDDSRNNGEFSTQEIRAYLPLTLGNHSVTPYTRIGLDRWSNWDWQDDIEREGHDFNRVGLFY
+>2F1C:X|PDBID|CHAIN|SEQUENCE
+EERNDWHFNIGAMYEIENVEGYGEDMDGLAEPSVYFNAANGPWRIALAYYQEGPVDYSAGKRGTWFDRPELEVHYQFLEN
+DDFSFGLTGGFRNYGYHYVDEPGKDTANMQRWKIAPDWDVKLTDDLRFNGWLSMYKFANDLNTTGYADTRVETETGLQYT
+FNETVALRVNYYLERGFNMDDSRNNGEFSTQEIRAYLPLTLGNHSVTPYTRIGLDRWSNWDWQDDIEREGHDFNRVGLFY
 GYDFQNGLSVSLEYAFEWQDHDEGDSDKFHYAGVGVNYSFHHHHHH
--- a/data/3FG7.fa
+++ b/data/3FG7.fa
@ -1,6 +1,6 @@
->3FG7:A|PDBID|CHAIN|SEQUENCE
-MAEEHHHHHHHHLEVLFQGPGRPKTHTVGSVAKVEQVKFDATSMHVKPQVAAQQKMVDDGSGEVQVWRIENLELVPVDSK
-WLGHFYGGDCYLLLYTYLIGEKQHYLLYVWQGSQASQDEITASAYQAVILDQKYNGEPVQIRVPMGKEPPHLMSIFKGRM
-VVYQGGTSRTNNLETGPSTRLFQVQGTGANNTKAFEVPARANFLNSNDVFVLKTQSCCYLWCGKGCSGDEREMAKMVADT
-ISRTEKQVVVEGQEPANFWMALGGKAPYANTKRLQEENLVITPRLFECSNKTGRFLATEIPDFNQDDLEEDDVFLLDVWD
-QVFFWIGKHANEEEKKAAATTAQEYLKTHPSGRDPETPIIVVKQGHEPPTFTGWFLAWDPFKWSGIHVVPNLSPLSNN
+>3FG7:A|PDBID|CHAIN|SEQUENCE
+MAEEHHHHHHHHLEVLFQGPGRPKTHTVGSVAKVEQVKFDATSMHVKPQVAAQQKMVDDGSGEVQVWRIENLELVPVDSK
+WLGHFYGGDCYLLLYTYLIGEKQHYLLYVWQGSQASQDEITASAYQAVILDQKYNGEPVQIRVPMGKEPPHLMSIFKGRM
+VVYQGGTSRTNNLETGPSTRLFQVQGTGANNTKAFEVPARANFLNSNDVFVLKTQSCCYLWCGKGCSGDEREMAKMVADT
+ISRTEKQVVVEGQEPANFWMALGGKAPYANTKRLQEENLVITPRLFECSNKTGRFLATEIPDFNQDDLEEDDVFLLDVWD
+QVFFWIGKHANEEEKKAAATTAQEYLKTHPSGRDPETPIIVVKQGHEPPTFTGWFLAWDPFKWSGIHVVPNLSPLSNN
--- a/data/MBP1_SACCE.json
+++ b/data/MBP1_SACCE.json
@ -1,20 +1,20 @@
-[
-  { "name" : "MBP1_SACCE",
-    "RefSeqID" : "NP_010227",
-    "UniProtID" : "P39678",
-    "taxonomyID" : 559292,
-    "sequence" : [
-       "MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLKETHEKVQGGF",
-       "GKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHASKVDRKKAIRSASTSAIMET",
-       "KRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRRKLGVNLQRSQSDMGFPRPAIPNSSISTTQL",
-       "PSIRSTMGPQSPTLGILEEERHDSRQQQPQQNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQ",
-       "QSSLIQTQQTESMATSVSSSPSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKV",
-       "NKYLSKLVDYFISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS",
-       "IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTPSAVYYLDVVL",
-       "SKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTTISNKEGLTANEIMNQQYEQM",
-       "MIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSPVSPSDYITYPSQIATNISRNIPNVVNSMKQ",
-       "MASIYNDLHEQHDNEIKSLQKTLKSISKTKIQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTK",
-       "KLRKRLIRYKRLIKQKLEYRQTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSS",
-       "LVKKFEDNAKIHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"]
-  }
-]
+[
+  { "name" : "MBP1_SACCE",
+    "RefSeqID" : "NP_010227",
+    "UniProtID" : "P39678",
+    "taxonomyID" : 559292,
+    "sequence" : [
+       "MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLKETHEKVQGGF",
+       "GKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHASKVDRKKAIRSASTSAIMET",
+       "KRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRRKLGVNLQRSQSDMGFPRPAIPNSSISTTQL",
+       "PSIRSTMGPQSPTLGILEEERHDSRQQQPQQNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQ",
+       "QSSLIQTQQTESMATSVSSSPSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKV",
+       "NKYLSKLVDYFISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS",
+       "IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTPSAVYYLDVVL",
+       "SKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTTISNKEGLTANEIMNQQYEQM",
+       "MIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSPVSPSDYITYPSQIATNISRNIPNVVNSMKQ",
+       "MASIYNDLHEQHDNEIKSLQKTLKSISKTKIQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTK",
+       "KLRKRLIRYKRLIKQKLEYRQTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSS",
+       "LVKKFEDNAKIHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"]
+  }
+]
--- a/data/PTPN5_HSa_coding.fa
+++ b/data/PTPN5_HSa_coding.fa
@ -1,30 +1,30 @@
->PTPN5-201 cds:protein_coding (ENST00000358540.7)
-ATGAATTATGAGGGAGCCAGGAGTGAGAGAGAGAACCACGCTGCTGATGACTCCGAGGGA
-GGGGCCCTGGACATGTGCTGCAGTGAGAGGCTACCGGGTCTCCCCCAGCCGATAGTGATG
-GAGGCACTGGACGAGGCTGAAGGGCTCCAGGACTCACAGAGAGAGATGCCGCCACCCCCT
-CCTCCCTCGCCGCCCTCAGATCCAGCTCAGAAGCCACCACCTCGAGGCGCTGGGAGCCAC
-TCCCTCACTGTCAGGAGCAGCCTGTGCCTGTTCGCTGCCTCACAGTTCCTGCTTGCCTGT
-GGGGTGCTCTGGTTCAGCGGTTATGGCCACATCTGGTCACAGAACGCCACAAACCTCGTC
-TCCTCTTTGCTGACGCTCCTGAAACAGCTGGAACCCACGGCCTGGCTTGACTCTGGGACG
-TGGGGAGTCCCCAGTCTGCTGCTGGTCTTTCTGTCCGTGGGCCTGGTCCTCGTTACCACC
-CTGGTGTGGCACCTCCTGAGGACACCCCCAGAGCCACCCACCCCACTGCCCCCTGAGGAC
-AGGCGCCAGTCAGTGAGCCGCCAGCCCTCCTTCACCTACTCAGAGTGGATGGAGGAGAAG
-ATCGAGGATGACTTCCTGGACCTCGACCCGGTGCCCGAGACTCCTGTGTTTGATTGTGTG
-ATGGACATCAAGCCTGAGGCTGACCCCACCTCACTCACCGTCAAGTCCATGGGTCTGCAG
-GAGAGGAGGGGTTCCAATGTCTCCCTGACCCTGGACATGTGCACTCCGGGCTGCAACGAG
-GAGGGCTTTGGCTATCTCATGTCCCCACGTGAGGAGTCCGCCCGCGAGTACCTGCTCAGC
-GCCTCCCGTGTCCTCCAAGCAGAAGAGCTTCATGAAAAGGCCCTGGACCCTTTCCTGCTG
-CAGGCGGAATTCTTTGAAATCCCCATGAACTTTGTGGATCCGAAAGAGTACGACATCCCT
-GGGCTGGTGCGGAAGAACCGGTACAAAACCATACTTCCCAACCCTCACAGCAGAGTGTGT
-CTGACCTCACCAGACCCTGACGACCCTCTGAGTTCCTACATCAATGCCAACTACATCCGG
-GGCTATGGTGGGGAGGAGAAGGTGTACATCGCCACTCAGGGACCCATCGTCAGCACGGTC
-GCCGACTTCTGGCGCATGGTGTGGCAGGAGCACACGCCCATCATTGTCATGATCACCAAC
-ATCGAGGAGATGAACGAGAAATGCACCGAGTATTGGCCGGAGGAGCAGGTGGCGTACGAC
-GGTGTTGAGATCACTGTGCAGAAAGTCATTCACACGGAGGATTACCGGCTGCGACTCATC
-TCCCTCAAGAGTGGGACTGAGGAGCGAGGCCTGAAGCATTACTGGTTCACATCCTGGCCC
-GACCAGAAGACCCCAGACCGGGCCCCCCCACTCCTGCACCTGGTGCGGGAGGTGGAGGAG
-GCAGCCCAGCAGGAGGGGCCCCACTGTGCCCCCATCATCGTCCACTGCAGTGCAGGGATT
-GGGAGGACCGGCTGCTTCATTGCCACCAGCATCTGCTGCCAGCAGCTGCGGCAGGAGGGT
-GTGGTGGACATCCTGAAGACCACGTGCCAGCTCCGTCAGGACAGGGGCGGCATGATCCAG
-ACATGCGAGCAGTACCAGTTTGTGCACCACGTCATGAGCCTCTACGAAAAGCAGCTGTCC
-CACCAGTCCCCAGAATGA
+>PTPN5-201 cds:protein_coding (ENST00000358540.7)
+ATGAATTATGAGGGAGCCAGGAGTGAGAGAGAGAACCACGCTGCTGATGACTCCGAGGGA
+GGGGCCCTGGACATGTGCTGCAGTGAGAGGCTACCGGGTCTCCCCCAGCCGATAGTGATG
+GAGGCACTGGACGAGGCTGAAGGGCTCCAGGACTCACAGAGAGAGATGCCGCCACCCCCT
+CCTCCCTCGCCGCCCTCAGATCCAGCTCAGAAGCCACCACCTCGAGGCGCTGGGAGCCAC
+TCCCTCACTGTCAGGAGCAGCCTGTGCCTGTTCGCTGCCTCACAGTTCCTGCTTGCCTGT
+GGGGTGCTCTGGTTCAGCGGTTATGGCCACATCTGGTCACAGAACGCCACAAACCTCGTC
+TCCTCTTTGCTGACGCTCCTGAAACAGCTGGAACCCACGGCCTGGCTTGACTCTGGGACG
+TGGGGAGTCCCCAGTCTGCTGCTGGTCTTTCTGTCCGTGGGCCTGGTCCTCGTTACCACC
+CTGGTGTGGCACCTCCTGAGGACACCCCCAGAGCCACCCACCCCACTGCCCCCTGAGGAC
+AGGCGCCAGTCAGTGAGCCGCCAGCCCTCCTTCACCTACTCAGAGTGGATGGAGGAGAAG
+ATCGAGGATGACTTCCTGGACCTCGACCCGGTGCCCGAGACTCCTGTGTTTGATTGTGTG
+ATGGACATCAAGCCTGAGGCTGACCCCACCTCACTCACCGTCAAGTCCATGGGTCTGCAG
+GAGAGGAGGGGTTCCAATGTCTCCCTGACCCTGGACATGTGCACTCCGGGCTGCAACGAG
+GAGGGCTTTGGCTATCTCATGTCCCCACGTGAGGAGTCCGCCCGCGAGTACCTGCTCAGC
+GCCTCCCGTGTCCTCCAAGCAGAAGAGCTTCATGAAAAGGCCCTGGACCCTTTCCTGCTG
+CAGGCGGAATTCTTTGAAATCCCCATGAACTTTGTGGATCCGAAAGAGTACGACATCCCT
+GGGCTGGTGCGGAAGAACCGGTACAAAACCATACTTCCCAACCCTCACAGCAGAGTGTGT
+CTGACCTCACCAGACCCTGACGACCCTCTGAGTTCCTACATCAATGCCAACTACATCCGG
+GGCTATGGTGGGGAGGAGAAGGTGTACATCGCCACTCAGGGACCCATCGTCAGCACGGTC
+GCCGACTTCTGGCGCATGGTGTGGCAGGAGCACACGCCCATCATTGTCATGATCACCAAC
+ATCGAGGAGATGAACGAGAAATGCACCGAGTATTGGCCGGAGGAGCAGGTGGCGTACGAC
+GGTGTTGAGATCACTGTGCAGAAAGTCATTCACACGGAGGATTACCGGCTGCGACTCATC
+TCCCTCAAGAGTGGGACTGAGGAGCGAGGCCTGAAGCATTACTGGTTCACATCCTGGCCC
+GACCAGAAGACCCCAGACCGGGCCCCCCCACTCCTGCACCTGGTGCGGGAGGTGGAGGAG
+GCAGCCCAGCAGGAGGGGCCCCACTGTGCCCCCATCATCGTCCACTGCAGTGCAGGGATT
+GGGAGGACCGGCTGCTTCATTGCCACCAGCATCTGCTGCCAGCAGCTGCGGCAGGAGGGT
+GTGGTGGACATCCTGAAGACCACGTGCCAGCTCCGTCAGGACAGGGGCGGCATGATCCAG
+ACATGCGAGCAGTACCAGTTTGTGCACCACGTCATGAGCCTCTACGAAAAGCAGCTGTCC
+CACCAGTCCCCAGAATGA
--- a/data/RAB39B_HSa_coding.fa
+++ b/data/RAB39B_HSa_coding.fa
@ -1,12 +1,12 @@
->RAB39B cds:protein_coding (ENST00000369454.4)
-ATGGAGGCCATCTGGCTGTACCAGTTCCGGCTCATTGTCATCGGGGATTCCACAGTGGGC
-AAGTCCTGCCTGATCCGCCGCTTCACCGAGGGTCGCTTTGCCCAGGTTTCTGACCCCACC
-GTGGGGGTGGATTTTTTCTCCCGCTTGGTGGAGATCGAGCCAGGAAAACGCATCAAGCTC
-CAGATCTGGGATACCGCGGGTCAAGAGAGGTTCAGATCCATCACTCGCGCCTACTACAGG
-AACTCAGTAGGTGGTCTTCTCTTATTTGACATTACCAACCGCAGGTCCTTCCAGAATGTC
-CATGAGTGGTTAGAAGAGACCAAAGTACACGTTCAGCCCTACCAAATTGTATTTGTTCTG
-GTGGGTCACAAGTGTGACCTGGATACACAGAGGCAAGTGACTCGCCACGAGGCCGAGAAA
-CTGGCTGCTGCATACGGCATGAAGTACATTGAAACGTCAGCCCGAGATGCCATTAATGTG
-GAGAAAGCCTTCACAGACCTGACAAGAGACATATATGAGCTGGTTAAAAGGGGGGAGATT
-ACAATCCAGGAGGGCTGGGAAGGGGTGAAGAGTGGATTTGTACCAAATGTGGTTCACTCT
-TCAGAAGAGGTTGTCAAATCAGAGAGGAGATGTTTGTGCTAG
+>RAB39B cds:protein_coding (ENST00000369454.4)
+ATGGAGGCCATCTGGCTGTACCAGTTCCGGCTCATTGTCATCGGGGATTCCACAGTGGGC
+AAGTCCTGCCTGATCCGCCGCTTCACCGAGGGTCGCTTTGCCCAGGTTTCTGACCCCACC
+GTGGGGGTGGATTTTTTCTCCCGCTTGGTGGAGATCGAGCCAGGAAAACGCATCAAGCTC
+CAGATCTGGGATACCGCGGGTCAAGAGAGGTTCAGATCCATCACTCGCGCCTACTACAGG
+AACTCAGTAGGTGGTCTTCTCTTATTTGACATTACCAACCGCAGGTCCTTCCAGAATGTC
+CATGAGTGGTTAGAAGAGACCAAAGTACACGTTCAGCCCTACCAAATTGTATTTGTTCTG
+GTGGGTCACAAGTGTGACCTGGATACACAGAGGCAAGTGACTCGCCACGAGGCCGAGAAA
+CTGGCTGCTGCATACGGCATGAAGTACATTGAAACGTCAGCCCGAGATGCCATTAATGTG
+GAGAAAGCCTTCACAGACCTGACAAGAGACATATATGAGCTGGTTAAAAGGGGGGAGATT
+ACAATCCAGGAGGGCTGGGAAGGGGTGAAGAGTGGATTTGTACCAAATGTGGTTCACTCT
+TCAGAAGAGGTTGTCAAATCAGAGAGGAGATGTTTGTGCTAG
--- a/data/RandomPhobiaPage.txt
+++ b/data/RandomPhobiaPage.txt
@ -1,131 +1,131 @@
-
-
-```{css, echo = FALSE}
-
-.striped tr:nth-child(even) {
-  background: #eaf1ff;
-}
-.striped {
-  padding: 5px;
-}
-```
-<small>Random Phobias - .Rmd sample code for BCH441 at the University of Toronto. (c) Boris Steipe 2020 -->
-
-
-```{r setup, include=FALSE}
-knitr::opts_chunk$set(echo = TRUE)
-```
-
-## Phobias! ##
-We all have some, but we could always use more. How to know them all? With this code we access the [Wikipedia list of phobias](https://en.wikipedia.org/wiki/List_of_phobias), scrape the contents and assemble a dataframe. Then we write a function to retrieve a random phobia, which we can subsequently ponder on - either to delight in the fact that we don't have that fear, or to add to our daily quota of anxieties <small>(like our well-founded [fear of bad programming practice](http://xkcd.com/292/))</small>.
-
-To load the list, we will "screenscrape" the contents of Wikipedia's [List of Phobias](https://en.wikipedia.org/wiki/List_of_phobias). First, we install the `rvest` library and the `xml2` library from CRAN, if we don't have it.
-```{r packages}
-if (! requireNamespace("rvest", quietly=TRUE)) {
-  install.packages("rvest")
-}
-if (! requireNamespace("xml2", quietly=TRUE)) {
-  install.packages("xml2")
-}
-```
-As we customarily do, we avoid using the `library()` function to make the package contents accessible, but use the `package::` syntax instead. This makes our code more explicit and maintainable.
-
-`xml2` handles reading and parsing of documents. The `rvest` package was designed for screenscraping and has functions to make our life very easy: it accesses the response of an `xml2` query, looks for all HTML formatted tables, parses them with an XPATH expression and returns them as lists from which we can get data frames.
-
-```{r getPageData, cache=TRUE}
-webPage <- xml2::read_html("https://en.wikipedia.org/wiki/List_of_phobias")
-allTables <- rvest::html_table(webPage, fill = TRUE)
-```
-
-There are ```r length(allTables)``` tables in the list, but the ones we are interested in are data frames with two columns named `Phobia` and `Condition`.
-
-```{r collateTables, cache=TRUE}
-phobiaTable <- data.frame(Phobia = character(), Condition = character())
-for (i in seq_along(allTables)) {
-  df <- allTables[[i]]
-  if (all(colnames(df) == c("Phobia", "Condition"))) {
-    phobiaTable <- rbind(phobiaTable, df)
-  }
-}
-```
-
-Done, we collected ```r nrow(phobiaTable)``` phobias. Let's randomly select a few and print them.
-
-<p>&nbsp;
-<p>
-
-```{r , ref.label="randRow", echo=FALSE}
-```
-
-**Table**: seven random phobias<br/>
-```{r renderPhobiaTable, echo=FALSE, results='asis'}
-sel <- sample(1:nrow(phobiaTable), 7)
-knitr::kable(phobiaTable[sel, ], table.attr = "class=\"striped\"", format = "html")
-```
-
-<p>&nbsp;
-<p>
-To pick a single random phobia from the list, we take a (pseudo) random sample of size 1 from the number of rows in the `phobiaFrame` object. Our function thus returns a random row from a matrix or dataframe, and it uses an optional argument: `seed`. This can either be Boolean `FALSE` (the default), or an integer that is used in R's `set.seed()` function.
-
-```{r randRow}
-randRow <- function(M, seed = FALSE) {
-  # Return a random row from a dataframe M.
-  if (seed) {
-    oldseed <- .Random.seed                # play nice and save the RNG state ...
-    set.seed(as.integer(seed))
-  }
-  r <- M[sample(1:nrow(M), 1), ]           # fetch one random row
-  if (seed) { .Random.seed  <- oldseed }   # ... restore the RNG state
-  return(r)
-}
-```
-<p>&nbsp;
-<p>
-With this useful tool we can ponder on our favourite phobia of the day. For today, let it be **`r randRow(phobiaTable, seed=1123581321)[2]`**, the `r randRow(phobiaTable, seed=1123581321)[1]`.
-
-_`r randRow(phobiaTable, seed=1123581321)[1]`_! Really!!? Awful.
-
-<p>&nbsp;
-<p>
-
-Finally: let's plot a histogram of phobia name lengths just to illustrate plots. A little preprocessing is required, since some names collate synonyms, like _"Hypnophobia, somniphobia"_. We'll break these up.
-
-```{r preProcess}
-
-# select only single-word phobias that end with "phobia"
-sel <- ! grepl(" ", phobiaTable$Phobia) & grepl(".phobia$", phobiaTable$Phobia)
-names <- phobiaTable$Phobia[sel]
-
-# extract the ones we did _not_ select
-x <- phobiaTable$Phobia[! sel]
-# use strsplit() to split them apart and flatten the resulting list
-x <- unlist(strsplit(x, ", "))
-x <- unlist(strsplit(x, " "))
-x <- unlist(strsplit(x, "/"))
-# use the same selection as above, and append the result to our "names""
-sel <- ! grepl(" ", x) & grepl(".phobia$", x)
-names <- c(names, x[sel])
-
-```
-
-Done, we collected ```r length(names)``` names for phobias. Here is a histogram of their lengths.
-
-```{r showHist}
-
-x <- nchar(names)
-pShort <- names[which(x == min(x))[1]]  # pull out the shortest name ...
-pLong  <- names[which(x == max(x))[1]]  # ... and the longest name too.
-hist(x,
-     main = "Length of phobia-names",
-     sub = sprintf("Shortest: %s (%d), Longest: %s (%d)",
-                   pShort, nchar(pShort), pLong, nchar(pLong)),
-     cex.sub = 0.8,
-     xlab = "name",
-     ylab = "counts",
-     col ="#aef5ee")
-
-```
-
-That's all.
-
-<!-- [END] -->
+
+
+```{css, echo = FALSE}
+
+.striped tr:nth-child(even) {
+  background: #eaf1ff;
+}
+.striped {
+  padding: 5px;
+}
+```
+<small>Random Phobias - .Rmd sample code for BCH441 at the University of Toronto. (c) Boris Steipe 2020</small>
+
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+## Phobias! ##
+We all have some, but we could always use more. How to know them all? With this code we access the [Wikipedia list of phobias](https://en.wikipedia.org/wiki/List_of_phobias), scrape the contents and assemble a dataframe. Then we write a function to retrieve a random phobia, which we can subsequently ponder on - either to delight in the fact that we don't have that fear, or to add to our daily quota of anxieties <small>(like our well-founded [fear of bad programming practice](http://xkcd.com/292/))</small>.
+
+To load the list, we will "screenscrape" the contents of Wikipedia's [List of Phobias](https://en.wikipedia.org/wiki/List_of_phobias). First, we install the `rvest` and `xml2` packages from CRAN, if we don't have them.
+```{r packages}
+# Install each required package from CRAN, but only if it is not available yet.
+for (pkg in c("rvest", "xml2")) {
+  if (! requireNamespace(pkg, quietly = TRUE)) {
+    install.packages(pkg)
+  }
+}
+```
+As we customarily do, we avoid using the `library()` function to make the package contents accessible, but use the `package::` syntax instead. This makes our code more explicit and maintainable.
+
+`xml2` handles reading and parsing of documents. The `rvest` package was designed for screenscraping and has functions to make our life very easy: it accesses the response of an `xml2` query, looks for all HTML formatted tables, parses them with an XPATH expression and returns them as lists from which we can get data frames.
+
+```{r getPageData, cache=TRUE}
+# Fetch and parse the Wikipedia page (requires a working internet connection).
+webPage <- xml2::read_html("https://en.wikipedia.org/wiki/List_of_phobias")
+# Extract the HTML tables of the parsed page; the result is a list of tables.
+allTables <- rvest::html_table(webPage, fill = TRUE)
+```
+
+There are ```r length(allTables)``` tables in the list, but the ones we are interested in are data frames with two columns named `Phobia` and `Condition`.
+
+```{r collateTables, cache=TRUE}
+# Flag the tables whose column names are exactly "Phobia" and "Condition".
+# identical() is used rather than all(... == ...): `==` recycles its shorter
+# operand, so tables with a different number of columns could trigger
+# warnings or be matched by accident.
+isPhobiaTable <- vapply(allTables,
+                        function(tbl) {
+                          identical(colnames(tbl), c("Phobia", "Condition"))
+                        },
+                        logical(1))
+
+# Bind all matching tables with a single rbind() call instead of growing the
+# result inside a loop. The empty data frame comes first, so that phobiaTable
+# has the two expected columns even if no table matched.
+phobiaTable <- do.call(rbind,
+                       c(list(data.frame(Phobia = character(),
+                                         Condition = character())),
+                         allTables[isPhobiaTable]))
+```
+
+Done, we collected ```r nrow(phobiaTable)``` phobias. Let's randomly select a few and print them.
+
+<p>&nbsp;
+<p>
+
+```{r , ref.label="randRow", echo=FALSE}
+```
+
+**Table**: seven random phobias<br/>
+```{r renderPhobiaTable, echo=FALSE, results='asis'}
+# Draw up to seven distinct row indices. sample.int() avoids the 1:n pitfall
+# (1:0 is c(1, 0) for an empty table), and the sample size is capped because
+# sampling without replacement fails if fewer than seven rows are available.
+# For seven or more rows the indices are the same as with sample(1:n, 7).
+nPhobias <- nrow(phobiaTable)
+sel <- sample.int(nPhobias, min(7L, nPhobias))
+knitr::kable(phobiaTable[sel, ], table.attr = "class=\"striped\"", format = "html")
+```
+
+<p>&nbsp;
+<p>
+To pick a single random phobia from the list, we take a (pseudo) random sample of size 1 from the number of rows in the `phobiaTable` object. Our function thus returns a random row from a matrix or dataframe, and it uses an optional argument: `seed`. This can either be Boolean `FALSE` (the default), or an integer that is used in R's `set.seed()` function.
+
+```{r randRow}
+randRow <- function(M, seed = FALSE) {
+  # Purpose:
+  #   Return one randomly chosen row of a matrix or data frame.
+  # Parameters:
+  #   M     a matrix or data frame with at least one row.
+  #   seed  FALSE (the default) to draw from the current RNG stream, or a
+  #         value that as.integer() can convert; it is passed to set.seed()
+  #         to make the draw reproducible.
+  # Value:
+  #   The selected row, M[i, ], for a random row index i.
+  # Details:
+  #   If a seed is given, the caller's RNG state is put back when the
+  #   function exits, so using a seed here does not disturb other code.
+
+  if (nrow(M) < 1) {
+    stop("M has no rows to sample from.")
+  }
+
+  # identical() rather than "if (seed)": a seed of 0 is a valid seed too.
+  if (! identical(seed, FALSE)) {
+    # .Random.seed lives in the global environment. Assigning to it inside
+    # a function only creates a local variable, so the state has to be read
+    # from and written back to the global environment explicitly.
+    genv <- globalenv()
+    if (exists(".Random.seed", envir = genv, inherits = FALSE)) {
+      oldSeed <- get(".Random.seed", envir = genv, inherits = FALSE)
+      on.exit(assign(".Random.seed", oldSeed, envir = genv), add = TRUE)
+    } else {
+      # The RNG has not been used in this session: go back to that state.
+      on.exit(rm(".Random.seed", envir = genv), add = TRUE)
+    }
+    set.seed(as.integer(seed))
+  }
+
+  # sample.int(n, 1) draws the same index as sample(1:n, 1).
+  return(M[sample.int(nrow(M), 1L), ])
+}
+```
+<p>&nbsp;
+<p>
+With this useful tool we can ponder on our favourite phobia of the day. For today, let it be **`r randRow(phobiaTable, seed=1123581321)[2]`**, the `r randRow(phobiaTable, seed=1123581321)[1]`.
+
+_`r randRow(phobiaTable, seed=1123581321)[1]`_! Really!!? Awful.
+
+<p>&nbsp;
+<p>
+
+Finally: let's plot a histogram of phobia name lengths just to illustrate plots. A little preprocessing is required, since some names collate synonyms, like _"Hypnophobia, somniphobia"_. We'll break these up.
+
+```{r preProcess}
+
+# Names that are a single word ending in "phobia" can be used as they are.
+sel <- grepl(".phobia$", phobiaTable$Phobia) & ! grepl(" ", phobiaTable$Phobia)
+names <- phobiaTable$Phobia[sel]
+
+# The remaining entries are broken up at ", ", " " and "/" (in that order);
+# each pass flattens the list that strsplit() returns.
+x <- Reduce(function(s, sep) { unlist(strsplit(s, sep)) },
+            c(", ", " ", "/"),
+            phobiaTable$Phobia[! sel])
+
+# Filter the fragments by the same criteria and append them to "names".
+sel <- grepl(".phobia$", x) & ! grepl(" ", x)
+names <- c(names, x[sel])
+
+```
+
+Done, we collected ```r length(names)``` names for phobias. Here is a histogram of their lengths.
+
+```{r showHist}
+
+# Plot the distribution of name lengths. The subtitle quotes the shortest
+# and the longest name together with their character counts; if several
+# names share the extreme length, the first one in "names" is reported.
+x <- nchar(names)
+pShort <- names[which(x == min(x))[1]]  # pull out the shortest name ...
+pLong  <- names[which(x == max(x))[1]]  # ... and the longest name too.
+# NOTE(review): the x-axis shows name lengths (characters), not names;
+# a more descriptive xlab might be clearer - confirm with the author.
+hist(x,
+     main = "Length of phobia-names",
+     sub = sprintf("Shortest: %s (%d), Longest: %s (%d)",
+                   pShort, nchar(pShort), pLong, nchar(pLong)),
+     cex.sub = 0.8,
+     xlab = "name",
+     ylab = "counts",
+     col ="#aef5ee")
+
+```
+
+That's all.
+
+<!-- [END] -->
--- a/data/S288C_YDL056W_MBP1_coding.fsa
+++ b/data/S288C_YDL056W_MBP1_coding.fsa
@ -1,43 +1,43 @@
->MBP1 YDL056W SGDID:S000002214
-ATGTCTAACCAAATATACTCAGCGAGATATTCGGGGGTTGATGTTTATGAATTCATTCAT
-TCTACAGGATCTATCATGAAAAGGAAAAAGGATGATTGGGTCAATGCTACACATATTTTA
-AAGGCCGCCAATTTTGCCAAGGCTAAAAGAACAAGGATTCTAGAGAAGGAAGTACTTAAG
-GAAACTCATGAAAAAGTTCAGGGTGGATTTGGTAAATATCAGGGTACATGGGTCCCACTG
-AACATAGCGAAACAACTGGCAGAAAAATTTAGTGTCTACGATCAGCTGAAACCGTTGTTC
-GACTTTACGCAAACAGATGGGTCTGCTTCTCCACCTCCTGCTCCAAAACATCACCATGCC
-TCGAAGGTGGATAGGAAAAAGGCTATTAGAAGTGCAAGTACTTCCGCAATTATGGAAACA
-AAAAGAAACAACAAGAAAGCCGAGGAAAATCAATTTCAAAGCAGCAAAATATTGGGAAAT
-CCCACGGCTGCACCAAGGAAAAGAGGTAGACCGGTAGGATCTACGAGGGGAAGTAGGCGG
-AAGTTAGGTGTCAATTTACAACGTTCTCAAAGTGATATGGGATTTCCTAGACCGGCGATA
-CCGAATTCTTCAATATCGACAACGCAACTTCCCTCTATTAGATCCACCATGGGACCACAA
-TCCCCTACATTGGGTATTCTGGAAGAAGAAAGGCACGATTCTCGACAGCAGCAGCCGCAA
-CAAAATAATTCTGCACAGTTCAAAGAAATTGATCTTGAGGACGGCTTATCAAGCGATGTG
-GAACCTTCACAACAATTACAACAAGTTTTTAATCAAAATACTGGATTTGTACCCCAACAA
-CAATCTTCCTTGATACAGACACAGCAAACAGAATCAATGGCCACGTCCGTATCTTCCTCT
-CCTTCATTACCTACGTCACCGGGCGATTTTGCCGATAGTAATCCATTTGAAGAGCGATTT
-CCCGGTGGTGGAACATCTCCTATTATTTCCATGATCCCGCGTTATCCTGTAACTTCAAGG
-CCTCAAACATCGGATATTAATGATAAAGTTAACAAATACCTTTCAAAATTGGTTGATTAT
-TTTATTTCCAATGAAATGAAGTCAAATAAGTCCCTACCACAAGTGTTATTGCACCCACCT
-CCACACAGCGCTCCCTATATAGATGCTCCAATCGATCCAGAATTACATACTGCCTTCCAT
-TGGGCTTGTTCTATGGGTAATTTACCAATTGCTGAGGCGTTGTACGAAGCCGGAACAAGT
-ATCAGATCGACAAATTCTCAAGGCCAAACTCCATTGATGAGAAGTTCCTTATTCCACAAT
-TCATACACTAGAAGAACTTTCCCTAGAATTTTCCAGCTACTGCACGAGACCGTATTTGAT
-ATCGATTCGCAATCACAAACAGTAATTCACCATATTGTGAAACGAAAATCAACAACACCT
-TCTGCAGTTTATTATCTTGATGTTGTGCTATCTAAGATCAAGGATTTTTCCCCACAGTAT
-AGAATTGAATTACTTTTAAACACACAAGACAAAAATGGCGATACCGCACTTCATATTGCT
-TCTAAAAATGGAGATGTTGTTTTTTTTAATACACTGGTCAAAATGGGTGCATTAACTACT
-ATTTCCAATAAGGAAGGATTAACCGCCAATGAAATAATGAATCAACAATATGAGCAAATG
-ATGATACAAAATGGTACAAATCAACATGTCAATTCTTCAAACACGGACTTGAATATCCAC
-GTTAATACAAACAACATTGAAACGAAAAATGATGTTAATTCAATGGTAATCATGTCGCCT
-GTTTCTCCTTCGGATTACATAACCTATCCATCTCAAATTGCCACCAATATATCAAGAAAT
-ATTCCAAATGTAGTGAATTCTATGAAGCAAATGGCTAGCATATACAACGATCTTCATGAA
-CAGCATGACAACGAAATAAAAAGTTTGCAAAAAACTTTAAAAAGCATTTCTAAGACGAAA
-ATACAGGTAAGCCTAAAAACTTTAGAGGTATTGAAAGAGAGCAGTAAAGATGAAAACGGC
-GAAGCTCAGACTAATGATGACTTCGAAATTTTATCTCGTCTACAAGAACAAAATACTAAG
-AAATTGAGAAAAAGGCTCATACGATACAAACGGTTGATAAAACAAAAGCTGGAATACAGG
-CAAACGGTTTTATTGAACAAATTAATAGAAGATGAAACTCAGGCTACCACCAATAACACA
-GTTGAGAAAGATAATAATACGCTGGAAAGGTTGGAATTGGCTCAAGAACTAACGATGTTG
-CAATTACAAAGGAAAAACAAATTGAGTTCCTTGGTGAAGAAATTTGAAGACAATGCCAAG
-ATTCATAAATATAGACGGATTATCAGGGAAGGTACGGAAATGAATATTGAAGAAGTAGAT
-AGTTCGCTGGATGTAATACTACAGACATTGATAGCCAACAATAATAAAAATAAGGGCGCA
+>MBP1 YDL056W SGDID:S000002214
+ATGTCTAACCAAATATACTCAGCGAGATATTCGGGGGTTGATGTTTATGAATTCATTCAT
+TCTACAGGATCTATCATGAAAAGGAAAAAGGATGATTGGGTCAATGCTACACATATTTTA
+AAGGCCGCCAATTTTGCCAAGGCTAAAAGAACAAGGATTCTAGAGAAGGAAGTACTTAAG
+GAAACTCATGAAAAAGTTCAGGGTGGATTTGGTAAATATCAGGGTACATGGGTCCCACTG
+AACATAGCGAAACAACTGGCAGAAAAATTTAGTGTCTACGATCAGCTGAAACCGTTGTTC
+GACTTTACGCAAACAGATGGGTCTGCTTCTCCACCTCCTGCTCCAAAACATCACCATGCC
+TCGAAGGTGGATAGGAAAAAGGCTATTAGAAGTGCAAGTACTTCCGCAATTATGGAAACA
+AAAAGAAACAACAAGAAAGCCGAGGAAAATCAATTTCAAAGCAGCAAAATATTGGGAAAT
+CCCACGGCTGCACCAAGGAAAAGAGGTAGACCGGTAGGATCTACGAGGGGAAGTAGGCGG
+AAGTTAGGTGTCAATTTACAACGTTCTCAAAGTGATATGGGATTTCCTAGACCGGCGATA
+CCGAATTCTTCAATATCGACAACGCAACTTCCCTCTATTAGATCCACCATGGGACCACAA
+TCCCCTACATTGGGTATTCTGGAAGAAGAAAGGCACGATTCTCGACAGCAGCAGCCGCAA
+CAAAATAATTCTGCACAGTTCAAAGAAATTGATCTTGAGGACGGCTTATCAAGCGATGTG
+GAACCTTCACAACAATTACAACAAGTTTTTAATCAAAATACTGGATTTGTACCCCAACAA
+CAATCTTCCTTGATACAGACACAGCAAACAGAATCAATGGCCACGTCCGTATCTTCCTCT
+CCTTCATTACCTACGTCACCGGGCGATTTTGCCGATAGTAATCCATTTGAAGAGCGATTT
+CCCGGTGGTGGAACATCTCCTATTATTTCCATGATCCCGCGTTATCCTGTAACTTCAAGG
+CCTCAAACATCGGATATTAATGATAAAGTTAACAAATACCTTTCAAAATTGGTTGATTAT
+TTTATTTCCAATGAAATGAAGTCAAATAAGTCCCTACCACAAGTGTTATTGCACCCACCT
+CCACACAGCGCTCCCTATATAGATGCTCCAATCGATCCAGAATTACATACTGCCTTCCAT
+TGGGCTTGTTCTATGGGTAATTTACCAATTGCTGAGGCGTTGTACGAAGCCGGAACAAGT
+ATCAGATCGACAAATTCTCAAGGCCAAACTCCATTGATGAGAAGTTCCTTATTCCACAAT
+TCATACACTAGAAGAACTTTCCCTAGAATTTTCCAGCTACTGCACGAGACCGTATTTGAT
+ATCGATTCGCAATCACAAACAGTAATTCACCATATTGTGAAACGAAAATCAACAACACCT
+TCTGCAGTTTATTATCTTGATGTTGTGCTATCTAAGATCAAGGATTTTTCCCCACAGTAT
+AGAATTGAATTACTTTTAAACACACAAGACAAAAATGGCGATACCGCACTTCATATTGCT
+TCTAAAAATGGAGATGTTGTTTTTTTTAATACACTGGTCAAAATGGGTGCATTAACTACT
+ATTTCCAATAAGGAAGGATTAACCGCCAATGAAATAATGAATCAACAATATGAGCAAATG
+ATGATACAAAATGGTACAAATCAACATGTCAATTCTTCAAACACGGACTTGAATATCCAC
+GTTAATACAAACAACATTGAAACGAAAAATGATGTTAATTCAATGGTAATCATGTCGCCT
+GTTTCTCCTTCGGATTACATAACCTATCCATCTCAAATTGCCACCAATATATCAAGAAAT
+ATTCCAAATGTAGTGAATTCTATGAAGCAAATGGCTAGCATATACAACGATCTTCATGAA
+CAGCATGACAACGAAATAAAAAGTTTGCAAAAAACTTTAAAAAGCATTTCTAAGACGAAA
+ATACAGGTAAGCCTAAAAACTTTAGAGGTATTGAAAGAGAGCAGTAAAGATGAAAACGGC
+GAAGCTCAGACTAATGATGACTTCGAAATTTTATCTCGTCTACAAGAACAAAATACTAAG
+AAATTGAGAAAAAGGCTCATACGATACAAACGGTTGATAAAACAAAAGCTGGAATACAGG
+CAAACGGTTTTATTGAACAAATTAATAGAAGATGAAACTCAGGCTACCACCAATAACACA
+GTTGAGAAAGATAATAATACGCTGGAAAGGTTGGAATTGGCTCAAGAACTAACGATGTTG
+CAATTACAAAGGAAAAACAAATTGAGTTCCTTGGTGAAGAAATTTGAAGACAATGCCAAG
+ATTCATAAATATAGACGGATTATCAGGGAAGGTACGGAAATGAATATTGAAGAAGTAGAT
+AGTTCGCTGGATGTAATACTACAGACATTGATAGCCAACAATAATAAAAATAAGGGCGCA
 GAACAGATCATCACAATCTCAAACGCGAATAGTCATGCATAA
--- a/data/SGD_features.README.txt
+++ b/data/SGD_features.README.txt
@ -1,47 +1,47 @@
-SGD_features.tab
-
-The latest version of the SGD_features.tab file is based on Genome Version R64-2-1.
-
-The SGD_features.tab file is updated weekly (Saturday).
-
-NOTE: On 4 September 2004, the SGD_features.tab file replaced the previously
-used chromosomal_feature.tab file.
-
-File contents:
-
-1. Information on current chromosomal features in SGD, including Dubious ORFs. 
-Also contains coordinates of intron, exons, and other subfeatures that are located
-within a chromosomal feature.
-
-2. The relationship between subfeatures and the feature in which they
-are located is identified by the feature name in column #7 (parent
-feature). For example, the parent feature of the intron found in
-ACT1/YFL039C will be YFL039C. The parent feature of YFL039C is
-chromosome 6.
-
-3. The coordinates of all features are in chromosomal coordinates.
-
-
-Columns within SGD_features.tab:
-
-1.   Primary SGDID (mandatory)
-2.   Feature type (mandatory)
-3.   Feature qualifier (optional)
-4.   Feature name (optional)
-5.   Standard gene name (optional)
-6.   Alias (optional, multiples separated by |)
-7.   Parent feature name (optional)
-8.   Secondary SGDID (optional, multiples separated by |)
-9.   Chromosome (optional)
-10.  Start_coordinate (optional)
-11.  Stop_coordinate (optional)
-12.  Strand (optional)
-13.  Genetic position (optional)
-14.  Coordinate version (optional)
-15.  Sequence version (optional)
-16.  Description (optional)
-
-Note that "chromosome 17" is the mitochondrial chromosome.
-
-The SGD_features.tab file is complemented by GFF3 file saccharomyces_cerevisiae.gff
-
+SGD_features.tab
+
+The latest version of the SGD_features.tab file is based on Genome Version R64-2-1.
+
+The SGD_features.tab file is updated weekly (Saturday).
+
+NOTE: On 4 September 2004, the SGD_features.tab file replaced the previously
+used chromosomal_feature.tab file.
+
+File contents:
+
+1. Information on current chromosomal features in SGD, including Dubious ORFs. 
+Also contains coordinates of intron, exons, and other subfeatures that are located
+within a chromosomal feature.
+
+2. The relationship between subfeatures and the feature in which they
+are located is identified by the feature name in column #7 (parent
+feature). For example, the parent feature of the intron found in
+ACT1/YFL039C will be YFL039C. The parent feature of YFL039C is
+chromosome 6.
+
+3. The coordinates of all features are in chromosomal coordinates.
+
+
+Columns within SGD_features.tab:
+
+1.   Primary SGDID (mandatory)
+2.   Feature type (mandatory)
+3.   Feature qualifier (optional)
+4.   Feature name (optional)
+5.   Standard gene name (optional)
+6.   Alias (optional, multiples separated by |)
+7.   Parent feature name (optional)
+8.   Secondary SGDID (optional, multiples separated by |)
+9.   Chromosome (optional)
+10.  Start_coordinate (optional)
+11.  Stop_coordinate (optional)
+12.  Strand (optional)
+13.  Genetic position (optional)
+14.  Coordinate version (optional)
+15.  Sequence version (optional)
+16.  Description (optional)
+
+Note that "chromosome 17" is the mitochondrial chromosome.
+
+The SGD_features.tab file is complemented by GFF3 file saccharomyces_cerevisiae.gff
+
--- a/data/SGD_features.tab
+++ b/data/SGD_features.tab
--- a/data/Species.csv
+++ b/data/Species.csv
--- a/data/intogen-KRAS-distribution-data.tsv
+++ b/data/intogen-KRAS-distribution-data.tsv
@ -1,179 +1,179 @@
-MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT
-93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000311936
-93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000557334
-93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000256078
-93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000556131
-86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000311936
-86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000557334
-86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000556131
-86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000256078
-72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000556131
-72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000256078
-72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000557334
-72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000311936
-63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000557334
-63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000556131
-63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000256078
-63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000311936
-36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000311936
-36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000256078
-36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000556131
-36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000557334
-24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000256078
-24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000311936
-24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000557334
-24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000556131
-23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000556131
-23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000311936
-23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000557334
-23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000256078
-16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000556131
-16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000311936
-16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000557334
-16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000256078
-13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000311936
-13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000556131
-13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000557334
-13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000256078
-11	+	missense_variant	25380275	11	T	11	G	Q/H	12	11	61	ENSG00000133703	ENST00000311936
-11	+	missense_variant	25380275	11	T	11	G	Q/H	12	11	61	ENSG00000133703	ENST00000256078
-10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000557334
-10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000311936
-10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000556131
-10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000256078
-9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000557334
-9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000556131
-9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000311936
-9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000256078
-7	+	missense_variant	25380276	7	T	7	A	Q/L	12	7	61	ENSG00000133703	ENST00000256078
-7	+	missense_variant	25378562	7	C	7	T	A/T	12	7	146	ENSG00000133703	ENST00000256078
-7	+	missense_variant	25378562	7	C	7	T	A/T	12	7	146	ENSG00000133703	ENST00000311936
-7	+	missense_variant	25380276	7	T	7	A	Q/L	12	7	61	ENSG00000133703	ENST00000311936
-5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000311936
-5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000256078
-5	+	missense_variant	25380276	5	T	5	C	Q/R	12	5	61	ENSG00000133703	ENST00000311936
-5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000557334
-5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000556131
-5	+	missense_variant	25380276	5	T	5	C	Q/R	12	5	61	ENSG00000133703	ENST00000256078
-4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000256078
-4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000557334
-4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000311936
-4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000556131
-3	+	missense_variant	25380277	3	G	3	T	Q/K	12	3	61	ENSG00000133703	ENST00000256078
-3	+	missense_variant	25380275	3	T	3	A	Q/H	12	3	61	ENSG00000133703	ENST00000256078
-3	+	missense_variant	25378647	3	T	3	G	K/N	12	3	117	ENSG00000133703	ENST00000256078
-3	+	missense_variant	25380275	3	T	3	A	Q/H	12	3	61	ENSG00000133703	ENST00000311936
-3	+	missense_variant	25378647	3	T	3	G	K/N	12	3	117	ENSG00000133703	ENST00000311936
-3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000256078
-3	-	missense_variant	25380275	3	A	3	C	Q/H	12	3	61	ENSG00000133703	ENST00000256078
-3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000256078
-3	+	missense_variant	25380277	3	G	3	T	Q/K	12	3	61	ENSG00000133703	ENST00000311936
-3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000311936
-3	-	missense_variant	25380275	3	A	3	C	Q/H	12	3	61	ENSG00000133703	ENST00000311936
-3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000311936
-3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000556131
-3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000557334
-3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000557334
-3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000556131
-2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000556131
-2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000311936
-2	-	missense_variant	25380276	2	A	2	G	Q/R	12	2	61	ENSG00000133703	ENST00000311936
-2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000557334
-2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000556131
-2	-	missense_variant	25378562	2	G	2	A	A/T	12	2	146	ENSG00000133703	ENST00000311936
-2	-	missense_variant	25378562	2	G	2	A	A/T	12	2	146	ENSG00000133703	ENST00000256078
-2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000256078
-2	-	missense_variant	25380276	2	A	2	G	Q/R	12	2	61	ENSG00000133703	ENST00000256078
-2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000311936
-2	+	missense_variant	25378561	2	G	2	A	A/V	12	2	146	ENSG00000133703	ENST00000311936
-2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000256078
-2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000557334
-2	+	missense_variant	25378561	2	G	2	A	A/V	12	2	146	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000557334
-1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000557334
-1	-	missense_variant	25362743	1	A	1	T	S/C	12	1	72	ENSG00000133703	ENST00000557334
-1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000557334
-1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000557334
-1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000557334
-0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000557334
-1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000557334
-0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000557334
-0	+	synonymous_variant	25380278	0	A	1	G	-	12	1	60	ENSG00000133703	ENST00000311936
-1	-	missense_variant	25378647	1	A	1	T	K/N	12	1	117	ENSG00000133703	ENST00000256078
-1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000256078
-1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25362743	1	A	1	G	C/R	12	1	185	ENSG00000133703	ENST00000311936
-0	+	inframe_deletion	25362744	0	CTTTGT	1	-	-	12	1	183-184	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25378557	1	C	1	G	K/N	12	1	147	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25378562	1	C	1	G	A/P	12	1	146	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25378562	1	C	1	T	A/T	12	1	146.0	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25378594	1	C	1	G	R/T	12	1	135	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25378645	1	C	1	G	C/S	12	1	118	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25380240	1	C	1	A	R/M	12	1	73.0	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25380254	1	C	1	A	R/S	12	1	68	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25380271	1	C	1	T	E/K	12	1	63.0	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25380274	1	C	1	T	E/K	12	1	62	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25380275	1	T	1	G	Q/H	12	1	61.0	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25380282	1	G	1	C	A/G	12	1	59	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25380271	1	C	1	T	E/K	12	1	63.0	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25380274	1	C	1	T	E/K	12	1	62	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25380275	1	T	1	G	Q/H	12	1	61.0	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25380277	1	GA	1	TT	GQ/GK	12	1	60-61	ENSG00000133703	ENST00000256078
-0	+	synonymous_variant	25380278	0	A	1	G	-	12	1	60	ENSG00000133703	ENST00000256078
-0	+	synonymous_variant	25380278	0	A	1	T	-	12	1	60	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25380282	1	G	1	T	A/E	12	1	59	ENSG00000133703	ENST00000256078
-0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000256078
-0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000256078
-0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25380277	1	GA	1	TT	GQ/GK	12	1	60-61	ENSG00000133703	ENST00000311936
-0	+	synonymous_variant	25380278	0	A	1	T	-	12	1	60	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25380240	1	C	1	A	R/M	12	1	73.0	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25380282	1	G	1	C	A/G	12	1	59	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000556131
-0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000556131
-1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000556131
-0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000556131
-1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000556131
-1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000556131
-1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000556131
-1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000556131
-1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000556131
-1	+	missense_variant	25362743	1	A	1	G	C/R	12	1	72	ENSG00000133703	ENST00000557334
-0	+	inframe_deletion	25362744	0	CTTTGT	1	-	-	12	1	70-71	ENSG00000133703	ENST00000557334
-1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000557334
-1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000557334
-0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000557334
-1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000557334
-0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000556131
-1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000556131
-1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000556131
-1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25380282	1	G	1	T	A/E	12	1	59	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000311936
-0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000311936
-0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000311936
-0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000311936
-1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000311936
-1	-	missense_variant	25362743	1	A	1	T	S/C	12	1	185	ENSG00000133703	ENST00000311936
-1	-	missense_variant	25378647	1	A	1	T	K/N	12	1	117	ENSG00000133703	ENST00000311936
-1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25380254	1	C	1	A	R/S	12	1	68	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25378645	1	C	1	G	C/S	12	1	118	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25378594	1	C	1	G	R/T	12	1	135	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25368454	1	C	1	T	R/Q	12	1	164	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25368473	1	T	1	C	T/A	12	1	158	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25378557	1	C	1	G	K/N	12	1	147	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25378562	1	C	1	G	A/P	12	1	146	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25378562	1	C	1	T	A/T	12	1	146.0	ENSG00000133703	ENST00000256078
+MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT
+93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000311936
+93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000557334
+93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000256078
+93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000556131
+86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000311936
+86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000557334
+86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000556131
+86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000256078
+72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000556131
+72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000256078
+72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000557334
+72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000311936
+63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000557334
+63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000556131
+63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000256078
+63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000311936
+36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000311936
+36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000256078
+36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000556131
+36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000557334
+24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000256078
+24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000311936
+24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000557334
+24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000556131
+23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000556131
+23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000311936
+23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000557334
+23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000256078
+16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000556131
+16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000311936
+16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000557334
+16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000256078
+13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000311936
+13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000556131
+13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000557334
+13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000256078
+11	+	missense_variant	25380275	11	T	11	G	Q/H	12	11	61	ENSG00000133703	ENST00000311936
+11	+	missense_variant	25380275	11	T	11	G	Q/H	12	11	61	ENSG00000133703	ENST00000256078
+10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000557334
+10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000311936
+10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000556131
+10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000256078
+9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000557334
+9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000556131
+9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000311936
+9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000256078
+7	+	missense_variant	25380276	7	T	7	A	Q/L	12	7	61	ENSG00000133703	ENST00000256078
+7	+	missense_variant	25378562	7	C	7	T	A/T	12	7	146	ENSG00000133703	ENST00000256078
+7	+	missense_variant	25378562	7	C	7	T	A/T	12	7	146	ENSG00000133703	ENST00000311936
+7	+	missense_variant	25380276	7	T	7	A	Q/L	12	7	61	ENSG00000133703	ENST00000311936
+5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000311936
+5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000256078
+5	+	missense_variant	25380276	5	T	5	C	Q/R	12	5	61	ENSG00000133703	ENST00000311936
+5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000557334
+5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000556131
+5	+	missense_variant	25380276	5	T	5	C	Q/R	12	5	61	ENSG00000133703	ENST00000256078
+4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000256078
+4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000557334
+4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000311936
+4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000556131
+3	+	missense_variant	25380277	3	G	3	T	Q/K	12	3	61	ENSG00000133703	ENST00000256078
+3	+	missense_variant	25380275	3	T	3	A	Q/H	12	3	61	ENSG00000133703	ENST00000256078
+3	+	missense_variant	25378647	3	T	3	G	K/N	12	3	117	ENSG00000133703	ENST00000256078
+3	+	missense_variant	25380275	3	T	3	A	Q/H	12	3	61	ENSG00000133703	ENST00000311936
+3	+	missense_variant	25378647	3	T	3	G	K/N	12	3	117	ENSG00000133703	ENST00000311936
+3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000256078
+3	-	missense_variant	25380275	3	A	3	C	Q/H	12	3	61	ENSG00000133703	ENST00000256078
+3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000256078
+3	+	missense_variant	25380277	3	G	3	T	Q/K	12	3	61	ENSG00000133703	ENST00000311936
+3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000311936
+3	-	missense_variant	25380275	3	A	3	C	Q/H	12	3	61	ENSG00000133703	ENST00000311936
+3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000311936
+3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000556131
+3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000557334
+3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000557334
+3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000556131
+2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000556131
+2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000311936
+2	-	missense_variant	25380276	2	A	2	G	Q/R	12	2	61	ENSG00000133703	ENST00000311936
+2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000557334
+2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000556131
+2	-	missense_variant	25378562	2	G	2	A	A/T	12	2	146	ENSG00000133703	ENST00000311936
+2	-	missense_variant	25378562	2	G	2	A	A/T	12	2	146	ENSG00000133703	ENST00000256078
+2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000256078
+2	-	missense_variant	25380276	2	A	2	G	Q/R	12	2	61	ENSG00000133703	ENST00000256078
+2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000311936
+2	+	missense_variant	25378561	2	G	2	A	A/V	12	2	146	ENSG00000133703	ENST00000311936
+2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000256078
+2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000557334
+2	+	missense_variant	25378561	2	G	2	A	A/V	12	2	146	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000557334
+1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000557334
+1	-	missense_variant	25362743	1	A	1	T	S/C	12	1	72	ENSG00000133703	ENST00000557334
+1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000557334
+1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000557334
+1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000557334
+0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000557334
+1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000557334
+0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000557334
+0	+	synonymous_variant	25380278	0	A	1	G	-	12	1	60	ENSG00000133703	ENST00000311936
+1	-	missense_variant	25378647	1	A	1	T	K/N	12	1	117	ENSG00000133703	ENST00000256078
+1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000256078
+1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25362743	1	A	1	G	C/R	12	1	185	ENSG00000133703	ENST00000311936
+0	+	inframe_deletion	25362744	0	CTTTGT	1	-	-	12	1	183-184	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25378557	1	C	1	G	K/N	12	1	147	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25378562	1	C	1	G	A/P	12	1	146	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25378562	1	C	1	T	A/T	12	1	146.0	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25378594	1	C	1	G	R/T	12	1	135	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25378645	1	C	1	G	C/S	12	1	118	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25380240	1	C	1	A	R/M	12	1	73.0	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25380254	1	C	1	A	R/S	12	1	68	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25380271	1	C	1	T	E/K	12	1	63.0	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25380274	1	C	1	T	E/K	12	1	62	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25380275	1	T	1	G	Q/H	12	1	61.0	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25380282	1	G	1	C	A/G	12	1	59	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25380271	1	C	1	T	E/K	12	1	63.0	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25380274	1	C	1	T	E/K	12	1	62	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25380275	1	T	1	G	Q/H	12	1	61.0	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25380277	1	GA	1	TT	GQ/GK	12	1	60-61	ENSG00000133703	ENST00000256078
+0	+	synonymous_variant	25380278	0	A	1	G	-	12	1	60	ENSG00000133703	ENST00000256078
+0	+	synonymous_variant	25380278	0	A	1	T	-	12	1	60	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25380282	1	G	1	T	A/E	12	1	59	ENSG00000133703	ENST00000256078
+0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000256078
+0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000256078
+0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25380277	1	GA	1	TT	GQ/GK	12	1	60-61	ENSG00000133703	ENST00000311936
+0	+	synonymous_variant	25380278	0	A	1	T	-	12	1	60	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25380240	1	C	1	A	R/M	12	1	73.0	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25380282	1	G	1	C	A/G	12	1	59	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000556131
+0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000556131
+1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000556131
+0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000556131
+1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000556131
+1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000556131
+1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000556131
+1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000556131
+1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000556131
+1	+	missense_variant	25362743	1	A	1	G	C/R	12	1	72	ENSG00000133703	ENST00000557334
+0	+	inframe_deletion	25362744	0	CTTTGT	1	-	-	12	1	70-71	ENSG00000133703	ENST00000557334
+1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000557334
+1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000557334
+0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000557334
+1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000557334
+0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000556131
+1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000556131
+1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000556131
+1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25380282	1	G	1	T	A/E	12	1	59	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000311936
+0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000311936
+0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000311936
+0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000311936
+1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000311936
+1	-	missense_variant	25362743	1	A	1	T	S/C	12	1	185	ENSG00000133703	ENST00000311936
+1	-	missense_variant	25378647	1	A	1	T	K/N	12	1	117	ENSG00000133703	ENST00000311936
+1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25380254	1	C	1	A	R/S	12	1	68	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25378645	1	C	1	G	C/S	12	1	118	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25378594	1	C	1	G	R/T	12	1	135	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25368454	1	C	1	T	R/Q	12	1	164	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25368473	1	T	1	C	T/A	12	1	158	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25378557	1	C	1	G	K/N	12	1	147	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25378562	1	C	1	G	A/P	12	1	146	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25378562	1	C	1	T	A/T	12	1	146.0	ENSG00000133703	ENST00000256078
--- a/data/intogen-OR1A1-distribution-data.tsv
+++ b/data/intogen-OR1A1-distribution-data.tsv
@ -1,49 +1,49 @@
-MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT
-2	+	missense_variant	3119330	2	G	2	A	R/Q	17	2	139	ENSG00000172146	ENST00000304094
-2	+	missense_variant	3119138	2	C	2	T	S/L	17	2	75	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119772	0	C	2	T	-	17	2	286	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119791	1	C	1	T	R/W	17	1	293	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119799	1	G	1	A	M/I	17	1	295	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119805	0	T	1	C	-	17	1	297	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119823	0	C	1	T	-	17	1	303	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119786	1	G	1	A	R/K	17	1	291	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119744	1	C	1	G	T/R	17	1	277	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119691	0	C	1	T	-	17	1	259	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119589	0	C	1	T	-	17	1	225	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119408	1	G	1	A	S/N	17	1	165	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119431	1	G	1	A	E/K	17	1	173	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119462	1	C	1	T	P/L	17	1	183	ENSG00000172146	ENST00000304094
-1	+	stop_gained	3119514	1	C	1	G	-	17	1	200	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119530	1	T	1	G	F/V	17	1	206	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119581	1	A	1	G	T/A	17	1	223	ENSG00000172146	ENST00000304094
-1	+	stop_gained	3119590	1	C	1	T	-	17	1	226	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119679	1	G	1	T	M/I	17	1	255	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119592	0	G	1	A	-	17	1	226	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119596	1	C	1	T	P/S	17	1	228	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119610	0	C	1	T	-	17	1	232	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119627	1	C	1	T	S/F	17	1	238	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119640	0	C	1	A	-	17	1	242	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119672	1	C	1	T	T/I	17	1	253	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119395	1	C	1	A	L/M	17	1	161	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119403	0	A	1	G	-	17	1	163	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119386	1	C	1	T	P/S	17	1	158	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119289	0	C	1	A	-	17	1	125	ENSG00000172146	ENST00000304094
-1	+	stop_gained	3118972	1	C	1	T	-	17	1	20	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3118978	1	G	1	A	E/K	17	1	22	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3118986	1	A	1	C	E/D	17	1	24	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119002	1	C	1	T	L/F	17	1	30	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119029	0	T	1	C	-	17	1	39	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119074	1	C	1	T	R/C	17	1	54	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119075	1	G	1	A	R/H	17	1	54	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119076	0	C	1	T	-	17	1	54	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119115	0	C	1	T	-	17	1	67	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119139	0	G	1	A	-	17	1	75	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119187	0	C	1	T	-	17	1	91	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119210	1	C	1	T	T/M	17	1	99	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119217	1	G	1	A	M/I	17	1	101	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119264	1	C	1	T	A/V	17	1	117	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119269	1	G	1	A	A/T	17	1	119	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3118961	1	G	1	A	G/E	17	1	16	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3118956	0	C	1	A	-	17	1	14	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3118944	0	G	1	A	-	17	1	10	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3118928	1	A	1	C	N/T	17	1	5	ENSG00000172146	ENST00000304094
+MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT
+2	+	missense_variant	3119330	2	G	2	A	R/Q	17	2	139	ENSG00000172146	ENST00000304094
+2	+	missense_variant	3119138	2	C	2	T	S/L	17	2	75	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119772	0	C	2	T	-	17	2	286	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119791	1	C	1	T	R/W	17	1	293	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119799	1	G	1	A	M/I	17	1	295	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119805	0	T	1	C	-	17	1	297	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119823	0	C	1	T	-	17	1	303	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119786	1	G	1	A	R/K	17	1	291	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119744	1	C	1	G	T/R	17	1	277	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119691	0	C	1	T	-	17	1	259	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119589	0	C	1	T	-	17	1	225	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119408	1	G	1	A	S/N	17	1	165	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119431	1	G	1	A	E/K	17	1	173	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119462	1	C	1	T	P/L	17	1	183	ENSG00000172146	ENST00000304094
+1	+	stop_gained	3119514	1	C	1	G	-	17	1	200	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119530	1	T	1	G	F/V	17	1	206	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119581	1	A	1	G	T/A	17	1	223	ENSG00000172146	ENST00000304094
+1	+	stop_gained	3119590	1	C	1	T	-	17	1	226	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119679	1	G	1	T	M/I	17	1	255	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119592	0	G	1	A	-	17	1	226	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119596	1	C	1	T	P/S	17	1	228	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119610	0	C	1	T	-	17	1	232	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119627	1	C	1	T	S/F	17	1	238	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119640	0	C	1	A	-	17	1	242	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119672	1	C	1	T	T/I	17	1	253	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119395	1	C	1	A	L/M	17	1	161	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119403	0	A	1	G	-	17	1	163	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119386	1	C	1	T	P/S	17	1	158	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119289	0	C	1	A	-	17	1	125	ENSG00000172146	ENST00000304094
+1	+	stop_gained	3118972	1	C	1	T	-	17	1	20	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3118978	1	G	1	A	E/K	17	1	22	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3118986	1	A	1	C	E/D	17	1	24	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119002	1	C	1	T	L/F	17	1	30	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119029	0	T	1	C	-	17	1	39	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119074	1	C	1	T	R/C	17	1	54	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119075	1	G	1	A	R/H	17	1	54	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119076	0	C	1	T	-	17	1	54	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119115	0	C	1	T	-	17	1	67	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119139	0	G	1	A	-	17	1	75	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119187	0	C	1	T	-	17	1	91	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119210	1	C	1	T	T/M	17	1	99	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119217	1	G	1	A	M/I	17	1	101	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119264	1	C	1	T	A/V	17	1	117	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119269	1	G	1	A	A/T	17	1	119	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3118961	1	G	1	A	G/E	17	1	16	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3118956	0	C	1	A	-	17	1	14	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3118944	0	G	1	A	-	17	1	10	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3118928	1	A	1	C	N/T	17	1	5	ENSG00000172146	ENST00000304094
--- a/data/intogen-PTPN11-distribution-data.tsv
+++ b/data/intogen-PTPN11-distribution-data.tsv
@ -1,113 +1,113 @@
-MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT
-5	+	missense_variant	112926888	5	G	5	T	G/V	12	5	503	ENSG00000179295	ENST00000351677
-4	+	missense_variant	112926270	4	C	4	T	T/M	12	4	468	ENSG00000179295	ENST00000351677
-3	+	missense_variant	112888198	3	G	3	A	A/T	12	3	72	ENSG00000179295	ENST00000392597
-3	+	missense_variant	112888198	3	G	3	A	A/T	12	3	72	ENSG00000179295	ENST00000351677
-2	+	missense_variant	112926910	2	G	2	C	Q/H	12	2	510	ENSG00000179295	ENST00000351677
-2	+	missense_variant	112926909	2	A	2	T	Q/L	12	2	510	ENSG00000179295	ENST00000351677
-2	+	missense_variant	112926900	2	C	2	A	T/K	12	2	507	ENSG00000179295	ENST00000351677
-2	+	missense_variant	112891006	2	C	2	T	H/Y	12	2	114	ENSG00000179295	ENST00000392597
-2	+	missense_variant	112888210	2	G	2	A	E/K	12	2	76	ENSG00000179295	ENST00000392597
-2	+	missense_variant	112888199	2	C	2	T	A/V	12	2	72	ENSG00000179295	ENST00000392597
-2	+	missense_variant	112888199	2	C	2	A	A/D	12	2	72	ENSG00000179295	ENST00000392597
-2	+	missense_variant	112891006	2	C	2	T	H/Y	12	2	114	ENSG00000179295	ENST00000351677
-2	+	missense_variant	112888210	2	G	2	A	E/K	12	2	76	ENSG00000179295	ENST00000351677
-2	+	missense_variant	112888199	2	C	2	T	A/V	12	2	72	ENSG00000179295	ENST00000351677
-2	+	missense_variant	112888199	2	C	2	A	A/D	12	2	72	ENSG00000179295	ENST00000351677
-0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	82	ENSG00000179295	ENST00000530818
-1	+	missense_variant	112910837	1	C	1	G	I/M	12	1	282	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112910844	1	T	1	G	F/V	12	1	285.0	ENSG00000179295	ENST00000392597
-0	+	synonymous_variant	112915507	0	A	1	G	-	12	1	302	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112915523	1	A	1	G	N/D	12	1	308	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112915743	1	A	1	G	N/S	12	1	339	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112919908	1	T	1	G	Y/D	12	1	375	ENSG00000179295	ENST00000392597
-1	+	frameshift_variant	112920002	1	-	1	T	-	12	1	406	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112924286	1	C	1	T	T/M	12	1	411	ENSG00000179295	ENST00000392597
-1	+	stop_gained	112924308	1	C	1	A	-	12	1	418	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112924331	1	A	1	T	H/L	12	1	426	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112924336	1	G	1	A	V/M	12	1	428	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	26	ENSG00000179295	ENST00000530818
-0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	34	ENSG00000179295	ENST00000530818
-1	+	stop_gained	112893784	1	G	1	T	-	12	1	70	ENSG00000179295	ENST00000530818
-0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	74	ENSG00000179295	ENST00000530818
-1	+	missense_variant	112910775	1	C	1	T	L/F	12	1	262	ENSG00000179295	ENST00000392597
-0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	237	ENSG00000179295	ENST00000392597
-0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	231	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112888165	1	G	1	T	D/Y	12	1	61	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69.0	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112888195	1	T	1	C	F/L	12	1	71	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112888197	1	T	1	A	F/L	12	1	71	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76.0	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112891015	1	C	1	T	L/F	12	1	117	ENSG00000179295	ENST00000392597
-0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	229	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112891073	1	T	1	A	L/H	12	1	136	ENSG00000179295	ENST00000392597
-0	+	synonymous_variant	112891116	0	T	1	C	-	12	1	150	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112891129	1	G	1	T	D/Y	12	1	155	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	181	ENSG00000179295	ENST00000392597
-0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	189	ENSG00000179295	ENST00000392597
-1	+	stop_gained	112893784	1	G	1	T	-	12	1	225	ENSG00000179295	ENST00000392597
-0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	76	ENSG00000179295	ENST00000530818
-1	+	missense_variant	112888163	1	G	1	T	G/V	12	1	60	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112888165	1	G	1	A	D/N	12	1	61	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112888162	1	G	1	C	G/R	12	1	60	ENSG00000179295	ENST00000392597
-0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	237	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112888165	1	G	1	T	D/Y	12	1	61	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69.0	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112888195	1	T	1	C	F/L	12	1	71	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112888197	1	T	1	A	F/L	12	1	71	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76.0	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112891015	1	C	1	T	L/F	12	1	117	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112891073	1	T	1	A	L/H	12	1	136	ENSG00000179295	ENST00000351677
-0	+	synonymous_variant	112891116	0	T	1	C	-	12	1	150	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112891129	1	G	1	T	D/Y	12	1	155	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	181	ENSG00000179295	ENST00000351677
-0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	189	ENSG00000179295	ENST00000351677
-1	+	stop_gained	112893784	1	G	1	T	-	12	1	225	ENSG00000179295	ENST00000351677
-0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	229	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112888165	1	G	1	A	D/N	12	1	61	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112888163	1	G	1	T	G/V	12	1	60	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112888162	1	G	1	C	G/R	12	1	60	ENSG00000179295	ENST00000351677
-0	+	synonymous_variant	112888161	0	T	1	C	-	12	1	59	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112884103	1	G	1	A	G/D	12	1	13	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112888139	1	C	1	G	T/S	12	1	52	ENSG00000179295	ENST00000351677
-0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	231	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112910775	1	C	1	T	L/F	12	1	262	ENSG00000179295	ENST00000351677
-0	+	synonymous_variant	112888161	0	T	1	C	-	12	1	59	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112910837	1	C	1	G	I/M	12	1	282	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112926887	1	G	1	C	G/R	12	1	503	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112926908	1	C	1	G	Q/E	12	1	510.0	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112939963	1	G	1	C	G/R	12	1	539	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112939970	1	A	1	T	E/V	12	1	541	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112939981	1	A	1	C	I/L	12	1	545	ENSG00000179295	ENST00000351677
-0	+	synonymous_variant	112939993	0	C	1	T	-	12	1	549	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112939999	1	G	1	A	D/N	12	1	551	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112940012	1	G	1	A	G/E	12	1	555	ENSG00000179295	ENST00000351677
-0	+	synonymous_variant	112940025	0	T	1	C	-	12	1	559	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112940027	1	T	1	C	L/P	12	1	560	ENSG00000179295	ENST00000351677
-0	+	synonymous_variant	112940031	0	G	1	A	-	12	1	561	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112940036	1	G	1	T	C/F	12	1	563	ENSG00000179295	ENST00000351677
-0	+	synonymous_variant	112940052	0	C	1	T	-	12	1	568	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112884103	1	G	1	A	G/D	12	1	13	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112888139	1	C	1	G	T/S	12	1	52	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112926885	1	C	1	T	S/L	12	1	502	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112926884	1	T	1	C	S/P	12	1	502	ENSG00000179295	ENST00000351677
-0	+	synonymous_variant	112926862	0	C	1	T	-	12	1	494	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112924286	1	C	1	T	T/M	12	1	411	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112910844	1	T	1	G	F/V	12	1	285.0	ENSG00000179295	ENST00000351677
-0	+	synonymous_variant	112915507	0	A	1	G	-	12	1	302	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112915523	1	A	1	G	N/D	12	1	308	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112915743	1	A	1	G	N/S	12	1	339	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112919908	1	T	1	G	Y/D	12	1	375	ENSG00000179295	ENST00000351677
-1	+	frameshift_variant	112920002	1	-	1	T	-	12	1	406	ENSG00000179295	ENST00000351677
-1	+	stop_gained	112924308	1	C	1	A	-	12	1	418	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112926852	1	C	1	T	P/L	12	1	491	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112924331	1	A	1	T	H/L	12	1	426	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112924336	1	G	1	A	V/M	12	1	428	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112926248	1	G	1	A	A/T	12	1	461	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112926249	1	C	1	G	A/G	12	1	461	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112926291	1	TT	1	CA	L/P	12	1	475	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112926839	1	G	1	T	D/Y	12	1	487	ENSG00000179295	ENST00000351677
+MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT
+5	+	missense_variant	112926888	5	G	5	T	G/V	12	5	503	ENSG00000179295	ENST00000351677
+4	+	missense_variant	112926270	4	C	4	T	T/M	12	4	468	ENSG00000179295	ENST00000351677
+3	+	missense_variant	112888198	3	G	3	A	A/T	12	3	72	ENSG00000179295	ENST00000392597
+3	+	missense_variant	112888198	3	G	3	A	A/T	12	3	72	ENSG00000179295	ENST00000351677
+2	+	missense_variant	112926910	2	G	2	C	Q/H	12	2	510	ENSG00000179295	ENST00000351677
+2	+	missense_variant	112926909	2	A	2	T	Q/L	12	2	510	ENSG00000179295	ENST00000351677
+2	+	missense_variant	112926900	2	C	2	A	T/K	12	2	507	ENSG00000179295	ENST00000351677
+2	+	missense_variant	112891006	2	C	2	T	H/Y	12	2	114	ENSG00000179295	ENST00000392597
+2	+	missense_variant	112888210	2	G	2	A	E/K	12	2	76	ENSG00000179295	ENST00000392597
+2	+	missense_variant	112888199	2	C	2	T	A/V	12	2	72	ENSG00000179295	ENST00000392597
+2	+	missense_variant	112888199	2	C	2	A	A/D	12	2	72	ENSG00000179295	ENST00000392597
+2	+	missense_variant	112891006	2	C	2	T	H/Y	12	2	114	ENSG00000179295	ENST00000351677
+2	+	missense_variant	112888210	2	G	2	A	E/K	12	2	76	ENSG00000179295	ENST00000351677
+2	+	missense_variant	112888199	2	C	2	T	A/V	12	2	72	ENSG00000179295	ENST00000351677
+2	+	missense_variant	112888199	2	C	2	A	A/D	12	2	72	ENSG00000179295	ENST00000351677
+0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	82	ENSG00000179295	ENST00000530818
+1	+	missense_variant	112910837	1	C	1	G	I/M	12	1	282	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112910844	1	T	1	G	F/V	12	1	285.0	ENSG00000179295	ENST00000392597
+0	+	synonymous_variant	112915507	0	A	1	G	-	12	1	302	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112915523	1	A	1	G	N/D	12	1	308	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112915743	1	A	1	G	N/S	12	1	339	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112919908	1	T	1	G	Y/D	12	1	375	ENSG00000179295	ENST00000392597
+1	+	frameshift_variant	112920002	1	-	1	T	-	12	1	406	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112924286	1	C	1	T	T/M	12	1	411	ENSG00000179295	ENST00000392597
+1	+	stop_gained	112924308	1	C	1	A	-	12	1	418	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112924331	1	A	1	T	H/L	12	1	426	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112924336	1	G	1	A	V/M	12	1	428	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	26	ENSG00000179295	ENST00000530818
+0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	34	ENSG00000179295	ENST00000530818
+1	+	stop_gained	112893784	1	G	1	T	-	12	1	70	ENSG00000179295	ENST00000530818
+0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	74	ENSG00000179295	ENST00000530818
+1	+	missense_variant	112910775	1	C	1	T	L/F	12	1	262	ENSG00000179295	ENST00000392597
+0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	237	ENSG00000179295	ENST00000392597
+0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	231	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112888165	1	G	1	T	D/Y	12	1	61	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69.0	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112888195	1	T	1	C	F/L	12	1	71	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112888197	1	T	1	A	F/L	12	1	71	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76.0	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112891015	1	C	1	T	L/F	12	1	117	ENSG00000179295	ENST00000392597
+0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	229	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112891073	1	T	1	A	L/H	12	1	136	ENSG00000179295	ENST00000392597
+0	+	synonymous_variant	112891116	0	T	1	C	-	12	1	150	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112891129	1	G	1	T	D/Y	12	1	155	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	181	ENSG00000179295	ENST00000392597
+0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	189	ENSG00000179295	ENST00000392597
+1	+	stop_gained	112893784	1	G	1	T	-	12	1	225	ENSG00000179295	ENST00000392597
+0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	76	ENSG00000179295	ENST00000530818
+1	+	missense_variant	112888163	1	G	1	T	G/V	12	1	60	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112888165	1	G	1	A	D/N	12	1	61	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112888162	1	G	1	C	G/R	12	1	60	ENSG00000179295	ENST00000392597
+0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	237	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112888165	1	G	1	T	D/Y	12	1	61	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69.0	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112888195	1	T	1	C	F/L	12	1	71	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112888197	1	T	1	A	F/L	12	1	71	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76.0	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112891015	1	C	1	T	L/F	12	1	117	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112891073	1	T	1	A	L/H	12	1	136	ENSG00000179295	ENST00000351677
+0	+	synonymous_variant	112891116	0	T	1	C	-	12	1	150	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112891129	1	G	1	T	D/Y	12	1	155	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	181	ENSG00000179295	ENST00000351677
+0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	189	ENSG00000179295	ENST00000351677
+1	+	stop_gained	112893784	1	G	1	T	-	12	1	225	ENSG00000179295	ENST00000351677
+0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	229	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112888165	1	G	1	A	D/N	12	1	61	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112888163	1	G	1	T	G/V	12	1	60	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112888162	1	G	1	C	G/R	12	1	60	ENSG00000179295	ENST00000351677
+0	+	synonymous_variant	112888161	0	T	1	C	-	12	1	59	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112884103	1	G	1	A	G/D	12	1	13	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112888139	1	C	1	G	T/S	12	1	52	ENSG00000179295	ENST00000351677
+0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	231	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112910775	1	C	1	T	L/F	12	1	262	ENSG00000179295	ENST00000351677
+0	+	synonymous_variant	112888161	0	T	1	C	-	12	1	59	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112910837	1	C	1	G	I/M	12	1	282	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112926887	1	G	1	C	G/R	12	1	503	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112926908	1	C	1	G	Q/E	12	1	510.0	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112939963	1	G	1	C	G/R	12	1	539	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112939970	1	A	1	T	E/V	12	1	541	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112939981	1	A	1	C	I/L	12	1	545	ENSG00000179295	ENST00000351677
+0	+	synonymous_variant	112939993	0	C	1	T	-	12	1	549	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112939999	1	G	1	A	D/N	12	1	551	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112940012	1	G	1	A	G/E	12	1	555	ENSG00000179295	ENST00000351677
+0	+	synonymous_variant	112940025	0	T	1	C	-	12	1	559	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112940027	1	T	1	C	L/P	12	1	560	ENSG00000179295	ENST00000351677
+0	+	synonymous_variant	112940031	0	G	1	A	-	12	1	561	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112940036	1	G	1	T	C/F	12	1	563	ENSG00000179295	ENST00000351677
+0	+	synonymous_variant	112940052	0	C	1	T	-	12	1	568	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112884103	1	G	1	A	G/D	12	1	13	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112888139	1	C	1	G	T/S	12	1	52	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112926885	1	C	1	T	S/L	12	1	502	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112926884	1	T	1	C	S/P	12	1	502	ENSG00000179295	ENST00000351677
+0	+	synonymous_variant	112926862	0	C	1	T	-	12	1	494	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112924286	1	C	1	T	T/M	12	1	411	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112910844	1	T	1	G	F/V	12	1	285.0	ENSG00000179295	ENST00000351677
+0	+	synonymous_variant	112915507	0	A	1	G	-	12	1	302	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112915523	1	A	1	G	N/D	12	1	308	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112915743	1	A	1	G	N/S	12	1	339	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112919908	1	T	1	G	Y/D	12	1	375	ENSG00000179295	ENST00000351677
+1	+	frameshift_variant	112920002	1	-	1	T	-	12	1	406	ENSG00000179295	ENST00000351677
+1	+	stop_gained	112924308	1	C	1	A	-	12	1	418	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112926852	1	C	1	T	P/L	12	1	491	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112924331	1	A	1	T	H/L	12	1	426	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112924336	1	G	1	A	V/M	12	1	428	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112926248	1	G	1	A	A/T	12	1	461	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112926249	1	C	1	G	A/G	12	1	461	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112926291	1	TT	1	CA	L/P	12	1	475	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112926839	1	G	1	T	D/Y	12	1	487	ENSG00000179295	ENST00000351677
--- a/data/refAPSES.mfa
+++ b/data/refAPSES.mfa
@ -1,39 +1,39 @@
->MBP1_ASPNI AN3154 XP_660758 Q5B8H6
-VYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRI
-LEREVQKGVHEKVQGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDY
-
->MBP1_BIPOR COCMIDRAFT_338 XP_007682304 W6ZM86
-KIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRI
-LEREVQKGVHEKVQGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDY
-
->MBP1_COPCI  - XP_001837394 A8NYC6
-QIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRV
-LEREVQKGEHEKVQGGYGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEF
-
->MBP1_CRYNE  - XP_569090 Q5KMQ9
-DYVPTSVSPPPAPKHSVA--PPSKARRDKEKETGRTKATPSRTGPTSAAA
-LQAQAQLN-RAKMHDSTPDADASFRSFEERVSLTEDDSSSDTPSPVASV
-
->MBP1_NEUCR Swi4 XP_955821 Q7RW59
-IYSATYSGIPVWEYQFGVDHVMRRRHDDWVNATHILKAAGFDKPARTRI
-LEREVQKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEF
-
->MBP1_PUCGR PGTG_08863 XP_003327086 E3KED4
-IYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRV
-LEREIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNF
-
->MBP1_SACCE Mbp1 NP_010227 P39678
-QIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRI
-LEKEVLKETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDF
-
->MBP1_SCHPO Res2 NP_593032 P41412
-VHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRV
-LERQVQIGAHEKVQGGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILS-
-
->MBP1_USTMA UMAG_11222 XP_011392621 A0A0D1DP35
-IFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRV
-LEREIQKGIHEKVQGGYGKYQGTWIPLDVAIELAERYNIQGLLQPITSY
-
->MBP1_WALME  - XP_006957051 I4YGC0
-IYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRI
-LEREVQKGTHEKVQGGYGKYQGTWIPMERSVELARQYRIELLLDPIINY
+>MBP1_ASPNI AN3154 XP_660758 Q5B8H6
+-VYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRI
+LEREVQKGVHEKVQGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDY
+
+>MBP1_BIPOR COCMIDRAFT_338 XP_007682304 W6ZM86
+KIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRI
+LEREVQKGVHEKVQGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDY
+
+>MBP1_COPCI  - XP_001837394 A8NYC6
+QIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRV
+LEREVQKGEHEKVQGGYGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEF
+
+>MBP1_CRYNE  - XP_569090 Q5KMQ9
+DYVPTSVSPPPAPKHSVA--PPSKARRDKEKETGRTKATPSRTGPTSAAA
+LQAQAQLN-RAKMHDSTPDADASFRSFEERVSLTEDDSSSDTPSPVASV
+
+>MBP1_NEUCR Swi4 XP_955821 Q7RW59
+-IYSATYSGIPVWEYQFGVDHVMRRRHDDWVNATHILKAAGFDKPARTRI
+LEREVQKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEF
+
+>MBP1_PUCGR PGTG_08863 XP_003327086 E3KED4
+-IYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRV
+LEREIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNF
+
+>MBP1_SACCE Mbp1 NP_010227 P39678
+QIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRI
+LEKEVLKETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDF
+
+>MBP1_SCHPO Res2 NP_593032 P41412
+-VHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRV
+LERQVQIGAHEKVQGGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILS-
+
+>MBP1_USTMA UMAG_11222 XP_011392621 A0A0D1DP35
+-IFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRV
+LEREIQKGIHEKVQGGYGKYQGTWIPLDVAIELAERYNIQGLLQPITSY
+
+>MBP1_WALME  - XP_006957051 I4YGC0
+-IYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRI
+LEREVQKGTHEKVQGGYGKYQGTWIPMERSVELARQYRIELLLDPIINY
--- a/data/refAPSES_PSI-BLAST.json
+++ b/data/refAPSES_PSI-BLAST.json
@ -1,490 +1,490 @@
-[
-  { "name" : "68476_WALME",
-    "RefSeqID" : "XP_006957790",
-    "UniProtID" : "I4YDD8",
-    "taxonomyID" : "671144",
-    "sequence" : [
-             "MKEEKEKTPPNNITGPPTPAQNILHSTPAAFGTAGTVGQGAGGFGSQLYQSPYVDSQQSVIGSPVTPAPLPKKATLKTPQ",
-             "PRIYSAVYSGVGVYEAMIRGIAVMRRRADGYMNATQILKVAGVDKGRRTKILEREILAGLHEKIQGGYGKYQGTWIPFER",
-             "GRELALQYGCDHLLAPIFDFNPSVMQPSAGRSAKSPSKKRQNSIVLSPTQERHQSSIIALNTARASGIYVGGADDPNDDG",
-             "LSKKEKSPVKKSKYDEVPVNVSKRPYVPPPGTNAHILTRTQQSLTALFQQPTTNSDFIPEAVAILDTTSGALHPDLAIDE",
-             "LGHTALHWAASLGRISNVQQLIKKGADMKRGNIEGETPLERSVLVNDNYDKKTFAYLLQELGSSIRVVDRTGRSILHHIA",
-             "LIAAVNGRSMSAKYYMENVLEYIARYENGEFKSLVDLQDEHGDTALNISARVGNRNLVKMLVDAGANKTVVNKLGLKASD",
-             "FGVEHETLNSVTGDEMLSNLQPPPPLNVDSSASVLENIHNLLNGITQQYTDETSGKNALLFEIQAELKQHSHELADVRKE",
-             "IQYWQNKATQMAEVDQKIKNINEAIENEKVQTWSLLGEANADKMEGIETSSSSNTSEIKIPTGDNEESLKQLRKLSKWLE",
-             "GTQKLTEERVASIDGLSASKEVKYKSIVSVCTGVPVNEVEGMLAQLLEAMESDANADLNKVQEFLAREC"]
-  },
-  { "name" : "00846_COPCI",
-    "RefSeqID" : "XP_001831299",
-    "UniProtID" : "A8N8X1",
-    "taxonomyID" : "240176",
-    "sequence" : [
-             "MQASTRPPGSNQPPVKIYNAVYSSVQVYECMVRGIAVMRRRNDSYVNATQILKVAGVDKGRRTKILEKEILPGKHEIVQG",
-             "GYGKYQGTWIPLERGRDIAAQYGVAPLLSPLFDFQPSTNSLGALPVSTPGGTASPRPLSASSSYSSMGVAGQYIPSSIPS",
-             "NLPPAPIMPGSALRLLNQGRAQGLFTPSTTSATLRPAGYHSPGPYGTSYAPSPQPQSSQTPPPGSGLKRNRSEAEVEGYH",
-             "SQPHDVQMADAPPPNTASQPNEDNPSPAKRLRTDGSITTEPASSQGQWQQQQPLPYASQQRSGPGLSQLSGHNGHGSSRP",
-             "PSSLSAPNGNRPAHTNPEDQTRKTRFSSKPSMPRGMDPHMPFKDARRSALIALICHRDDPTSVIDLLREISADHLNPPSF",
-             "DVDTVLDDQGHTALHLAASMARTQTVDMLIQTGADMHRGNHLGETPLIRACLATPNSDQQSFATLVNYLHDSIWTLDTSK",
-             "KSVVHHIVSLAGVKGRAVVARYYLDQIFYWIAQHEGGDFRSLVDLQDEHGDTAINIAARVGNRSLVRTLLDVGANRVLAN",
-             "KLGLRPGDFGVETEELSSGLRAEDLISSLRTGPPAPVQKSQDVIADMTSMIQSLSTEFQAEIKSKQDSLDVTQAHLRAAT",
-             "RELSEQRKQIQTWQARCGDLDQINQRVRNVEKAIAEEDMFDWTGRTELDGKDGKEKGGPAFAYRGSKSTMVGVGGSVDVS",
-             "FSVESEPPLPTTDTAASLVKLRRLKMWHQRMEELVKGRLKGLQGASAEKEYQCKKIVALCTGIPLDKVEEMLDNLVIAVE",
-             "SEAQVVDIGRVSGFMQKVRDGII"]
-  },
-  { "name" : "8533_BIPOR",
-    "RefSeqID" : "XP_007691662",
-    "UniProtID" : "W6ZE71",
-    "taxonomyID" : "930090",
-    "sequence" : [
-             "MSTSHSFPAASPSHQQSALYANSPHGHALMAAPAALNRSFSDMSAFHHHAMDKPQIYTAVYSGVSVYEMEVNRVAVMRRR",
-             "SDGWLNATQILKVAGVDKGKRTKVLEKEILTGEHEKVQGGYGKYQGTWINYRRGREFCRQYGVEDVLRPLLDYDITLDGS",
-             "HAPGHAIETPTKEQAMAANRKRFYTQSIDGRTTTQNLTGTFFSNISSTATSALAAMNKVARLNSPAPRPSSSSQRRTSAT",
-             "RPSQSQPPLASQDSFRTSSQQSITSEPSFAGHNGQTDSAYATAVDESQEPPRKRIRASHDDSYSQPTAADMSIHPLSSPT",
-             "EPSESFDQHHPAQPITLADGDVPTALPPLPYPDTKQDEEKQAMLTDLFADQTRSDFTNHPAILHLSGPDLDMPIDNSSNT",
-             "ALHWAATLARVSLIRLLVSKGANMFRGNASGQTALMSAVSVNNSLDHSCFPETLEILAPLIELRDSQGRTILHHIAVTCA",
-             "IKGRAASSKYYLEALLEYLVRSNIGGGQPPPFHDTSNHSKPIGLMRFMQEMVNARDKAGNTALNLAARIGNRNIISQLME",
-             "VQADPTIPNHKGTRPMDFGVGTDLGDGQGIITATSPTKAKAPLSKAEETSREIQPLMSGILQSASLQFTQEARLKQDAID",
-             "QTNELITQLSSQQKQEQQKLQTLRARLRQRQDRAKRISNLKRWLEPQRHMLSVNDGAIDLHDKKRIGYADTQGAGLLIKE",
-             "DDLPYELRQAGDHLDRRASDGPIYLSTSVPLDPSTLSQVSHQPQCQNFLLQQLPAASVLRQRIETYTATNTALLKRSRML",
-             "KEKDGQLEMMYRKVVSLCTKVEENRIEECLEGLVAALDSEEGEGVEVGRVREFLRKVEGVD"]
-  },
-  { "name" : "PGTG_02039",
-    "RefSeqID" : "XP_003320997",
-    "UniProtID" : "E3JX03",
-    "taxonomyID" : "418459",
-    "sequence" : [
-             "MAAHKTTNDIPVSSSHHINPESGTGTSSTQAFPIPNIKNNPHVYMAVYSSVPVYEMMVRGIGVMRRRSDSYMNATQILKV",
-             "AGLDKSKRTRILEREIIQGEHEKIQGGYGRYQGTWVPFTRAQELATQLNVAQLLAPLFDYRPEPNSEVNIRSTNTKPSSS",
-             "ASRANSHKTTLARQTSRQSLNEKRERSGDTTPLPHDPPEAGPSKRSRLNTPSRQSNGSANTPSSLIDHSHSAMDPDFIIP",
-             "HSQSQPTAASQCTTSTFAPIHGATVEYPAGPSHLRKSNSSSRSHLEVALKAERNIHTLMALFSNPPDGDELESETHHENP",
-             "NSVAEVNEVLEDPELEIDTPIDEHCHTALHWASSLARLGLVRAFLRSGADVNRGNDVGETPLMRSTLVTNNFERESFNQL",
-             "LELLHPSLWTLDNQDRTVLHHICLTASIKGRGESSRYYLECICEWIVNKHGAQFDSQLFDAVDLNGDTALNIAARVGNKH",
-             "LVRMLLDVGADMTIGNNLGLKPIDFGVGAGETSASYTDDMISAPLRRNPTASAPARSSRDIITSITSSVNSLSEDFENEI",
-             "RSKTDRLESVRAQLMVATRQLTTQRRQLESLKHDLDERALLELRLKKLRMAIAEEDGFDWTGRSDLDGRPAQAGKLFEQN",
-             "GIASTLAGLSASQIQLELEPDPFIPPENNQDSLVYLRRLEKWYVRVLSLLRERIGRMKGSNLEQEAKYLKVIGSFIGNTC",
-             "TNDLSSSGSSMTGRPANQTTSTTQEVPSRATQNVNPADIHDLESMDGHRRKVSTTDAVNKSHEFGRTRSELLKASMIDNK",
-             "LLKQLMAAIESDGPELDLNRVAGFMQRVQSGSL"]
-  },
-  { "name" : "MBPA_ASPNI",
-    "RefSeqID" : "XP_664319",
-    "UniProtID" : "Q5AYB5",
-    "taxonomyID" : "227321",
-    "sequence" : [
-             "MTTSNHHQQRPSLSMSYSQGSIGSANGMSFSQSQMSSLNASQSVASTPRATPPPKSSQQSAMSFNYSNGLPNGARASFSG",
-             "FEDMNGYGTMIYHEEFKPQIYRAVYSNVSVYEMEVNGVAVMKRRSDGWLNATQILKVAGVVKARRTKTLEKEIAAGEHEK",
-             "VQGGYGKYQGTWVNYQRGVELCREYHVEELLRPLLEYDMNPNGTAASGQDSLDTPTKEQAMAAQRKRLYSGMENRSMSQP",
-             "QQGTFFQNISRTAATAVNAMSKARFESPAARGGDSRRLSVIRKPSQQMGSQDAQPPFGSQQSFYSAASDSGFASNIPTNG",
-             "RYAPQDAMSFEQEEPMEPPRKRIRSSQAFSLPIDGTSMSMSEPTPTEPNDSFYQDMEPLHHIDEGRHGLDPLPPATTPER",
-             "FQKMKLIMTLFLDKTTKDFSTHPALIQLSGEDLEVPLDEYRNNALHWAAMLARMPLVYALVKKGVNIARLNGAGETALQK",
-             "AVGTRNNLDYRSFPRLLQVLAPTIDMVDRSGRTILHHIAVMAATGHGGHVSAKHYLEALLEFIVRHGGTSLNQQSNGTAS",
-             "QPGMPLSNEVITLGRFISEIVNLRDDQGDTALNLAGRARSVLVPQLLEVGADPHIPNHTGLRPADYGVGVDMVDGSSQPA",
-             "GSRSDTFLAQLAKTRKEILEATTAQVTAIVQETLGTFDKELAASLTSKQEKFDHWHAKIRESAKARQIEQKQLDELKRRS",
-             "IDRTETSRRLKNLEKSSTDLLEAHKEILTNLGDTSKPVSLGDADQESGFEIAEFEALFPETFDPASGFSEAQIAYLRKLP",
-             "SAEILEQRVSCYRAFNKETLDEIDALRSKNVVLGQNYRRMVMACTGWSAEQVDEAAEGLTQCVKELNDNPVPEDEAIEIL",
-             "MRDRGQDW"]
-  },
-  { "name" : "05520_CRYNE",
-    "RefSeqID" : "XP_570545",
-    "UniProtID" : "Q5KHS0",
-    "taxonomyID" : "214684",
-    "sequence" : [
-             "MEPPSNPIQPPVTPSHHSLLSAISPALSEQTPAPIHTLPPHLRPSIPQPHIAPPRPSSVQPTMEEQQRMHHIQQHQQQQH",
-             "FQQQQNDENVFGSVMGAPGHVPGHEAPMSTQPKVYASVYSGVPVFEAMIRGISVMRRASDSWVNATQILKVAGVHKSART",
-             "KILEKEVLNGIHEKIQGGYGKYQGTWVPLDRGRDLAEQYGVGSYLSSVFDFVPSASVIAALPVIRTGTPDRSGQQTPSGL",
-             "PGHPNQRVISPFANHGQTTPHMPPPQFIHQGNEQMMNLPPHPSSLAYPTQPKPYFSMPLQHTVGPQYDERHEGMTMTPTM",
-             "SMDGLAPPADIARMGFPYNPSDIYIDQYGQPHATYQASPYGKESGHPSKRQRSDAEGSYIESGAAVQQHVEQDEEADDGL",
-             "DNDSTASDDARDPPPLPSSMLLPHKPIRPKATPANGRIKSRLVQIFNVEGQVNLRSVFGLAPDQLPNFDIDMVIDDQGHS",
-             "ALHWACALARLSIVQQLIELGADIHRGNYAGETPLIRAVLTSNHAEAGSFTDLLHLLSPSIRTLDHAYRTVLHHIALVAG",
-             "VKGRVPAARTYMASVLEWVAREQQANNTHSITNPPNPADRNELAPINLRTLVDVQDVHGDTALNVAARVGNKGLVGLLLD",
-             "AGADKTRANKLGLRPENFGLEIEALKISNGEAVMANLKSEVSKPERKSRDVQKNIATIFESISSTFSSEMLAKQTKLNAT",
-             "EASVRHATRALADKRQHLHRAQEKLATMQLFEQRSENVRRIMDAIAAGTLLTPAEFTGRTQTMHEKSTGQLPPLAFRHVP",
-             "GLALDASSQSQLNGAPPSTPLSVEDQEDIALPERDDPECLVKLRRMALWEDRIAEVLEDKIRAMEGEGVDRAVKYRKLVS",
-             "VCAKVPVDKVDSMLDGLVAAVESEGQGLDFSRASNFVNRIKATKS"]
-  },
-  { "name" : "RES1_SCHPO",
-    "RefSeqID" : "NP_595496",
-    "UniProtID" : "P33520",
-    "taxonomyID" : "284812",
-    "sequence" : [
-             "MYNDQIHKITYSGVEVFEYTINGFPLMKRCHDNWLNATQILKIAELDKPRRTRILEKFAQKGLHEKIQGGCGKYQGTWVP",
-             "SERAVELAHEYNVFDLIQPLIEYSGSAFMPMSTFTPQSNRKPTEAYRRNSPVKKSFSRPSHSLLYPYTSSNNMTSTSRMS",
-             "GIHDALSLQSDFTRSPDMPSDSFTGSLHDIKASPFSSNNYAQSLLDYFLLPNTTQPPDFVYDRPSDWDVNAGIDEDGHTA",
-             "LHWAAAMGNLEMMHALLQAGANVVAVNYLQQTSLMRCVMFTMNYDLQTFEVVSELLQSAICMNDSFGQTVFHHIALLASS",
-             "KSKMEAARYYMDILLQNLTATQSVDVAAQIINLQDDHGDTALLICARNGAKKCARLLLSFYASSSIPNNQGQYPTDFLSS",
-             "KDMSFPENDDSPLNSKIEDNLIDNLKYPQSLDDHLSSKKPISYFSNKLTHQTLPNVFTQLSELSKCHEASLAEKQLTYNL",
-             "AMEALEQTVRETETCQRLWNERTNNDENYLVNQREDLIHQCKKFLHTLKTARYYLETVQLHQLKKYVTYFSQIWSTDELA",
-             "DISETKNLVGHDTKTNRSSLSSKHEVDLFTAENEAAREKLVEQLCSLQAQRKQKINEILNLLSMGMYNTINTDQSGS"]
-  },
-  { "name" : "CDC10_SCHPO",
-    "RefSeqID" : "NP_596132",
-    "UniProtID" : "P01129",
-    "taxonomyID" : "284812",
-    "sequence" : [
-             "MASANFIRQFELGNDSFSYQKRPEDEPSQPLSNRNINKLNDSSTLKDSSSRIFINSQVLRDGRPVELYAVECSGMKYMEL",
-             "SCGDNVALRRCPDSYFNISQILRLAGTSSSENAKELDDIIESGDYENVDSKHPQIDGVWVPYDRAISIAKRYGVYEILQP",
-             "LISFNLDLFPKFSKQQQIESSSISKNLNTSSFNTRSPLRNHNFSNPSKSSKNGVHTINNMQSSPSPSSSFLLPLTQIDSQ",
-             "NVKRSNNYLSTSPPILEQRLKRHRIDVSDEDLHPSSQLNDNEASSLFPDTPRLNHSLSFVSLVSSLPPLDQNIMQDYHTS",
-             "KDILTSIFLDVNFADSSALEAKLSDSLDLDVPIDELGHAALHWAAAVAKMPLLQALIHKGANPLRGNLTGETALMRSVLV",
-             "TNHLNQNSFGDLLDLLYASLPCTDRAGRTVVHHICLTAGIKGRGSASRYYLETLLNWAKKHASGNNGYMLKDFINYLNHQ",
-             "DKNGDTALNIAARIGNKNIVEVLMQAGASAYIPNRAGLSVANFGIFVENALKQPEDSKQTKVSLMSENLSSKEKTAVPPR",
-             "QKSRDIIASVTDVISSLDKDFQDEMAAKQSMIDSAYTQLRESTKKLSDLREQLHVSETQRTLFLELRQRCKNLMTSIEEQ",
-             "KSELSNLYESFDPNGIHDSLSLDADAPFTVNENNNKNLSIAELKFQVAAYERNEARLNELANKLWQRNSNIKSKCRRVVS",
-             "LCTGVDESRVDSLLESLLQAVESDGQQGEVDMGRVAGFLRVVKEHQA"]
-  },
-  { "name" : "05338_USTMA",
-    "RefSeqID" : "XP_011392041",
-    "UniProtID" : "A0A0D1BWD8",
-    "taxonomyID" : "237631",
-    "sequence" : [
-             "MPLNYFANQDQTASDTYAHEASSFPAPSSILTDTSKPLQPVQEVAASSLVDGVSFTSPHASIIHASKQSPRAASSLSFTT",
-             "SALQRAGLLPANPNMSTTATSGTSAASESLQRVITQGTASAAAINGASTPAHSGPLTPAHLKNLTPAQANAALQNPVGNI",
-             "PTVYLATYSNVPVYEITVRGIAVMRRRGDGWLNATQILKIAGIEKTRRTKILEKSILTGEHEKIQGGYGKFQGTWIPLQR",
-             "AQQVAAEYNVSHLLQPILEFDPATADQIPKLYQRKKPAASARNSSASAINDARGSTPSKIYSPAPASLGGPSQQPRFLSL",
-             "RPPKETHEQEISSAIFMPPGTAGLLSNGTFVDDRAASALAYPGPPAIPPGSTPAEQAALRSYNVYGYTPQGVPLPSSAAA",
-             "DGNGTEAAATAASTGAGKREASETDQDGASAAKRSRLTSPQQQRRDDGLLLGPSPVKDLNALGPAGGSLRAASAPRGHRI",
-             "TVGPPDAAGRDGAVPRYADRALPPKPYDEGEKRMRDRLVSLFSDDGVLPGVSEATGAGASQSAADEDDDAYVAKLDSLLA",
-             "DLREKASLGGLGASGTDGPKATVDLITDDHGHTALHWASALCRVKLVRTLVARPPWQGGANIHAGNHAGETALHRSVLVT",
-             "NSYDASSFPTLLNLLSSSLNTRDFKKRTVLHHISLVAALKGRAASARYYLACVLEHISAEKNSKYKGLIDAQDEDGETAL",
-             "GIVARLGNASMVRMLLDVGARKDLANALGIRPSDWGIESSADGASLTPSQNDGTNTVASLPPLTAADLASQNPSDIISAL",
-             "TRPAQVPVMKSSDVRDQLSSTLDDLQSSFERELKEKQDAVSTVQSHLQAATRDLAARRKTVSAAQAKLAEKDEARQRVQN",
-             "LRRAIVAQLGLEEADADLSLEQLVEEAANAASAAPADKSADKMDIDGAEDVKPVRASNLETLIDDILSFDTIQSDLKAVG",
-             "TSAVTQEVVEQDELVRLRWLVSFYQSSCDELSSTISELEDSSAKKESQCQQVVAICANIPQDKVESMLDELLTAMESDGP",
-             "DVDLARVANFMQKVGKTRENGDQPGVGAQLSSSTSLSTAVSSGGTAASSVVPAVERDGEDAKPDA"]
-  },
-  { "name" : "SWI4_SACCE",
-    "RefSeqID" : "NP_011036",
-    "UniProtID" : "P25302",
-    "taxonomyID" : "559292",
-    "sequence" : [
-             "MPFDVLISNQKDNTNHQNITPISKSVLLAPHSNHPVIEIATYSETDVYECYIRGFETKIVMRRTKDDWINITQVFKIAQF",
-             "SKTKRTKILEKESNDMQHEKVQGGYGRFQGTWIPLDSAKFLVNKYEIIDPVVNSILTFQFDPNNPPPKRSKNSILRKTSP",
-             "GTKITSPSSYNKTPRKKNSSSSTSATTTAANKKGKKNASINQPNPSPLQNLVFQTPQQFQVNSSMNIMNNNDNHTTMNFN",
-             "NDTRHNLINNISNNSNQSTIIQQQKSIHENSFNNNYSATQKPLQFFPIPTNLQNKNVALNNPNNNDSNSYSHNIDNVINS",
-             "SNNNNNGNNNNLIIVPDGPMQSQQQQQHHHEYLTNNFNHSMMDSITNGNSKKRRKKLNQSNEQQFYNQQEKIQRHFKLMK",
-             "QPLLWQSFQNPNDHHNEYCDSNGSNNNNNTVASNGSSIEVFSSNENDNSMNMSSRSMTPFSAGNTSSQNKLENKMTDQEY",
-             "KQTILTILSSERSSDVDQALLATLYPAPKNFNINFEIDDQGHTPLHWATAMANIPLIKMLITLNANALQCNKLGFNCITK",
-             "SIFYNNCYKENAFDEIISILKICLITPDVNGRLPFHYLIELSVNKSKNPMIIKSYMDSIILSLGQQDYNLLKICLNYQDN",
-             "IGNTPLHLSALNLNFEVYNRLVYLGASTDILNLDNESPASIMNKFNTPAGGSNSRNNNTKADRKLARNLPQKNYYQQQQQ",
-             "QQQPQNNVKIPKIIKTQHPDKEDSTADVNIAKTDSEVNESQYLHSNQPNSTNMNTIMEDLSNINSFVTSSVIKDIKSTPS",
-             "KILENSPILYRRRSQSISDEKEKAKDNENQVEKKKDPLNSVKTAMPSLESPSSLLPIQMSPLGKYSKPLSQQINKLNTKV",
-             "SSLQRIMGEEIKNLDNEVVETESSISNNKKRLITIAHQIEDAFDSVSNKTPINSISDLQSRIKETSSKLNSEKQNFIQSL",
-             "EKSQALKLATIVQDEESKVDMNTNSSSHPEKQEDEEPIPKSTSETSSPKNTKADAKFSNTVQESYDVNETLRLATELTIL",
-             "QFKRRMTTLKISEAKSKINSSVKLDKYRNLIGITIENIDSKLDDIEKDLRANA"]
-  },
-  { "name" : "SWI6_NEUCR",
-    "RefSeqID" : "XP_962967",
-    "UniProtID" : "Q7SBG9",
-    "taxonomyID" : "367110",
-    "sequence" : [
-             "MQPPQLGGASQQSQPSSQQSFSMSQSSQSVYRQYTDPPNRLHNDHAVPTIYSATYSGVGVYEMEVNNVAVMRRQKDGWVN",
-             "ATQILKVANIDKGRRTKILEKEIQIGEHEKVQGGYGKYQGTWIPFERGLEVCRQYGVEELLSKLLTHNRGQEGETGNVDT",
-             "PTKEQAMAAQRKRMYNASSQENRGIGSTGTFFKNISSTASTAVAAISKARFDSPAPRNRSGPSRAPSFNRQSSMQDVADF",
-             "PNSQQSLVSTEYATQTQNADSGFGSQTTQPLAGDGLEQPPRKRQRVLTPARSFGGQTPGHQPLDPFNAGNIANGDSGSPT",
-             "EPSNSFNYDQVTANDGDASYALGPLRPLPYENNADAEAKRGMLMGLFMDANGPEEAIQAALCNVSPQELDSPIDTQSHTA",
-             "LHWAATLSRMPLLRALIHAGANPWRVNACGETALMRACTVTNSMENNTFPELLDLLGCTLDVTDDKGRTVLHHIAVTSAV",
-             "KGRHYASRYYLESLLEWVVRQGSAPSSQENGIGDRKGRRMGIARFMSEIVNAQDNSGDTALNVAARVGNRSIISQLLEVG",
-             "ADPTIPNRANLKPLDFGIGIADAETNDDPAQEKTGATTGSGHKSRETSDEVVRSITHLIGESASIFQNELKKKQESIDTL",
-             "HSQLRVTSSQVGDARRTLESLQEKLKAQQLAKQKIVNFNRACEEEEQILIELEQRHGRLDVASANAWEMELESALEIVKT",
-             "QSPKGLDPDSRPSLPSAAVLRARIKALRARSSKTRQAVAALQAQSKEKELKYRRLVSLCTRRPEIEVEALLDTLTRAVES",
-             "EKPELEIARVRRFLGGVEGVVH"]
-  },
-  { "name" : "15042_USTMA",
-    "RefSeqID" : "XP_011388143",
-    "UniProtID" : "A0A0D1CVS5",
-    "taxonomyID" : "237631",
-    "sequence" : [
-             "MSTASPLHHGHGNGSYANSPAPTGVTGRDAGVAAAAVADSAVRSGSVPASASGSAPGSASGSMYGEAHTQHHTGHHHYSA",
-             "HHTHSHGALTSPVNGGHSSSWSPYGYPAAPVYGGSPSPYGHNAYSQYASGYGYANGTAHHVATAPTTPSATSTAYHTGVN",
-             "GMMMHHGQHAGYGYSSHHLGSHTPTHTHTHSSAYFMNGDGAHSHLNSSAHLTSPSYTTAPQYSTQLPLAGRHRVTTTLWE",
-             "DEGTLCFQVDARGVCVARRHDNNMINGTKLLNVCGMSRGKRDGILKNEKERIVVKVGAMHLKGVWISFARAKQLAEQNGI",
-             "ADALYPLFEPNIQSFLYHPDNYPRTAAVIAAAQERQAQRQRAPGGQPSPGANGTSQAPPLMRANTTPSNGDTSTFSSGLS",
-             "SLGSWTGSHDQGHASAPTTAQPSPSSMHNGATQMHMSLSNHGTASPTYAQSQQQQQQQQQQQQQQQQQQQQQQQQAYPMT",
-             "AAQQLARPSVGDRRQSAPISLNNSVGHAENPYGATNLGGAANGGLVNGARKVSGLKRSWNDADDLNGSAAASPTERDMQR",
-             "SGSGGSNGLKLDGDDLHSPDSSDDRLAKKTRGMPQRGGGATTAMPSMSTNMLMGVGNGSGIHHE"]
-  },
-  { "name" : "04778_USTMA",
-    "RefSeqID" : "XP_011391646",
-    "UniProtID" : "A0A0D1DQM4",
-    "taxonomyID" : "237631",
-    "sequence" : [
-             "MNQAPLSATGVNFYISGPRPARLFPTPIHEFRKGKYATAGGESGFMTVFEYDVRGHTMMIDVDTSFVRFTSITQALGKNK",
-             "VNFGRLVKTCPALDPHITKLKGGYLSIQGTWLPFDLAKELSRRIAWEIRDHLVPLFGYDFPSTCLRPDSEGFGQLAIGMS",
-             "QKRARKRHNNGGPHQTSCYGPSLPISIELWQHSTDPLRDLGESSVVGGQAIEHVSAKNSAVQPCYGSSQPATFHYSKGYG",
-             "LESRPWYGQDYLESNSLESMWNSAQAGGGSVGLQVPISTCGATASPCLAAIGANGGSPILSSPPSSNASSSSNQSYTAAG",
-             "YGLMVPPTVPSHSVNSEAGANQAEGPTPIDGSRSYASLTAHGYATGYGDANASLSTWNDATHASTFTLHVHAHVHFQPPD",
-             "PESAQLFTIHDFGSDPFYAEQVERG"]
-  },
-  { "name" : "STUA_ASPNI",
-    "RefSeqID" : "XP_663440",
-    "UniProtID" : "P36011",
-    "taxonomyID" : "227321",
-    "sequence" : [
-             "MASMNQPQPYMDVHSHLSSGQTYASHPATAGALTHYQYPQQPPVLQPTSTYGPASSYSQYPYPNSVASSQSVPPPTTSIS",
-             "SQVPAQLLPLPVTNHPVPTHGYGNNSGTPMQGYVYDPTGQMAPPGAKPRVTATLWEDEGSLCYQVEAKGVCVARREDNGM",
-             "INGTKLLNVAGMTRGRRDGILKSEKVRNVVKIGPMHLKGVWIPFDRALEFANKEKITDLLYPLFVQHISNLLYHPANQNQ",
-             "RNMTVPDSRRLEGPQPVVRTPQAQQPPSLHHHSLQTPVPSHMSQPGGRPSLDRAHTFPTPPARMNSSVPNTQPLSIDTSL",
-             "SNARSMPTTPATTPPGNNLQGMQSYQPQSGYDSKPYYSAAPSTHPQYAPQQPLPQQSMAQYGHSMPTSSYRDMAPPSSQR",
-             "GSVTEIESDVKTERYGQGTVAKTEPEQEQEYAQPDSGYNTGRGSYYTTNPSVGGLAHDHSQLTPDMTGSPQQNGSGRMTP",
-             "RTSNTAPQWAPGYTTPPRPAAASSLYNIVSDTRGTSGANGSTSDNYSVASNSGYSTGMNGSMGSNKRMRDDDDDRIVPPD",
-             "SRGEFDTKRRKTLTETPVGGPVGGVPLGLQPMKAGGSLISARR"]
-  },
-  { "name" : "STUA_NEUCR",
-    "RefSeqID" : "XP_960837",
-    "UniProtID" : "Q1K6U0",
-    "taxonomyID" : "367110",
-    "sequence" : [
-             "MNPNTPADVYYGQMSQGSSMPVTTVPSHSHYASQQPPPLLQPGSTYAHQYGTPQYGYANALSSPASIPPSLPPSMNSMAG",
-             "QSVLPLPGSGSMNPAVYASGGFDTTGQVAPPGMKPRVTATLWEDEGSLCFQVEARGICVARREDNAMINGTKLLNVAGMT",
-             "RGRRDGILKSEKVRHVVKIGPMHLKGVWIPFERALDFANKEKITELLYPLFVHNIGALLYHPTNQSRTSQVMAAAEQRRK",
-             "DSHGQLRGPPGLPSLQQHHHHHSMLPGPPSLPSHPSMGRPALDRAHTFPTPPTSASSVMGPMGNSDGYQWSQQSMSGTQG",
-             "NSSLSLDTSLGSNARSMPSTPATTPPGSTIQSMQNYPPVSQSYESSRQMYQGQSAQQAQYQSQQHYSSQPQHQERPVYSQ",
-             "SSYIKNDMGPPSGRPTGQSNDASDSKPPTGMIHQGQGQSDPGTHAGSEEDDDANNEAEYTHDSGGYDANRGSYNYNTQAV",
-             "NSLPHDHGLAPEIGGSPHQAGSGRATPRTAAAPSSYYSAQGYHTPPRGQPSSSLYNVMSNERTGSNGTQGNEMYAGQADM",
-             "PSSLPNGYSAQPSVMNGSSGGLKRGRDDDDDGGRPTTSAPNLGPGMDMKRRKTMMDGGSLPSPTYTATIAQAAPSAIAAH",
-             "RRR"]
-  },
-  { "name" : "PHD1_SACCE",
-    "RefSeqID" : "NP_012881",
-    "UniProtID" : "P36093",
-    "taxonomyID" : "559292",
-    "sequence" : [
-             "MYHVPEMRLHYPLVNTQSNAAITPTRSYDNTLPSFNELSHQSTINLPFVQRETPNAYANVAQLATSPTQAKSGYYCRYYA",
-             "VPFPTYPQQPQSPYQQAVLPYATIPNSNFQPSSFPVMAVMPPEVQFDGSFLNTLHPHTELPPIIQNTNDTSVARPNNLKS",
-             "IAAASPTVTATTRTPGVSSTSVLKPRVITTMWEDENTICYQVEANGISVVRRADNNMINGTKLLNVTKMTRGRRDGILRS",
-             "EKVREVVKIGSMHLKGVWIPFERAYILAQREQILDHLYPLFVKDIESIVDARKPSNKASLTPKSSPAPIKQEPSDNKHEI",
-             "ATEIKPKSIDALSNGASTQGAGELPHLKINHIDTEAQTSRAKNELS"]
-  },
-  { "name" : "08099_COPCI",
-    "RefSeqID" : "XP_001836714",
-    "UniProtID" : "A8NVH3",
-    "taxonomyID" : "240176",
-    "sequence" : [
-             "MSTGMLQETLQTTSASTSGTRFRPYASPNHQVTKGRYITSNDPRGYIPVYEYPLNGQWIMMDIDDGYILWTGIWKALGNS",
-             "KADIVKMIDSQPDLAPLIRRVRGGYLKIQGTWMPYEVALKLSRRVAWPIRHDLVPLFGPTFPSTCLSPDQPGYGQVVASS",
-             "NVRRRARRNTQATAQPPREAHSNWTVMTPGPMVGLSFPHSQFSRPPLPPLAPTPARSPSDYAPSSHYGNQLDPQDARRYS",
-             "HSPYSPLASPPERKSSISSKALSLEIPPVRPSSSKAREDISLPPLKQPDGADPEMSPYALPPISALEDLRGVDTQDSAAV",
-             "LRRLRLDDDYPSSSRSSTSQDSIWGRRHSLSAHSPHPRSSDNSRFQPYLSSRSYQDSTLKRSRSPAESYADRRRASDFSQ",
-             "EDSTSAYSPISPATPNSSILSHSSFSDLKKLASSTDTRYNFPRISGRDWAPLKGDTDHIRSSYRSGPSPLELDSDSESSA",
-             "PHRPW"]
-  },
-  { "name" : "68479_WALME",
-    "RefSeqID" : "XP_006957792",
-    "UniProtID" : "I4YDE0",
-    "taxonomyID" : "671144",
-    "sequence" : [
-             "MTNKVQELWWEENKTRVWQVEVDNGNYVARRQDNDQINGTKLLNITKITRGKRDGILKNEKSRQVVKTGTITLKGVWIPF",
-             "ERAIILARQFNIEQQLYPLFETNLGDYVENSIGSHQIKRKSLNNLMDSLTTNRELVSKRRSTVSTYNPATSAYVSPYGFS",
-             "PQHCYQTEFEDMNQHSGEIQSGRPRNTSSASDWMTNWSTSSSSPVIPATPNTFSPVMNTFQSLALHSPPIPIPNYYYDSS",
-             "SSYFPSYHQKQQQQQVQMQMQMHTTASIGGDRQSNEYIQR"]
-  },
-  { "name" : "11943_PUCGR",
-    "RefSeqID" : "XP_003330006",
-    "UniProtID" : "E3KMR2",
-    "taxonomyID" : "418459",
-    "sequence" : [
-             "MAAAPTSSFLTSMSAQPPRTVQALVNEEVRAPPPVRLYPSQHRVSMTRYATSTDPRGYIPVFEYPLNGQYIMIDCETGMV",
-             "HFTGIWKALGHTKADVVKLVESDPTIAPYLRKVRGGYLKIQGTWLPFDTAQTLARRVAWQVRYDLVPLFGPDFPDTCLGP",
-             "GEPGFGQLLLSAPKPRGRRGAKKAAAAPTVAHERTASPQDNRSQSRPGPYPSQESFGNRCSGRVEAVGAMNGYSPMLSQA",
-             "RYSPYTRAPVHRITQLEPLPSLIQPNQSCPHPTADSMYSSHYHQSPRQSMMTSHGAGPYGQQHLTGSTASGMQSTAPLPS",
-             "MRPHQAHQSENNFFETYRGPDSFEALSNKWLAPEVANPSLNDSGLLHGEGGCLPPLQYSNNPVLRNGPSGSPTNQYNFPN",
-             "QIDSAHSSHHIDSNQTQHVHRHAGFPYESQHQSNFRHDLSTEEAAHHPASPSQQPPPSVTYDKAHNSEPQAGSQAANVTA",
-             "GCYAASGSNSTGNPAGSPGSHSSHVPKSPTPSSASTSTHMQNSHNPNSHRSPSNTLTNMSNNGGFNSNTQGEEAIQFSVL",
-             "TSPAHLETSGPSENSIPPAQSSDSDWNPAQNTTGLSPSQAPRQ"]
-  },
-  { "name" : "03082_PUCGR",
-    "RefSeqID" : "XP_003321545",
-    "UniProtID" : "E3JYK1",
-    "taxonomyID" : "418459",
-    "sequence" : [
-             "MILISPTRTLPSPRPIDTDPILNYRHIQPAAAAAAVGPWLGQNQHHHHHHDTLAKSPNITTAPATHSPSELSASPAPSAV",
-             "STGSSLLDPQSVPHIKIPHSSSPPAIMLPQPSSDDDSSTAEEEQPSAQSSNATLNTPTPHTNAPHQLDSHASSVGLYDLP",
-             "PTSSSAPTTSSSSSPFPSNVPSHQQPSPYSSSPHPNQEHHPHHPHHGNQFYQQSPPALHSPLQSAHHPQQSFDARPHSSL",
-             "FAHQHYHSRPQSAPHSTSQFSLDPHVLAAAAANVEVKKWDEENTYYYQVAHKGVTVGRLKGSGLVNGTKLLNLAGISRGK",
-             "RDGILKNEKIRKVVKHGTMHLKGVWIAFDRAVFLAEQHSIADKIFPLLVVNLEHYVPIEPPLMAGGTKLGPGSLFHHHHP",
-             "RHPRLLPQPIKFPPSTISLAPASANSFSSTGGWPSGPSSALPSIGYNEPFSAPPIPRSAATADTSPSIYEQAQFQYLNSA",
-             "QANNPDLLERRHTLPNNSFHGYNSVPSFGSSQPPPPVSYSFHYNSTHVPGYPPRSSTAESATPNQFEYQSKNHNGNGNGD",
-             "AAGSYPATLYHSQPAARPVSSTTAQPSPALNSAPLLLGDLSPGSSTQIVDHGAGDFRLSTGTSNGQVKQEGDDESCNEKR",
-             "LIMEWNPSC"]
-  },
-  { "name" : "SOK2_SACCE",
-    "RefSeqID" : "NP_013729",
-    "UniProtID" : "P53438",
-    "taxonomyID" : "559292",
-    "sequence" : [
-             "MPIGNPINTNDIKSNRMRQESNMSAVSNSESTIGQSTQQQQQQQQYLGQSVQPLMPVSYQYVVPEQWPYPQYYQQPQSQS",
-             "QQQLQSQPQMYQVQESFQSSGSDSNASNPPSTSVGVPSNATATALPNGSAITTKKSNNSTNISNNVPYYYYFPQMQAQQS",
-             "MAYSYPQAYYYYPANGDGTTNGATPSVTSNQVQNPNLEKTYSTFEQQQQHQQQQQLQAQTYPAQPPKIGNAFSKFSKSGP",
-             "PSDSSSGSMSPNSNRTSRNSNSISSLAQQPPMSNYPQPSTYQYPGFHKTSSIPNSHSPIPPRSLTTPTQGPTSQNGPLSY",
-             "NLPQVGLLPPQQQQQVSPLYDGNSITPPVKPSTDQETYLTANRHGVSDQQYDSMAKTMNSFQTTTIRHPMPLIATTNATG",
-             "SNTSGTSASIIRPRVTTTMWEDEKTLCYQVEANGISVVRRADNDMVNGTKLLNVTKMTRGRRDGILKAEKIRHVVKIGSM",
-             "HLKGVWIPFERALAIAQREKIADYLYPLFIRDIQSVLKQNNPSNDSSSSSSSTGIKSISPRTYYQPINNYQNPNGPSNIS",
-             "AAQLTYSSMNLNNKIIPNNSIPAVSTIAAGEKPLKKCTMPNSNQLEGHTITNLQTLSATMPMKQQLMGNIASPLSYPRNA",
-             "TMNSASTLGITPADSKPLTPSPTTTNTNQSSESNVGSIHTGITLPRVESESASHSKWSKEADSGNTVPDNQTLKEPRSSQ",
-             "LPISALTSTDTDKIKTSTSDEATQPNEPSEAEPVKESESSKSQVDGAGDVSNEEIAADDTKKQEK"]
-  },
-  { "name" : "14426_COPCI",
-    "RefSeqID" : "XP_002911429",
-    "UniProtID" : "D6RMB0",
-    "taxonomyID" : "240176",
-    "sequence" : [
-             "MTARPPLPLRHANPSLRDGNATIPPVKYQILSCQGKDILVGRLKIDTTDGGHAFILRRFDTQAISLTTMFRAAFPTASEA",
-             "EEKDEINYVKANFDLFGNNGSSKEPHITRLAGTWVNRDTAGQLAHDYNMVDLINTMVEAEPDPNGQYRRSNKSAQNNNPP",
-             "TNAPEPTPATNVHATRSPAKQSPKPPSKTLPTPSPGSGDAQPPAPKRRREGSPATFTSGIPVASSPAVPKTPGPRRSTRT",
-             "KSPAPSRVPQPLTATKPRSRASVAPPSPKKRPVDLPKSSPIKAEEDTAVEDNVAGNELYAQDISEQKKLIADLKAAASSK",
-             "KPADTVKEDDDQQMEEEGQGPSKLKRIRQDEEKPLQFEFKEPEREERQIATNRRVGRFDMQPERKSLAWGIAAFAFGMTA",
-             "ITYLPNFL"]
-  },
-  { "name" : "BQT4_SCHPO",
-    "RefSeqID" : "NP_596166",
-    "UniProtID" : "O60158",
-    "taxonomyID" : "284812",
-    "sequence" : [
-             "MTENEKSRSLPAERNPLYKDDTLDHTPLIPKCRAQVIEFPDGPATFVRLKCTNPESKVPHFLMRMAKDSSISATSMFRSA",
-             "FPKATQEEEDLEMRWIRDNLNPIEDKRVAGLWVPPADALALAKDYSMTPFINALLEASSTPSTYATPSRPTAQKSETSEG",
-             "EPESSTSATTTSVARRTRQRLAEHLENSKKTILQHDNKEEDKEIHSEENETKDEIKSEKKEPEIKKQEGGSSTEKVGQPS",
-             "SSDDKAKGSTSKDQPSEEEEKTSDIQDRKIKTPIKPSLLGKIRSSVNKGMTDVASQVNRGMTDVASQVNKGVNGVASQVN",
-             "KGMNGVANQVNKGVTGVASQVRKPVGKLEKKFENLEKSIGDTLKSSIRSSPKSKKRSREDFEENEDYNAMVPVKRSRITK",
-             "LESEVYYEKRKVRALGGIAIGLGVGAILPFLF"]
-  },
-  { "name" : "PGTG_05590",
-    "RefSeqID" : "XP_003323688",
-    "UniProtID" : "E3K4V4",
-    "taxonomyID" : "418459",
-    "sequence" : [
-             "MPKSSSCCEPEQKQSIPTNANPISAGGAGLDIRLAGMRSAHATLRGCSFSPYMVTQHPPLRDSVNRNKQQPTNNSTNPYT",
-             "KKASRMSQTNLYKSNNPPNLPQDEFNQTLVNYQGKLRSIRIQDININGHTITIARIKIPSPEKLSSHLIKRFDTNAISAS",
-             "SFFRSAFPHSTEEEEAIQMRYLHQIYDTHTAGAVEFGSARKLTGVWVPIENAAELAEVYGLTRFAEPLLAFPNPKENPRS",
-             "PTGTKIGGEDESSTTQTPKASQQSKLTGQISVTRSSKRSRAGPLSFGNTSPSSFSLNSFNKPPTETNKSGTHDDSKSTND",
-             "ENDEKPASPTDRVAGRGARNSPSKKPTTVDENHEHTEHEDHQLIGTDELAQRAKQEALKLVSELKNSQPCTQSSLESPTN",
-             "TLETELTRTTSPAKSNKVTRKRSSDEVSFEGEEQGEDEDEERTADETATHRSFLPKLLWRKSAAQAHPNSKKHKRTQLGG",
-             "GGSSSSSSKSFVPLLTNSATPSVDDSSSTHNPNKRNLAIAGIVIAGAAA"]
-  },
-  { "name" : "06560_NEUCR",
-    "RefSeqID" : "XP_962267",
-    "UniProtID" : "Q7S9H5",
-    "taxonomyID" : "367110",
-    "sequence" : [
-             "MAQVARHLPARRNPLMLEDVPSHTDLASRRRLGQTQLTPRMVTAVPGAEVDPSSLLAFDYAHLRAPLPKGIVSGIFKSSP",
-             "PSYFLMRRSQDGYISATGMFKATFPYASQEEEEAERKYIKSIPTTSSEETAGNVWIPPEQALILAEEYQITPWIRALLDP",
-             "SDIAVTATDSSAPKQIAPPPKFFGAQPPLVAPTPPTTRSTRSRPSSRRSSSPAKSTTTSKRGTTPRNTKRTVTTEASATT",
-             "VTTTATATAVPSAETPATSFADSQAPTLINGEIPTSTPINTVPVTKIQTTEAELKVESIEKEPVVVLEPIEEEPKIKVRV",
-             "DEDVKLDKDGEEVKHTKVELEVPLMAGEPPSKEEARKMIEEAKAMVEAAVKADAEAAAALVEASKAGAEDEKAEDEAKAE",
-             "TEATKEEEADSKGKRKAEKISVDEDEKAADEAEQPRQAKRVKTEAELRKDRIRKRAYLGLTATFAVGALGALLPIITPYV",
-             "ANVL"]
-  },
-  { "name" : "81480_BIPOR",
-    "RefSeqID" : "XP_007682909",
-    "UniProtID" : "W6ZKJ4",
-    "taxonomyID" : "930090",
-    "sequence" : [
-             "MVVDRVLPERKNPLLEPTDSTSIEILIERRRLGQTNLGVKAGVSGIANATKPENMGTFDYAHLRVPLPKDLTGSGIFSRN",
-             "RMSAFPESYFLMRRSSDGYISATGMFKAAFPWASLQEEDLERKYQKTFPSAGDEEVAGSVWIAPEEALALSEEYSMRHWI",
-             "EALLDPAPIEKGGKDKSNAAIQMPPRFDVANAQPATLPTFGFRQTRARSARSVSPSKAMTPGRKYATPRKGRSTRSAMKP",
-             "DATHADDMFRPIEAVTPSTALQNSIARRIAPAETIASSIEGEVKEVEQEVKAALDAEKKPEPELEVQEGTVHIEVKQTVE",
-             "TNGDTEKTSTSVTVDVPHDHAALPEPEDPTAMIEEAKRMVAEAQKLEGGSPSVTRSSKRGIEEVLDEEDLADERLNKLAK",
-             "KAYTTEQKMTKEKVTRRALVGLGVMAAIGTAFQYFV"]
-  },
-  { "name" : "01622_ASPNI",
-    "RefSeqID" : "XP_657766",
-    "UniProtID" : "Q5BH18",
-    "taxonomyID" : "227321",
-    "sequence" : [
-             "MVRSLPKKNNPFVTPDAAPPYEELLMRRRLGKTNLAVKPTQVGTSNATKPENLGPFEYAHLRAPLPKDLKGSEIFPSHSP",
-             "QQHPETYFLMRRSKDGYVSATGMFKIAFPWAKLEEERSEREYLKTRPETSEDEIAGNVWISPVLALELAAEYKMYDWVRA",
-             "LLDPTEIIQSPSSAKKQITPPPKFELPPIQAPEALVPSSRTRSRRSASPSKKAGTPRKPRQTKAQKEAAVAATNEANATL",
-             "QSALDDTVSNADGEINGDVLPSVEDKREPETSPVKGKKAAAKAKKQAVSEEDQEDKVKIEIKSDAAEGSDVQAAQTTISV",
-             "EMPISLPEAPSAEDTQEMIAKAKEMVKEAVKLQQEPAESSATAKKRGAEEAELGEEEEDEETKTLRTKRAKVLEEKLKRE",
-             "RVRNRALMGVTAAFALAKPALVLLEA"]
-  },
-  { "name" : "05405_ASPNI",
-    "RefSeqID" : "XP_663009",
-    "UniProtID" : "Q5B225",
-    "taxonomyID" : "227321",
-    "sequence" : [
-             "MASIQFLLNPLPSLPSSDRCPLPTPSPTISSSTAMLRSPRQKKQKMAKDAPIFQRGKPRGEVRYPPYEDRDGKFSCQHQD",
-             "FRIHPLGNIADYPRHIPYNSDKKSFQERTGRESFEVFQYTFQLPGEEKQWTVMWDYNIGLVRTTHLFKCNDYSKTTPAKM",
-             "LNQNPGLRDICHSITGGALAAQGYWMPYEAAKAIAATFCWKIRFALTPLFGDNFPDLCIHPDDRARFGRMVIDPGIVRIA",
-             "TEKANLYRMLELRCSTTNSLRADYVLRPSSAPDIDRTDPNLERDRVALGRHILPKSHRHHHHRSKTSPSTNTSLVGYGSS",
-             "PEVEYYSCGTEPYCVSPESPIRSSFTPVNTPRSTDIYPSSSSTNFLRSPHELLASLSSSASIARARIERASKISGARVIP",
-             "SSVPSNVTSITTKGRDNTGHSALMEESDIDADAETDSGHEHDLDFELSSSDESSTSSTVSSSTSSASLGFAANSRNRPYR",
-             "DDDEPHRDTDEEMVDYRAPKRIATAGARDRRWGRGRRVIHQEHSDIETSRRARKHAQRSSNARLVCEMTAAHALISLLHD",
-             "ATGSDVDVDTHNRLECGRSPDGGVKNNLKGSYFGIRLNHNPSTESGQKRRRASA"]
-  },
-  { "name" : "105954_BIPOR",
-    "RefSeqID" : "XP_007691967",
-    "UniProtID" : "W6Z1H5",
-    "taxonomyID" : "930090",
-    "sequence" : [
-             "MNIQDLLNPSCGDRHDHRRSESATPPSRPVAILPALRRQKIPKDAPIFSEGNRTVGIVNFAPHEAGNDEELLAQHCRFQI",
-             "YPLGEISRKGVRHIPYNSDKKDFLEKTGRDAFEMFQYTYKLPGEDKPYVVVWDYNVGLVRMTPFFKSCKYSKTIPAKTLR",
-             "ENPGLKDISYSITGGALVCQGYWIPYQAARAIAATFCYDIRWALTPVFGNDFPSICLTPDDPSFAKFVIDPAIVRYCTEE",
-             "TTKFRELGSAYEVHRPVAPTQVEAPTSRSDQPLSTSIVRQRRARPIDIESGYGTDTERNDRCLFSPEVSPRTRFTPINRP",
-             "RSPYSPRTAESSFVSSPVSIRAPPGLHTPTSTPYEHSGEVFRAKRSHSKVAFCEHPADEAVIRPPTAATVDSAHGCEMCV",
-             "GDDNHSHLDMDAAEMLLSLRTADSAMPPSKRTRRGS"]
-  },
-  { "name" : "69819_WALME",
-    "RefSeqID" : "XP_006959479",
-    "UniProtID" : "I4Y911",
-    "taxonomyID" : "671144",
-    "sequence" : [
-             "MTSPGLPKDFNELLDKSEIPSPKWQQITRDDRPITIARLKLPHPREKHTFILRRYDCNGISFGSLFKAAYPYATDEEEKI",
-             "ESGFVKKNYDVTLVPTEEYQERKLAKLAGFWIPIAIAEELGQRYAMAEYVDALAKADTPDLTDFKKRSSNRQTSEDIKSS",
-             "PAKAQASLESPAKSASKIPTPTKNPAPRRSARHQSRSPSPSPLTHNLTPGKKKAKKAPKEAVIEESVEETIVVDKKESPL",
-             "KKALNDDQVLADIERAKDLVDDIKQSKNLSQSSPVKVVKEEVLETIQPSVSTESLEGEGKRKRELEDETGNEIKVVSFGQ",
-             "NPPANPEEIQQRPVVQRRGVAAAVGAFALGVGFAASNILPRFLF"]
-  },
-  { "name" : "02840_CRYNE",
-    "RefSeqID" : "XP_568872",
-    "UniProtID" : "Q5KM59",
-    "taxonomyID" : "214684",
-    "sequence" : [
-             "MSHPAADAPPPYPGTTDDAQYDLTPLPHTANRPRLPEDKRNPHLNNLPEDTKIVKFQTIVRENKEIVVGRIKVPTENANG",
-             "THHAFILRRYDTNAISLTTMYKVAFPSATEEEEKREMDWVKSSFDTRGTNGGRDSEVVRLAGQWVSRNLAIHIAPAYNLV",
-             "QLVAALSRAVPDPNVAYRKSQRSQAAADELARTKAKQSQAPSSVPAISNVPVRKPQAAIPSMATEISSPASKRQRKDSVT",
-             "EASGSATQTITEAQPSADTSETDDTRHITIEATTTITSPSGANVDMDAEIEQAKQLVKDLRQEIQLRNEAGDSLEDQGVA",
-             "VADDVRGVKRGKHEDEAVVISGGAGGKDRVVRTNKRIPQTAGGDVGQRFGWGAFVFSIGLGASLTLFSQYASSLL"]
-  },
-  { "name" : "11055_USTMA",
-    "RefSeqID" : "XP_011390537",
-    "UniProtID" : "A0A0D1DZM8",
-    "taxonomyID" : "237631",
-    "sequence" : [
-             "MPAAASARKSTPTRKSTPRRARSSSVTSNASTGVPASPSASPRKTKKQKEAAAAAAAAVAAAAATAEQVNDDESDLLRPK",
-             "LPTKRNPRLKEVDEAVVKLQIIKREGHNIIIGRVKLPTVNGQDHAFLLKRFDTNAMAASSMFRLAFPFADGTAEAAEMRF",
-             "LDTKYDTNRANGGYIVEEVKVPETPKKRGRTRKTAENSKKESTPDTESVSADKQIRVLPEGSTGVRLQGTWIPAEDAIEV",
-             "AEDYGIAKYALALIHATAEHAEDGGAPILTSEPVAEVKTPRKRQRVSAAAATASDTPDSPQLVQRVTRLENADGSISKVR",
-             "VESTLEAPSSNGVPVALSQAEIEEQIAQAKALAAGIQQSITAGSGSASTRGQKRRAVNDRPTAEIDPLADDEDYSESGRV",
-             "VRAFRRGTRVARRRPIATTAGAVAAAGAVGAGALAWVSGGNPEVAIQTLQASMQSIGLQNLQNLGLQNLQQIGTQLGAHL",
-             "ASILPW"]
-  },
-  { "name" : "XBP1_NEUCR",
-    "RefSeqID" : "XP_962373",
-    "UniProtID" : "Q7S9W7",
-    "taxonomyID" : "367110",
-    "sequence" : [
-             "MLNQNPGLKDIAYSITGGAIKAQGYWMPYACAKAVCATFCYQIAGALIPLFGPDFPSECISPGEPRYGIMIIKPELISDT",
-             "MRKAQELYRRYGNWGGGCTSSSPARRPLRTASSGSQERHHHHPYPNQEHLDHQQQQQRTVCSRRCPAEENSCVDARPQLR",
-             "GISAPMPPAGEWTPPLLRSSAGRPRPVMPTSTHSSISYPERAPHRSAWTAVNHQPPNNSLDRYSLKRPLPSNEPDESVSH",
-             "SNWPSRSQAPNPWLTAIPRSPRKTSSSPWASQPGSASRSRAGSIDSMASQHPQGLPSPSLILSSPSSSMVSLSSSNSPSP",
-             "RPQLPPISQLCSLPVPSGRRRLPNGRPSRVGGDATSSHSRQDHSTCGAYQFSAGYQRALTPPSSTSAPMHWRSQRRPSLQ",
-             "DQHEHEHIEDTQPRRIAVEANMECGDDNESHLHLPLPLPRTSSSASIVADKNANDTTSDNSSSRNFNSASIGSGRDDGQT",
-             "SLAARKTAALTLLHLRQQEEEKEAAAAAAAAAAAAYSSTKRPESPSSSLSSPVSPPPTSGQPSPTLSAVVTATNLRRGTT",
-             "TATATAVIDTTEPLAPPPSPSSNYLGSPISTSIASSSSSFSPSTSCNGTRENSVVANEMTRYAGQEADAGGPRHCNGDAD",
-             "DEGDYEHEQQYRRKRRRLLLVGRAKSF"]
-  },
-  { "name" : "XBP1_SACCE",
-    "RefSeqID" : "NP_012165",
-    "UniProtID" : "P40489",
-    "taxonomyID" : "559292",
-    "sequence" : [
-             "MKYPAFSINSDTVHLTDNPLDDYQRLYLVSVLDRDSPPASFSAGLNIRKVNYKSSIAAQFTHPNFIISARDAGNGEEAAA",
-             "QNVLNCFEYQFPNLQTIQSLVHEQTLLSQLASSATPHSALHLHDKNILMGKIILPSRSNKTPVSASPTKQEKKALSTASR",
-             "ENATSSLTKNQQFKLTKMDHNLINDKLINPNNCVIWSHDSGYVFMTGIWRLYQDVMKGLINLPRGDSVSTSQQQFFCKAE",
-             "FEKILSFCFYNHSSFTSEESSSVLLSSSTSSPPKRRTSTGSTFLDANASSSSTSSTQANNYIDFHWNNIKPELRDLICQS",
-             "YKDFLINELGPDQIDLPNLNPANFTKRIRGGYIKIQGTWLPMEISRLLCLRFCFPIRYFLVPIFGPDFPKDCESWYLAHQ",
-             "NVTFASSTTGAGAATAATAAANTSTNFTSTAVARPRQKPRPRPRQRSTSMSHSKAQKLVIEDALPSFDSFVENLGLSSND",
-             "KNFIKKNSKRQKSSTYTSQTSSPIGPRDPTVQILSNLASFYNTHGHRYSYPGNIYIPQQRYSLPPPNQLSSPQRQLNYTY",
-             "DHIHPVPSQYQSPRHYNVPSSPIAPAPPTFPQPYGDDHYHFLKYASEVYKQQNQRPAHNTNTNMDTSFSPRANNSLNNFK",
-             "FKTNSKQ"]
-  }
-]
+[
+  { "name" : "68476_WALME",
+    "RefSeqID" : "XP_006957790",
+    "UniProtID" : "I4YDD8",
+    "taxonomyID" : "671144",
+    "sequence" : [
+             "MKEEKEKTPPNNITGPPTPAQNILHSTPAAFGTAGTVGQGAGGFGSQLYQSPYVDSQQSVIGSPVTPAPLPKKATLKTPQ",
+             "PRIYSAVYSGVGVYEAMIRGIAVMRRRADGYMNATQILKVAGVDKGRRTKILEREILAGLHEKIQGGYGKYQGTWIPFER",
+             "GRELALQYGCDHLLAPIFDFNPSVMQPSAGRSAKSPSKKRQNSIVLSPTQERHQSSIIALNTARASGIYVGGADDPNDDG",
+             "LSKKEKSPVKKSKYDEVPVNVSKRPYVPPPGTNAHILTRTQQSLTALFQQPTTNSDFIPEAVAILDTTSGALHPDLAIDE",
+             "LGHTALHWAASLGRISNVQQLIKKGADMKRGNIEGETPLERSVLVNDNYDKKTFAYLLQELGSSIRVVDRTGRSILHHIA",
+             "LIAAVNGRSMSAKYYMENVLEYIARYENGEFKSLVDLQDEHGDTALNISARVGNRNLVKMLVDAGANKTVVNKLGLKASD",
+             "FGVEHETLNSVTGDEMLSNLQPPPPLNVDSSASVLENIHNLLNGITQQYTDETSGKNALLFEIQAELKQHSHELADVRKE",
+             "IQYWQNKATQMAEVDQKIKNINEAIENEKVQTWSLLGEANADKMEGIETSSSSNTSEIKIPTGDNEESLKQLRKLSKWLE",
+             "GTQKLTEERVASIDGLSASKEVKYKSIVSVCTGVPVNEVEGMLAQLLEAMESDANADLNKVQEFLAREC"]
+  },
+  { "name" : "00846_COPCI",
+    "RefSeqID" : "XP_001831299",
+    "UniProtID" : "A8N8X1",
+    "taxonomyID" : "240176",
+    "sequence" : [
+             "MQASTRPPGSNQPPVKIYNAVYSSVQVYECMVRGIAVMRRRNDSYVNATQILKVAGVDKGRRTKILEKEILPGKHEIVQG",
+             "GYGKYQGTWIPLERGRDIAAQYGVAPLLSPLFDFQPSTNSLGALPVSTPGGTASPRPLSASSSYSSMGVAGQYIPSSIPS",
+             "NLPPAPIMPGSALRLLNQGRAQGLFTPSTTSATLRPAGYHSPGPYGTSYAPSPQPQSSQTPPPGSGLKRNRSEAEVEGYH",
+             "SQPHDVQMADAPPPNTASQPNEDNPSPAKRLRTDGSITTEPASSQGQWQQQQPLPYASQQRSGPGLSQLSGHNGHGSSRP",
+             "PSSLSAPNGNRPAHTNPEDQTRKTRFSSKPSMPRGMDPHMPFKDARRSALIALICHRDDPTSVIDLLREISADHLNPPSF",
+             "DVDTVLDDQGHTALHLAASMARTQTVDMLIQTGADMHRGNHLGETPLIRACLATPNSDQQSFATLVNYLHDSIWTLDTSK",
+             "KSVVHHIVSLAGVKGRAVVARYYLDQIFYWIAQHEGGDFRSLVDLQDEHGDTAINIAARVGNRSLVRTLLDVGANRVLAN",
+             "KLGLRPGDFGVETEELSSGLRAEDLISSLRTGPPAPVQKSQDVIADMTSMIQSLSTEFQAEIKSKQDSLDVTQAHLRAAT",
+             "RELSEQRKQIQTWQARCGDLDQINQRVRNVEKAIAEEDMFDWTGRTELDGKDGKEKGGPAFAYRGSKSTMVGVGGSVDVS",
+             "FSVESEPPLPTTDTAASLVKLRRLKMWHQRMEELVKGRLKGLQGASAEKEYQCKKIVALCTGIPLDKVEEMLDNLVIAVE",
+             "SEAQVVDIGRVSGFMQKVRDGII"]
+  },
+  { "name" : "8533_BIPOR",
+    "RefSeqID" : "XP_007691662",
+    "UniProtID" : "W6ZE71",
+    "taxonomyID" : "930090",
+    "sequence" : [
+             "MSTSHSFPAASPSHQQSALYANSPHGHALMAAPAALNRSFSDMSAFHHHAMDKPQIYTAVYSGVSVYEMEVNRVAVMRRR",
+             "SDGWLNATQILKVAGVDKGKRTKVLEKEILTGEHEKVQGGYGKYQGTWINYRRGREFCRQYGVEDVLRPLLDYDITLDGS",
+             "HAPGHAIETPTKEQAMAANRKRFYTQSIDGRTTTQNLTGTFFSNISSTATSALAAMNKVARLNSPAPRPSSSSQRRTSAT",
+             "RPSQSQPPLASQDSFRTSSQQSITSEPSFAGHNGQTDSAYATAVDESQEPPRKRIRASHDDSYSQPTAADMSIHPLSSPT",
+             "EPSESFDQHHPAQPITLADGDVPTALPPLPYPDTKQDEEKQAMLTDLFADQTRSDFTNHPAILHLSGPDLDMPIDNSSNT",
+             "ALHWAATLARVSLIRLLVSKGANMFRGNASGQTALMSAVSVNNSLDHSCFPETLEILAPLIELRDSQGRTILHHIAVTCA",
+             "IKGRAASSKYYLEALLEYLVRSNIGGGQPPPFHDTSNHSKPIGLMRFMQEMVNARDKAGNTALNLAARIGNRNIISQLME",
+             "VQADPTIPNHKGTRPMDFGVGTDLGDGQGIITATSPTKAKAPLSKAEETSREIQPLMSGILQSASLQFTQEARLKQDAID",
+             "QTNELITQLSSQQKQEQQKLQTLRARLRQRQDRAKRISNLKRWLEPQRHMLSVNDGAIDLHDKKRIGYADTQGAGLLIKE",
+             "DDLPYELRQAGDHLDRRASDGPIYLSTSVPLDPSTLSQVSHQPQCQNFLLQQLPAASVLRQRIETYTATNTALLKRSRML",
+             "KEKDGQLEMMYRKVVSLCTKVEENRIEECLEGLVAALDSEEGEGVEVGRVREFLRKVEGVD"]
+  },
+  { "name" : "PGTG_02039",
+    "RefSeqID" : "XP_003320997",
+    "UniProtID" : "E3JX03",
+    "taxonomyID" : "418459",
+    "sequence" : [
+             "MAAHKTTNDIPVSSSHHINPESGTGTSSTQAFPIPNIKNNPHVYMAVYSSVPVYEMMVRGIGVMRRRSDSYMNATQILKV",
+             "AGLDKSKRTRILEREIIQGEHEKIQGGYGRYQGTWVPFTRAQELATQLNVAQLLAPLFDYRPEPNSEVNIRSTNTKPSSS",
+             "ASRANSHKTTLARQTSRQSLNEKRERSGDTTPLPHDPPEAGPSKRSRLNTPSRQSNGSANTPSSLIDHSHSAMDPDFIIP",
+             "HSQSQPTAASQCTTSTFAPIHGATVEYPAGPSHLRKSNSSSRSHLEVALKAERNIHTLMALFSNPPDGDELESETHHENP",
+             "NSVAEVNEVLEDPELEIDTPIDEHCHTALHWASSLARLGLVRAFLRSGADVNRGNDVGETPLMRSTLVTNNFERESFNQL",
+             "LELLHPSLWTLDNQDRTVLHHICLTASIKGRGESSRYYLECICEWIVNKHGAQFDSQLFDAVDLNGDTALNIAARVGNKH",
+             "LVRMLLDVGADMTIGNNLGLKPIDFGVGAGETSASYTDDMISAPLRRNPTASAPARSSRDIITSITSSVNSLSEDFENEI",
+             "RSKTDRLESVRAQLMVATRQLTTQRRQLESLKHDLDERALLELRLKKLRMAIAEEDGFDWTGRSDLDGRPAQAGKLFEQN",
+             "GIASTLAGLSASQIQLELEPDPFIPPENNQDSLVYLRRLEKWYVRVLSLLRERIGRMKGSNLEQEAKYLKVIGSFIGNTC",
+             "TNDLSSSGSSMTGRPANQTTSTTQEVPSRATQNVNPADIHDLESMDGHRRKVSTTDAVNKSHEFGRTRSELLKASMIDNK",
+             "LLKQLMAAIESDGPELDLNRVAGFMQRVQSGSL"]
+  },
+  { "name" : "MBPA_ASPNI",
+    "RefSeqID" : "XP_664319",
+    "UniProtID" : "Q5AYB5",
+    "taxonomyID" : "227321",
+    "sequence" : [
+             "MTTSNHHQQRPSLSMSYSQGSIGSANGMSFSQSQMSSLNASQSVASTPRATPPPKSSQQSAMSFNYSNGLPNGARASFSG",
+             "FEDMNGYGTMIYHEEFKPQIYRAVYSNVSVYEMEVNGVAVMKRRSDGWLNATQILKVAGVVKARRTKTLEKEIAAGEHEK",
+             "VQGGYGKYQGTWVNYQRGVELCREYHVEELLRPLLEYDMNPNGTAASGQDSLDTPTKEQAMAAQRKRLYSGMENRSMSQP",
+             "QQGTFFQNISRTAATAVNAMSKARFESPAARGGDSRRLSVIRKPSQQMGSQDAQPPFGSQQSFYSAASDSGFASNIPTNG",
+             "RYAPQDAMSFEQEEPMEPPRKRIRSSQAFSLPIDGTSMSMSEPTPTEPNDSFYQDMEPLHHIDEGRHGLDPLPPATTPER",
+             "FQKMKLIMTLFLDKTTKDFSTHPALIQLSGEDLEVPLDEYRNNALHWAAMLARMPLVYALVKKGVNIARLNGAGETALQK",
+             "AVGTRNNLDYRSFPRLLQVLAPTIDMVDRSGRTILHHIAVMAATGHGGHVSAKHYLEALLEFIVRHGGTSLNQQSNGTAS",
+             "QPGMPLSNEVITLGRFISEIVNLRDDQGDTALNLAGRARSVLVPQLLEVGADPHIPNHTGLRPADYGVGVDMVDGSSQPA",
+             "GSRSDTFLAQLAKTRKEILEATTAQVTAIVQETLGTFDKELAASLTSKQEKFDHWHAKIRESAKARQIEQKQLDELKRRS",
+             "IDRTETSRRLKNLEKSSTDLLEAHKEILTNLGDTSKPVSLGDADQESGFEIAEFEALFPETFDPASGFSEAQIAYLRKLP",
+             "SAEILEQRVSCYRAFNKETLDEIDALRSKNVVLGQNYRRMVMACTGWSAEQVDEAAEGLTQCVKELNDNPVPEDEAIEIL",
+             "MRDRGQDW"]
+  },
+  { "name" : "05520_CRYNE",
+    "RefSeqID" : "XP_570545",
+    "UniProtID" : "Q5KHS0",
+    "taxonomyID" : "214684",
+    "sequence" : [
+             "MEPPSNPIQPPVTPSHHSLLSAISPALSEQTPAPIHTLPPHLRPSIPQPHIAPPRPSSVQPTMEEQQRMHHIQQHQQQQH",
+             "FQQQQNDENVFGSVMGAPGHVPGHEAPMSTQPKVYASVYSGVPVFEAMIRGISVMRRASDSWVNATQILKVAGVHKSART",
+             "KILEKEVLNGIHEKIQGGYGKYQGTWVPLDRGRDLAEQYGVGSYLSSVFDFVPSASVIAALPVIRTGTPDRSGQQTPSGL",
+             "PGHPNQRVISPFANHGQTTPHMPPPQFIHQGNEQMMNLPPHPSSLAYPTQPKPYFSMPLQHTVGPQYDERHEGMTMTPTM",
+             "SMDGLAPPADIARMGFPYNPSDIYIDQYGQPHATYQASPYGKESGHPSKRQRSDAEGSYIESGAAVQQHVEQDEEADDGL",
+             "DNDSTASDDARDPPPLPSSMLLPHKPIRPKATPANGRIKSRLVQIFNVEGQVNLRSVFGLAPDQLPNFDIDMVIDDQGHS",
+             "ALHWACALARLSIVQQLIELGADIHRGNYAGETPLIRAVLTSNHAEAGSFTDLLHLLSPSIRTLDHAYRTVLHHIALVAG",
+             "VKGRVPAARTYMASVLEWVAREQQANNTHSITNPPNPADRNELAPINLRTLVDVQDVHGDTALNVAARVGNKGLVGLLLD",
+             "AGADKTRANKLGLRPENFGLEIEALKISNGEAVMANLKSEVSKPERKSRDVQKNIATIFESISSTFSSEMLAKQTKLNAT",
+             "EASVRHATRALADKRQHLHRAQEKLATMQLFEQRSENVRRIMDAIAAGTLLTPAEFTGRTQTMHEKSTGQLPPLAFRHVP",
+             "GLALDASSQSQLNGAPPSTPLSVEDQEDIALPERDDPECLVKLRRMALWEDRIAEVLEDKIRAMEGEGVDRAVKYRKLVS",
+             "VCAKVPVDKVDSMLDGLVAAVESEGQGLDFSRASNFVNRIKATKS"]
+  },
+  { "name" : "RES1_SCHPO",
+    "RefSeqID" : "NP_595496",
+    "UniProtID" : "P33520",
+    "taxonomyID" : "284812",
+    "sequence" : [
+             "MYNDQIHKITYSGVEVFEYTINGFPLMKRCHDNWLNATQILKIAELDKPRRTRILEKFAQKGLHEKIQGGCGKYQGTWVP",
+             "SERAVELAHEYNVFDLIQPLIEYSGSAFMPMSTFTPQSNRKPTEAYRRNSPVKKSFSRPSHSLLYPYTSSNNMTSTSRMS",
+             "GIHDALSLQSDFTRSPDMPSDSFTGSLHDIKASPFSSNNYAQSLLDYFLLPNTTQPPDFVYDRPSDWDVNAGIDEDGHTA",
+             "LHWAAAMGNLEMMHALLQAGANVVAVNYLQQTSLMRCVMFTMNYDLQTFEVVSELLQSAICMNDSFGQTVFHHIALLASS",
+             "KSKMEAARYYMDILLQNLTATQSVDVAAQIINLQDDHGDTALLICARNGAKKCARLLLSFYASSSIPNNQGQYPTDFLSS",
+             "KDMSFPENDDSPLNSKIEDNLIDNLKYPQSLDDHLSSKKPISYFSNKLTHQTLPNVFTQLSELSKCHEASLAEKQLTYNL",
+             "AMEALEQTVRETETCQRLWNERTNNDENYLVNQREDLIHQCKKFLHTLKTARYYLETVQLHQLKKYVTYFSQIWSTDELA",
+             "DISETKNLVGHDTKTNRSSLSSKHEVDLFTAENEAAREKLVEQLCSLQAQRKQKINEILNLLSMGMYNTINTDQSGS"]
+  },
+  { "name" : "CDC10_SCHPO",
+    "RefSeqID" : "NP_596132",
+    "UniProtID" : "P01129",
+    "taxonomyID" : "284812",
+    "sequence" : [
+             "MASANFIRQFELGNDSFSYQKRPEDEPSQPLSNRNINKLNDSSTLKDSSSRIFINSQVLRDGRPVELYAVECSGMKYMEL",
+             "SCGDNVALRRCPDSYFNISQILRLAGTSSSENAKELDDIIESGDYENVDSKHPQIDGVWVPYDRAISIAKRYGVYEILQP",
+             "LISFNLDLFPKFSKQQQIESSSISKNLNTSSFNTRSPLRNHNFSNPSKSSKNGVHTINNMQSSPSPSSSFLLPLTQIDSQ",
+             "NVKRSNNYLSTSPPILEQRLKRHRIDVSDEDLHPSSQLNDNEASSLFPDTPRLNHSLSFVSLVSSLPPLDQNIMQDYHTS",
+             "KDILTSIFLDVNFADSSALEAKLSDSLDLDVPIDELGHAALHWAAAVAKMPLLQALIHKGANPLRGNLTGETALMRSVLV",
+             "TNHLNQNSFGDLLDLLYASLPCTDRAGRTVVHHICLTAGIKGRGSASRYYLETLLNWAKKHASGNNGYMLKDFINYLNHQ",
+             "DKNGDTALNIAARIGNKNIVEVLMQAGASAYIPNRAGLSVANFGIFVENALKQPEDSKQTKVSLMSENLSSKEKTAVPPR",
+             "QKSRDIIASVTDVISSLDKDFQDEMAAKQSMIDSAYTQLRESTKKLSDLREQLHVSETQRTLFLELRQRCKNLMTSIEEQ",
+             "KSELSNLYESFDPNGIHDSLSLDADAPFTVNENNNKNLSIAELKFQVAAYERNEARLNELANKLWQRNSNIKSKCRRVVS",
+             "LCTGVDESRVDSLLESLLQAVESDGQQGEVDMGRVAGFLRVVKEHQA"]
+  },
+  { "name" : "05338_USTMA",
+    "RefSeqID" : "XP_011392041",
+    "UniProtID" : "A0A0D1BWD8",
+    "taxonomyID" : "237631",
+    "sequence" : [
+             "MPLNYFANQDQTASDTYAHEASSFPAPSSILTDTSKPLQPVQEVAASSLVDGVSFTSPHASIIHASKQSPRAASSLSFTT",
+             "SALQRAGLLPANPNMSTTATSGTSAASESLQRVITQGTASAAAINGASTPAHSGPLTPAHLKNLTPAQANAALQNPVGNI",
+             "PTVYLATYSNVPVYEITVRGIAVMRRRGDGWLNATQILKIAGIEKTRRTKILEKSILTGEHEKIQGGYGKFQGTWIPLQR",
+             "AQQVAAEYNVSHLLQPILEFDPATADQIPKLYQRKKPAASARNSSASAINDARGSTPSKIYSPAPASLGGPSQQPRFLSL",
+             "RPPKETHEQEISSAIFMPPGTAGLLSNGTFVDDRAASALAYPGPPAIPPGSTPAEQAALRSYNVYGYTPQGVPLPSSAAA",
+             "DGNGTEAAATAASTGAGKREASETDQDGASAAKRSRLTSPQQQRRDDGLLLGPSPVKDLNALGPAGGSLRAASAPRGHRI",
+             "TVGPPDAAGRDGAVPRYADRALPPKPYDEGEKRMRDRLVSLFSDDGVLPGVSEATGAGASQSAADEDDDAYVAKLDSLLA",
+             "DLREKASLGGLGASGTDGPKATVDLITDDHGHTALHWASALCRVKLVRTLVARPPWQGGANIHAGNHAGETALHRSVLVT",
+             "NSYDASSFPTLLNLLSSSLNTRDFKKRTVLHHISLVAALKGRAASARYYLACVLEHISAEKNSKYKGLIDAQDEDGETAL",
+             "GIVARLGNASMVRMLLDVGARKDLANALGIRPSDWGIESSADGASLTPSQNDGTNTVASLPPLTAADLASQNPSDIISAL",
+             "TRPAQVPVMKSSDVRDQLSSTLDDLQSSFERELKEKQDAVSTVQSHLQAATRDLAARRKTVSAAQAKLAEKDEARQRVQN",
+             "LRRAIVAQLGLEEADADLSLEQLVEEAANAASAAPADKSADKMDIDGAEDVKPVRASNLETLIDDILSFDTIQSDLKAVG",
+             "TSAVTQEVVEQDELVRLRWLVSFYQSSCDELSSTISELEDSSAKKESQCQQVVAICANIPQDKVESMLDELLTAMESDGP",
+             "DVDLARVANFMQKVGKTRENGDQPGVGAQLSSSTSLSTAVSSGGTAASSVVPAVERDGEDAKPDA"]
+  },
+  { "name" : "SWI4_SACCE",
+    "RefSeqID" : "NP_011036",
+    "UniProtID" : "P25302",
+    "taxonomyID" : "559292",
+    "sequence" : [
+             "MPFDVLISNQKDNTNHQNITPISKSVLLAPHSNHPVIEIATYSETDVYECYIRGFETKIVMRRTKDDWINITQVFKIAQF",
+             "SKTKRTKILEKESNDMQHEKVQGGYGRFQGTWIPLDSAKFLVNKYEIIDPVVNSILTFQFDPNNPPPKRSKNSILRKTSP",
+             "GTKITSPSSYNKTPRKKNSSSSTSATTTAANKKGKKNASINQPNPSPLQNLVFQTPQQFQVNSSMNIMNNNDNHTTMNFN",
+             "NDTRHNLINNISNNSNQSTIIQQQKSIHENSFNNNYSATQKPLQFFPIPTNLQNKNVALNNPNNNDSNSYSHNIDNVINS",
+             "SNNNNNGNNNNLIIVPDGPMQSQQQQQHHHEYLTNNFNHSMMDSITNGNSKKRRKKLNQSNEQQFYNQQEKIQRHFKLMK",
+             "QPLLWQSFQNPNDHHNEYCDSNGSNNNNNTVASNGSSIEVFSSNENDNSMNMSSRSMTPFSAGNTSSQNKLENKMTDQEY",
+             "KQTILTILSSERSSDVDQALLATLYPAPKNFNINFEIDDQGHTPLHWATAMANIPLIKMLITLNANALQCNKLGFNCITK",
+             "SIFYNNCYKENAFDEIISILKICLITPDVNGRLPFHYLIELSVNKSKNPMIIKSYMDSIILSLGQQDYNLLKICLNYQDN",
+             "IGNTPLHLSALNLNFEVYNRLVYLGASTDILNLDNESPASIMNKFNTPAGGSNSRNNNTKADRKLARNLPQKNYYQQQQQ",
+             "QQQPQNNVKIPKIIKTQHPDKEDSTADVNIAKTDSEVNESQYLHSNQPNSTNMNTIMEDLSNINSFVTSSVIKDIKSTPS",
+             "KILENSPILYRRRSQSISDEKEKAKDNENQVEKKKDPLNSVKTAMPSLESPSSLLPIQMSPLGKYSKPLSQQINKLNTKV",
+             "SSLQRIMGEEIKNLDNEVVETESSISNNKKRLITIAHQIEDAFDSVSNKTPINSISDLQSRIKETSSKLNSEKQNFIQSL",
+             "EKSQALKLATIVQDEESKVDMNTNSSSHPEKQEDEEPIPKSTSETSSPKNTKADAKFSNTVQESYDVNETLRLATELTIL",
+             "QFKRRMTTLKISEAKSKINSSVKLDKYRNLIGITIENIDSKLDDIEKDLRANA"]
+  },
+  { "name" : "SWI6_NEUCR",
+    "RefSeqID" : "XP_962967",
+    "UniProtID" : "Q7SBG9",
+    "taxonomyID" : "367110",
+    "sequence" : [
+             "MQPPQLGGASQQSQPSSQQSFSMSQSSQSVYRQYTDPPNRLHNDHAVPTIYSATYSGVGVYEMEVNNVAVMRRQKDGWVN",
+             "ATQILKVANIDKGRRTKILEKEIQIGEHEKVQGGYGKYQGTWIPFERGLEVCRQYGVEELLSKLLTHNRGQEGETGNVDT",
+             "PTKEQAMAAQRKRMYNASSQENRGIGSTGTFFKNISSTASTAVAAISKARFDSPAPRNRSGPSRAPSFNRQSSMQDVADF",
+             "PNSQQSLVSTEYATQTQNADSGFGSQTTQPLAGDGLEQPPRKRQRVLTPARSFGGQTPGHQPLDPFNAGNIANGDSGSPT",
+             "EPSNSFNYDQVTANDGDASYALGPLRPLPYENNADAEAKRGMLMGLFMDANGPEEAIQAALCNVSPQELDSPIDTQSHTA",
+             "LHWAATLSRMPLLRALIHAGANPWRVNACGETALMRACTVTNSMENNTFPELLDLLGCTLDVTDDKGRTVLHHIAVTSAV",
+             "KGRHYASRYYLESLLEWVVRQGSAPSSQENGIGDRKGRRMGIARFMSEIVNAQDNSGDTALNVAARVGNRSIISQLLEVG",
+             "ADPTIPNRANLKPLDFGIGIADAETNDDPAQEKTGATTGSGHKSRETSDEVVRSITHLIGESASIFQNELKKKQESIDTL",
+             "HSQLRVTSSQVGDARRTLESLQEKLKAQQLAKQKIVNFNRACEEEEQILIELEQRHGRLDVASANAWEMELESALEIVKT",
+             "QSPKGLDPDSRPSLPSAAVLRARIKALRARSSKTRQAVAALQAQSKEKELKYRRLVSLCTRRPEIEVEALLDTLTRAVES",
+             "EKPELEIARVRRFLGGVEGVVH"]
+  },
+  { "name" : "15042_USTMA",
+    "RefSeqID" : "XP_011388143",
+    "UniProtID" : "A0A0D1CVS5",
+    "taxonomyID" : "237631",
+    "sequence" : [
+             "MSTASPLHHGHGNGSYANSPAPTGVTGRDAGVAAAAVADSAVRSGSVPASASGSAPGSASGSMYGEAHTQHHTGHHHYSA",
+             "HHTHSHGALTSPVNGGHSSSWSPYGYPAAPVYGGSPSPYGHNAYSQYASGYGYANGTAHHVATAPTTPSATSTAYHTGVN",
+             "GMMMHHGQHAGYGYSSHHLGSHTPTHTHTHSSAYFMNGDGAHSHLNSSAHLTSPSYTTAPQYSTQLPLAGRHRVTTTLWE",
+             "DEGTLCFQVDARGVCVARRHDNNMINGTKLLNVCGMSRGKRDGILKNEKERIVVKVGAMHLKGVWISFARAKQLAEQNGI",
+             "ADALYPLFEPNIQSFLYHPDNYPRTAAVIAAAQERQAQRQRAPGGQPSPGANGTSQAPPLMRANTTPSNGDTSTFSSGLS",
+             "SLGSWTGSHDQGHASAPTTAQPSPSSMHNGATQMHMSLSNHGTASPTYAQSQQQQQQQQQQQQQQQQQQQQQQQQAYPMT",
+             "AAQQLARPSVGDRRQSAPISLNNSVGHAENPYGATNLGGAANGGLVNGARKVSGLKRSWNDADDLNGSAAASPTERDMQR",
+             "SGSGGSNGLKLDGDDLHSPDSSDDRLAKKTRGMPQRGGGATTAMPSMSTNMLMGVGNGSGIHHE"]
+  },
+  { "name" : "04778_USTMA",
+    "RefSeqID" : "XP_011391646",
+    "UniProtID" : "A0A0D1DQM4",
+    "taxonomyID" : "237631",
+    "sequence" : [
+             "MNQAPLSATGVNFYISGPRPARLFPTPIHEFRKGKYATAGGESGFMTVFEYDVRGHTMMIDVDTSFVRFTSITQALGKNK",
+             "VNFGRLVKTCPALDPHITKLKGGYLSIQGTWLPFDLAKELSRRIAWEIRDHLVPLFGYDFPSTCLRPDSEGFGQLAIGMS",
+             "QKRARKRHNNGGPHQTSCYGPSLPISIELWQHSTDPLRDLGESSVVGGQAIEHVSAKNSAVQPCYGSSQPATFHYSKGYG",
+             "LESRPWYGQDYLESNSLESMWNSAQAGGGSVGLQVPISTCGATASPCLAAIGANGGSPILSSPPSSNASSSSNQSYTAAG",
+             "YGLMVPPTVPSHSVNSEAGANQAEGPTPIDGSRSYASLTAHGYATGYGDANASLSTWNDATHASTFTLHVHAHVHFQPPD",
+             "PESAQLFTIHDFGSDPFYAEQVERG"]
+  },
+  { "name" : "STUA_ASPNI",
+    "RefSeqID" : "XP_663440",
+    "UniProtID" : "P36011",
+    "taxonomyID" : "227321",
+    "sequence" : [
+             "MASMNQPQPYMDVHSHLSSGQTYASHPATAGALTHYQYPQQPPVLQPTSTYGPASSYSQYPYPNSVASSQSVPPPTTSIS",
+             "SQVPAQLLPLPVTNHPVPTHGYGNNSGTPMQGYVYDPTGQMAPPGAKPRVTATLWEDEGSLCYQVEAKGVCVARREDNGM",
+             "INGTKLLNVAGMTRGRRDGILKSEKVRNVVKIGPMHLKGVWIPFDRALEFANKEKITDLLYPLFVQHISNLLYHPANQNQ",
+             "RNMTVPDSRRLEGPQPVVRTPQAQQPPSLHHHSLQTPVPSHMSQPGGRPSLDRAHTFPTPPARMNSSVPNTQPLSIDTSL",
+             "SNARSMPTTPATTPPGNNLQGMQSYQPQSGYDSKPYYSAAPSTHPQYAPQQPLPQQSMAQYGHSMPTSSYRDMAPPSSQR",
+             "GSVTEIESDVKTERYGQGTVAKTEPEQEQEYAQPDSGYNTGRGSYYTTNPSVGGLAHDHSQLTPDMTGSPQQNGSGRMTP",
+             "RTSNTAPQWAPGYTTPPRPAAASSLYNIVSDTRGTSGANGSTSDNYSVASNSGYSTGMNGSMGSNKRMRDDDDDRIVPPD",
+             "SRGEFDTKRRKTLTETPVGGPVGGVPLGLQPMKAGGSLISARR"]
+  },
+  { "name" : "STUA_NEUCR",
+    "RefSeqID" : "XP_960837",
+    "UniProtID" : "Q1K6U0",
+    "taxonomyID" : "367110",
+    "sequence" : [
+             "MNPNTPADVYYGQMSQGSSMPVTTVPSHSHYASQQPPPLLQPGSTYAHQYGTPQYGYANALSSPASIPPSLPPSMNSMAG",
+             "QSVLPLPGSGSMNPAVYASGGFDTTGQVAPPGMKPRVTATLWEDEGSLCFQVEARGICVARREDNAMINGTKLLNVAGMT",
+             "RGRRDGILKSEKVRHVVKIGPMHLKGVWIPFERALDFANKEKITELLYPLFVHNIGALLYHPTNQSRTSQVMAAAEQRRK",
+             "DSHGQLRGPPGLPSLQQHHHHHSMLPGPPSLPSHPSMGRPALDRAHTFPTPPTSASSVMGPMGNSDGYQWSQQSMSGTQG",
+             "NSSLSLDTSLGSNARSMPSTPATTPPGSTIQSMQNYPPVSQSYESSRQMYQGQSAQQAQYQSQQHYSSQPQHQERPVYSQ",
+             "SSYIKNDMGPPSGRPTGQSNDASDSKPPTGMIHQGQGQSDPGTHAGSEEDDDANNEAEYTHDSGGYDANRGSYNYNTQAV",
+             "NSLPHDHGLAPEIGGSPHQAGSGRATPRTAAAPSSYYSAQGYHTPPRGQPSSSLYNVMSNERTGSNGTQGNEMYAGQADM",
+             "PSSLPNGYSAQPSVMNGSSGGLKRGRDDDDDGGRPTTSAPNLGPGMDMKRRKTMMDGGSLPSPTYTATIAQAAPSAIAAH",
+             "RRR"]
+  },
+  { "name" : "PHD1_SACCE",
+    "RefSeqID" : "NP_012881",
+    "UniProtID" : "P36093",
+    "taxonomyID" : "559292",
+    "sequence" : [
+             "MYHVPEMRLHYPLVNTQSNAAITPTRSYDNTLPSFNELSHQSTINLPFVQRETPNAYANVAQLATSPTQAKSGYYCRYYA",
+             "VPFPTYPQQPQSPYQQAVLPYATIPNSNFQPSSFPVMAVMPPEVQFDGSFLNTLHPHTELPPIIQNTNDTSVARPNNLKS",
+             "IAAASPTVTATTRTPGVSSTSVLKPRVITTMWEDENTICYQVEANGISVVRRADNNMINGTKLLNVTKMTRGRRDGILRS",
+             "EKVREVVKIGSMHLKGVWIPFERAYILAQREQILDHLYPLFVKDIESIVDARKPSNKASLTPKSSPAPIKQEPSDNKHEI",
+             "ATEIKPKSIDALSNGASTQGAGELPHLKINHIDTEAQTSRAKNELS"]
+  },
+  { "name" : "08099_COPCI",
+    "RefSeqID" : "XP_001836714",
+    "UniProtID" : "A8NVH3",
+    "taxonomyID" : "240176",
+    "sequence" : [
+             "MSTGMLQETLQTTSASTSGTRFRPYASPNHQVTKGRYITSNDPRGYIPVYEYPLNGQWIMMDIDDGYILWTGIWKALGNS",
+             "KADIVKMIDSQPDLAPLIRRVRGGYLKIQGTWMPYEVALKLSRRVAWPIRHDLVPLFGPTFPSTCLSPDQPGYGQVVASS",
+             "NVRRRARRNTQATAQPPREAHSNWTVMTPGPMVGLSFPHSQFSRPPLPPLAPTPARSPSDYAPSSHYGNQLDPQDARRYS",
+             "HSPYSPLASPPERKSSISSKALSLEIPPVRPSSSKAREDISLPPLKQPDGADPEMSPYALPPISALEDLRGVDTQDSAAV",
+             "LRRLRLDDDYPSSSRSSTSQDSIWGRRHSLSAHSPHPRSSDNSRFQPYLSSRSYQDSTLKRSRSPAESYADRRRASDFSQ",
+             "EDSTSAYSPISPATPNSSILSHSSFSDLKKLASSTDTRYNFPRISGRDWAPLKGDTDHIRSSYRSGPSPLELDSDSESSA",
+             "PHRPW"]
+  },
+  { "name" : "68479_WALME",
+    "RefSeqID" : "XP_006957792",
+    "UniProtID" : "I4YDE0",
+    "taxonomyID" : "671144",
+    "sequence" : [
+             "MTNKVQELWWEENKTRVWQVEVDNGNYVARRQDNDQINGTKLLNITKITRGKRDGILKNEKSRQVVKTGTITLKGVWIPF",
+             "ERAIILARQFNIEQQLYPLFETNLGDYVENSIGSHQIKRKSLNNLMDSLTTNRELVSKRRSTVSTYNPATSAYVSPYGFS",
+             "PQHCYQTEFEDMNQHSGEIQSGRPRNTSSASDWMTNWSTSSSSPVIPATPNTFSPVMNTFQSLALHSPPIPIPNYYYDSS",
+             "SSYFPSYHQKQQQQQVQMQMQMHTTASIGGDRQSNEYIQR"]
+  },
+  { "name" : "11943_PUCGR",
+    "RefSeqID" : "XP_003330006",
+    "UniProtID" : "E3KMR2",
+    "taxonomyID" : "418459",
+    "sequence" : [
+             "MAAAPTSSFLTSMSAQPPRTVQALVNEEVRAPPPVRLYPSQHRVSMTRYATSTDPRGYIPVFEYPLNGQYIMIDCETGMV",
+             "HFTGIWKALGHTKADVVKLVESDPTIAPYLRKVRGGYLKIQGTWLPFDTAQTLARRVAWQVRYDLVPLFGPDFPDTCLGP",
+             "GEPGFGQLLLSAPKPRGRRGAKKAAAAPTVAHERTASPQDNRSQSRPGPYPSQESFGNRCSGRVEAVGAMNGYSPMLSQA",
+             "RYSPYTRAPVHRITQLEPLPSLIQPNQSCPHPTADSMYSSHYHQSPRQSMMTSHGAGPYGQQHLTGSTASGMQSTAPLPS",
+             "MRPHQAHQSENNFFETYRGPDSFEALSNKWLAPEVANPSLNDSGLLHGEGGCLPPLQYSNNPVLRNGPSGSPTNQYNFPN",
+             "QIDSAHSSHHIDSNQTQHVHRHAGFPYESQHQSNFRHDLSTEEAAHHPASPSQQPPPSVTYDKAHNSEPQAGSQAANVTA",
+             "GCYAASGSNSTGNPAGSPGSHSSHVPKSPTPSSASTSTHMQNSHNPNSHRSPSNTLTNMSNNGGFNSNTQGEEAIQFSVL",
+             "TSPAHLETSGPSENSIPPAQSSDSDWNPAQNTTGLSPSQAPRQ"]
+  },
+  { "name" : "03082_PUCGR",
+    "RefSeqID" : "XP_003321545",
+    "UniProtID" : "E3JYK1",
+    "taxonomyID" : "418459",
+    "sequence" : [
+             "MILISPTRTLPSPRPIDTDPILNYRHIQPAAAAAAVGPWLGQNQHHHHHHDTLAKSPNITTAPATHSPSELSASPAPSAV",
+             "STGSSLLDPQSVPHIKIPHSSSPPAIMLPQPSSDDDSSTAEEEQPSAQSSNATLNTPTPHTNAPHQLDSHASSVGLYDLP",
+             "PTSSSAPTTSSSSSPFPSNVPSHQQPSPYSSSPHPNQEHHPHHPHHGNQFYQQSPPALHSPLQSAHHPQQSFDARPHSSL",
+             "FAHQHYHSRPQSAPHSTSQFSLDPHVLAAAAANVEVKKWDEENTYYYQVAHKGVTVGRLKGSGLVNGTKLLNLAGISRGK",
+             "RDGILKNEKIRKVVKHGTMHLKGVWIAFDRAVFLAEQHSIADKIFPLLVVNLEHYVPIEPPLMAGGTKLGPGSLFHHHHP",
+             "RHPRLLPQPIKFPPSTISLAPASANSFSSTGGWPSGPSSALPSIGYNEPFSAPPIPRSAATADTSPSIYEQAQFQYLNSA",
+             "QANNPDLLERRHTLPNNSFHGYNSVPSFGSSQPPPPVSYSFHYNSTHVPGYPPRSSTAESATPNQFEYQSKNHNGNGNGD",
+             "AAGSYPATLYHSQPAARPVSSTTAQPSPALNSAPLLLGDLSPGSSTQIVDHGAGDFRLSTGTSNGQVKQEGDDESCNEKR",
+             "LIMEWNPSC"]
+  },
+  { "name" : "SOK2_SACCE",
+    "RefSeqID" : "NP_013729",
+    "UniProtID" : "P53438",
+    "taxonomyID" : "559292",
+    "sequence" : [
+             "MPIGNPINTNDIKSNRMRQESNMSAVSNSESTIGQSTQQQQQQQQYLGQSVQPLMPVSYQYVVPEQWPYPQYYQQPQSQS",
+             "QQQLQSQPQMYQVQESFQSSGSDSNASNPPSTSVGVPSNATATALPNGSAITTKKSNNSTNISNNVPYYYYFPQMQAQQS",
+             "MAYSYPQAYYYYPANGDGTTNGATPSVTSNQVQNPNLEKTYSTFEQQQQHQQQQQLQAQTYPAQPPKIGNAFSKFSKSGP",
+             "PSDSSSGSMSPNSNRTSRNSNSISSLAQQPPMSNYPQPSTYQYPGFHKTSSIPNSHSPIPPRSLTTPTQGPTSQNGPLSY",
+             "NLPQVGLLPPQQQQQVSPLYDGNSITPPVKPSTDQETYLTANRHGVSDQQYDSMAKTMNSFQTTTIRHPMPLIATTNATG",
+             "SNTSGTSASIIRPRVTTTMWEDEKTLCYQVEANGISVVRRADNDMVNGTKLLNVTKMTRGRRDGILKAEKIRHVVKIGSM",
+             "HLKGVWIPFERALAIAQREKIADYLYPLFIRDIQSVLKQNNPSNDSSSSSSSTGIKSISPRTYYQPINNYQNPNGPSNIS",
+             "AAQLTYSSMNLNNKIIPNNSIPAVSTIAAGEKPLKKCTMPNSNQLEGHTITNLQTLSATMPMKQQLMGNIASPLSYPRNA",
+             "TMNSASTLGITPADSKPLTPSPTTTNTNQSSESNVGSIHTGITLPRVESESASHSKWSKEADSGNTVPDNQTLKEPRSSQ",
+             "LPISALTSTDTDKIKTSTSDEATQPNEPSEAEPVKESESSKSQVDGAGDVSNEEIAADDTKKQEK"]
+  },
+  { "name" : "14426_COPCI",
+    "RefSeqID" : "XP_002911429",
+    "UniProtID" : "D6RMB0",
+    "taxonomyID" : "240176",
+    "sequence" : [
+             "MTARPPLPLRHANPSLRDGNATIPPVKYQILSCQGKDILVGRLKIDTTDGGHAFILRRFDTQAISLTTMFRAAFPTASEA",
+             "EEKDEINYVKANFDLFGNNGSSKEPHITRLAGTWVNRDTAGQLAHDYNMVDLINTMVEAEPDPNGQYRRSNKSAQNNNPP",
+             "TNAPEPTPATNVHATRSPAKQSPKPPSKTLPTPSPGSGDAQPPAPKRRREGSPATFTSGIPVASSPAVPKTPGPRRSTRT",
+             "KSPAPSRVPQPLTATKPRSRASVAPPSPKKRPVDLPKSSPIKAEEDTAVEDNVAGNELYAQDISEQKKLIADLKAAASSK",
+             "KPADTVKEDDDQQMEEEGQGPSKLKRIRQDEEKPLQFEFKEPEREERQIATNRRVGRFDMQPERKSLAWGIAAFAFGMTA",
+             "ITYLPNFL"]
+  },
+  { "name" : "BQT4_SCHPO",
+    "RefSeqID" : "NP_596166",
+    "UniProtID" : "O60158",
+    "taxonomyID" : "284812",
+    "sequence" : [
+             "MTENEKSRSLPAERNPLYKDDTLDHTPLIPKCRAQVIEFPDGPATFVRLKCTNPESKVPHFLMRMAKDSSISATSMFRSA",
+             "FPKATQEEEDLEMRWIRDNLNPIEDKRVAGLWVPPADALALAKDYSMTPFINALLEASSTPSTYATPSRPTAQKSETSEG",
+             "EPESSTSATTTSVARRTRQRLAEHLENSKKTILQHDNKEEDKEIHSEENETKDEIKSEKKEPEIKKQEGGSSTEKVGQPS",
+             "SSDDKAKGSTSKDQPSEEEEKTSDIQDRKIKTPIKPSLLGKIRSSVNKGMTDVASQVNRGMTDVASQVNKGVNGVASQVN",
+             "KGMNGVANQVNKGVTGVASQVRKPVGKLEKKFENLEKSIGDTLKSSIRSSPKSKKRSREDFEENEDYNAMVPVKRSRITK",
+             "LESEVYYEKRKVRALGGIAIGLGVGAILPFLF"]
+  },
+  { "name" : "PGTG_05590",
+    "RefSeqID" : "XP_003323688",
+    "UniProtID" : "E3K4V4",
+    "taxonomyID" : "418459",
+    "sequence" : [
+             "MPKSSSCCEPEQKQSIPTNANPISAGGAGLDIRLAGMRSAHATLRGCSFSPYMVTQHPPLRDSVNRNKQQPTNNSTNPYT",
+             "KKASRMSQTNLYKSNNPPNLPQDEFNQTLVNYQGKLRSIRIQDININGHTITIARIKIPSPEKLSSHLIKRFDTNAISAS",
+             "SFFRSAFPHSTEEEEAIQMRYLHQIYDTHTAGAVEFGSARKLTGVWVPIENAAELAEVYGLTRFAEPLLAFPNPKENPRS",
+             "PTGTKIGGEDESSTTQTPKASQQSKLTGQISVTRSSKRSRAGPLSFGNTSPSSFSLNSFNKPPTETNKSGTHDDSKSTND",
+             "ENDEKPASPTDRVAGRGARNSPSKKPTTVDENHEHTEHEDHQLIGTDELAQRAKQEALKLVSELKNSQPCTQSSLESPTN",
+             "TLETELTRTTSPAKSNKVTRKRSSDEVSFEGEEQGEDEDEERTADETATHRSFLPKLLWRKSAAQAHPNSKKHKRTQLGG",
+             "GGSSSSSSKSFVPLLTNSATPSVDDSSSTHNPNKRNLAIAGIVIAGAAA"]
+  },
+  { "name" : "06560_NEUCR",
+    "RefSeqID" : "XP_962267",
+    "UniProtID" : "Q7S9H5",
+    "taxonomyID" : "367110",
+    "sequence" : [
+             "MAQVARHLPARRNPLMLEDVPSHTDLASRRRLGQTQLTPRMVTAVPGAEVDPSSLLAFDYAHLRAPLPKGIVSGIFKSSP",
+             "PSYFLMRRSQDGYISATGMFKATFPYASQEEEEAERKYIKSIPTTSSEETAGNVWIPPEQALILAEEYQITPWIRALLDP",
+             "SDIAVTATDSSAPKQIAPPPKFFGAQPPLVAPTPPTTRSTRSRPSSRRSSSPAKSTTTSKRGTTPRNTKRTVTTEASATT",
+             "VTTTATATAVPSAETPATSFADSQAPTLINGEIPTSTPINTVPVTKIQTTEAELKVESIEKEPVVVLEPIEEEPKIKVRV",
+             "DEDVKLDKDGEEVKHTKVELEVPLMAGEPPSKEEARKMIEEAKAMVEAAVKADAEAAAALVEASKAGAEDEKAEDEAKAE",
+             "TEATKEEEADSKGKRKAEKISVDEDEKAADEAEQPRQAKRVKTEAELRKDRIRKRAYLGLTATFAVGALGALLPIITPYV",
+             "ANVL"]
+  },
+  { "name" : "81480_BIPOR",
+    "RefSeqID" : "XP_007682909",
+    "UniProtID" : "W6ZKJ4",
+    "taxonomyID" : "930090",
+    "sequence" : [
+             "MVVDRVLPERKNPLLEPTDSTSIEILIERRRLGQTNLGVKAGVSGIANATKPENMGTFDYAHLRVPLPKDLTGSGIFSRN",
+             "RMSAFPESYFLMRRSSDGYISATGMFKAAFPWASLQEEDLERKYQKTFPSAGDEEVAGSVWIAPEEALALSEEYSMRHWI",
+             "EALLDPAPIEKGGKDKSNAAIQMPPRFDVANAQPATLPTFGFRQTRARSARSVSPSKAMTPGRKYATPRKGRSTRSAMKP",
+             "DATHADDMFRPIEAVTPSTALQNSIARRIAPAETIASSIEGEVKEVEQEVKAALDAEKKPEPELEVQEGTVHIEVKQTVE",
+             "TNGDTEKTSTSVTVDVPHDHAALPEPEDPTAMIEEAKRMVAEAQKLEGGSPSVTRSSKRGIEEVLDEEDLADERLNKLAK",
+             "KAYTTEQKMTKEKVTRRALVGLGVMAAIGTAFQYFV"]
+  },
+  { "name" : "01622_ASPNI",
+    "RefSeqID" : "XP_657766",
+    "UniProtID" : "Q5BH18",
+    "taxonomyID" : "227321",
+    "sequence" : [
+             "MVRSLPKKNNPFVTPDAAPPYEELLMRRRLGKTNLAVKPTQVGTSNATKPENLGPFEYAHLRAPLPKDLKGSEIFPSHSP",
+             "QQHPETYFLMRRSKDGYVSATGMFKIAFPWAKLEEERSEREYLKTRPETSEDEIAGNVWISPVLALELAAEYKMYDWVRA",
+             "LLDPTEIIQSPSSAKKQITPPPKFELPPIQAPEALVPSSRTRSRRSASPSKKAGTPRKPRQTKAQKEAAVAATNEANATL",
+             "QSALDDTVSNADGEINGDVLPSVEDKREPETSPVKGKKAAAKAKKQAVSEEDQEDKVKIEIKSDAAEGSDVQAAQTTISV",
+             "EMPISLPEAPSAEDTQEMIAKAKEMVKEAVKLQQEPAESSATAKKRGAEEAELGEEEEDEETKTLRTKRAKVLEEKLKRE",
+             "RVRNRALMGVTAAFALAKPALVLLEA"]
+  },
+  { "name" : "05405_ASPNI",
+    "RefSeqID" : "XP_663009",
+    "UniProtID" : "Q5B225",
+    "taxonomyID" : "227321",
+    "sequence" : [
+             "MASIQFLLNPLPSLPSSDRCPLPTPSPTISSSTAMLRSPRQKKQKMAKDAPIFQRGKPRGEVRYPPYEDRDGKFSCQHQD",
+             "FRIHPLGNIADYPRHIPYNSDKKSFQERTGRESFEVFQYTFQLPGEEKQWTVMWDYNIGLVRTTHLFKCNDYSKTTPAKM",
+             "LNQNPGLRDICHSITGGALAAQGYWMPYEAAKAIAATFCWKIRFALTPLFGDNFPDLCIHPDDRARFGRMVIDPGIVRIA",
+             "TEKANLYRMLELRCSTTNSLRADYVLRPSSAPDIDRTDPNLERDRVALGRHILPKSHRHHHHRSKTSPSTNTSLVGYGSS",
+             "PEVEYYSCGTEPYCVSPESPIRSSFTPVNTPRSTDIYPSSSSTNFLRSPHELLASLSSSASIARARIERASKISGARVIP",
+             "SSVPSNVTSITTKGRDNTGHSALMEESDIDADAETDSGHEHDLDFELSSSDESSTSSTVSSSTSSASLGFAANSRNRPYR",
+             "DDDEPHRDTDEEMVDYRAPKRIATAGARDRRWGRGRRVIHQEHSDIETSRRARKHAQRSSNARLVCEMTAAHALISLLHD",
+             "ATGSDVDVDTHNRLECGRSPDGGVKNNLKGSYFGIRLNHNPSTESGQKRRRASA"]
+  },
+  { "name" : "105954_BIPOR",
+    "RefSeqID" : "XP_007691967",
+    "UniProtID" : "W6Z1H5",
+    "taxonomyID" : "930090",
+    "sequence" : [
+             "MNIQDLLNPSCGDRHDHRRSESATPPSRPVAILPALRRQKIPKDAPIFSEGNRTVGIVNFAPHEAGNDEELLAQHCRFQI",
+             "YPLGEISRKGVRHIPYNSDKKDFLEKTGRDAFEMFQYTYKLPGEDKPYVVVWDYNVGLVRMTPFFKSCKYSKTIPAKTLR",
+             "ENPGLKDISYSITGGALVCQGYWIPYQAARAIAATFCYDIRWALTPVFGNDFPSICLTPDDPSFAKFVIDPAIVRYCTEE",
+             "TTKFRELGSAYEVHRPVAPTQVEAPTSRSDQPLSTSIVRQRRARPIDIESGYGTDTERNDRCLFSPEVSPRTRFTPINRP",
+             "RSPYSPRTAESSFVSSPVSIRAPPGLHTPTSTPYEHSGEVFRAKRSHSKVAFCEHPADEAVIRPPTAATVDSAHGCEMCV",
+             "GDDNHSHLDMDAAEMLLSLRTADSAMPPSKRTRRGS"]
+  },
+  { "name" : "69819_WALME",
+    "RefSeqID" : "XP_006959479",
+    "UniProtID" : "I4Y911",
+    "taxonomyID" : "671144",
+    "sequence" : [
+             "MTSPGLPKDFNELLDKSEIPSPKWQQITRDDRPITIARLKLPHPREKHTFILRRYDCNGISFGSLFKAAYPYATDEEEKI",
+             "ESGFVKKNYDVTLVPTEEYQERKLAKLAGFWIPIAIAEELGQRYAMAEYVDALAKADTPDLTDFKKRSSNRQTSEDIKSS",
+             "PAKAQASLESPAKSASKIPTPTKNPAPRRSARHQSRSPSPSPLTHNLTPGKKKAKKAPKEAVIEESVEETIVVDKKESPL",
+             "KKALNDDQVLADIERAKDLVDDIKQSKNLSQSSPVKVVKEEVLETIQPSVSTESLEGEGKRKRELEDETGNEIKVVSFGQ",
+             "NPPANPEEIQQRPVVQRRGVAAAVGAFALGVGFAASNILPRFLF"]
+  },
+  { "name" : "02840_CRYNE",
+    "RefSeqID" : "XP_568872",
+    "UniProtID" : "Q5KM59",
+    "taxonomyID" : "214684",
+    "sequence" : [
+             "MSHPAADAPPPYPGTTDDAQYDLTPLPHTANRPRLPEDKRNPHLNNLPEDTKIVKFQTIVRENKEIVVGRIKVPTENANG",
+             "THHAFILRRYDTNAISLTTMYKVAFPSATEEEEKREMDWVKSSFDTRGTNGGRDSEVVRLAGQWVSRNLAIHIAPAYNLV",
+             "QLVAALSRAVPDPNVAYRKSQRSQAAADELARTKAKQSQAPSSVPAISNVPVRKPQAAIPSMATEISSPASKRQRKDSVT",
+             "EASGSATQTITEAQPSADTSETDDTRHITIEATTTITSPSGANVDMDAEIEQAKQLVKDLRQEIQLRNEAGDSLEDQGVA",
+             "VADDVRGVKRGKHEDEAVVISGGAGGKDRVVRTNKRIPQTAGGDVGQRFGWGAFVFSIGLGASLTLFSQYASSLL"]
+  },
+  { "name" : "11055_USTMA",
+    "RefSeqID" : "XP_011390537",
+    "UniProtID" : "A0A0D1DZM8",
+    "taxonomyID" : "237631",
+    "sequence" : [
+             "MPAAASARKSTPTRKSTPRRARSSSVTSNASTGVPASPSASPRKTKKQKEAAAAAAAAVAAAAATAEQVNDDESDLLRPK",
+             "LPTKRNPRLKEVDEAVVKLQIIKREGHNIIIGRVKLPTVNGQDHAFLLKRFDTNAMAASSMFRLAFPFADGTAEAAEMRF",
+             "LDTKYDTNRANGGYIVEEVKVPETPKKRGRTRKTAENSKKESTPDTESVSADKQIRVLPEGSTGVRLQGTWIPAEDAIEV",
+             "AEDYGIAKYALALIHATAEHAEDGGAPILTSEPVAEVKTPRKRQRVSAAAATASDTPDSPQLVQRVTRLENADGSISKVR",
+             "VESTLEAPSSNGVPVALSQAEIEEQIAQAKALAAGIQQSITAGSGSASTRGQKRRAVNDRPTAEIDPLADDEDYSESGRV",
+             "VRAFRRGTRVARRRPIATTAGAVAAAGAVGAGALAWVSGGNPEVAIQTLQASMQSIGLQNLQNLGLQNLQQIGTQLGAHL",
+             "ASILPW"]
+  },
+  { "name" : "XBP1_NEUCR",
+    "RefSeqID" : "XP_962373",
+    "UniProtID" : "Q7S9W7",
+    "taxonomyID" : "367110",
+    "sequence" : [
+             "MLNQNPGLKDIAYSITGGAIKAQGYWMPYACAKAVCATFCYQIAGALIPLFGPDFPSECISPGEPRYGIMIIKPELISDT",
+             "MRKAQELYRRYGNWGGGCTSSSPARRPLRTASSGSQERHHHHPYPNQEHLDHQQQQQRTVCSRRCPAEENSCVDARPQLR",
+             "GISAPMPPAGEWTPPLLRSSAGRPRPVMPTSTHSSISYPERAPHRSAWTAVNHQPPNNSLDRYSLKRPLPSNEPDESVSH",
+             "SNWPSRSQAPNPWLTAIPRSPRKTSSSPWASQPGSASRSRAGSIDSMASQHPQGLPSPSLILSSPSSSMVSLSSSNSPSP",
+             "RPQLPPISQLCSLPVPSGRRRLPNGRPSRVGGDATSSHSRQDHSTCGAYQFSAGYQRALTPPSSTSAPMHWRSQRRPSLQ",
+             "DQHEHEHIEDTQPRRIAVEANMECGDDNESHLHLPLPLPRTSSSASIVADKNANDTTSDNSSSRNFNSASIGSGRDDGQT",
+             "SLAARKTAALTLLHLRQQEEEKEAAAAAAAAAAAAYSSTKRPESPSSSLSSPVSPPPTSGQPSPTLSAVVTATNLRRGTT",
+             "TATATAVIDTTEPLAPPPSPSSNYLGSPISTSIASSSSSFSPSTSCNGTRENSVVANEMTRYAGQEADAGGPRHCNGDAD",
+             "DEGDYEHEQQYRRKRRRLLLVGRAKSF"]
+  },
+  { "name" : "XBP1_SACCE",
+    "RefSeqID" : "NP_012165",
+    "UniProtID" : "P40489",
+    "taxonomyID" : "559292",
+    "sequence" : [
+             "MKYPAFSINSDTVHLTDNPLDDYQRLYLVSVLDRDSPPASFSAGLNIRKVNYKSSIAAQFTHPNFIISARDAGNGEEAAA",
+             "QNVLNCFEYQFPNLQTIQSLVHEQTLLSQLASSATPHSALHLHDKNILMGKIILPSRSNKTPVSASPTKQEKKALSTASR",
+             "ENATSSLTKNQQFKLTKMDHNLINDKLINPNNCVIWSHDSGYVFMTGIWRLYQDVMKGLINLPRGDSVSTSQQQFFCKAE",
+             "FEKILSFCFYNHSSFTSEESSSVLLSSSTSSPPKRRTSTGSTFLDANASSSSTSSTQANNYIDFHWNNIKPELRDLICQS",
+             "YKDFLINELGPDQIDLPNLNPANFTKRIRGGYIKIQGTWLPMEISRLLCLRFCFPIRYFLVPIFGPDFPKDCESWYLAHQ",
+             "NVTFASSTTGAGAATAATAAANTSTNFTSTAVARPRQKPRPRPRQRSTSMSHSKAQKLVIEDALPSFDSFVENLGLSSND",
+             "KNFIKKNSKRQKSSTYTSQTSSPIGPRDPTVQILSNLASFYNTHGHRYSYPGNIYIPQQRYSLPPPNQLSSPQRQLNYTY",
+             "DHIHPVPSQYQSPRHYNVPSSPIAPAPPTFPQPYGDDHYHFLKYASEVYKQQNQRPAHNTNTNMDTSFSPRANNSLNNFK",
+             "FKTNSKQ"]
+  }
+]
--- a/data/refAnnotations.json
+++ b/data/refAnnotations.json
@ -1,116 +1,116 @@
-[
-  {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"},
-  {"pName" : "MBP1_SACCE", "fName" : "KilA-N", "start" : "22", "end" : "105"},
-  {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "108", "end" : "122"},
-  {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "236", "end" : "241"},
-  {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "279", "end" : "307"},
-  {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "700", "end" : "717"},
-  {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "394", "end" : "423"},
-  {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "427", "end" : "463"},
-  {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "512", "end" : "541"},
-  {"pName" : "MBP1_SACCE", "fName" : "Swi6 fold", "start" : "381", "end" : "547"},
-  {"pName" : "MBP1_SACCE", "fName" : "coiled coil", "start" : "633", "end" : "655"},
-
-  {"pName" : "MBP1_ASPNI", "fName" : "APSES fold", "start" : "9", "end" : "106"},
-  {"pName" : "MBP1_ASPNI", "fName" : "KilA-N", "start" : "26", "end" : "109"},
-  {"pName" : "MBP1_ASPNI", "fName" : "low complexity", "start" : "529", "end" : "534"},
-  {"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "260", "end" : "289"},
-  {"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "381", "end" : "413"},
-  {"pName" : "MBP1_ASPNI", "fName" : "Swi6 fold", "start" : "193", "end" : "402"},
-  {"pName" : "MBP1_ASPNI", "fName" : "coiled coil", "start" : "509", "end" : "572"},
-
-  {"pName" : "MBP1_BIPOR", "fName" : "APSES fold", "start" : "8", "end" : "106"},
-  {"pName" : "MBP1_BIPOR", "fName" : "KilA-N", "start" : "26", "end" : "109"},
-  {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "134", "end" : "152"},
-  {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "267", "end" : "278"},
-  {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "670", "end" : "685"},
-  {"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "266", "end" : "295"},
-  {"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "387", "end" : "416"},
-  {"pName" : "MBP1_BIPOR", "fName" : "Swi6 fold", "start" : "253", "end" : "421"},
-  {"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "659", "end" : "681"},
-  {"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "500", "end" : "590"},
-
-  {"pName" : "MBP1_NEUCR", "fName" : "APSES fold", "start" : "14", "end" : "114"},
-  {"pName" : "MBP1_NEUCR", "fName" : "KilA-N", "start" : "34", "end" : "117"},
-  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "130", "end" : "141"},
-  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "253", "end" : "266"},
-  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "514", "end" : "525"},
-  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "554", "end" : "564"},
-  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "601", "end" : "618"},
-  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "620", "end" : "629"},
-  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "636", "end" : "652"},
-  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "658", "end" : "672"},
-  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "725", "end" : "735"},
-  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "752", "end" : "771"},
-  {"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "268", "end" : "297"},
-  {"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "390", "end" : "419"},
-  {"pName" : "MBP1_NEUCR", "fName" : "Swi6 fold", "start" : "270", "end" : "426"},
-  {"pName" : "MBP1_NEUCR", "fName" : "coiled coil", "start" : "500", "end" : "550"},
-
-  {"pName" : "MBP1_SCHPO", "fName" : "APSES fold", "start" : "8", "end" : "104"},
-  {"pName" : "MBP1_SCHPO", "fName" : "KilA-N", "start" : "25", "end" : "113"},
-  {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "111", "end" : "125"},
-  {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "136", "end" : "145"},
-  {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "176", "end" : "191"},
-  {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "422", "end" : "447"},
-  {"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "247", "end" : "276"},
-  {"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "368", "end" : "397"},
-  {"pName" : "MBP1_SCHPO", "fName" : "Swi6 fold", "start" : "234", "end" : "400"},
-  {"pName" : "MBP1_SCHPO", "fName" : "coiled coil", "start" : "457", "end" : "538"},
-
-  {"pName" : "MBP1_COPCI", "fName" : "APSES fold", "start" : "5", "end" : "103"},
-  {"pName" : "MBP1_COPCI", "fName" : "KilA-N", "start" : "23", "end" : "106"},
-  {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "170", "end" : "191"},
-  {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "435", "end" : "450"},
-  {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "611", "end" : "626"},
-  {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "270", "end" : "299"},
-  {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "389", "end" : "418"},
-  {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "474", "end" : "509"},
-  {"pName" : "MBP1_COPCI", "fName" : "Swi6 fold", "start" : "257", "end" : "429"},
-  {"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "500", "end" : "570"},
-  {"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "651", "end" : "678"},
-
-  {"pName" : "MBP1_CRYNE", "fName" : "APSES fold", "start" : "16", "end" : "114"},
-  {"pName" : "MBP1_CRYNE", "fName" : "KilA-N", "start" : "34", "end" : "117"},
-  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "66", "end" : "85"},
-  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "413", "end" : "423"},
-  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "633", "end" : "644"},
-  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "697", "end" : "709"},
-  {"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "477", "end" : "506"},
-  {"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "618", "end" : "647"},
-  {"pName" : "MBP1_CRYNE", "fName" : "Swi6 fold", "start" : "452", "end" : "663"},
-
-  {"pName" : "MBP1_PUCGR", "fName" : "APSES fold", "start" : "90", "end" : "187"},
-  {"pName" : "MBP1_PUCGR", "fName" : "KilA-N", "start" : "107", "end" : "190"},
-  {"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "208", "end" : "227"},
-  {"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "273", "end" : "291"},
-  {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "442", "end" : "271"},
-  {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "475", "end" : "509"},
-  {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "561", "end" : "590"},
-  {"pName" : "MBP1_PUCGR", "fName" : "Swi6 fold", "start" : "429", "end" : "601"},
-  {"pName" : "MBP1_PUCGR", "fName" : "coiled coil", "start" : "827", "end" : "863"},
-
-  {"pName" : "MBP1_USTMA", "fName" : "APSES fold", "start" : "7", "end" : "104"},
-  {"pName" : "MBP1_USTMA", "fName" : "KilA-N", "start" : "24", "end" : "107"},
-  {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "106", "end" : "116"},
-  {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "161", "end" : "183"},
-  {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "666", "end" : "681"},
-  {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "688", "end" : "700"},
-  {"pName" : "MBP1_USTMA", "fName" : "AT hook", "start" : "134", "end" : "146"},
-  {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "245", "end" : "274"},
-  {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "278", "end" : "314"},
-  {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "364", "end" : "393"},
-  {"pName" : "MBP1_USTMA", "fName" : "Swi6 fold", "start" : "232", "end" : "404"},
-  {"pName" : "MBP1_USTMA", "fName" : "coiled coil", "start" : "590", "end" : "618"},
-
-  {"pName" : "MBP1_WALME", "fName" : "APSES fold", "start" : "6", "end" : "103"},
-  {"pName" : "MBP1_WALME", "fName" : "KilA-N", "start" : "23", "end" : "106"},
-  {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "149", "end" : "162"},
-  {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "171", "end" : "188"},
-  {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "618", "end" : "628"},
-  {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "634", "end" : "660"},
-  {"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "250", "end" : "279"},
-  {"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "369", "end" : "398"},
-  {"pName" : "MBP1_WALME", "fName" : "Swi6 fold", "start" : "237", "end" : "409"},
-  {"pName" : "MBP1_WALME", "fName" : "coiled coil", "start" : "461", "end" : "585"}
-]
+[
+  {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"},
+  {"pName" : "MBP1_SACCE", "fName" : "KilA-N", "start" : "22", "end" : "105"},
+  {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "108", "end" : "122"},
+  {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "236", "end" : "241"},
+  {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "279", "end" : "307"},
+  {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "700", "end" : "717"},
+  {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "394", "end" : "423"},
+  {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "427", "end" : "463"},
+  {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "512", "end" : "541"},
+  {"pName" : "MBP1_SACCE", "fName" : "Swi6 fold", "start" : "381", "end" : "547"},
+  {"pName" : "MBP1_SACCE", "fName" : "coiled coil", "start" : "633", "end" : "655"},
+
+  {"pName" : "MBP1_ASPNI", "fName" : "APSES fold", "start" : "9", "end" : "106"},
+  {"pName" : "MBP1_ASPNI", "fName" : "KilA-N", "start" : "26", "end" : "109"},
+  {"pName" : "MBP1_ASPNI", "fName" : "low complexity", "start" : "529", "end" : "534"},
+  {"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "260", "end" : "289"},
+  {"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "381", "end" : "413"},
+  {"pName" : "MBP1_ASPNI", "fName" : "Swi6 fold", "start" : "193", "end" : "402"},
+  {"pName" : "MBP1_ASPNI", "fName" : "coiled coil", "start" : "509", "end" : "572"},
+
+  {"pName" : "MBP1_BIPOR", "fName" : "APSES fold", "start" : "8", "end" : "106"},
+  {"pName" : "MBP1_BIPOR", "fName" : "KilA-N", "start" : "26", "end" : "109"},
+  {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "134", "end" : "152"},
+  {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "267", "end" : "278"},
+  {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "670", "end" : "685"},
+  {"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "266", "end" : "295"},
+  {"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "387", "end" : "416"},
+  {"pName" : "MBP1_BIPOR", "fName" : "Swi6 fold", "start" : "253", "end" : "421"},
+  {"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "659", "end" : "681"},
+  {"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "500", "end" : "590"},
+
+  {"pName" : "MBP1_NEUCR", "fName" : "APSES fold", "start" : "14", "end" : "114"},
+  {"pName" : "MBP1_NEUCR", "fName" : "KilA-N", "start" : "34", "end" : "117"},
+  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "130", "end" : "141"},
+  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "253", "end" : "266"},
+  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "514", "end" : "525"},
+  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "554", "end" : "564"},
+  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "601", "end" : "618"},
+  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "620", "end" : "629"},
+  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "636", "end" : "652"},
+  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "658", "end" : "672"},
+  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "725", "end" : "735"},
+  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "752", "end" : "771"},
+  {"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "268", "end" : "297"},
+  {"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "390", "end" : "419"},
+  {"pName" : "MBP1_NEUCR", "fName" : "Swi6 fold", "start" : "270", "end" : "426"},
+  {"pName" : "MBP1_NEUCR", "fName" : "coiled coil", "start" : "500", "end" : "550"},
+
+  {"pName" : "MBP1_SCHPO", "fName" : "APSES fold", "start" : "8", "end" : "104"},
+  {"pName" : "MBP1_SCHPO", "fName" : "KilA-N", "start" : "25", "end" : "113"},
+  {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "111", "end" : "125"},
+  {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "136", "end" : "145"},
+  {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "176", "end" : "191"},
+  {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "422", "end" : "447"},
+  {"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "247", "end" : "276"},
+  {"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "368", "end" : "397"},
+  {"pName" : "MBP1_SCHPO", "fName" : "Swi6 fold", "start" : "234", "end" : "400"},
+  {"pName" : "MBP1_SCHPO", "fName" : "coiled coil", "start" : "457", "end" : "538"},
+
+  {"pName" : "MBP1_COPCI", "fName" : "APSES fold", "start" : "5", "end" : "103"},
+  {"pName" : "MBP1_COPCI", "fName" : "KilA-N", "start" : "23", "end" : "106"},
+  {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "170", "end" : "191"},
+  {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "435", "end" : "450"},
+  {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "611", "end" : "626"},
+  {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "270", "end" : "299"},
+  {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "389", "end" : "418"},
+  {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "474", "end" : "509"},
+  {"pName" : "MBP1_COPCI", "fName" : "Swi6 fold", "start" : "257", "end" : "429"},
+  {"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "500", "end" : "570"},
+  {"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "651", "end" : "678"},
+
+  {"pName" : "MBP1_CRYNE", "fName" : "APSES fold", "start" : "16", "end" : "114"},
+  {"pName" : "MBP1_CRYNE", "fName" : "KilA-N", "start" : "34", "end" : "117"},
+  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "66", "end" : "85"},
+  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "413", "end" : "423"},
+  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "633", "end" : "644"},
+  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "697", "end" : "709"},
+  {"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "477", "end" : "506"},
+  {"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "618", "end" : "647"},
+  {"pName" : "MBP1_CRYNE", "fName" : "Swi6 fold", "start" : "452", "end" : "663"},
+
+  {"pName" : "MBP1_PUCGR", "fName" : "APSES fold", "start" : "90", "end" : "187"},
+  {"pName" : "MBP1_PUCGR", "fName" : "KilA-N", "start" : "107", "end" : "190"},
+  {"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "208", "end" : "227"},
+  {"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "273", "end" : "291"},
+  {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "442", "end" : "271"},
+  {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "475", "end" : "509"},
+  {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "561", "end" : "590"},
+  {"pName" : "MBP1_PUCGR", "fName" : "Swi6 fold", "start" : "429", "end" : "601"},
+  {"pName" : "MBP1_PUCGR", "fName" : "coiled coil", "start" : "827", "end" : "863"},
+
+  {"pName" : "MBP1_USTMA", "fName" : "APSES fold", "start" : "7", "end" : "104"},
+  {"pName" : "MBP1_USTMA", "fName" : "KilA-N", "start" : "24", "end" : "107"},
+  {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "106", "end" : "116"},
+  {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "161", "end" : "183"},
+  {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "666", "end" : "681"},
+  {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "688", "end" : "700"},
+  {"pName" : "MBP1_USTMA", "fName" : "AT hook", "start" : "134", "end" : "146"},
+  {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "245", "end" : "274"},
+  {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "278", "end" : "314"},
+  {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "364", "end" : "393"},
+  {"pName" : "MBP1_USTMA", "fName" : "Swi6 fold", "start" : "232", "end" : "404"},
+  {"pName" : "MBP1_USTMA", "fName" : "coiled coil", "start" : "590", "end" : "618"},
+
+  {"pName" : "MBP1_WALME", "fName" : "APSES fold", "start" : "6", "end" : "103"},
+  {"pName" : "MBP1_WALME", "fName" : "KilA-N", "start" : "23", "end" : "106"},
+  {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "149", "end" : "162"},
+  {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "171", "end" : "188"},
+  {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "618", "end" : "628"},
+  {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "634", "end" : "660"},
+  {"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "250", "end" : "279"},
+  {"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "369", "end" : "398"},
+  {"pName" : "MBP1_WALME", "fName" : "Swi6 fold", "start" : "237", "end" : "409"},
+  {"pName" : "MBP1_WALME", "fName" : "coiled coil", "start" : "461", "end" : "585"}
+]
--- a/data/refFeatures.json
+++ b/data/refFeatures.json
@ -1,47 +1,47 @@
-[
-  { "name" : "APSES fold",
-    "description " : "DNA binding domain by similarity to structure",
-    "sourceDB" : "PDB",
-    "accession" : "1BM8_A_1_99"},
-
-  { "name" : "KilA-N",
-    "description " : "DNA binding domain by Pfam annotation",
-    "sourceDB" : "Pfam",
-    "accession" : "PF04383"},
-
-  { "name" : "AT hook",
-    "description " : "DNA interaction motif by SMART annotation",
-    "sourceDB" : "SMART",
-    "accession" : null},
-
-  { "name" : "low complexity",
-    "description " : "SEG annotation by SMART",
-    "sourceDB" : "SMART",
-    "accession" : null},
-
-  { "name" : "Ankyrin fold",
-    "description " : "Ankyrin domain by SMART annotation",
-    "sourceDB" : "SMART",
-    "accession" : "SM00248"},
-
-  { "name" : "Swi6 fold",
-    "description " : "Swi6 fold by similarity to structure",
-    "sourceDB" : "PDB",
-    "accession" : "1SW6_B"},
-
-  { "name" : "coiled coil",
-    "description " : "Coiled coil by SMART annotation",
-    "sourceDB" : "SMART",
-    "accession" : null},
-
-  { "name" : "McInerny 2011",
-    "description " : "Yeast cell cycle review",
-    "sourceDB" : "PubMed",
-    "accession" : "21310294"}
-]
-
-
-
-
-
-
+[
+  { "name" : "APSES fold",
+    "description " : "DNA binding domain by similarity to structure",
+    "sourceDB" : "PDB",
+    "accession" : "1BM8_A_1_99"},
+
+  { "name" : "KilA-N",
+    "description " : "DNA binding domain by Pfam annotation",
+    "sourceDB" : "Pfam",
+    "accession" : "PF04383"},
+
+  { "name" : "AT hook",
+    "description " : "DNA interaction motif by SMART annotation",
+    "sourceDB" : "SMART",
+    "accession" : null},
+
+  { "name" : "low complexity",
+    "description " : "SEG annotation by SMART",
+    "sourceDB" : "SMART",
+    "accession" : null},
+
+  { "name" : "Ankyrin fold",
+    "description " : "Ankyrin domain by SMART annotation",
+    "sourceDB" : "SMART",
+    "accession" : "SM00248"},
+
+  { "name" : "Swi6 fold",
+    "description " : "Swi6 fold by similarity to structure",
+    "sourceDB" : "PDB",
+    "accession" : "1SW6_B"},
+
+  { "name" : "coiled coil",
+    "description " : "Coiled coil by SMART annotation",
+    "sourceDB" : "SMART",
+    "accession" : null},
+
+  { "name" : "McInerny 2011",
+    "description " : "Yeast cell cycle review",
+    "sourceDB" : "PubMed",
+    "accession" : "21310294"}
+]
+
+
+
+
+
+
--- a/data/refMBP1Proteins.json
+++ b/data/refMBP1Proteins.json
@ -1,155 +1,155 @@
-[
-  { "name" : "MBP1_SCHPO",
-    "RefSeqID" : "NP_593032",
-    "UniProtID" : "P41412",
-    "taxonomyID" : 284812,
-    "sequence" : [
-       "MAPRSSAVHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRVLERQVQIGAHEKVQ",
-       "GGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILSLDIDEGKAIAPKKKQTKQKKPSVRGRRGRKPSSLSS",
-       "STLHSVNEKQPNSSISPTIESSMNKVNLPGAEEQVSATPLPASPNALLSPNDNTIKPVEELGMLEAPLDK",
-       "YEESLLDFFLHPEEGRIPSFLYSPPPDFQVNSVIDDDGHTSLHWACSMGHIEMIKLLLRANADIGVCNRL",
-       "SQTPLMRSVIFTNNYDCQTFGQVLELLQSTIYAVDTNGQSIFHHIVQSTSTPSKVAAAKYYLDCILEKLI",
-       "SIQPFENVVRLVNLQDSNGDTSLLIAARNGAMDCVNSLLSYNANPSIPNRQRRTASEYLLEADKKPHSLL",
-       "QSNSNASHSAFSFSGISPAIISPSCSSHAFVKAIPSISSKFSQLAEEYESQLREKEEDLIRANRLKQDTL",
-       "NEISRTYQELTFLQKNNPTYSQSMENLIREAQETYQQLSKRLLIWLEARQIFDLERSLKPHTSLSISFPS",
-       "DFLKKEDGLSLNNDFKKPACNNVTNSDEYEQLINKLTSLQASRKKDTLYIRKLYEELGIDDTVNSYRRLI",
-       "AMSCGINPEDLSLEILDAVEEALTREK"]
-  },
-  { "name" : "MBP1_ASPNI",
-    "RefSeqID" : "XP_660758",
-    "UniProtID" : "Q5B8H6",
-    "taxonomyID" : 227321,
-    "sequence" : [
-       "MAAVDFSNVYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRILEREVQKGVHEKV",
-       "QGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDYVAGDRSPPPAPKHTSAASKPRAPKINKRVVKEDV",
-       "FSAVNHHRSMGPPSFHHEHYDVNTGLDEDESIEQATLESSSMIADEDMISMSQNGPYSSRKRKRGINEVA",
-       "AMSLSEQEHILYGDQLLDYFMTVGDAPEATRIPPPQPPANFQVDRPIDDSGNTALHWACAMGDLEIVKDL",
-       "LRRGADMKALSIHEETPLVRAVLFTNNYEKRTFPALLDLLLDTISFRDWFGATLFHHIAQTTKSKGKWKS",
-       "SRYYCEVALEKLRTTFSPEEVDLLLSCQDSVGDTAVLVAARNGVFRLVDLLLSRCPRAGDLVNKRGETAS",
-       "SIMQRAHLAERDIPPPPSSITMGNDHIDGEVGAPTSLEPQSVTLHHESSPATAQLLSQIGAIMAEASRKL",
-       "TSSYGAAKPSQKDSDDVANPEALYEQLEQDRQKIRRQYDALAAKEAAEESSDAQLGRYEQMRDNYESLLE",
-       "QIQRARLKERLASTPVPTQTAVIGSSSPEQDRLLTTFQLSRALCSEQKIRRAAVKELAQQRADAGVSTKF",
-       "DVHRKLVALATGLKEEELDPMAAELAETLEFDRMNGKGVGPESPEADHKDSASLPFPGPVVSVDA"]
-  },
-  { "name" : "MBP1_BIPOR",
-    "RefSeqID" : "XP_007682304",
-    "UniProtID" : "W6ZM86",
-    "taxonomyID" : 930090,
-    "sequence" : [
-       "MPPAPDGKIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRILEREVQKGVHEKV",
-       "QGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDYVPGDRSPPPAPKHATAASNRMKPPRQTAAAVAAA",
-       "AVAAAAAAAAVANHNALMSNSRSQASEDPYENSQRSQIYREDTPDNETVISESMLGDADLMDMSQYSADG",
-       "NRKRKRGMDQMSLLDQQHQIWADQLLDYFMLLDHEAAVSWPEPPPSINLDRPIDEKGHAAMHWAAAMGDV",
-       "GVVKELIHRGARLDCLSNNLETPLMRAVMFTNNFDKETMPSMVKIFQQTVHRTDWFGSTVFHHIAATTSS",
-       "SNKYVCARWYLDCIINKLSETWIPEEVTRLLNAADQNGDTAIMIAARNGARKCVRSLLGRNVAVDIPNKK",
-       "GETADDLIRELNQRRRMHGRTRQASSSPFAPAPEHRLNGHVPHFDGGPLMSVPVPSMAVRESVQYRSQTA",
-       "SHLMTKVAPTLLEKCEELATAYEAELQEKEAEFFDAERVVKRRQAELEAVRKQVAELQSMSKGLHIDLND",
-       "EEAERQQEDELRLLVEEAESLLEIEQKAELRRLCSSMPQQNSDSSPVDITEKMRLALLLHRAQLERRELV",
-       "REVVGNLSVAGMSEKQGTYKKLIAKALGEREEDVESMLPEILQELEEAETQERAEGLDGSPV"]
-  },
-  { "name" : "MBP1_NEUCR",
-    "RefSeqID" : "XP_955821",
-    "UniProtID" : "Q7RW59",
-    "taxonomyID" : 367110,
-    "sequence" : [
-       "MVKENVGGNPEPGIYSATYSGIPVWEYQFGVDLKEHVMRRRHDDWVNATHILKAAGFDKPARTRILEREV",
-       "QKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEFQPGNESPPPAPRHASKPKAPKVKPAV",
-       "PTWGSKSAKNANPPQPGTFLPPGRKGLPAQAPDYNDADTHMHDDDTPDNLTVASASYMAEDDRYDHSHFS",
-       "TGHRKRKRDELIEDMTEQQHAVYGDELLDYFLLSRNEQPAVRPDPPPNFKPDWPIDNERHTCLHWASAMG",
-       "DVDVMRQLKKFGASLDAQNVRGETPFMRAVNFTNCFEKQTFPQVMKELFSTIDCRDLSGCTVIHHAAVMK",
-       "IGRVNSQSCSRYYLDIILNRLQETHHPEFVQQLLDAQDNDGNTAVHLAAMRDARKCIRALLGRGASTDIP",
-       "NKQGIRAEELIKELNASISKSRSNLPQRSSSPFAPDTQRHDAFHEAISESMVTSRKNSQPNYSSDAANTV",
-       "QNRITPLVLQKLKDLTATYDSEFKEKDDAEKEARRILNKTQSELKALTASIDDYNSRLDTDDVAAKTAAE",
-       "MATARHKVLAFVTHQNRISVQEAVKQELAALDRANAVTNGTSTKSKSSSPSKKPKLSPIPDQKDKPPKDE",
-       "NETESEAEHPDPPAAQAHQQQPGPSSQDTEVEDQDREEEEDDYTHRLSLAAELRSILQEQRSAENDYVEA",
-       "RGMLGTGERIDKYKHLLMSCLPPDEQENLEENLEEMIKLMEQEDESVTDLPAGAVGGGGGGNAADGSGGG",
-       "GQPSNGRRESVLPALRGGNGDGEMSRRGSRTAAAAAAQVDGEREINGRAGAERTERIQEIAAV"]
-  },
-  { "name" : "MBP1_COPCI",
-    "RefSeqID" : "XP_001837394",
-    "UniProtID" : "A8NYC6",
-    "taxonomyID" : 240176,
-    "sequence" : [
-       "MPEAQIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRVLEREVQKGEHEKVQGG",
-       "YGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEFTPAAKSPPLAPKHLVATAGNRPVRKPLTTDLSAAVIN",
-       "TRSTRKQVADGVGEESDHDTHSLRGSEDGSMTPSPSEASSSSRTPSPIHSPGTYHSNGLDGPSSGGRNRY",
-       "RQSNDRYDEDDDASRHNGMGDPRSYGDQILEYFISDTNQIPPILITPPPDFDPNMAIDDDGHTSLHWACA",
-       "MGRIRIVKLLLSAGADIFKVNKAGQTALMRSVMFANNYDVRKFPELYELLHRSTLNIDNSNRTVFHHVVD",
-       "VAMSKGKTHAARYYMETILTRLADYPKELADVINFQDEDGETALTMAARCRSKRLVKLLIDHGADPKINN",
-       "HDGKNAEDYILEDERFRSSPAPSSRVAAMSYRNAQVAYPPPGAPSTYSFAPANHDRPPLHYSAAAQKAST",
-       "RCVNDMASMLDSLAASFDQELRDKERDMAQAQALLTNIQAEILESQRTVLQLRQQAEGLSQAKQRLADLE",
-       "NALQDKMGRRYRLGFEKWIKDEETREKVIRDAANGDLVLTPATTSYTVDEDGDSDSGSNGDKNKGKRKAQ",
-       "VQQEEVSDLVELYSNIPTDPEELRKQCEALREEVSQSRKRRKAMFDELVTFQAEAGTSGRMSDYRRLIAA",
-       "GCGGLEPLEIDSVLGMLLETLEAEDPSSTSATWSGSKGQQTG"]
-  },
-  { "name" : "MBP1_CRYNE",
-    "RefSeqID" : "XP_569090",
-    "UniProtID" : "Q5KMQ9",
-    "taxonomyID" : 214684,
-    "sequence" : [
-       "MGKKVIASGGDNGPNTIYKATYSGVPVYEMVCRDVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREV",
-       "QKGEHEKVQGGYGKYQGTWIPIERGLALAKQYGVEDILRPIIDYVPTSVSPPPAPKHSVAPPSKARRDKE",
-       "KETGRTKATPSRTGPTSAAALQAQAQLNRAKMHDSTPDADASFRSFEERVSLTPEDDSSSDTPSPVASVM",
-       "TDQDMEVDKMGMHMSMPNVTLSQNMEELGAGSRKRSAAMMMEDEDQFGQLRSIRGNSAVHTPHGTPRHLG",
-       "IGMPPEPIGPEQYTDIILNYFVSETSQIPSILVSPPHDFDPNAPIDDDGHTALHWACAMGRVRVVKLLLT",
-       "AGASIFAGNNAEQTPLMRSVMFSNNYDMRKFPELYELLHRSTLNIDKQNRTVFHHIANLALTKGKTHAAK",
-       "YYMETILARLADYPQELADVINFQDEEGETALTIAARARSRRLVKALLDHGANPKIKNRDSRSAEDYILE",
-       "DERFRSSPVPAPNGGIGKASTSAAAEKPLFAPQLYFSEAARLCGGQALTDITSHMQSLARSFDAELQGKE",
-       "RDILQAKALLTNIHTEVTENGRSITAITNQAAPLEEKRRELEALQASLKTRVKDALKKGYIGWLEGELVR",
-       "EQRWENGELEGNEEEKAAVQALRDVPTGGQEVVQAEEEKLRWEIEEKRKRRAMFVEKFVRAQTEAGTSEQ",
-       "IAKYRKLVSAGLGGVSTNEVDELMNQLLEGLEEENDNQVYNTTAGESGPSSWVQ"]
-  },
-  { "name" : "MBP1_PUCGR",
-    "RefSeqID" : "XP_003327086",
-    "UniProtID" : "E3KED4",
-    "taxonomyID" : 418459,
-    "sequence" : [
-       "MAYGGSIQPLRPPSRESATLHLHQPDLTVTSPPLSLTHCPPCVYSHFTHTPTSLIVIQVSLHSLLDQETY",
-       "HLLPSRSPPTVSVRMGTTTIYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRVLE",
-       "REIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNFQPSSNESPPLAPKHVTALSTRVK",
-       "VSKVSAASAARAARAVVPSLPSTSGLGGRNTNNSWSNFDSDNEPGLPPAASSRESNGNWATQSKLARSSN",
-       "LARARANINNSHPEDLPVPAPDQLQASPLPSMQTADPENDNSLTPSELSLPSRTPSPIEDLPLTVNTASS",
-       "QSTRNKGKSRDLPDDEDLSRGQKRKYDTSLVEDTSYSDGADDQYINGNPSNAASAKYAKLILDYFVSESS",
-       "QIPNFLNDPPSDFDPNVVIDDDGHTALHWACAMGRIKIIKLLLTCGADIFRANNAGQTALMRAVMFTNNH",
-       "DLRTFPELFESFSGSVINIDRTDRTVFHYVIDIALTKGKVPAARYYLETILSQLSEYPKELIDILNFQDE",
-       "DGETALTLAARCRSKKLVKILLDHGANPKTANRDGKSAEDYILEDDKFRALSPTPCSSGPIRQLDQNSPG",
-       "GTSNRSDFVDLVDPVPIDSNLIPQRSPNASPPHYSETGQRVTKQLLPEVTSMIELLATTFDTELQDKERD",
-       "LDHAVGLLSNIEKEYLEGQRKILNYERMLSDFGEKKLALGDLEKELNDKLGKRYRFGWEKYVRDEEERAR",
-       "RITEQRSKYLQELSIEDRKLLDSSNLRFADPSKQEVLMKLQADERENSDLLNLIRTNSTDVESECDLLRE",
-       "SVQKLSEERERLFKEFINLSSENTGGENEEDDGANHTSANTSRLNNYRKLISLGCGGIGLDEVDEVIESL",
-       "NEGIDVNELNDNGFLTEQDEELGNHQNYHNIHTQGR"]
-  },
-  { "name" : "MBP1_USTMA",
-    "RefSeqID" : "XP_011392621",
-    "UniProtID" : "A0A0D1DP35",
-    "taxonomyID" : 237631,
-    "sequence" : [
-       "MSGDKTIFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRVLEREIQKGIHEKVQG",
-       "GYGKYQGTWIPLDVAIELAERYNIQGLLQPITSYVPSAADSPPPAPKHTISTSNRSKKIIPADPGALGRS",
-       "RRATSIETESEVIGAAPNNVSEGSMSPSPSDISSSSRTPSPLPADRAHPLHANHALAGYNGRDANNHARY",
-       "ADIILDYFVTENTTVPSLLINPPPDFNPDMSIDDDEHTALHWACAMGRIRVVKLLLSAGADIFRVNSNQQ",
-       "TALMRATMFSNNYDLRKFPELFELLHRSILNIDRNDRTVFHHVVDLALSRGKPHAARYYMETMINRLADY",
-       "GDQLADILNFQDDEGETPLTMAARARSKRLVRLLLEHGADPKIRNKEGKNAEDYIIEDERFRSSPSRTGP",
-       "AGIELGADGLPVLPTSSLHTSEAGQRTAGRAVTLMSNLLHSLADSYDSEINTAEKKLTQAHGLLKQIQTE",
-       "IEDSAKVAEALHHEAQGVDEERKRVDSLQLALKHAINKRARDDLERRWSEGKQAIKRARLQAGLEPGALS",
-       "TSNATNAPATGDQKSKDDAKSLIEALPAGTNVKTAIAELRKQLSQVQANKTELVDKFVARAREQGTGRTM",
-       "AAYRRLIAAGCGGIAPDEVDAVVGVLCELLQESHTGARAGAGGERDDRARDVAMMLKGAGAAALAANAGA",
-       "P"]
-  },
-  { "name" : "MBP1_WALME",
-    "RefSeqID" : "XP_006957051",
-    "UniProtID" : "I4YGC0",
-    "taxonomyID" : 671144,
-    "sequence" : [
-       "MSAPPIYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRILEREVQKGTHEKVQGG",
-       "YGKYQGTWIPMERSVELARQYRIELLLDPIINYLPGPQSPPLAPKHATNVGSRARKSTAPAAQTLPSTSK",
-       "VFHPLSSTKHPAKLAAATNAKAEISDGEDASIPSSPSFKSNSSRTPSPIRINARKRKLEDEATIPSSAID",
-       "GSISYEDIILDYFISESTQIPALLIHPPSDFNPNMSIDDEGHTAMHWACAMGKVRVVKLLLSAGADIFRV",
-       "NHSEQTALMRSVMFSNNYDIRKFPQLYELLHRSTLNLDKHDRTVLHHIVDLALTKSKTHAARYYMECVLS",
-       "KLANYPDELADVINFQDDEGESALTLAARARSKRLVKLLLEHGADSKLPNKDGKTAEDYILEDERFRQSP",
-       "LLNSNHLRLHPPDTSIYAPPAHLFNSETSQNIANTSMSSVANLLESLAQSYDKEITQKERDYQQAQVILR",
-       "NIKTDIVEAKSNIEKMTIDSSEFEHLKHKLRELEMKLEEHSNDVYNKGWEEYSRNVDDPAIDAPSDNVQE",
-       "ECASLRNKIKDLQEKRISSMQELIKRQKEVGTGKKMSEYRKLISVGCGIPTTEIDAVLEMLLESLESENA",
-       "NKKAALASGISGALSSTSSAPSQATTSAPTGVATPGAPVPASSEKAGLLPPAPVMQ"]
-  }
-]
+[
+  { "name" : "MBP1_SCHPO",
+    "RefSeqID" : "NP_593032",
+    "UniProtID" : "P41412",
+    "taxonomyID" : 284812,
+    "sequence" : [
+       "MAPRSSAVHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRVLERQVQIGAHEKVQ",
+       "GGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILSLDIDEGKAIAPKKKQTKQKKPSVRGRRGRKPSSLSS",
+       "STLHSVNEKQPNSSISPTIESSMNKVNLPGAEEQVSATPLPASPNALLSPNDNTIKPVEELGMLEAPLDK",
+       "YEESLLDFFLHPEEGRIPSFLYSPPPDFQVNSVIDDDGHTSLHWACSMGHIEMIKLLLRANADIGVCNRL",
+       "SQTPLMRSVIFTNNYDCQTFGQVLELLQSTIYAVDTNGQSIFHHIVQSTSTPSKVAAAKYYLDCILEKLI",
+       "SIQPFENVVRLVNLQDSNGDTSLLIAARNGAMDCVNSLLSYNANPSIPNRQRRTASEYLLEADKKPHSLL",
+       "QSNSNASHSAFSFSGISPAIISPSCSSHAFVKAIPSISSKFSQLAEEYESQLREKEEDLIRANRLKQDTL",
+       "NEISRTYQELTFLQKNNPTYSQSMENLIREAQETYQQLSKRLLIWLEARQIFDLERSLKPHTSLSISFPS",
+       "DFLKKEDGLSLNNDFKKPACNNVTNSDEYEQLINKLTSLQASRKKDTLYIRKLYEELGIDDTVNSYRRLI",
+       "AMSCGINPEDLSLEILDAVEEALTREK"]
+  },
+  { "name" : "MBP1_ASPNI",
+    "RefSeqID" : "XP_660758",
+    "UniProtID" : "Q5B8H6",
+    "taxonomyID" : 227321,
+    "sequence" : [
+       "MAAVDFSNVYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRILEREVQKGVHEKV",
+       "QGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDYVAGDRSPPPAPKHTSAASKPRAPKINKRVVKEDV",
+       "FSAVNHHRSMGPPSFHHEHYDVNTGLDEDESIEQATLESSSMIADEDMISMSQNGPYSSRKRKRGINEVA",
+       "AMSLSEQEHILYGDQLLDYFMTVGDAPEATRIPPPQPPANFQVDRPIDDSGNTALHWACAMGDLEIVKDL",
+       "LRRGADMKALSIHEETPLVRAVLFTNNYEKRTFPALLDLLLDTISFRDWFGATLFHHIAQTTKSKGKWKS",
+       "SRYYCEVALEKLRTTFSPEEVDLLLSCQDSVGDTAVLVAARNGVFRLVDLLLSRCPRAGDLVNKRGETAS",
+       "SIMQRAHLAERDIPPPPSSITMGNDHIDGEVGAPTSLEPQSVTLHHESSPATAQLLSQIGAIMAEASRKL",
+       "TSSYGAAKPSQKDSDDVANPEALYEQLEQDRQKIRRQYDALAAKEAAEESSDAQLGRYEQMRDNYESLLE",
+       "QIQRARLKERLASTPVPTQTAVIGSSSPEQDRLLTTFQLSRALCSEQKIRRAAVKELAQQRADAGVSTKF",
+       "DVHRKLVALATGLKEEELDPMAAELAETLEFDRMNGKGVGPESPEADHKDSASLPFPGPVVSVDA"]
+  },
+  { "name" : "MBP1_BIPOR",
+    "RefSeqID" : "XP_007682304",
+    "UniProtID" : "W6ZM86",
+    "taxonomyID" : 930090,
+    "sequence" : [
+       "MPPAPDGKIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRILEREVQKGVHEKV",
+       "QGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDYVPGDRSPPPAPKHATAASNRMKPPRQTAAAVAAA",
+       "AVAAAAAAAAVANHNALMSNSRSQASEDPYENSQRSQIYREDTPDNETVISESMLGDADLMDMSQYSADG",
+       "NRKRKRGMDQMSLLDQQHQIWADQLLDYFMLLDHEAAVSWPEPPPSINLDRPIDEKGHAAMHWAAAMGDV",
+       "GVVKELIHRGARLDCLSNNLETPLMRAVMFTNNFDKETMPSMVKIFQQTVHRTDWFGSTVFHHIAATTSS",
+       "SNKYVCARWYLDCIINKLSETWIPEEVTRLLNAADQNGDTAIMIAARNGARKCVRSLLGRNVAVDIPNKK",
+       "GETADDLIRELNQRRRMHGRTRQASSSPFAPAPEHRLNGHVPHFDGGPLMSVPVPSMAVRESVQYRSQTA",
+       "SHLMTKVAPTLLEKCEELATAYEAELQEKEAEFFDAERVVKRRQAELEAVRKQVAELQSMSKGLHIDLND",
+       "EEAERQQEDELRLLVEEAESLLEIEQKAELRRLCSSMPQQNSDSSPVDITEKMRLALLLHRAQLERRELV",
+       "REVVGNLSVAGMSEKQGTYKKLIAKALGEREEDVESMLPEILQELEEAETQERAEGLDGSPV"]
+  },
+  { "name" : "MBP1_NEUCR",
+    "RefSeqID" : "XP_955821",
+    "UniProtID" : "Q7RW59",
+    "taxonomyID" : 367110,
+    "sequence" : [
+       "MVKENVGGNPEPGIYSATYSGIPVWEYQFGVDLKEHVMRRRHDDWVNATHILKAAGFDKPARTRILEREV",
+       "QKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEFQPGNESPPPAPRHASKPKAPKVKPAV",
+       "PTWGSKSAKNANPPQPGTFLPPGRKGLPAQAPDYNDADTHMHDDDTPDNLTVASASYMAEDDRYDHSHFS",
+       "TGHRKRKRDELIEDMTEQQHAVYGDELLDYFLLSRNEQPAVRPDPPPNFKPDWPIDNERHTCLHWASAMG",
+       "DVDVMRQLKKFGASLDAQNVRGETPFMRAVNFTNCFEKQTFPQVMKELFSTIDCRDLSGCTVIHHAAVMK",
+       "IGRVNSQSCSRYYLDIILNRLQETHHPEFVQQLLDAQDNDGNTAVHLAAMRDARKCIRALLGRGASTDIP",
+       "NKQGIRAEELIKELNASISKSRSNLPQRSSSPFAPDTQRHDAFHEAISESMVTSRKNSQPNYSSDAANTV",
+       "QNRITPLVLQKLKDLTATYDSEFKEKDDAEKEARRILNKTQSELKALTASIDDYNSRLDTDDVAAKTAAE",
+       "MATARHKVLAFVTHQNRISVQEAVKQELAALDRANAVTNGTSTKSKSSSPSKKPKLSPIPDQKDKPPKDE",
+       "NETESEAEHPDPPAAQAHQQQPGPSSQDTEVEDQDREEEEDDYTHRLSLAAELRSILQEQRSAENDYVEA",
+       "RGMLGTGERIDKYKHLLMSCLPPDEQENLEENLEEMIKLMEQEDESVTDLPAGAVGGGGGGNAADGSGGG",
+       "GQPSNGRRESVLPALRGGNGDGEMSRRGSRTAAAAAAQVDGEREINGRAGAERTERIQEIAAV"]
+  },
+  { "name" : "MBP1_COPCI",
+    "RefSeqID" : "XP_001837394",
+    "UniProtID" : "A8NYC6",
+    "taxonomyID" : 240176,
+    "sequence" : [
+       "MPEAQIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRVLEREVQKGEHEKVQGG",
+       "YGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEFTPAAKSPPLAPKHLVATAGNRPVRKPLTTDLSAAVIN",
+       "TRSTRKQVADGVGEESDHDTHSLRGSEDGSMTPSPSEASSSSRTPSPIHSPGTYHSNGLDGPSSGGRNRY",
+       "RQSNDRYDEDDDASRHNGMGDPRSYGDQILEYFISDTNQIPPILITPPPDFDPNMAIDDDGHTSLHWACA",
+       "MGRIRIVKLLLSAGADIFKVNKAGQTALMRSVMFANNYDVRKFPELYELLHRSTLNIDNSNRTVFHHVVD",
+       "VAMSKGKTHAARYYMETILTRLADYPKELADVINFQDEDGETALTMAARCRSKRLVKLLIDHGADPKINN",
+       "HDGKNAEDYILEDERFRSSPAPSSRVAAMSYRNAQVAYPPPGAPSTYSFAPANHDRPPLHYSAAAQKAST",
+       "RCVNDMASMLDSLAASFDQELRDKERDMAQAQALLTNIQAEILESQRTVLQLRQQAEGLSQAKQRLADLE",
+       "NALQDKMGRRYRLGFEKWIKDEETREKVIRDAANGDLVLTPATTSYTVDEDGDSDSGSNGDKNKGKRKAQ",
+       "VQQEEVSDLVELYSNIPTDPEELRKQCEALREEVSQSRKRRKAMFDELVTFQAEAGTSGRMSDYRRLIAA",
+       "GCGGLEPLEIDSVLGMLLETLEAEDPSSTSATWSGSKGQQTG"]
+  },
+  { "name" : "MBP1_CRYNE",
+    "RefSeqID" : "XP_569090",
+    "UniProtID" : "Q5KMQ9",
+    "taxonomyID" : 214684,
+    "sequence" : [
+       "MGKKVIASGGDNGPNTIYKATYSGVPVYEMVCRDVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREV",
+       "QKGEHEKVQGGYGKYQGTWIPIERGLALAKQYGVEDILRPIIDYVPTSVSPPPAPKHSVAPPSKARRDKE",
+       "KETGRTKATPSRTGPTSAAALQAQAQLNRAKMHDSTPDADASFRSFEERVSLTPEDDSSSDTPSPVASVM",
+       "TDQDMEVDKMGMHMSMPNVTLSQNMEELGAGSRKRSAAMMMEDEDQFGQLRSIRGNSAVHTPHGTPRHLG",
+       "IGMPPEPIGPEQYTDIILNYFVSETSQIPSILVSPPHDFDPNAPIDDDGHTALHWACAMGRVRVVKLLLT",
+       "AGASIFAGNNAEQTPLMRSVMFSNNYDMRKFPELYELLHRSTLNIDKQNRTVFHHIANLALTKGKTHAAK",
+       "YYMETILARLADYPQELADVINFQDEEGETALTIAARARSRRLVKALLDHGANPKIKNRDSRSAEDYILE",
+       "DERFRSSPVPAPNGGIGKASTSAAAEKPLFAPQLYFSEAARLCGGQALTDITSHMQSLARSFDAELQGKE",
+       "RDILQAKALLTNIHTEVTENGRSITAITNQAAPLEEKRRELEALQASLKTRVKDALKKGYIGWLEGELVR",
+       "EQRWENGELEGNEEEKAAVQALRDVPTGGQEVVQAEEEKLRWEIEEKRKRRAMFVEKFVRAQTEAGTSEQ",
+       "IAKYRKLVSAGLGGVSTNEVDELMNQLLEGLEEENDNQVYNTTAGESGPSSWVQ"]
+  },
+  { "name" : "MBP1_PUCGR",
+    "RefSeqID" : "XP_003327086",
+    "UniProtID" : "E3KED4",
+    "taxonomyID" : 418459,
+    "sequence" : [
+       "MAYGGSIQPLRPPSRESATLHLHQPDLTVTSPPLSLTHCPPCVYSHFTHTPTSLIVIQVSLHSLLDQETY",
+       "HLLPSRSPPTVSVRMGTTTIYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRVLE",
+       "REIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNFQPSSNESPPLAPKHVTALSTRVK",
+       "VSKVSAASAARAARAVVPSLPSTSGLGGRNTNNSWSNFDSDNEPGLPPAASSRESNGNWATQSKLARSSN",
+       "LARARANINNSHPEDLPVPAPDQLQASPLPSMQTADPENDNSLTPSELSLPSRTPSPIEDLPLTVNTASS",
+       "QSTRNKGKSRDLPDDEDLSRGQKRKYDTSLVEDTSYSDGADDQYINGNPSNAASAKYAKLILDYFVSESS",
+       "QIPNFLNDPPSDFDPNVVIDDDGHTALHWACAMGRIKIIKLLLTCGADIFRANNAGQTALMRAVMFTNNH",
+       "DLRTFPELFESFSGSVINIDRTDRTVFHYVIDIALTKGKVPAARYYLETILSQLSEYPKELIDILNFQDE",
+       "DGETALTLAARCRSKKLVKILLDHGANPKTANRDGKSAEDYILEDDKFRALSPTPCSSGPIRQLDQNSPG",
+       "GTSNRSDFVDLVDPVPIDSNLIPQRSPNASPPHYSETGQRVTKQLLPEVTSMIELLATTFDTELQDKERD",
+       "LDHAVGLLSNIEKEYLEGQRKILNYERMLSDFGEKKLALGDLEKELNDKLGKRYRFGWEKYVRDEEERAR",
+       "RITEQRSKYLQELSIEDRKLLDSSNLRFADPSKQEVLMKLQADERENSDLLNLIRTNSTDVESECDLLRE",
+       "SVQKLSEERERLFKEFINLSSENTGGENEEDDGANHTSANTSRLNNYRKLISLGCGGIGLDEVDEVIESL",
+       "NEGIDVNELNDNGFLTEQDEELGNHQNYHNIHTQGR"]
+  },
+  { "name" : "MBP1_USTMA",
+    "RefSeqID" : "XP_011392621",
+    "UniProtID" : "A0A0D1DP35",
+    "taxonomyID" : 237631,
+    "sequence" : [
+       "MSGDKTIFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRVLEREIQKGIHEKVQG",
+       "GYGKYQGTWIPLDVAIELAERYNIQGLLQPITSYVPSAADSPPPAPKHTISTSNRSKKIIPADPGALGRS",
+       "RRATSIETESEVIGAAPNNVSEGSMSPSPSDISSSSRTPSPLPADRAHPLHANHALAGYNGRDANNHARY",
+       "ADIILDYFVTENTTVPSLLINPPPDFNPDMSIDDDEHTALHWACAMGRIRVVKLLLSAGADIFRVNSNQQ",
+       "TALMRATMFSNNYDLRKFPELFELLHRSILNIDRNDRTVFHHVVDLALSRGKPHAARYYMETMINRLADY",
+       "GDQLADILNFQDDEGETPLTMAARARSKRLVRLLLEHGADPKIRNKEGKNAEDYIIEDERFRSSPSRTGP",
+       "AGIELGADGLPVLPTSSLHTSEAGQRTAGRAVTLMSNLLHSLADSYDSEINTAEKKLTQAHGLLKQIQTE",
+       "IEDSAKVAEALHHEAQGVDEERKRVDSLQLALKHAINKRARDDLERRWSEGKQAIKRARLQAGLEPGALS",
+       "TSNATNAPATGDQKSKDDAKSLIEALPAGTNVKTAIAELRKQLSQVQANKTELVDKFVARAREQGTGRTM",
+       "AAYRRLIAAGCGGIAPDEVDAVVGVLCELLQESHTGARAGAGGERDDRARDVAMMLKGAGAAALAANAGA",
+       "P"]
+  },
+  { "name" : "MBP1_WALME",
+    "RefSeqID" : "XP_006957051",
+    "UniProtID" : "I4YGC0",
+    "taxonomyID" : 671144,
+    "sequence" : [
+       "MSAPPIYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRILEREVQKGTHEKVQGG",
+       "YGKYQGTWIPMERSVELARQYRIELLLDPIINYLPGPQSPPLAPKHATNVGSRARKSTAPAAQTLPSTSK",
+       "VFHPLSSTKHPAKLAAATNAKAEISDGEDASIPSSPSFKSNSSRTPSPIRINARKRKLEDEATIPSSAID",
+       "GSISYEDIILDYFISESTQIPALLIHPPSDFNPNMSIDDEGHTAMHWACAMGKVRVVKLLLSAGADIFRV",
+       "NHSEQTALMRSVMFSNNYDIRKFPQLYELLHRSTLNLDKHDRTVLHHIVDLALTKSKTHAARYYMECVLS",
+       "KLANYPDELADVINFQDDEGESALTLAARARSKRLVKLLLEHGADSKLPNKDGKTAEDYILEDERFRQSP",
+       "LLNSNHLRLHPPDTSIYAPPAHLFNSETSQNIANTSMSSVANLLESLAQSYDKEITQKERDYQQAQVILR",
+       "NIKTDIVEAKSNIEKMTIDSSEFEHLKHKLRELEMKLEEHSNDVYNKGWEEYSRNVDDPAIDAPSDNVQE",
+       "ECASLRNKIKDLQEKRISSMQELIKRQKEVGTGKKMSEYRKLISVGCGIPTTEIDAVLEMLLESLESENA",
+       "NKKAALASGISGALSSTSSAPSQATTSAPTGVATPGAPVPASSEKAGLLPPAPVMQ"]
+  }
+]
--- a/data/refTaxonomy.json
+++ b/data/refTaxonomy.json
@ -1,22 +1,22 @@
-[
-  { "ID" : 227321,
-    "species" : "Aspergillus nidulans FGSC A4"},
-  { "ID" : 930090,
-    "species" : "Bipolaris oryzae ATCC 44560"},
-  { "ID" : 240176,
-    "species" : "Coprinopsis cinerea okayama7#130"},
-  { "ID" : 214684,
-    "species" : "Cryptococcus neoformans var. neoformans JEC21"},
-  { "ID" : 367110,
-    "species" : "Neurospora crassa OR74A"},
-  { "ID" : 418459,
-    "species" : "Puccinia graminis f. sp. tritici CRL 75-36-700-3"},
-  { "ID" : 559292,
-    "species" : "Saccharomyces cerevisiae S288C"},
-  { "ID" : 284812,
-    "species" : "Schizosaccharomyces pombe 972h-"},
-  { "ID" : 237631,
-    "species" : "Ustilago maydis 521"},
-  { "ID" : 671144,
-    "species" : "Wallemia mellicola CBS 633.66"}
-]
+[
+  { "ID" : 227321,
+    "species" : "Aspergillus nidulans FGSC A4"},
+  { "ID" : 930090,
+    "species" : "Bipolaris oryzae ATCC 44560"},
+  { "ID" : 240176,
+    "species" : "Coprinopsis cinerea okayama7#130"},
+  { "ID" : 214684,
+    "species" : "Cryptococcus neoformans var. neoformans JEC21"},
+  { "ID" : 367110,
+    "species" : "Neurospora crassa OR74A"},
+  { "ID" : 418459,
+    "species" : "Puccinia graminis f. sp. tritici CRL 75-36-700-3"},
+  { "ID" : 559292,
+    "species" : "Saccharomyces cerevisiae S288C"},
+  { "ID" : 284812,
+    "species" : "Schizosaccharomyces pombe 972h-"},
+  { "ID" : 237631,
+    "species" : "Ustilago maydis 521"},
+  { "ID" : 671144,
+    "species" : "Wallemia mellicola CBS 633.66"}
+]
--- a/data/referenceDomainAnnotations.txt
+++ b/data/referenceDomainAnnotations.txt
@ -1,115 +1,115 @@
-ID	protein.ID	feature.ID	start	end	note
-# MBP1_SACCE
-NA	ref_pro_4	ref_ftr_1	4	102	APSES fold
-NA	ref_pro_4	ref_ftr_2	22	105	KilA-N
-NA	ref_pro_4	ref_ftr_4	108	122	low complexity
-NA	ref_pro_4	ref_ftr_4	236	241	low complexity
-NA	ref_pro_4	ref_ftr_4	279	307	low complexity
-NA	ref_pro_4	ref_ftr_4	700	717	low complexity
-NA	ref_pro_4	ref_ftr_4	700	717	low complexity
-NA	ref_pro_4	ref_ftr_5	394	423	Ankyrin
-NA	ref_pro_4	ref_ftr_5	427	463	Ankyrin
-NA	ref_pro_4	ref_ftr_5	512	541	Ankyrin
-NA	ref_pro_4	ref_ftr_6	381	547	Swi6 fold
-NA	ref_pro_4	ref_ftr_7	633	655	coiled coil
-# MBP1_ASPNI
-NA	ref_pro_1	ref_ftr_1	9	106	APSES fold
-NA	ref_pro_1	ref_ftr_2	26	109	KilA-N
-NA	ref_pro_1	ref_ftr_4	529	534	low complexity
-NA	ref_pro_1	ref_ftr_5	260	289	Ankyrin
-NA	ref_pro_1	ref_ftr_5	381	413	Ankyrin
-NA	ref_pro_1	ref_ftr_6	193	402	Swi6 fold
-NA	ref_pro_1	ref_ftr_7	509	572	coiled coil
-# MBP1_BIPOR
-NA	ref_pro_2	ref_ftr_1	8	106	APSES fold
-NA	ref_pro_2	ref_ftr_2	26	109	KilA-N
-NA	ref_pro_2	ref_ftr_4	134	152	low complexity
-NA	ref_pro_2	ref_ftr_4	267	278	low complexity
-NA	ref_pro_2	ref_ftr_4	670	685	low complexity
-NA	ref_pro_2	ref_ftr_5	266	295	Ankyrin
-NA	ref_pro_2	ref_ftr_5	387	416	Ankyrin
-NA	ref_pro_2	ref_ftr_6	253	421	Swi6 fold
-NA	ref_pro_2	ref_ftr_7	659	681	coiled coil
-NA	ref_pro_2	ref_ftr_7	500	590	coiled coil
-# MBP1_NEUCR
-NA	ref_pro_3	ref_ftr_1	14	114	APSES fold
-NA	ref_pro_3	ref_ftr_2	34	117	KilA-N
-NA	ref_pro_3	ref_ftr_4	130	141	low complexity
-NA	ref_pro_3	ref_ftr_4	253	266	low complexity
-NA	ref_pro_3	ref_ftr_4	514	525	low complexity
-NA	ref_pro_3	ref_ftr_4	554	564	low complexity
-NA	ref_pro_3	ref_ftr_4	601	618	low complexity
-NA	ref_pro_3	ref_ftr_4	620	629	low complexity
-NA	ref_pro_3	ref_ftr_4	636	652	low complexity
-NA	ref_pro_3	ref_ftr_4	658	672	low complexity
-NA	ref_pro_3	ref_ftr_4	725	735	low complexity
-NA	ref_pro_3	ref_ftr_4	752	771	low complexity
-NA	ref_pro_3	ref_ftr_5	268	297	Ankyrin
-NA	ref_pro_3	ref_ftr_5	390	419	Ankyrin
-NA	ref_pro_3	ref_ftr_6	270	426	Swi6 fold
-NA	ref_pro_3	ref_ftr_7	500	550	coiled coil
-# MBP1_SCHPO
-NA	ref_pro_5	ref_ftr_1	8	104	APSES fold
-NA	ref_pro_5	ref_ftr_2	25	113	KilA-N
-NA	ref_pro_5	ref_ftr_4	111	125	low complexity
-NA	ref_pro_5	ref_ftr_4	136	145	low complexity
-NA	ref_pro_5	ref_ftr_4	176	191	low complexity
-NA	ref_pro_5	ref_ftr_4	422	447	low complexity
-NA	ref_pro_5	ref_ftr_5	247	276	Ankyrin
-NA	ref_pro_5	ref_ftr_5	368	397	Ankyrin
-NA	ref_pro_5	ref_ftr_6	234	400	Swi6 fold
-NA	ref_pro_5	ref_ftr_7	457	538	coiled coil
-# MBP1_COPCI
-NA	ref_pro_6	ref_ftr_1	5	103	APSES fold
-NA	ref_pro_6	ref_ftr_2	23	106	KilA-N
-NA	ref_pro_6	ref_ftr_4	170	191	low complexity
-NA	ref_pro_6	ref_ftr_4	435	450	low complexity
-NA	ref_pro_6	ref_ftr_4	611	626	low complexity
-NA	ref_pro_6	ref_ftr_5	270	299	Ankyrin
-NA	ref_pro_6	ref_ftr_5	389	418	Ankyrin
-NA	ref_pro_6	ref_ftr_5	474	509	Ankyrin
-NA	ref_pro_6	ref_ftr_6	257	429	Swi6 fold
-NA	ref_pro_6	ref_ftr_7	500	570	coiled coil
-NA	ref_pro_6	ref_ftr_7	651	678	coiled coil
-# MBP1_CRYNE
-NA	ref_pro_7	ref_ftr_1	113	211	APSES fold
-NA	ref_pro_7	ref_ftr_2	131	215	KilA-N
-NA	ref_pro_7	ref_ftr_4	66	85	low complexity
-NA	ref_pro_7	ref_ftr_4	413	423	low complexity
-NA	ref_pro_7	ref_ftr_4	633	644	low complexity
-NA	ref_pro_7	ref_ftr_4	697	709	low complexity
-NA	ref_pro_7	ref_ftr_5	477	506	Ankyrin
-NA	ref_pro_7	ref_ftr_5	618	647	Ankyrin
-NA	ref_pro_7	ref_ftr_6	452	663	Swi6 fold
-# MBP1_PUCGR
-NA	ref_pro_8	ref_ftr_1	90	187	APSES fold
-NA	ref_pro_8	ref_ftr_2	107	190	KilA-N
-NA	ref_pro_8	ref_ftr_4	208	227	low complexity
-NA	ref_pro_8	ref_ftr_4	273	291	low complexity
-NA	ref_pro_8	ref_ftr_5	442	271	Ankyrin
-NA	ref_pro_8	ref_ftr_5	475	509	Ankyrin
-NA	ref_pro_8	ref_ftr_5	561	590	Ankyrin
-NA	ref_pro_8	ref_ftr_6	429	601	Swi6 fold
-NA	ref_pro_8	ref_ftr_7	827	863	coiled coil
-# MBP1_USTMA
-NA	ref_pro_9	ref_ftr_1	7	104	APSES fold
-NA	ref_pro_9	ref_ftr_2	24	107	KilA-N
-NA	ref_pro_9	ref_ftr_4	106	116	low complexity
-NA	ref_pro_9	ref_ftr_4	161	183	low complexity
-NA	ref_pro_9	ref_ftr_4	657	672	low complexity
-NA	ref_pro_9	ref_ftr_4	776	796	low complexity
-NA	ref_pro_9	ref_ftr_5	245	274	Ankyrin
-NA	ref_pro_9	ref_ftr_5	355	384	Ankyrin
-NA	ref_pro_9	ref_ftr_6	232	395	Swi6 fold
-NA	ref_pro_9	ref_ftr_7	581	609	coiled coil
-# MBP1_WALME
-NA	ref_pro_10	ref_ftr_1	6	103	APSES fold
-NA	ref_pro_10	ref_ftr_2	23	106	KilA-N
-NA	ref_pro_10	ref_ftr_4	149	162	low complexity
-NA	ref_pro_10	ref_ftr_4	171	188	low complexity
-NA	ref_pro_10	ref_ftr_4	618	628	low complexity
-NA	ref_pro_10	ref_ftr_4	634	660	low complexity
-NA	ref_pro_10	ref_ftr_5	250	279	Ankyrin
-NA	ref_pro_10	ref_ftr_5	369	398	Ankyrin
-NA	ref_pro_10	ref_ftr_6	237	409	Swi6 fold
-NA	ref_pro_10	ref_ftr_7	461	585	coiled coil
+ID	protein.ID	feature.ID	start	end	note
+# MBP1_SACCE
+NA	ref_pro_4	ref_ftr_1	4	102	APSES fold
+NA	ref_pro_4	ref_ftr_2	22	105	KilA-N
+NA	ref_pro_4	ref_ftr_4	108	122	low complexity
+NA	ref_pro_4	ref_ftr_4	236	241	low complexity
+NA	ref_pro_4	ref_ftr_4	279	307	low complexity
+NA	ref_pro_4	ref_ftr_4	700	717	low complexity
+NA	ref_pro_4	ref_ftr_4	700	717	low complexity
+NA	ref_pro_4	ref_ftr_5	394	423	Ankyrin
+NA	ref_pro_4	ref_ftr_5	427	463	Ankyrin
+NA	ref_pro_4	ref_ftr_5	512	541	Ankyrin
+NA	ref_pro_4	ref_ftr_6	381	547	Swi6 fold
+NA	ref_pro_4	ref_ftr_7	633	655	coiled coil
+# MBP1_ASPNI
+NA	ref_pro_1	ref_ftr_1	9	106	APSES fold
+NA	ref_pro_1	ref_ftr_2	26	109	KilA-N
+NA	ref_pro_1	ref_ftr_4	529	534	low complexity
+NA	ref_pro_1	ref_ftr_5	260	289	Ankyrin
+NA	ref_pro_1	ref_ftr_5	381	413	Ankyrin
+NA	ref_pro_1	ref_ftr_6	193	402	Swi6 fold
+NA	ref_pro_1	ref_ftr_7	509	572	coiled coil
+# MBP1_BIPOR
+NA	ref_pro_2	ref_ftr_1	8	106	APSES fold
+NA	ref_pro_2	ref_ftr_2	26	109	KilA-N
+NA	ref_pro_2	ref_ftr_4	134	152	low complexity
+NA	ref_pro_2	ref_ftr_4	267	278	low complexity
+NA	ref_pro_2	ref_ftr_4	670	685	low complexity
+NA	ref_pro_2	ref_ftr_5	266	295	Ankyrin
+NA	ref_pro_2	ref_ftr_5	387	416	Ankyrin
+NA	ref_pro_2	ref_ftr_6	253	421	Swi6 fold
+NA	ref_pro_2	ref_ftr_7	659	681	coiled coil
+NA	ref_pro_2	ref_ftr_7	500	590	coiled coil
+# MBP1_NEUCR
+NA	ref_pro_3	ref_ftr_1	14	114	APSES fold
+NA	ref_pro_3	ref_ftr_2	34	117	KilA-N
+NA	ref_pro_3	ref_ftr_4	130	141	low complexity
+NA	ref_pro_3	ref_ftr_4	253	266	low complexity
+NA	ref_pro_3	ref_ftr_4	514	525	low complexity
+NA	ref_pro_3	ref_ftr_4	554	564	low complexity
+NA	ref_pro_3	ref_ftr_4	601	618	low complexity
+NA	ref_pro_3	ref_ftr_4	620	629	low complexity
+NA	ref_pro_3	ref_ftr_4	636	652	low complexity
+NA	ref_pro_3	ref_ftr_4	658	672	low complexity
+NA	ref_pro_3	ref_ftr_4	725	735	low complexity
+NA	ref_pro_3	ref_ftr_4	752	771	low complexity
+NA	ref_pro_3	ref_ftr_5	268	297	Ankyrin
+NA	ref_pro_3	ref_ftr_5	390	419	Ankyrin
+NA	ref_pro_3	ref_ftr_6	270	426	Swi6 fold
+NA	ref_pro_3	ref_ftr_7	500	550	coiled coil
+# MBP1_SCHPO
+NA	ref_pro_5	ref_ftr_1	8	104	APSES fold
+NA	ref_pro_5	ref_ftr_2	25	113	KilA-N
+NA	ref_pro_5	ref_ftr_4	111	125	low complexity
+NA	ref_pro_5	ref_ftr_4	136	145	low complexity
+NA	ref_pro_5	ref_ftr_4	176	191	low complexity
+NA	ref_pro_5	ref_ftr_4	422	447	low complexity
+NA	ref_pro_5	ref_ftr_5	247	276	Ankyrin
+NA	ref_pro_5	ref_ftr_5	368	397	Ankyrin
+NA	ref_pro_5	ref_ftr_6	234	400	Swi6 fold
+NA	ref_pro_5	ref_ftr_7	457	538	coiled coil
+# MBP1_COPCI
+NA	ref_pro_6	ref_ftr_1	5	103	APSES fold
+NA	ref_pro_6	ref_ftr_2	23	106	KilA-N
+NA	ref_pro_6	ref_ftr_4	170	191	low complexity
+NA	ref_pro_6	ref_ftr_4	435	450	low complexity
+NA	ref_pro_6	ref_ftr_4	611	626	low complexity
+NA	ref_pro_6	ref_ftr_5	270	299	Ankyrin
+NA	ref_pro_6	ref_ftr_5	389	418	Ankyrin
+NA	ref_pro_6	ref_ftr_5	474	509	Ankyrin
+NA	ref_pro_6	ref_ftr_6	257	429	Swi6 fold
+NA	ref_pro_6	ref_ftr_7	500	570	coiled coil
+NA	ref_pro_6	ref_ftr_7	651	678	coiled coil
+# MBP1_CRYNE
+NA	ref_pro_7	ref_ftr_1	113	211	APSES fold
+NA	ref_pro_7	ref_ftr_2	131	215	KilA-N
+NA	ref_pro_7	ref_ftr_4	66	85	low complexity
+NA	ref_pro_7	ref_ftr_4	413	423	low complexity
+NA	ref_pro_7	ref_ftr_4	633	644	low complexity
+NA	ref_pro_7	ref_ftr_4	697	709	low complexity
+NA	ref_pro_7	ref_ftr_5	477	506	Ankyrin
+NA	ref_pro_7	ref_ftr_5	618	647	Ankyrin
+NA	ref_pro_7	ref_ftr_6	452	663	Swi6 fold
+# MBP1_PUCGR
+NA	ref_pro_8	ref_ftr_1	90	187	APSES fold
+NA	ref_pro_8	ref_ftr_2	107	190	KilA-N
+NA	ref_pro_8	ref_ftr_4	208	227	low complexity
+NA	ref_pro_8	ref_ftr_4	273	291	low complexity
+NA	ref_pro_8	ref_ftr_5	442	471	Ankyrin
+NA	ref_pro_8	ref_ftr_5	475	509	Ankyrin
+NA	ref_pro_8	ref_ftr_5	561	590	Ankyrin
+NA	ref_pro_8	ref_ftr_6	429	601	Swi6 fold
+NA	ref_pro_8	ref_ftr_7	827	863	coiled coil
+# MBP1_USTMA
+NA	ref_pro_9	ref_ftr_1	7	104	APSES fold
+NA	ref_pro_9	ref_ftr_2	24	107	KilA-N
+NA	ref_pro_9	ref_ftr_4	106	116	low complexity
+NA	ref_pro_9	ref_ftr_4	161	183	low complexity
+NA	ref_pro_9	ref_ftr_4	657	672	low complexity
+NA	ref_pro_9	ref_ftr_4	776	796	low complexity
+NA	ref_pro_9	ref_ftr_5	245	274	Ankyrin
+NA	ref_pro_9	ref_ftr_5	355	384	Ankyrin
+NA	ref_pro_9	ref_ftr_6	232	395	Swi6 fold
+NA	ref_pro_9	ref_ftr_7	581	609	coiled coil
+# MBP1_WALME
+NA	ref_pro_10	ref_ftr_1	6	103	APSES fold
+NA	ref_pro_10	ref_ftr_2	23	106	KilA-N
+NA	ref_pro_10	ref_ftr_4	149	162	low complexity
+NA	ref_pro_10	ref_ftr_4	171	188	low complexity
+NA	ref_pro_10	ref_ftr_4	618	628	low complexity
+NA	ref_pro_10	ref_ftr_4	634	660	low complexity
+NA	ref_pro_10	ref_ftr_5	250	279	Ankyrin
+NA	ref_pro_10	ref_ftr_5	369	398	Ankyrin
+NA	ref_pro_10	ref_ftr_6	237	409	Swi6 fold
+NA	ref_pro_10	ref_ftr_7	461	585	coiled coil
--- a/functionTemplate.R
+++ b/functionTemplate.R
@ -1,37 +1,37 @@
-# functionTemplate.R
-#
-# Purpose:  (General)
-#
-# ToDo:
-# Notes:
-#
-# ==============================================================================
-
-myFunction <- function(a, b=1) {
-	# Purpose:
-	#     Describe ...
-    # Version:
-    # Date:
-    # Author:
-    #
-    # Parameters:
-	#     a: ...
-	#     b: ...
-	# Value:
-	#     result: ...
-	# Example: <example invocation>
-
-	# code ...
-
-	return(result)
-}
-
-
-# ====  TESTS  =================================================================
-# Enter your function tests here...
-
-if (FALSE) {
-  # test ...
-}
-
-# [END]
+# functionTemplate.R
+#
+# Purpose:  (General)
+#
+# ToDo:
+# Notes:
+#     This file is a skeleton: fill in the header fields and the function
+#     body. As written, calling myFunction() fails because "result" is never
+#     assigned.
+#
+# ==============================================================================
+
+myFunction <- function(a, b=1) {
+	# Purpose:
+	#     Describe ...
+    # Version:
+    # Date:
+    # Author:
+    #
+    # Parameters:
+	#     a: ...
+	#     b: ... (optional; defaults to 1)
+	# Value:
+	#     result: ...
+	# Example: <example invocation>
+
+	# code ... (must assign "result", which is returned below)
+
+	return(result)
+}
+
+
+# ====  TESTS  =================================================================
+# Enter your function tests here...
+
+# The if (FALSE) guard means the block below is never executed when this file
+# is sourced.
+if (FALSE) {
+  # test ...
+}
+
+# [END]
--- a/myScripts/.myProfile.R
+++ b/myScripts/.myProfile.R
@ -1,21 +1,21 @@
-# .myProfile.R
-# This contains information which the course framework needs from time to time
-# to personalize assignments, validate submissions etc. Make sure that
-# the information correctly matches our official records.
-# myEmail          char      A string with your eMail address. Use your official
-#                            UofT eMail address.
-# myStudentNumber  numeric   Your UofT student number. Take care to have this
-#                            correct.
-#
-# NOTE:
-# After you have updated this script, move the file to your "myScripts" folder.
-# Utility scripts will look for it on the path: "./myScripts/.myProfile.R"
-#
-# ==============================================================================
-# options(stringsAsFactors = FALSE)
-
-myEMail <- "yh.deng@mail.utoronto.ca"        # e.g. "u.franklin@utoronto.ca"
-myStudentNumber <- 1005845285  # e.g. 1003141592
-MYSPE <- "Cutaneotrichosporon oleaginosum" 
-
-# [END]
+# .myProfile.R
+# This contains information which the course framework needs from time to time
+# to personalize assignments, validate submissions etc. Make sure that
+# the information correctly matches our official records.
+# myEMail          char      A string with your eMail address. Use your official
+#                            UofT eMail address. (The variable name is
+#                            case-sensitive: "myEMail".)
+# myStudentNumber  numeric   Your UofT student number. Take care to have this
+#                            correct.
+# MYSPE            char      A species name (binomial); presumably the species
+#                            assigned for the course work - confirm.
+#
+# NOTE:
+# After you have updated this script, move the file to your "myScripts" folder.
+# Utility scripts will look for it on the path: "./myScripts/.myProfile.R"
+#
+# ==============================================================================
+# options(stringsAsFactors = FALSE)
+
+myEMail <- "yh.deng@mail.utoronto.ca"        # e.g. "u.franklin@utoronto.ca"
+myStudentNumber <- 1005845285  # e.g. 1003141592
+MYSPE <- "Cutaneotrichosporon oleaginosum" 
+
+# [END]
--- a/myScripts/ABC-INT-Mutation_impact-code.R
+++ b/myScripts/ABC-INT-Mutation_impact-code.R
@ -1,54 +1,51 @@
-myFA <-             readFASTA("data/RAB39B_HSa_coding.fa")
-myFA <- rbind(myFA, readFASTA("data/PTPN5_HSa_coding.fa"))
-myFA <- rbind(myFA, readFASTA("data/PTPN11_HSa_coding.fa"))
-myFA <- rbind(myFA, readFASTA("data/KRAS_HSa_coding.fa"))
-rownames(myFA)<-c("RAB39B", "PTPN5", "PTPN11", "KRAS") # Assign row names
-
-gen_mutations <- function(seq, N) {
-  stats <- c()
-  stats <- cbind(stats, c(0, 0, 0))
-  rownames(stats) <- c("silent", "missense", "nonsense")
-  colnames(stats) <- c("occurrences")
-  # Actual function
-  for (i in 1:217) {
-    # select index for mutation
-    working_seq <- Biostrings::DNAString(seq)
-    aa_seq <- Biostrings::translate(working_seq, no.init.codon = TRUE)
-    mut_action <- sample(c("ins", "del", "sub"), 1, TRUE)
-    mut_seq <- Biostrings::DNAString(seq)
-    if (mut_action == "sub") {
-      mut_index <- sample(1:length(working_seq), 1, replace = TRUE)
-      possible_mutations <- Biostrings::DNA_BASES
-      possible_mutations <- possible_mutations[possible_mutations != as.character(unlist(working_seq[mut_index]))]
-      mut_change <- sample(possible_mutations, 1, replace = TRUE)
-      mut_seq <- Biostrings::replaceLetterAt(mut_seq, mut_index, mut_change)
-    } else if (mut_action == "ins") {
-      mut_index <- sample(1:length(working_seq) - 2, 1, replace = TRUE)
-      possible_mutations <- Biostrings::DNA_BASES
-      mut_seq <- Biostrings::DNAString(paste(substring(working_seq, 1, mut_index - 1), sample(possible_mutations, 1), substring(working_seq, mut_index), sep = ""))
-    } else {
-      mut_index <- sample(1:length(working_seq), 1, replace = TRUE)
-      mut_seq <- mut_seq[-mut_index]
-    }
-    mut_seq <- Biostrings::DNAString(substring(mut_seq, 1, length(mut_seq) - (length(mut_seq) %% 3)))
-    mut_aa <- Biostrings::translate(mut_seq, no.init.codon = TRUE)
-
-    # Note: we need silent, nonsense, and missense
-    mut_aa_stop <- match("*", Biostrings::as.matrix(mut_aa))
-    aa_seq_stop <- match("*", Biostrings::as.matrix(aa_seq))
-    if (!is.na(mut_aa_stop) & (is.na(aa_seq_stop) | mut_aa_stop < aa_seq_stop)) {
-      stats["nonsense", "occurrences"] <- 1 + stats["nonsense", "occurrences"]
-    } else if (mut_aa == aa_seq) {
-      stats["silent", "occurrences"] <- 1 + stats["silent", "occurrences"]
-    } else {
-      stats["missense", "occurrences"] <- 1 + stats["missense", "occurrences"]
-    }
-  }
-  return(stats)
-}
-N_test <- 1200
-gen_mutations("ATGATGATGATGATGATG", N_test)
-gen_mutations("CCCCCCCCCCCCCCCCCC", N_test)
-gen_mutations("TATTACTATTACTATTAC", N_test)
-gen_mutations("TGGTGGTGGTGGTGGTGGTGGTGG", N_test)
-gen_mutations("TGTTGTTGTTGTTGTTGTTGTTGT", N_test)
+# gen_mutations
+#
+# Purpose:
+#     Simulate N independent single-nucleotide substitutions in a coding
+#     sequence and count how each one changes the translated protein.
+# Parameters:
+#     seq: character; a non-empty DNA coding sequence, translated in frame
+#          from its first nucleotide (no.init.codon = TRUE).
+#     N:   number of point mutations to simulate (a single number >= 0).
+# Value:
+#     A 3 x 1 numeric matrix with rows "silent", "missense" and "nonsense" and
+#     the column "occurrences". The three counts sum to N.
+# Details:
+#     Every mutation is applied to the unmodified input sequence; mutations do
+#     not accumulate. A mutation counts as "nonsense" if it creates a stop
+#     codon where the original has none, or one that lies upstream of the
+#     original's first stop; as "silent" if the translation is unchanged; and
+#     as "missense" otherwise.
+gen_mutations <- function(seq, N) {
+  sealKey() # See: http://steipe.biochemistry.utoronto.ca/abc/index.php/BCH441_Code_submisson_instructions
+
+  effects <- c("silent", "missense", "nonsense")
+  stats <- matrix(0, nrow = length(effects), ncol = 1,
+                  dimnames = list(effects, "occurrences"))
+
+  # The original sequence, its translation and the position of its first stop
+  # codon do not change between iterations, so compute them once.
+  original_seq <- Biostrings::DNAString(seq)
+  n_bases <- length(original_seq)
+  # Guard the degenerate cases: 1:length(x) and 1:N count backwards when the
+  # length or N is 0 and would sample position 0 / run spurious iterations.
+  if (n_bases < 1) {
+    stop("'seq' must contain at least one nucleotide", call. = FALSE)
+  }
+  if (!is.numeric(N) || length(N) != 1 || is.na(N) || N < 0) {
+    stop("'N' must be a single non-negative number", call. = FALSE)
+  }
+  aa_seq <- Biostrings::translate(original_seq, no.init.codon = TRUE)
+  # regexpr() returns -1 when there is no match; as.integer() drops the match
+  # attributes so the value is a plain scalar in the comparisons below.
+  term_aa <- as.integer(regexpr(pattern = "\\*", aa_seq))
+
+  for (i in seq_len(N)) {
+    # Pick a position, then substitute a base that differs from the current one.
+    mut_index <- sample.int(n_bases, 1L)
+    current_base <- as.character(original_seq[mut_index])
+    alternatives <- setdiff(Biostrings::DNA_BASES, current_base)
+    mut_seq <- Biostrings::replaceLetterAt(original_seq, mut_index,
+                                           sample(alternatives, 1))
+    mut_aa <- Biostrings::translate(mut_seq, no.init.codon = TRUE)
+
+    term_mut_aa <- as.integer(regexpr(pattern = "\\*", mut_aa))
+    if (term_mut_aa != -1 && (term_aa == -1 || term_mut_aa < term_aa)) {
+      effect <- "nonsense"
+    } else if (mut_aa == aa_seq) {
+      effect <- "silent"
+    } else {
+      effect <- "missense"
+    }
+    stats[effect, "occurrences"] <- stats[effect, "occurrences"] + 1
+  }
+  sealKey()
+  return(stats)
+}
+
+# Trial runs on short synthetic sequences, each a single codon repeated
+# (ATG, CCC, alternating TAT/TAC, TGG, TGT).
+gen_mutations("ATGATGATGATGATGATG", 1000)
+gen_mutations("CCCCCCCCCCCCCCCCCC", 500)
+gen_mutations("TATTACTATTACTATTAC", 500)
+gen_mutations("TGGTGGTGGTGGTGGTGGTGGTGG", 500)
+gen_mutations("TGTTGTTGTTGTTGTTGTTGTTGT", 500)
+# Same TGT repeats, but the last codon is TGA (a stop codon in the standard
+# genetic code), so the unmutated translation already contains a "*".
+gen_mutations("TGTTGTTGTTGTTGTTGTTGTTGA", 500)
+
+
+# Read four coding-sequence FASTA files with readFASTA() and stack the results
+# row-wise, one row per gene.
+myFA <-             readFASTA("data/RAB39B_HSa_coding.fa")
+myFA <- rbind(myFA, readFASTA("data/PTPN5_HSa_coding.fa"))
+myFA <- rbind(myFA, readFASTA("data/PTPN11_HSa_coding.fa"))
+myFA <- rbind(myFA, readFASTA("data/KRAS_HSa_coding.fa"))
+rownames(myFA)<-c("RAB39B", "PTPN5", "PTPN11", "KRAS") # Assign row names
+
+# 10000 simulated point mutations per gene. Column 2 of myFA is passed as the
+# sequence (presumably the sequence column of readFASTA()'s result - confirm).
+gen_mutations(myFA["RAB39B", 2], 10000)
+gen_mutations(myFA["PTPN5", 2], 10000)
+gen_mutations(myFA["PTPN11", 2], 10000)
+gen_mutations(myFA["KRAS", 2], 10000)
--- a/myScripts/BIN-Storing_data.R
+++ b/myScripts/BIN-Storing_data.R
@ -1,41 +1,41 @@
-# ==   1.3  Task: submit for credit (part 1/2)  ================================
-# == Submission - Code to add another philosopher to the datamodel:
-
-pID <- autoincrement(philDB$person)
-immanuelKant <- data.frame(id = pID,
-                           name = "Immanuel Kant",
-                           born = "1724",
-                           died = "1804",
-                           school = "Enlightenment Philosophy")
-philDB$person <- rbind(philDB$person, immanuelKant)
-
-bID = autoincrement(philDB$books)
-immanuelKantWork <- data.frame(id = bID,
-                               title = "Critique of Pure Reason",
-                               published = "1781")
-philDB$books <- rbind(philDB$books, immanuelKantWork)
-philDB$works <- rbind(philDB$works, data.frame(id = autoincrement(philDB$works), personID = pID, bookID = bID))
-
-bID = autoincrement(philDB$books)
-immanuelKantWork <- data.frame(id = bID,
-                               title = "Critique of Judgement",
-                               published = "1790")
-philDB$books <- rbind(philDB$books, immanuelKantWork)
-philDB$works <- rbind(philDB$works, data.frame(id = autoincrement(philDB$works), personID = pID, bookID = bID))
-
-# == Submission: Code to list the philosophical schools in alphabetical order as well as their respective books in alphabetical order.
-
-schools <- unique(philDB$person$school)
-schools <- sort(schools)
-
-for (s in schools) {
-  cat(sprintf("%s\n", s))
-  authors = which(philDB$person$school == s)
-  for (author in authors) {
-    works = which(philDB$works$personID == author)
-    for (work in works) {
-      bookId = which(philDB$books$id == philDB$works$bookID[work])
-      cat(sprintf("\t%s - (%s)\n", philDB$books$title[bookId], philDB$books$published[bookId]))
-    }
-  }
+# ==   1.3  Task: submit for credit (part 1/2)  ================================
+# == Submission - Code to add another philosopher to the datamodel:
+
+# Person row: the id comes from autoincrement() on the person table.
+pID <- autoincrement(philDB$person)
+immanuelKant <- data.frame(
+  id = pID,
+  name = "Immanuel Kant",
+  born = "1724",
+  died = "1804",
+  school = "Enlightenment Philosophy"
+)
+philDB$person <- rbind(philDB$person, immanuelKant)
+
+# First book, followed by the works row that links it to the new person.
+bID <- autoincrement(philDB$books)
+immanuelKantWork <- data.frame(
+  id = bID,
+  title = "Critique of Pure Reason",
+  published = "1781"
+)
+philDB$books <- rbind(philDB$books, immanuelKantWork)
+philDB$works <- rbind(
+  philDB$works,
+  data.frame(id = autoincrement(philDB$works), personID = pID, bookID = bID)
+)
+
+# Second book: same steps, with a fresh book id.
+bID <- autoincrement(philDB$books)
+immanuelKantWork <- data.frame(
+  id = bID,
+  title = "Critique of Judgement",
+  published = "1790"
+)
+philDB$books <- rbind(philDB$books, immanuelKantWork)
+philDB$works <- rbind(
+  philDB$works,
+  data.frame(id = autoincrement(philDB$works), personID = pID, bookID = bID)
+)
+
+# == Submission: Code to list the philosophical schools in alphabetical order as well as their respective books in alphabetical order.
+
+# For each school (sorted by name), print the school, then one tab-indented
+# line "title - (year)" per book written by a member of that school, sorted
+# by title.
+schools <- sort(unique(philDB$person$school))
+
+for (s in schools) {
+  cat(sprintf("%s\n", s))
+
+  # Follow the keys, not the row positions: person$id -> works$personID,
+  # works$bookID -> books$id. Row numbers only coincide with ids as long as
+  # no row was ever deleted or reordered.
+  personIDs <- philDB$person$id[philDB$person$school %in% s]
+  bookIDs <- philDB$works$bookID[philDB$works$personID %in% personIDs]
+  # %in% selects each book row at most once, so a book shared by two authors
+  # of the same school is listed a single time.
+  books <- philDB$books[philDB$books$id %in% bookIDs, , drop = FALSE]
+  books <- books[order(as.character(books$title)), , drop = FALSE]
+
+  # sprintf() is vectorized; a school without books prints nothing here.
+  cat(sprintf("\t%s - (%s)\n", books$title, books$published), sep = "")
 }
--- a/myScripts/CUTOLTaxonomy.json
+++ b/myScripts/CUTOLTaxonomy.json
@ -1,4 +1,4 @@
-[{
-	"ID": 879819,
-	"species": "Cutaneotrichosporon oleaginosum"}
-]
+[{
+	"ID": 879819,
+	"species": "Cutaneotrichosporon oleaginosum"}
+]
--- a/myScripts/MBP1_CUTOL.json
+++ b/myScripts/MBP1_CUTOL.json
@ -1,19 +1,19 @@
-[
-  { "name" : "MBP1_CUTOL",
-    "RefSeqID" : "XP_018278493.1",
-    "UniProtID" : "A0A0J0XLN0",
-    "taxonomyID" : 879819,
-    "sequence" : [
-       "MGKKAAAAGDGGPNTIYKATYSGVPVFEFICRNVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREVQ",
-       "KGEHEKVQGGYGKYQGTWVPIERGLALAKQYNVEDLLRPIIDFVPRESVSPPPAPKHAVAPPTKRNKEPK",
-       "PKEGLVPIKSAGVLSGTGRHQTPDSVGEDVESEVMDDMSESQTPSPLNGTSLLPAVDERSIDGMDIDGFS",
-       "MMNGGGHARKRSAAMMDDEDEYEQLKRARGNSAVHTPPPPGQSPRYGGMQHPLTQDEYNDIVLNYFVSEA",
-       "TQIPAVMTNPPYNWDPNGIIDDDHHTALHWAAAMGRTRVIKLLLSAGARIFDKNNLDQTPLMRSVMFTNN",
-       "YDLRKFPEVFELLHRSTLNIDKNNRTVFHHIANLALYKGKTHAARYYMEVILSRLADYPQELADVINFAD",
-       "EDGETALTLAARARSKRIVKALLDHGADPKLRNRDHKSAEDYILEDERFRSSPDVMLNRTQPSAAPRNPT",
-       "SLGAAVFSQGLPPQLYNSEAARLASGPHSSDILQQMQALARSFEAEKLNKERDVLEAKAMLTSIHTEVND",
-       "AGRTLHNLGEQMKPLEAKQGELDGLVERLQSKLQKDLARGARKWKAADEGRENRWKNGDDPSQAGEDYSD",
-       "LPELTAIPDNAEAEEERLRGEIEKMRARRGELVTRLVKAQTQTGTTDKMAQYRRLITAGCGGDINPGEID",
-       "DIVGQLLDMLENEAQSGRPAPPPQAAPSWVTS"]
-  }
-]
+[
+  { "name" : "MBP1_CUTOL",
+    "RefSeqID" : "XP_018278493.1",
+    "UniProtID" : "A0A0J0XLN0",
+    "taxonomyID" : 879819,
+    "sequence" : [
+       "MGKKAAAAGDGGPNTIYKATYSGVPVFEFICRNVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREVQ",
+       "KGEHEKVQGGYGKYQGTWVPIERGLALAKQYNVEDLLRPIIDFVPRESVSPPPAPKHAVAPPTKRNKEPK",
+       "PKEGLVPIKSAGVLSGTGRHQTPDSVGEDVESEVMDDMSESQTPSPLNGTSLLPAVDERSIDGMDIDGFS",
+       "MMNGGGHARKRSAAMMDDEDEYEQLKRARGNSAVHTPPPPGQSPRYGGMQHPLTQDEYNDIVLNYFVSEA",
+       "TQIPAVMTNPPYNWDPNGIIDDDHHTALHWAAAMGRTRVIKLLLSAGARIFDKNNLDQTPLMRSVMFTNN",
+       "YDLRKFPEVFELLHRSTLNIDKNNRTVFHHIANLALYKGKTHAARYYMEVILSRLADYPQELADVINFAD",
+       "EDGETALTLAARARSKRIVKALLDHGADPKLRNRDHKSAEDYILEDERFRSSPDVMLNRTQPSAAPRNPT",
+       "SLGAAVFSQGLPPQLYNSEAARLASGPHSSDILQQMQALARSFEAEKLNKERDVLEAKAMLTSIHTEVND",
+       "AGRTLHNLGEQMKPLEAKQGELDGLVERLQSKLQKDLARGARKWKAADEGRENRWKNGDDPSQAGEDYSD",
+       "LPELTAIPDNAEAEEERLRGEIEKMRARRGELVTRLVKAQTQTGTTDKMAQYRRLITAGCGGDINPGEID",
+       "DIVGQLLDMLENEAQSGRPAPPPQAAPSWVTS"]
+  }
+]
--- a/myScripts/README-myScripts.txt
+++ b/myScripts/README-myScripts.txt
@ -1,8 +1,8 @@
-README - myScripts folder:
-==========================
-
-The "myScripts" folder is a place to keep your personal files
-safe. No files will be submitted into this folder on the GitHub, master
-copy. Thefore everything you put into this folder is safe from being
-inadvertently overwritten by a file with the same name that would be
-downloaded in a GitHub "pull" request.
+README - myScripts folder:
+==========================
+
+The "myScripts" folder is a place to keep your personal files
+safe. No files will be submitted into this folder on the GitHub master
+copy. Therefore everything you put into this folder is safe from being
+inadvertently overwritten by a file with the same name that would be
+downloaded in a GitHub "pull" request.
--- a/myScripts/makeProteinDB.R
+++ b/myScripts/makeProteinDB.R
@ -1,4 +1,4 @@
-source("./scripts/ABC-createRefDB.R")
-
-myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./myScripts/MBP1_CUTOL.json"))
-myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./myScripts/CUTOLtaxonomy.json"))
+source("./scripts/ABC-createRefDB.R")
+
+myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./myScripts/MBP1_CUTOL.json"))
+myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./myScripts/CUTOLtaxonomy.json"))
--- a/myScripts/myScript.R
+++ b/myScripts/myScript.R
@ -1,38 +1,38 @@
-# myScript.R
-#
-# --- As you work with this file, you can delete the instructions below --------
-# Write your notes and code experiments into this document. Save it
-# from time to time - however I recommend that you do not _commit_
-# your saved version.
-#
-# As long as you do not _commit_ this script to version control,
-# you can _pull_ updated versions of the entire project from GitHub
-# by using the RStudio version control interface. However, once
-# you _commit_ any file in your local version, RStudio will require
-# you to resolve conflicts before you can _pull_ updates.
-# --- As you work with this file, you can delete the instructions above --------
-#
-## Purpose: <...>
-#
-# Version: <...>
-#
-# Date:    <...>
-# Author:  <Name> (<namee@mail.utoronto.ca>)
-#
-# Versions:
-#
-#   <number>    <Features>
-#
-# TODO:
-#   <...>
-#
-# ====================================================================
-
-
-
-
-
-
-
-# [END]
-
+# myScript.R
+#
+# --- As you work with this file, you can delete the instructions below --------
+# Write your notes and code experiments into this document. Save it
+# from time to time - however I recommend that you do not _commit_
+# your saved version.
+#
+# As long as you do not _commit_ this script to version control,
+# you can _pull_ updated versions of the entire project from GitHub
+# by using the RStudio version control interface. However, once
+# you _commit_ any file in your local version, RStudio will require
+# you to resolve conflicts before you can _pull_ updates.
+# --- As you work with this file, you can delete the instructions above --------
+#
+## Purpose: <...>
+#
+# Version: <...>
+#
+# Date:    <...>
+# Author:  <Name> (<namee@mail.utoronto.ca>)
+#
+# Versions:
+#
+#   <number>    <Features>
+#
+# TODO:
+#   <...>
+#
+# ====================================================================
+
+
+
+
+
+
+
+# [END]
+
--- a/plottingReference.R
+++ b/plottingReference.R
--- a/scriptTemplate.R
+++ b/scriptTemplate.R
@ -1,75 +1,75 @@
-# scriptTemplate.R
-#
-# Purpose:
-# Version:
-# Date:
-# Author:
-#
-# Input:
-# Output:
-# Dependencies:
-#
-# ToDo:
-# Notes:
-#
-# ==============================================================================
-
-setwd("<your/project/directory>")
-
-# ====  PARAMETERS  ============================================================
-# Define and explain all parameters. No "magic numbers" in your code below.
-
-
-
-# ====  PACKAGES  ==============================================================
-# Check that required packages have been installed. Install if needed.
-
-if (! requireNamespace("seqinr", quietly=TRUE)) {
-  install.packages("seqinr")
-}
-# Package information:
-#  library(help = seqinr)       # basic information
-#  browseVignettes("seqinr")    # available vignettes
-#  data(package = "seqinr")     # available datasets
-
-# Note: use package functions with the :: operator - eg.
-# seqinr::aaa("K")
-
-
-
-# ====  FUNCTIONS  =============================================================
-
-# Define functions or source external files
-source("<myUtilityFunctionsScript.R>")
-
-myFunction <- function(a, b=1) {
-	# Purpose:
-	#     Describe ...
-	# Parameters:
-	#     a: ...
-	#     b: ...
-	# Value:
-	#     result: ...
-
-	# code ...
-
-	return(result)
-}
-
-
-
-# ====  PROCESS  ===============================================================
-# Enter the step-by-step process of your project here. Strive to write your
-# code so that you can simply run this entire file and re-create all
-# intermediate results.
-
-
-
-
-
-
-# ====  TESTS  =================================================================
-# Enter your function tests here...
-
-
-# [END]
+# scriptTemplate.R
+#
+# Purpose:
+# Version:
+# Date:
+# Author:
+#
+# Input:
+# Output:
+# Dependencies:
+#
+# ToDo:
+# Notes:
+#
+# ==============================================================================
+
+setwd("<your/project/directory>")
+
+# ====  PARAMETERS  ============================================================
+# Define and explain all parameters. No "magic numbers" in your code below.
+
+
+
+# ====  PACKAGES  ==============================================================
+# Check that required packages have been installed. Install if needed.
+
+if (! requireNamespace("seqinr", quietly=TRUE)) {
+  install.packages("seqinr")
+}
+# Package information:
+#  library(help = seqinr)       # basic information
+#  browseVignettes("seqinr")    # available vignettes
+#  data(package = "seqinr")     # available datasets
+
+# Note: use package functions with the :: operator - eg.
+# seqinr::aaa("K")
+
+
+
+# ====  FUNCTIONS  =============================================================
+
+# Define functions or source external files
+source("<myUtilityFunctionsScript.R>")
+
+myFunction <- function(a, b=1) {
+	# Purpose:
+	#     Describe ...
+	# Parameters:
+	#     a: ...
+	#     b: ...
+	# Value:
+	#     result: ...
+
+	# code ...
+
+	return(result)
+}
+
+
+
+# ====  PROCESS  ===============================================================
+# Enter the step-by-step process of your project here. Strive to write your
+# code so that you can simply run this entire file and re-create all
+# intermediate results.
+
+
+
+
+
+
+# ====  TESTS  =================================================================
+# Enter your function tests here...
+
+
+# [END]
--- a/scripts/ABC-createRefDB.R
+++ b/scripts/ABC-createRefDB.R
@ -1,30 +1,30 @@
-# ABC-createRefDB.R
-#
-# Create a reference protein database for Mbp1-like proteins
-#
-# Boris Steipe for ABC learning units
-#
-# For the species, see:
-# http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
-#
-# For the data model, see
-# https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0
-# For the schema, see dbInit() in ./scripts/ABC-dbUtilities.R
-#
-# ==============================================================================
-
-
-myDB <- dbInit()
-
-myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/MBP1_SACCE.json"))
-myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refMBP1Proteins.json"))
-myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refAPSES_PSI-BLAST.json"))
-
-myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./data/refTaxonomy.json"))
-
-myDB <- dbAddFeature(myDB, jsonlite::fromJSON("./data/refFeatures.json"))
-
-myDB <- dbAddAnnotation( myDB, jsonlite::fromJSON("./data/refAnnotations.json"))
-
-
-# [END]
+# ABC-createRefDB.R
+#
+# Create a reference protein database for Mbp1-like proteins
+#
+# Boris Steipe for ABC learning units
+#
+# For the species, see:
+# http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
+#
+# For the data model, see
+# https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0
+# For the schema, see dbInit() in ./scripts/ABC-dbUtilities.R
+#
+# ==============================================================================
+
+
+myDB <- dbInit()
+
+myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/MBP1_SACCE.json"))
+myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refMBP1Proteins.json"))
+myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refAPSES_PSI-BLAST.json"))
+
+myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./data/refTaxonomy.json"))
+
+myDB <- dbAddFeature(myDB, jsonlite::fromJSON("./data/refFeatures.json"))
+
+myDB <- dbAddAnnotation( myDB, jsonlite::fromJSON("./data/refAnnotations.json"))
+
+
+# [END]
--- a/scripts/ABC-dbUtilities.R
+++ b/scripts/ABC-dbUtilities.R
--- a/scripts/ABC-makeMYSPElist.R
+++ b/scripts/ABC-makeMYSPElist.R
@ -1,443 +1,443 @@
-# tocID <- "scripts/ABC-makeMYSPElist.R"
-#
-# Purpose:  Create a list of genome sequenced fungi with protein annotations and
-#               Mbp1 homologues.
-#
-# Version: 1.4
-#
-# Date:    2016  09  -  2021  09
-# Author:  Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions
-#          1.4    New retrieval logic
-#          1.3    Rewrite to change datasource. NCBI has not been updated
-#                   since 2012. Use ensembl fungi as initial source.
-#          1.2    Change from require() to requireNamespace()
-#          1.1.2  Moved BLAST.R to ./scripts directory
-#          1.1    Update 2017
-#          1.0    First code 2016
-#
-# TODO:
-#
-# ==============================================================================
-#
-# DO NOT  source()  THIS FILE!
-#
-# This file is code I provide for your deeper understanding of a process and
-# to provide you with useful sample code. It is not actually necessary for
-# you to run this code, but I encourage you to read it carefully and discuss
-# if there are parts you don't understand.
-#
-# Run the commands that interact with the NCBI servers only if you want to
-# experiment specifically with the code and/or parameters. I have commented out
-# those parts. If you only want to study the general workflow, just load()
-# the respective intermediate results.
-#
-
-
-#TOC> ==========================================================================
-#TOC> 
-#TOC>   Section  Title                                    Line
-#TOC> --------------------------------------------------------
-#TOC>   1        The strategy                               55
-#TOC>   2        PACKAGES AND INITIALIZATIONS               67
-#TOC>   3        ENSEMBL FUNGI                              75
-#TOC>   3.1        Import                                   78
-#TOC>   4        BLAST SEARCH                              155
-#TOC>   4.1        find homologous proteins                161
-#TOC>   4.2        Identify species in "hits"              192
-#TOC>   5        MERGE ENSEMBL AND BLAST RESULTS           282
-#TOC>   6        STUDENT NUMBERS                           375
-#TOC> 
-#TOC> ==========================================================================
-
-
-# =    1  The strategy  ========================================================
-
-# This script will create a list of "MYSPE" species and save it in an R object
-# MYSPEspecies that is stored in the data subdirectory of this project from
-# where it can be loaded. The strategy is as follows: we download a list of
-# annotated fungal genomes from ensembl.fungi. All these are genome-sequenced
-# species that have been annotated.
-# Next we perform a BLAST search, to identify fungal species that have
-# genes that are homologous to yeast MBP1.
-#
-# ...
-
-# =    2  PACKAGES AND INITIALIZATIONS  ========================================
-
-# httr provides interfaces to Webservers on the Internet
-if (! requireNamespace("httr", quietly = TRUE)) {
-  install.packages("httr")
-}
-
-
-# =    3  ENSEMBL FUNGI  =======================================================
-
-
-# ==   3.1  Import  ============================================================
-
-# Navigate to https://fungi.ensembl.org and click on the link to the full
-# list of all species: https://fungi.ensembl.org/species.html
-# On the page, click on the spreadsheet symbol top right and choose
-# "download whole table". The file will be named  "Species.csv", in your
-# usual downloads folder. Move it to the data folder, and read it.
-
-sDat <- read.csv("./data/Species.csv")
-str(sDat)
-
-# The most obvious way to partition these is according to Classification ...
-# (poking around a bit in the UniProt taxonomy database shows that the
-#  classification used here is the taxonomic rank of "order").
-# how many classifications do we have?
-length(unique(sDat$Classification))  # 66
-
-# To have a good set for the class, we should have about 100.
-# Let's see for which of these we can find Mbp1 homologues.
-# First, we'll keep only the colums for name, classification, and taxID, and
-# drop the rest ...
-sDat <- sDat[ , c("Name", "Classification", "Taxon.ID")]
-colnames(sDat) <- c("name", "order", "taxID")
-
-# Next, we make an extra column: genus - the first part of the binomial name.
-# We'll use the gsub() function, and for that we need a "regular expression"
-# that matches to all characters from the first blank to the end of the string:
-myPatt <- "\\s.*$"  # one whitespace (\\s) ...
-                    # followed by any character (.) 0..n times (*) ...
-                    # until the end of the string
-
-# using gsub() we substitue all matching characters with the empty string "" -
-# this deletes the matching characters
-# Test this:
-gsub(myPatt, "", "Genus")                      # one word: unchanged
-gsub(myPatt, "", "gEnus species")              # two words: return only first
-gsub(myPatt, "", "geNus species strain 123")   # many words: return only first
-
-# apply this to the "name" column and add the result as a separate column
-# called "genus"
-sDat$genus <- gsub(myPatt, "", sDat$name)
-
-# what do we get?
-c(head(unique(sDat$genus)),
-  tail(unique(sDat$genus)))  # inspect the first and last few. Note that there
-                             # is a problem that we have to keep in mind.
-                             # (Always inspect your results!)
-# Drop all rows for which the genus contains special chracters -
-# like "[Candida]"
-sDat <- sDat[ ! grepl("[^a-zA-Z]", sDat$genus) , ]
-
-length(table(sDat$genus))    # how many genus?
-hist(table(sDat$genus), col = "#E9F4FF")      # Distribution ...
-                                              # most genus have very few, but
-                                              # some have very many species.
-sort(table(sDat$genus), decreasing = TRUE)[1:10]  # Top ten...
-
-# We should have at least one species from each taxonomic order, but we can
-# add a few genus until we have about 100 validated species.
-
-# Let's add a column for species, by changing our regular expression a bit,
-# using ^ (start of string), \\S (NOT a whitespace),
-# and + (one or more matches), capturing the match (...), and returning
-# it as the substitution (\\1) ...
-
-myPatt <- "^(\\S+\\s\\S+)\\s.*$"
-sDat$species <- gsub(myPatt, "\\1", sDat$name)
-
-# And we reorder the columns, just for aesthetics:
-sDat <- sDat[ , c("name", "species", "genus", "order", "taxID")]
-
-# Final check:
-any(grepl("[^a-zA-Z -]", sDat$species)) # FALSE means no special characters
-
-#
-# Now we check which of these have Mbp1 homologues ...
-
-# =    4  BLAST SEARCH  ========================================================
-
-
-# We run a BLAST search to find all proteins related to yeast Mbp1 in any
-# fungus. With the results, we'll annotate our sDat table.
-
-# ==   4.1  find homologous proteins  ==========================================
-#
-# Use BLAST to fetch proteins related to Mbp1 and identify the species that
-# contain them.
-
-# Scripting against NCBI APIs is not exactly enjoyable - there is usually a fair
-# amount of error handling involved that is not supported by the API in a
-# principled way but requires rather ad hoc solutions. The code I threw together
-# to make a BLAST interface (demo-quality, not research-quality) is in the file
-# ./scripts/BLAST.R Feel encouraged to study how this works. It's a pretty
-# standard task of communicating with servers and parsing responses - everyday
-# fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST
-# parser in currently available packages.
-#
-# DON'T use this for BLAST searches unless you have read the NCBI policy
-# for automated tasks. If you indicriminately pound on the NCBI's BLAST
-# server, they will blacklist your IP-address. See:
-# https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
-#
-# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
-# BLASThits <- BLAST("NP_010227",                  # Yeast Mbp1 RefSeq ID
-#                    db = "refseq_protein",        # database to search in
-#                    nHits = 3000,                 # 945 hits in 2020
-#                    E = 0.01,                     #
-#                    limits = "txid4751[ORGN]")    # = fungi
-# saveRDS(BLASThits, file="data/BLASThits.rds")
-#
-# NO NEED TO ACTUALLY RUN THIS:you can load the results from the data directory
-#
-BLASThits <- readRDS(file = "data/BLASThits.rds")
-
-# ==   4.2  Identify species in "hits"  ========================================
-
-# This is a very big list that can't be usefully analyzed manually. Here
-# we are only interested in the species names that it contains.
-
-# How many hits in the list?
-length(BLASThits$hits)      # 1,134
-
-# Let's look at a hit somewhere down the list
-str(BLASThits$hit[[277]])
-
-# A fair amount of parsing has gone into the BLAST.R code to prepare the results
-# in a useful way. The species information is in the $species element of every
-# hit.
-
-# Run a loop to extract all the species names into a vector. We subset ...
-# Blasthits$hits                 ... the list of hits, from which we choose ...
-# Blasthits$hits[[i]]            ... the i-th hit, and get ...
-# Blasthits$hits[[i]]$species    ... the species element from that.
-# Subsetting FTW.
-
-BLASTspecies <- character()
-for (i in seq_along(BLASThits$hits)) {
-    BLASTspecies[i] <- BLASThits$hits[[i]]$species
-}
-
-# You can confirm that BLASTspecies has the expected size.
-length(BLASTspecies)
-
-# if we delete some of these later on, we still want to remember which hit
-# they came from. Thus we name() the elements with their index, which is the
-# same as the index of the hit in BLASThits
-names(BLASTspecies) <- 1:length(BLASTspecies)
-
-
-# let's plot the distribution of E-values
-eVals <- numeric()
-for (i in seq_along(BLASThits$hits)) {
-  eVals[i] <- BLASThits$hits[[i]]$E
-}
-range(eVals)
-sum(eVals == 0)
-
-# let's plot the log of all values > 0 to see how they are distributed
-# plotting only one vectyor of numbers plots their index as x, and
-# their value as y ...
-plot(log(eVals[eVals > 0]), col = "#CC0000")
-
-# This is very informative: I would suspect that the first ten or so are
-# virtually identical to the yeast protein, then we have about 800 hits with
-# decreasing similarity, and then about 200 more that may actually be false
-# positives. Also - we plotted them by index, that means the table is SORTED:
-# Lower E-values strictly come before higher E-values.
-
-# Again, some species appear more than once, e.g. ...
-sum(BLASTspecies == "Saccharomyces cerevisiae")
-
-# ... corresponding to the five homologous gene sequences (paralogues) of yeast.
-
-# Therefore we remove duplicates. Removing duplicates will leave the FIRST
-# in a list alone, and only remove the SUBSEQUENT ones. Which means, from each
-# species, we will retain only the protein that has the highest similarity
-# to yeast Mbp1, not any of its more distant paralogues.
-sel <- ! duplicated(BLASTspecies)
-BLASTspecies <- BLASTspecies[sel]
-
-length(BLASTspecies)
-# i.e. we got rid of about two thirds of the hits.
-tail(BLASTspecies)  # see how the names are useful!
-                    # again - there are some special characters ...
-                    # what are they?
-BLASTspecies[grep("[^a-zA-Z ]", BLASTspecies)]
-
-# remove the brackets ...
-BLASTspecies <- gsub("\\[|\\]", "", BLASTspecies)
-# drop any new duplicates ...
-BLASTspecies <- BLASTspecies[ ! duplicated(BLASTspecies)]
-
-# check the number again:
-length(BLASTspecies)
-# Think a bit about this: what may be the biological reason to find that
-# on average, in 388 fungi across the entire phylogenetic tree, we have
-# three sequences that are homologous to yeast Mbp1?
-
-# Let's look at the distribution of E-values in this selection (Subsetting FTW):
-# we plot all values that are TRUE in the vector "sel" that we created above,
-# AND greater than 0
-plot(log(eVals[sel & eVals > 0]), col = "#00CC00")
-
-
-# =    5  MERGE ENSEMBL AND BLAST RESULTS  =====================================
-
-# Next we add the blast result to our sDat dataframe. We'll store the index,
-# the E-value, and the Query-bounds from which we can estimate which domains
-# of Mbp1 are actually covered by the hit. (True orthologues MUST align with
-# Mbp1's N-terminal APSES domain.)
-#
-# First we pull the hits we wanted from the BLASTspecies:
-iHits <- as.numeric(names(BLASTspecies))
-length(iHits)     # one index for each TRUE in sel
-
-# add columns to sDat
-l <- nrow(sDat)
-sDat$iHit   <- numeric(l)  # index of the hit in the BLAST results
-sDat$eVal   <- numeric(l)  # E-value of the hit
-sDat$lAli   <- numeric(l)  # length of the aligned region
-
-# extract and merge
-for (iHit in iHits) {
-  thisSp <- BLASThits$hits[[iHit]]$species
-  sel <- sDat$species == thisSp
-
-  sDat$iHit[sel]   <- iHit
-  sDat$eVal[sel]   <- BLASThits$hits[[iHit]]$E
-  sDat$lAli[sel]   <- BLASThits$hits[[iHit]]$lengthAli
-}
-
-# Are all reference species accounted for?
-selA <- sDat$iHit != 0                 # all rows which matched to a BLAST hit
-REFspecies %in% sDat$species[selA]     # yes, all there
-
-selB <- sDat$species %in% REFspecies   # all rows which have one of REF species
-
-sum(selA & selB)   # How many rows?
-
-# sDat of course includes all duplicates. Some may be multiply sequenced, some
-# may be different strains. We'll use the same strategy as before and keep
-# only the best hit: order the rows by E-value, then drop all rows which
-# are duplicated.
-
-
-# drop all rows without BLAST hits ...
-sDat <- sDat[ ! (sDat$iHit == 0) , ]
-
-# order sDat by E-value ...
-sDat <- sDat[order(sDat$eVal, decreasing = FALSE) , ]
-
-# drop all rows with duplicated species ...
-sDat <- sDat[ ! duplicated(sDat$species) , ]
-
-# Lets look at the E-values ...
-plot(log(sDat$eVal[sDat$eVal > 0]), col = "#00CC00")
-
-# and alignment lengths ...
-plot(sDat$lAli, col = "#00DDAA")
-
-# How many ...
-length(unique(sDat$name))
-length(unique(sDat$species))
-length(unique(sDat$genus))
-length(unique(sDat$order))
-
-# I need an extra species for admin purposes later on ...
-sel <- grep("Sporothrix schenckii", sDat$species)
-SPOSCdat <- sDat[sel, ]
-sDat <- sDat[-sel, ]
-
-# To get the final dataset, we remove the reference species with their
-# entire orders ...
-REForders <- unique(sDat$order[sDat$species %in% REFspecies])
-sel <- sDat$order %in% REForders
-REFdat <- sDat[sel , ]
-sDat   <- sDat[ ! sel , ]
-
-# REFdat should now contain only the REFspecies ...
-( REFdat <- REFdat[REFdat$species %in% REFspecies , ] )
-
-# ... but all of them
-sum(REFspecies %in% REFdat$species)
-
-# ... and we have enough left in sDat to prune sDat to unique genus
-sDat <- sDat[ ! duplicated(sDat$genus) , ]
-nrow(sDat)   # 84
-
-# I add back "Sporothrix schenckii" ...
-sDat <- rbind(SPOSCdat, sDat)
-
-# ... and save for future use.
-# saveRDS(sDat, file = "data/sDat.rds")
-# saveRDS(REFdat, file = "data/REFdat.rds")
-
-
-
-# =    6  STUDENT NUMBERS  =====================================================
-#
-# An asymmetric function to retrieve a MYSPE species
-#
-sDat <- readRDS(file = "data/sDat.rds")
-
-students <- read.csv("../BCH441-2021-students.csv")
-sN <- students$Integration.ID
-sN <- sN[! is.na(sN)]
-sN <- as.character(sN)
-sN <- c("1003141593", sN)  # will map to  "Sporothrix schenckii"
-
-set.seed(112358)
-theseSpecies <- sDat[sample(1:nrow(sDat)), ]
-all(sort(theseSpecies$name) == sort(sDat$name))
-nrow((theseSpecies))
-(iX <- grep("Sporothrix schenckii", theseSpecies$name))
-theseSpecies <- rbind(theseSpecies[iX, ], theseSpecies[-iX, ])
-rndMin <-  992000000
-rndMax <- 1020000000
-N <- 10000
-keys <- as.character(sample(rndMin:rndMax, N + 1000))
-keys <- keys[! (keys %in% sN)]
-keys <- keys[1:N]
-keys[1:length(sN)] <- sN
-
-nRep <- floor(N/nrow(theseSpecies))
-MYSPEdat <- theseSpecies
-for(i in 1:nRep) {
-  MYSPEdat <- rbind(MYSPEdat, theseSpecies)
-}
-MYSPEdat <- MYSPEdat[1:N, ]
-for (i in 1:N) {
-  rownames(MYSPEdat)[i] <- digest::digest(keys[i], algo = "md5")
-}
-set.seed(NULL)
-MYSPEdat <- MYSPEdat[sample(1:N), ]
-
-# saveRDS(MYSPEdat, file = "data/MYSPEdat.rds")
-
-# === validate
-x <- character()
-for (n in sN) {
-  sp <- getMYSPE(n)
-  if (length(sp) != 1) {
-    stop(print(as.character(n)))
-  } else {
-    x <- c(x, sp)
-  }
-}
-
-# === species for late-comers
-y <- unique(MYSPEdat$species)
-print(y[!(y %in% x)])
-
-
-# === validate
-l <- length(sN)
-sp <- character(l)
-for(i in 1:l) {
-  sp[i] <- getMYSPE(sN[i])
-}
-any(duplicated(sp))
-length(unique(sp))
-which(! sDat$species %in% sp)  # these can be assigned to late-comers
-
-# Done.
-
-# [END]
+# tocID <- "scripts/ABC-makeMYSPElist.R"
+#
+# Purpose:  Create a list of genome sequenced fungi with protein annotations and
+#               Mbp1 homologues.
+#
+# Version: 1.4
+#
+# Date:    2016  09  -  2021  09
+# Author:  Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions
+#          1.4    New retrieval logic
+#          1.3    Rewrite to change datasource. NCBI has not been updated
+#                   since 2012. Use ensembl fungi as initial source.
+#          1.2    Change from require() to requireNamespace()
+#          1.1.2  Moved BLAST.R to ./scripts directory
+#          1.1    Update 2017
+#          1.0    First code 2016
+#
+# TODO:
+#
+# ==============================================================================
+#
+# DO NOT  source()  THIS FILE!
+#
+# This file is code I provide for your deeper understanding of a process and
+# to provide you with useful sample code. It is not actually necessary for
+# you to run this code, but I encourage you to read it carefully and discuss
+# if there are parts you don't understand.
+#
+# Run the commands that interact with the NCBI servers only if you want to
+# experiment specifically with the code and/or parameters. I have commented out
+# those parts. If you only want to study the general workflow, just load()
+# the respective intermediate results.
+#
+
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                                    Line
+#TOC> --------------------------------------------------------
+#TOC>   1        The strategy                               55
+#TOC>   2        PACKAGES AND INITIALIZATIONS               67
+#TOC>   3        ENSEMBL FUNGI                              75
+#TOC>   3.1        Import                                   78
+#TOC>   4        BLAST SEARCH                              155
+#TOC>   4.1        find homologous proteins                161
+#TOC>   4.2        Identify species in "hits"              192
+#TOC>   5        MERGE ENSEMBL AND BLAST RESULTS           282
+#TOC>   6        STUDENT NUMBERS                           375
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  The strategy  ========================================================
+
+# This script will create a list of "MYSPE" species and save it in an R object
+# MYSPEspecies that is stored in the data subdirectory of this project from
+# where it can be loaded. The strategy is as follows: we download a list of
+# annotated fungal genomes from ensembl.fungi. All these are genome-sequenced
+# species that have been annotated.
+# Next we perform a BLAST search, to identify fungal species that have
+# genes that are homologous to yeast MBP1.
+#
+# ...
+
+# =    2  PACKAGES AND INITIALIZATIONS  ========================================
+
+# httr provides interfaces to Webservers on the Internet
+if (! requireNamespace("httr", quietly = TRUE)) {
+  install.packages("httr")
+}
+
+
+# =    3  ENSEMBL FUNGI  =======================================================
+
+
+# ==   3.1  Import  ============================================================
+
+# Navigate to https://fungi.ensembl.org and click on the link to the full
+# list of all species: https://fungi.ensembl.org/species.html
+# On the page, click on the spreadsheet symbol top right and choose
+# "download whole table". The file will be named  "Species.csv", in your
+# usual downloads folder. Move it to the data folder, and read it.
+
+sDat <- read.csv("./data/Species.csv")
+str(sDat)
+
+# The most obvious way to partition these is according to Classification ...
+# (poking around a bit in the UniProt taxonomy database shows that the
+#  classification used here is the taxonomic rank of "order").
+# how many classifications do we have?
+length(unique(sDat$Classification))  # 66
+
+# To have a good set for the class, we should have about 100.
+# Let's see for which of these we can find Mbp1 homologues.
+# First, we'll keep only the columns for name, classification, and taxID, and
+# drop the rest ...
+sDat <- sDat[ , c("Name", "Classification", "Taxon.ID")]
+colnames(sDat) <- c("name", "order", "taxID")
+
+# Next, we make an extra column: genus - the first part of the binomial name.
+# We'll use the gsub() function, and for that we need a "regular expression"
+# that matches to all characters from the first blank to the end of the string:
+myPatt <- "\\s.*$"  # one whitespace (\\s) ...
+                    # followed by any character (.) 0..n times (*) ...
+                    # until the end of the string
+
+# using gsub() we substitute all matching characters with the empty string "" -
+# this deletes the matching characters
+# Test this:
+gsub(myPatt, "", "Genus")                      # one word: unchanged
+gsub(myPatt, "", "gEnus species")              # two words: return only first
+gsub(myPatt, "", "geNus species strain 123")   # many words: return only first
+
+# apply this to the "name" column and add the result as a separate column
+# called "genus"
+sDat$genus <- gsub(myPatt, "", sDat$name)
+
+# what do we get?
+c(head(unique(sDat$genus)),
+  tail(unique(sDat$genus)))  # inspect the first and last few. Note that there
+                             # is a problem that we have to keep in mind.
+                             # (Always inspect your results!)
+# Drop all rows for which the genus contains special characters -
+# like "[Candida]"
+sDat <- sDat[ ! grepl("[^a-zA-Z]", sDat$genus) , ]
+
+length(table(sDat$genus))    # how many genus?
+hist(table(sDat$genus), col = "#E9F4FF")      # Distribution ...
+                                              # most genus have very few, but
+                                              # some have very many species.
+sort(table(sDat$genus), decreasing = TRUE)[1:10]  # Top ten...
+
+# We should have at least one species from each taxonomic order, but we can
+# add a few genus until we have about 100 validated species.
+
+# Let's add a column for species, by changing our regular expression a bit,
+# using ^ (start of string), \\S (NOT a whitespace),
+# and + (one or more matches), capturing the match (...), and returning
+# it as the substitution (\\1) ...
+
+myPatt <- "^(\\S+\\s\\S+)\\s.*$"
+sDat$species <- gsub(myPatt, "\\1", sDat$name)
+
+# And we reorder the columns, just for aesthetics:
+sDat <- sDat[ , c("name", "species", "genus", "order", "taxID")]
+
+# Final check:
+any(grepl("[^a-zA-Z -]", sDat$species)) # FALSE means no special characters
+
+#
+# Now we check which of these have Mbp1 homologues ...
+
+# =    4  BLAST SEARCH  ========================================================
+
+
+# We run a BLAST search to find all proteins related to yeast Mbp1 in any
+# fungus. With the results, we'll annotate our sDat table.
+
+# ==   4.1  find homologous proteins  ==========================================
+#
+# Use BLAST to fetch proteins related to Mbp1 and identify the species that
+# contain them.
+
+# Scripting against NCBI APIs is not exactly enjoyable - there is usually a fair
+# amount of error handling involved that is not supported by the API in a
+# principled way but requires rather ad hoc solutions. The code I threw together
+# to make a BLAST interface (demo-quality, not research-quality) is in the file
+# ./scripts/BLAST.R Feel encouraged to study how this works. It's a pretty
+# standard task of communicating with servers and parsing responses - everyday
+# fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST
+# parser in currently available packages.
+#
+# DON'T use this for BLAST searches unless you have read the NCBI policy
+# for automated tasks. If you indiscriminately pound on the NCBI's BLAST
+# server, they will blacklist your IP-address. See:
+# https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
+#
+# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
+# BLASThits <- BLAST("NP_010227",                  # Yeast Mbp1 RefSeq ID
+#                    db = "refseq_protein",        # database to search in
+#                    nHits = 3000,                 # 945 hits in 2020
+#                    E = 0.01,                     #
+#                    limits = "txid4751[ORGN]")    # = fungi
+# saveRDS(BLASThits, file="data/BLASThits.rds")
+#
+# NO NEED TO ACTUALLY RUN THIS: you can load the results from the data directory
+#
+BLASThits <- readRDS(file = "data/BLASThits.rds")
+
+# ==   4.2  Identify species in "hits"  ========================================
+
+# This is a very big list that can't be usefully analyzed manually. Here
+# we are only interested in the species names that it contains.
+
+# How many hits in the list?
+length(BLASThits$hits)      # 1,134
+
+# Let's look at a hit somewhere down the list. Use the full element name
+# "hits": the abbreviated "hit" only works through partial matching of $.
+str(BLASThits$hits[[277]])
+
+# A fair amount of parsing has gone into the BLAST.R code to prepare the results
+# in a useful way. The species information is in the $species element of every
+# hit.
+
+# Run a loop to extract all the species names into a vector. We subset ...
+# BLASThits$hits                 ... the list of hits, from which we choose ...
+# BLASThits$hits[[i]]            ... the i-th hit, and get ...
+# BLASThits$hits[[i]]$species    ... the species element from that.
+# Subsetting FTW.
+
+BLASTspecies <- character()
+for (i in seq_along(BLASThits$hits)) {
+    BLASTspecies[i] <- BLASThits$hits[[i]]$species
+}
+
+# You can confirm that BLASTspecies has the expected size.
+length(BLASTspecies)
+
+# if we delete some of these later on, we still want to remember which hit
+# they came from. Thus we name() the elements with their index, which is the
+# same as the index of the hit in BLASThits
+names(BLASTspecies) <- 1:length(BLASTspecies)
+
+
+# let's plot the distribution of E-values
+eVals <- numeric()
+for (i in seq_along(BLASThits$hits)) {
+  eVals[i] <- BLASThits$hits[[i]]$E
+}
+range(eVals)
+sum(eVals == 0)
+
+# let's plot the log of all values > 0 to see how they are distributed.
+# Plotting only one vector of numbers plots their index as x, and
+# their value as y ...
+plot(log(eVals[eVals > 0]), col = "#CC0000")
+
+# This is very informative: I would suspect that the first ten or so are
+# virtually identical to the yeast protein, then we have about 800 hits with
+# decreasing similarity, and then about 200 more that may actually be false
+# positives. Also - we plotted them by index, that means the table is SORTED:
+# Lower E-values strictly come before higher E-values.
+
+# Again, some species appear more than once, e.g. ...
+sum(BLASTspecies == "Saccharomyces cerevisiae")
+
+# ... corresponding to the five homologous gene sequences (paralogues) of yeast.
+
+# Therefore we remove duplicates. Removing duplicates will leave the FIRST
+# in a list alone, and only remove the SUBSEQUENT ones. Which means, from each
+# species, we will retain only the protein that has the highest similarity
+# to yeast Mbp1, not any of its more distant paralogues.
+sel <- ! duplicated(BLASTspecies)
+BLASTspecies <- BLASTspecies[sel]
+
+length(BLASTspecies)
+# i.e. we got rid of about two thirds of the hits.
+tail(BLASTspecies)  # see how the names are useful!
+                    # again - there are some special characters ...
+                    # what are they?
+BLASTspecies[grep("[^a-zA-Z ]", BLASTspecies)]
+
+# remove the brackets ...
+BLASTspecies <- gsub("\\[|\\]", "", BLASTspecies)
+# drop any new duplicates ...
+BLASTspecies <- BLASTspecies[ ! duplicated(BLASTspecies)]
+
+# check the number again:
+length(BLASTspecies)
+# Think a bit about this: what may be the biological reason to find that
+# on average, in 388 fungi across the entire phylogenetic tree, we have
+# three sequences that are homologous to yeast Mbp1?
+
+# Let's look at the distribution of E-values in this selection (Subsetting FTW):
+# we plot all values that are TRUE in the vector "sel" that we created above,
+# AND greater than 0
+plot(log(eVals[sel & eVals > 0]), col = "#00CC00")
+
+
+# =    5  MERGE ENSEMBL AND BLAST RESULTS  =====================================
+
+# Next we add the blast result to our sDat dataframe. We'll store the index,
+# the E-value, and the Query-bounds from which we can estimate which domains
+# of Mbp1 are actually covered by the hit. (True orthologues MUST align with
+# Mbp1's N-terminal APSES domain.)
+#
+# First we pull the hits we wanted from the BLASTspecies:
+iHits <- as.numeric(names(BLASTspecies))
+length(iHits)     # one index for each TRUE in sel
+
+# add columns to sDat
+l <- nrow(sDat)
+sDat$iHit   <- numeric(l)  # index of the hit in the BLAST results
+sDat$eVal   <- numeric(l)  # E-value of the hit
+sDat$lAli   <- numeric(l)  # length of the aligned region
+
+# extract and merge
+for (iHit in iHits) {
+  thisSp <- BLASThits$hits[[iHit]]$species
+  sel <- sDat$species == thisSp
+
+  sDat$iHit[sel]   <- iHit
+  sDat$eVal[sel]   <- BLASThits$hits[[iHit]]$E
+  sDat$lAli[sel]   <- BLASThits$hits[[iHit]]$lengthAli
+}
+
+# Are all reference species accounted for?
+selA <- sDat$iHit != 0                 # all rows which matched to a BLAST hit
+REFspecies %in% sDat$species[selA]     # yes, all there
+
+selB <- sDat$species %in% REFspecies   # all rows which have one of REF species
+
+sum(selA & selB)   # How many rows?
+
+# sDat of course includes all duplicates. Some may be multiply sequenced, some
+# may be different strains. We'll use the same strategy as before and keep
+# only the best hit: order the rows by E-value, then drop all rows which
+# are duplicated.
+
+
+# drop all rows without BLAST hits ...
+sDat <- sDat[ ! (sDat$iHit == 0) , ]
+
+# order sDat by E-value ...
+sDat <- sDat[order(sDat$eVal, decreasing = FALSE) , ]
+
+# drop all rows with duplicated species ...
+sDat <- sDat[ ! duplicated(sDat$species) , ]
+
+# Let's look at the E-values ...
+plot(log(sDat$eVal[sDat$eVal > 0]), col = "#00CC00")
+
+# and alignment lengths ...
+plot(sDat$lAli, col = "#00DDAA")
+
+# How many ...
+length(unique(sDat$name))
+length(unique(sDat$species))
+length(unique(sDat$genus))
+length(unique(sDat$order))
+
+# I need an extra species for admin purposes later on ...
+sel <- grep("Sporothrix schenckii", sDat$species)
+SPOSCdat <- sDat[sel, ]
+sDat <- sDat[-sel, ]
+
+# To get the final dataset, we remove the reference species with their
+# entire orders ...
+REForders <- unique(sDat$order[sDat$species %in% REFspecies])
+sel <- sDat$order %in% REForders
+REFdat <- sDat[sel , ]
+sDat   <- sDat[ ! sel , ]
+
+# REFdat should now contain only the REFspecies ...
+( REFdat <- REFdat[REFdat$species %in% REFspecies , ] )
+
+# ... but all of them
+sum(REFspecies %in% REFdat$species)
+
+# ... and we have enough left in sDat to prune it to one row per genus
+sDat <- sDat[ ! duplicated(sDat$genus) , ]
+nrow(sDat)   # 84
+
+# I add back "Sporothrix schenckii" ...
+sDat <- rbind(SPOSCdat, sDat)
+
+# ... and save for future use.
+# saveRDS(sDat, file = "data/sDat.rds")
+# saveRDS(REFdat, file = "data/REFdat.rds")
+
+
+
+# =    6  STUDENT NUMBERS  =====================================================
+#
+# An asymmetric function to retrieve a MYSPE species
+#
+sDat <- readRDS(file = "data/sDat.rds")
+
+students <- read.csv("../BCH441-2021-students.csv")
+sN <- students$Integration.ID
+sN <- sN[! is.na(sN)]
+sN <- as.character(sN)
+sN <- c("1003141593", sN)  # will map to  "Sporothrix schenckii"
+
+set.seed(112358)
+theseSpecies <- sDat[sample(1:nrow(sDat)), ]
+all(sort(theseSpecies$name) == sort(sDat$name))
+nrow((theseSpecies))
+(iX <- grep("Sporothrix schenckii", theseSpecies$name))
+theseSpecies <- rbind(theseSpecies[iX, ], theseSpecies[-iX, ])
+rndMin <-  992000000
+rndMax <- 1020000000
+N <- 10000
+keys <- as.character(sample(rndMin:rndMax, N + 1000))
+keys <- keys[! (keys %in% sN)]
+keys <- keys[1:N]
+keys[1:length(sN)] <- sN
+
+nRep <- floor(N/nrow(theseSpecies))
+MYSPEdat <- theseSpecies
+for(i in 1:nRep) {
+  MYSPEdat <- rbind(MYSPEdat, theseSpecies)
+}
+MYSPEdat <- MYSPEdat[1:N, ]
+for (i in 1:N) {
+  rownames(MYSPEdat)[i] <- digest::digest(keys[i], algo = "md5")
+}
+set.seed(NULL)
+MYSPEdat <- MYSPEdat[sample(1:N), ]
+
+# saveRDS(MYSPEdat, file = "data/MYSPEdat.rds")
+
+# === validate
+x <- character()
+for (n in sN) {
+  sp <- getMYSPE(n)
+  if (length(sp) != 1) {
+    stop(print(as.character(n)))
+  } else {
+    x <- c(x, sp)
+  }
+}
+
+# === species for late-comers
+y <- unique(MYSPEdat$species)
+print(y[!(y %in% x)])
+
+
+# === validate
+l <- length(sN)
+sp <- character(l)
+for(i in 1:l) {
+  sp[i] <- getMYSPE(sN[i])
+}
+any(duplicated(sp))
+length(unique(sp))
+which(! sDat$species %in% sp)  # these can be assigned to late-comers
+
+# Done.
+
+# [END]
--- a/scripts/ABC-makeSTRINGedges.R
+++ b/scripts/ABC-makeSTRINGedges.R
@ -1,168 +1,168 @@
-# tocID <- "scripts/ABC-makeSTRINGedges.R"
-#
-# Create a subnetwork of high-confidence human STRING edges.
-#
-# Notes:
-#
-#      The large source- datafile is NOT posted to github. If you want to
-#      experiment with the original data, download it and place it into your
-#      local  ./data  directory.
-#
-#      STRING data source:
-#        Download page:
-# https://string-db.org/cgi/download.pl?species_text=Homo+sapiens
-#        Data: (127.6 Mb)
-# https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz
-#
-# Version:  1.0
-#
-# Date:     2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           1.0    Rewrite
-#
-# TODO:
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC> 
-#TOC>   Section  Title                             Line
-#TOC> -------------------------------------------------
-#TOC>   1        Initialize                          44
-#TOC>   2        Read STRING Data                    51
-#TOC>   3        Define cutoff and subset            63
-#TOC>   4        Drop  duplicates                   103
-#TOC>   5        Simple statistics                  127
-#TOC>   6        Write to file                      160
-#TOC> 
-#TOC> ==========================================================================
-
-
-# =    1  Initialize  ==========================================================
-
-if (! requireNamespace("readr", quietly = TRUE)) {
-  install.packages("readr")
-}
-
-
-# =    2  Read STRING Data  ====================================================
-
-# Read STRING Data (needs to be downloaded from database, see URL in Notes)
-# The .gz compressed version is 127.6MB, the uncompressed version is probably
-# 848 Mb. Fortunately readr:: can read from compressed
-# files, and does so automatically, based on the file extension.
-( fn <- file.path("~", "9606.protein.links.full.v11.0.txt.gz") )
-STR <- readr::read_delim(fn, delim = " ")
-nrow(STR)  #  11,759,454 rows
-head(STR)
-
-
-# =    3  Define cutoff and subset  ============================================
-
-# approximate distribution of combined_score
-hist(sample(STR$combined_score, 10000), breaks = 50, col = "#6699FF")
-
-# Let's table the counts >= 850 and plot them for better resolution.
-
-myTb <- table(STR$combined_score[STR$combined_score >= 850])
-is.unsorted(as.integer(names(myTb)))  # Good - they are all in order
-
-plot(myTb, type = "b", cex = 0.5, col = "#BB0000")
-myTb[myTb == max(myTb)]  # Apparently there is an algorithmic effect that
-                         # frequently assigns a combined score of 0.900
-
-# Let's plot these counts as cumulative sums, in reverse order, scaled
-# as combined scores.
-myX <- 1 - (1:length(myTb)) / 1000   # x-values, decreasing
-plot(myX,
-     cumsum(myTb[length(myTb):1]),   # cumulative sum, decreasing
-     xlim = c(1.0, 0.85),            # reverse x-axis
-     type = "l",
-     main = "STRING interactions for 9606 (top 600,000)",
-     xlab = "combined_score",
-     ylab = "cumulative counts",
-     col = "#CC0000")
-abline(h = seq(50000, sum(myTb), by = 50000), lwd = 0.5, col = "#DDDDFF")
-
-# What's the cutoff for 100,000 edges?
-which(cumsum(myTb[length(myTb):1]) >= 100000)[1] # p = 0.964
-
-# confirm
-sum(STR$combined_score >= 964) # 101,348
-abline(v = 0.964, lwd = 0.5, col = "#DDDDFF")
-
-# subset the table, and use only the protein IDs and the combined_score
-STR <- STR[STR$combined_score >= 964,
-            c("protein1", "protein2", "combined_score")]
-colnames(STR) <- c("a", "b", "score")
-
-
-# =    4  Drop  duplicates  ====================================================
-
-# identify duplicate interactions by creating keys in a defined alphabetical
-# sort order, then checking for  duplicated().
-# e.g  if we have (X:U, U:X), we change U:X to X:U and now find that
-# (X:U, X:U) has a duplicate.
-
-AB <- STR$a < STR$b        # logical vector: genes we need to swap
-tmp <- STR$b               # copy column b
-STR$b[AB] <- STR$a[AB]     # copy a's into b
-STR$a[AB] <- tmp[AB]       # copy tmp's into a
-all(STR$a >= STR$b)        # confirm: TRUE
-
-# now, make combined keys, like this:
-paste0(STR$a[1:10], ":", STR$b[1:10])
-
-tmp <- paste0(STR$a, ":", STR$b)
-sum(duplicated(tmp)) # That's half of them ... i.e. STRING reports
-                     # both a:b and b:a !
-
-# drop all duplicated interactions from tmp
-STR <- STR[ ! duplicated(tmp), ]   # 50,674 interactions remain
-
-
-# =    5  Simple statistics  ===================================================
-
-# how many unique genes?
-length(unique(c(STR$a, STR$b)))   # 8,445
-
-# how many self-edges?
-sum(STR$a == STR$b)  # none
-
-# log(rank) / log(frequency)
-myTbl <- table(c(STR$a, STR$b))
-myTbl <- myTbl[order(myTbl, decreasing = TRUE)]
-
-hist(myTbl, breaks = 40, col = "#FFEEBB")
-
-# number of singletons
-sum(myTbl == 1) # almost a quarter
-
-# maximum?
-myTbl[which(myTbl == max(myTbl))]  # 9606.ENSP00000360532: 465
-                                   # Google: CDC5L
-
-# Zipf-plot
-plot(log(1:length(myTbl)), log(as.numeric(myTbl)),
-     type = "b", cex = 0.7,
-     main = "STRINGedges - degrees",
-     xlab = "log(rank)",
-     ylab = "log(frequency)",
-     col = "#FFBB88")
-
-sprintf("Average number of interactions: %5.2f",
-         nrow(STR) / length(unique(c(STR$a, STR$b))))
-
-
-# =    6  Write to file  =======================================================
-
-saveRDS(STR, file = "./data/STRINGedges.rds")
-
-# STRINGedges <- readRDS("./data/STRINGedges.rds")  # use this to restore the
-                                                    # object when needed
-
-
-# [END]
+# tocID <- "scripts/ABC-makeSTRINGedges.R"
+#
+# Create a subnetwork of high-confidence human STRING edges.
+#
+# Notes:
+#
+#      The large source- datafile is NOT posted to github. If you want to
+#      experiment with the original data, download it and place it into your
+#      local  ./data  directory.
+#
+#      STRING data source:
+#        Download page:
+# https://string-db.org/cgi/download.pl?species_text=Homo+sapiens
+#        Data: (127.6 Mb)
+# https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz
+#
+# Version:  1.0
+#
+# Date:     2020-09
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.0    Rewrite
+#
+# TODO:
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                             Line
+#TOC> -------------------------------------------------
+#TOC>   1        Initialize                          44
+#TOC>   2        Read STRING Data                    51
+#TOC>   3        Define cutoff and subset            63
+#TOC>   4        Drop  duplicates                   103
+#TOC>   5        Simple statistics                  127
+#TOC>   6        Write to file                      160
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  Initialize  ==========================================================
+
+if (! requireNamespace("readr", quietly = TRUE)) {
+  install.packages("readr")
+}
+
+
+# =    2  Read STRING Data  ====================================================
+
+# Read STRING Data (needs to be downloaded from database, see URL in Notes)
+# The .gz compressed version is 127.6MB, the uncompressed version is probably
+# 848 Mb. Fortunately readr:: can read from compressed
+# files, and does so automatically, based on the file extension.
+( fn <- file.path("~", "9606.protein.links.full.v11.0.txt.gz") )
+STR <- readr::read_delim(fn, delim = " ")
+nrow(STR)  #  11,759,454 rows
+head(STR)
+
+
+# =    3  Define cutoff and subset  ============================================
+
+# approximate distribution of combined_score
+hist(sample(STR$combined_score, 10000), breaks = 50, col = "#6699FF")
+
+# Let's table the counts >= 850 and plot them for better resolution.
+
+myTb <- table(STR$combined_score[STR$combined_score >= 850])
+is.unsorted(as.integer(names(myTb)))  # Good - they are all in order
+
+plot(myTb, type = "b", cex = 0.5, col = "#BB0000")
+myTb[myTb == max(myTb)]  # Apparently there is an algorithmic effect that
+                         # frequently assigns a combined score of 0.900
+
+# Let's plot these counts as cumulative sums, in reverse order, scaled
+# as combined scores.
+myX <- 1 - (1:length(myTb)) / 1000   # x-values, decreasing
+plot(myX,
+     cumsum(myTb[length(myTb):1]),   # cumulative sum, decreasing
+     xlim = c(1.0, 0.85),            # reverse x-axis
+     type = "l",
+     main = "STRING interactions for 9606 (top 600,000)",
+     xlab = "combined_score",
+     ylab = "cumulative counts",
+     col = "#CC0000")
+abline(h = seq(50000, sum(myTb), by = 50000), lwd = 0.5, col = "#DDDDFF")
+
+# What's the cutoff for 100,000 edges?
+which(cumsum(myTb[length(myTb):1]) >= 100000)[1] # p = 0.964
+
+# confirm
+sum(STR$combined_score >= 964) # 101,348
+abline(v = 0.964, lwd = 0.5, col = "#DDDDFF")
+
+# subset the table, and use only the protein IDs and the combined_score
+STR <- STR[STR$combined_score >= 964,
+            c("protein1", "protein2", "combined_score")]
+colnames(STR) <- c("a", "b", "score")
+
+
+# =    4  Drop  duplicates  ====================================================
+
+# identify duplicate interactions by creating keys in a defined alphabetical
+# sort order, then checking for  duplicated().
+# e.g  if we have (X:U, U:X), we change U:X to X:U and now find that
+# (X:U, X:U) has a duplicate.
+
+AB <- STR$a < STR$b        # logical vector: genes we need to swap
+tmp <- STR$b               # copy column b
+STR$b[AB] <- STR$a[AB]     # copy a's into b
+STR$a[AB] <- tmp[AB]       # copy tmp's into a
+all(STR$a >= STR$b)        # confirm: TRUE
+
+# now, make combined keys, like this:
+paste0(STR$a[1:10], ":", STR$b[1:10])
+
+tmp <- paste0(STR$a, ":", STR$b)
+sum(duplicated(tmp)) # That's half of them ... i.e. STRING reports
+                     # both a:b and b:a !
+
+# drop all duplicated interactions from tmp
+STR <- STR[ ! duplicated(tmp), ]   # 50,674 interactions remain
+
+
+# =    5  Simple statistics  ===================================================
+
+# how many unique genes?
+length(unique(c(STR$a, STR$b)))   # 8,445
+
+# how many self-edges?
+sum(STR$a == STR$b)  # none
+
+# log(rank) / log(frequency)
+myTbl <- table(c(STR$a, STR$b))
+myTbl <- myTbl[order(myTbl, decreasing = TRUE)]
+
+hist(myTbl, breaks = 40, col = "#FFEEBB")
+
+# number of singletons
+sum(myTbl == 1) # almost a quarter
+
+# maximum?
+myTbl[which(myTbl == max(myTbl))]  # 9606.ENSP00000360532: 465
+                                   # Google: CDC5L
+
+# Zipf-plot
+plot(log(1:length(myTbl)), log(as.numeric(myTbl)),
+     type = "b", cex = 0.7,
+     main = "STRINGedges - degrees",
+     xlab = "log(rank)",
+     ylab = "log(frequency)",
+     col = "#FFBB88")
+
+sprintf("Average number of interactions: %5.2f",
+         nrow(STR) / length(unique(c(STR$a, STR$b))))
+
+
+# =    6  Write to file  =======================================================
+
+saveRDS(STR, file = "./data/STRINGedges.rds")
+
+# STRINGedges <- readRDS("./data/STRINGedges.rds")  # use this to restore the
+                                                    # object when needed
+
+
+# [END]
--- a/scripts/ABC-makeScCCnet.R
+++ b/scripts/ABC-makeScCCnet.R
@ -1,167 +1,167 @@
-# tocID <- "scripts/ABC-makeScCCnet.R"
-#
-# Create a subnetwork of high-confidence yeast genes with a "mitotic cell cycle"
-# GOSlim annotation.
-#
-# Boris Steipe for ABC learning units
-#
-# Notes:
-#
-#      The large source- datafiles are NOT posted to github. If you want to
-#      experiment with your own code, download them and place them into your
-#      local  ./data  directory.
-#
-#      STRING data source:
-#        Download page:
-# https://string-db.org/cgi/download.pl?species_text=Saccharomyces+cerevisiae
-#        Data: (20.1 mb)
-# https://stringdb-static.org/download/protein.links.full.v11.0/4932.protein.links.full.v11.0.txt.gz
-#
-#      GOSlim data source: (Note: this has moved from GO to SGD)
-#        Info page: https://www.yeastgenome.org/downloads
-#        Info page: http://sgd-archive.yeastgenome.org/curation/literature/
-#        Data: (3 mb)
-# http://sgd-archive.yeastgenome.org/curation/literature/go_slim_mapping.tab
-#
-#
-# Version:  1.2
-#
-# Date:     2017-10  -  2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           1.2    2020 Update. GO Slim Yeast mow at SGD
-#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout
-#           1.0    First code copied from 2016 material.
-#
-# TODO:
-#
-# ==============================================================================
-# SRCDIR <- "./instructor"
-
-
-#TOC> ==========================================================================
-#TOC> 
-#TOC>   Section  Title                                           Line
-#TOC> ---------------------------------------------------------------
-#TOC>   1        INITIALIZE                                        58
-#TOC>   2        STRING FUNCTIONAL INTERACTION DATA                66
-#TOC>   3        GOSlim FUNCTIONAL ANNOTATIONS                     96
-#TOC>   3.1        Intersect interactions and annotations         122
-#TOC>   4        DEFINE THE CELL-CYCLE NETWORK                    128
-#TOC> 
-#TOC> ==========================================================================
-
-
-# =    1  INITIALIZE  ==========================================================
-
-SRCDIR <- "./data"
-if (! requireNamespace("readr", quietly = TRUE)) {
-  install.packages("readr")
-}
-
-
-# =    2  STRING FUNCTIONAL INTERACTION DATA  ==================================
-
-# Read STRING Data (needs to be downloaded from database, see URL in Notes)
-# The .gz compressed version is 20MB, the uncompressed versioj is 110MB -
-# really not necessary to uncompress since readr:: can read from compressed
-# files, and does so automatically, based on the file extension.
-( fn <- file.path(SRCDIR, "4932.protein.links.full.v11.0.txt.gz") )
-STR <- readr::read_delim(fn, delim = " ")
-
-# Subset only IDs and combined_score column
-STR <- STR[ , c("protein1", "protein2", "combined_score")]
-
-# head(STR)
-# sum(STR$combined_score > 909)  # 100270 edges
-# subset for 100,000 highest confidence edges
-STR <- STR[(STR$combined_score > 909), ]
-head(STR)
-
-# IDs are formatted like 4932.YAL005C ... drop the "4932." prefix
-STR$protein1 <- gsub("^4932\\.", "", STR$protein1)
-STR$protein2 <- gsub("^4932\\.", "", STR$protein2)
-head(STR)
-
-# get a vector of gene names in this list
-myIntxGenes <- unique(c(STR$protein1, STR$protein2))  # yeast systematic gene
-                                                      # names
-length(myIntxGenes)
-sample(myIntxGenes, 10)  # choose 10 at random (sanity check)
-
-
-# =    3  GOSlim FUNCTIONAL ANNOTATIONS  =======================================
-#
-# Read GOSlim data  (needs to be downloaded from database, see URL in Notes)
-( fn <- file.path(SRCDIR, "go_slim_mapping.tab") )
-
-Gsl <- readr::read_tsv(fn,
-                       col_names = c("ID",
-                                     "name",
-                                     "SGDId",
-                                     "Ontology",
-                                     "termName",
-                                     "termID",
-                                     "status"))
-
-head(Gsl)
-
-# What cell cycle names does it contain?
-myGslTermNames <- unique(Gsl$termName)  # 169 unique terms
-myGslTermNames[grep("cycle", myGslTermNames)]
-# [1] "regulation of cell cycle"  "mitotic cell cycle"  "meiotic cell cycle"
-
-# Choose "mitotic cell cycle" as the GOslim term to subset with
-
-scCCgenes <- unique(Gsl$ID[Gsl$termName == "mitotic cell cycle"])
-length(scCCgenes)  # 324 genes annotated to that term
-
-# ==   3.1  Intersect interactions and annotations  ============================
-
-sum(scCCgenes %in% myIntxGenes)  # 307 of these have high-confidence
-#                                # functional interactions
-
-
-# =    4  DEFINE THE CELL-CYCLE NETWORK  =======================================
-#
-# Define scCCnet ... the S. Cervisiae Cell Cycle network
-# Subset all rows for which BOTH genes are in the GOslim cell cycle set
-#
-scCCnet <- STR[(STR$protein1 %in% scCCgenes) &
-               (STR$protein2 %in% scCCgenes), ]
-
-# How many genes are there?
-length(unique(c(scCCnet$protein1, scCCnet$protein2)))  #283
-
-# Each edge is listed twice - now remove duplicates.
-
-# Step 1: make a vector: sort two names so the fiRst one is alphabetically
-#         smaller Than the second one. This brings the two names into a defined
-#         order. Then concatenate them with a "." - the resulting string
-#         is always the same, for any order. E.g. c("A", "B") gives "A.B"
-#         and c("B", "A") also gives "A.B". This identifies duplicates.
-
-x <- apply(cbind(scCCnet$protein1, scCCnet$protein2),
-           1,
-           FUN = function(x) { return(paste(sort(x), collapse = ".")) })
-head(x) # "YAL016W.YGR040W" "YAL016W.YOR014W" "YAL016W.YDL188C" ... etc.
-
-sum(duplicated(x))  # 1453
-
-# Step 2: drop all rows that contain duplicates in x
-scCCnet <- scCCnet[! duplicated(x), ]
-
-# Confirm we didn't loose genes
-length(unique(c(scCCnet$protein1, scCCnet$protein2)))  # 283, no change
-nrow(scCCnet)
-# Network has 283 nodes, 1453 edges
-
-saveRDS(scCCnet, file = "./data/scCCnet.rds")
-
-# scCCnet <- readRDS("./data/scCCnet.rds")   # <<<- use this to restore the
-                                             #      object when needed
-
-
-# [END]
+# tocID <- "scripts/ABC-makeScCCnet.R"
+#
+# Create a subnetwork of high-confidence yeast genes with a "mitotic cell cycle"
+# GOSlim annotation.
+#
+# Boris Steipe for ABC learning units
+#
+# Notes:
+#
+#      The large source- datafiles are NOT posted to github. If you want to
+#      experiment with your own code, download them and place them into your
+#      local  ./data  directory.
+#
+#      STRING data source:
+#        Download page:
+# https://string-db.org/cgi/download.pl?species_text=Saccharomyces+cerevisiae
+#        Data: (20.1 mb)
+# https://stringdb-static.org/download/protein.links.full.v11.0/4932.protein.links.full.v11.0.txt.gz
+#
+#      GOSlim data source: (Note: this has moved from GO to SGD)
+#        Info page: https://www.yeastgenome.org/downloads
+#        Info page: http://sgd-archive.yeastgenome.org/curation/literature/
+#        Data: (3 mb)
+# http://sgd-archive.yeastgenome.org/curation/literature/go_slim_mapping.tab
+#
+#
+# Version:  1.2
+#
+# Date:     2017-10  -  2020-09
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.2    2020 Update. GO Slim Yeast now at SGD
+#           1.1    Change from require() to requireNamespace(),
+#                      use <package>::<function>() idiom throughout
+#           1.0    First code copied from 2016 material.
+#
+# TODO:
+#
+# ==============================================================================
+# SRCDIR <- "./instructor"
+
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                                           Line
+#TOC> ---------------------------------------------------------------
+#TOC>   1        INITIALIZE                                        58
+#TOC>   2        STRING FUNCTIONAL INTERACTION DATA                66
+#TOC>   3        GOSlim FUNCTIONAL ANNOTATIONS                     96
+#TOC>   3.1        Intersect interactions and annotations         122
+#TOC>   4        DEFINE THE CELL-CYCLE NETWORK                    128
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  INITIALIZE  ==========================================================
+
+SRCDIR <- "./data"
+if (! requireNamespace("readr", quietly = TRUE)) {
+  install.packages("readr")
+}
+
+
+# =    2  STRING FUNCTIONAL INTERACTION DATA  ==================================
+
+# Read STRING Data (needs to be downloaded from database, see URL in Notes)
+# The .gz compressed version is 20MB, the uncompressed version is 110MB -
+# really not necessary to uncompress since readr:: can read from compressed
+# files, and does so automatically, based on the file extension.
+( fn <- file.path(SRCDIR, "4932.protein.links.full.v11.0.txt.gz") )
+STR <- readr::read_delim(fn, delim = " ")
+
+# Subset only IDs and combined_score column
+STR <- STR[ , c("protein1", "protein2", "combined_score")]
+
+# head(STR)
+# sum(STR$combined_score > 909)  # 100270 edges
+# subset for 100,000 highest confidence edges
+STR <- STR[(STR$combined_score > 909), ]
+head(STR)
+
+# IDs are formatted like 4932.YAL005C ... drop the "4932." prefix
+STR$protein1 <- gsub("^4932\\.", "", STR$protein1)
+STR$protein2 <- gsub("^4932\\.", "", STR$protein2)
+head(STR)
+
+# get a vector of gene names in this list
+myIntxGenes <- unique(c(STR$protein1, STR$protein2))  # yeast systematic gene
+                                                      # names
+length(myIntxGenes)
+sample(myIntxGenes, 10)  # choose 10 at random (sanity check)
+
+
+# =    3  GOSlim FUNCTIONAL ANNOTATIONS  =======================================
+#
+# Read GOSlim data  (needs to be downloaded from database, see URL in Notes)
+( fn <- file.path(SRCDIR, "go_slim_mapping.tab") )
+
+Gsl <- readr::read_tsv(fn,
+                       col_names = c("ID",
+                                     "name",
+                                     "SGDId",
+                                     "Ontology",
+                                     "termName",
+                                     "termID",
+                                     "status"))
+
+head(Gsl)
+
+# What cell cycle names does it contain?
+myGslTermNames <- unique(Gsl$termName)  # 169 unique terms
+myGslTermNames[grep("cycle", myGslTermNames)]
+# [1] "regulation of cell cycle"  "mitotic cell cycle"  "meiotic cell cycle"
+
+# Choose "mitotic cell cycle" as the GOslim term to subset with
+
+scCCgenes <- unique(Gsl$ID[Gsl$termName == "mitotic cell cycle"])
+length(scCCgenes)  # 324 genes annotated to that term
+
+# ==   3.1  Intersect interactions and annotations  ============================
+
+sum(scCCgenes %in% myIntxGenes)  # 307 of these have high-confidence
+#                                # functional interactions
+
+
+# =    4  DEFINE THE CELL-CYCLE NETWORK  =======================================
+#
+# Define scCCnet ... the S. cerevisiae Cell Cycle network
+# Subset all rows for which BOTH genes are in the GOslim cell cycle set
+#
+scCCnet <- STR[(STR$protein1 %in% scCCgenes) &
+               (STR$protein2 %in% scCCgenes), ]
+
+# How many genes are there?
+length(unique(c(scCCnet$protein1, scCCnet$protein2)))  #283
+
+# Each edge is listed twice - now remove duplicates.
+
+# Step 1: make a vector: sort two names so the first one is alphabetically
+#         smaller than the second one. This brings the two names into a defined
+#         order. Then concatenate them with a "." - the resulting string
+#         is always the same, for any order. E.g. c("A", "B") gives "A.B"
+#         and c("B", "A") also gives "A.B". This identifies duplicates.
+
+x <- apply(cbind(scCCnet$protein1, scCCnet$protein2),
+           1,
+           FUN = function(x) { return(paste(sort(x), collapse = ".")) })
+head(x) # "YAL016W.YGR040W" "YAL016W.YOR014W" "YAL016W.YDL188C" ... etc.
+
+sum(duplicated(x))  # 1453
+
+# Step 2: drop all rows that contain duplicates in x
+scCCnet <- scCCnet[! duplicated(x), ]
+
+# Confirm we didn't lose genes
+length(unique(c(scCCnet$protein1, scCCnet$protein2)))  # 283, no change
+nrow(scCCnet)
+# Network has 283 nodes, 1453 edges
+
+saveRDS(scCCnet, file = "./data/scCCnet.rds")
+
+# scCCnet <- readRDS("./data/scCCnet.rds")   # <<<- use this to restore the
+                                             #      object when needed
+
+
+# [END]
--- a/scripts/ABC-writeALN.R
+++ b/scripts/ABC-writeALN.R
@ -1,135 +1,135 @@
-# tocID <- "scripts/ABC-writeALN.R"
-#
-# ToDo:    calculate consensus line
-#          append sequence numbers
-# Notes:
-#
-# ==============================================================================
-
-
-writeALN <- function(ali,
-                     range,
-                     note = "",
-                     myCon = stdout(),
-                     blockWidth = 60) {
-  # Purpose:
-  #     Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or
-  #     a file in multi-FASTA format.
-  # Version: 2.0
-  # Date:    2017 10
-  # Author:  Boris Steipe
-  #
-  # Parameters:
-  #     ali             MsaAAMultipleAlignment or AAStringSet or character
-  #                       vector.
-  #     range      num  a two-integer vector of start and end positions if
-  #                       only a range of the MSA should be written, e.g.
-  #                       a domain. Defaults to the full alignment length.
-  #     note       chr  a vector of character that is appended to the name
-  #                       of a sequence in the FASTA header. Recycling of
-  #                       shorter vectors applies, thus a vector of length one
-  #                       is added to all headers.
-  #     myCon           a connection (cf. the con argument for writeLines).
-  #                       Defaults to stdout()
-  #     blockWidth int  width of sequence block. Default 80 characters.
-  # Value:
-  #     NA   the function is invoked for its side effect of printing an
-  #          alignment to stdout() or file.
-
-  blockWidth <- as.integer(blockWidth)
-  if (is.na(blockWidth)) {
-    stop("PANIC: parameter \"blockWidth\" must be numeric.")
-  }
-  if (blockWidth < 1) {
-    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
-  }
-  if (blockWidth > 60) {
-    warning("Programs that read CLUSTAL format might not expect blockWidth > 60.")
-  }
-
-  # Extract the raw data from the objects depending on their respective class
-  # and put it into a named vector of strings.
-
-  # Extract XStringSet from MsaXMultipleAlignment ...
-  if (class(ali) == "MsaAAMultipleAlignment" |
-      class(ali) == "MsaDNAMultipleAlignment" |
-      class(ali) == "MsaRNAMultipleAlignment") {
-      ali <- ali@unmasked
-  }
-
-  # Process XStringSet
-  if (class(ali) == "AAStringSet" |
-      class(ali) == "DNAStringSet" |
-      class(ali) == "RNAStringSet") {
-    sSet <- as.character(ali) # we use as.character(), not toString() thus
-                              # we don't _have_ to load Biostrings
-  } else if (class(ali) == "character") {
-    sSet <- ali
-  } else {
-    stop(paste("Input object of class",
-               class(ali),
-               "can't be handled by this function."))
-  }
-
-  if (missing(range)) {
-    range <- 1
-    range[2] <- max(nchar(sSet))
-  } else {
-    range <- as.integer(range)
-    if(length(range) != 2 ||
-       any(is.na(range)) ||
-       range[1] > range[2] ||
-       range[1] < 1) {
-      stop("PANIC: \"range\" parameter must contain valid start and end index.")
-    }
-  }
-
-  # Right-pad any sequence with "-" that is shorter than ranges[2]
-    for (i in seq_along(sSet)) {
-      if (nchar(sSet[i]) < range[2]) {
-        sSet[i] <- paste0(sSet[i],
-                          paste0(rep("-", range[2] - nchar(sSet[i])),
-                                 collapse = ""))
-      }
-    }
-
-  # Right-pad sequence names
-  sNames <- names(sSet)
-  len <- max(nchar(sNames)) + 2 # longest name plus two spaces
-  for (i in seq_along(sNames)) {
-    sNames[i] <- paste0(sNames[i],
-                      paste0(rep(" ", len - nchar(sNames[i])),
-                             collapse = ""))
-  }
-
-
-  # Process each sequence
-  txt <- paste0("CLUSTAL W format. ", note)
-  txt[2] <- ""
-
-  iStarts <- seq(range[1], range[2], by = blockWidth)
-  iEnds <- c((iStarts[-1] - 1), range[2])
-
-  for (i in seq_along(iStarts)) {
-    for (j in seq_along(sSet)) {
-      txt <- c(txt,
-               paste0(sNames[j], substring(sSet[j], iStarts[i], iEnds[i])))
-    }
-    txt <- c(txt, "")  # append a blank consenus line
-    txt <- c(txt, "")  # append a separator line
-  }
-
-  writeLines(txt, con= myCon)
-
-}
-
-# ====  TESTS  =================================================================
-# Enter your function tests here...
-
-if (FALSE) {
-  # test ...
-}
-
-
-
-# [END]
+# tocID <- "scripts/ABC-writeALN.R"
+#
+# ToDo:    calculate consensus line
+#          append sequence numbers
+# Notes:
+#
+# ==============================================================================
+
+
+writeALN <- function(ali,
+                     range,
+                     note = "",
+                     myCon = stdout(),
+                     blockWidth = 60) {
+  # Purpose:
+  #     Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or
+  #     a file in CLUSTAL W format.
+  # Version: 2.0
+  # Date:    2017 10
+  # Author:  Boris Steipe
+  #
+  # Parameters:
+  #     ali             Msa{AA|DNA|RNA}MultipleAlignment, {AA|DNA|RNA}StringSet
+  #                       or character vector; element names label the rows.
+  #     range      num  a two-integer vector of start and end positions if
+  #                       only a range of the MSA should be written, e.g.
+  #                       a domain. Defaults to the full alignment length.
+  #     note       chr  text appended to the "CLUSTAL W format. " header line
+  #                       that opens the output. NOTE(review): element 2 of
+  #                       the header vector is overwritten with "", so a
+  #                       single string seems intended - confirm.
+  #     myCon           a connection (cf. the con argument for writeLines).
+  #                       Defaults to stdout()
+  #     blockWidth int  width of sequence block. Default 60 characters.
+  # Value:
+  #     NA   the function is invoked for its side effect of printing an
+  #          alignment to stdout() or file.
+
+  blockWidth <- as.integer(blockWidth)
+  if (is.na(blockWidth)) {
+    stop("PANIC: parameter \"blockWidth\" must be numeric.")
+  }
+  if (blockWidth < 1) {
+    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
+  }
+  if (blockWidth > 60) {
+    warning("Programs that read CLUSTAL format might not expect blockWidth > 60.")
+  }
+
+  # Extract the raw data from the objects depending on their respective class
+  # and put it into a named vector of strings.
+
+  # Extract XStringSet from MsaXMultipleAlignment ...
+  if (class(ali) == "MsaAAMultipleAlignment" |
+      class(ali) == "MsaDNAMultipleAlignment" |
+      class(ali) == "MsaRNAMultipleAlignment") {
+      ali <- ali@unmasked
+  }
+
+  # Process XStringSet
+  if (class(ali) == "AAStringSet" |
+      class(ali) == "DNAStringSet" |
+      class(ali) == "RNAStringSet") {
+    sSet <- as.character(ali) # we use as.character(), not toString() thus
+                              # we don't _have_ to load Biostrings
+  } else if (class(ali) == "character") {
+    sSet <- ali
+  } else {
+    stop(paste("Input object of class",
+               class(ali),
+               "can't be handled by this function."))
+  }
+
+  if (missing(range)) {
+    range <- 1
+    range[2] <- max(nchar(sSet))
+  } else {
+    range <- as.integer(range)
+    if(length(range) != 2 ||
+       any(is.na(range)) ||
+       range[1] > range[2] ||
+       range[1] < 1) {
+      stop("PANIC: \"range\" parameter must contain valid start and end index.")
+    }
+  }
+
+  # Right-pad any sequence with "-" that is shorter than range[2]
+    for (i in seq_along(sSet)) {
+      if (nchar(sSet[i]) < range[2]) {
+        sSet[i] <- paste0(sSet[i],
+                          paste0(rep("-", range[2] - nchar(sSet[i])),
+                                 collapse = ""))
+      }
+    }
+
+  # Right-pad sequence names to a common width so the sequences line up
+  sNames <- names(sSet)
+  len <- max(nchar(sNames)) + 2 # longest name plus two spaces
+  for (i in seq_along(sNames)) {
+    sNames[i] <- paste0(sNames[i],
+                      paste0(rep(" ", len - nchar(sNames[i])),
+                             collapse = ""))
+  }
+
+
+  # Assemble the output: header line, blank line, then one block per chunk
+  txt <- paste0("CLUSTAL W format. ", note)
+  txt[2] <- ""
+
+  iStarts <- seq(range[1], range[2], by = blockWidth)  # first column per block
+  iEnds <- c((iStarts[-1] - 1), range[2])  # last column; final one is range[2]
+
+  for (i in seq_along(iStarts)) {
+    for (j in seq_along(sSet)) {
+      txt <- c(txt,
+               paste0(sNames[j], substring(sSet[j], iStarts[i], iEnds[i])))
+    }
+    txt <- c(txt, "")  # append a blank consensus line
+    txt <- c(txt, "")  # append a separator line
+  }
+
+  writeLines(txt, con= myCon)
+
+}
+
+# ====  TESTS  =================================================================
+# Enter your function tests here...
+
+if (FALSE) {
+  # test ...
+}
+
+
+
+# [END]
--- a/scripts/ABC-writeMFA.R
+++ b/scripts/ABC-writeMFA.R
@ -1,121 +1,121 @@
-# ABC-writeMFA.R
-#
-# ToDo:
-# Notes:  2.1  bugfix: empty notes caused superfluous blank after header.
-#
-#
-# ==============================================================================
-
-
-writeMFA <- function(ali,
-                     range,
-                     note = "",
-                     myCon = stdout(),
-                     blockWidth = 80) {
-  # Purpose:
-  #     Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or
-  #     a file in multi-FASTA format.
-  # Version: 2.1
-  # Date:    2017  10
-  # Author:  Boris Steipe
-  #
-  # Parameters:
-  #     ali             MsaAAMultipleAlignment or AAStringSet or character
-  #                       vector
-  #     range      num  a two-integer vector of start and end positions if
-  #                       only a range of the MSA should be written, e.g.
-  #                       a domain. Defaults to the full sequence length.
-  #     note       chr  a vector of character that is appended to the name
-  #                       of a sequence in the FASTA header. Recycling of
-  #                       shorter vectors applies, thus a vector of length one
-  #                       is added to all headers.
-  #     myCon           a connection (cf. the con argument for writeLines).
-  #                       Defaults to stdout()
-  #     blockWidth int  width of sequence block. Default 80 characters.
-  # Value:
-  #     NA   the function is invoked for its side effect of printing an
-  #          alignment to stdout() or file.
-
-  blockWidth <- as.integer(blockWidth)
-  if (is.na(blockWidth)) {
-    stop("PANIC: parameter \"blockWidth\" must be numeric.")
-  }
-  if (! blockWidth > 0){
-    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
-  }
-
-  # Extract the raw data from the objects depending on their respective class
-  # and put it into a named vector of strings.
-
-  # Extract XStringSet from MsaXMultipleAlignment ...
-  if (class(ali) == "MsaAAMultipleAlignment" |
-      class(ali) == "MsaDNAMultipleAlignment" |
-      class(ali) == "MsaRNAMultipleAlignment") {
-      ali <- ali@unmasked
-  }
-
-  # Process XStringSet
-  if (class(ali) == "AAStringSet" |
-      class(ali) == "DNAStringSet" |
-      class(ali) == "RNAStringSet") {
-    sSet <- as.character(ali) # we use as.character(), not toString() thus
-                              # we don't _have_ to load Biostrings
-  } else if (class(ali) == "character") {
-    sSet <- ali
-  } else {
-    stop(paste("Input object of class",
-               class(ali),
-               "can't be handled by this function."))
-  }
-
-  if (missing(range)) {
-    range <- 1
-    range[2] <- max(nchar(sSet))
-  } else {
-    range <- as.integer(range)
-    if(length(range) != 2 ||
-       any(is.na(range)) ||
-       range[1] > range[2] ||
-       range[1] < 1) {
-      stop("PANIC: \"range\" parameter must contain valid start and end index.")
-    }
-  }
-
-  # Process each sequence
-  txt <- character()
-  if (note != "") {  # construct header line
-    headers <- paste(names(sSet), note)
-  } else {
-    headers <- names(sSet)
-  }
-
-  for (i in seq_along(sSet)) {
-
-    # output FASTA header
-    txt <- c(txt, sprintf(">%s", headers[i]))
-
-    # output the sequence in blocks of blockWidth per line ...
-    iStarts <- seq(range[1], range[2], by = blockWidth)
-    iEnds <- c((iStarts[-1] - 1), range[2])
-
-    thisSeq <- substring(sSet[i], iStarts, iEnds)  # collect all blocks
-    thisSeq <- thisSeq[! nchar(thisSeq) == 0]      # drop empty blocks
-    txt <- c(txt, thisSeq)
-
-    txt <- c(txt, "")  # append an empty line for readability
-  }
-
-  writeLines(txt, con = myCon)
-
-}
-
-# ====  TESTS  =================================================================
-# Enter your function tests here...
-
-if (FALSE) {
-  # test ...
-}
-
-
-
-# [END]
+# ABC-writeMFA.R
+#
+# ToDo:
+# Notes:  2.1  bugfix: empty notes caused superfluous blank after header.
+#
+#
+# ==============================================================================
+
+
+writeMFA <- function(ali,
+                     range,
+                     note = "",
+                     myCon = stdout(),
+                     blockWidth = 80) {
+  # Purpose:
+  #     Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or
+  #     a file in multi-FASTA format.
+  # Version: 2.1
+  # Date:    2017  10
+  # Author:  Boris Steipe
+  #
+  # Parameters:
+  #     ali             Msa{AA|DNA|RNA}MultipleAlignment, {AA|DNA|RNA}StringSet
+  #                       or character vector; element names become headers.
+  #     range      num  a two-integer vector of start and end positions if
+  #                       only a range of the MSA should be written, e.g.
+  #                       a domain. Defaults to 1 .. length of longest sequence.
+  #     note       chr  character appended, after a blank, to each sequence
+  #                       name in the FASTA header; paste() recycles shorter
+  #                       vectors, "" adds nothing. NOTE(review): the test
+  #                       if (note != "") expects length one - confirm.
+  #     myCon           a connection (cf. the con argument for writeLines).
+  #                       Defaults to stdout()
+  #     blockWidth int  width of sequence block. Default 80 characters.
+  # Value:
+  #     NA   the function is invoked for its side effect of printing an
+  #          alignment to stdout() or file.
+
+  blockWidth <- as.integer(blockWidth)
+  if (is.na(blockWidth)) {
+    stop("PANIC: parameter \"blockWidth\" must be numeric.")
+  }
+  if (! blockWidth > 0){
+    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
+  }
+
+  # Extract the raw data from the objects depending on their respective class
+  # and put it into a named vector of strings.
+
+  # Extract XStringSet from MsaXMultipleAlignment ...
+  if (class(ali) == "MsaAAMultipleAlignment" |
+      class(ali) == "MsaDNAMultipleAlignment" |
+      class(ali) == "MsaRNAMultipleAlignment") {
+      ali <- ali@unmasked
+  }
+
+  # Process XStringSet
+  if (class(ali) == "AAStringSet" |
+      class(ali) == "DNAStringSet" |
+      class(ali) == "RNAStringSet") {
+    sSet <- as.character(ali) # we use as.character(), not toString() thus
+                              # we don't _have_ to load Biostrings
+  } else if (class(ali) == "character") {
+    sSet <- ali
+  } else {
+    stop(paste("Input object of class",
+               class(ali),
+               "can't be handled by this function."))
+  }
+
+  if (missing(range)) {
+    range <- 1
+    range[2] <- max(nchar(sSet))
+  } else {
+    range <- as.integer(range)
+    if(length(range) != 2 ||
+       any(is.na(range)) ||
+       range[1] > range[2] ||
+       range[1] < 1) {
+      stop("PANIC: \"range\" parameter must contain valid start and end index.")
+    }
+  }
+
+  # Build the headers, then emit header + sequence blocks + blank line per entry
+  txt <- character()
+  if (note != "") {  # construct header line
+    headers <- paste(names(sSet), note)
+  } else {
+    headers <- names(sSet)
+  }
+
+  for (i in seq_along(sSet)) {
+
+    # output FASTA header
+    txt <- c(txt, sprintf(">%s", headers[i]))
+
+    # output the sequence in blocks of blockWidth per line ...
+    iStarts <- seq(range[1], range[2], by = blockWidth)  # same for every i
+    iEnds <- c((iStarts[-1] - 1), range[2])  # last block ends at range[2]
+
+    thisSeq <- substring(sSet[i], iStarts, iEnds)  # collect all blocks
+    thisSeq <- thisSeq[! nchar(thisSeq) == 0]      # drop empty blocks
+    txt <- c(txt, thisSeq)
+
+    txt <- c(txt, "")  # append an empty line for readability
+  }
+
+  writeLines(txt, con = myCon)
+
+}
+
+# ====  TESTS  =================================================================
+# Enter your function tests here...
+
+if (FALSE) {
+  # test ...
+}
+
+
+
+# [END]
--- a/scripts/BLAST.R
+++ b/scripts/BLAST.R
@ -1,384 +1,384 @@
-# BLAST.R
-#
-# Purpose: Send off one BLAST search and return parsed list of results
-#          This script uses the BLAST URL-API
-#          (Application Programming Interface) at the NCBI.
-#          Read about the constraints here:
-#          https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
-#
-#
-# Version: 3.2
-# Date:    2016 09 - 2020 09
-# Author:  Boris Steipe
-#
-# Versions:
-#    3.2   2020 updates
-#    3.1   Change from require() to requireNamespace(),
-#          use <package>::<function>() idiom throughout
-#    3.0   parsing logic had not been fully implemented; Fixed.
-#    2.1   bugfix in BLAST(), bug was blanking non-split deflines;
-#          refactored parseBLASTalignment() to handle lists with multiple hits.
-#    2.0   Completely rewritten because the interface completely changed.
-#          Code adpated in part from NCBI Perl sample code:
-#          $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
-#    1.0   first version posted for BCH441 2016, based on BLAST - API
-#
-# ToDo:    Return the organism/strain name in the output, and propagate
-#          into MYSPE selection script.
-#
-# Notes:   This is somewhat pedestrian, but apparently there are currently
-#          no R packages that contain such code.
-#
-# ==============================================================================
-
-
-if (! requireNamespace("httr", quietly = TRUE)) {
-  install.packages("httr")
-}
-
-
-BLAST <- function(Q,
-                  db = "refseq_protein",
-                  nHits = 30,
-                  E = 0.1,
-                  limits = "",
-                  rid = "",
-                  query = "",
-                  quietly = FALSE,
-                  myTimeout = 120) {
-    # Purpose:
-    #     Basic BLAST search
-    #
-    # Parameters:
-    #     Q: query - either a valid ID or a sequence
-    #     db: "refseq_protein" by default,
-    #         other legal values include: "nr", "pdb", "swissprot" ...
-    #     nHits: number of hits to maximally return
-    #     E: E-value cutoff. Do not return hits whose score would be expected
-    #        to occur E or more times in a database of random sequence.
-    #     limits: a valid ENTREZ filter
-    #     rid: a request ID - to retrieve earlier search results
-    #     query: the actual query string (needed when retrieving results
-    #            with an rid)
-    #     quietly: controls printing of wait-time progress bar
-    #     timeout: how much longer _after_ rtoe to wait for a result
-    #              before giving up (seconds)
-    # Value:
-    #     result: list of process status or resulting hits, and some metadata
-
-
-    EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done
-
-    results <- list()
-    results$query = query
-    results$rid <- rid
-    results$rtoe <- 0
-
-    if (rid == "") {  # If no rid is available, spawn a search.
-                      # Else, proceed directly to retrieval.
-
-      # prepare query, GET(), and parse rid and rtoe from BLAST server response
-      results$query <- paste0("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
-                              "?",
-                              "CMD=Put",
-                              "&PROGRAM=", "blastp",
-                              "&QUERY=", URLencode(Q),
-                              "&DATABASE=", db,
-                              "&MATRIX=", "BLOSUM62",
-                              "&EXPECT=", as.character(E),
-                              "&HITLIST_SIZE=", as.character(nHits),
-                              "&ALIGNMENTS=", as.character(nHits),
-                              "&FORMAT_TYPE=Text")
-
-      if (limits != "") {
-        results$query <- paste0(
-          results$query,
-          "&ENTREZ_QUERY=", limits)
-      }
-
-      # send it off ...
-      response <- httr::GET(results$query)
-      if (httr::http_status(response)$category != "Success" ) {
-        stop(sprintf("PANIC: Can't send query. BLAST server status error: %s",
-                     httr::http_status(response)$message))
-      }
-
-      txt <- httr::content(response, "text", encoding = "UTF-8")
-
-      patt <- "RID = (\\w+)" # match the request id
-      results$rid  <- regmatches(txt, regexec(patt,  txt))[[1]][2]
-
-      patt <- "RTOE = (\\d+)" # match the expected completion time
-      results$rtoe <- as.numeric(regmatches(txt, regexec(patt, txt))[[1]][2])
-
-      # Now we wait ...
-      if (quietly) {
-        Sys.sleep(results$rtoe)
-      } else {
-        cat(sprintf("BLAST is processing %s:\n", results$rid))
-        waitTimer(results$rtoe)
-      }
-
-    } # done sending query and retrieving rid, rtoe
-
-    # Enter an infinite loop to check for result availability
-    checkStatus <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
-                         "?",
-                         "CMD=Get",
-                         "&RID=", results$rid,
-                         "&FORMAT_TYPE=Text",
-                         "&FORMAT_OBJECT=SearchInfo",
-                         sep = "")
-
-    while (TRUE) {
-      # Check whether the result is ready
-      response <- httr::GET(checkStatus)
-      if (httr::http_status(response)$category != "Success" ) {
-        stop(sprintf("PANIC: Can't check status. BLAST server status error: %s",
-                     httr::http_status(response)$message))
-      }
-
-      txt <- httr::content(response, "text", encoding = "UTF-8")
-
-      if (length(grep("Status=WAITING",  txt)) > 0) {
-        myTimeout <- myTimeout - EXTRAWAIT
-
-        if (myTimeout <= 0) { # abort
-          cat("BLAST search not concluded before timeout. Aborting.\n")
-          cat(sprintf("%s  BLASThits <- BLAST(rid=\"%s\")\n",
-                      "Trying checking back later with >",
-                      results$rid))
-          return(results)
-        }
-
-        if (quietly) {
-          Sys.sleep(EXTRAWAIT)
-        } else {
-          cat(sprintf("Status: Waiting. Wait %d more seconds (max. %d more)",
-                      EXTRAWAIT,
-                      myTimeout))
-          waitTimer(EXTRAWAIT)
-          next
-        }
-
-      } else if (length(grep("Status=FAILED",  txt)) > 0) {
-          cat("BLAST search returned status \"FAILED\". Aborting.\n")
-          return(results)
-
-      } else if (length(grep("Status=UNKNOWN",  txt)) > 0) {
-          cat("BLAST search returned status \"UNKNOWN\".\n")
-          cat("This probably means the rid has expired. Aborting.\n")
-          return(results)
-
-      } else if (length(grep("Status=READY",  txt)) > 0) {  # Done
-
-          if (length(grep("ThereAreHits=yes",  txt)) == 0) {  # No hits
-            cat("BLAST search ready but no hits found. Aborting.\n")
-            return(results)
-
-          } else {
-            break  # done ... retrieve search result
-          }
-      }
-    } # end result-check loop
-
-    # retrieve results from BLAST server
-    retrieve <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
-                      "?",
-                      "&CMD=Get",
-                      "&RID=", results$rid,
-                      "&FORMAT_TYPE=Text",
-                      sep = "")
-
-    response <- httr::GET(retrieve)
-    if (httr::http_status(response)$category != "Success" ) {
-      stop(sprintf("PANIC: Can't retrieve. BLAST server status error: %s",
-                   httr::http_status(response)$message))
-    }
-
-    txt <- httr::content(response, "text", encoding = "UTF-8")
-
-    # txt contains the whole set of results. Process:
-
-    # First, we strsplit() on linebreaks:
-    txt <- unlist(strsplit(txt, "\n"))
-
-    # The alignments range from the first line that begins with ">" ...
-    iFirst <- grep("^>", txt)[1]
-
-    # ... to the last line that begins with "Sbjct"
-    x <- grep("^Sbjct", txt)
-    iLast <- x[length(x)]
-
-    # Get the alignments block
-    txt <- txt[iFirst:iLast]
-
-    # Drop empty lines
-    txt <- txt[!(nchar(txt) == 0)]
-
-    # A line that ends "]" but does not begin ">" seems to be a split
-    # defline ... eg.
-    #  [1] ">XP_013349208.1 AUEXF2481DRAFT_695809 [Aureobasidium subglaciale "
-    #  [2] "EXF-2481]"
-    #  Merge these lines to the preceding lines and delete them.
-    #
-    x <- which(grepl("]$", txt) & !(grepl("^>", txt)))
-    if (length(x) > 0) {
-      txt[x-1] <- paste0(txt[x-1], txt[x])
-      txt <- txt[-x]
-    }
-
-    # Special case: there may be multiple deflines when the BLAST hit is to
-    # redundant, identical sequences. Keep only the first instance.
-    iKeep <- ! grepl("^>", txt)
-    x <- rle(iKeep)
-    x$positions <- cumsum(x$lengths)
-    i <- which(x$lengths > 1 & x$values == FALSE)
-    if (length(i) > 0) {
-      firsts <- x$positions[i] - x$lengths[i] + 1
-      iKeep[firsts] <- TRUE
-      txt <- txt[iKeep]
-    }
-
-    # After this preprocessing the following should be true:
-    # - Every alignment block begins with a defline in which the
-    #   first character is ">"
-    # - There is only one defline in each block.
-    # - Lines are not split.
-
-    # Make a dataframe of first and last indices of alignment blocks
-    x <- grep("^>", txt)
-    blocks <- data.frame(iFirst = x,
-                         iLast  = c((x[-1] - 1), length(txt)))
-
-    # Build the hits list by parsing the blocks
-    results$hits <- list()
-
-    for (i in seq_len(nrow(blocks))) {
-      thisBlock <- txt[blocks$iFirst[i]:blocks$iLast[i]]
-      results$hits[[i]] <- parseBLASTalignment(thisBlock)
-    }
-
-    return(results)
-}
-
-parseBLASTalignment <- function(hit) {
-  # Parse data from a character vector containing a BLAST hit
-  # Parameters:
-  #    hit  char   one BLAST hit as char vector
-  # Value:
-  #          list   $def          chr   defline
-  #                 $accession    chr   accession number
-  #                 $organism     chr   complete organism definition
-  #                 $species      chr   binomial species
-  #                 $E            num   E value
-  #                 $lengthAli    num   length of the alignment
-  #                 $nIdentitites num   number of identities
-  #                 $nGaps        num   number of gaps
-  #                 $Qbounds      num   2-element vector of query start-end
-  #                 $Sbounds      num   2-element vector of subject start-end
-  #                 $Qseq         chr   query sequence
-  #                 $midSeq       chr   midline string
-  #                 $Sseq         chr   subject sequence
-
-  getToken <- function(patt, v) {
-    # get the first token identified by pattern patt in character vector v
-    v <- v[grep(patt, v)]
-    if (length(v) > 1) { v <- v[1] }
-    if (length(v) == 0) { token <- NA
-    } else {
-      token <- regmatches(v, regexec(patt, v))[[1]][2] }
-    return(token)
-  }
-
-  h <- list()
-
-  # FASTA defline
-  h$def <- hit[1]
-
-  # accesion number (ID), use the first if there are several, separated by "|"
-  patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
-  h$accession <-  regmatches(h$def, regexec(patt, h$def))[[1]][2]
-
-  # organism
-  patt <- "\\[(.+)]"
-  h$organism <-  regmatches(h$def, regexec(patt, h$def))[[1]][2]
-
-  # species
-  x <- unlist(strsplit(h$organism, "\\s+"))
-  if (length(x) >= 2) {
-    h$species <- paste(x[1], x[2])
-  } else if (length(x) == 1) {
-    h$species <- paste(x[1], "sp.")
-  } else {
-    h$species <- NA
-  }
-
-  # E-value
-  h$E <- as.numeric(getToken("Expect\\s*=(.+?), Method", hit))
-
-  # length of alignment
-  h$lengthAli <- as.numeric(getToken("^\\s*Length\\s*=(.+)$", hit))
-
-  # number of identities
-  h$nIdentities <- as.numeric(getToken("^\\s*Identities\\s*=(.+?)/", hit))
-
-  # number of gaps
-  h$nGaps <- as.numeric(getToken("\\s*Gaps\\s*=(.+?)/", hit))
-
-  # split up alignment section
-  idx <- grep("^Query ", hit)
-  Que <- hit[idx]
-  Mid <- hit[idx + 1]
-  Sbj <- hit[idx + 2]
-
-  # first and last positions
-  h$Qbounds <- c(start = 0, end = 0)
-  h$Qbounds[1] <- as.numeric(getToken("^Query\\s*(\\d+)", Que[1]))
-  h$Qbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Que[length(Que)]))
-
-  h$Sbounds <- c(start = 0, end = 0)
-  h$Sbounds[1] <- as.numeric(getToken("^Sbjct\\s*(\\d+)", Sbj[1]))
-  h$Sbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Sbj[length(Sbj)]))
-
-  # aligned sequences
-  for (i in seq_along(Que)) {
-    patt <- ("^\\s*Query\\s*\\d+\\s*([A-Za-z-]+)") # capture aligned string
-    m <- regexec(patt, Que[i])
-    iFirst <- m[[1]][2]
-    iLast <- iFirst + attr(m[[1]], which = "match.length")[2] - 1
-    Que[i] <- substring(Que[i], iFirst, iLast)
-    Mid[i] <- substring(Mid[i], iFirst, iLast)
-    Sbj[i] <- substring(Sbj[i], iFirst, iLast)
-  }
-
-  h$Qseq   <- paste0(Que, collapse = "")
-  h$midSeq <- paste0(Mid, collapse = "")
-  h$Sseq   <- paste0(Sbj, collapse = "")
-
-  return(h)
-}
-
-
-# ==== TESTS ===================================================================
-
-if (FALSE) {
-  # define query:
-  q   <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain
-               "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
-               "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
-               sep="")
-  # or ...
-  q <- "NP_010227" # refseq ID
-
-  test <- BLAST(q,
-                nHits = 100,
-                E = 0.001,
-                rid = "",
-                limits = "txid4751[ORGN]")  # Fungi
-  str(test)
-  length(test$hits)
-}
-
-# [END]
-
+# BLAST.R
+#
+# Purpose: Send off one BLAST search and return parsed list of results
+#          This script uses the BLAST URL-API
+#          (Application Programming Interface) at the NCBI.
+#          Read about the constraints here:
+#          https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
+#
+#
+# Version: 3.2
+# Date:    2016 09 - 2020 09
+# Author:  Boris Steipe
+#
+# Versions:
+#    3.2   2020 updates
+#    3.1   Change from require() to requireNamespace(),
+#          use <package>::<function>() idiom throughout
+#    3.0   parsing logic had not been fully implemented; Fixed.
+#    2.1   bugfix in BLAST(), bug was blanking non-split deflines;
+#          refactored parseBLASTalignment() to handle lists with multiple hits.
+#    2.0   Completely rewritten because the interface completely changed.
+#          Code adapted in part from NCBI Perl sample code:
+#          $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
+#    1.0   first version posted for BCH441 2016, based on BLAST - API
+#
+# ToDo:    Return the organism/strain name in the output, and propagate
+#          into MYSPE selection script.
+#
+# Notes:   This is somewhat pedestrian, but apparently there are currently
+#          no R packages that contain such code.
+#
+# ==============================================================================
+
+
+if (! requireNamespace("httr", quietly = TRUE)) {  # install httr if missing
+  install.packages("httr")
+}
+
+
+BLAST <- function(Q,
+                  db = "refseq_protein",
+                  nHits = 30,
+                  E = 0.1,
+                  limits = "",
+                  rid = "",
+                  query = "",
+                  quietly = FALSE,
+                  myTimeout = 120) {
+    # Purpose:
+    #     Basic BLAST search: submit one blastp query through the NCBI BLAST
+    #     URL-API (or pick up an earlier search via its rid), poll until the
+    #     search has finished, then download and parse the text report.
+    #
+    # Parameters:
+    #     Q: query - either a valid ID or a sequence. Only evaluated when
+    #        rid is "", i.e. when a new search is spawned.
+    #     db: "refseq_protein" by default,
+    #         other legal values include: "nr", "pdb", "swissprot" ...
+    #     nHits: number of hits to maximally return
+    #     E: E-value cutoff. Do not return hits whose score would be expected
+    #        to occur E or more times in a database of random sequence.
+    #     limits: a valid ENTREZ filter
+    #     rid: a request ID - to retrieve earlier search results
+    #     query: the actual query string (needed when retrieving results
+    #            with an rid)
+    #     quietly: controls printing of wait-time progress bar
+    #     myTimeout: how much longer _after_ rtoe to wait for a result
+    #                before giving up (seconds)
+    # Value:
+    #     results: list of process status or resulting hits, and metadata:
+    #         $query  the request URL that was sent, or the "query"
+    #                 argument if an rid was supplied
+    #         $rid    the request ID of the search
+    #         $rtoe   the RTOE value reported by the server when the search
+    #                 was spawned (0 if no search was spawned)
+    #         $hits   list with one parseBLASTalignment() result per
+    #                 alignment block. Only present if a result with
+    #                 alignments was retrieved; on timeout, failure,
+    #                 expired rid or empty result, a message is printed and
+    #                 the list is returned without $hits.
+
+
+    EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done
+    BLASTURL  <- "https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi"
+
+    results <- list()
+    results$query <- query
+    results$rid <- rid
+    results$rtoe <- 0
+
+    if (rid == "") {  # If no rid is available, spawn a search.
+                      # Else, proceed directly to retrieval.
+
+      # prepare query, GET(), and parse rid and rtoe from BLAST server response
+      results$query <- paste0(BLASTURL,
+                              "?",
+                              "CMD=Put",
+                              "&PROGRAM=", "blastp",
+                              "&QUERY=", URLencode(Q),
+                              "&DATABASE=", db,
+                              "&MATRIX=", "BLOSUM62",
+                              "&EXPECT=", as.character(E),
+                              "&HITLIST_SIZE=", as.character(nHits),
+                              "&ALIGNMENTS=", as.character(nHits),
+                              "&FORMAT_TYPE=Text")
+
+      if (limits != "") {
+        results$query <- paste0(
+          results$query,
+          "&ENTREZ_QUERY=", limits)
+      }
+
+      # send it off ...
+      response <- httr::GET(results$query)
+      if (httr::http_status(response)$category != "Success" ) {
+        stop(sprintf("PANIC: Can't send query. BLAST server status error: %s",
+                     httr::http_status(response)$message))
+      }
+
+      txt <- httr::content(response, "text", encoding = "UTF-8")
+
+      patt <- "RID = (\\w+)" # match the request id
+      results$rid  <- regmatches(txt, regexec(patt,  txt))[[1]][2]
+      if (is.na(results$rid)) {
+        # Without a request ID there is nothing to poll for. Fail here with
+        # a clear message rather than later with an obscure one.
+        stop("PANIC: BLAST server response did not contain a request ID.")
+      }
+
+      patt <- "RTOE = (\\d+)" # match the expected completion time
+      results$rtoe <- as.numeric(regmatches(txt, regexec(patt, txt))[[1]][2])
+      if (is.na(results$rtoe)) {
+        results$rtoe <- 0  # no estimate given: go straight to polling
+      }
+
+      # Now we wait ...
+      if (quietly) {
+        Sys.sleep(results$rtoe)
+      } else {
+        cat(sprintf("BLAST is processing %s:\n", results$rid))
+        waitTimer(results$rtoe)
+      }
+
+    } # done sending query and retrieving rid, rtoe
+
+    # Poll for result availability. Every branch of the loop either waits
+    # (bounded by myTimeout), returns, or breaks.
+    checkStatus <- paste0(BLASTURL,
+                          "?",
+                          "CMD=Get",
+                          "&RID=", results$rid,
+                          "&FORMAT_TYPE=Text",
+                          "&FORMAT_OBJECT=SearchInfo")
+
+    while (TRUE) {
+      # Check whether the result is ready
+      response <- httr::GET(checkStatus)
+      if (httr::http_status(response)$category != "Success" ) {
+        stop(sprintf("PANIC: Can't check status. BLAST server status error: %s",
+                     httr::http_status(response)$message))
+      }
+
+      txt <- httr::content(response, "text", encoding = "UTF-8")
+
+      if (length(grep("Status=WAITING",  txt)) > 0) {
+        myTimeout <- myTimeout - EXTRAWAIT
+
+        if (myTimeout <= 0) { # abort
+          cat("BLAST search not concluded before timeout. Aborting.\n")
+          cat(sprintf("%s  BLASThits <- BLAST(rid=\"%s\")\n",
+                      "Trying checking back later with >",
+                      results$rid))
+          return(results)
+        }
+
+        if (quietly) {
+          Sys.sleep(EXTRAWAIT)
+        } else {
+          cat(sprintf("Status: Waiting. Wait %d more seconds (max. %d more)",
+                      EXTRAWAIT,
+                      myTimeout))
+          waitTimer(EXTRAWAIT)
+        }
+        # fall through to the next polling cycle
+
+      } else if (length(grep("Status=FAILED",  txt)) > 0) {
+          cat("BLAST search returned status \"FAILED\". Aborting.\n")
+          return(results)
+
+      } else if (length(grep("Status=UNKNOWN",  txt)) > 0) {
+          cat("BLAST search returned status \"UNKNOWN\".\n")
+          cat("This probably means the rid has expired. Aborting.\n")
+          return(results)
+
+      } else if (length(grep("Status=READY",  txt)) > 0) {  # Done
+
+          if (length(grep("ThereAreHits=yes",  txt)) == 0) {  # No hits
+            cat("BLAST search ready but no hits found. Aborting.\n")
+            return(results)
+
+          } else {
+            break  # done ... retrieve search result
+          }
+
+      } else {
+          # None of the known status strings was found. Looping on would
+          # re-query the server immediately and without end, so give up.
+          cat("BLAST server returned an unrecognized status. Aborting.\n")
+          return(results)
+      }
+    } # end result-check loop
+
+    # retrieve results from BLAST server
+    retrieve <- paste0(BLASTURL,
+                       "?",
+                       "CMD=Get",
+                       "&RID=", results$rid,
+                       "&FORMAT_TYPE=Text")
+
+    response <- httr::GET(retrieve)
+    if (httr::http_status(response)$category != "Success" ) {
+      stop(sprintf("PANIC: Can't retrieve. BLAST server status error: %s",
+                   httr::http_status(response)$message))
+    }
+
+    txt <- httr::content(response, "text", encoding = "UTF-8")
+
+    # txt contains the whole set of results. Process:
+
+    # First, we strsplit() on linebreaks:
+    txt <- unlist(strsplit(txt, "\n"))
+
+    # The alignments range from the first line that begins with ">" ...
+    iFirst <- grep("^>", txt)[1]
+
+    # ... to the last line that begins with "Sbjct"
+    iSbjct <- grep("^Sbjct", txt)
+
+    # Without both landmarks there is no alignment section to parse.
+    if (is.na(iFirst) ||
+        length(iSbjct) == 0 ||
+        iFirst > iSbjct[length(iSbjct)]) {
+      cat("BLAST result contained no parseable alignments. Aborting.\n")
+      return(results)
+    }
+    iLast <- iSbjct[length(iSbjct)]
+
+    # Get the alignments block
+    txt <- txt[iFirst:iLast]
+
+    # Drop empty lines
+    txt <- txt[!(nchar(txt) == 0)]
+
+    # A line that ends "]" but does not begin ">" seems to be a split
+    # defline ... eg.
+    #  [1] ">XP_013349208.1 AUEXF2481DRAFT_695809 [Aureobasidium subglaciale "
+    #  [2] "EXF-2481]"
+    #  Merge these lines to the preceding lines and delete them.
+    #
+    x <- which(grepl("]$", txt) & !(grepl("^>", txt)))
+    if (length(x) > 0) {
+      txt[x-1] <- paste0(txt[x-1], txt[x])
+      txt <- txt[-x]
+    }
+
+    # Special case: there may be multiple deflines when the BLAST hit is to
+    # redundant, identical sequences. Keep only the first instance, i.e.
+    # drop every defline that directly follows another defline. Deflines
+    # that stand alone are not touched.
+    isDefline <- grepl("^>", txt)
+    isRepeat  <- isDefline & c(FALSE, isDefline[-length(isDefline)])
+    txt <- txt[!isRepeat]
+
+    # After this preprocessing the following should be true:
+    # - Every alignment block begins with a defline in which the
+    #   first character is ">"
+    # - There is only one defline in each block.
+    # - Lines are not split.
+
+    # Make a dataframe of first and last indices of alignment blocks
+    x <- grep("^>", txt)
+    blocks <- data.frame(iFirst = x,
+                         iLast  = c((x[-1] - 1), length(txt)))
+
+    # Build the hits list by parsing the blocks
+    results$hits <- list()
+
+    for (i in seq_len(nrow(blocks))) {
+      thisBlock <- txt[blocks$iFirst[i]:blocks$iLast[i]]
+      results$hits[[i]] <- parseBLASTalignment(thisBlock)
+    }
+
+    return(results)
+}
+
+parseBLASTalignment <- function(hit) {
+  # Parse data from a character vector containing a BLAST hit
+  # Parameters:
+  #    hit  char   one BLAST hit as char vector: the defline is the first
+  #                element, the score lines and alignment rows follow
+  # Value:
+  #          list   $def          chr   defline
+  #                 $accession    chr   accession number
+  #                 $organism     chr   complete organism definition
+  #                 $species      chr   binomial species
+  #                 $E            num   E value
+  #                 $lengthAli    num   length of the alignment
+  #                 $nIdentities  num   number of identities
+  #                 $nGaps        num   number of gaps
+  #                 $Qbounds      num   2-element vector of query start-end
+  #                 $Sbounds      num   2-element vector of subject start-end
+  #                 $Qseq         chr   query sequence
+  #                 $midSeq       chr   midline string
+  #                 $Sseq         chr   subject sequence
+  #
+  #          Scalar items whose pattern is not found in the hit are NA.
+
+  getToken <- function(patt, v) {
+    # get the first token identified by pattern patt in character vector v;
+    # NA if no element of v matches
+    v <- v[grep(patt, v)]
+    if (length(v) > 1) { v <- v[1] }
+    if (length(v) == 0) { token <- NA
+    } else {
+      token <- regmatches(v, regexec(patt, v))[[1]][2] }
+    return(token)
+  }
+
+  h <- list()
+
+  # FASTA defline
+  h$def <- hit[1]
+
+  # accession number (ID), use the first if there are several, separated by "|"
+  patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
+  h$accession <-  regmatches(h$def, regexec(patt, h$def))[[1]][2]
+
+  # organism: the match is greedy on purpose, so that a nested name like
+  # "[[Candida] glabrata]" is captured as "[Candida] glabrata"
+  patt <- "\\[(.+)]"
+  h$organism <-  regmatches(h$def, regexec(patt, h$def))[[1]][2]
+
+  # species: first two words of the organism. A missing organism must give
+  # NA - splitting NA would return a length-one NA and produce "NA sp."
+  if (is.na(h$organism)) {
+    h$species <- NA_character_
+  } else {
+    x <- unlist(strsplit(h$organism, "\\s+"))
+    if (length(x) >= 2) {
+      h$species <- paste(x[1], x[2])
+    } else if (length(x) == 1) {
+      h$species <- paste(x[1], "sp.")
+    } else {
+      h$species <- NA_character_
+    }
+  }
+
+  # E-value
+  h$E <- as.numeric(getToken("Expect\\s*=(.+?), Method", hit))
+
+  # length of alignment
+  h$lengthAli <- as.numeric(getToken("^\\s*Length\\s*=(.+)$", hit))
+
+  # number of identities
+  h$nIdentities <- as.numeric(getToken("^\\s*Identities\\s*=(.+?)/", hit))
+
+  # number of gaps
+  h$nGaps <- as.numeric(getToken("\\s*Gaps\\s*=(.+?)/", hit))
+
+  # split up alignment section: each "Query" row is followed by its
+  # midline row and its "Sbjct" row
+  idx <- grep("^Query ", hit)
+  Que <- hit[idx]
+  Mid <- hit[idx + 1]
+  Sbj <- hit[idx + 2]
+
+  # first and last positions
+  h$Qbounds <- c(start = 0, end = 0)
+  h$Qbounds[1] <- as.numeric(getToken("^Query\\s*(\\d+)", Que[1]))
+  h$Qbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Que[length(Que)]))
+
+  h$Sbounds <- c(start = 0, end = 0)
+  h$Sbounds[1] <- as.numeric(getToken("^Sbjct\\s*(\\d+)", Sbj[1]))
+  h$Sbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Sbj[length(Sbj)]))
+
+  # aligned sequences: the columns occupied by the aligned string in the
+  # Query row are cut out of all three rows, since the midline row has no
+  # label or coordinates of its own to locate them by
+  patt <- "^\\s*Query\\s*\\d+\\s*([A-Za-z-]+)" # capture aligned string
+  for (i in seq_along(Que)) {
+    m <- regexec(patt, Que[i])
+    iFirst <- m[[1]][2]
+    iLast <- iFirst + attr(m[[1]], which = "match.length")[2] - 1
+    Que[i] <- substring(Que[i], iFirst, iLast)
+    Mid[i] <- substring(Mid[i], iFirst, iLast)
+    Sbj[i] <- substring(Sbj[i], iFirst, iLast)
+  }
+
+  h$Qseq   <- paste0(Que, collapse = "")
+  h$midSeq <- paste0(Mid, collapse = "")
+  h$Sseq   <- paste0(Sbj, collapse = "")
+
+  return(h)
+}
+
+
+# ==== TESTS ===================================================================
+
+if (FALSE) {
+  # This block is never run when the file is sourced; execute the lines by
+  # hand to try out BLAST().
+
+  # A query can be a sequence ...
+  q <- paste0("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain
+              "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
+              "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP")
+  # ... or an ID. This assignment replaces the sequence above.
+  q <- "NP_010227" # refseq ID
+
+  test <- BLAST(q,
+                nHits  = 100,
+                E      = 0.001,
+                rid    = "",
+                limits = "txid4751[ORGN]")  # Fungi
+  str(test)
+  length(test$hits)
+}
+
+# [END]
+
--- a/tests/test_biCode.R
+++ b/tests/test_biCode.R
@ -1,32 +1,32 @@
-# test_biCode.R
-#
-
-context("biCode() utility function tests")  # A set of tests for some
-                                            # functionality
-
-test_that("expected input is processed correctly", {  # Related expectations
-  expect_equal(biCode("homo sapiens"), "HOMSA")
-  expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA")
-  expect_equal(biCode(c("Phascolarctos cinereus", "Macropus rufus")),
-               c("PHACI", "MACRU"))
-})
-
-test_that("unexpected input is managed", {
-  expect_equal(biCode(""), ".....")
-  expect_equal(biCode(" "), ".....")
-  expect_equal(biCode("123 12"), ".....")
-  expect_equal(biCode("h sapiens"), "H..SA")
-})
-
-test_that("NA values are preserved", {
-  expect_true(is.na((biCode(NA))))
-  expect_equal(biCode(c("first", NA, "last")),
-               c("FIRST", NA, "LAST."))
-})
-
-test_that("Missing argument throws an error", {
-  expect_error(biCode(), "argument \"s\" is missing, with no default")
-})
-
-
-# [END]
+# test_biCode.R
+#
+
+context("biCode() utility function tests")  # label for this file's tests
+
+test_that("expected input is processed correctly", {
+  # single names, with or without brackets and a third word
+  expect_equal(biCode("homo sapiens"), "HOMSA")
+  expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA")
+
+  # a vector of names gives a vector of codes
+  marsupials <- c("Phascolarctos cinereus", "Macropus rufus")
+  expect_equal(biCode(marsupials), c("PHACI", "MACRU"))
+})
+
+test_that("unexpected input is managed", {
+  allDots <- "....."
+  expect_equal(biCode(""), allDots)
+  expect_equal(biCode(" "), allDots)
+  expect_equal(biCode("123 12"), allDots)
+  expect_equal(biCode("h sapiens"), "H..SA")
+})
+
+test_that("NA values are preserved", {
+  expect_true(is.na(biCode(NA)))
+
+  withNA <- c("first", NA, "last")
+  expect_equal(biCode(withNA), c("FIRST", NA, "LAST."))
+})
+
+test_that("Missing argument throws an error", {
+  expect_error(biCode(), "argument \"s\" is missing, with no default")
+})
+
+
+# [END]