Line termination change and old code.

2021-11-16 00:31:48 -05:00 · 2021-11-16 00:31:48 -05:00 · affe00f6fb
commit affe00f6fb
parent b1e00f52f7
86 changed files with 37873 additions and 37876 deletions
--- a/.Rprofile
+++ b/.Rprofile
@ -1,129 +1,129 @@
-# .Rprofile
+# .Rprofile
-#
+#
-# This script is automatically executed on startup
+# This script is automatically executed on startup
-# ==============================================================================
+# ==============================================================================
-
+
-init <- function() {
+init <- function() {
-
+
-  # Create a local copy of myScript.R if not done yet.
+  # Create a local copy of myScript.R if not done yet.
-  if (! file.exists("myScript.R") && file.exists(".tmp.R")) {
+  if (! file.exists("myScript.R") && file.exists(".tmp.R")) {
-    file.copy(".tmp.R", "myScript.R")
+    file.copy(".tmp.R", "myScript.R")
-    cat("A new file \"myScript.R\" was created. You can use it for\n")
+    cat("A new file \"myScript.R\" was created. You can use it for\n")
-    cat("notes and code experiments.\n\n")
+    cat("notes and code experiments.\n\n")
-  }
+  }
-
+
-  cat("\n\n")
+  cat("\n\n")
-  cat("Please open the file \".myProfile.R\" (click on the file-name in the\n")
+  cat("Please open the file \".myProfile.R\" (click on the file-name in the\n")
-  cat("\"files\" pane), edit it and save it.\n")
+  cat("\"files\" pane), edit it and save it.\n")
-  cat("Then click the checkbox, and use the More -> Move... dialogue\n")
+  cat("Then click the checkbox, and use the More -> Move... dialogue\n")
-  cat("to move it into the \"myScripts\" folder.\n\n")
+  cat("to move it into the \"myScripts\" folder.\n\n")
-
+
-  file.edit("ABC-units.R")
+  file.edit("ABC-units.R")
-  return(invisible(NULL))
+  return(invisible(NULL))
-}
+}
-
+
-if (! file.exists("./myScripts/.myProfile.R")) {
+if (! file.exists("./myScripts/.myProfile.R")) {
-  cat("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")
+  cat("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")
-  cat("    =================")
+  cat("    =================")
-  cat("\n\n")
+  cat("\n\n")
-  cat("        WELCOME !\n")
+  cat("        WELCOME !\n")
-  cat("\n")
+  cat("\n")
-  cat("  Type  'init()'  to begin\n\n")
+  cat("  Type  'init()'  to begin\n\n")
-  cat("\n")
+  cat("\n")
-  cat("    =================")
+  cat("    =================")
-  cat("\n\n")
+  cat("\n\n")
-
+
-} else {  # local profile exists ... validate state:
+} else {  # local profile exists ... validate state:
-  cat("\n\nLoading local functions ...")
+  cat("\n\nLoading local functions ...")
-
+
-  source(".utilities.R")  # local profile appears sane, source utilities
+  source(".utilities.R")  # local profile appears sane, source utilities
-  source("./myScripts/.myProfile.R")
+  source("./myScripts/.myProfile.R")
-
+
-  if (! exists("myEMail")) {  # ... has eMail been defined?
+  if (! exists("myEMail")) {  # ... has eMail been defined?
-    cat("ERROR !\n")
+    cat("ERROR !\n")
-    cat("=======\n")
+    cat("=======\n")
-    cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
+    cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
-    cat("the variable \"myEMail\" was not loaded.\n")
+    cat("the variable \"myEMail\" was not loaded.\n")
-    cat("Please contact your instructor to continue.\n\n")
+    cat("Please contact your instructor to continue.\n\n")
-  }
+  }
-  if (! exists("myStudentNumber")) {  # ... has the Student Number been defined?
+  if (! exists("myStudentNumber")) {  # ... has the Student Number been defined?
-    cat("ERROR !\n")
+    cat("ERROR !\n")
-    cat("=======\n")
+    cat("=======\n")
-    cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
+    cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
-    cat("the variable \"myStudentNumber\" was not loaded.\n")
+    cat("the variable \"myStudentNumber\" was not loaded.\n")
-    cat("Please contact your instructor to continue.\n\n")
+    cat("Please contact your instructor to continue.\n\n")
-  }
+  }
-  if (! grepl("^(100.{7})|(99.{7})$", as.character(myStudentNumber))) {
+  if (! grepl("^(100.{7})|(99.{7})$", as.character(myStudentNumber))) {
-    cat("ERROR !\n")                 # is the Student Number valid?
+    cat("ERROR !\n")                 # is the Student Number valid?
-    cat("=======\n")
+    cat("=======\n")
-    cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
+    cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
-    cat("your Student Number could not be validated.\n")
+    cat("your Student Number could not be validated.\n")
-    cat("Please examine the file \"./myScripts/.myProfile.R\"\n")
+    cat("Please examine the file \"./myScripts/.myProfile.R\"\n")
-    cat(" and fix the problem or contact your instructor to continue.\n\n")
+    cat(" and fix the problem or contact your instructor to continue.\n\n")
-  }
+  }
-
+
-
+
-  if (! exists("MYSPE")) {  # if MYSPE has not yet been defined, define it now
+  if (! exists("MYSPE")) {  # if MYSPE has not yet been defined, define it now
-                            # ... and write it into the profile.
+                            # ... and write it into the profile.
-       prf <- readLines("./myScripts/.myProfile.R")
+       prf <- readLines("./myScripts/.myProfile.R")
-       iEmail <- grep("^\\s*myStudentNumber\\s*<-", prf)
+       iEmail <- grep("^\\s*myStudentNumber\\s*<-", prf)
-       out <- prf[1:iEmail]
+       out <- prf[1:iEmail]
-       out <- c(out, sprintf("MYSPE <- \"%s\" ",
+       out <- c(out, sprintf("MYSPE <- \"%s\" ",
-                             getMYSPE(myStudentNumber)))
+                             getMYSPE(myStudentNumber)))
-       out <- c(out, prf[(iEmail+1):length(prf)])
+       out <- c(out, prf[(iEmail+1):length(prf)])
-       writeLines(out, "./myScripts/.myProfile.R")
+       writeLines(out, "./myScripts/.myProfile.R")
-
+
-       cat("\n")
+       cat("\n")
-       cat(sprintf("MYSPE (%s) was added to \"./myScripts/.myProfile.R\"\n\n",
+       cat(sprintf("MYSPE (%s) was added to \"./myScripts/.myProfile.R\"\n\n",
-                   getMYSPE(myStudentNumber)))
+                   getMYSPE(myStudentNumber)))
-       MYSPE <- getMYSPE(myStudentNumber)  # ... define it for immediate use
+       MYSPE <- getMYSPE(myStudentNumber)  # ... define it for immediate use
-       rm(prf, iEmail, out)                # cleanup
+       rm(prf, iEmail, out)                # cleanup
-  }
+  }
-  cat("... done.\n\n")
+  cat("... done.\n\n")
-}
+}
-
+
-if (default.stringsAsFactors()) {
+if (default.stringsAsFactors()) {
-  cat("WARNING.\n")
+  cat("WARNING.\n")
-  cat("========\n")
+  cat("========\n")
-  cat("Your default \"stringsAsFactors\" parameter is set to \"TRUE\".\n")
+  cat("Your default \"stringsAsFactors\" parameter is set to \"TRUE\".\n")
-  cat("This will break some of the code.\n")
+  cat("This will break some of the code.\n")
-  cat("Please contact your instructor to troubleshoot and fix this issue.\n")
+  cat("Please contact your instructor to troubleshoot and fix this issue.\n")
-  cat("\n")
+  cat("\n")
-}
+}
-
+
-errText <- list()
+errText <- list()
-errText[["noProfileFile"]] <- '
+errText[["noProfileFile"]] <- '
-Your PROFILE FILE does not exist. This problem must be fixed to continue.
+Your PROFILE FILE does not exist. This problem must be fixed to continue.
-
+
-  The code expects the file "./myScripts/.myProfile.R" to exist and to
+  The code expects the file "./myScripts/.myProfile.R" to exist and to
-  contain your correct eMail address and student number. Detailed
+  contain your correct eMail address and student number. Detailed
-  instructions were given when you first ran the init() command.
+  instructions were given when you first ran the init() command.
-
+
-  Try running init() again and follow the instructions. Reload youR RStudio
+  Try running init() again and follow the instructions. Reload youR RStudio
-  session and start over with this file.
+  session and start over with this file.
-
+
-  If this does not fix the problem, ask for help.
+  If this does not fix the problem, ask for help.
-'
+'
-
+
-errText[["noStudentNumber"]] <- '
+errText[["noStudentNumber"]] <- '
-Your STUDENT NUMBER has not been defined. This problem must be fixed to continue.
+Your STUDENT NUMBER has not been defined. This problem must be fixed to continue.
-
+
-  The code expects the file "./myScripts/.myProfile.R" to exist and to
+  The code expects the file "./myScripts/.myProfile.R" to exist and to
-  contain your correct eMail address and student number. This file gets
+  contain your correct eMail address and student number. This file gets
-  sourced when you start a new R-session, but since you see this error
+  sourced when you start a new R-session, but since you see this error
-  message there was a problem.
+  message there was a problem.
-
+
-  Perhaps you need to restart your R-session. Try closing the RStudio
+  Perhaps you need to restart your R-session. Try closing the RStudio
-  project and reopening it from the File > Recent Projects menu.
+  project and reopening it from the File > Recent Projects menu.
-
+
-  Perhaps there was a syntax error in your file. Then not all the
+  Perhaps there was a syntax error in your file. Then not all the
-  instructions in the file are executed. Check the file: is your
+  instructions in the file are executed. Check the file: is your
-  email perhpas not defined? Or did you type it without qwuoataion
+  email perhpas not defined? Or did you type it without qwuoataion
-  marks?
+  marks?
-
+
-  Try fixing problems, and then restart R as described above.
+  Try fixing problems, and then restart R as described above.
-
+
-  If none of this fixes the problem, ask for help.
+  If none of this fixes the problem, ask for help.
-'
+'
-
+
-# [END]
+# [END]
--- a/.gitignore
+++ b/.gitignore
@ -1,44 +1,44 @@
-# Miscellaneous
+# Miscellaneous
-.Ds_store
+.Ds_store
-instructor/
+instructor/
-dev/
+dev/
-# myScripts/ # We don't want to ignore this so we can save our work to our own fork.
+# myScripts/ # We don't want to ignore this so we can save our work to our own fork.
-
+
-# History files
+# History files
-.Rhistory
+.Rhistory
-.Rapp.history
+.Rapp.history
-
+
-# Session Data files
+# Session Data files
-# .RData
+# .RData
-
+
-# Files produced in assingments
+# Files produced in assignments
-data/APSESphyloSet.mfa
+data/APSESphyloSet.mfa
-data/APSEStreeRproml.rds
+data/APSEStreeRproml.rds
-
+
-# Example code in package build process
+# Example code in package build process
-*-Ex.R
+*-Ex.R
-
+
-# Output files from R CMD build
+# Output files from R CMD build
-/*.tar.gz
+/*.tar.gz
-
+
-# Output files from R CMD check
+# Output files from R CMD check
-/*.Rcheck/
+/*.Rcheck/
-
+
-# RStudio files
+# RStudio files
-.Rproj.user/
+.Rproj.user/
-
+
-# produced vignettes
+# produced vignettes
-vignettes/*.html
+vignettes/*.html
-vignettes/*.pdf
+vignettes/*.pdf
-
+
-# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
+# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
-.httr-oauth
+.httr-oauth
-
+
-# knitr and R markdown default cache directories
+# knitr and R markdown default cache directories
-/*_cache/
+/*_cache/
-/cache/
+/cache/
-
+
-# Temporary files created by R markdown
+# Temporary files created by R markdown
-*.utf8.md
+*.utf8.md
-*.knit.md
+*.knit.md
-.Rproj.user
+.Rproj.user
--- a/.tmp.R
+++ b/.tmp.R
@ -1,38 +1,38 @@
-# myScript.R
+# myScript.R
-#
+#
-# --- As you work with this file, you can delete the instructions below --------
+# --- As you work with this file, you can delete the instructions below --------
-# Write your notes and code experiments into this document. Save it
+# Write your notes and code experiments into this document. Save it
-# from time to time - however I recommend that you do not _commit_
+# from time to time - however I recommend that you do not _commit_
-# your saved version.
+# your saved version.
-#
+#
-# As long as you do not _commit_ this script to version control,
+# As long as you do not _commit_ this script to version control,
-# you can _pull_ updated versions of the entire project from GitHub
+# you can _pull_ updated versions of the entire project from GitHub
-# by using the RStudio version control interface. However, once
+# by using the RStudio version control interface. However, once
-# you _commit_ any file in your local version, RStudio will require
+# you _commit_ any file in your local version, RStudio will require
-# you to resolve conflicts before you can _pull_ updates.
+# you to resolve conflicts before you can _pull_ updates.
-# --- As you work with this file, you can delete the instructions above --------
+# --- As you work with this file, you can delete the instructions above --------
-#
+#
-## Purpose: <...>
+## Purpose: <...>
-#
+#
-# Version: <...>
+# Version: <...>
-#
+#
-# Date:    <...>
+# Date:    <...>
-# Author:  <Name> (<namee@mail.utoronto.ca>)
+# Author:  <Name> (<namee@mail.utoronto.ca>)
-#
+#
-# Versions:
+# Versions:
-#
+#
-#   <number>    <Features>
+#   <number>    <Features>
-#
+#
-# TODO:
+# TODO:
-#   <...>
+#   <...>
-#
+#
-# ====================================================================
+# ====================================================================
-
+
-
+
-
+
-
+
-
+
-
+
-
+
-# [END]
+# [END]
-
+
--- a/.utilities.R
+++ b/.utilities.R
--- a/2021-10-12_In-Class_exploration.R
+++ b/2021-10-12_In-Class_exploration.R
@ -1,257 +1,257 @@
-# 2021-10-12_In-Class_exploration.R
+# 2021-10-12_In-Class_exploration.R
-#
+#
-#         =====  T H E   E V E N   B E T T E R   A M I N O   A C I D =====
+#         =====  T H E   E V E N   B E T T E R   A M I N O   A C I D =====
-#
+#
-# Code and comments for BCH441 in-class exploration, Tuesday, 2021-10-12
+# Code and comments for BCH441 in-class exploration, Tuesday, 2021-10-12
-# Explorers:  Jocelyn Nurtanto, Yuzi Li, and  Jerry Gu
+# Explorers:  Jocelyn Nurtanto, Yuzi Li, and  Jerry Gu
-# Scribe:     boris.steipe@utoronto.ca
+# Scribe:     boris.steipe@utoronto.ca
-#
+#
-# ==============================================================================
+# ==============================================================================
-#
+#
-# In our last session we explored some properties of amino acids and noted that
+# In our last session we explored some properties of amino acids and noted that
-# we can arrange them in a scatter-plot according to some properties. But can
+# we can arrange them in a scatter-plot according to some properties. But can
-# we also arrange them according to generic properties, i.e. taking all
+# we also arrange them according to generic properties, i.e. taking all
-# published property scales into account? We will try to use all tables from
+# published property scales into account? We will try to use all tables from
-# the seqinr package.
+# the seqinr package.
-
+
-# First we load the package - this makes all datasets immediately available and
+# First we load the package - this makes all datasets immediately available and
-# we don't have to load them one by one.
+# we don't have to load them one by one.
-
+
-library(seqinr)
+library(seqinr)
-
+
-# Determine what datasets are available
+# Determine what datasets are available
-#
+#
-# Using "find in topic" ... "amino acid"
+# Using "find in topic" ... "amino acid"
-data(aacost)
+data(aacost)
-data(aaindex)
+data(aaindex)
-data(pK)
+data(pK)
-
+
-# We note that datasets may be sorted in different ways: for example
+# We note that datasets may be sorted in different ways: for example
-# alphabetically by one letter code (A, C, D, E, ...) or three-letter code (Ala,
+# alphabetically by one letter code (A, C, D, E, ...) or three-letter code (Ala,
-# Arg, Asn, Asp, ...) - this means we need to ensure and validate that amino
+# Arg, Asn, Asp, ...) - this means we need to ensure and validate that amino
-# acids are sorted in the same way.
+# acids are sorted in the same way.
-
+
-# Build a datastructure ...
+# Build a datastructure ...
-# rows: amino acids
+# rows: amino acids
-# columns: properties
+# columns: properties
-
+
-# Are all lists in aaindex organized in the same way?
+# Are all lists in aaindex organized in the same way?
-
+
-refNames <- names(aaindex[[1]]$I) # Take the rownames of the first list item
+refNames <- names(aaindex[[1]]$I) # Take the rownames of the first list item
-                                  # index as a reference list
+                                  # index as a reference list
-
+
-# Loop over each list in aaindex
+# Loop over each list in aaindex
-for (i in 1:length(aaindex)) {
+for (i in 1:length(aaindex)) {
-#   get the I-vector
+#   get the I-vector
-  x <- aaindex[[i]]$I
+  x <- aaindex[[i]]$I
-#   get the names
+#   get the names
-  x <- names(x)
+  x <- names(x)
-#   compare with the names of our reference list
+#   compare with the names of our reference list
-#   the == and != operators are vectorized. Applying them to two vectors
+#   the == and != operators are vectorized. Applying them to two vectors
-#   gives TRUE or FALSE for each pair of elements. any() or all() can be
+#   gives TRUE or FALSE for each pair of elements. any() or all() can be
-#   applied to logical vectors to anylise them and return a soingle result.
+#   applied to logical vectors to analyse them and return a single result.
-#   if (...) conditions evaluate only a single value and will throw a warning if
+#   if (...) conditions evaluate only a single value and will throw a warning if
-#   there is more than one.
+#   there is more than one.
-
+
-  if (any(x != refNames)) {
+  if (any(x != refNames)) {
-    # There was at least one not-equal pair - so: complain
+    # There was at least one not-equal pair - so: complain
-    print(sprintf("Problem in list %d: names don't match", i))
+    print(sprintf("Problem in list %d: names don't match", i))
-  }
+  }
-}
+}
-
+
-# If we get here without identifying problems, it means all pairs of
+# If we get here without identifying problems, it means all pairs of
-# rownames match throughout the aainfex list.
+# rownames match throughout the aaindex list.
-
+
-
+
-# Next: what is the cvorrect syntax to add one vector (the "I" vector of
+# Next: what is the correct syntax to add one vector (the "I" vector of
-# one of the list elements) to our dataframe?
+# one of the list elements) to our dataframe?
-aaData <- as.data.frame(aaindex[[1]]$I) # Make a dataframe from the first index
+aaData <- as.data.frame(aaindex[[1]]$I) # Make a dataframe from the first index
-aaData[,2] <- aaindex[[2]]$I            # ... add the secondf index
+aaData[,2] <- aaindex[[2]]$I            # ... add the second index
-
+
-str(aaData)  # Confirm: we now have a two-column dataframe
+str(aaData)  # Confirm: we now have a two-column dataframe
-
+
-# Next: add the rest ...
+# Next: add the rest ...
-for (i in 3:length(aaindex)) {
+for (i in 3:length(aaindex)) {
-  #   get the I-vector and write it into our dataframe
+  #   get the I-vector and write it into our dataframe
-  aaData[,i] <- aaindex[[i]]$I
+  aaData[,i] <- aaindex[[i]]$I
-}
+}
-
+
-# Sanity check
+# Sanity check
-plot(aaData[,37], aaData[,544])  # plot two arbitray inices against each other
+plot(aaData[,37], aaData[,544])  # plot two arbitrary indices against each other
-
+
-# Looks good.
+# Looks good.
-
+
-# We finished building our data structure ... but let's add the aacost table
+# We finished building our data structure ... but let's add the aacost table
-# aacost is ordered differently:
+# aacost is ordered differently:
-rownames(aaData)
+rownames(aaData)
-aacost[ , 1]
+aacost[ , 1]
-
+
-# using order(), applied to aacost - ordering the column with column-name
+# using order(), applied to aacost - ordering the column with column-name
-# "aaa"
+# "aaa"
-sel <- order(aacost[ , "aaa"])  # alphebetic ordering of three-letter codes
+sel <- order(aacost[ , "aaa"])  # alphabetic ordering of three-letter codes
-aacost[sel, "aaa"] # applying the order vector sorts the column
+aacost[sel, "aaa"] # applying the order vector sorts the column
-
+
-# Is this the same order as refNames?
+# Is this the same order as refNames?
-refNames == aacost[sel, "aaa"]  # Yes!
+refNames == aacost[sel, "aaa"]  # Yes!
-
+
-# add the data from column "tot" (i.e. total metabolic cost) after the
+# add the data from column "tot" (i.e. total metabolic cost) after the
-# last column of aaData
+# last column of aaData
-aaData[ , length(aaindex) + 1] <- aacost[sel, "tot"]
+aaData[ , length(aaindex) + 1] <- aacost[sel, "tot"]
-
+
-# Done.
+# Done.
-str(aaData)  # A dataframe with 20 rows and 545 columns
+str(aaData)  # A dataframe with 20 rows and 545 columns
-
+
-# To answer the question "Which amino acids are similar to each other?" we
+# To answer the question "Which amino acids are similar to each other?" we
-# need to reduce this 545-dimensional dataset to fewer dimensions, otherwise
+# need to reduce this 545-dimensional dataset to fewer dimensions, otherwise
-# we will succumb to the "Curse of Dimensionality":
+# we will succumb to the "Curse of Dimensionality":
-#
+#
-#    "in high dimensional data, however, all objects appear
+#    "in high dimensional data, however, all objects appear
-#     to be sparse and dissimilar in many ways..."
+#     to be sparse and dissimilar in many ways..."
-#                   https://en.wikipedia.org/wiki/Curse_of_dimensionality
+#                   https://en.wikipedia.org/wiki/Curse_of_dimensionality
-#
+#
-# A classic way to do this is Principal Component Analysis (PCA) ...
+# A classic way to do this is Principal Component Analysis (PCA) ...
-# (Principal components analysis)
+# (Principal components analysis)
-#
+#
-# PCA expects objects in columns, properties in rows. Therefore we need to
+# PCA expects objects in columns, properties in rows. Therefore we need to
-# transpose our dataset:
+# transpose our dataset:
-
+
-aaPCA <- prcomp(t(aaData))
+aaPCA <- prcomp(t(aaData))
-
+
-# This creates an error, because some of our indicews contain NA values!
+# This creates an error, because some of our indices contain NA values!
-# Which indices are this?
+# Which indices are these?
-
+
-# We create a vector "sel" for which we check whether any element in each
+# We create a vector "sel" for which we check whether any element in each
-# column is NA, and write FALSE if we encounter an NA, TRUE otherwise. We can
+# column is NA, and write FALSE if we encounter an NA, TRUE otherwise. We can
-# then use this vector to subset ourt dataframe.
+# then use this vector to subset our dataframe.
-
+
-sel <- logical()
+sel <- logical()
-
+
-for (i in 1:ncol(aaData)) {         # for each index
+for (i in 1:ncol(aaData)) {         # for each index
-  if (any(is.na(aaData[,i]))) {     #   if there is any NA value ...
+  if (any(is.na(aaData[,i]))) {     #   if there is any NA value ...
-    sel <- c(sel, FALSE)            #     add a FALSE element to the vector
+    sel <- c(sel, FALSE)            #     add a FALSE element to the vector
-  } else {                          #   else
+  } else {                          #   else
-    sel <- c(sel, TRUE)             #     add a TRUE element
+    sel <- c(sel, TRUE)             #     add a TRUE element
-  }
+  }
-}
+}
-
+
-# Done. sel now subsets only the NA-free columns
+# Done. sel now subsets only the NA-free columns
-545 - sum(sel)                      # 13 columns excluded
+545 - sum(sel)                      # 13 columns excluded
-
+
-# Do the PCA ... use the prcomp() function
+# Do the PCA ... use the prcomp() function
-aaPCA <- prcomp(t(aaData[ ,sel]))   # PCA of the transposed, selected data set
+aaPCA <- prcomp(t(aaData[ ,sel]))   # PCA of the transposed, selected data set
-
+
-str(aaPCA)   # structure of the result
+str(aaPCA)   # structure of the result
-
+
-plot(aaPCA)                         # plot the contributions of the
+plot(aaPCA)                         # plot the contributions of the
-                                    # components to the variance
+                                    # components to the variance
-
+
-plot(aaPCA$rotation[ , 1],          # plot the first PC against the second PC
+plot(aaPCA$rotation[ , 1],          # plot the first PC against the second PC
-     aaPCA$rotation[ , 2],          # in a scatterplot, in an empty frame
+     aaPCA$rotation[ , 2],          # in a scatterplot, in an empty frame
-     type ="n")                     # just to set up the coordinate system
+     type ="n")                     # just to set up the coordinate system
-
+
-text(aaPCA$rotation[ , 1],          # plot the names of the amino acids into
+text(aaPCA$rotation[ , 1],          # plot the names of the amino acids into
-     aaPCA$rotation[ , 2],          # their respective (PC1, PC2) positions
+     aaPCA$rotation[ , 2],          # their respective (PC1, PC2) positions
-     labels = rownames(aaPCA$rotation))
+     labels = rownames(aaPCA$rotation))
-
+
-# PCA results are sensitive to the absolute numeric value of the features that
+# PCA results are sensitive to the absolute numeric value of the features that
-# we are comparing. The prcomp() function has an option scale. = TRUE that
+# we are comparing. The prcomp() function has an option scale. = TRUE that
-# scales each row of features so that the variance of the value is 1.0  This
+# scales each row of features so that the variance of the value is 1.0  This
-# ensures that each feature is given approximately equal weight
+# ensures that each feature is given approximately equal weight
-
+
-aaPCA <- prcomp(t(aaData[ ,sel]), scale. = TRUE)
+aaPCA <- prcomp(t(aaData[ ,sel]), scale. = TRUE)
-
+
-plot(aaPCA)
+plot(aaPCA)
-
+
-plot(aaPCA$rotation[ , 1],
+plot(aaPCA$rotation[ , 1],
-     aaPCA$rotation[ , 2],
+     aaPCA$rotation[ , 2],
-     type ="n")
+     type ="n")
-text(aaPCA$rotation[ , 1],
+text(aaPCA$rotation[ , 1],
-     aaPCA$rotation[ , 2],
+     aaPCA$rotation[ , 2],
-     labels = rownames(aaPCA$rotation))
+     labels = rownames(aaPCA$rotation))
-
+
-
+
-# Next we try to identify what the PCs correspond to. We see whether there are
+# Next we try to identify what the PCs correspond to. We see whether there are
-# specific features that are highly correlated with the PCs
+# specific features that are highly correlated with the PCs
-
+
-# ==== Rotation 1 ===================
+# ==== Rotation 1 ===================
-#
+#
-
+
-(PC1 <- aaPCA$rotation[ , 1])  # Assign PC1
+(PC1 <- aaPCA$rotation[ , 1])  # Assign PC1
-
+
-# The function cor() calculates Pearson coefficients of correlation
+# The function cor() calculates Pearson coefficients of correlation
-cor(PC1, aaData[ , 37]) # e.g. correlate PC1 against index 37
+cor(PC1, aaData[ , 37]) # e.g. correlate PC1 against index 37
-
+
-
+
-# Iterate over all columns and calculate correlations
+# Iterate over all columns and calculate correlations
-cors <- numeric()
+cors <- numeric()
-
+
-for (i in 1:ncol(aaData)) {
+for (i in 1:ncol(aaData)) {
-  cors[i] <- cor(PC1, aaData[ , i])
+  cors[i] <- cor(PC1, aaData[ , i])
-}
+}
-
+
-summary(cors)
+summary(cors)
-#    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's
+#    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's
-# -0.54072 -0.13703  0.05654  0.03729  0.21349  0.59589       13
+# -0.54072 -0.13703  0.05654  0.03729  0.21349  0.59589       13
-#
+#
-#  The max correlation is ~0.6. That is not very high. Which ijndex is it?
+#  The max correlation is ~0.6. That is not very high. Which index is it?
-
+
-which(cors == max(cors, na.rm = TRUE))
+which(cors == max(cors, na.rm = TRUE))
-
+
-aaindex[[504]]   # Linker propensity ???
+aaindex[[504]]   # Linker propensity ???
-
+
-cor(PC1, aaindex[[504]]$I) # Did we get the right index?
+cor(PC1, aaindex[[504]]$I) # Did we get the right index?
-
+
-# Plot this ...
+# Plot this ...
-plot(aaPCA$rotation[ , 1],
+plot(aaPCA$rotation[ , 1],
-     aaindex[[504]]$I,
+     aaindex[[504]]$I,
-     type ="n")
+     type ="n")
-text(aaPCA$rotation[ , 1],
+text(aaPCA$rotation[ , 1],
-     aaindex[[504]]$I,
+     aaindex[[504]]$I,
-     labels = rownames(aaPCA$rotation))
+     labels = rownames(aaPCA$rotation))
-
+
-# This is essentially a random correlation but for Cysteine ...
+# This is essentially a random correlation but for Cysteine ...
-
+
-
+
-# ==== Rotation 2 ===================
+# ==== Rotation 2 ===================
-#
+#
-# same process
+# same process
-PC2 <- aaPCA$rotation[ , 2]
+PC2 <- aaPCA$rotation[ , 2]
-
+
-cors2 <- numeric()
+cors2 <- numeric()
-
+
-for (i in 1:ncol(aaData)) {
+for (i in 1:ncol(aaData)) {
-  cors2[i] <- cor(PC2, aaData[ , i])
+  cors2[i] <- cor(PC2, aaData[ , i])
-}
+}
-
+
-summary(cors2)
+summary(cors2)
-#     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's
+#     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's
-# -0.95214 -0.56067 -0.12817 -0.05787  0.43046  0.94346       13
+# -0.95214 -0.56067 -0.12817 -0.05787  0.43046  0.94346       13
-
+
-# Here we have quite strong correlations
+# Here we have quite strong correlations
-
+
-which(cors2 == max(cors2, na.rm = TRUE))
+which(cors2 == max(cors2, na.rm = TRUE))
-
+
-aaindex[[148]]
+aaindex[[148]]
-
+
-# this index itself is correlated with many other indices
+# this index itself is correlated with many other indices
-
+
-cor(PC2, aaindex[[148]]$I)   # confirmn that we have the right index
+cor(PC2, aaindex[[148]]$I)   # confirm that we have the right index
-
+
-# Plot this too...
+# Plot this too...
-plot(aaPCA$rotation[ , 2],
+plot(aaPCA$rotation[ , 2],
-     aaindex[[148]]$I,
+     aaindex[[148]]$I,
-     type ="n")
+     type ="n")
-text(aaPCA$rotation[ , 2],
+text(aaPCA$rotation[ , 2],
-     aaindex[[148]]$I,
+     aaindex[[148]]$I,
-     labels = rownames(aaPCA$rotation))
+     labels = rownames(aaPCA$rotation))
-
+
-# This correlates well with hydrophobicity measures. In this case the
+# This correlates well with hydrophobicity measures. In this case the
-# PC is to a certain degree interpretable - but this is not always the case
+# PC is to a certain degree interpretable - but this is not always the case
-# with PCA (see the example of the first PC).
+# with PCA (see the example of the first PC).
-
+
-
+
-
+
-
+
-
+
-
+
-# [END]
+# [END]
--- a/ABC-Install_all_packages.R
+++ b/ABC-Install_all_packages.R
@ -1,161 +1,161 @@
-# tocID <- "ABC-Install_all_packages.R"
+# tocID <- "ABC-Install_all_packages.R"
-#
+#
-# Purpose:  A Bioinformatics Course:
+# Purpose:  A Bioinformatics Course:
-#              Installing all packages in this course
+#              Installing all packages in this course
-#
+#
-# Version:  1.0
+# Version:  1.0
-#
+#
-# Date:     2021  10
+# Date:     2021  10
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions:
+# Versions:
-#           1.0    New code
+#           1.0    New code
-#
+#
-#
+#
-# TODO:
+# TODO:
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC>
+#TOC>
-#TOC>   Section  Title                          Line
+#TOC>   Section  Title                          Line
-#TOC> ----------------------------------------------
+#TOC> ----------------------------------------------
-#TOC>   1        Packages                         33
+#TOC>   1        Packages                         33
-#TOC>   2        CRAN packages                    98
+#TOC>   2        CRAN packages                    98
-#TOC>   3        Bioconductor packages           127
+#TOC>   3        Bioconductor packages           127
-#TOC>   4        Other package sources           142
+#TOC>   4        Other package sources           142
-#TOC>   5        Updating packages               148
+#TOC>   5        Updating packages               148
-#TOC>
+#TOC>
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  Packages  ============================================================
+# =    1  Packages  ============================================================
-
+
-# Much of R's functionality is contributed in packages: bundles of R scripts
+# Much of R's functionality is contributed in packages: bundles of R scripts
-# or code in other languages, pre-configured objects, and datasets. Making this
+# or code in other languages, pre-configured objects, and datasets. Making this
-# functionality available is often done by issuing a library(<package-name>)
+# functionality available is often done by issuing a library(<package-name>)
-# command, however this is not the preferred way, since it may override other
+# command, however this is not the preferred way, since it may override other
-# R functions and it makes it harder to understand where the source code of
+# R functions and it makes it harder to understand where the source code of
-# a particular function is located. In this course we call the function name
+# a particular function is located. In this course we call the function name
-# prefixed with the package name and two colons:
+# prefixed with the package name and two colons:
-#   <package-name>::<function-name>()
+#   <package-name>::<function-name>()
-# This is the preferred way, since it is explicit.
+# This is the preferred way, since it is explicit.
-#
+#
-# Regardless of which idiom one uses to call the actual function, the package
+# Regardless of which idiom one uses to call the actual function, the package
-#  needs to be "installed" first, i.e. the code must have been downloaded
+#  needs to be "installed" first, i.e. the code must have been downloaded
-# from CRAN, or using the BiocManager::install() function.
+# from CRAN, or using the BiocManager::install() function.
-#
+#
-# This script contains download commands for all packages that are used in the
+# This script contains download commands for all packages that are used in the
-# course. You can execute the script line by line (or even source the entire
+# course. You can execute the script line by line (or even source the entire
-# script) to make sure all packages can be installed on your computer. Just
+# script) to make sure all packages can be installed on your computer. Just
-# one reminder: if you are ever asked to install from source, the correct
+# one reminder: if you are ever asked to install from source, the correct
-# answer is usually "no" - except if you really know what you are doing and why.
+# answer is usually "no" - except if you really know what you are doing and why.
-#
+#
-# Once packages are installed you can get additional information about
+# Once packages are installed you can get additional information about
-# the contents of a package with the commands:
+# the contents of a package with the commands:
-#  library(help=<package-name>)       # basic information
+#  library(help=<package-name>)       # basic information
-#  browseVignettes("<package-name>")  # available vignettes
+#  browseVignettes("<package-name>")  # available vignettes
-#  data(package = "<package-name>")   # available datasets
+#  data(package = "<package-name>")   # available datasets
-#
+#
-#  ... and you can load data sets with:
+#  ... and you can load data sets with:
-#  data(<data-set-name>, package = "<package-name>")
+#  data(<data-set-name>, package = "<package-name>")
-#
+#
-#  All packages here are installed only when they have not been installed
+#  All packages here are installed only when they have not been installed
-#  before, using the following idiom:
+#  before, using the following idiom:
-#
+#
-#     if (! requireNamespace("<package-name>", quietly=TRUE)) {
+#     if (! requireNamespace("<package-name>", quietly=TRUE)) {
-#       install.packages("<package-name>")
+#       install.packages("<package-name>")
-#     }
+#     }
-#
+#
-#  ... or its BiocManager::install() equivalent:
+#  ... or its BiocManager::install() equivalent:
-#
+#
-# if (! requireNamespace("<bioconductor-package-name>", quietly=TRUE)) {
+# if (! requireNamespace("<bioconductor-package-name>", quietly=TRUE)) {
-#   BiocManager::install("<bioconductor-package-name>")
+#   BiocManager::install("<bioconductor-package-name>")
-# }
+# }
-#
+#
-#  If you want to _force_ a re-installation of the package, simply issue
+#  If you want to _force_ a re-installation of the package, simply issue
-#  the install.packages("<package-name>") command on its own. For compactness
+#  the install.packages("<package-name>") command on its own. For compactness
-#  we wrap the idiom into a function, which can also switch between CRAN
+#  we wrap the idiom into a function, which can also switch between CRAN
-#  and BIOconductor sources:
+#  and BIOconductor sources:
-
+
-installIfNeeded <- function(package, s = "CRAN") {
+installIfNeeded <- function(package, s = "CRAN") {
-  # s: "CRAN" or "BIO"
+  # s: "CRAN" or "BIO"
-  if (s == "CRAN") {
+  if (s == "CRAN") {
-    if (! requireNamespace(package, quietly=TRUE)) {
+    if (! requireNamespace(package, quietly=TRUE)) {
-      install.packages(package)
+      install.packages(package)
-    }
+    }
-  } else if (s == "BIO") {
+  } else if (s == "BIO") {
-    if (! requireNamespace("BiocManager", quietly=TRUE)) {
+    if (! requireNamespace("BiocManager", quietly=TRUE)) {
-      install.packages("BiocManager")
+      install.packages("BiocManager")
-    }
+    }
-    if (! requireNamespace(package, quietly=TRUE)) {
+    if (! requireNamespace(package, quietly=TRUE)) {
-      BiocManager::install(package)
+      BiocManager::install(package)
-    }
+    }
-  } else {
+  } else {
-    stop(sprintf("Unknown source \"%s\".", s))
+    stop(sprintf("Unknown source \"%s\".", s))
-  }
+  }
-}
+}
-
+
-
+
-# =    2  CRAN packages  =======================================================
+# =    2  CRAN packages  =======================================================
-
+
-installIfNeeded("ape")
+installIfNeeded("ape")
-installIfNeeded("BiocManager")
+installIfNeeded("BiocManager")
-installIfNeeded("bio3d")
+installIfNeeded("bio3d")
-installIfNeeded("evd")
+installIfNeeded("evd")
-installIfNeeded("ggseqlogo")
+installIfNeeded("ggseqlogo")
-installIfNeeded("ggtern")
+installIfNeeded("ggtern")
-installIfNeeded("hexbin")
+installIfNeeded("hexbin")
-installIfNeeded("httr")
+installIfNeeded("httr")
-installIfNeeded("igraph")
+installIfNeeded("igraph")
-installIfNeeded("jsonlite")
+installIfNeeded("jsonlite")
-installIfNeeded("magrittr")
+installIfNeeded("magrittr")
-installIfNeeded("MASS")
+installIfNeeded("MASS")
-installIfNeeded("microbenchmark")
+installIfNeeded("microbenchmark")
-installIfNeeded("phangorn")
+installIfNeeded("phangorn")
-installIfNeeded("plotly")
+installIfNeeded("plotly")
-installIfNeeded("plotrix")
+installIfNeeded("plotrix")
-installIfNeeded("profvis")
+installIfNeeded("profvis")
-installIfNeeded("robustbase")
+installIfNeeded("robustbase")
-installIfNeeded("RColorBrewer")
+installIfNeeded("RColorBrewer")
-installIfNeeded("Rphylip")
+installIfNeeded("Rphylip")
-installIfNeeded("rvest")
+installIfNeeded("rvest")
-installIfNeeded("seqinr")
+installIfNeeded("seqinr")
-installIfNeeded("stringi")
+installIfNeeded("stringi")
-installIfNeeded("taxize")
+installIfNeeded("taxize")
-installIfNeeded("testthat")
+installIfNeeded("testthat")
-installIfNeeded("xml2")
+installIfNeeded("xml2")
-
+
-# =    3  Bioconductor packages  ===============================================
+# =    3  Bioconductor packages  ===============================================
-
+
-installIfNeeded("Biobase",       s = "BIO")
+installIfNeeded("Biobase",       s = "BIO")
-installIfNeeded("biomaRt",       s = "BIO")
+installIfNeeded("biomaRt",       s = "BIO")
-installIfNeeded("Biostrings",    s = "BIO")
+installIfNeeded("Biostrings",    s = "BIO")
-installIfNeeded("DECIPHER",      s = "BIO")
+installIfNeeded("DECIPHER",      s = "BIO")
-installIfNeeded("GEOquery",      s = "BIO")
+installIfNeeded("GEOquery",      s = "BIO")
-installIfNeeded("GOSim",         s = "BIO")
+installIfNeeded("GOSim",         s = "BIO")
-installIfNeeded("limma",         s = "BIO")
+installIfNeeded("limma",         s = "BIO")
-installIfNeeded("msa",           s = "BIO")
+installIfNeeded("msa",           s = "BIO")
-installIfNeeded("org.Sc.sgd.db", s = "BIO")
+installIfNeeded("org.Sc.sgd.db", s = "BIO")
-installIfNeeded("prada",         s = "BIO")
+installIfNeeded("prada",         s = "BIO")
-installIfNeeded("topGO",         s = "BIO")
+installIfNeeded("topGO",         s = "BIO")
-
+
-
+
-# =    4  Other package sources  ===============================================
+# =    4  Other package sources  ===============================================
-
+
-# Using sources other than CRAN or Bioconductor to download general-purpose
+# Using sources other than CRAN or Bioconductor to download general-purpose
-# programs that run on your computer is not generally recommended.
+# programs that run on your computer is not generally recommended.
-
+
-
+
-# =    5  Updating packages  ===================================================
+# =    5  Updating packages  ===================================================
-
+
-# From time to time, update CRAN packages with the following command ...
+# From time to time, update CRAN packages with the following command ...
-
+
-update.packages()
+update.packages()
-
+
-# ... and also update Bioconductor packages as follows:
+# ... and also update Bioconductor packages as follows:
-
+
-BiocManager::install()
+BiocManager::install()
-
+
-# [END]
+# [END]
--- a/ABC-addSACCE_APSESproteins.R
+++ b/ABC-addSACCE_APSESproteins.R
@ -1,100 +1,100 @@
-# addSACCE_APSESproteins.R
+# addSACCE_APSESproteins.R
-# Adds the Saccharomyces cerevisiae APSES proteins to myDB
+# Adds the Saccharomyces cerevisiae APSES proteins to myDB
-#
+#
-
+
-myDB$protein <-
+myDB$protein <-
-    rbind(myDB$protein,
+    rbind(myDB$protein,
-          data.frame(
+          data.frame(
-              ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
+              ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
-              name = "SWI4_SACCE",
+              name = "SWI4_SACCE",
-              RefSeqID = "NP_011036",
+              RefSeqID = "NP_011036",
-              UniProtID = "P25302",
+              UniProtID = "P25302",
-              taxonomy.ID = as.integer(4932),
+              taxonomy.ID = as.integer(4932),
-              sequence = dbSanitizeSequence("
+              sequence = dbSanitizeSequence("
-        1 mpfdvlisnq kdntnhqnit pisksvllap hsnhpvieia tysetdvyec yirgfetkiv
+        1 mpfdvlisnq kdntnhqnit pisksvllap hsnhpvieia tysetdvyec yirgfetkiv
-       61 mrrtkddwin itqvfkiaqf sktkrtkile kesndmqhek vqggygrfqg twipldsakf
+       61 mrrtkddwin itqvfkiaqf sktkrtkile kesndmqhek vqggygrfqg twipldsakf
-       121 lvnkyeiidp vvnsiltfqf dpnnpppkrs knsilrktsp gtkitspssy nktprkknss
+       121 lvnkyeiidp vvnsiltfqf dpnnpppkrs knsilrktsp gtkitspssy nktprkknss
-       181 sstsatttaa nkkgkknasi nqpnpsplqn lvfqtpqqfq vnssmnimnn ndnhttmnfn
+       181 sstsatttaa nkkgkknasi nqpnpsplqn lvfqtpqqfq vnssmnimnn ndnhttmnfn
-       241 ndtrhnlinn isnnsnqsti iqqqksihen sfnnnysatq kplqffpipt nlqnknvaln
+       241 ndtrhnlinn isnnsnqsti iqqqksihen sfnnnysatq kplqffpipt nlqnknvaln
-       301 npnnndsnsy shnidnvins snnnnngnnn nliivpdgpm qsqqqqqhhh eyltnnfnhs
+       301 npnnndsnsy shnidnvins snnnnngnnn nliivpdgpm qsqqqqqhhh eyltnnfnhs
-       361 mmdsitngns kkrrkklnqs neqqfynqqe kiqrhfklmk qpllwqsfqn pndhhneycd
+       361 mmdsitngns kkrrkklnqs neqqfynqqe kiqrhfklmk qpllwqsfqn pndhhneycd
-       421 sngsnnnnnt vasngssiev fssnendnsm nmssrsmtpf sagntssqnk lenkmtdqey
+       421 sngsnnnnnt vasngssiev fssnendnsm nmssrsmtpf sagntssqnk lenkmtdqey
-       481 kqtiltilss erssdvdqal latlypapkn fninfeiddq ghtplhwata maniplikml
+       481 kqtiltilss erssdvdqal latlypapkn fninfeiddq ghtplhwata maniplikml
-       541 itlnanalqc nklgfncitk sifynncyke nafdeiisil kiclitpdvn grlpfhylie
+       541 itlnanalqc nklgfncitk sifynncyke nafdeiisil kiclitpdvn grlpfhylie
-       601 lsvnksknpm iiksymdsii lslgqqdynl lkiclnyqdn igntplhlsa lnlnfevynr
+       601 lsvnksknpm iiksymdsii lslgqqdynl lkiclnyqdn igntplhlsa lnlnfevynr
-       661 lvylgastdi lnldnespas imnkfntpag gsnsrnnntk adrklarnlp qknyyqqqqq
+       661 lvylgastdi lnldnespas imnkfntpag gsnsrnnntk adrklarnlp qknyyqqqqq
-       721 qqqpqnnvki pkiiktqhpd kedstadvni aktdsevnes qylhsnqpns tnmntimedl
+       721 qqqpqnnvki pkiiktqhpd kedstadvni aktdsevnes qylhsnqpns tnmntimedl
-       781 sninsfvtss vikdikstps kilenspily rrrsqsisde kekakdnenq vekkkdplns
+       781 sninsfvtss vikdikstps kilenspily rrrsqsisde kekakdnenq vekkkdplns
-       841 vktampsles pssllpiqms plgkyskpls qqinklntkv sslqrimgee iknldnevve
+       841 vktampsles pssllpiqms plgkyskpls qqinklntkv sslqrimgee iknldnevve
-       901 tessisnnkk rlitiahqie dafdsvsnkt pinsisdlqs riketsskln sekqnfiqsl
+       901 tessisnnkk rlitiahqie dafdsvsnkt pinsisdlqs riketsskln sekqnfiqsl
-       961 eksqalklat ivqdeeskvd mntnssshpe kqedeepipk stsetsspkn tkadakfsnt
+       961 eksqalklat ivqdeeskvd mntnssshpe kqedeepipk stsetsspkn tkadakfsnt
-       1021 vqesydvnet lrlateltil qfkrrmttlk iseakskins svkldkyrnl igitienids
+       1021 vqesydvnet lrlateltil qfkrrmttlk iseakskins svkldkyrnl igitienids
-       1081 klddiekdlr ana"),
+       1081 klddiekdlr ana"),
-              stringsAsFactors = FALSE))
+              stringsAsFactors = FALSE))
-
+
-myDB$protein <-
+myDB$protein <-
-    rbind(myDB$protein,
+    rbind(myDB$protein,
-          data.frame(
+          data.frame(
-              ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
+              ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
-              name = "PHD1_SACCE",
+              name = "PHD1_SACCE",
-              RefSeqID = "NP_012881",
+              RefSeqID = "NP_012881",
-              UniProtID = "P36093",
+              UniProtID = "P36093",
-              taxonomy.ID = as.integer(4932),
+              taxonomy.ID = as.integer(4932),
-              sequence = dbSanitizeSequence("
+              sequence = dbSanitizeSequence("
-        1 myhvpemrlh yplvntqsna aitptrsydn tlpsfnelsh qstinlpfvq retpnayanv
+        1 myhvpemrlh yplvntqsna aitptrsydn tlpsfnelsh qstinlpfvq retpnayanv
-       61 aqlatsptqa ksgyycryya vpfptypqqp qspyqqavlp yatipnsnfq pssfpvmavm
+       61 aqlatsptqa ksgyycryya vpfptypqqp qspyqqavlp yatipnsnfq pssfpvmavm
-      121 ppevqfdgsf lntlhphtel ppiiqntndt svarpnnlks iaaasptvta ttrtpgvsst
+      121 ppevqfdgsf lntlhphtel ppiiqntndt svarpnnlks iaaasptvta ttrtpgvsst
-      181 svlkprvitt mwedenticy qveangisvv rradnnming tkllnvtkmt rgrrdgilrs
+      181 svlkprvitt mwedenticy qveangisvv rradnnming tkllnvtkmt rgrrdgilrs
-      241 ekvrevvkig smhlkgvwip ferayilaqr eqildhlypl fvkdiesivd arkpsnkasl
+      241 ekvrevvkig smhlkgvwip ferayilaqr eqildhlypl fvkdiesivd arkpsnkasl
-      301 tpksspapik qepsdnkhei ateikpksid alsngastqg agelphlkin hidteaqtsr
+      301 tpksspapik qepsdnkhei ateikpksid alsngastqg agelphlkin hidteaqtsr
-      361 aknels"),
+      361 aknels"),
-              stringsAsFactors = FALSE))
+              stringsAsFactors = FALSE))
-
+
-myDB$protein <-
+myDB$protein <-
-    rbind(myDB$protein,
+    rbind(myDB$protein,
-          data.frame(
+          data.frame(
-              ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
+              ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
-              name = "SOK2_SACCE",
+              name = "SOK2_SACCE",
-              RefSeqID = "NP_013729",
+              RefSeqID = "NP_013729",
-              UniProtID = "P53438",
+              UniProtID = "P53438",
-              taxonomy.ID = as.integer(4932),
+              taxonomy.ID = as.integer(4932),
-              sequence = dbSanitizeSequence("
+              sequence = dbSanitizeSequence("
-        1 mpignpintn diksnrmrqe snmsavsnse stigqstqqq qqqqqylgqs vqplmpvsyq
+        1 mpignpintn diksnrmrqe snmsavsnse stigqstqqq qqqqqylgqs vqplmpvsyq
-       61 yvvpeqwpyp qyyqqpqsqs qqqlqsqpqm yqvqesfqss gsdsnasnpp stsvgvpsna
+       61 yvvpeqwpyp qyyqqpqsqs qqqlqsqpqm yqvqesfqss gsdsnasnpp stsvgvpsna
-      121 tatalpngsa ittkksnnst nisnnvpyyy yfpqmqaqqs maysypqayy yypangdgtt
+      121 tatalpngsa ittkksnnst nisnnvpyyy yfpqmqaqqs maysypqayy yypangdgtt
-      181 ngatpsvtsn qvqnpnlekt ystfeqqqqh qqqqqlqaqt ypaqppkign afskfsksgp
+      181 ngatpsvtsn qvqnpnlekt ystfeqqqqh qqqqqlqaqt ypaqppkign afskfsksgp
-      241 psdsssgsms pnsnrtsrns nsisslaqqp pmsnypqpst yqypgfhkts sipnshspip
+      241 psdsssgsms pnsnrtsrns nsisslaqqp pmsnypqpst yqypgfhkts sipnshspip
-      301 prslttptqg ptsqngplsy nlpqvgllpp qqqqqvsply dgnsitppvk pstdqetylt
+      301 prslttptqg ptsqngplsy nlpqvgllpp qqqqqvsply dgnsitppvk pstdqetylt
-      361 anrhgvsdqq ydsmaktmns fqtttirhpm pliattnatg sntsgtsasi irprvtttmw
+      361 anrhgvsdqq ydsmaktmns fqtttirhpm pliattnatg sntsgtsasi irprvtttmw
-      421 edektlcyqv eangisvvrr adndmvngtk llnvtkmtrg rrdgilkaek irhvvkigsm
+      421 edektlcyqv eangisvvrr adndmvngtk llnvtkmtrg rrdgilkaek irhvvkigsm
-      481 hlkgvwipfe ralaiaqrek iadylyplfi rdiqsvlkqn npsndsssss sstgiksisp
+      481 hlkgvwipfe ralaiaqrek iadylyplfi rdiqsvlkqn npsndsssss sstgiksisp
-      541 rtyyqpinny qnpngpsnis aaqltyssmn lnnkiipnns ipavstiaag ekplkkctmp
+      541 rtyyqpinny qnpngpsnis aaqltyssmn lnnkiipnns ipavstiaag ekplkkctmp
-      601 nsnqleghti tnlqtlsatm pmkqqlmgni asplsyprna tmnsastlgi tpadskpltp
+      601 nsnqleghti tnlqtlsatm pmkqqlmgni asplsyprna tmnsastlgi tpadskpltp
-      661 sptttntnqs sesnvgsiht gitlprvese sashskwske adsgntvpdn qtlkeprssq
+      661 sptttntnqs sesnvgsiht gitlprvese sashskwske adsgntvpdn qtlkeprssq
-      721 lpisaltstd tdkiktstsd eatqpnepse aepvkesess ksqvdgagdv sneeiaaddt
+      721 lpisaltstd tdkiktstsd eatqpnepse aepvkesess ksqvdgagdv sneeiaaddt
-      781 kkqek"),
+      781 kkqek"),
-              stringsAsFactors = FALSE))
+              stringsAsFactors = FALSE))
-
+
-myDB$protein <-
+myDB$protein <-
-    rbind(myDB$protein,
+    rbind(myDB$protein,
-          data.frame(
+          data.frame(
-              ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
+              ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
-              name = "XBP1_SACCE",
+              name = "XBP1_SACCE",
-              RefSeqID = "NP_012165",
+              RefSeqID = "NP_012165",
-              UniProtID = "P40489",
+              UniProtID = "P40489",
-              taxonomy.ID = as.integer(4932),
+              taxonomy.ID = as.integer(4932),
-              sequence = dbSanitizeSequence("
+              sequence = dbSanitizeSequence("
-        1 mkypafsins dtvhltdnpl ddyqrlylvs vldrdsppas fsaglnirkv nykssiaaqf
+        1 mkypafsins dtvhltdnpl ddyqrlylvs vldrdsppas fsaglnirkv nykssiaaqf
-       61 thpnfiisar dagngeeaaa qnvlncfeyq fpnlqtiqsl vheqtllsql assatphsal
+       61 thpnfiisar dagngeeaaa qnvlncfeyq fpnlqtiqsl vheqtllsql assatphsal
-      121 hlhdknilmg kiilpsrsnk tpvsasptkq ekkalstasr enatssltkn qqfkltkmdh
+      121 hlhdknilmg kiilpsrsnk tpvsasptkq ekkalstasr enatssltkn qqfkltkmdh
-      181 nlindklinp nncviwshds gyvfmtgiwr lyqdvmkgli nlprgdsvst sqqqffckae
+      181 nlindklinp nncviwshds gyvfmtgiwr lyqdvmkgli nlprgdsvst sqqqffckae
-      241 fekilsfcfy nhssftsees ssvllsssts sppkrrtstg stfldanass sstsstqann
+      241 fekilsfcfy nhssftsees ssvllsssts sppkrrtstg stfldanass sstsstqann
-      301 yidfhwnnik pelrdlicqs ykdflinelg pdqidlpnln panftkrirg gyikiqgtwl
+      301 yidfhwnnik pelrdlicqs ykdflinelg pdqidlpnln panftkrirg gyikiqgtwl
-      361 pmeisrllcl rfcfpiryfl vpifgpdfpk dceswylahq nvtfassttg agaataataa
+      361 pmeisrllcl rfcfpiryfl vpifgpdfpk dceswylahq nvtfassttg agaataataa
-      421 antstnftst avarprqkpr prprqrstsm shskaqklvi edalpsfdsf venlglssnd
+      421 antstnftst avarprqkpr prprqrstsm shskaqklvi edalpsfdsf venlglssnd
-      481 knfikknskr qksstytsqt sspigprdpt vqilsnlasf ynthghrysy pgniyipqqr
+      481 knfikknskr qksstytsqt sspigprdpt vqilsnlasf ynthghrysy pgniyipqqr
-      541 yslpppnqls spqrqlnyty dhihpvpsqy qsprhynvps spiapapptf pqpygddhyh
+      541 yslpppnqls spqrqlnyty dhihpvpsqy qsprhynvps spiapapptf pqpygddhyh
-      601 flkyasevyk qqnqrpahnt ntnmdtsfsp rannslnnfk fktnskq"),
+      601 flkyasevyk qqnqrpahnt ntnmdtsfsp rannslnnfk fktnskq"),
-              stringsAsFactors = FALSE))
+              stringsAsFactors = FALSE))
-
+
-# [END]
+# [END]
--- a/ABC-units.R
+++ b/ABC-units.R
@ -1,69 +1,69 @@
-# ABC-units.R
+# ABC-units.R
-#
+#
-# Purpose: A Bioinformatics Course: R code for learning units
+# Purpose: A Bioinformatics Course: R code for learning units
-#
+#
-# Version: 4.0
+# Version: 4.0
-#
+#
-# Date:    2020  09  16
+# Date:    2020  09  16
-# Author:  Boris Steipe (boris.steipe@utoronto.ca)
+# Author:  Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions:
+# Versions:
-# V 4.0    2020 version
+# V 4.0    2020 version
-# V 3.0    2019 version
+# V 3.0    2019 version
-# V 2.0    2018 version
+# V 2.0    2018 version
-# V 1.0    2017 version
+# V 1.0    2017 version
-# V 0.1    First code
+# V 0.1    First code
-#
+#
-# TODO:
+# TODO:
-#
+#
-#
+#
-# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
+# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
-#
+#
-# The R-scripts and datasets in this project will be continuously updated,
+# The R-scripts and datasets in this project will be continuously updated,
-# and updates will be posted on GitHub. To bring your version into the latest
+# and updates will be posted on GitHub. To bring your version into the latest
-# state use the Git-pane (top left) and "pull" (blue downward arrow) from the
+# state use the Git-pane (top left) and "pull" (blue downward arrow) from the
-# repository. However, this will overwrite locally edited versions of files.
+# repository. However, this will overwrite locally edited versions of files.
-
+
-# To edit code and experiment with it, for example to add your own comments and
+# To edit code and experiment with it, for example to add your own comments and
-# examples, save your edited version into the "myScripts" folder. Otherwise you
+# examples, save your edited version into the "myScripts" folder. Otherwise you
-# may have problems with git when you update the project to a new version. It's
+# may have problems with git when you update the project to a new version. It's
-# good practice to change the filename, for example by prepending your initials.
+# good practice to change the filename, for example by prepending your initials.
-# This helps distinguish the files you are working with e.g. in a list of
+# This helps distinguish the files you are working with e.g. in a list of
-# recent files. For example if your name is Honjo Tasuku, your edited
+# recent files. For example if your name is Honjo Tasuku, your edited
-# BIN-Sequence.R might be named HT-BIN-Sequence.R
+# BIN-Sequence.R might be named HT-BIN-Sequence.R
-
+
-# If you pull from github and get the following type of error ...
+# If you pull from github and get the following type of error ...
-#     ---------------
+#     ---------------
-#     error: Your local changes to the following files would be
+#     error: Your local changes to the following files would be
-#     overwritten by merge
+#     overwritten by merge
-#     ...
+#     ...
-#     Please commit your changes or stash them before you can merge.
+#     Please commit your changes or stash them before you can merge.
-#     ---------------
+#     ---------------
-# ... then, you need to bring the offending file into its original state.
+# ... then, you need to bring the offending file into its original state.
-# Open the Commit window, select the file, and click on the Revert button.
+# Open the Commit window, select the file, and click on the Revert button.
-#
+#
-# When working with these scripts DO NOT SIMPLY  source()  THESE FILES!
+# When working with these scripts DO NOT SIMPLY  source()  THESE FILES!
-
+
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-#  going on. That's not how it works ...
+#  going on. That's not how it works ...
-#
+#
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-# Once you have typed and executed the function init(), you will find a file
+# Once you have typed and executed the function init(), you will find a file
-# called myScript.R in the project directory.
+# called myScript.R in the project directory.
-#
+#
-# Open it: you can place all of your code-experiments and notes into that
+# Open it: you can place all of your code-experiments and notes into that
-# file. This will complement your "Course Journal". If you keep all contents in
+# file. This will complement your "Course Journal". If you keep all contents in
-# this one file, you can find everything by using the <cmd>-F find function. To
+# this one file, you can find everything by using the <cmd>-F find function. To
-# cross-reference code in your journal, create section headings.
+# cross-reference code in your journal, create section headings.
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-# The individual learning units' files can be opened by simply clicking on them
+# The individual learning units' files can be opened by simply clicking on them
-# in the File pane.
+# in the File pane.
-
+
-
+
-
+
-# [END]
+# [END]
--- a/ABC-units.Rproj
+++ b/ABC-units.Rproj
@ -1,16 +1,16 @@
-Version: 1.0
+Version: 1.0
-
+
-RestoreWorkspace: No
+RestoreWorkspace: No
-SaveWorkspace: No
+SaveWorkspace: No
-AlwaysSaveHistory: No
+AlwaysSaveHistory: No
-
+
-EnableCodeIndexing: Yes
+EnableCodeIndexing: Yes
-UseSpacesForTab: Yes
+UseSpacesForTab: Yes
-NumSpacesForTab: 2
+NumSpacesForTab: 2
-Encoding: UTF-8
+Encoding: UTF-8
-
+
-RnwWeave: knitr
+RnwWeave: knitr
-LaTeX: XeLaTeX
+LaTeX: XeLaTeX
-
+
-AutoAppendNewline: Yes
+AutoAppendNewline: Yes
-StripTrailingWhitespace: Yes
+StripTrailingWhitespace: Yes
--- a/BIN-ALI-BLAST.R
+++ b/BIN-ALI-BLAST.R
@ -1,111 +1,111 @@
-# tocID <- "BIN-ALI-BLAST.R"
+# tocID <- "BIN-ALI-BLAST.R"
-#
+#
-# Purpose:  A Bioinformatics Course:
+# Purpose:  A Bioinformatics Course:
-#              R code accompanying the BIN-ALI-BLAST unit.
+#              R code accompanying the BIN-ALI-BLAST unit.
-#
+#
-# ==============================================================================
+# ==============================================================================
-#
+#
-# Version:  1.3
+# Version:  1.3
-#
+#
-# Date:     2017-10  -  2020-09
+# Date:     2017-10  -  2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions:
+# Versions:
-#           1.3    2020 Maintenance
+#           1.3    2020 Maintenance
-#           1.2    Change from require() to requireNamespace(),
+#           1.2    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout
+#                      use <package>::<function>() idiom throughout
-#           1.1    Fixed parsing logic.
+#           1.1    Fixed parsing logic.
-#           1.0    First live version 2017.
+#           1.0    First live version 2017.
-#           0.1    First code copied from 2016 material.
+#           0.1    First code copied from 2016 material.
-#
+#
-#
+#
-# TODO:
+# TODO:
-#
+#
-#
+#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
+#
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
+# going on. That's not how it works ...
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC> 
+#TOC> 
-#TOC>   Section  Title                               Line
+#TOC>   Section  Title                               Line
-#TOC> ---------------------------------------------------
+#TOC> ---------------------------------------------------
-#TOC>   1        Defining the APSES domain             45
+#TOC>   1        Defining the APSES domain             45
-#TOC>   2        Executing the BLAST search            75
+#TOC>   2        Executing the BLAST search            75
-#TOC>   3        Analysing results                     97
+#TOC>   3        Analysing results                     97
-#TOC> 
+#TOC> 
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  Defining the APSES domain  ===========================================
+# =    1  Defining the APSES domain  ===========================================
-
+
-# Load your protein database
+# Load your protein database
-source("makeProteinDB.R")
+source("makeProteinDB.R")
-
+
-# Get the APSES domain sequence via your MBP1_MYSPE feature annotation. (You
+# Get the APSES domain sequence via your MBP1_MYSPE feature annotation. (You
-# have entered this data into your database in the
+# have entered this data into your database in the
-# BIN-ALI-Optimal_sequence_alignment unit.)
+# BIN-ALI-Optimal_sequence_alignment unit.)
-
+
-( myOrth <- sprintf("MBP1_%s", biCode(MYSPE)) ) # If this is not the correct
+( myOrth <- sprintf("MBP1_%s", biCode(MYSPE)) ) # If this is not the correct
-                                                # name of the Mbp1 orthologue
+                                                # name of the Mbp1 orthologue
-                                                # of Mbp1 in your protein
+                                                # of Mbp1 in your protein
-                                                # database, DON'T continue. We
+                                                # database, DON'T continue. We
-                                                # need to fix this problem.
+                                                # need to fix this problem.
-                                                # Get in touch.
+                                                # Get in touch.
-
+
-(proID <- myDB$protein$ID[myDB$protein$name == myOrth])
+(proID <- myDB$protein$ID[myDB$protein$name == myOrth])
-(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
+(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
-(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
+(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
-                               myDB$annotation$featureID == ftrID])
+                               myDB$annotation$featureID == ftrID])
-(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
+(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
-(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
+(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
-(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
+(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
-                 start,
+                 start,
-                 end))
+                 end))
-
+
-# The MYSPE "apses" sequence is the sequence that we will use for our reverse
+# The MYSPE "apses" sequence is the sequence that we will use for our reverse
-# BLAST search.
+# BLAST search.
-
+
-
+
-# =    2  Executing the BLAST search  ==========================================
+# =    2  Executing the BLAST search  ==========================================
-
+
-# The ./scripts/BLAST.R code defines two functions to access the BLAST interface
+# The ./scripts/BLAST.R code defines two functions to access the BLAST interface
-# through its Web API, and to parse results. Have a look at the script, then
+# through its Web API, and to parse results. Have a look at the script, then
-# source it:
+# source it:
-
+
-source("./scripts/BLAST.R")
+source("./scripts/BLAST.R")
-
+
-# Use BLAST() to find the best match to the MYSPE APSES domain in Saccharomyces
+# Use BLAST() to find the best match to the MYSPE APSES domain in Saccharomyces
-# cerevisiae:
+# cerevisiae:
-
+
-BLASTresults <- BLAST(apses,                       # MYSPE APSES domain sequence
+BLASTresults <- BLAST(apses,                       # MYSPE APSES domain sequence
-                     db = "refseq_protein",        # database to search in
+                     db = "refseq_protein",        # database to search in
-                     nHits = 10,                   #
+                     nHits = 10,                   #
-                     E = 0.01,                     #
+                     E = 0.01,                     #
-                     limits = "txid559292[ORGN]")  # S. cerevisiae S288c
+                     limits = "txid559292[ORGN]")  # S. cerevisiae S288c
-
+
-
+
-length(BLASTresults$hits)  # There should be at least one hit there. Ask for
+length(BLASTresults$hits)  # There should be at least one hit there. Ask for
-                           # advice in case this step fails.
+                           # advice in case this step fails.
-
+
-
+
-# =    3  Analysing results  ===================================================
+# =    3  Analysing results  ===================================================
-
+
-(topHit <- BLASTresults$hits[[1]])   # Get the top hit
+(topHit <- BLASTresults$hits[[1]])   # Get the top hit
-
+
-# What is the refseq ID of the top hit?
+# What is the refseq ID of the top hit?
-topHit$accession
+topHit$accession
-
+
-# If this is "NP_010227.1" you have confirmed the RBM of the MYSPE apses
+# If this is "NP_010227.1" you have confirmed the RBM of the MYSPE apses
-# domain. If it is not, ask me for advice.
+# domain. If it is not, ask me for advice.
-
+
-
+
-
+
-
+
-
+
-# [END]
+# [END]
--- a/BIN-ALI-Dotplot.R
+++ b/BIN-ALI-Dotplot.R
@ -1,195 +1,195 @@
-# tocID <- "BIN-ALI-Dotplot.R"
+# tocID <- "BIN-ALI-Dotplot.R"
-#
+#
-#
+#
-# ==============================================================================
+# ==============================================================================
-#
+#
-# Purpose:  A Bioinformatics Course:
+# Purpose:  A Bioinformatics Course:
-#              R code accompanying the BIN-ALI-Dotplot unit.
+#              R code accompanying the BIN-ALI-Dotplot unit.
-#
+#
-# Version:  0.2
+# Version:  0.2
-#
+#
-# Date:     2019  01  07
+# Date:     2019  01  07
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions:
+# Versions:
-#           0.2    Change from require() to requireNamespace(),
+#           0.2    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout
+#                      use <package>::<function>() idiom throughout
-#           0.1    First code copied from 2016 material.
+#           0.1    First code copied from 2016 material.
-#
+#
-#
+#
-# TODO:
+# TODO:
-#
+#
-#
+#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
+#
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
+# going on. That's not how it works ...
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC> 
+#TOC> 
-#TOC>   Section  Title                  Line
+#TOC>   Section  Title                  Line
-#TOC> --------------------------------------
+#TOC> --------------------------------------
-#TOC>   1        ___Section___            42
+#TOC>   1        ___Section___            42
-#TOC>   2        Tasks                   190
+#TOC>   2        Tasks                   190
-#TOC> 
+#TOC> 
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  ___Section___  =======================================================
+# =    1  ___Section___  =======================================================
-
+
-if (!requireNamespace("BiocManager", quietly=TRUE)) {
+if (!requireNamespace("BiocManager", quietly=TRUE)) {
-  install.packages("BiocManager")
+  install.packages("BiocManager")
-}
+}
-if (!requireNamespace("Biostrings", quietly=TRUE)) {
+if (!requireNamespace("Biostrings", quietly=TRUE)) {
-  BiocManager::install("Biostrings")
+  BiocManager::install("Biostrings")
-}
+}
-# Package information:
+# Package information:
-#  library(help = Biostrings)       # basic information
+#  library(help = Biostrings)       # basic information
-#  browseVignettes("Biostrings")    # available vignettes
+#  browseVignettes("Biostrings")    # available vignettes
-#  data(package = "Biostrings")     # available datasets
+#  data(package = "Biostrings")     # available datasets
-
+
-if (!requireNamespace("seqinr", quietly=TRUE)) {
+if (!requireNamespace("seqinr", quietly=TRUE)) {
-  install.packages("seqinr")
+  install.packages("seqinr")
-}
+}
-
+
-
+
-# Let's load BLOSUM62
+# Let's load BLOSUM62
-data(BLOSUM62, package = "Biostrings")
+data(BLOSUM62, package = "Biostrings")
-
+
-# Now let's craft code for a dotplot. That's surprisingly simple. We build a
+# Now let's craft code for a dotplot. That's surprisingly simple. We build a
-# matrix that has as many rows as one sequence, as many columns as another. Then
+# matrix that has as many rows as one sequence, as many columns as another. Then
-# we go through every cell of the matrix and enter the pairscore we encounter
+# we go through every cell of the matrix and enter the pairscore we encounter
-# for the amino acid pair whose position corresponds to the row and column
+# for the amino acid pair whose position corresponds to the row and column
-# index. Finally we visualize the matrix in a plot.
+# index. Finally we visualize the matrix in a plot.
-#
+#
-
+
-# First we fetch our sequences and split them into single characters.
+# First we fetch our sequences and split them into single characters.
-sel <- myDB$protein$name == "MBP1_SACCE"
+sel <- myDB$protein$name == "MBP1_SACCE"
-MBP1_SACCE <- seqinr::s2c(myDB$protein$sequence[sel])
+MBP1_SACCE <- seqinr::s2c(myDB$protein$sequence[sel])
-
+
-sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
+sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
-MBP1_MYSPE <- seqinr::s2c(myDB$protein$sequence[sel])
+MBP1_MYSPE <- seqinr::s2c(myDB$protein$sequence[sel])
-
+
-# Check that we have two character vectors of the expected length.
+# Check that we have two character vectors of the expected length.
-str(MBP1_SACCE)
+str(MBP1_SACCE)
-str(MBP1_MYSPE)
+str(MBP1_MYSPE)
-
+
-# How do we get the pairscore values? Consider: a single pair of amino acids can
+# How do we get the pairscore values? Consider: a single pair of amino acids can
-# be obtained from sequences SACCE and MYSPE, e.g. from positions 13 and 21 ...
+# be obtained from sequences SACCE and MYSPE, e.g. from positions 13 and 21 ...
-MBP1_SACCE[13]
+MBP1_SACCE[13]
-MBP1_MYSPE[21]
+MBP1_MYSPE[21]
-
+
-# ... using these as subsetting expressions, we can pull the pairscore
+# ... using these as subsetting expressions, we can pull the pairscore
-# from the MDM
+# from the MDM
-BLOSUM62[MBP1_SACCE[13], MBP1_MYSPE[21]]
+BLOSUM62[MBP1_SACCE[13], MBP1_MYSPE[21]]
-
+
-# First we build an empty matrix that will hold all pairscores ...
+# First we build an empty matrix that will hold all pairscores ...
-dotMat <- matrix(numeric(length(MBP1_SACCE) * length(MBP1_MYSPE)),
+dotMat <- matrix(numeric(length(MBP1_SACCE) * length(MBP1_MYSPE)),
-                 nrow = length(MBP1_SACCE), ncol = length(MBP1_MYSPE))
+                 nrow = length(MBP1_SACCE), ncol = length(MBP1_MYSPE))
-
+
-# ... then we loop over the sequences and store the scores in the matrix.
+# ... then we loop over the sequences and store the scores in the matrix.
-#
+#
-for (i in 1:length(MBP1_SACCE)) {
+for (i in 1:length(MBP1_SACCE)) {
-  for (j in 1:length(MBP1_MYSPE)) {
+  for (j in 1:length(MBP1_MYSPE)) {
-    dotMat[i, j] <- BLOSUM62[MBP1_SACCE[i], MBP1_MYSPE[j]]
+    dotMat[i, j] <- BLOSUM62[MBP1_SACCE[i], MBP1_MYSPE[j]]
-  }
+  }
-}
+}
-
+
-# Even though this is a large matrix, this does not take much time ...
+# Even though this is a large matrix, this does not take much time ...
-# Let's have a look at a small block of the values:
+# Let's have a look at a small block of the values:
-
+
-dotMat[1:10, 1:10]
+dotMat[1:10, 1:10]
-
+
-# Rows in this matrix correspond to an amino acid from MBP1_SACCE, columns in
+# Rows in this matrix correspond to an amino acid from MBP1_SACCE, columns in
-# the matrix correspond to an amino acid from MBP1_MYSPE.
+# the matrix correspond to an amino acid from MBP1_MYSPE.
-
+
-# To plot this, we use the image() function. Here, with default parameters.
+# To plot this, we use the image() function. Here, with default parameters.
-
+
-image(dotMat)
+image(dotMat)
-
+
-# Be patient, this takes a few moments to render: more than 500,000 values.
+# Be patient, this takes a few moments to render: more than 500,000 values.
-# Nice.
+# Nice.
-# What do you expect?
+# What do you expect?
-# What would similar sequences look like?
+# What would similar sequences look like?
-# What do you see?
+# What do you see?
-
+
-# You might notice a thin line of yellow along the diagonal, moving approximately
+# You might notice a thin line of yellow along the diagonal, moving approximately
-# from bottom left to top right, fading in and out of existence. This is the
+# from bottom left to top right, fading in and out of existence. This is the
-# signature of extended sequence similarity.
+# signature of extended sequence similarity.
-
+
-# Let's magnify this a bit by looking at only the first 200 amino acids ...
+# Let's magnify this a bit by looking at only the first 200 amino acids ...
-image(dotMat[1:200, 1:200])
+image(dotMat[1:200, 1:200])
-
+
-# ... and, according to our normal writing convention, we would like the
+# ... and, according to our normal writing convention, we would like the
-# diagonal to run from top-left to bottom-right since we write from left to
+# diagonal to run from top-left to bottom-right since we write from left to
-# right and from top to bottom...
+# right and from top to bottom...
-image(dotMat[1:200, 1:200], ylim = 1.0:0.0)
+image(dotMat[1:200, 1:200], ylim = 1.0:0.0)
-
+
-# ... and we would like the range of the x- and y- axis to correspond to the
+# ... and we would like the range of the x- and y- axis to correspond to the
-# sequence position ...
+# sequence position ...
-image(x = 1:200, y = 1:200,  dotMat[1:200, 1:200], ylim=c(200,1))
+image(x = 1:200, y = 1:200,  dotMat[1:200, 1:200], ylim=c(200,1))
-
+
-# ... and labels! Axis labels would be nice ...
+# ... and labels! Axis labels would be nice ...
-image(x = 1:200, y = 1:200,  dotMat[1:200, 1:200], ylim=c(200,1),
+image(x = 1:200, y = 1:200,  dotMat[1:200, 1:200], ylim=c(200,1),
-      xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE" )
+      xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE" )
-
+
-# ... and why don't we have axis-numbers on all four sides? Go, make that right
+# ... and why don't we have axis-numbers on all four sides? Go, make that right
-# too ...
+# too ...
-len <- 200
+len <- 200
-image(x = 1:len, y = 1:len,  dotMat[1:len, 1:len], ylim=c(len,1),
+image(x = 1:len, y = 1:len,  dotMat[1:len, 1:len], ylim=c(len,1),
-      xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE", axes = FALSE)
+      xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE", axes = FALSE)
-box()
+box()
-axis(1, at = c(1, seq(10, len, by=10)))
+axis(1, at = c(1, seq(10, len, by=10)))
-axis(2, at = c(1, seq(10, len, by=10)))
+axis(2, at = c(1, seq(10, len, by=10)))
-axis(3, at = c(1, seq(10, len, by=10)))
+axis(3, at = c(1, seq(10, len, by=10)))
-axis(4, at = c(1, seq(10, len, by=10)))
+axis(4, at = c(1, seq(10, len, by=10)))
-
+
-# ... you get the idea, we can infinitely customize our plot. However a good way
+# ... you get the idea, we can infinitely customize our plot. However a good way
-# to do this is to develop a particular view for, say, a report or publication
+# to do this is to develop a particular view for, say, a report or publication
-# in a script and then put it into a function. I have put a function into the
+# in a script and then put it into a function. I have put a function into the
-# utilities file and called it dotPlot2(). Why not dotPlot() ... that's because
+# utilities file and called it dotPlot2(). Why not dotPlot() ... that's because
-# there already is a dotplot function in the seqinr package:
+# there already is a dotplot function in the seqinr package:
-
+
-seqinr::dotPlot(MBP1_SACCE, MBP1_MYSPE)                           # seqinr
+seqinr::dotPlot(MBP1_SACCE, MBP1_MYSPE)                           # seqinr
-dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE")  # Ours
+dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE")  # Ours
-
+
-# Which one do you prefer? You can probably see the block patterns that arise
+# Which one do you prefer? You can probably see the block patterns that arise
-# from segments of repetitive, low complexity sequence. But you probably have to
+# from segments of repetitive, low complexity sequence. But you probably have to
-# look very closely to discern the faint diagonals that correspond to similar
+# look very closely to discern the faint diagonals that correspond to similar
-# sequence.
+# sequence.
-
+
-
+
-# Let's see if we can enhance the contrast between distributed noise and the
+# Let's see if we can enhance the contrast between distributed noise and the
-# actual alignment of conserved residues. We can filter the dot matrix with a
+# actual alignment of conserved residues. We can filter the dot matrix with a
-# pattern that enhances diagonally repeated values. Every value in the matrix
+# pattern that enhances diagonally repeated values. Every value in the matrix
-# will be replaced by a weighted average of its neighborhood. Here is  a
+# will be replaced by a weighted average of its neighborhood. Here is  a
-# diagonal-filter:
+# diagonal-filter:
-
+
-myFilter <- matrix(numeric(25), nrow = 5)
+myFilter <- matrix(numeric(25), nrow = 5)
-myFilter[1, ] <- c( 1, 0, 0, 0, 0)
+myFilter[1, ] <- c( 1, 0, 0, 0, 0)
-myFilter[2, ] <- c( 0, 1, 0, 0, 0)
+myFilter[2, ] <- c( 0, 1, 0, 0, 0)
-myFilter[3, ] <- c( 0, 0, 1, 0, 0)
+myFilter[3, ] <- c( 0, 0, 1, 0, 0)
-myFilter[4, ] <- c( 0, 0, 0, 1, 0)
+myFilter[4, ] <- c( 0, 0, 0, 1, 0)
-myFilter[5, ] <- c( 0, 0, 0, 0, 1)
+myFilter[5, ] <- c( 0, 0, 0, 0, 1)
-
+
-# I have added the option to read such filters (or others that you could define on your own) as a parameter of the function.
+# I have added the option to read such filters (or others that you could define on your own) as a parameter of the function.
-
+
-dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE", f = myFilter)
+dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE", f = myFilter)
-
+
-# I think the result shows quite nicely how the two sequences are globally
+# I think the result shows quite nicely how the two sequences are globally
-# related and where the regions of sequence similarity are. Play with this a bit
+# related and where the regions of sequence similarity are. Play with this a bit
-# ...  Can you come up with a better filter? If so, eMail us.
+# ...  Can you come up with a better filter? If so, eMail us.
-
+
-
+
-
+
-
+
-# =    2  Tasks  ===============================================================
+# =    2  Tasks  ===============================================================
-
+
-
+
-
+
-
+
-# [END]
+# [END]
--- a/BIN-ALI-MSA.R
+++ b/BIN-ALI-MSA.R
--- a/BIN-ALI-Optimal_sequence_alignment.R
+++ b/BIN-ALI-Optimal_sequence_alignment.R
@ -1,365 +1,365 @@
-# tocID <- "BIN-ALI-Optimal_sequence_alignment.R"
+# tocID <- "BIN-ALI-Optimal_sequence_alignment.R"
-#
+#
-# Purpose:  A Bioinformatics Course:
+# Purpose:  A Bioinformatics Course:
-#              R code accompanying the BIN-ALI-Optimal_sequence_alignment unit.
+#              R code accompanying the BIN-ALI-Optimal_sequence_alignment unit.
-#
+#
-# ==============================================================================
+# ==============================================================================
-# Version:  1.7.1
+# Version:  1.7.1
-#
+#
-# Date:     2017-09   -   2020-10
+# Date:     2017-09   -   2020-10
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions:
+# Versions:
-#           1.7.1  add jsonlite:: to fromJSON() in code sample and ./myScripts/
+#           1.7.1  add jsonlite:: to fromJSON() in code sample and ./myScripts/
-#           1.7    2020 updates
+#           1.7    2020 updates
-#           1.6    Maintenance
+#           1.6    Maintenance
-#           1.5    Change from require() to requireNamespace(),
+#           1.5    Change from require() to requireNamespace(),
-#                    use <package>::<function>() idiom throughout
+#                    use <package>::<function>() idiom throughout
-#           1.4    Pull s2c() from seqinr package, rather than loading the
+#           1.4    Pull s2c() from seqinr package, rather than loading the
-#                    entire library.
+#                    entire library.
-#           1.3    Updated confirmation task with correct logic
+#           1.3    Updated confirmation task with correct logic
-#           1.2    Added missing load of seqinr package
+#           1.2    Added missing load of seqinr package
-#           1.1    Update annotation file logic - it could already have been
+#           1.1    Update annotation file logic - it could already have been
-#                    prepared in the BIN-FUNC-Annotation unit.
+#                    prepared in the BIN-FUNC-Annotation unit.
-#           1.0.1  bugfix
+#           1.0.1  bugfix
-#           1.0    First 2017 live version.
+#           1.0    First 2017 live version.
-#           0.1    First code copied from 2016 material.
+#           0.1    First code copied from 2016 material.
-#
+#
-# TODO:
+# TODO:
-#
+#
-#
+#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
+#
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
+# going on. That's not how it works ...
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC> 
+#TOC> 
-#TOC>   Section  Title                                                      Line
+#TOC>   Section  Title                                                      Line
-#TOC> --------------------------------------------------------------------------
+#TOC> --------------------------------------------------------------------------
-#TOC>   1        Prepare                                                      58
+#TOC>   1        Prepare                                                      58
-#TOC>   2        Biostrings Pairwise Alignment                                75
+#TOC>   2        Biostrings Pairwise Alignment                                75
-#TOC>   2.1        Optimal global alignment                                   93
+#TOC>   2.1        Optimal global alignment                                   93
-#TOC>   2.2        Optimal local alignment                                   156
+#TOC>   2.2        Optimal local alignment                                   156
-#TOC>   3        APSES Domain annotation by alignment                        180
+#TOC>   3        APSES Domain annotation by alignment                        180
-#TOC>   4        Update your database script                                 261
+#TOC>   4        Update your database script                                 261
-#TOC>   4.1        Preparing an annotation file ...                          267
+#TOC>   4.1        Preparing an annotation file ...                          267
-#TOC>   4.1.1          If you HAVE NOT done the BIN-FUNC-Annotation unit     269
+#TOC>   4.1.1          If you HAVE NOT done the BIN-FUNC-Annotation unit     269
-#TOC>   4.1.2          If you HAVE done the BIN-FUNC-Annotation unit         314
+#TOC>   4.1.2          If you HAVE done the BIN-FUNC-Annotation unit         314
-#TOC>   4.2        Execute and Validate                                      338
+#TOC>   4.2        Execute and Validate                                      338
-#TOC> 
+#TOC> 
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  Prepare  =============================================================
+# =    1  Prepare  =============================================================
-
+
-if (! requireNamespace("seqinr", quietly=TRUE)) {
+if (! requireNamespace("seqinr", quietly=TRUE)) {
-  install.packages("seqinr")
+  install.packages("seqinr")
-}
+}
-# You can get package information with the following commands:
+# You can get package information with the following commands:
-# library(help = seqinr)       # basic information
+# library(help = seqinr)       # basic information
-# browseVignettes("seqinr")    # available vignettes
+# browseVignettes("seqinr")    # available vignettes
-# data(package = "seqinr")     # available datasets
+# data(package = "seqinr")     # available datasets
-
+
-
+
-# You need to recreate the protein database that you have constructed in the
+# You need to recreate the protein database that you have constructed in the
-# BIN-Storing_data unit.
+# BIN-Storing_data unit.
-
+
-source("./myScripts/makeProteinDB.R")
+source("./myScripts/makeProteinDB.R")
-
+
-
+
-# =    2  Biostrings Pairwise Alignment  =======================================
+# =    2  Biostrings Pairwise Alignment  =======================================
-
+
-
+
-if (!requireNamespace("BiocManager", quietly=TRUE)) {
+if (!requireNamespace("BiocManager", quietly=TRUE)) {
-  install.packages("BiocManager")
+  install.packages("BiocManager")
-}
+}
-if (!requireNamespace("Biostrings", quietly=TRUE)) {
+if (!requireNamespace("Biostrings", quietly=TRUE)) {
-  BiocManager::install("Biostrings")
+  BiocManager::install("Biostrings")
-}
+}
-# Package information:
+# Package information:
-#  library(help = Biostrings)       # basic information
+#  library(help = Biostrings)       # basic information
-#  browseVignettes("Biostrings")    # available vignettes
+#  browseVignettes("Biostrings")    # available vignettes
-#  data(package = "Biostrings")     # available datasets
+#  data(package = "Biostrings")     # available datasets
-
+
-
+
-# Biostrings stores sequences in "XString" objects. Once we have converted our
+# Biostrings stores sequences in "XString" objects. Once we have converted our
-# target sequences to AAString objects, the alignment itself is straightforward.
+# target sequences to AAString objects, the alignment itself is straightforward.
-
+
-# ==   2.1  Optimal global alignment  ==========================================
+# ==   2.1  Optimal global alignment  ==========================================
-
+
-# The pairwiseAlignment() function was written to behave
+# The pairwiseAlignment() function was written to behave
-# exactly like the functions you encountered on the EMBOSS server.
+# exactly like the functions you encountered on the EMBOSS server.
-
+
-# First: make AAString objects ...
+# First: make AAString objects ...
-sel <- myDB$protein$name == "MBP1_SACCE"
+sel <- myDB$protein$name == "MBP1_SACCE"
-aaMBP1_SACCE <- Biostrings::AAString(myDB$protein$sequence[sel])
+aaMBP1_SACCE <- Biostrings::AAString(myDB$protein$sequence[sel])
-
+
-sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
+sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
-aaMBP1_MYSPE <-   Biostrings::AAString(myDB$protein$sequence[sel])
+aaMBP1_MYSPE <-   Biostrings::AAString(myDB$protein$sequence[sel])
-
+
-?pairwiseAlignment
+?pairwiseAlignment
-# ... and align.
+# ... and align.
-# Global optimal alignment with end-gap penalties is default.
+# Global optimal alignment with end-gap penalties is default.
-ali1 <-  Biostrings::pairwiseAlignment(
+ali1 <-  Biostrings::pairwiseAlignment(
-  aaMBP1_SACCE,
+  aaMBP1_SACCE,
-  aaMBP1_MYSPE,
+  aaMBP1_MYSPE,
-  substitutionMatrix = "BLOSUM62",
+  substitutionMatrix = "BLOSUM62",
-  gapOpening = 10,
+  gapOpening = 10,
-  gapExtension = 0.5)
+  gapExtension = 0.5)
-
+
-str(ali1)  # ... it's complicated
+str(ali1)  # ... it's complicated
-
+
-# This is a Biostrings alignment object. But we can use Biostrings functions to
+# This is a Biostrings alignment object. But we can use Biostrings functions to
-# tame it:
+# tame it:
-ali1
+ali1
-Biostrings::writePairwiseAlignments(ali1)   # That should look familiar
+Biostrings::writePairwiseAlignments(ali1)   # That should look familiar
-
+
-# And we can make the internal structure work for us  (@ is for classes as
+# And we can make the internal structure work for us  (@ is for classes as
-# $ is for lists ...)
+# $ is for lists ...)
-str(ali1@pattern)
+str(ali1@pattern)
-ali1@pattern
+ali1@pattern
-ali1@pattern@range
+ali1@pattern@range
-ali1@pattern@indel
+ali1@pattern@indel
-ali1@pattern@mismatch
+ali1@pattern@mismatch
-
+
-# or work with "normal" R functions
+# or work with "normal" R functions
-# the alignment length
+# the alignment length
-nchar(as.character(ali1@pattern))
+nchar(as.character(ali1@pattern))
-
+
-# the number of identities
+# the number of identities
-sum(seqinr::s2c(as.character(ali1@pattern)) ==
+sum(seqinr::s2c(as.character(ali1@pattern)) ==
-    seqinr::s2c(as.character(ali1@subject)))
+    seqinr::s2c(as.character(ali1@subject)))
-
+
-# ... e.g. to calculate the percentage of identities
+# ... e.g. to calculate the percentage of identities
-100 *
+100 *
-  sum(seqinr::s2c(as.character(ali1@pattern)) ==
+  sum(seqinr::s2c(as.character(ali1@pattern)) ==
-      seqinr::s2c(as.character(ali1@subject))) /
+      seqinr::s2c(as.character(ali1@subject))) /
-  nchar(as.character(ali1@pattern))
+  nchar(as.character(ali1@pattern))
-# ... which should be the same as reported in the writePairwiseAlignments()
+# ... which should be the same as reported in the writePairwiseAlignments()
-# output. Awkward to type? Then it calls for a function:
+# output. Awkward to type? Then it calls for a function:
-#
+#
-percentID <- function(al) {
+percentID <- function(al) {
-  # returns the percent-identity of a Biostrings alignment object
+  # returns the percent-identity of a Biostrings alignment object
-  return(100 *
+  return(100 *
-         sum(seqinr::s2c(as.character(al@pattern)) ==
+         sum(seqinr::s2c(as.character(al@pattern)) ==
-             seqinr::s2c(as.character(al@subject))) /
+             seqinr::s2c(as.character(al@subject))) /
-         nchar(as.character(al@pattern)))
+         nchar(as.character(al@pattern)))
-}
+}
-
+
-percentID(ali1)
+percentID(ali1)
-
+
-# ==   2.2  Optimal local alignment  ===========================================
+# ==   2.2  Optimal local alignment  ===========================================
-
+
-# Compare with local optimal alignment (like EMBOSS Water)
+# Compare with local optimal alignment (like EMBOSS Water)
-ali2 <-  Biostrings::pairwiseAlignment(
+ali2 <-  Biostrings::pairwiseAlignment(
-  aaMBP1_SACCE,
+  aaMBP1_SACCE,
-  aaMBP1_MYSPE,
+  aaMBP1_MYSPE,
-  type = "local",
+  type = "local",
-  substitutionMatrix = "BLOSUM62",
+  substitutionMatrix = "BLOSUM62",
-  gapOpening = 50,
+  gapOpening = 50,
-  gapExtension = 10)
+  gapExtension = 10)
-
+
-Biostrings::writePairwiseAlignments(ali2)
+Biostrings::writePairwiseAlignments(ali2)
-# This has probably only aligned the N-terminal DNA binding domain - but that
+# This has probably only aligned the N-terminal DNA binding domain - but that
-# one has quite high sequence identity:
+# one has quite high sequence identity:
-percentID(ali2)
+percentID(ali2)
-
+
-# == TASK: ==
+# == TASK: ==
-
+
-# Compare the two alignments. I have weighted the local alignment heavily
+# Compare the two alignments. I have weighted the local alignment heavily
-# towards an ungapped alignment by setting very high gap penalties. Try changing
+# towards an ungapped alignment by setting very high gap penalties. Try changing
-# the gap penalties and see what happens: how does the number of indels change,
+# the gap penalties and see what happens: how does the number of indels change,
-# how does the length of indels change...
+# how does the length of indels change...
-
+
-
+
-# =    3  APSES Domain annotation by alignment  ================================
+# =    3  APSES Domain annotation by alignment  ================================
-
+
-# In this section we define the MYSPE APSES sequence by performing a global,
+# In this section we define the MYSPE APSES sequence by performing a global,
-# optimal sequence alignment of the yeast APSES domain with the full length
+# optimal sequence alignment of the yeast APSES domain with the full length
-# protein sequence of the protein that was the most similar to the yeast APSES
+# protein sequence of the protein that was the most similar to the yeast APSES
-# domain.
+# domain.
-#
+#
-
+
-# I have annotated the yeast APSES domain as a feature in the
+# I have annotated the yeast APSES domain as a feature in the
-# database. To view the annotation, we can retrieve it via the proteinID and
+# database. To view the annotation, we can retrieve it via the proteinID and
-# featureID. Here is the yeast protein ID:
+# featureID. Here is the yeast protein ID:
-(proID <- myDB$protein$ID[myDB$protein$name == "MBP1_SACCE"])
+(proID <- myDB$protein$ID[myDB$protein$name == "MBP1_SACCE"])
-
+
-
+
-# ... and if you look at the feature table, you can identify the feature ID
+# ... and if you look at the feature table, you can identify the feature ID
-(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
+(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
-
+
-# ... and with the two annotations we can get the corresponding ID from the
+# ... and with the two annotations we can get the corresponding ID from the
-# annotation table
+# annotation table
-(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
+(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
-                             myDB$annotation$featureID == ftrID])
+                             myDB$annotation$featureID == ftrID])
-
+
-myDB$annotation[myDB$annotation$ID == proID &
+myDB$annotation[myDB$annotation$ID == proID &
-                myDB$annotation$ID == ftrID, ]
+                myDB$annotation$ID == ftrID, ]
-
+
-# The annotation record contains the start and end coordinates which we can use
+# The annotation record contains the start and end coordinates which we can use
-# to define the APSES domain sequence with a substr() expression.
+# to define the APSES domain sequence with a substr() expression.
-
+
-(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
+(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
-(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
+(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
-(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
+(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
-                 start,
+                 start,
-                 end))
+                 end))
-
+
-# Lots of code. But don't get lost. Let's recapitulate what we have done: we
+# Lots of code. But don't get lost. Let's recapitulate what we have done: we
-# have selected from the sequence column of the protein table the sequence whose
+# have selected from the sequence column of the protein table the sequence whose
-# name is "MBP1_SACCE", and selected from the annotation table the start
+# name is "MBP1_SACCE", and selected from the annotation table the start
-# and end coordinates of the annotation that joins an "APSES fold" feature with
+# and end coordinates of the annotation that joins an "APSES fold" feature with
-# the sequence, and used the start and end coordinates to extract a substring.
+# the sequence, and used the start and end coordinates to extract a substring.
-
+
-# Let's convert this to an AAstring and assign it:
+# Let's convert this to an AAString object and assign it:
-aaMB1_SACCE_APSES <- Biostrings::AAString(apses)
+aaMB1_SACCE_APSES <- Biostrings::AAString(apses)
-
+
-# Now let's align these two sequences of very different length without end-gap
+# Now let's align these two sequences of very different length without end-gap
-# penalties using the "overlap" type. "overlap" turns the
+# penalties using the "overlap" type. "overlap" turns the
-# end-gap penalties off and that is crucially important since
+# end-gap penalties off and that is crucially important since
-# the sequences have very different length.
+# the sequences have very different lengths.
-
+
-aliApses <-  Biostrings::pairwiseAlignment(
+aliApses <-  Biostrings::pairwiseAlignment(
-  aaMB1_SACCE_APSES,
+  aaMB1_SACCE_APSES,
-  aaMBP1_MYSPE,
+  aaMBP1_MYSPE,
-  type = "overlap",
+  type = "overlap",
-  substitutionMatrix = "BLOSUM62",
+  substitutionMatrix = "BLOSUM62",
-  gapOpening = 10,
+  gapOpening = 10,
-  gapExtension = 0.5)
+  gapExtension = 0.5)
-
+
-# Inspect the result. The aligned sequences should be clearly
+# Inspect the result. The aligned sequences should be clearly
-# homologous, and have (almost) no indels. The entire "pattern"
+# homologous, and have (almost) no indels. The entire "pattern"
-# sequence from QIYSAR ... to ... KPLFDF  should be matched
+# sequence from QIYSAR ... to ... KPLFDF  should be matched
-# with the "query". Is this correct?
+# with the "subject". Is this correct?
-Biostrings::writePairwiseAlignments(aliApses)
+Biostrings::writePairwiseAlignments(aliApses)
-
+
-# If this is correct, you can extract the matched sequence from
+# If this is correct, you can extract the matched sequence from
-# the alignment object. The syntax is a bit different from what
+# the alignment object. The syntax is a bit different from what
-# you have seen before: this is an "S4 object", not a list. No
+# you have seen before: this is an "S4 object", not a list. No
-# worries: as.character() returns a normal string.
+# worries: as.character() returns a normal string.
-as.character(aliApses@subject)
+as.character(aliApses@subject)
-
+
-# Now, what are the aligned start and end coordinates? You can read them from
+# Now, what are the aligned start and end coordinates? You can read them from
-# the output of writePairwiseAlignments(), or you can get them from the range of
+# the output of writePairwiseAlignments(), or you can get them from the range of
-# the match.
+# the match.
-
+
-str(aliApses@subject@range)
+str(aliApses@subject@range)
-
+
-# start is:
+# start is:
-aliApses@subject@range@start
+aliApses@subject@range@start
-
+
-# ... and end is:
+# ... and end is:
-aliApses@subject@range@start + aliApses@subject@range@width - 1
+aliApses@subject@range@start + aliApses@subject@range@width - 1
-
+
-
+
-# =    4  Update your database script  =========================================
+# =    4  Update your database script  =========================================
-
+
-
+
-# Since we have this feature defined now, we can create a feature annotation
+# Since we have this feature defined now, we can create a feature annotation
-# right away and store it in myDB.
+# right away and store it in myDB.
-
+
-# ==   4.1  Preparing an annotation file ...  ==================================
+# ==   4.1  Preparing an annotation file ...  ==================================
-#
+#
-# ===   4.1.1  If you HAVE NOT done the BIN-FUNC-Annotation unit
+# ===   4.1.1  If you HAVE NOT done the BIN-FUNC-Annotation unit
-#
+#
-#
+#
-#   You DON'T already have a file called "<MYSPE>-Annotations.json" in the
+#   You DON'T already have a file called "<MYSPE>-Annotations.json" in the
-#   ./myScripts/ directory:
+#   ./myScripts/ directory:
-#
+#
-#   - Make a copy of the file "./data/refAnnotations.json" and put it in your
+#   - Make a copy of the file "./data/refAnnotations.json" and put it in your
-#     myScripts/ directory.
+#     myScripts/ directory.
-#
+#
-#   - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.
+#   - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.
-#     if MYSPE is called "Crptycoccus neoformans", your file should be called
+#     if MYSPE is called "Cryptococcus neoformans", your file should be called
-#     "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is
+#     "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is
-#     "MBP1_CRYNE").
+#     "MBP1_CRYNE").
-#
+#
-#   - Open the file in the RStudio editor and delete all blocks for
+#   - Open the file in the RStudio editor and delete all blocks for
-#     the Mbp1 protein annotations except the first one.
+#     the Mbp1 protein annotations except the first one.
-#
+#
-#   - From that block, delete all lines except for the line that says:
+#   - From that block, delete all lines except for the line that says:
-#
+#
-# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"},
+# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"},
-#
+#
-#   - Then delete the comma at the end of the line (your file will just have
+#   - Then delete the comma at the end of the line (your file will just have
-#     this one annotation).
+#     this one annotation).
-#
+#
-#   - Edit that annotation: change MBP1_SACCE  to MBP1_<MYSPE> and change the
+#   - Edit that annotation: change MBP1_SACCE  to MBP1_<MYSPE> and change the
-#     "start" and "end" features to the coordinates you just discovered for the
+#     "start" and "end" features to the coordinates you just discovered for the
-#     APSES domain in your sequence.
+#     APSES domain in your sequence.
-#
+#
-#   - Save the file in your myScripts/ directory
+#   - Save the file in your myScripts/ directory
-#
+#
-##   - Validate your file online at https://jsonlint.com/
+#   - Validate your file online at https://jsonlint.com/
-#
+#
-#   - Update your "./myScripts/makeProteinDB.R" script to load your new
+#   - Update your "./myScripts/makeProteinDB.R" script to load your new
-#     annotation when you recreate the database. Open the script in the
+#     annotation when you recreate the database. Open the script in the
-#     RStudio editor, and add the following command at the end:
+#     RStudio editor, and add the following command at the end:
-#
+#
-#     myDB <- dbAddAnnotation(myDB,
+#     myDB <- dbAddAnnotation(myDB,
-#                 jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json"))
+#                 jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json"))
-#                                                 ^^^^^^^
+#                                                 ^^^^^^^
-#                                                edit this!
+#                                                edit this!
-#   - save and close the file.
+#   - save and close the file.
-#
+#
-# Then SKIP the next section.
+# Then SKIP the next section.
-#
+#
-#
+#
-# ===   4.1.2  If you HAVE done the BIN-FUNC-Annotation unit    
+# ===   4.1.2  If you HAVE done the BIN-FUNC-Annotation unit    
-#
+#
-#
+#
-#   You DO already have a file called "<MYSPE>-Annotations.json" in the
+#   You DO already have a file called "<MYSPE>-Annotations.json" in the
-#   ./myScripts/ directory:
+#   ./myScripts/ directory:
-#
+#
-#   - Open the file in the RStudio editor.
+#   - Open the file in the RStudio editor.
-#
+#
-#   - Below the last feature lines (but before the closing "]") add the
+#   - Below the last feature lines (but before the closing "]") add the
-#     following feature line (without the "#")
+#     following feature line (without the "#")
-#
+#
-# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"}
+# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"}
-#
+#
-#   - Edit that annotation: change MBP1_SACCE  to MBP1_<MYSPE> and change the
+#   - Edit that annotation: change MBP1_SACCE  to MBP1_<MYSPE> and change the
-#     "start" and "end" features to the coordinates you just discovered for the
+#     "start" and "end" features to the coordinates you just discovered for the
-#     APSES domain in your sequence.
+#     APSES domain in your sequence.
-#
+#
-#   - Add a comma after the preceding feature line.
+#   - Add a comma after the preceding feature line.
-#
+#
-#   - Save your file.
+#   - Save your file.
-#
+#
-#   - Validate your file online at https://jsonlint.com/
+#   - Validate your file online at https://jsonlint.com/
-#
+#
-#
+#
-# ==   4.2  Execute and Validate  ==============================================
+# ==   4.2  Execute and Validate  ==============================================
-#
+#
-#   - source() your database creation script:
+#   - source() your database creation script:
-#
+#
-#  source("./myScripts/makeProteinDB.R")
+#  source("./myScripts/makeProteinDB.R")
-#
+#
-#     This should run without errors or warnings. If it doesn't work and you
+#     This should run without errors or warnings. If it doesn't work and you
-#     can't figure out quickly what's happening, ask on the mailing list for
+#     can't figure out quickly what's happening, ask on the mailing list for
-#     help.
+#     help.
-#
+#
-#   - Confirm
+#   - Confirm
-#     The following commands should retrieve the correct start and end
+#     The following commands should retrieve the correct start and end
-#     coordinates and sequence of the MBP1_MYSPE APSES domain:
+#     coordinates and sequence of the MBP1_MYSPE APSES domain:
-
+
-sel <- which(myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = ""))
+sel <- which(myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = ""))
-
+
-(proID <- myDB$protein$ID[sel])
+(proID <- myDB$protein$ID[sel])
-(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
+(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
-(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
+(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
-                             myDB$annotation$featureID == ftrID])
+                             myDB$annotation$featureID == ftrID])
-(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
+(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
-(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
+(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
-(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
+(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
-                 start,
+                 start,
-                 end))
+                 end))
-
+
-
+
-# [END]
+# [END]
--- a/BIN-ALI-Similarity.R
+++ b/BIN-ALI-Similarity.R
@ -1,313 +1,313 @@
-# tocID <- "BIN-ALI-Similarity.R"
+# tocID <- "BIN-ALI-Similarity.R"
-#
+#
-# Purpose:  A Bioinformatics Course:
+# Purpose:  A Bioinformatics Course:
-#              R code accompanying the BIN-ALI-Similarity unit.
+#              R code accompanying the BIN-ALI-Similarity unit.
-#
+#
-# Version:  1.2
+# Version:  1.2
-#
+#
-# Date:     2017-10  -  2020-09
+# Date:     2017-10  -  2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions:
+# Versions:
-#           1.2    2020 Updates
+#           1.2    2020 Updates
-#           1.1    Change from require() to requireNamespace(),
+#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout
+#                      use <package>::<function>() idiom throughout
-#           1.0    Refactored for 2017; add aaindex, ternary plot.
+#           1.0    Refactored for 2017; add aaindex, ternary plot.
-#           0.1    First code copied from 2016 material.
+#           0.1    First code copied from 2016 material.
-#
+#
-#
+#
-# TODO:
+# TODO:
-#   Update ggtern:: ternary plot to use aacol dots under text
+#   Update ggtern:: ternary plot to use aacol dots under text
-#
+#
-#
+#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
+#
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
+# going on. That's not how it works ...
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC> 
+#TOC> 
-#TOC>   Section  Title                          Line
+#TOC>   Section  Title                          Line
-#TOC> ----------------------------------------------
+#TOC> ----------------------------------------------
-#TOC>   1        Amino Acid Properties            43
+#TOC>   1        Amino Acid Properties            43
-#TOC>   2        Mutation Data matrix            189
+#TOC>   2        Mutation Data matrix            189
-#TOC>   3        Background score                230
+#TOC>   3        Background score                230
-#TOC> 
+#TOC> 
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  Amino Acid Properties  ===============================================
+# =    1  Amino Acid Properties  ===============================================
-
+
-# A large collection of amino acid property tables is available via the seqinr
+# A large collection of amino acid property tables is available via the seqinr
-# package:
+# package:
-
+
-if (! requireNamespace("seqinr", quietly=TRUE)) {
+if (! requireNamespace("seqinr", quietly=TRUE)) {
-  install.packages("seqinr")
+  install.packages("seqinr")
-}
+}
-# Package information:
+# Package information:
-#  library(help = seqinr)       # basic information
+#  library(help = seqinr)       # basic information
-#  browseVignettes("seqinr")    # available vignettes
+#  browseVignettes("seqinr")    # available vignettes
-#  data(package = "seqinr")     # available datasets
+#  data(package = "seqinr")     # available datasets
-
+
-# A true Labor of Love has gone into the compilation of the seqinr "aaindex"
+# A true Labor of Love has gone into the compilation of the seqinr "aaindex"
-#  data:
+#  data:
-
+
-?aaindex
+?aaindex
-data(aaindex, package = "seqinr")  # load the aaindex list from the package
+data(aaindex, package = "seqinr")  # load the aaindex list from the package
-
+
-length(aaindex)
+length(aaindex)
-
+
-# Here are all the index descriptions
+# Here are all the index descriptions
-for (i in 1:length(aaindex)) {
+for (i in 1:length(aaindex)) {
-  cat(paste(i, ": ", aaindex[[i]]$D, "\n", sep=""))
+  cat(paste(i, ": ", aaindex[[i]]$D, "\n", sep=""))
-}
+}
-
+
-# It's a bit cumbersome to search through the descriptions ... here is a
+# It's a bit cumbersome to search through the descriptions ... here is a
-# function to make this easier:
+# function to make this easier:
-
+
-searchAAindex <- function(patt) {
+searchAAindex <- function(patt) {
-  # Searches the aaindex descriptions for regular expression "patt"
+  # Searches the aaindex descriptions for regular expression "patt"
-  # and prints index number and description.
+  # and prints index number and description.
-  hits <- which(sapply(aaindex, function(x) length(grep(patt, x$D)) > 0))
+  hits <- which(sapply(aaindex, function(x) length(grep(patt, x$D)) > 0))
-  for (i in seq_along(hits)) {
+  for (i in seq_along(hits)) {
-    cat(sprintf("%3d\t%s\n", hits[i], aaindex[[ hits[i] ]]$D))
+    cat(sprintf("%3d\t%s\n", hits[i], aaindex[[ hits[i] ]]$D))
-  }
+  }
-}
+}
-
+
-
+
-searchAAindex("free energy")          # Search for "free energy"
+searchAAindex("free energy")          # Search for "free energy"
-searchAAindex("(size)|(volume)")      # Search for "size" or "volume":
+searchAAindex("(size)|(volume)")      # Search for "size" or "volume":
-
+
-
+
-
+
-
+
-# Let's examine ...
+# Let's examine ...
-# ... a hydrophobicity index
+# ... a hydrophobicity index
-(Y <- aaindex[[528]][c("D", "I")])
+(Y <- aaindex[[528]][c("D", "I")])
-
+
-# ... a volume index
+# ... a volume index
-(V <- aaindex[[150]][c("D", "I")])
+(V <- aaindex[[150]][c("D", "I")])
-
+
-# ... and one of our own: side-chain pK values as reported by
+# ... and one of our own: side-chain pK values as reported by
-# Pace et al. (2009) JBC 284:13285-13289, with non-ionizable pKs set
+# Pace et al. (2009) JBC 284:13285-13289, with non-ionizable pKs set
-# to 7.4 (physiological pH)
+# to 7.4 (physiological pH)
-K <- list(I = c( 7.4,   # Ala
+K <- list(I = c( 7.4,   # Ala
-                12.3,   # Arg
+                12.3,   # Arg
-                 7.4,   # Asn
+                 7.4,   # Asn
-                 3.9,   # Asp
+                 3.9,   # Asp
-                 8.6,   # Cys
+                 8.6,   # Cys
-                 7.4,   # Gln
+                 7.4,   # Gln
-                 4.3,   # Glu
+                 4.3,   # Glu
-                 7.4,   # Gly
+                 7.4,   # Gly
-                 6.5,   # His
+                 6.5,   # His
-                 7.4,   # Ile
+                 7.4,   # Ile
-                 7.4,   # Leu
+                 7.4,   # Leu
-                10.4,   # Lys
+                10.4,   # Lys
-                 7.4,   # Met
+                 7.4,   # Met
-                 7.4,   # Phe
+                 7.4,   # Phe
-                 7.4,   # Pro
+                 7.4,   # Pro
-                 7.4,   # Ser
+                 7.4,   # Ser
-                 7.4,   # Thr
+                 7.4,   # Thr
-                 7.4,   # Trp
+                 7.4,   # Trp
-                 9.8,   # Tyr
+                 9.8,   # Tyr
-                 7.4))  # Val
+                 7.4))  # Val
-names(K$I) <- c("Ala","Arg","Asn","Asp","Cys","Gln","Glu","Gly","His","Ile",
+names(K$I) <- c("Ala","Arg","Asn","Asp","Cys","Gln","Glu","Gly","His","Ile",
-                "Leu","Lys","Met","Phe","Pro","Ser","Thr","Trp","Tyr","Val")
+                "Leu","Lys","Met","Phe","Pro","Ser","Thr","Trp","Tyr","Val")
-
+
-
+
-# Given these biophysical indices, how similar are the amino acids? We have three-dimensions of measures here. Scatterplots can only display two dimensions ...
+# Given these biophysical indices, how similar are the amino acids? We have three dimensions of measures here. Scatterplots can only display two dimensions ...
-
+
-# pull the names from Y$I, convert them to single letter code, and reorder the
+# pull the names from Y$I, convert them to single letter code, and reorder the
-# AACOLS palette accordingly ...
+# AACOLS palette accordingly ...
-aac <- AACOLS[toupper(seqinr::a(names(Y$I)))]
+aac <- AACOLS[toupper(seqinr::a(names(Y$I)))]
-
+
-plot(Y$I, V$I,
+plot(Y$I, V$I,
-     xlab = "hydrophobicity", ylab = "volume",
+     xlab = "hydrophobicity", ylab = "volume",
-     pch = 21,
+     pch = 21,
-     cex = 6,
+     cex = 6,
-     col = aac,
+     col = aac,
-     bg  = aac)
+     bg  = aac)
-text(Y$I, V$I, names(Y$I), cex = 0.8)
+text(Y$I, V$I, names(Y$I), cex = 0.8)
-
+
-plot(Y$I, K$I,
+plot(Y$I, K$I,
-     xlab = "hydrophobicity", ylab = "pK",
+     xlab = "hydrophobicity", ylab = "pK",
-     pch = 21,
+     pch = 21,
-     cex = 6,
+     cex = 6,
-     col = aac,
+     col = aac,
-     bg  = aac)
+     bg  = aac)
-text(Y$I, K$I, names(Y$I), cex = 0.8)
+text(Y$I, K$I, names(Y$I), cex = 0.8)
-
+
-# ... but how do we plot 3D data? Plotting into a 3D cube is possible, but such
+# ... but how do we plot 3D data? Plotting into a 3D cube is possible, but such
-# plots are in general unintuitive and hard to interpret. One alternative is a
+# plots are in general unintuitive and hard to interpret. One alternative is a
-# so-called "ternary plot":
+# so-called "ternary plot":
-
+
-if (! requireNamespace("ggtern", quietly=TRUE)) {
+if (! requireNamespace("ggtern", quietly=TRUE)) {
-  install.packages("ggtern")
+  install.packages("ggtern")
-}
+}
-# Package information:
+# Package information:
-#  library(help = ggtern)       # basic information
+#  library(help = ggtern)       # basic information
-#  browseVignettes("ggtern")    # available vignettes
+#  browseVignettes("ggtern")    # available vignettes
-#  data(package = "ggtern")     # available datasets
+#  data(package = "ggtern")     # available datasets
-
+
-
+
-
+
-# collect into data frame, normalize to (0.05, 0.95)
+# collect into data frame, normalize to (0.05, 0.95)
-myDat <- data.frame("phi" = 0.9*(((Y$I-min(Y$I))/(max(Y$I)-min(Y$I))))+0.05,
+myDat <- data.frame("phi" = 0.9*(((Y$I-min(Y$I))/(max(Y$I)-min(Y$I))))+0.05,
-                    "vol" = 0.9*(((V$I-min(V$I))/(max(V$I)-min(V$I))))+0.05,
+                    "vol" = 0.9*(((V$I-min(V$I))/(max(V$I)-min(V$I))))+0.05,
-                    "pK"  = 0.9*(((K$I-min(K$I))/(max(K$I)-min(K$I))))+0.05,
+                    "pK"  = 0.9*(((K$I-min(K$I))/(max(K$I)-min(K$I))))+0.05,
-                    stringsAsFactors = FALSE)
+                    stringsAsFactors = FALSE)
-rownames(myDat) <- names(Y$I)
+rownames(myDat) <- names(Y$I)
-
+
-ggtern::ggtern(data = myDat,
+ggtern::ggtern(data = myDat,
-               ggplot2::aes(x = vol,
+               ggplot2::aes(x = vol,
-                   y = phi,
+                   y = phi,
-                   z = pK,
+                   z = pK,
-                   label = rownames(myDat))) + ggplot2::geom_text()
+                   label = rownames(myDat))) + ggplot2::geom_text()
-
+
-# This results in a mapping of amino acids relative to each other that is
+# This results in a mapping of amino acids relative to each other that is
-# similar to the Venn diagram you have seen in the notes.
+# similar to the Venn diagram you have seen in the notes.
-
+
-# ... or we could use principal components analysis, to pull out the
+# ... or we could use principal components analysis, to pull out the
-# best projection of the three feature dimensions into two. (Done here without delving
+# best projection of the three feature dimensions into two. (Done here without delving
-# into the theory ...)
+# into the theory ...)
-prc <- prcomp(myDat)
+prc <- prcomp(myDat)
-plot(prc$x[,1], prc$x[,2], xlab="", ylab="", xaxt="n", yaxt="n",
+plot(prc$x[,1], prc$x[,2], xlab="", ylab="", xaxt="n", yaxt="n",
-     pch=19, cex=6, col=aad, cex.main=0.7,
+     pch=19, cex=6, col=aad, cex.main=0.7,
-     main="Principal Component Analysis of Amino Acid Features")
+     main="Principal Component Analysis of Amino Acid Features")
-text(prc$x[,1], prc$x[,2], names(Y$I), cex = 0.8, col="#00000088")
+text(prc$x[,1], prc$x[,2], names(Y$I), cex = 0.8, col="#00000088")
-
+
-# This matches the intuition rather well in that "similar" amino acids are close
+# This matches the intuition rather well in that "similar" amino acids are close
-# on the plot. But we can't interpret the distances in terms of just one of the
+# on the plot. But we can't interpret the distances in terms of just one of the
-# parameters. Whatever - nature has a different way to define similarity:
+# parameters. Whatever - nature has a different way to define similarity:
-# mutations to similar amino acids are less likely to break the protein.
+# mutations to similar amino acids are less likely to break the protein.
-
+
-
+
-# =    2  Mutation Data matrix  ================================================
+# =    2  Mutation Data matrix  ================================================
-
+
-# A mutation data matrix encodes all amino acid pairscores in a matrix.
+# A mutation data matrix encodes all amino acid pairscores in a matrix.
-
+
-# The Biostrings package contains the most common mutation data matrices.
+# The Biostrings package contains the most common mutation data matrices.
-
+
-if (! requireNamespace("BiocManager", quietly=TRUE)) {
+if (! requireNamespace("BiocManager", quietly=TRUE)) {
-  install.packages("BiocManager")
+  install.packages("BiocManager")
-}
+}
-if (! requireNamespace("Biostrings", quietly=TRUE)) {
+if (! requireNamespace("Biostrings", quietly=TRUE)) {
-  BiocManager::install("Biostrings")
+  BiocManager::install("Biostrings")
-}
+}
-# Package information:
+# Package information:
-#  library(help=Biostrings)       # basic information
+#  library(help=Biostrings)       # basic information
-#  browseVignettes("Biostrings")  # available vignettes
+#  browseVignettes("Biostrings")  # available vignettes
-#  data(package = "Biostrings")   # available datasets
+#  data(package = "Biostrings")   # available datasets
-
+
-# Let's attach the BLOSUM62 mutation data matrix from the package
+# Let's attach the BLOSUM62 mutation data matrix from the package
-data(BLOSUM62, package = "Biostrings")
+data(BLOSUM62, package = "Biostrings")
-
+
-# ... and see what it contains. (You've seen this matrix before.)
+# ... and see what it contains. (You've seen this matrix before.)
-BLOSUM62
+BLOSUM62
-
+
-# We can simply access values via the row/column names.
+# We can simply access values via the row/column names.
-# Identical amino acids have high scores ...
+# Identical amino acids have high scores ...
-BLOSUM62["H", "H"]   # Score for a pair of two histidines
+BLOSUM62["H", "H"]   # Score for a pair of two histidines
-BLOSUM62["S", "S"]   # Score for a pair of two serines
+BLOSUM62["S", "S"]   # Score for a pair of two serines
-
+
-# Similar amino acids have low positive scores ...
+# Similar amino acids have low positive scores ...
-BLOSUM62["L", "I"]   # Score for a leucine / isoleucine pair
+BLOSUM62["L", "I"]   # Score for a leucine / isoleucine pair
-BLOSUM62["F", "Y"]   # etc.
+BLOSUM62["F", "Y"]   # etc.
-
+
-# Dissimilar amino acids have negative scores ...
+# Dissimilar amino acids have negative scores ...
-BLOSUM62["L", "K"]   # Score for a leucine / lysine pair
+BLOSUM62["L", "K"]   # Score for a leucine / lysine pair
-BLOSUM62["Q", "P"]   # etc.
+BLOSUM62["Q", "P"]   # etc.
-
+
-
+
-BLOSUM62["R", "W"]   # the matrix is symmetric!
+BLOSUM62["R", "W"]   # the matrix is symmetric!
-BLOSUM62["W", "R"]
+BLOSUM62["W", "R"]
-
+
-
+
-# =    3  Background score  ====================================================
+# =    3  Background score  ====================================================
-
+
-# The mutation data matrix is designed to give high scores to homologous
+# The mutation data matrix is designed to give high scores to homologous
-# sequences, low scores to non-homologous sequences. What score on average
+# sequences, low scores to non-homologous sequences. What score on average
-# should we expect for a random sequence?
+# should we expect for a random sequence?
-
+
-# If we sample amino acid pairs at random, we will get a score that is the
+# If we sample amino acid pairs at random, we will get a score that is the
-# average of the individual pairscores in the matrix. Omitting the ambiguity
+# average of the individual pairscores in the matrix. Omitting the ambiguity
-# codes and the gap character:
+# codes and the gap character:
-
+
-sum(BLOSUM62[1:20, 1:20])/400
+sum(BLOSUM62[1:20, 1:20])/400
-
+
-# But that score could be higher for real sequences, for which the amino acid
+# But that score could be higher for real sequences, for which the amino acid
-# distribution is not random. For example membrane proteins have a large number
+# distribution is not random. For example membrane proteins have a large number
-# of hydrophobic residues - an alignment of unrelated proteins might produce
+# of hydrophobic residues - an alignment of unrelated proteins might produce
-# positive scores. And there are other proteins with biased amino acid
+# positive scores. And there are other proteins with biased amino acid
-# compositions, in particular proteins that interact with multiple other
+# compositions, in particular proteins that interact with multiple other
-# proteins. Let's test how this impacts the background score by comparing a
+# proteins. Let's test how this impacts the background score by comparing a
-# sequence with shuffled sequences. These have the same composition, but are
+# sequence with shuffled sequences. These have the same composition, but are
-# obviously not homologous. The data directory contains the FASTA file for the
+# obviously not homologous. The data directory contains the FASTA file for the
-# PDB ID 3FG7 - a villin headpiece structure with a large amount of
+# PDB ID 3FG7 - a villin headpiece structure with a large amount of
-# low-complexity amino acid sequence ...
+# low-complexity amino acid sequence ...
-
+
-aa3FG7 <- Biostrings::readAAStringSet("./data/3FG7.fa")[[1]]
+aa3FG7 <- Biostrings::readAAStringSet("./data/3FG7.fa")[[1]]
-
+
-# ... and the FASTA file for the E. coli OmpG outer membrane porin (PDB: 2F1C)
+# ... and the FASTA file for the E. coli OmpG outer membrane porin (PDB: 2F1C)
-# with an exceptionally high percentage of hydrophobic residues.
+# with an exceptionally high percentage of hydrophobic residues.
-
+
-aa2F1C <- Biostrings::readAAStringSet("./data/2F1C.fa")[[1]]
+aa2F1C <- Biostrings::readAAStringSet("./data/2F1C.fa")[[1]]
-
+
-# Here is a function that takes two sequences and
+# Here is a function that takes two sequences and
-# returns their average pairscore.
+# returns their average pairscore.
-
+
-averagePairScore <- function(a, b, MDM = BLOSUM62) {
+averagePairScore <- function(a, b, MDM = BLOSUM62) {
-  # Returns average pairscore of two sequences.
+  # Returns average pairscore of two sequences.
-  # Parameters:
+  # Parameters:
-  #    a, b   chr   amino acid sequence string
+  #    a, b   chr   amino acid sequence string
-  #    MDM          mutation data matrix. Default is BLOSUM62
+  #    MDM          mutation data matrix. Default is BLOSUM62
-  # Value:    num   average pairscore.
+  # Value:    num   average pairscore.
-  a <- unlist(strsplit(a, ""))
+  a <- unlist(strsplit(a, ""))
-  b <- unlist(strsplit(b, ""))
+  b <- unlist(strsplit(b, ""))
-  v <- 0
+  v <- 0
-  for (i in seq_along(a)) {
+  for (i in seq_along(a)) {
-    v <- v + MDM[ a[i], b[i] ]
+    v <- v + MDM[ a[i], b[i] ]
-  }
+  }
-  return(v / length(a))
+  return(v / length(a))
-}
+}
-
+
-orig3FG7 <- toString(aa3FG7)
+orig3FG7 <- toString(aa3FG7)
-orig2F1C <- toString(aa2F1C)
+orig2F1C <- toString(aa2F1C)
-N <- 1000
+N <- 1000
-scores3FG7 <- numeric(N)
+scores3FG7 <- numeric(N)
-scores2F1C <- numeric(N)
+scores2F1C <- numeric(N)
-for (i in 1:N) {
+for (i in 1:N) {
-  scores3FG7[i] <- averagePairScore(orig3FG7, toString(sample(aa3FG7)))
+  scores3FG7[i] <- averagePairScore(orig3FG7, toString(sample(aa3FG7)))
-  scores2F1C[i] <- averagePairScore(orig2F1C, toString(sample(aa2F1C)))
+  scores2F1C[i] <- averagePairScore(orig2F1C, toString(sample(aa2F1C)))
-}
+}
-
+
-# Plot the distributions
+# Plot the distributions
-hist(scores3FG7,
+hist(scores3FG7,
-     col="#5599EE33",
+     col="#5599EE33",
-     breaks = seq(-1.5, 0, by=0.1),
+     breaks = seq(-1.5, 0, by=0.1),
-     main = "Pairscores for randomly shuffled sequences",
+     main = "Pairscores for randomly shuffled sequences",
-     xlab = "Average pairscore from BLOSUM 62")
+     xlab = "Average pairscore from BLOSUM 62")
-hist(scores2F1C,
+hist(scores2F1C,
-     col="#55EE9933",
+     col="#55EE9933",
-     breaks = seq(-1.5, 0, by=0.1),
+     breaks = seq(-1.5, 0, by=0.1),
-     add = TRUE)
+     add = TRUE)
-abline(v = sum(BLOSUM62[1:20, 1:20])/400, col = "firebrick", lwd = 2)
+abline(v = sum(BLOSUM62[1:20, 1:20])/400, col = "firebrick", lwd = 2)
-legend('topright',
+legend('topright',
-       c("3FG7 (villin)", "2F1C (OmpG)"),
+       c("3FG7 (villin)", "2F1C (OmpG)"),
-       fill = c("#5599EE33", "#55EE9933"), bty = 'n',
+       fill = c("#5599EE33", "#55EE9933"), bty = 'n',
-       inset = 0.1)
+       inset = 0.1)
-
+
-# This is an important result: even though we have shuffled significantly biased
+# This is an important result: even though we have shuffled significantly biased
-# sequences, and the average scores trend above the average of the mutation data
+# sequences, and the average scores trend above the average of the mutation data
-# matrix, the average scores still remain comfortably below zero. This means
+# matrix, the average scores still remain comfortably below zero. This means
-# that we can't (in general) improve a high-scoring alignment by simply
+# that we can't (in general) improve a high-scoring alignment by simply
-# extending it with randomly matched residues. We will only improve the score if
+# extending it with randomly matched residues. We will only improve the score if
-# the similarity of newly added residues is larger than what we expect to get by
+# the similarity of newly added residues is larger than what we expect to get by
-# random chance!
+# random chance!
-
+
-
+
-# [END]
+# [END]
--- a/BIN-Data_integration.R
+++ b/BIN-Data_integration.R
@ -1,216 +1,216 @@
-# tocID <- "BIN-Data_integration.R"
+# tocID <- "BIN-Data_integration.R"
-#
+#
-# Purpose:  A Bioinformatics Course:
+# Purpose:  A Bioinformatics Course:
-#              R code accompanying the BIN-Data_integration unit.
+#              R code accompanying the BIN-Data_integration unit.
-#
+#
-# Version:  1.2
+# Version:  1.2
-#
+#
-# Date:     2018-10  -  2020-09
+# Date:     2018-10  -  2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions:
+# Versions:
-#           1.2    2020 Maintenance and updates
+#           1.2    2020 Maintenance and updates
-#           1.1    Change from require() to requireNamespace(),
+#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout
+#                      use <package>::<function>() idiom throughout
-#           1.0.1  Bugfix: UniProt ID Mapping service API change
+#           1.0.1  Bugfix: UniProt ID Mapping service API change
-#           1.0    First live version
+#           1.0    First live version
-#
+#
-#
+#
-# TODO:
+# TODO:
-#           Develop a fungi-specific BioMart example.
+#           Develop a fungi-specific BioMart example.
-#           (cf.
+#           (cf.
-# https://cran.r-project.org/web/packages/biomartr/vignettes/Functional_Annotation.html )
+# https://cran.r-project.org/web/packages/biomartr/vignettes/Functional_Annotation.html )
-#
+#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
+#
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
+# going on. That's not how it works ...
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC>
+#TOC>
-#TOC>   Section  Title                             Line
+#TOC>   Section  Title                             Line
-#TOC> -------------------------------------------------
+#TOC> -------------------------------------------------
-#TOC>   1        Identifier mapping                  42
+#TOC>   1        Identifier mapping                  42
-#TOC>   2        Cross-referencing tables           165
+#TOC>   2        Cross-referencing tables           165
-#TOC>
+#TOC>
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  Identifier mapping  ==================================================
+# =    1  Identifier mapping  ==================================================
-
+
-# UniProt provides a well-designed ID mapping tool that can be accessed
+# UniProt provides a well-designed ID mapping tool that can be accessed
-# online at     http://www.uniprot.org/mapping/
+# online at     http://www.uniprot.org/mapping/
-#
+#
-# Here we will use the UniProt Web API for this tool to map identifiers. The
+# Here we will use the UniProt Web API for this tool to map identifiers. The
-# UniProt ID mapping service supports a "RESTful API": responses can be obtained
+# UniProt ID mapping service supports a "RESTful API": responses can be obtained
-# simply via a Web browser's request. Such requests are commonly sent via the
+# simply via a Web browser's request. Such requests are commonly sent via the
-# GET or POST verbs that a Webserver responds to, when a client asks for data.
+# GET or POST verbs that a Webserver responds to, when a client asks for data.
-# GET requests are visible in the URL of the request; POST requests are not
+# GET requests are visible in the URL of the request; POST requests are not
-# directly visible, they are commonly used to send the contents of forms, or
+# directly visible, they are commonly used to send the contents of forms, or
-# when transmitting larger, complex data items. The UniProt ID mapping service
+# when transmitting larger, complex data items. The UniProt ID mapping service
-# can accept long lists of IDs, thus using the POST mechanism makes sense. GET()
+# can accept long lists of IDs, thus using the POST mechanism makes sense. GET()
-# and  POST() functions are part of the httr package.
+# and  POST() functions are part of the httr package.
-
+
-# To begin, we load  httr, which supports sending and receiving data via the
+# To begin, we load  httr, which supports sending and receiving data via the
-# http protocol, just like a Web browser.
+# http protocol, just like a Web browser.
-if (! requireNamespace("httr", quietly=TRUE)) {
+if (! requireNamespace("httr", quietly=TRUE)) {
-  install.packages("httr")
+  install.packages("httr")
-}
+}
-# Package information:
+# Package information:
-#  library(help = httr)       # basic information
+#  library(help = httr)       # basic information
-#  browseVignettes("httr")    # available vignettes
+#  browseVignettes("httr")    # available vignettes
-#  data(package = "httr")     # available datasets
+#  data(package = "httr")     # available datasets
-
+
-
+
-# We will walk through the process with the refSeqID
+# We will walk through the process with the refSeqID
-# of yeast Mbp1 and Swi4, and we will also enter a dummy ID to check what
+# of yeast Mbp1 and Swi4, and we will also enter a dummy ID to check what
-# happens if the ID can't be mapped:
+# happens if the ID can't be mapped:
-myQueryIDs <- "NP_010227 NP_00000 NP_011036"
+myQueryIDs <- "NP_010227 NP_00000 NP_011036"
-
+
-
+
-# The UniProt ID mapping service API is very straightforward to use: just define
+# The UniProt ID mapping service API is very straightforward to use: just define
-# the URL of the server and send a list of items labelled as "query" in the body
+# the URL of the server and send a list of items labelled as "query" in the body
-# of the request. GET() and POST() are functions from httr.
+# of the request. GET() and POST() are functions from httr.
-
+
-# Note. A recent bug in the interaction between the server expectations and the
+# Note. A recent bug in the interaction between the server expectations and the
-# curl client libraries requires the following initialization
+# curl client libraries requires the following initialization
-httr::set_config(httr::config(http_version = 0))
+httr::set_config(httr::config(http_version = 0))
-# cf. https://stackoverflow.com/questions/44610845/stream-error-in-the-http-2-framing-layer-bigrquery-commands-error-in-r-studio-b
+# cf. https://stackoverflow.com/questions/44610845/stream-error-in-the-http-2-framing-layer-bigrquery-commands-error-in-r-studio-b
-
+
-
+
-URL <- "https://www.uniprot.org/mapping/"
+URL <- "https://www.uniprot.org/mapping/"
-response <- httr::POST(URL,
+response <- httr::POST(URL,
-                       body = list(from = "P_REFSEQ_AC",   # Refseq Protein
+                       body = list(from = "P_REFSEQ_AC",   # Refseq Protein
-                                   to = "ACC",             # UniProt ID
+                                   to = "ACC",             # UniProt ID
-                                   format = "tab",
+                                   format = "tab",
-                                   query = myQueryIDs))
+                                   query = myQueryIDs))
-
+
-cat(httr::content(response))
+cat(httr::content(response))
-
+
-# We need to check the status code - if it is not 200, an error occurred and we
+# We need to check the status code - if it is not 200, an error occurred and we
-# can't process the result:
+# can't process the result:
-httr::status_code(response)
+httr::status_code(response)
-
+
-# If the query is successful, tabbed text is returned. We can assign that to a
+# If the query is successful, tabbed text is returned. We can assign that to a
-# data frame. Note that we use textConnection() to read data directly from a char object, which can go in the spot where read.delim() expects a file-name argument.
+# data frame. Note that we use textConnection() to read data directly from a char object, which can go in the spot where read.delim() expects a file-name argument.
-
+
-myMappedIDs <- read.delim(file = textConnection(httr::content(response)),
+myMappedIDs <- read.delim(file = textConnection(httr::content(response)),
-                          sep = "\t",
+                          sep = "\t",
-                          stringsAsFactors = FALSE)
+                          stringsAsFactors = FALSE)
-myMappedIDs
+myMappedIDs
-
+
-# If this works as expected, you should see:
+# If this works as expected, you should see:
-#        From     To
+#        From     To
-# 1 NP_010227 P39678
+# 1 NP_010227 P39678
-# 2 NP_011036 P25302
+# 2 NP_011036 P25302
-#
+#
-# ... and note that there are only two entries, because nothing was returned
+# ... and note that there are only two entries, because nothing was returned
-# for the dummy "RefSeq ID" NP_00000
+# for the dummy "RefSeq ID" NP_00000
-
+
-# If the query can't be fulfilled because of a problem with the server, a
+# If the query can't be fulfilled because of a problem with the server, a
-# WebPage is returned. But the server status is also returned and we can check
+# WebPage is returned. But the server status is also returned and we can check
-# the status code. I have lately gotten many "503" status codes: Server Not
+# the status code. I have lately gotten many "503" status codes: Server Not
-# Available...
+# Available...
-
+
-# We wrap this into a function:
+# We wrap this into a function:
-
+
-myIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
+myIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
-  # Use UniProt ID mapping service to map one or more IDs
+  # Use UniProt ID mapping service to map one or more IDs
-  # Parameters:
+  # Parameters:
-  #    s  char  A string of separated IDs
+  #    s  char  A string of separated IDs
-  #    mapFrom  char  the database in which the IDs in s are valid. Default
+  #    mapFrom  char  the database in which the IDs in s are valid. Default
-  #                     is RefSeq protein
+  #                     is RefSeq protein
-  #    mapTo    char  the database in which the target IDs are valid. Default
+  #    mapTo    char  the database in which the target IDs are valid. Default
-  #                     is UniProtKB
+  #                     is UniProtKB
-  # Value
+  # Value
-  #    a data frame of mapped IDs, with column names From and To, or an
+  #    a data frame of mapped IDs, with column names From and To, or an
-  #    empty data frame if the mapping was unsuccessful. No rows are returned
+  #    empty data frame if the mapping was unsuccessful. No rows are returned
-  #    for IDs that are not mapped.
+  #    for IDs that are not mapped.
-
+
-  # Initialize curl
+  # Initialize curl
-  httr::set_config(httr::config(http_version = 0))
+  httr::set_config(httr::config(http_version = 0))
-
+
-  URL <- "https://www.uniprot.org/uploadlists/"
+  URL <- "https://www.uniprot.org/uploadlists/"
-  response <- httr::POST(URL,
+  response <- httr::POST(URL,
-                         body = list(from = mapFrom,
+                         body = list(from = mapFrom,
-                                     to = mapTo,
+                                     to = mapTo,
-                                     format = "tab",
+                                     format = "tab",
-                                     query = s))
+                                     query = s))
-
+
-  if (httr::status_code(response) == 200) { # 200: oK
+  if (httr::status_code(response) == 200) { # 200: oK
-    myMap <- read.delim(file = textConnection(httr::content(response)),
+    myMap <- read.delim(file = textConnection(httr::content(response)),
-                        sep = "\t",
+                        sep = "\t",
-                        stringsAsFactors = FALSE)
+                        stringsAsFactors = FALSE)
-    colnames(myMap) <- c("From", "To")
+    colnames(myMap) <- c("From", "To")
-  } else {
+  } else {
-    myMap <- data.frame()
+    myMap <- data.frame()
-    warning(paste("No uniProt ID mapping returned:",
+    warning(paste("No uniProt ID mapping returned:",
-                  "server sent status",
+                  "server sent status",
-                  httr::status_code(response)))
+                  httr::status_code(response)))
-  }
+  }
-
+
-  return(myMap)
+  return(myMap)
-}
+}
-
+
-# Try it out ...
+# Try it out ...
-myIDmap("NP_010227 NP_011036 NP_012881 NP_013729 NP_012165")
+myIDmap("NP_010227 NP_011036 NP_012881 NP_013729 NP_012165")
-
+
-# A function UniProtIDmap() is in the ABC-dbUtilities.R script and it is loaded
+# A function UniProtIDmap() is in the ABC-dbUtilities.R script and it is loaded
-# into your workspace on startup.
+# into your workspace on startup.
-
+
-
+
-# =    2  Cross-referencing tables  ============================================
+# =    2  Cross-referencing tables  ============================================
-
+
-# Sometimes we get the IDs we need to map in a large table, e.g. from a list of
+# Sometimes we get the IDs we need to map in a large table, e.g. from a list of
-# genes in a model organism database such as SGD, or from the Human Gene
+# genes in a model organism database such as SGD, or from the Human Gene
-# Nomenclature Committee. How do we map one set of identifiers to another one?
+# Nomenclature Committee. How do we map one set of identifiers to another one?
-
+
-# The function to use is match().
+# The function to use is match().
-# Here is a tiny set of identifiers taken from a much larger table to
+# Here is a tiny set of identifiers taken from a much larger table to
-# illustrate the principle:
+# illustrate the principle:
-#
+#
-
+
-myIDs <- data.frame(uID =   c("P38903", "P31383", "P47177", "P47096", "Q07747",
+myIDs <- data.frame(uID =   c("P38903", "P31383", "P47177", "P47096", "Q07747",
-                              "Q08641", "P47129", "P52910", "P00330", "P81450"),
+                              "Q08641", "P47129", "P52910", "P00330", "P81450"),
-                    name =  c("2A5D", "2AAA", "2NDP", "3HAO", "AAD4",
+                    name =  c("2A5D", "2AAA", "2NDP", "3HAO", "AAD4",
-                              "AB140", "ACF4", "ACS2", "ADH1", "ATP18"),
+                              "AB140", "ACF4", "ACS2", "ADH1", "ATP18"),
-                    refID = c("NP_014657", "NP_009386",
+                    refID = c("NP_014657", "NP_009386",
-                              "NP_012683", "NP_012559",
+                              "NP_012683", "NP_012559",
-                              "NP_010038", "NP_014882",
+                              "NP_010038", "NP_014882",
-                              "NP_012616", "NP_013254",
+                              "NP_012616", "NP_013254",
-                              "NP_014555", "NP_013629"))
+                              "NP_014555", "NP_013629"))
-
+
-myIDs
+myIDs
-
+
-# Say we want to map "NP_010038", "NP_999999" (a dummy ID that is not in the
+# Say we want to map "NP_010038", "NP_999999" (a dummy ID that is not in the
-# table), and "NP_013629", in that order to their gene names.
+# table), and "NP_013629", in that order to their gene names.
-myQuery <- c("NP_010038", "NP_999999", "NP_013629")
+myQuery <- c("NP_010038", "NP_999999", "NP_013629")
-
+
-# %in% will only tell us if these IDs are present in the table:
+# %in% will only tell us if these IDs are present in the table:
-myQuery %in% myIDs$refID
+myQuery %in% myIDs$refID
-
+
-# ... but not where they are located. But match() does what we need here:
+# ... but not where they are located. But match() does what we need here:
-match(myQuery, myIDs$refID)
+match(myQuery, myIDs$refID)
-
+
-# ... and we can use the result to subset the column that we want to map to:
+# ... and we can use the result to subset the column that we want to map to:
-myIDs$name[match(myQuery, myIDs$refID)]
+myIDs$name[match(myQuery, myIDs$refID)]
-
+
-# Note that the output preserves the NA - i.e. the length of the mapped
+# Note that the output preserves the NA - i.e. the length of the mapped
-# values is exactly the same as the length of the query.
+# values is exactly the same as the length of the query.
-
+
-# task: map the three genes to their UniProt Identifier.
+# task: map the three genes to their UniProt Identifier.
-
+
-
+
-#
+#
-# Note: if you want to do very many queries in very large tables, use the
+# Note: if you want to do very many queries in very large tables, use the
-# fmatch() function in the "fastmatch" package for a considerable
+# fmatch() function in the "fastmatch" package for a considerable
-# speedup.
+# speedup.
-
+
-
+
-
+
-
+
-# [END]
+# [END]
--- a/BIN-FUNC-Domain_annotation.R
+++ b/BIN-FUNC-Domain_annotation.R
@ -1,435 +1,435 @@
-# tocID <- "BIN-FUNC-Domain_annotation.R"
+# tocID <- "BIN-FUNC-Domain_annotation.R"
-#
+#
-# Purpose:  A Bioinformatics Course:
+# Purpose:  A Bioinformatics Course:
-#              R code accompanying the BIN-FUNC-Domain_annotation unit.
+#              R code accompanying the BIN-FUNC-Domain_annotation unit.
-#
+#
-# ==============================================================================
+# ==============================================================================
-# Version:  1.4
+# Version:  1.4
-#
+#
-# Date:     2017-11  -  2020-10
+# Date:     2017-11  -  2020-10
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions:
+# Versions:
-#           1.4    Add code for shared data import from the Wiki
+#           1.4    Add code for shared data import from the Wiki
-#           1.3    Add code for database export to JSON and instructions
+#           1.3    Add code for database export to JSON and instructions
-#                  for uploading annotations to the Public Student Wiki page
+#                  for uploading annotations to the Public Student Wiki page
-#           1.2    Consistently: data in ./myScripts/ ;
+#           1.2    Consistently: data in ./myScripts/ ;
-#                    begin SHARING DATA section
+#                    begin SHARING DATA section
-#           1.1    2020 Updates
+#           1.1    2020 Updates
-#           1.0    Live version 2017
+#           1.0    Live version 2017
-#           0.1    First code copied from 2016 material.
+#           0.1    First code copied from 2016 material.
-#
+#
-# TODO:
+# TODO:
-#           Put the domain plot into a function
+#           Put the domain plot into a function
-#
+#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
+#
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
+# going on. That's not how it works ...
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC> 
+#TOC> 
-#TOC>   Section  Title                                                 Line
+#TOC>   Section  Title                                                 Line
-#TOC> ---------------------------------------------------------------------
+#TOC> ---------------------------------------------------------------------
-#TOC>   1        Update your database script                             51
+#TOC>   1        Update your database script                             51
-#TOC>   1.1        Preparing an annotation file ...                      58
+#TOC>   1.1        Preparing an annotation file ...                      58
-#TOC>   1.1.1          BEFORE  "BIN-ALI-Optimal_sequence_alignment"      61
+#TOC>   1.1.1          BEFORE  "BIN-ALI-Optimal_sequence_alignment"      61
-#TOC>   1.1.2          AFTER "BIN-ALI-Optimal_sequence_alignment"       109
+#TOC>   1.1.2          AFTER "BIN-ALI-Optimal_sequence_alignment"       109
-#TOC>   1.2        Execute and Validate                                 136
+#TOC>   1.2        Execute and Validate                                 136
-#TOC>   2        Plot Annotations                                       161
+#TOC>   2        Plot Annotations                                       161
-#TOC>   3        SHARING DATA                                           287
+#TOC>   3        SHARING DATA                                           287
-#TOC>   3.1        Post MBP1_MYSPE as JSON data                         303
+#TOC>   3.1        Post MBP1_MYSPE as JSON data                         303
-#TOC>   3.2        Import shared MBP1_MYSPE from the Wiki               326
+#TOC>   3.2        Import shared MBP1_MYSPE from the Wiki               326
-#TOC> 
+#TOC> 
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  Update your database script  =========================================
+# =    1  Update your database script  =========================================
-
+
-
+
-# Since you have recorded domain features at the SMART database, we can store
+# Since you have recorded domain features at the SMART database, we can store
-# the feature annotations in myDB ...
+# the feature annotations in myDB ...
-
+
-
+
-# ==   1.1  Preparing an annotation file ...  ==================================
+# ==   1.1  Preparing an annotation file ...  ==================================
-
+
-
+
-# ===   1.1.1  BEFORE  "BIN-ALI-Optimal_sequence_alignment"
+# ===   1.1.1  BEFORE  "BIN-ALI-Optimal_sequence_alignment"
-#
+#
-#   IF YOU HAVE NOT YET COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT:
+#   IF YOU HAVE NOT YET COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT:
-#
+#
-#   You DON'T already have a file called "<MYSPE>-Annotations.json" in the
+#   You DON'T already have a file called "<MYSPE>-Annotations.json" in the
-#   ./myScripts/ directory:
+#   ./myScripts/ directory:
-#
+#
-#   - Make a copy of the file "./data/refAnnotations.json" and put it in your
+#   - Make a copy of the file "./data/refAnnotations.json" and put it in your
-#     myScripts/ directory.
+#     myScripts/ directory.
-#
+#
-#   - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.
+#   - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.
-#     if MYSPE is called "Cryptococcus neoformans", your file should be called
+#     if MYSPE is called "Cryptococcus neoformans", your file should be called
-#     "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is
+#     "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is
-#     "MBP1_CRYNE").
+#     "MBP1_CRYNE").
-#
+#
-#   - Open the file in the RStudio editor and delete all blocks for
+#   - Open the file in the RStudio editor and delete all blocks for
-#     the Mbp1 protein annotations except the first one.
+#     the Mbp1 protein annotations except the first one.
-#
+#
-#   - From that block, delete all lines that have annotations you did not
+#   - From that block, delete all lines that have annotations you did not
-#     find in SMART for MBP1_MYSPE.
+#     find in SMART for MBP1_MYSPE.
-#
+#
-#   - Make enough copies of the "Ankyrin fold" and "low complexity" region
+#   - Make enough copies of the "Ankyrin fold" and "low complexity" region
-#     lines to have a line for each feature you found.
+#     lines to have a line for each feature you found.
-#
+#
-#   - Then delete the comma at the end of the last line.
+#   - Then delete the comma at the end of the last line.
-#
+#
-#   - Edit the annotations: change MBP1_SACCE  to MBP1_<MYSPE> everywhere
+#   - Edit the annotations: change MBP1_SACCE  to MBP1_<MYSPE> everywhere
-#     and change the "start" and "end" features to the coordinates you
+#     and change the "start" and "end" features to the coordinates you
-#     recorded in the SMART database.
+#     recorded in the SMART database.
-#
+#
-#   - Save your file in the ./myScripts/ folder.
+#   - Save your file in the ./myScripts/ folder.
-#
+#
-#   - Validate your file online at https://jsonlint.com/
+#   - Validate your file online at https://jsonlint.com/
-#
+#
-#   - Update your "./myScripts/makeProteinDB.R" script to load your new
+#   - Update your "./myScripts/makeProteinDB.R" script to load your new
-#     annotation when you recreate the database. Open the script in the
+#     annotation when you recreate the database. Open the script in the
-#     RStudio editor, and add the following command at the end:
+#     RStudio editor, and add the following command at the end:
-#
+#
-#     myDB <- dbAddAnnotation(myDB,
+#     myDB <- dbAddAnnotation(myDB,
-#         jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json"))
+#         jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json"))
-#                                         ^^^^^^^
+#                                         ^^^^^^^
-#                                        edit this!
+#                                        edit this!
-#
+#
-#   - save and close the file.
+#   - save and close the file.
-#
+#
-# Then SKIP the next section.
+# Then SKIP the next section.
-#
+#
-#
+#
-# ===   1.1.2  AFTER "BIN-ALI-Optimal_sequence_alignment"  
+# ===   1.1.2  AFTER "BIN-ALI-Optimal_sequence_alignment"  
-#
+#
-#   IF YOU HAVE ALREADY COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT:
+#   IF YOU HAVE ALREADY COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT:
-#
+#
-#   You SHOULD have a file called "<MYSPE>-Annotations.json" in the
+#   You SHOULD have a file called "<MYSPE>-Annotations.json" in the
-#  ./myScripts/ directory:
+#  ./myScripts/ directory:
-#
+#
-#   - Open the file in the RStudio editor.
+#   - Open the file in the RStudio editor.
-#
+#
-#   - Make as many copies of the "APSES fold" line as you have found
+#   - Make as many copies of the "APSES fold" line as you have found
-#     features in SMART.
+#     features in SMART.
-#
+#
-#   - Add a comma after every line except for the last one
+#   - Add a comma after every line except for the last one
-#
+#
-#   - Edit the annotations but include only features that are in the
+#   - Edit the annotations but include only features that are in the
-#     myDB$feature table. Check which features are in the database by executing
+#     myDB$feature table. Check which features are in the database by executing
-#
+#
-#        myDB$feature$name
+#        myDB$feature$name
-#
+#
-#   - Update the "start" and "end" coordinates for each feature to the
+#   - Update the "start" and "end" coordinates for each feature to the
-#     values you found.
+#     values you found.
-#
+#
-#   - Save your file.
+#   - Save your file.
-#
+#
-#   - Validate your file online at https://jsonlint.com/
+#   - Validate your file online at https://jsonlint.com/
-#
+#
-#
+#
-# ==   1.2  Execute and Validate  ==============================================
+# ==   1.2  Execute and Validate  ==============================================
-#
+#
-#   - source() your database creation script:
+#   - source() your database creation script:
-#
+#
-#     source("./myScripts/makeProteinDB.R")
+#     source("./myScripts/makeProteinDB.R")
-#
+#
-#     This should run without errors or warnings. If it doesn't work and you
+#     This should run without errors or warnings. If it doesn't work and you
-#     can't figure out quickly what's happening, ask for help on the
+#     can't figure out quickly what's happening, ask for help on the
-#     Discussion Board.
+#     Discussion Board.
-#
+#
-#   - Confirm
+#   - Confirm
-#     The following commands should retrieve all of the features that have been
+#     The following commands should retrieve all of the features that have been
-#     annotated for MBP1_MYSPE
+#     annotated for MBP1_MYSPE
-
+
-sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
+sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
-
+
-(proID  <- myDB$protein$ID[sel])
+(proID  <- myDB$protein$ID[sel])
-(fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID == proID])
+(fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID == proID])
-(ftrIDs <- unique(myDB$annotation$featureID[fanIDs]))
+(ftrIDs <- unique(myDB$annotation$featureID[fanIDs]))
-myDB$feature$name[ftrIDs] # This should list ALL of your annotated features
+myDB$feature$name[ftrIDs] # This should list ALL of your annotated features
-                          # (once). If not, consider what could have gone wrong
+                          # (once). If not, consider what could have gone wrong
-                          # and ask on the list if you have difficulties fixing
+                          # and ask on the list if you have difficulties fixing
-                          # it.
+                          # it.
-
+
-
+
-# =    2  Plot Annotations  ====================================================
+# =    2  Plot Annotations  ====================================================
-
+
-# In this section we will plot domain annotations as colored rectangles on a
+# In this section we will plot domain annotations as colored rectangles on a
-# sequence, as an example of using the R plotting system for generic, data
+# sequence, as an example of using the R plotting system for generic, data
-# driven images.
+# driven images.
-
+
-# We need a small utility function that draws the annotation boxes on a
+# We need a small utility function that draws the annotation boxes on a
-# representation of sequence. It should accept the start and end coordinates,
+# representation of sequence. It should accept the start and end coordinates,
-# the y value where it should be plotted and the color of the box, and plot a
+# the y value where it should be plotted and the color of the box, and plot a
-# rectangle using R's rect() function.
+# rectangle using R's rect() function.
-
+
-drawBox <- function(xStart, xEnd, y, myCol, DELTA = 0.2) {
+drawBox <- function(xStart, xEnd, y, myCol, DELTA = 0.2) {
-  # Draw a box from xStart to xEnd at y, filled with colour myCol
+  # Draw a box from xStart to xEnd at y, filled with colour myCol
-  # The height of the box is y +- DELTA
+  # The height of the box is y +- DELTA
-  rect(xStart, (y - DELTA), xEnd, (y + DELTA),
+  rect(xStart, (y - DELTA), xEnd, (y + DELTA),
-       border = "black", col = myCol)
+       border = "black", col = myCol)
-}
+}
-
+
-# test this:
+# test this:
-plot(c(-1.5, 1.5), c(0, 0), type = "l")
+plot(c(-1.5, 1.5), c(0, 0), type = "l")
-drawBox(-1, 1, 0.0, "peachpuff")
+drawBox(-1, 1, 0.0, "peachpuff")
-
+
-# Next, we define a function to plot annotations for one protein: the name of
+# Next, we define a function to plot annotations for one protein: the name of
-# the protein, a horizontal grey line for its length, and all of its features.
+# the protein, a horizontal grey line for its length, and all of its features.
-
+
-plotProtein <- function(DB, name, y) {
+plotProtein <- function(DB, name, y) {
-  # DB: protein database
+  # DB: protein database
-  # name: the name of the protein in the database.
+  # name: the name of the protein in the database.
-  # y: height where to draw the plot
+  # y: height where to draw the plot
-  #
+  #
-  # Define colors: we create a vector of color values, one for
+  # Define colors: we create a vector of color values, one for
-  # each feature, and we give it names of the feature ID. Then we
+  # each feature, and we give it names of the feature ID. Then we
-  # can easily get the color value from the feature name.
+  # can easily get the color value from the feature name.
-  # A: make a vector of color values. The syntax may appear unusual -
+  # A: make a vector of color values. The syntax may appear unusual -
-  #    colorRampPalette() returns a function, and we simply append
+  #    colorRampPalette() returns a function, and we simply append
-  #    the parameter (number-of-features) without assigning the function
+  #    the parameter (number-of-features) without assigning the function
-  #    to its own variable name.
+  #    to its own variable name.
-  ftrCol <- colorRampPalette(c("#f2003c", "#F0A200", "#f0ea00",
+  ftrCol <- colorRampPalette(c("#f2003c", "#F0A200", "#f0ea00",
-                               "#62C923", "#0A9A9B", "#1958C3",
+                               "#62C923", "#0A9A9B", "#1958C3",
-                               "#8000D3", "#D0007F"),
+                               "#8000D3", "#D0007F"),
-                             space="Lab",
+                             space="Lab",
-                             interpolate="linear")(nrow(DB$feature))
+                             interpolate="linear")(nrow(DB$feature))
-  # B: Features may overlap, so we make the colors transparent by setting
+  # B: Features may overlap, so we make the colors transparent by setting
-  #    their "alpha channel" to 1/3  (hex: 55)
+  #    their "alpha channel" to 1/3  (hex: 55)
-  ftrCol <- paste0(ftrCol, "55")
+  ftrCol <- paste0(ftrCol, "55")
-  # C: we assign names
+  # C: we assign names
-  names(ftrCol) <- DB$feature$ID
+  names(ftrCol) <- DB$feature$ID
-  # E.g. color for the third feature: ftrCol[ DB$feature$ID[3] ]
+  # E.g. color for the third feature: ftrCol[ DB$feature$ID[3] ]
-
+
-  # find the row-index of the protein ID in the protein table of DB
+  # find the row-index of the protein ID in the protein table of DB
-  iProtein <- which(DB$protein$name == name)
+  iProtein <- which(DB$protein$name == name)
-
+
-  # write the name of the protein
+  # write the name of the protein
-  text(-30, y, adj=1, labels=name, cex=0.75 )
+  text(-30, y, adj=1, labels=name, cex=0.75 )
-
+
-  #draw a line from 0 to nchar(sequence-of-the-protein)
+  #draw a line from 0 to nchar(sequence-of-the-protein)
-  lines(c(0, nchar(DB$protein$sequence[iProtein])), c(y, y),
+  lines(c(0, nchar(DB$protein$sequence[iProtein])), c(y, y),
-        lwd=3, col="#999999")
+        lwd=3, col="#999999")
-
+
-  # get the rows of feature annotations for the protein
+  # get the rows of feature annotations for the protein
-  iFtr <- which(DB$annotation$proteinID == DB$protein$ID[iProtein])
+  iFtr <- which(DB$annotation$proteinID == DB$protein$ID[iProtein])
-
+
-  # draw a colored box for each feature
+  # draw a colored box for each feature
-  for (i in iFtr) {
+  for (i in iFtr) {
-    drawBox(DB$annotation$start[i],
+    drawBox(DB$annotation$start[i],
-            DB$annotation$end[i],
+            DB$annotation$end[i],
-            y,
+            y,
-            ftrCol[ DB$annotation$featureID[i] ])
+            ftrCol[ DB$annotation$featureID[i] ])
-  }
+  }
-}
+}
-
+
-# Plot each annotated protein:
+# Plot each annotated protein:
-# Get the rows of all unique annotated Mbp1 proteins in myDB
+# Get the rows of all unique annotated Mbp1 proteins in myDB
-
+
-iRows <- grep("^MBP1_", myDB$protein$name)
+iRows <- grep("^MBP1_", myDB$protein$name)
-
+
-# define the size of the plot-frame to accommodate all proteins
+# define the size of the plot-frame to accommodate all proteins
-yMax <- length(iRows) * 1.1
+yMax <- length(iRows) * 1.1
-xMax <- max(nchar(myDB$protein$sequence[iRows])) * 1.1  # longest sequence
+xMax <- max(nchar(myDB$protein$sequence[iRows])) * 1.1  # longest sequence
-
+
-# plot an empty frame
+# plot an empty frame
-oPar <- par(mar = c(4.2, 0.1, 3, 0.1))  # save the current plot parameters and
+oPar <- par(mar = c(4.2, 0.1, 3, 0.1))  # save the current plot parameters and
-                                        # decrease margins
+                                        # decrease margins
-plot(1, 1,
+plot(1, 1,
-     xlim = c(-200, xMax + 100),
+     xlim = c(-200, xMax + 100),
-     ylim = c(0, yMax),
+     ylim = c(0, yMax),
-     type = "n",
+     type = "n",
-     axes = FALSE,
+     axes = FALSE,
-     bty = "n",
+     bty = "n",
-     main = "Mbp1 orthologue domain annotations",
+     main = "Mbp1 orthologue domain annotations",
-     xlab = "sequence position",
+     xlab = "sequence position",
-     cex.axis = 0.8,
+     cex.axis = 0.8,
-     ylab="")
+     ylab="")
-axis(1, at = seq(0, xMax, by = 100))
+axis(1, at = seq(0, xMax, by = 100))
-myCol <- colorRampPalette(c("#f2003c", "#F0A200",
+myCol <- colorRampPalette(c("#f2003c", "#F0A200",
-                            "#f0ea00", "#62C923",
+                            "#f0ea00", "#62C923",
-                            "#0A9A9B", "#1958C3",
+                            "#0A9A9B", "#1958C3",
-                            "#8000D3", "#D0007F"),
+                            "#8000D3", "#D0007F"),
-                          space="Lab",
+                          space="Lab",
-                          interpolate="linear")(nrow(myDB$feature))
+                          interpolate="linear")(nrow(myDB$feature))
-myCol <- paste0(myCol, "55")
+myCol <- paste0(myCol, "55")
-legend(xMax - 150, 7,
+legend(xMax - 150, 7,
-       legend = myDB$feature$name,
+       legend = myDB$feature$name,
-       cex = 0.7,
+       cex = 0.7,
-       fill = myCol,
+       fill = myCol,
-       bty = "n")
+       bty = "n")
-
+
-# Finally, iterate over all proteins and call plotProtein()
+# Finally, iterate over all proteins and call plotProtein()
-for (i in seq_along(iRows)) {
+for (i in seq_along(iRows)) {
-  plotProtein(myDB, myDB$protein$name[iRows[i]], i)
+  plotProtein(myDB, myDB$protein$name[iRows[i]], i)
-}
+}
-par(oPar)  # reset the plot parameters
+par(oPar)  # reset the plot parameters
-
+
-
+
-# The plot shows what is variable and what is constant about the annotations in
+# The plot shows what is variable and what is constant about the annotations in
-# a group of related proteins. Your MBP1_MYSPE annotations should appear at the
+# a group of related proteins. Your MBP1_MYSPE annotations should appear at the
-# top.
+# top.
-
+
-# Task:
+# Task:
-#    Put a copy of the plot into your journal and interpret it with respect
+#    Put a copy of the plot into your journal and interpret it with respect
-#    to MBP1_MYSPE, i.e. note what you learn about MBP1_MYSPE from the plot.
+#    to MBP1_MYSPE, i.e. note what you learn about MBP1_MYSPE from the plot.
-
+
-# Task:
+# Task:
-#    It would be better to align the motif borders, at least approximately (not
+#    It would be better to align the motif borders, at least approximately (not
-#    all proteins have all motifs). How would you go about doing that?
+#    all proteins have all motifs). How would you go about doing that?
-
+
-# =    3  SHARING DATA  ========================================================
+# =    3  SHARING DATA  ========================================================
-
+
-# It's particularly interesting to compare such annotations across many
+# It's particularly interesting to compare such annotations across many
-# homologous proteins. I have created a page on the Student Wiki that you can
+# homologous proteins. I have created a page on the Student Wiki that you can
-# edit, and then download the data from the entire class directly to your
+# edit, and then download the data from the entire class directly to your
-# RStudio project.
+# RStudio project.
-#
+#
-
+
-# I have provided a function that extracts all information that refers to a
+# I have provided a function that extracts all information that refers to a
-# single protein from the database, and prints it out as well-formatted JSON,
+# single protein from the database, and prints it out as well-formatted JSON,
-# suitable to be pasted into our shareable Wiki-page. There is a fair amount of
+# suitable to be pasted into our shareable Wiki-page. There is a fair amount of
-# bookkeeping involved, but the code is not otherwise very enlightening so I
+# bookkeeping involved, but the code is not otherwise very enlightening so I
-# will spare you the details - it's in "./scripts/ABC-dbUtilities.R" if you
+# will spare you the details - it's in "./scripts/ABC-dbUtilities.R" if you
-# would want to have a look.
+# would want to have a look.
-
+
-
+
-# ==   3.1  Post MBP1_MYSPE as JSON data  ======================================
+# ==   3.1  Post MBP1_MYSPE as JSON data  ======================================
-
+
-# Task:
+# Task:
-# =====
+# =====
-# 1: Run the following code:
+# 1: Run the following code:
-
+
-cat("{{Vspace}}",
+cat("{{Vspace}}",
-    "<!-- ==== BEGIN  PROTEIN ==== -->",
+    "<!-- ==== BEGIN  PROTEIN ==== -->",
-    "<pre class=\"protein-data\">",
+    "<pre class=\"protein-data\">",
-    dbProt2JSON(sprintf("MBP1_%s", biCode(MYSPE))),
+    dbProt2JSON(sprintf("MBP1_%s", biCode(MYSPE))),
-    "</pre>",
+    "</pre>",
-    "<!-- ===== END PROTEIN ====== -->",
+    "<!-- ===== END PROTEIN ====== -->",
-    "", sep = "\n"
+    "", sep = "\n"
-)
+)
-
+
-# 2: Copy the entire output from the console.
+# 2: Copy the entire output from the console.
-# 3: Navigate to
+# 3: Navigate to
-#      http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public
+#      http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public
-#    ... edit the page, and paste your output at the top.
+#    ... edit the page, and paste your output at the top.
-# 4: Save your edits.
+# 4: Save your edits.
-
+
-
+
-
+
-# ==   3.2  Import shared MBP1_MYSPE from the Wiki  ============================
+# ==   3.2  Import shared MBP1_MYSPE from the Wiki  ============================
-
+
-# Once we have collected a number of protein annotations, we can access the
+# Once we have collected a number of protein annotations, we can access the
-# Wiki-page and import the data into our database. The Wiki page is an HTML
+# Wiki-page and import the data into our database. The Wiki page is an HTML
-# document with lots of MediaWiki specific stuff - but the contents we are
+# document with lots of MediaWiki specific stuff - but the contents we are
-# interested in is enclosed in <pre class="protein-data"> ... </pre> tags. These
+# interested in is enclosed in <pre class="protein-data"> ... </pre> tags. These
-# work like normal HTML <pre> tags, but we have defined a special class for them
+# work like normal HTML <pre> tags, but we have defined a special class for them
-# to make it easy to parse out the contents we want. The rvest:: package in
+# to make it easy to parse out the contents we want. The rvest:: package in
-# combination with xml2:: provides us with all the tools we need for such
+# combination with xml2:: provides us with all the tools we need for such
-# "Webscraping" of data....
+# "Webscraping" of data....
-
+
-if (! requireNamespace("rvest", quietly=TRUE)) {
+if (! requireNamespace("rvest", quietly=TRUE)) {
-  install.packages("rvest")
+  install.packages("rvest")
-}
+}
-
+
-if (! requireNamespace("xml2", quietly=TRUE)) {
+if (! requireNamespace("xml2", quietly=TRUE)) {
-  install.packages("xml2")
+  install.packages("xml2")
-}
+}
-
+
-# Here's the process:
+# Here's the process:
-# The URL is an "open" page on the student Wiki. Users that are not logged in
+# The URL is an "open" page on the student Wiki. Users that are not logged in
-# can view the contents, but you can only edit if you are logged in.
+# can view the contents, but you can only edit if you are logged in.
-myURL <- "http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public"
+myURL <- "http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public"
-
+
-# First thing is to retrieve the HTML from the url...
+# First thing is to retrieve the HTML from the url...
-x <- xml2::read_html(myURL)
+x <- xml2::read_html(myURL)
-
+
-# This retrieves the page source, but that still needs to be parsed into its
+# This retrieves the page source, but that still needs to be parsed into its
-# logical elements. HTML is closely related to XML; such documents are structured as
+# logical elements. HTML is closely related to XML; such documents are structured as
-# trees, that have "nodes" which are demarcated with "tags". rvest::html_nodes()
+# trees, that have "nodes" which are demarcated with "tags". rvest::html_nodes()
-# parses out the document structure and then uses a so-called "xpath" expression
+# parses out the document structure and then uses a so-called "xpath" expression
-# to select nodes we are interested in. Now, xpath is one of those specialized
+# to select nodes we are interested in. Now, xpath is one of those specialized
-# languages of which there are a few more to learn than one would care for. You
+# languages of which there are a few more to learn than one would care for. You
-# MUST know how to format sprintf() expressions, and you SHOULD be competent
+# MUST know how to format sprintf() expressions, and you SHOULD be competent
-# with regular expressions. But if you want to be really competent in your work,
+# with regular expressions. But if you want to be really competent in your work,
-# basic HTML and CSS is required ... and enough knowledge about xpath to be able
+# basic HTML and CSS is required ... and enough knowledge about xpath to be able
-# to search on Stackoverflow for what you need for parsing data out of Web
+# to search on Stackoverflow for what you need for parsing data out of Web
-# documents...
+# documents...
-
+
-# The expression we use below is:
+# The expression we use below is:
-#   - get any node anywhere in the tree ("//*") ...
+#   - get any node anywhere in the tree ("//*") ...
-#   - that has a particular attribute("[@ ... ]").
+#   - that has a particular attribute("[@ ... ]").
-#   - The attribute we want is that the class of the node is "protein-data";
+#   - The attribute we want is that the class of the node is "protein-data";
-#      that is the class we have defined for our <pre> tags.
+#      that is the class we have defined for our <pre> tags.
-# As a result of this selection, we get a list of pointers to the document tree.
+# As a result of this selection, we get a list of pointers to the document tree.
-y <- rvest::html_nodes(x, xpath ='//*[@class="protein-data"]')
+y <- rvest::html_nodes(x, xpath ='//*[@class="protein-data"]')
-
+
-# Next we fetch the actual payload - the text - from the tree:
+# Next we fetch the actual payload - the text - from the tree:
-# rvest::html_text() gets the text from the list of pointers. The result is a
+# rvest::html_text() gets the text from the list of pointers. The result is a
-# normal vector of character strings.
+# normal vector of character strings.
-z <- rvest::html_text(y)
+z <- rvest::html_text(y)
-
+
-# Finally we can iterate over the list, and add all proteins we don't already
+# Finally we can iterate over the list, and add all proteins we don't already
-# have to our database. There may well be items that are rejected because they
+# have to our database. There may well be items that are rejected because they
-# are already present in the database - for example, unless somebody has
+# are already present in the database - for example, unless somebody has
-# annotated new features, all of the features are already there. Don't worry -
+# annotated new features, all of the features are already there. Don't worry -
-# that is intended; we don't want duplicate entries.
+# that is intended; we don't want duplicate entries.
-
+
-for (thisJSON in z) {
+for (thisJSON in z) {
-  thisData <- jsonlite::fromJSON(thisJSON)
+  thisData <- jsonlite::fromJSON(thisJSON)
-  if (! thisData$protein$name %in% myDB$protein$name) {
+  if (! thisData$protein$name %in% myDB$protein$name) {
-    myDB <- dbAddProtein(myDB, thisData$protein)
+    myDB <- dbAddProtein(myDB, thisData$protein)
-    myDB <- dbAddTaxonomy(myDB, thisData$taxonomy)
+    myDB <- dbAddTaxonomy(myDB, thisData$taxonomy)
-    myDB <- dbAddFeature(myDB, thisData$feature)
+    myDB <- dbAddFeature(myDB, thisData$feature)
-    myDB <- dbAddAnnotation(myDB, thisData$annotation)
+    myDB <- dbAddAnnotation(myDB, thisData$annotation)
-  }
+  }
-}
+}
-
+
-# Finally, we can repeat our domain plot with the results - which now includes the shared proteins:
+# Finally, we can repeat our domain plot with the results - which now includes the shared proteins:
-
+
-iRows <- grep("^MBP1_", myDB$protein$name)
+iRows <- grep("^MBP1_", myDB$protein$name)
-yMax <- length(iRows) * 1.1
+yMax <- length(iRows) * 1.1
-xMax <- max(nchar(myDB$protein$sequence[iRows])) * 1.1  # longest sequence
+xMax <- max(nchar(myDB$protein$sequence[iRows])) * 1.1  # longest sequence
-
+
-# plot an empty frame
+# plot an empty frame
-oPar <- par(mar = c(4.2, 0.1, 3, 0.1))
+oPar <- par(mar = c(4.2, 0.1, 3, 0.1))
-plot(1, 1,
+plot(1, 1,
-     xlim = c(-200, xMax + 100),
+     xlim = c(-200, xMax + 100),
-     ylim = c(0, yMax),
+     ylim = c(0, yMax),
-     type = "n",
+     type = "n",
-     axes = FALSE,
+     axes = FALSE,
-     bty = "n",
+     bty = "n",
-     main = "Mbp1 orthologue domain annotations",
+     main = "Mbp1 orthologue domain annotations",
-     xlab = "sequence position",
+     xlab = "sequence position",
-     cex.axis = 0.8,
+     cex.axis = 0.8,
-     ylab="")
+     ylab="")
-axis(1, at = seq(0, xMax, by = 100))
+axis(1, at = seq(0, xMax, by = 100))
-myCol <- colorRampPalette(c("#f2003c", "#F0A200",
+myCol <- colorRampPalette(c("#f2003c", "#F0A200",
-                            "#f0ea00", "#62C923",
+                            "#f0ea00", "#62C923",
-                            "#0A9A9B", "#1958C3",
+                            "#0A9A9B", "#1958C3",
-                            "#8000D3", "#D0007F"),
+                            "#8000D3", "#D0007F"),
-                          space="Lab",
+                          space="Lab",
-                          interpolate="linear")(nrow(myDB$feature))
+                          interpolate="linear")(nrow(myDB$feature))
-myCol <- paste0(myCol, "55")
+myCol <- paste0(myCol, "55")
-legend(xMax - 150, 7,
+legend(xMax - 150, 7,
-       legend = myDB$feature$name,
+       legend = myDB$feature$name,
-       cex = 0.7,
+       cex = 0.7,
-       fill = myCol,
+       fill = myCol,
-       bty = "n")
+       bty = "n")
-
+
-for (i in seq_along(iRows)) {
+for (i in seq_along(iRows)) {
-  plotProtein(myDB, myDB$protein$name[iRows[i]], i)
+  plotProtein(myDB, myDB$protein$name[iRows[i]], i)
-}
+}
-par(oPar)  # reset the plot parameters
+par(oPar)  # reset the plot parameters
-
+
-# ... the more proteins we can compare, the more we learn about the
+# ... the more proteins we can compare, the more we learn about the
-# architectural principles of this family's domains.
+# architectural principles of this family's domains.
-
+
-
+
-# [END]
+# [END]
--- a/BIN-FUNC-Semantic_similarity.R
+++ b/BIN-FUNC-Semantic_similarity.R
@ -1,169 +1,169 @@
-# tocID <- "BIN-FUNC-Semantic_similarity.R"
+# tocID <- "BIN-FUNC-Semantic_similarity.R"
-#
+#
-# Purpose:  A Bioinformatics Course:
+# Purpose:  A Bioinformatics Course:
-#              R code accompanying the BIN-FUNC_Semantic_similarity unit.
+#              R code accompanying the BIN-FUNC_Semantic_similarity unit.
-#
+#
-# Version:  1.2
+# Version:  1.2
-#
+#
-# Date:     2017-11  -  2020-09
+# Date:     2017-11  -  2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions:
+# Versions:
-#           1.2    2020 Maintenance
+#           1.2    2020 Maintenance
-#           1.1    Change from require() to requireNamespace(),
+#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout,
+#                      use <package>::<function>() idiom throughout,
-#                      use Biocmanager:: not biocLite()
+#                      use BiocManager:: not biocLite()
-#           1.0    New code.
+#           1.0    New code.
-#
+#
-#
+#
-# TODO:
+# TODO:
-#
+#
-#
+#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
+#
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
+# going on. That's not how it works ...
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC> 
+#TOC> 
-#TOC>   Section  Title                                                Line
+#TOC>   Section  Title                                                Line
-#TOC> --------------------------------------------------------------------
+#TOC> --------------------------------------------------------------------
-#TOC>   1        Preparations: Packages, AnnotationDB, Setup            43
+#TOC>   1        Preparations: Packages, AnnotationDB, Setup            43
-#TOC>   2        Fetch GO Annotations                                  100
+#TOC>   2        Fetch GO Annotations                                  100
-#TOC>   3        Semantic Similarities                                 109
+#TOC>   3        Semantic Similarities                                 109
-#TOC>   4        GO Term Enrichment in Gene Sets                       127
+#TOC>   4        GO Term Enrichment in Gene Sets                       127
-#TOC> 
+#TOC> 
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  Preparations: Packages, AnnotationDB, Setup  =========================
+# =    1  Preparations: Packages, AnnotationDB, Setup  =========================
-
+
-if (! requireNamespace("BiocManager", quietly = TRUE)) {
+if (! requireNamespace("BiocManager", quietly = TRUE)) {
-  install.packages("BiocManager")
+  install.packages("BiocManager")
-}
+}
-
+
-# GOSim is an R-package in the Bioconductor project.
+# GOSim is an R-package in the Bioconductor project.
-if (! requireNamespace("GOSim", quietly = TRUE)) {
+if (! requireNamespace("GOSim", quietly = TRUE)) {
-  BiocManager::install("GOSim")
+  BiocManager::install("GOSim")
-}
+}
-# Package information:
+# Package information:
-#  library(help = GOSim)       # basic information
+#  library(help = GOSim)       # basic information
-#  browseVignettes("GOSim")    # available vignettes
+#  browseVignettes("GOSim")    # available vignettes
-#  data(package = "GOSim")     # available datasets
+#  data(package = "GOSim")     # available datasets
-
+
-# GOSim makes extensive assumptions about loaded packages, and many base
+# GOSim makes extensive assumptions about loaded packages, and many base
-# methods are masked. We will thus use library(GOSim) to load it
+# methods are masked. We will thus use library(GOSim) to load it
-# in its entirety and with all packages it depends on. We will still use
+# in its entirety and with all packages it depends on. We will still use
-# the <package>::<function>() syntax in the code below, but this now serves
+# the <package>::<function>() syntax in the code below, but this now serves
-# more of a didactic purpose, rather than actual syntax requirements.
+# more of a didactic purpose, rather than actual syntax requirements.
-
+
-library(GOSim)
+library(GOSim)
-
+
-# GOSim loads human annotations in  org.Hs.eg.db  by default. We load yeast
+# GOSim loads human annotations in  org.Hs.eg.db  by default. We load yeast
-# annotations instead...
+# annotations instead...
-if (! requireNamespace("org.Sc.sgd.db", quietly = TRUE)) {
+if (! requireNamespace("org.Sc.sgd.db", quietly = TRUE)) {
-  BiocManager::install("org.Sc.sgd.db")
+  BiocManager::install("org.Sc.sgd.db")
-}
+}
-
+
-# Bioconductor annotation packages won't work stably unless we actually load
+# Bioconductor annotation packages won't work stably unless we actually load
-# them:
+# them:
-library(org.Sc.sgd.db)
+library(org.Sc.sgd.db)
-
+
-# org.Sc.sgd.db is a Bioconductor annotation database curated by SGD. Such
+# org.Sc.sgd.db is a Bioconductor annotation database curated by SGD. Such
-# databases exist for all model organisms. It's a kind of a fancy data frame
+# databases exist for all model organisms. It's a kind of a fancy data frame
-# from which we can get annotations by rows (genes) with the keys() funtion ...
+# from which we can get annotations by rows (genes) with the keys() function ...
-AnnotationDbi::keys(org.Sc.sgd.db)[1500:1510]
+AnnotationDbi::keys(org.Sc.sgd.db)[1500:1510]
-
+
-# ... and the types of available annotations with the columns() function
+# ... and the types of available annotations with the columns() function
-AnnotationDbi::columns(org.Sc.sgd.db)
+AnnotationDbi::columns(org.Sc.sgd.db)
-
+
-# Note that one of the columns is "GO" ... and we load that into the
+# Note that one of the columns is "GO" ... and we load that into the
-# datastructures used by GOSim:
+# datastructures used by GOSim:
-
+
-# Choose GOterms to use
+# Choose GOterms to use
-GOSim::setEvidenceLevel(evidences = "all",
+GOSim::setEvidenceLevel(evidences = "all",
-                        organism = org.Sc.sgdORGANISM,
+                        organism = org.Sc.sgdORGANISM,
-                        gomap = org.Sc.sgdGO)
+                        gomap = org.Sc.sgdGO)
-
+
-# Use Biological Process ontology
+# Use Biological Process ontology
-GOSim::setOntology("BP", loadIC = FALSE)
+GOSim::setOntology("BP", loadIC = FALSE)
-
+
-# confirm that we loaded the correct ontology
+# confirm that we loaded the correct ontology
-head(get("gomap", envir = GOSimEnv))
+head(get("gomap", envir = GOSimEnv))
-
+
-
+
-
+
-# =    2  Fetch GO Annotations  ================================================
+# =    2  Fetch GO Annotations  ================================================
-
+
-
+
-# All keys being used here are yeast systematic names.
+# All keys being used here are yeast systematic names.
-
+
-# Get one set of annotations
+# Get one set of annotations
-GOSim::getGOInfo(c("YDL056W"))  # Mbp1
+GOSim::getGOInfo(c("YDL056W"))  # Mbp1
-
+
-
+
-# =    3  Semantic Similarities  ===============================================
+# =    3  Semantic Similarities  ===============================================
-
+
-
+
-# Get semantic similarities between genes
+# Get semantic similarities between genes
-?getGeneSim
+?getGeneSim
-
+
-# There are _many_ different metrics of term similarity implemented
+# There are _many_ different metrics of term similarity implemented
-# in this package.
+# in this package.
-
+
-                                                         # Mbp1 and...
+                                                         # Mbp1 and...
-GOSim::getGeneSim("YDL056W","YLR182W",similarity = "OA") # Swi6 - MCB complex
+GOSim::getGeneSim("YDL056W","YLR182W",similarity = "OA") # Swi6 - MCB complex
-GOSim::getGeneSim("YDL056W","YER111C",similarity = "OA") # Swi4 - collaborators
+GOSim::getGeneSim("YDL056W","YER111C",similarity = "OA") # Swi4 - collaborators
-GOSim::getGeneSim("YDL056W","YBR160W",similarity = "OA") # Cdc28 - mediator
+GOSim::getGeneSim("YDL056W","YBR160W",similarity = "OA") # Cdc28 - mediator
-GOSim::getGeneSim("YDL056W","YGR108W",similarity = "OA") # Clb1 - antagonist
+GOSim::getGeneSim("YDL056W","YGR108W",similarity = "OA") # Clb1 - antagonist
-GOSim::getGeneSim("YDL056W","YLR079W",similarity = "OA") # Sic1 - antagonist
+GOSim::getGeneSim("YDL056W","YLR079W",similarity = "OA") # Sic1 - antagonist
-GOSim::getGeneSim("YDL056W","YJL130C",similarity = "OA") # Pgk1 - Gluconeogenesis
+GOSim::getGeneSim("YDL056W","YJL130C",similarity = "OA") # Pgk1 - Gluconeogenesis
-
+
-
+
-# =    4  GO Term Enrichment in Gene Sets  =====================================
+# =    4  GO Term Enrichment in Gene Sets  =====================================
-
+
-
+
-# Calculating GO term enrichment in gene sets is done with the Bioconductor
+# Calculating GO term enrichment in gene sets is done with the Bioconductor
-# topGO package.
+# topGO package.
-if (! requireNamespace("topGO", quietly = TRUE)) {
+if (! requireNamespace("topGO", quietly = TRUE)) {
-  BiocManager::install("topGO")
+  BiocManager::install("topGO")
-}
+}
-# Package information:
+# Package information:
-#  library(help = topGO)       # basic information
+#  library(help = topGO)       # basic information
-#  browseVignettes("topGO")    # available vignettes
+#  browseVignettes("topGO")    # available vignettes
-#  data(package = "topGO")     # available datasets
+#  data(package = "topGO")     # available datasets
-
+
-# Once again - assumptions are made by GOsim that require us to load the
+# Once again - assumptions are made by GOSim that require us to load the
-# topGO package wholesale:
+# topGO package wholesale:
-library(topGO)
+library(topGO)
-
+
-# Let's define a gene set: GOterm enrichment for G1/S switch activators:
+# Let's define a gene set: GOterm enrichment for G1/S switch activators:
-mySet <- c("YFR028C", # Cdc14
+mySet <- c("YFR028C", # Cdc14
-           "YDL056W", # Mbp1
+           "YDL056W", # Mbp1
-           "YLR182W", # Swi6
+           "YLR182W", # Swi6
-           "YER111C", # Swi4
+           "YER111C", # Swi4
-           "YOR083W", # Whi5
+           "YOR083W", # Whi5
-           "YBR160W", # Cdc28
+           "YBR160W", # Cdc28
-           "YMR199W", # Cln1
+           "YMR199W", # Cln1
-           "YPL256C", # Cln2
+           "YPL256C", # Cln2
-           "YAL040C") # Cln3
+           "YAL040C") # Cln3
-
+
-allGenes <- AnnotationDbi::keys(org.Sc.sgd.db)
+allGenes <- AnnotationDbi::keys(org.Sc.sgd.db)
-allGenes <- allGenes[grep("^Y", allGenes)]  # This is the context against which
+allGenes <- allGenes[grep("^Y", allGenes)]  # This is the context against which
-                                            # we define enrichment
+                                            # we define enrichment
-
+
-myEnr <- GOenrichment(mySet, allGenes)
+myEnr <- GOenrichment(mySet, allGenes)
-
+
-sort(myEnr$p.values)  # Any significantly enriched terms? All of these are ...
+sort(myEnr$p.values)  # Any significantly enriched terms? All of these are ...
-
+
-#Most significantly enriched is GO:0071931. What is this?
+#Most significantly enriched is GO:0071931. What is this?
-annotate::getGOTerm("GO:0071931")  # ... makes sense.
+annotate::getGOTerm("GO:0071931")  # ... makes sense.
-
+
-
+
-
+
-
+
-# [END]
+# [END]
--- a/BIN-MYSPE.R
+++ b/BIN-MYSPE.R
@ -1,351 +1,351 @@
-# tocID <- "BIN-MYSPE.R"
+# tocID <- "BIN-MYSPE.R"
-#
+#
-# Purpose: A Bioinformatics Course:
+# Purpose: A Bioinformatics Course:
-#              R code accompanying the BIN-MYSPE unit
+#              R code accompanying the BIN-MYSPE unit
-#
+#
-#
+#
-# Version: 1.4
+# Version: 1.4
-#
+#
-# Date:    2017-09 - 2021-10
+# Date:    2017-09 - 2021-10
-# Author:  Boris Steipe (boris.steipe@utoronto.ca)
+# Author:  Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# V 1.4    Add troubleshooting hints via errText[[...]]
+# V 1.4    Add troubleshooting hints via errText[[...]]
-# V 1.3    2021 update of MYSPE mechanics; fix a bug no one had complained about
+# V 1.3    2021 update of MYSPE mechanics; fix a bug no one had complained about
-# V 1.2    Reorganized proportional plot section into a "further reading"
+# V 1.2    Reorganized proportional plot section into a "further reading"
-#          section, added nested-box, and sankey plot visualization of
+#          section, added nested-box, and sankey plot visualization of
-#          proportions. Introduced plotly.
+#          proportions. Introduced plotly.
-# V 1.1    2020 Workflow changes
+# V 1.1    2020 Workflow changes
-# V 1.0.1  Move ABC-makeMYSPElist.R to ./scripts directory
+# V 1.0.1  Move ABC-makeMYSPElist.R to ./scripts directory
-# V 1.0    Final code, after rewriting BLAST parser and updating MYSPElist
+# V 1.0    Final code, after rewriting BLAST parser and updating MYSPElist
-# V 0.1    First code copied from BCH441_A03_makeMYSPElist.R
+# V 0.1    First code copied from BCH441_A03_makeMYSPElist.R
-#
+#
-# TODO:    Sample solution for sankey plot function.
+# TODO:    Sample solution for sankey plot function.
-#
+#
-#
+#
-# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
+# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
-#
+#
-# DO NOT SIMPLY  source()  THESE FILES!
+# DO NOT SIMPLY  source()  THESE FILES!
-#
+#
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-#  going on. That's not how it works ...
+#  going on. That's not how it works ...
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC> 
+#TOC> 
-#TOC>   Section  Title                                             Line
+#TOC>   Section  Title                                             Line
-#TOC> -----------------------------------------------------------------
+#TOC> -----------------------------------------------------------------
-#TOC>   1        PREPARATIONS                                        52
+#TOC>   1        PREPARATIONS                                        52
-#TOC>   2        SUITABLE MYSPE SPECIES                              65
+#TOC>   2        SUITABLE MYSPE SPECIES                              65
-#TOC>   3        ADOPT "MYSPE"                                       89
+#TOC>   3        ADOPT "MYSPE"                                       89
-#TOC>   4        FURTHER READING: PLOTTING PROPORTIONS              128
+#TOC>   4        FURTHER READING: PLOTTING PROPORTIONS              128
-#TOC>   4.1        Percentages                                      146
+#TOC>   4.1        Percentages                                      146
-#TOC>   4.2        Visualizing proportions: Pie chart               165
+#TOC>   4.2        Visualizing proportions: Pie chart               165
-#TOC>   4.3        Visualizing proportions: Nested squares          243
+#TOC>   4.3        Visualizing proportions: Nested squares          243
-#TOC>   4.4        Visualizing proportions: Sankey diagrams         280
+#TOC>   4.4        Visualizing proportions: Sankey diagrams         280
-#TOC> 
+#TOC> 
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  PREPARATIONS  ========================================================
+# =    1  PREPARATIONS  ========================================================
-#
+#
-
+
-# Execute the two conditionals below:
+# Execute the two conditionals below:
-if (! file.exists("./myScripts/.myProfile.R")) {
+if (! file.exists("./myScripts/.myProfile.R")) {
-  stop(errText[["noProfileFile"]])     # message defined in .Rprofile
+  stop(errText[["noProfileFile"]])     # message defined in .Rprofile
-}
+}
-
+
-if (! exists("myStudentNumber")) {
+if (! exists("myStudentNumber")) {
-  stop(errText[["noStudentNumber"]])   # message defined in .Rprofile
+  stop(errText[["noStudentNumber"]])   # message defined in .Rprofile
-}
+}
-
+
-
+
-# =    2  SUITABLE MYSPE SPECIES  ==============================================
+# =    2  SUITABLE MYSPE SPECIES  ==============================================
-
+
-
+
-# In this unit we will select one species from a list of genome sequenced fungi
+# In this unit we will select one species from a list of genome sequenced fungi
-# and write it into your personalized profile file. This species will be called
+# and write it into your personalized profile file. This species will be called
-# "MYSPE" (My Species) for other learning units and exercises.
+# "MYSPE" (My Species) for other learning units and exercises.
-
+
-# A detailed description of the process of compiling the list of genome
+# A detailed description of the process of compiling the list of genome
-# sequenced fungi with protein annotations and Mbp1 homologues is in the file
+# sequenced fungi with protein annotations and Mbp1 homologues is in the file
-# ./scripts/ABC-makeMYSPElist.R  In brief, data for genome-sequenced fungi
+# ./scripts/ABC-makeMYSPElist.R  In brief, data for genome-sequenced fungi
-# was retrieved from https://fungi.ensembl.org; a search for homologues to
+# was retrieved from https://fungi.ensembl.org; a search for homologues to
-# yeast Mbp1 was performed with BLAST at the NCBI, and the data was merged.
+# yeast Mbp1 was performed with BLAST at the NCBI, and the data was merged.
-# A representative organism at each genus-level was chosen from those hits
+# A representative organism at each genus-level was chosen from those hits
-# that actual;ly have a homologue. Finally, a mapping table was constructed to
+# that actually have a homologue. Finally, a mapping table was constructed to
-# asymmetrically retrieve unique species: a student number will retrieve
+# asymmetrically retrieve unique species: a student number will retrieve
-# a species, but (public) knowledge of the species cannot reconstruct the
+# a species, but (public) knowledge of the species cannot reconstruct the
-# student number.
+# student number.
-
+
-# Task: Study ./scripts/ABC-makeMYSPElist.R, it implements a typical workflow
+# Task: Study ./scripts/ABC-makeMYSPElist.R, it implements a typical workflow
-#       of selecting and combining data from various data resources. Studying
+#       of selecting and combining data from various data resources. Studying
-#       it will give you a better sense of how such workflows can be
+#       it will give you a better sense of how such workflows can be
-#       implemented in practice.
+#       implemented in practice.
-
+
-
+
-# =    3  ADOPT "MYSPE"  =======================================================
+# =    3  ADOPT "MYSPE"  =======================================================
-
+
-# Execute:
+# Execute:
-( MYSPE <- getMYSPE(myStudentNumber) )
+( MYSPE <- getMYSPE(myStudentNumber) )
-
+
-# If this produced an error, this session has not been properly set up. You
+# If this produced an error, this session has not been properly set up. You
-# may not yet have run  init()  and edited  .myProfile.R , or that file is not
+# may not yet have run  init()  and edited  .myProfile.R , or that file is not
-# in your  myScripts/  folder. Fix this, and execute:
+# in your  myScripts/  folder. Fix this, and execute:
-#
+#
-#    source(".Rprofile") .
+#    source(".Rprofile") .
-
+
-# If this produced NA, your Student Number may not be correct, or you are not in
+# If this produced NA, your Student Number may not be correct, or you are not in
-# my class-list. Contact me. Otherwise, this should have printed a species name,
+# my class-list. Contact me. Otherwise, this should have printed a species name,
-# and the taxonomy ID of its genome-sequenced strain. This is your unique
+# and the taxonomy ID of its genome-sequenced strain. This is your unique
-# speciesfor this course. Note it in your journal ...
+# species for this course. Note it in your journal ...
-
+
-biCode(MYSPE) # and also note it's "BiCode" ...
+biCode(MYSPE) # and also note its "BiCode" ...
-( myTaxID <- names(MYSPE) )  # and its taxID
+( myTaxID <- names(MYSPE) )  # and its taxID
-
+
-
+
-# Task:
+# Task:
-# =====
+# =====
-#   Note down the species name and its five letter BiCode on your Student
+#   Note down the species name and its five letter BiCode on your Student
-#   Wiki user page. Use this species whenever this or future assignments refer
+#   Wiki user page. Use this species whenever this or future assignments refer
-#   to MYSPE. Whenever you start a session, it will automatically be loaded
+#   to MYSPE. Whenever you start a session, it will automatically be loaded
-#   from  myScripts/.myProfile.R  and is available as  MYSPE .
+#   from  myScripts/.myProfile.R  and is available as  MYSPE .
-
+
-# Here is some more information about MYSPE, taken from the table of genome-
+# Here is some more information about MYSPE, taken from the table of genome-
-# sequenced fungi that is in your ./data folder.
+# sequenced fungi that is in your ./data folder.
-fungiDat <- read.csv("data/Species.csv")
+fungiDat <- read.csv("data/Species.csv")
-iMs <- which(fungiDat$Taxon.ID == myTaxID)
+iMs <- which(fungiDat$Taxon.ID == myTaxID)
-
+
-( myOr <- fungiDat$Classification[iMs] )  # Taxonomic order
+( myOr <- fungiDat$Classification[iMs] )  # Taxonomic order
-( myGn <- gsub("\\s.*", "", MYSPE))       # Taxonomic genus
+( myGn <- gsub("\\s.*", "", MYSPE))       # Taxonomic genus
-( mySt <- fungiDat$Name[iMs] )            # Taxonomic strain
+( mySt <- fungiDat$Name[iMs] )            # Taxonomic strain
-
+
-# That's all.
+# That's all.
-
+
-
+
-# =    4  FURTHER READING: PLOTTING PROPORTIONS  ===============================
+# =    4  FURTHER READING: PLOTTING PROPORTIONS  ===============================
-
+
-# The material below is an exploration of data-preparation and plotting
+# The material below is an exploration of data-preparation and plotting
-# techniques; you can treat this as additional practice and further reading and
+# techniques; you can treat this as additional practice and further reading and
-# I expect that some of the code and plotting examples may be useful in a
+# I expect that some of the code and plotting examples may be useful in a
-# different context.
+# different context.
-
+
-# A frequent task is to visualize the proportion of elements with given
+# A frequent task is to visualize the proportion of elements with given
-# categories in a sample. For example, we might ask what the proportion of the
+# categories in a sample. For example, we might ask: what proportion of all
-# different orders of fungi is the order of MYSPE? Let's first collect the
+# sequenced fungi belongs to the order of MYSPE? Let's first collect the
-# numbers.
+# numbers.
-
+
-( nFungi <- nrow(fungiDat) )                            # sequenced fungi
+( nFungi <- nrow(fungiDat) )                            # sequenced fungi
-( nOrder <- sum(grepl(myOr, fungiDat$Classification)) ) # same order as MYSPE
+( nOrder <- sum(grepl(myOr, fungiDat$Classification)) ) # same order as MYSPE
-( nGenus <- sum(grepl(myGn, fungiDat$Name)) )           # same genus as MYSPE
+( nGenus <- sum(grepl(myGn, fungiDat$Name)) )           # same genus as MYSPE
-( nSpecies <- sum(grepl(MYSPE, fungiDat$Name)) )        # same species as MYSPE
+( nSpecies <- sum(grepl(MYSPE, fungiDat$Name)) )        # same species as MYSPE
-
+
-
+
-# ==   4.1  Percentages  =======================================================
+# ==   4.1  Percentages  =======================================================
-
+
-# The zeroth-order approach to visualization is simply to print percentages:
+# The zeroth-order approach to visualization is simply to print percentages:
-
+
-cat(sprintf("\n%s comprise %5.2f%% of fungi.",
+cat(sprintf("\n%s comprise %5.2f%% of fungi.",
-        myOr,
+        myOr,
-        (nOrder * 100) / nFungi))
+        (nOrder * 100) / nFungi))
-
+
-# ... or, adding the actual numbers:
+# ... or, adding the actual numbers:
-
+
-cat(sprintf("\n%s comprise %5.2f%% of fungi (%d of %d).",
+cat(sprintf("\n%s comprise %5.2f%% of fungi (%d of %d).",
-            myOr,
+            myOr,
-            (nOrder * 100) / nFungi,
+            (nOrder * 100) / nFungi,
-            nOrder,
+            nOrder,
-            nFungi))
+            nFungi))
-
+
-# But that's hard to visualize for most of us, and anyway, we don't know how
+# But that's hard to visualize for most of us, and anyway, we don't know how
-# that relates to other orders.
+# that relates to other orders.
-
+
-# ==   4.2  Visualizing proportions: Pie chart  ================================
+# ==   4.2  Visualizing proportions: Pie chart  ================================
-
+
-# Often, we will use a pie chart instead. Pie charts are rather informal types
+# Often, we will use a pie chart instead. Pie charts are rather informal types
-# of plots, not well suited for analysis. But easy to do:
+# of plots, not well suited for analysis. But easy to do:
-
+
-# Define four colors to identify the four categories
+# Define four colors to identify the four categories
-pCol <- c("#ed394e", "#ff9582", "#ffd5c4", "#f2f2f0")
+pCol <- c("#ed394e", "#ff9582", "#ffd5c4", "#f2f2f0")
-
+
-oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))   # set margins to ~ 0
+oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))   # set margins to ~ 0
-                                           # and remember the
+                                           # and remember the
-                                           # previous setting
+                                           # previous setting
-
+
-pie(c(nSpecies,                            # subtract numbers since these
+pie(c(nSpecies,                            # the categories are nested, so each
-      nGenus - nSpecies,                   # categories are mutually contained
+      nGenus - nSpecies,                   # wedge is one category minus the
-      nOrder - nGenus - nSpecies,          # in each other
+      nOrder - nGenus,                     # category nested inside it; the
-      nFungi - nOrder - nGenus - nSpecies),
+      nFungi - nOrder),                    # four wedges then sum to nFungi
-      labels = "",
+      labels = "",
-      radius = 0.9,
+      radius = 0.9,
-      main = "MYSPE in genome-sequenced fungi",
+      main = "MYSPE in genome-sequenced fungi",
-      lty = 0,                             # turn borders for wedges off
+      lty = 0,                             # turn borders for wedges off
-      col = pCol,
+      col = pCol,
-      clockwise = TRUE,
+      clockwise = TRUE,
-      init.angle = 90)
+      init.angle = 90)
-
+
-title(main=MYSPE, line=0, cex.main=0.7)    # add a title to the plot
+title(main=MYSPE, line=0, cex.main=0.7)    # add a title to the plot
-
+
-legend(x = 0.95, y = 0.8,    # place at legend here
+legend(x = 0.95, y = 0.8,    # place the legend here
-       legend = c("Species", "Genus", "Order", "Fungi"),
+       legend = c("Species", "Genus", "Order", "Fungi"),
-       y.intersp = 2,                      # line spacing for labels
+       y.intersp = 2,                      # line spacing for labels
-       cex = 0.8,                          # character size for labels
+       cex = 0.8,                          # character size for labels
-       bty = "n",                          # "no" box around the legend
+       bty = "n",                          # "no" box around the legend
-       pt.cex = 2,                         # size of colour boxes
+       pt.cex = 2,                         # size of colour boxes
-       pch = 15,                           # a filled square
+       pch = 15,                           # a filled square
-       col = pCol)
+       col = pCol)
-
+
-par(oPar)                                  # reset graphics state
+par(oPar)                                  # reset graphics state
-
+
-# Unless MYSPE is one of the frequently sequenced species, there will only be a
+# Unless MYSPE is one of the frequently sequenced species, there will only be a
-# very thin wedge visible. Pie charts are not well suited to visualize small
+# very thin wedge visible. Pie charts are not well suited to visualize small
-# proportions.
+# proportions.
-
+
-# It is a little more useful if we have non-nested proportions - like the
+# It is a little more useful if we have non-nested proportions - like the
-# number of species in the same order overall:
+# number of species in the same order overall:
-
+
-myTbl <- sort(table(fungiDat$Classification), decreasing = TRUE)
+myTbl <- sort(table(fungiDat$Classification), decreasing = TRUE)
-head(myTbl)
+head(myTbl)
-
+
-# pie() does a reasonable job out of the box to interpret table() data:
+# pie() does a reasonable job out of the box to interpret table() data:
-pie(myTbl)
+pie(myTbl)
-
+
-# ... we can improve this quickly with a bit of tweaking:
+# ... we can improve this quickly with a bit of tweaking:
-
+
-N <- length(myTbl)
+N <- length(myTbl)
-sel <- myOr == names(myTbl) # TRUE for the MYSPE order, FALSE elsewhere
+sel <- myOr == names(myTbl) # TRUE for the MYSPE order, FALSE elsewhere
-
+
-myCol <- rep(pCol[4], N)       # N elements of pCol[1]
+myCol <- rep(pCol[4], N)       # N elements of pCol[4]
-myCol[sel] <- pCol[1]          # replace this one color
+myCol[sel] <- pCol[1]          # highlight the MYSPE order with pCol[1]
-
+
-myLbl <- rep("", N)            # N labels of ""
+myLbl <- rep("", N)            # N labels of ""
-myLbl[sel] <- myOr             # replace this one label with the MYSPE order
+myLbl[sel] <- myOr             # replace this one label with the MYSPE order
-
+
-
+
-oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))   # set margins to ~ 0
+oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))   # set margins to ~ 0
-
+
-pie(myTbl,
+pie(myTbl,
-    labels = myLbl,
+    labels = myLbl,
-    radius = 0.9,
+    radius = 0.9,
-    main = "MYSPE order",
+    main = "MYSPE order",
-    border = "#DDDDDD",
+    border = "#DDDDDD",
-    col = myCol,
+    col = myCol,
-    clockwise = TRUE,
+    clockwise = TRUE,
-    init.angle = 90)
+    init.angle = 90)
-
+
-par(oPar)                                  # reset graphics state
+par(oPar)                                  # reset graphics state
-
+
-# But the overall problem remains.
+# But the overall problem remains.
-
+
-
+
-# ==   4.3  Visualizing proportions: Nested squares  ===========================
+# ==   4.3  Visualizing proportions: Nested squares  ===========================
-
+
-# A simple alternative is to draw such proportions as nested squares:
+# A simple alternative is to draw such proportions as nested squares:
-
+
-x <- sqrt(nFungi)
+x <- sqrt(nFungi)
-
+
-# set margins to ~ 0 and type to square
+# set margins to ~ 0 and type to square
-oPar <- par(mar = c(0.1, 0.1, 0.1, 0.1), pty = "s")
+oPar <- par(mar = c(0.1, 0.1, 0.1, 0.1), pty = "s")
-
+
-# empty, square plot
+# empty, square plot
-plot(c(0, x), c(0, x), xlim = c(0, x), ylim = c(0, x),
+plot(c(0, x), c(0, x), xlim = c(0, x), ylim = c(0, x),
-     type="n", axes=FALSE, xlab="", ylab="")
+     type="n", axes=FALSE, xlab="", ylab="")
-
+
-# basic square for all genomes
+# basic square for all genomes
-rect(0, 0, x,              x,              col = pCol[4])
+rect(0, 0, x,              x,              col = pCol[4])
-
+
-# grid
+# grid
-u <- 0:floor(x)
+u <- 0:floor(x)
-N <- length(u)
+N <- length(u)
-segments(rep(0, N), u, rep(x, N), u, col = "#0000FF18")
+segments(rep(0, N), u, rep(x, N), u, col = "#0000FF18")
-segments(u, rep(0, N), u, rep(x, N), col = "#0000FF18")
+segments(u, rep(0, N), u, rep(x, N), col = "#0000FF18")
-# each square on this grid is one genome
+# each square on this grid is one genome
-
+
-# colored squares
+# colored squares
-rect(0, 0, sqrt(nOrder),   sqrt(nOrder),   col = pCol[3])
+rect(0, 0, sqrt(nOrder),   sqrt(nOrder),   col = pCol[3])
-rect(0, 0, sqrt(nGenus),   sqrt(nGenus),   col = pCol[2])
+rect(0, 0, sqrt(nGenus),   sqrt(nGenus),   col = pCol[2])
-rect(0, 0, sqrt(nSpecies), sqrt(nSpecies), col = pCol[1])
+rect(0, 0, sqrt(nSpecies), sqrt(nSpecies), col = pCol[1])
-
+
-# labels
+# labels
-text(x/2, x/2,      "Fungi")
+text(x/2, x/2,      "Fungi")
-text(x * 0.08, x * 0.11, myOr,   pos = 4, cex = 0.9)
+text(x * 0.08, x * 0.11, myOr,   pos = 4, cex = 0.9)
-text(x * 0.08, x * 0.06, myGn,   pos = 4, cex = 0.8)
+text(x * 0.08, x * 0.06, myGn,   pos = 4, cex = 0.8)
-text(x * 0.08, x * 0.02, MYSPE, pos = 4, cex = 0.7)
+text(x * 0.08, x * 0.02, MYSPE, pos = 4, cex = 0.7)
-
+
-par(oPar)                                  # reset graphics state
+par(oPar)                                  # reset graphics state
-
+
-
+
-# ==   4.4  Visualizing proportions: Sankey diagrams  ==========================
+# ==   4.4  Visualizing proportions: Sankey diagrams  ==========================
-
+
-# Sankey diagrams are an excellent way to visualize complicated nested
+# Sankey diagrams are an excellent way to visualize complicated nested
-# proportions and their changes (see here for example:
+# proportions and their changes (see here for example:
-# https://www.r-graph-gallery.com/sankey-diagram.html). Here is a very simple
+# https://www.r-graph-gallery.com/sankey-diagram.html). Here is a very simple
-# example with the MYSPE proportions, as an illustration of the plotting
+# example with the MYSPE proportions, as an illustration of the plotting
-# principle.
+# principle.
-
+
-if (! requireNamespace("plotly")) {
+if (! requireNamespace("plotly")) {
-  install.packages("plotly")
+  install.packages("plotly")
-}
+}
-# Package information:
+# Package information:
-#  library(help   = plotly)     # basic information
+#  library(help   = plotly)     # basic information
-#  browseVignettes("plotly")    # available vignettes
+#  browseVignettes("plotly")    # available vignettes
-#  data(package  = "plotly")    # available datasets
+#  data(package  = "plotly")    # available datasets
-
+
-# Here, we use the plotly package that wraps a very well developed javascript
+# Here, we use the plotly package that wraps a very well developed javascript
-# library with many options for interactive plots. I am producing this plot
+# library with many options for interactive plots. I am producing this plot
-# hard-coded for the sample organism "Sporothrix schenckii"; you would need
+# hard-coded for the sample organism "Sporothrix schenckii"; you would need
-# to change the code to adapt it to your own MYSPE - or even build a function
+# to change the code to adapt it to your own MYSPE - or even build a function
-# for this. Do try this if you have a bit of coding experience, sankey diagrams
+# for this. Do try this if you have a bit of coding experience, sankey diagrams
-# are a good way to show hierarchical data relations - and if you get this
+# are a good way to show hierarchical data relations - and if you get this
-# working for your own organism you can be proud that you have understood
+# working for your own organism you can be proud that you have understood
-# how preparing the data works.
+# how preparing the data works.
-
+
-
+
-myNodes <- list(label = c("Fungi (1014)",              # 0 <- node ID
+myNodes <- list(label = c("Fungi (1014)",              # 0 <- node ID
-                          "Ophiostomatales (6)",       # 1
+                          "Ophiostomatales (6)",       # 1
-                          "Other...",                  # 2
+                          "Other...",                  # 2
-                          "Sporothrix (4)",            # 3
+                          "Sporothrix (4)",            # 3
-                          "Other...",                  # 4
+                          "Other...",                  # 4
-                          "Sporothrix schenckii (2)",  # 5
+                          "Sporothrix schenckii (2)",  # 5
-                          "Other..."                   # 6
+                          "Other..."                   # 6
-                          ),
+                          ),
-                x = c(0.1, 0.4, 0.4, 0.7, 0.7, 1.0, 1.0),
+                x = c(0.1, 0.4, 0.4, 0.7, 0.7, 1.0, 1.0),
-                y = c(0.3, 0.1, 0.7, 0.2, 0.7, 0.3, 0.7),
+                y = c(0.3, 0.1, 0.7, 0.2, 0.7, 0.3, 0.7),
-                color = c("#f2f2f0", #
+                color = c("#f2f2f0", #
-                          "#ffd5c4",
+                          "#ffd5c4",
-                          "#CCCCCC",
+                          "#CCCCCC",
-                          "#ff9582",
+                          "#ff9582",
-                          "#CCCCCC",
+                          "#CCCCCC",
-                          "#ed394e",
+                          "#ed394e",
-                          "#CCCCCC"
+                          "#CCCCCC"
-                          ),
+                          ),
-                pad = 15,
+                pad = 15,
-                thickness = 20,
+                thickness = 20,
-                line = list(color = "black",
+                line = list(color = "black",
-                            width = 0.5))
+                            width = 0.5))
-
+
-myLinks <- list(source = c(0, 0, 1, 1, 3, 3),   # i.e. there is a link of
+myLinks <- list(source = c(0, 0, 1, 1, 3, 3),   # i.e. there is a link of
-                target = c(1, 2, 3, 4, 5, 6),   # weight 6 between node 0
+                target = c(1, 2, 3, 4, 5, 6),   # weight 6 between node 0
-                value =  c(6, 18, 4, 2, 2, 2))  # and node 1
+                value =  c(6, 18, 4, 2, 2, 2))  # and node 1
-
+
-# Setting up the actual plot ...
+# Setting up the actual plot ...
-fig  <-  plotly::plot_ly(type = "sankey",
+fig  <-  plotly::plot_ly(type = "sankey",
-                         arrangement = "snap",
+                         arrangement = "snap",
-                         orientation = "h",
+                         orientation = "h",
-                         node = myNodes,
+                         node = myNodes,
-                         link = myLinks)
+                         link = myLinks)
-
+
-# Adding and adjusting a few layout parameters
+# Adding and adjusting a few layout parameters
-fig <- plotly::layout(fig,
+fig <- plotly::layout(fig,
-              title = "Fungi Genomes - Classification",
+              title = "Fungi Genomes - Classification",
-              font = list(size = 10))
+              font = list(size = 10))
-
+
-fig     # plot the diagram
+fig     # plot the diagram
-
+
-# Note that the plot appears in the Viewer window, not the Plot window, and that
+# Note that the plot appears in the Viewer window, not the Plot window, and that
-# it is interactive: you can hover over nodes and links, and drag the nodes
+# it is interactive: you can hover over nodes and links, and drag the nodes
-# around.
+# around.
-
+
-# [END]
+# [END]
--- a/BIN-PHYLO-Data_preparation.R
+++ b/BIN-PHYLO-Data_preparation.R
@ -1,234 +1,234 @@
-# tocID <- "BIN-PHYLO-Data_preparation.R"
+# tocID <- "BIN-PHYLO-Data_preparation.R"
-#
+#
-# Purpose:  A Bioinformatics Course:
+# Purpose:  A Bioinformatics Course:
-#              R code accompanying the BIN-PHYLO-Data_preparation unit.
+#              R code accompanying the BIN-PHYLO-Data_preparation unit.
-#
+#
-# Version:  1.2
+# Version:  1.2
-#
+#
-# Date:     2017-10  -  2020-09
+# Date:     2017-10  -  2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions:
+# Versions:
-#           1.2    2020 Maintenance
+#           1.2    2020 Maintenance
-#           1.1    Change from require() to requireNamespace(),
+#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout,
+#                      use <package>::<function>() idiom throughout,
-#                      use BiocManager:: not biocLite()
+#                      use BiocManager:: not biocLite()
-#           1.0    First 2017 version
+#           1.0    First 2017 version
-#           0.1    First code copied from 2016 material.
+#           0.1    First code copied from 2016 material.
-#
+#
-#
+#
-# TODO:
+# TODO:
-#
+#
-#
+#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
+#
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
+# going on. That's not how it works ...
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC> 
+#TOC> 
-#TOC>   Section  Title                                     Line
+#TOC>   Section  Title                                     Line
-#TOC> ---------------------------------------------------------
+#TOC> ---------------------------------------------------------
-#TOC>   1        Preparations                                45
+#TOC>   1        Preparations                                45
-#TOC>   2        Fetching sequences                          77
+#TOC>   2        Fetching sequences                          77
-#TOC>   3        Multiple Sequence Alignment                118
+#TOC>   3        Multiple Sequence Alignment                118
-#TOC>   4        Reviewing and Editing Alignments           137
+#TOC>   4        Reviewing and Editing Alignments           137
-#TOC>   4.1        Masking workflow                         153
+#TOC>   4.1        Masking workflow                         153
-#TOC> 
+#TOC> 
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  Preparations  ========================================================
+# =    1  Preparations  ========================================================
-
+
-
+
-# You need to reload your protein database, including changes that might have
+# You need to reload your protein database, including changes that might have
-# been made to the reference files. If you have worked with the prerequisite
+# been made to the reference files. If you have worked with the prerequisite
-# units, you should have a script named "makeProteinDB.R" that will create the
+# units, you should have a script named "makeProteinDB.R" that will create the
-# myDB object with a protein and feature database. Ask for advice if not.
+# myDB object with a protein and feature database. Ask for advice if not.
-source("myScripts/makeProteinDB.R")
+source("myScripts/makeProteinDB.R")
-
+
-# Load packages we need
+# Load packages we need
-
+
-if (! requireNamespace("BiocManager", quietly = TRUE)) {
+if (! requireNamespace("BiocManager", quietly = TRUE)) {
-  install.packages("BiocManager")
+  install.packages("BiocManager")
-}
+}
-if (! requireNamespace("Biostrings", quietly = TRUE)) {
+if (! requireNamespace("Biostrings", quietly = TRUE)) {
-  BiocManager::install("Biostrings")
+  BiocManager::install("Biostrings")
-}
+}
-# Package information:
+# Package information:
-#  library(help = Biostrings)       # basic information
+#  library(help = Biostrings)       # basic information
-#  browseVignettes("Biostrings")    # available vignettes
+#  browseVignettes("Biostrings")    # available vignettes
-#  data(package = "Biostrings")     # available datasets
+#  data(package = "Biostrings")     # available datasets
-
+
-
+
-if (! requireNamespace("msa", quietly = TRUE)) {
+if (! requireNamespace("msa", quietly = TRUE)) {
-  BiocManager::install("msa")
+  BiocManager::install("msa")
-}
+}
-# Package information:
+# Package information:
-#  library(help = msa)       # basic information
+#  library(help = msa)       # basic information
-#  browseVignettes("msa")  # available vignettes
+#  browseVignettes("msa")  # available vignettes
-#  data(package = "msa")   # available datasets
+#  data(package = "msa")   # available datasets
-
+
-
+
-# =    2  Fetching sequences  ==================================================
+# =    2  Fetching sequences  ==================================================
-
+
-
+
-# myDB contains the ten Mbp1 orthologues from the reference species and the Mbp1
+# myDB contains the ten Mbp1 orthologues from the reference species and the Mbp1
-# RBM for MYSPE. We will construct a phylogenetic tree from the proteins' APSES
+# RBM for MYSPE. We will construct a phylogenetic tree from the proteins' APSES
-# domains. You have annotated their ranges as a feature. The following code
+# domains. You have annotated their ranges as a feature. The following code
-# retrieves the sequences from myDB. You have seen similar code in other units.
+# retrieves the sequences from myDB. You have seen similar code in other units.
-
+
-sel <- grep("^MBP1_", myDB$protein$name)
+sel <- grep("^MBP1_", myDB$protein$name)
-(proNames <- myDB$protein$name[sel])
+(proNames <- myDB$protein$name[sel])
-(proIDs <- myDB$protein$ID[sel])
+(proIDs <- myDB$protein$ID[sel])
-
+
-(sel <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
+(sel <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
-(fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID %in% proIDs & # %in% !
+(fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID %in% proIDs & # %in% !
-                              myDB$annotation$featureID == sel])      #  ==  !
+                              myDB$annotation$featureID == sel])      #  ==  !
-                                                                      # Why?
+                                                                      # Why?
-APSI <- character(length(fanIDs))
+APSI <- character(length(fanIDs))
-
+
-for (i in seq_along(fanIDs)) {
+for (i in seq_along(fanIDs)) {
-  sel   <- myDB$annotation$ID == fanIDs[i]  # get the feature row index
+  sel   <- myDB$annotation$ID == fanIDs[i]  # get the feature row index
-  proID <- myDB$annotation$proteinID[sel]   # get its protein ID
+  proID <- myDB$annotation$proteinID[sel]   # get its protein ID
-  start <- myDB$annotation$start[sel]       # get start ...
+  start <- myDB$annotation$start[sel]       # get start ...
-  end   <- myDB$annotation$end[sel]         # ... and end
+  end   <- myDB$annotation$end[sel]         # ... and end
-
+
-  sel <- myDB$protein$ID == proID           # get the protein row index ...
+  sel <- myDB$protein$ID == proID           # get the protein row index ...
-                                            # ... and the sequence
+                                            # ... and the sequence
-  APSI[i] <- substring(myDB$protein$sequence[sel], start, end)
+  APSI[i] <- substring(myDB$protein$sequence[sel], start, end)
-  names(APSI)[i] <- (myDB$protein$name[sel])
+  names(APSI)[i] <- (myDB$protein$name[sel])
-}
+}
-
+
-head(APSI)
+head(APSI)
-
+
-# Let's add the E.coli Kila-N domain sequence as an outgroup, for rooting our
+# Let's add the E.coli Kila-N domain sequence as an outgroup, for rooting our
-# phylogenetic tree (see the unit's Wiki page for details on the sequence).
+# phylogenetic tree (see the unit's Wiki page for details on the sequence).
-
+
-APSI <- c(APSI,
+APSI <- c(APSI,
-"IDGEIIHLRAKDGYINATSMCRTAGKLLSDYTRLKTTQEFFDELSRDMGIPISELIQSFKGGRPENQGTWVHPDIAINLAQ")
+"IDGEIIHLRAKDGYINATSMCRTAGKLLSDYTRLKTTQEFFDELSRDMGIPISELIQSFKGGRPENQGTWVHPDIAINLAQ")
-names(APSI)[length(APSI)] <- "KILA_ESCCO"
+names(APSI)[length(APSI)] <- "KILA_ESCCO"
-tail(APSI)
+tail(APSI)
-
+
-
+
-# =    3  Multiple Sequence Alignment  =========================================
+# =    3  Multiple Sequence Alignment  =========================================
-
+
-# This vector of sequences with named elements fulfills the requirements to be
+# This vector of sequences with named elements fulfills the requirements to be
-# imported as a Biostrings object - an AAStringSet - which we need as input for
+# imported as a Biostrings object - an AAStringSet - which we need as input for
-# the MSA algorithms in the msa package.
+# the MSA algorithms in the msa package.
-#
+#
-
+
-APSESSet <- Biostrings::AAStringSet(APSI)
+APSESSet <- Biostrings::AAStringSet(APSI)
-APSESMsa <- msa::msaMuscle(APSESSet, order = "aligned")
+APSESMsa <- msa::msaMuscle(APSESSet, order = "aligned")
-
+
-# Nb. msaMuscle() sometimes fails - reproducibly, but I am not sure why. If
+# Nb. msaMuscle() sometimes fails - reproducibly, but I am not sure why. If
-# that happens in your case, just use msaClustalOmega() instead.
+# that happens in your case, just use msaClustalOmega() instead.
-
+
-# inspect the alignment.
+# inspect the alignment.
-writeALN(APSESMsa)
+writeALN(APSESMsa)
-
+
-# What do you think? Is this a good alignment for phylogenetic inference?
+# What do you think? Is this a good alignment for phylogenetic inference?
-
+
-
+
-# =    4  Reviewing and Editing Alignments  ====================================
+# =    4  Reviewing and Editing Alignments  ====================================
-
+
-
+
-# Head back to the Wiki page for this unit and read up on the background
+# Head back to the Wiki page for this unit and read up on the background
-# first.
+# first.
-
+
-# Let's mask out all columns that have observations for
+# Let's mask out all columns that have observations for
-# less than 1/3 of the sequences in the dataset. This
+# less than 1/3 of the sequences in the dataset. This
-# means they have more than round(nrow(APSESMsa) * (2/3))
+# means they have more than round(nrow(APSESMsa) * (2/3))
-# hyphens in a column.
+# hyphens in a column.
-#
+#
-# We take all sequences, split them into single
+# We take all sequences, split them into single
-# characters, and put them into a matrix. Then we
+# characters, and put them into a matrix. Then we
-# go through the matrix, column by column and decide
+# go through the matrix, column by column and decide
-# whether we want to include that column.
+# whether we want to include that column.
-
+
-# ==   4.1  Masking workflow  ==================================================
+# ==   4.1  Masking workflow  ==================================================
-
+
-# get the length of the alignment
+# get the length of the alignment
-(lenAli <- APSESMsa@unmasked@ranges@width[1])
+(lenAli <- APSESMsa@unmasked@ranges@width[1])
-
+
-# initialize a matrix that can hold all characters
+# initialize a matrix that can hold all characters
-# individually
+# individually
-msaMatrix <- matrix(character(nrow(APSESMsa) * lenAli),
+msaMatrix <- matrix(character(nrow(APSESMsa) * lenAli),
-                    ncol = lenAli)
+                    ncol = lenAli)
-
+
-# assign the correct rownames
+# assign the correct rownames
-rownames(msaMatrix) <- APSESMsa@unmasked@ranges@NAMES
+rownames(msaMatrix) <- APSESMsa@unmasked@ranges@NAMES
-for (i in 1:nrow(APSESMsa)) {
+for (i in 1:nrow(APSESMsa)) {
-  msaMatrix[i, ] <- unlist(strsplit(as.character(APSESMsa@unmasked[i]), ""))
+  msaMatrix[i, ] <- unlist(strsplit(as.character(APSESMsa@unmasked[i]), ""))
-}
+}
-
+
-# inspect the result
+# inspect the result
-msaMatrix[1:7, 30:40]
+msaMatrix[1:7, 30:40]
-
+
-# Now let's make a logical vector with an element for each column that selects
+# Now let's make a logical vector with an element for each column that selects
-# which columns should be masked out.
+# which columns should be masked out.
-
+
-# The number of hyphens in a column is easy to count. Consider:
+# The number of hyphens in a column is easy to count. Consider:
-
+
-    msaMatrix[ , 20]             # column 20
+    msaMatrix[ , 20]             # column 20
-    msaMatrix[ , 20] == "-"      # TRUE for all gap characters
+    msaMatrix[ , 20] == "-"      # TRUE for all gap characters
-sum(msaMatrix[ , 20] == "-")     # adds 1 for each TRUE
+sum(msaMatrix[ , 20] == "-")     # adds 1 for each TRUE
-
+
-# Thus filling our logical vector is simple:
+# Thus filling our logical vector is simple:
-
+
-# initialize a mask
+# initialize a mask
-colMask <- logical(ncol(msaMatrix))
+colMask <- logical(ncol(msaMatrix))
-
+
-# define the threshold for rejecting a column
+# define the threshold for rejecting a column
-limit <- round(nrow(APSESMsa) * (2/3))
+limit <- round(nrow(APSESMsa) * (2/3))
-
+
-# iterate over all columns, and write TRUE if there are less-or-equal to "limit"
+# iterate over all columns, and write TRUE if there are less-or-equal to "limit"
-# hyphens, FALSE if there are more - i.e. TRUE columns will be used for analysis
+# hyphens, FALSE if there are more - i.e. TRUE columns will be used for analysis
-# and FALSE columns will be rejected.
+# and FALSE columns will be rejected.
-for (i in 1:ncol(msaMatrix)) {
+for (i in 1:ncol(msaMatrix)) {
-  count <- sum(msaMatrix[ , i] == "-")
+  count <- sum(msaMatrix[ , i] == "-")
-  colMask[i] <- count <= limit # TRUE if less-or-equal to limit, FALSE if not
+  colMask[i] <- count <= limit # TRUE if less-or-equal to limit, FALSE if not
-}
+}
-
+
-# Inspect the mask
+# Inspect the mask
-colMask
+colMask
-
+
-# How many positions are being kept?
+# How many positions are being kept?
-sum(colMask)
+sum(colMask)
-
+
-cat(sprintf("We are masking %4.2f %% of alignment columns.\n",
+cat(sprintf("We are masking %4.2f %% of alignment columns.\n",
-            100 * (1 - (sum(colMask) / length(colMask)))))
+            100 * (1 - (sum(colMask) / length(colMask)))))
-
+
-
+
-# Next, we use colMask to remove the masked columns from the matrix
+# Next, we use colMask to remove the masked columns from the matrix
-# in one step:
+# in one step:
-maskedMatrix <- msaMatrix[ , colMask]
+maskedMatrix <- msaMatrix[ , colMask]
-
+
-# check:
+# check:
-ncol(maskedMatrix)
+ncol(maskedMatrix)
-
+
-# ... then collapse each row of single characters back into a string ...
+# ... then collapse each row of single characters back into a string ...
-APSESphyloSet <- character()
+APSESphyloSet <- character()
-for (i in 1:nrow(maskedMatrix)) {
+for (i in 1:nrow(maskedMatrix)) {
-  APSESphyloSet[i] <- paste(maskedMatrix[i, ], collapse="")
+  APSESphyloSet[i] <- paste(maskedMatrix[i, ], collapse="")
-}
+}
-names(APSESphyloSet) <- rownames(maskedMatrix)
+names(APSESphyloSet) <- rownames(maskedMatrix)
-
+
-# inspect ...
+# inspect ...
-writeALN(APSESphyloSet)
+writeALN(APSESphyloSet)
-
+
-# As you see, we have removed a three residue insertion from MBP1_NEUCR, and
+# As you see, we have removed a three residue insertion from MBP1_NEUCR, and
-# several indels from the KILA_ESCCO outgroup sequence.
+# several indels from the KILA_ESCCO outgroup sequence.
-
+
-
+
-# We save the aligned, masked domains to a file in the data/ directory,
+# We save the aligned, masked domains to a file in the data/ directory,
-# in multi-FASTA format.
+# in multi-FASTA format.
-writeMFA(APSESphyloSet, myCon = "data/APSESphyloSet.mfa")
+writeMFA(APSESphyloSet, myCon = "data/APSESphyloSet.mfa")
-
+
-
+
-
+
-# [END]
+# [END]
--- a/BIN-PHYLO-Tree_analysis.R
+++ b/BIN-PHYLO-Tree_analysis.R
@ -1,406 +1,406 @@
-# tocID <- "BIN-PHYLO-Tree_analysis.R"
+# tocID <- "BIN-PHYLO-Tree_analysis.R"
-#
+#
-# Purpose:  A Bioinformatics Course:
+# Purpose:  A Bioinformatics Course:
-#              R code accompanying the BIN-PHYLO-Tree_analysis unit.
+#              R code accompanying the BIN-PHYLO-Tree_analysis unit.
-#
+#
-# Version:  1.2
+# Version:  1.2
-#
+#
-# Date:     2017-10  -  2020-09
+# Date:     2017-10  -  2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions:
+# Versions:
-#           1.2    2020 updates. Deprecate iTol and use taxize:: instead.
+#           1.2    2020 updates. Deprecate iTol and use taxize:: instead.
-#                  Rewrite of tip re-ordering. Better handling of
+#                  Rewrite of tip re-ordering. Better handling of
-#                  messages. pBar() for randomization.
+#                  messages. pBar() for randomization.
-#           1.1    Change from require() to requireNamespace(),
+#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout,
+#                      use <package>::<function>() idiom throughout,
-#                      use BiocManager:: not biocLite()
+#                      use BiocManager:: not biocLite()
-#           1.0.2  Typo in variable name, style changes
+#           1.0.2  Typo in variable name, style changes
-#           1.0.1  Wrong section heading
+#           1.0.1  Wrong section heading
-#           1.0    First 2017 version
+#           1.0    First 2017 version
-#           0.1    First code copied from 2016 material.
+#           0.1    First code copied from 2016 material.
-#
+#
-#
+#
-# TODO:
+# TODO:
-#
+#
-#
+#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
+#
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
+# going on. That's not how it works ...
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC> 
+#TOC> 
-#TOC>   Section  Title                              Line
+#TOC>   Section  Title                              Line
-#TOC> --------------------------------------------------
+#TOC> --------------------------------------------------
-#TOC>   1        Preparation and Tree Plot            50
+#TOC>   1        Preparation and Tree Plot            50
-#TOC>   2        SPECIES REFERENCE TREE               66
+#TOC>   2        SPECIES REFERENCE TREE               66
-#TOC>   3        Tree Analysis                       117
+#TOC>   3        Tree Analysis                       117
-#TOC>   3.1        Rooting Trees                     177
+#TOC>   3.1        Rooting Trees                     177
-#TOC>   3.2        Rotating Clades                   222
+#TOC>   3.2        Rotating Clades                   222
-#TOC>   3.3        Computing tree distances          309
+#TOC>   3.3        Computing tree distances          309
-#TOC> 
+#TOC> 
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  Preparation and Tree Plot  ===========================================
+# =    1  Preparation and Tree Plot  ===========================================
-
+
-
+
-if (! requireNamespace("ape", quietly = TRUE)) {
+if (! requireNamespace("ape", quietly = TRUE)) {
-  install.packages("ape")
+  install.packages("ape")
-}
+}
-# Package information:
+# Package information:
-#  library(help = ape)       # basic information
+#  library(help = ape)       # basic information
-#  browseVignettes("ape")    # available vignettes
+#  browseVignettes("ape")    # available vignettes
-#  data(package = "ape")     # available datasets
+#  data(package = "ape")     # available datasets
-
+
-# We change the graphics parameters from time to time, let's define the
+# We change the graphics parameters from time to time, let's define the
-# default so we can recreate a sane state:
+# default so we can recreate a sane state:
-dev.off()
+dev.off()
-PAR <- par()
+PAR <- par()
-
+
-# =    2  SPECIES REFERENCE TREE  ==============================================
+# =    2  SPECIES REFERENCE TREE  ==============================================
-
+
-# Before we do any kind of phylogenetic analysis of genes from several species,
+# Before we do any kind of phylogenetic analysis of genes from several species,
-# we MUST have a reference tree of the taxonomic relationships in hand. This
+# we MUST have a reference tree of the taxonomic relationships in hand. This
-# context is absolutely required for the interpretation of our tree.
+# context is absolutely required for the interpretation of our tree.
-
+
-# We have the tax-ids in our database, and the NCBI has the species tree - we just need some way to extract the subtree that corresponds to our taxons of interest. Here's how to use the taxize:: package.
+# We have the tax-ids in our database, and the NCBI has the species tree - we just need some way to extract the subtree that corresponds to our taxons of interest. Here's how to use the taxize:: package.
-
+
-if (! requireNamespace("taxize", quietly = TRUE)) {
+if (! requireNamespace("taxize", quietly = TRUE)) {
-  install.packages("taxize")
+  install.packages("taxize")
-}
+}
-# Package information:
+# Package information:
-#  library(help   = taxize)       # basic information
+#  library(help   = taxize)       # basic information
-#  browseVignettes("taxize")    # available vignettes
+#  browseVignettes("taxize")    # available vignettes
-#  data(package  = "taxize")     # available datasets
+#  data(package  = "taxize")     # available datasets
-
+
-( mySOI <- c(myDB$taxonomy$ID, "83333") )
+( mySOI <- c(myDB$taxonomy$ID, "83333") )
-myClass <- taxize::classification(mySOI, db = "ncbi")
+myClass <- taxize::classification(mySOI, db = "ncbi")
-str(myClass)
+str(myClass)
-
+
-myClass[[1]]
+myClass[[1]]
-
+
-fungiTree <- taxize::class2tree(myClass, check = TRUE)
+fungiTree <- taxize::class2tree(myClass, check = TRUE)
-plot(fungiTree)
+plot(fungiTree)
-
+
-# The tree produced by taxize:: contains full length species names,
+# The tree produced by taxize:: contains full length species names,
-# but it would be more convenient if it had bicodes instead. Also, the actual
+# but it would be more convenient if it had bicodes instead. Also, the actual
-# tree is only part of the list(), which will cause problems later:
+# tree is only part of the list(), which will cause problems later:
-str(fungiTree)
+str(fungiTree)
-
+
-# we therefore simplify
+# we therefore simplify
-fungiTree <- fungiTree$phylo
+fungiTree <- fungiTree$phylo
-str(fungiTree)
+str(fungiTree)
-
+
-# The species names are in a vector $phylo$tip.label of this list.
+# The species names are in the vector $tip.label of this (simplified) object.
-# We can use biCode() to shorten them.
+# We can use biCode() to shorten them.
-fungiTree$tip.label <- biCode(fungiTree$tip.label)
+fungiTree$tip.label <- biCode(fungiTree$tip.label)
-
+
-# Plot the tree
+# Plot the tree
-nSP <- length(fungiTree$tip.label)
+nSP <- length(fungiTree$tip.label)
-plot(fungiTree, cex = 0.8, root.edge = TRUE, no.margin = TRUE)
+plot(fungiTree, cex = 0.8, root.edge = TRUE, no.margin = TRUE)
-text(-1, nSP - 0.5, "Species Tree:\nFungi", pos = 4)
+text(-1, nSP - 0.5, "Species Tree:\nFungi", pos = 4)
-ape::nodelabels(text = fungiTree$node.label,
+ape::nodelabels(text = fungiTree$node.label,
-                cex = 0.6,
+                cex = 0.6,
-                adj = 0.2,
+                adj = 0.2,
-                bg = "#D4F2DA")
+                bg = "#D4F2DA")
-# Note that you can use the arrow buttons in the menu above the plot pane to
+# Note that you can use the arrow buttons in the menu above the plot pane to
-# scroll back to plots you have created earlier - so you can reference back to
+# scroll back to plots you have created earlier - so you can reference back to
-# this species tree in your later analysis.
+# this species tree in your later analysis.
-
+
-
+
-# =    3  Tree Analysis  =======================================================
+# =    3  Tree Analysis  =======================================================
-
+
-
+
-# 1.1  Visualizing your tree
+# 1.1  Visualizing your tree
-# The trees that are produced by Rphylip are stored as an object of class
+# The trees that are produced by Rphylip are stored as an object of class
-# "phylo". This is a class for phylogenetic trees that is widely used in the
+# "phylo". This is a class for phylogenetic trees that is widely used in the
-# community, practically all R phylogenetics packages will options to read and
+# community, practically all R phylogenetics packages will have options to read and
-# manipulate such trees. Outside of R, a popular interchange format is the
+# manipulate such trees. Outside of R, a popular interchange format is the
-# Newick_format that you have seen above. It's easy to output your calculated
+# Newick_format that you have seen above. It's easy to output your calculated
-# trees in Newick format and visualize them elsewhere.
+# trees in Newick format and visualize them elsewhere.
-
+
-# The "phylo" class object is one of R's "S3" objects and methods to plot and
+# The "phylo" class object is one of R's "S3" objects and methods to plot and
-# print it have been defined with the Rphylip package, and in ape. You can
+# print it have been defined with the Rphylip package, and in ape. You can
-# simply call plot(<your-tree>) and R knows what to do with <your-tree> and how
+# simply call plot(<your-tree>) and R knows what to do with <your-tree> and how
-# to plot it. The underlying function is plot.phylo(), and documentation for its
+# to plot it. The underlying function is plot.phylo(), and documentation for its
-# many options can by found by typing:
+# many options can be found by typing:
-
+
-?plot.phylo
+?plot.phylo
-
+
-# We load the APSES sequence tree that you produced in the
+# We load the APSES sequence tree that you produced in the
-# BIN-PHYLO-Tree_building unit:
+# BIN-PHYLO-Tree_building unit:
-apsTree <- readRDS(file = "data/APSEStreeRproml.rds")
+apsTree <- readRDS(file = "data/APSEStreeRproml.rds")
-
+
-plot(apsTree) # default type is "phylogram"
+plot(apsTree) # default type is "phylogram"
-plot(apsTree, type = "unrooted")
+plot(apsTree, type = "unrooted")
-plot(apsTree, type = "fan", no.margin = TRUE)
+plot(apsTree, type = "fan", no.margin = TRUE)
-
+
-# rescale to show all of the labels:
+# rescale to show all of the labels:
-# record the current plot parameters by assigning them to a variable ...
+# record the current plot parameters by assigning them to a variable ...
-(tmp <- plot(apsTree, type="fan", no.margin = TRUE, plot=FALSE))
+(tmp <- plot(apsTree, type="fan", no.margin = TRUE, plot=FALSE))
-# ... and adjust the plot limits for a new plot:
+# ... and adjust the plot limits for a new plot:
-plot(apsTree,
+plot(apsTree,
-     type = "fan",
+     type = "fan",
-     x.lim = tmp$x.lim * 1.8,
+     x.lim = tmp$x.lim * 1.8,
-     y.lim = tmp$y.lim * 1.8,
+     y.lim = tmp$y.lim * 1.8,
-     cex = 0.8,
+     cex = 0.8,
-     no.margin = TRUE)
+     no.margin = TRUE)
-
+
-# Inspect the tree object
+# Inspect the tree object
-str(apsTree)
+str(apsTree)
-apsTree$tip.label
+apsTree$tip.label
-apsTree$edge
+apsTree$edge
-apsTree$edge.length
+apsTree$edge.length
-
+
-# show the node / edge and tip labels on a plot
+# show the node / edge and tip labels on a plot
-plot(apsTree)
+plot(apsTree)
-ape::nodelabels()
+ape::nodelabels()
-ape::edgelabels()
+ape::edgelabels()
-ape::tiplabels()
+ape::tiplabels()
-
+
-# show the number of nodes, edges and tips
+# show the number of nodes, edges and tips
-ape::Nnode(apsTree)
+ape::Nnode(apsTree)
-ape::Nedge(apsTree)
+ape::Nedge(apsTree)
-ape::Ntip(apsTree)
+ape::Ntip(apsTree)
-
+
-par(PAR)   # reset graphics state
+par(PAR)   # reset graphics state
-
+
-# Finally, write the tree to console in Newick format
+# Finally, write the tree to console in Newick format
-ape::write.tree(apsTree)
+ape::write.tree(apsTree)
-
+
-# ==   3.1  Rooting Trees  =====================================================
+# ==   3.1  Rooting Trees  =====================================================
-
+
-# In order to analyse the tree, it is helpful to root it first and reorder its
+# In order to analyse the tree, it is helpful to root it first and reorder its
-# clades. Contrary to documentation, Rproml() returns an unrooted tree.
+# clades. Contrary to documentation, Rproml() returns an unrooted tree.
-
+
-ape::is.rooted(apsTree)
+ape::is.rooted(apsTree)
-
+
-# You can root the tree with the command root() from the "ape" package.
+# You can root the tree with the command root() from the "ape" package.
-
+
-plot(apsTree)
+plot(apsTree)
-
+
-# add labels for internal nodes and tips
+# add labels for internal nodes and tips
-ape::nodelabels(cex = 0.5, frame = "circle")
+ape::nodelabels(cex = 0.5, frame = "circle")
-ape::tiplabels(cex = 0.5, frame = "rect")
+ape::tiplabels(cex = 0.5, frame = "rect")
-
+
-# The outgroup of the tree (KILA ESCCO) is tip "11" in my sample tree, it may be a different
+# The outgroup of the tree (KILA ESCCO) is tip "11" in my sample tree, it may be a different
-# number in yours. Substitute the correct node number below for "outgroup".
+# number in yours. Substitute the correct node number below for "outgroup".
-apsTree <- ape::root(apsTree, outgroup = 11, resolve.root = TRUE)
+apsTree <- ape::root(apsTree, outgroup = 11, resolve.root = TRUE)
-plot(apsTree)
+plot(apsTree)
-ape::is.rooted(apsTree)
+ape::is.rooted(apsTree)
-
+
-# This tree _looks_ unchanged, beacuse when the root trifurcation was resolved,
+# This tree _looks_ unchanged, because when the root trifurcation was resolved,
-# an edge of length zero was added to connect the MRCA (Most Recent Common
+# an edge of length zero was added to connect the MRCA (Most Recent Common
-# Ancestor) of the ingroup.
+# Ancestor) of the ingroup.
-
+
-# The edge lengths are stored in the phylo object:
+# The edge lengths are stored in the phylo object:
-apsTree$edge.length
+apsTree$edge.length
-
+
-# ... and you can assign a small arbitrary value to the edge
+# ... and you can assign a small arbitrary value to the edge
-# to show how it connects to the tree without having an
+# to show how it connects to the tree without having an
-# overlap.
+# overlap.
-apsTree$edge.length[1] <- 0.1
+apsTree$edge.length[1] <- 0.1
-plot(apsTree, cex = 0.7)
+plot(apsTree, cex = 0.7)
-ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866")
+ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866")
-
+
-
+
-# This procedure does however not assign an actual length to a root edge, and
+# This procedure does however not assign an actual length to a root edge, and
-# therefore no root edge is visible on the plot. Why? , you might ask. I ask
+# therefore no root edge is visible on the plot. Why? , you might ask. I ask
-# myself that too. We'll just add a length by hand.
+# myself that too. We'll just add a length by hand.
-
+
-apsTree$root.edge <- mean(apsTree$edge.length) * 1.5
+apsTree$root.edge <- mean(apsTree$edge.length) * 1.5
-plot(apsTree, cex = 0.7, root.edge = TRUE)
+plot(apsTree, cex = 0.7, root.edge = TRUE)
-ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866")
+ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866")
-
+
-
+
-# ==   3.2  Rotating Clades  ===================================================
+# ==   3.2  Rotating Clades  ===================================================
-
+
-# To interpret the tree, it is useful to rotate the clades so that they appear
+# To interpret the tree, it is useful to rotate the clades so that they appear
-# in the order expected from the cladogram of species.
+# in the order expected from the cladogram of species.
-
+
-# We can either rotate around individual internal nodes ...
+# We can either rotate around individual internal nodes ...
-layout(matrix(1:2, 1, 2))
+layout(matrix(1:2, 1, 2))
-plot(apsTree, no.margin = TRUE, root.edge = TRUE)
+plot(apsTree, no.margin = TRUE, root.edge = TRUE)
-ape::nodelabels(node = 13, cex = 0.7, bg = "#ff8866")
+ape::nodelabels(node = 13, cex = 0.7, bg = "#ff8866")
-plot(ape::rotate(apsTree, node = 13), no.margin = TRUE, root.edge = TRUE)
+plot(ape::rotate(apsTree, node = 13), no.margin = TRUE, root.edge = TRUE)
-ape::nodelabels(node = 13, cex = 0.7, bg = "#88ff66")
+ape::nodelabels(node = 13, cex = 0.7, bg = "#88ff66")
-# Note that the species at the bottom of the clade descending from node
+# Note that the species at the bottom of the clade descending from node
-# 17 is now plotted at the top.
+# 17 is now plotted at the top.
-
+
-par(PAR)   # reset graphics state
+par(PAR)   # reset graphics state
-
+
-# ... or we can rearrange the tree so it corresponds as well as possible to a
+# ... or we can rearrange the tree so it corresponds as well as possible to a
-# predefined tip ordering. Here we use the ordering that taxize:: has inferred
+# predefined tip ordering. Here we use the ordering that taxize:: has inferred
-# from the NCBI taxonomic classification.
+# from the NCBI taxonomic classification.
-
+
-nOrg <- length(apsTree$tip.label)
+nOrg <- length(apsTree$tip.label)
-
+
-plot(fungiTree,
+plot(fungiTree,
-     no.margin = FALSE, root.edge = TRUE)
+     no.margin = FALSE, root.edge = TRUE)
-ape::nodelabels(text = fungiTree$node.label,
+ape::nodelabels(text = fungiTree$node.label,
-                cex = 0.5,
+                cex = 0.5,
-                adj = 0.2,
+                adj = 0.2,
-                bg = "#D4F2DA")
+                bg = "#D4F2DA")
-
+
-# These are the fungi tree tips ...
+# These are the fungi tree tips ...
-fungiTree$tip.label
+fungiTree$tip.label
-# ... and their order is determined by the edge-list that is stored in
+# ... and their order is determined by the edge-list that is stored in
-fungiTree$edge
+fungiTree$edge
-# which edges join the tips?
+# which edges join the tips?
-ape::tiplabels(cex = 0.5, frame = "rect")
+ape::tiplabels(cex = 0.5, frame = "rect")
-# as you can see, the tips (range [1:nOrg] ) are in column 2 and they are
+# as you can see, the tips (range [1:nOrg] ) are in column 2 and they are
-# ordered from bottom to top.
+# ordered from bottom to top.
-# And each tip number is the index of the species in the tip.label vector. So we can take column 2, subset it, and use it to get a list of species in the order of the tree ...
+# And each tip number is the index of the species in the tip.label vector. So we can take column 2, subset it, and use it to get a list of species in the order of the tree ...
-
+
-sel <- fungiTree$edge[ , 2 ] <= nOrg
+sel <- fungiTree$edge[ , 2 ] <= nOrg
-( oSp <- fungiTree$tip.label[fungiTree$edge[sel , 2 ]] )
+( oSp <- fungiTree$tip.label[fungiTree$edge[sel , 2 ]] )
-
+
-# Now, here are the genes of the apsTree tips ...
+# Now, here are the genes of the apsTree tips ...
-apsTree$tip.label
+apsTree$tip.label
-
+
-# ... and the "constraint"  we need for reordering, according to the help page
+# ... and the "constraint"  we need for reordering, according to the help page
-# of ape::rotateConstr(), is "a vector specifying the order of the tips as they
+# of ape::rotateConstr(), is "a vector specifying the order of the tips as they
-# should appear (from bottom to top)". Thus we need to add the "MBP1_" prefix to our vector
+# should appear (from bottom to top)". Thus we need to add the "MBP1_" prefix to our vector
-oSp <- gsub("^", "MBP1_", oSp)
+oSp <- gsub("^", "MBP1_", oSp)
-( oSp <- gsub("MBP1_ESSCO", "KILA_ESCCO", oSp) )
+# The outgroup tip is labelled "KILA_ESCCO" in apsTree (see the rooting
+# section), so the prefixed bicode that has to be replaced is "MBP1_ESCCO".
+( oSp <- gsub("MBP1_ESCCO", "KILA_ESCCO", oSp) )
-
+
-# Then we can plot the two trees to compare: the fungi- tree
+# Then we can plot the two trees to compare: the fungi- tree
-par(PAR)   # reset graphics state
+par(PAR)   # reset graphics state
-layout(matrix(1:2, 1, 2))
+layout(matrix(1:2, 1, 2))
-plot(fungiTree,
+plot(fungiTree,
-    no.margin = TRUE,
+    no.margin = TRUE,
-     root.edge = TRUE)
+     root.edge = TRUE)
-ape::nodelabels(text = fungiTree$node.label,
+ape::nodelabels(text = fungiTree$node.label,
-                cex = 0.5,
+                cex = 0.5,
-                adj = 0.2,
+                adj = 0.2,
-                bg = "#D4F2DA")
+                bg = "#D4F2DA")
-
+
-# and the re-organized apsesTree ...
+# and the re-organized apsesTree ...
-plot(ape::rotateConstr(apsTree, constraint = oSp[]),
+plot(ape::rotateConstr(apsTree, constraint = oSp[]),
-     no.margin = TRUE,
+     no.margin = TRUE,
-     root.edge = TRUE)
+     root.edge = TRUE)
-
+
-par(PAR)   # reset graphics state
+par(PAR)   # reset graphics state
-
+
-# As you can see, the reordering is not perfect, since the topologies are
+# As you can see, the reordering is not perfect, since the topologies are
-# different, mostly due to the unresolved nodes in the reference tree. One
+# different, mostly due to the unresolved nodes in the reference tree. One
-# could play with that ...
+# could play with that ...
-
+
-
+
-# Task: Study the two trees and consider their similarities and differences.
+# Task: Study the two trees and consider their similarities and differences.
-#         What do you expect? What do you find? Note that this is not a "mixed"
+#         What do you expect? What do you find? Note that this is not a "mixed"
-#         gene tree yet, since it contains only a single gene for the species
+#         gene tree yet, since it contains only a single gene for the species
-#         we considered. All of the branch points in this tree are speciation
+#         we considered. All of the branch points in this tree are speciation
-#         events. Thus the gene tree should have the same topology as the
+#         events. Thus the gene tree should have the same topology as the
-#         species tree. Does it? Are the differences important? How many
+#         species tree. Does it? Are the differences important? How many
-#         branches would you need to remove and reinsert elsewhere to get the
+#         branches would you need to remove and reinsert elsewhere to get the
-#         same topology as the species tree?
+#         same topology as the species tree?
-
+
-# In order to quantify how different these two trees are, we need to compute
+# In order to quantify how different these two trees are, we need to compute
-# tree distances.
+# tree distances.
-
+
-
+
-# ==   3.3  Computing tree distances  ==========================================
+# ==   3.3  Computing tree distances  ==========================================
-
+
-
+
-# Many superb phylogeny tools are contributed by the phangorn package.
+# Many superb phylogeny tools are contributed by the phangorn package.
-
+
-if (! requireNamespace("phangorn", quietly = TRUE)) {
+if (! requireNamespace("phangorn", quietly = TRUE)) {
-  install.packages("phangorn")
+  install.packages("phangorn")
-}
+}
-# Package information:
+# Package information:
-#  library(help = phangorn)       # basic information
+#  library(help = phangorn)       # basic information
-#  browseVignettes("phangorn")    # available vignettes
+#  browseVignettes("phangorn")    # available vignettes
-#  data(package = "phangorn")     # available datasets
+#  data(package = "phangorn")     # available datasets
-
+
-# To compare two trees, they must have the same tip labels. We delete "MBP1_" or
+# To compare two trees, they must have the same tip labels. We delete "MBP1_" or
-# "KILA_" from the existing tip labels in a copy of our APSES domain tree.
+# "KILA_" from the existing tip labels in a copy of our APSES domain tree.
-apsTree2 <- apsTree
+apsTree2 <- apsTree
-apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label)
+apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label)
-
+
-
+
-# phangorn provides several functions to compute tree-differences (and there
+# phangorn provides several functions to compute tree-differences (and there
-# is a _whole_ lot of theory on how to compare trees). treedist() returns the
+# is a _whole_ lot of theory on how to compare trees). treedist() returns the
-# "symmetric difference"
+# "symmetric difference"
-phangorn::treedist(fungiTree, apsTree2, check.labels = TRUE)
+phangorn::treedist(fungiTree, apsTree2, check.labels = TRUE)
-
+
-# Numbers. What do they mean? How much more similar is our apsTree to the
+# Numbers. What do they mean? How much more similar is our apsTree to the
-# (presumably) ground truth of fungiTree than a random tree would be?
+# (presumably) ground truth of fungiTree than a random tree would be?
-# The ape package provides the function rtree()
+# The ape package provides the function rtree()
-# to compute random trees.
+# to compute random trees.
-
+
-ape::rtree(n = length(apsTree2$tip.label), # number of tips
+ape::rtree(n = length(apsTree2$tip.label), # number of tips
-          rooted = TRUE,                   # we rooted the tree above,
+          rooted = TRUE,                   # we rooted the tree above,
-                                           #  and fungiTree is rooted anyway
+                                           #  and fungiTree is rooted anyway
-          tip.label = apsTree2$tip.label,  # use the apsTree2 labels
+          tip.label = apsTree2$tip.label,  # use the apsTree2 labels
-          br = NULL)                       # don't generate branch lengths since
+          br = NULL)                       # don't generate branch lengths since
-                                           #   fungiTree has none, so we can't
+                                           #   fungiTree has none, so we can't
-                                           #   compare them anyway.
+                                           #   compare them anyway.
-
+
-# (Note the warning message about non-binary trees; we'll suppress that later
+# (Note the warning message about non-binary trees; we'll suppress that later
-#  by wrapping the function call in supressMessages(); we don't want to
+#  by wrapping the function call in suppressMessages(); we don't want to
-#  print it 10,000 times :-)
+#  print it 10,000 times :-)
-
+
-
+
-# Let's compute some random trees this way, calculate the distances to
+# Let's compute some random trees this way, calculate the distances to
-# fungiTree, and then compare the values we get for apsTree2. The random
+# fungiTree, and then compare the values we get for apsTree2. The random
-# trees are provided by ape::rtree().
+# trees are provided by ape::rtree().
-
+
-N <- 10000  # takes about 15 seconds, and we'll use the pBar function,
+N <- 10000  # takes about 15 seconds, and we'll use the pBar function,
-            # defined in .utilities.R  to keep track of where we are at:
+            # defined in .utilities.R  to keep track of where we are at:
-myTreeDistances <- matrix(numeric(N * 2), ncol = 2)
+myTreeDistances <- matrix(numeric(N * 2), ncol = 2)
-colnames(myTreeDistances) <- c("symm", "path")
+colnames(myTreeDistances) <- c("symm", "path")
-
+
-set.seed(112358)
+set.seed(112358)
-for (i in 1:N) {
+for (i in 1:N) {
-  pBar(i, N)
+  pBar(i, N)
-  xTree <- ape::rtree(n = length(apsTree2$tip.label),
+  xTree <- ape::rtree(n = length(apsTree2$tip.label),
-                      rooted = TRUE,
+                      rooted = TRUE,
-                      tip.label = apsTree2$tip.label,
+                      tip.label = apsTree2$tip.label,
-                      br = NULL)
+                      br = NULL)
-  myTreeDistances[i, ] <- suppressMessages(phangorn::treedist(fungiTree, xTree))
+  myTreeDistances[i, ] <- suppressMessages(phangorn::treedist(fungiTree, xTree))
-}
+}
-set.seed(NULL)                      # reset the random number generator
+set.seed(NULL)                      # reset the random number generator
-
+
-table(myTreeDistances[, "symm"])
+table(myTreeDistances[, "symm"])
-
+
-( symmObs <- phangorn::treedist(fungiTree, apsTree2)[1] )
+( symmObs <- phangorn::treedist(fungiTree, apsTree2)[1] )
-
+
-# Random events less-or-equal to observation, divided by total number of
+# Random events less-or-equal to observation, divided by total number of
-# events gives us the empirical p-value.
+# events gives us the empirical p-value.
-cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n",
+cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n",
-            (sum(myTreeDistances[ , "symm"] <= symmObs) + 1) / (N + 1)))
+            (sum(myTreeDistances[ , "symm"] <= symmObs) + 1) / (N + 1)))
-
+
-par(PAR)   # reset graphics state
+par(PAR)   # reset graphics state
-hist(myTreeDistances[, "path"],
+hist(myTreeDistances[, "path"],
-     col = "aliceblue",
+     col = "aliceblue",
-     main = "Distances of random Trees to fungiTree")
+     main = "Distances of random Trees to fungiTree")
-(pathObs <- phangorn::treedist(fungiTree, apsTree2)[2])
+(pathObs <- phangorn::treedist(fungiTree, apsTree2)[2])
-abline(v = pathObs, col = "chartreuse")
+abline(v = pathObs, col = "chartreuse")
-
+
-# Random events less-or-equal to observation, divided by total number of
+# Random events less-or-equal to observation, divided by total number of
-# events gives us the empirical p-value.
+# events gives us the empirical p-value.
+# The random path differences must be compared with pathObs (the observed
+# path difference of apsTree2), not with the symmetric difference symmObs.
-cat(sprintf("\nEmpirical p-value for path diff. of observed tree is %1.4f\n",
+cat(sprintf("\nEmpirical p-value for path diff. of observed tree is %1.4f\n",
-            (sum(myTreeDistances[ , "path"] <= symmObs) + 1) / (N + 1)))
+            (sum(myTreeDistances[ , "path"] <= pathObs) + 1) / (N + 1)))
-
+
-# Indeed, our apsTree is _very_ much more similar to the species tree than
+# Indeed, our apsTree is _very_ much more similar to the species tree than
-# we would expect by random chance.
+# we would expect by random chance.
-
+
-# What do we gain from that analysis? Analyzing the tree we get from a single
+# What do we gain from that analysis? Analyzing the tree we get from a single
-# gene of orthologous sequences is a positive control in our computational
+# gene of orthologous sequences is a positive control in our computational
-# experiment. If these genes are indeed orthologues, a correct tree-building
+# experiment. If these genes are indeed orthologues, a correct tree-building
-# program ought to give us a tree that exactly matches the species tree.
+# program ought to give us a tree that exactly matches the species tree.
-# Evaluating how far off we are from the known correct result gives us a way to
+# Evaluating how far off we are from the known correct result gives us a way to
-# validate our workflow and our algorithm. If we can't get that right, we can't
+# validate our workflow and our algorithm. If we can't get that right, we can't
-# expect to get "real" data right either. Employing such positive controls in
+# expect to get "real" data right either. Employing such positive controls in
-# every computational experiment is essential for research. Not doing so is
+# every computational experiment is essential for research. Not doing so is
-# Cargo Cult Bioinformatics.
+# Cargo Cult Bioinformatics.
-
+
-
+
-# [END]
+# [END]
--- a/BIN-PHYLO-Tree_building.R
+++ b/BIN-PHYLO-Tree_building.R
@ -1,168 +1,168 @@
-# tocID <- "BIN-PHYLO-Tree_building.R"
+# tocID <- "BIN-PHYLO-Tree_building.R"
-#
+#
-# Purpose:  A Bioinformatics Course:
+# Purpose:  A Bioinformatics Course:
-#              R code accompanying the BIN-PHYLO-Tree_building unit.
+#              R code accompanying the BIN-PHYLO-Tree_building unit.
-#
+#
-# Version:  1.2
+# Version:  1.2
-#
+#
-# Date:     2017-10   2020-09
+# Date:     2017-10   2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions:
+# Versions:
-#           1.2    deprecate save()/load() for saveRDS()/readRDS(); Mac:
+#           1.2    deprecate save()/load() for saveRDS()/readRDS(); Mac:
-#                  instructions to authorize proml.app
+#                  instructions to authorize proml.app
-#           1.1    Change from require() to requireNamespace(),
+#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout,
+#                      use <package>::<function>() idiom throughout,
-#           1.0    First 2017 version
+#           1.0    First 2017 version
-#           0.1    First code copied from 2016 material.
+#           0.1    First code copied from 2016 material.
-#
+#
-#
+#
-# TODO:
+# TODO:
-#           Add MrBayes
+#           Add MrBayes
-# https://cran.r-project.org/web/packages/phangorn/vignettes/IntertwiningTreesAndNetworks.html
+# https://cran.r-project.org/web/packages/phangorn/vignettes/IntertwiningTreesAndNetworks.html
-#
+#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
+#
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
+# going on. That's not how it works ...
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC> 
+#TOC> 
-#TOC>   Section  Title                                       Line
+#TOC>   Section  Title                                       Line
-#TOC> -----------------------------------------------------------
+#TOC> -----------------------------------------------------------
-#TOC>   1        Calculating Trees                             48
+#TOC>   1        Calculating Trees                             48
-#TOC>   1.1        PROMLPATH ...                               68
+#TOC>   1.1        PROMLPATH ...                               68
-#TOC>   1.1.1          ... on the Mac                          73
+#TOC>   1.1.1          ... on the Mac                          73
-#TOC>   1.1.2          ... on Windows                         101
+#TOC>   1.1.2          ... on Windows                         101
-#TOC>   1.1.3          ... on Linux                           115
+#TOC>   1.1.3          ... on Linux                           115
-#TOC>   1.1.4          Confirming PROMLPATH                   120
+#TOC>   1.1.4          Confirming PROMLPATH                   120
-#TOC>   1.2        Building a maximum likelihood tree         134
+#TOC>   1.2        Building a maximum likelihood tree         134
-#TOC> 
+#TOC> 
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  Calculating Trees  ===================================================
+# =    1  Calculating Trees  ===================================================
-
+
-
+
-# Follow the instructions found at phylip's home on the Web to install. If you
+# Follow the instructions found at phylip's home on the Web to install. If you
-# are on a Windows computer, take note of the installation directory.
+# are on a Windows computer, take note of the installation directory.
-
+
-# After you have installed Phylip on your computer, install the R package that
+# After you have installed Phylip on your computer, install the R package that
-# provides an interface to the Phylip functions.
+# provides an interface to the Phylip functions.
-
+
-if (! requireNamespace("Rphylip", quietly = TRUE)) {
+if (! requireNamespace("Rphylip", quietly = TRUE)) {
-  install.packages("Rphylip")
+  install.packages("Rphylip")
-}
+}
-# Package information:
+# Package information:
-#  library(help = Rphylip)       # basic information
+#  library(help = Rphylip)       # basic information
-#  browseVignettes("Rphylip")    # available vignettes
+#  browseVignettes("Rphylip")    # available vignettes
-#  data(package = "Rphylip")     # available datasets
+#  data(package = "Rphylip")     # available datasets
-
+
-# This will install RPhylip, as well as its dependency, the package "ape".
+# This will install RPhylip, as well as its dependency, the package "ape".
-
+
-
+
-# ==   1.1  PROMLPATH ...  =====================================================
+# ==   1.1  PROMLPATH ...  =====================================================
-# The next part may be tricky. You will need to figure out where
+# The next part may be tricky. You will need to figure out where
-# on your computer Phylip has been installed and define the path
+# on your computer Phylip has been installed and define the path
-# to the proml program that calculates a maximum-likelihood tree.
+# to the proml program that calculates a maximum-likelihood tree.
-
+
-# ===   1.1.1  ... on the Mac                    
+# ===   1.1.1  ... on the Mac                    
-# On the Mac, the standard installation places a phylip folder
+# On the Mac, the standard installation places a phylip folder
-# in the /Applications directory. That folder contains all the
+# in the /Applications directory. That folder contains all the
-# individual phylip programs as <name>.app files. These are not
+# individual phylip programs as <name>.app files. These are not
-# the actual executables, but "app" files are actually directories
+# the actual executables, but "app" files are actually directories
-# that contain the required resources for a program to run.
+# that contain the required resources for a program to run.
-
+
-# The executable is in a subdirectory and you can point Rphylip
+# The executable is in a subdirectory and you can point Rphylip
-# directly to that subdirectory to find the program it needs:
+# directly to that subdirectory to find the program it needs:
-# PROMLPATH <- "/Applications/phylip-3.695/exe/proml.app/Contents/MacOS"
+# PROMLPATH <- "/Applications/phylip-3.695/exe/proml.app/Contents/MacOS"
-
+
-# However, RPHYLIP will not be able to run PHYLIP applications immediately,
+# However, RPHYLIP will not be able to run PHYLIP applications immediately,
-# because they have not been "signed" by the PHYLIP developers. The process
+# because they have not been "signed" by the PHYLIP developers. The process
-# will terminate by your system, with a warning.
+# will be terminated by your system, with a warning.
-
+
-#   -  Navigate to the phylip folder in your ~/Applications directory
+#   -  Navigate to the phylip folder in your /Applications directory
-#   -  Descend into the "exe" folder and find  proml.app
+#   -  Descend into the "exe" folder and find  proml.app
-#   -  Ctrl-click  proml.app  and choose "Open". A dialogue will show that
+#   -  Ctrl-click  proml.app  and choose "Open". A dialogue will show that
-#      says: "macOS cannot verify the developer of “proml.app”.
+#      says: "macOS cannot verify the developer of “proml.app”.
-#             Are you sure you want to open it?"
+#             Are you sure you want to open it?"
-#   -  Click open to continue. You may need to allow access to the terminal
+#   -  Click open to continue. You may need to allow access to the terminal
-#      as well. When the proml terminal session open, you can type
+#      as well. When the proml terminal session opens, you can type
-#      Ctrl-c to abort the program and close the window.
+#      Ctrl-c to abort the program and close the window.
-#
+#
-#   This adds proml.app to the list of known-good programs and you will not
+#   This adds proml.app to the list of known-good programs and you will not
-#   need to repeat this process.
+#   need to repeat this process.
-#
+#
-
+
-# ===   1.1.2  ... on Windows                    
+# ===   1.1.2  ... on Windows                    
-# On Windows you need to know where the programs have been installed, and you
+# On Windows you need to know where the programs have been installed, and you
-# need to specify a path that is correct for the Windows OS. Find the folder
+# need to specify a path that is correct for the Windows OS. Find the folder
-# that is named "exe", and right-click to inspect its properties. The path
+# that is named "exe", and right-click to inspect its properties. The path
-# should be listed among them.
+# should be listed among them.
-
+
-# If the path looks like "C:\Users\Meng\Programs\phylip-3.695\exe", then your
+# If the path looks like "C:\Users\Meng\Programs\phylip-3.695\exe", then your
-# assignment has to be
+# assignment has to be
-# PROMLPATH <- "C:/Users/Meng/Programs/phylip-3.695/exe"
+# PROMLPATH <- "C:/Users/Meng/Programs/phylip-3.695/exe"
-# (Note: "/", not "\")
+# (Note: "/", not "\")
-
+
-# I have heard that your path must not contain spaces, and it is prudent to
+# I have heard that your path must not contain spaces, and it is prudent to
-# avoid other special characters as well.
+# avoid other special characters as well.
-
+
-# ===   1.1.3  ... on Linux                      
+# ===   1.1.3  ... on Linux                      
-# If you are running Linux I trust you know what to do. It's probably
+# If you are running Linux I trust you know what to do. It's probably
-# something like
+# something like
-# PROMLPATH <- "/usr/local/phylip-3.695/bin"
+# PROMLPATH <- "/usr/local/phylip-3.695/bin"
-
+
-# ===   1.1.4  Confirming PROMLPATH              
+# ===   1.1.4  Confirming PROMLPATH              
-# Confirm that the settings are right.
+# Confirm that the settings are right.
-PROMLPATH                # returns the path
+PROMLPATH                # returns the path
-list.dirs(PROMLPATH)     # returns the directories in that path
+list.dirs(PROMLPATH)     # returns the directories in that path
-list.files(PROMLPATH)    # lists the files [1] "proml"   "proml.command"
+list.files(PROMLPATH)    # lists the files [1] "proml"   "proml.command"
-
+
-# If "proml" is NOT among the files that the last command returns, you
+# If "proml" is NOT among the files that the last command returns, you
-# can't continue. Ask on the mailing list for advice.
+# can't continue. Ask on the mailing list for advice.
-
+
-# If everything is good, you can add the line that defines PROMLPATH to
+# If everything is good, you can add the line that defines PROMLPATH to
-# myScripts/.myProfile.R - the path will then be automatically set when
+# myScripts/.myProfile.R - the path will then be automatically set when
-# you quit RStudio and return.
+# you quit RStudio and return.
-
+
-
+
-# ==   1.2  Building a maximum likelihood tree  ================================
+# ==   1.2  Building a maximum likelihood tree  ================================
-# Now read the mfa file you have saved in the BIB-PHYLO-Data_preparation unit,
+# Now read the mfa file you have saved in the BIB-PHYLO-Data_preparation unit,
-# as a "proseq" object with the read.protein() function of the RPhylip package:
+# as a "proseq" object with the read.protein() function of the Rphylip package:
-
+
-apsIn <- Rphylip::read.protein("data/APSESphyloSet.mfa")
+apsIn <- Rphylip::read.protein("data/APSESphyloSet.mfa")
-str(apsIn)
+str(apsIn)
-
+
-# ... and you are ready to build a tree.
+# ... and you are ready to build a tree.
-
+
-# There are many fast options in PHYLIP - we will use the most _accurate_ one
+# There are many fast options in PHYLIP - we will use the most _accurate_ one
-# that it has: proml, a maximum-likelihood tree building program for protein
+# that it has: proml, a maximum-likelihood tree building program for protein
-# data.
+# data.
-
+
-# Building maximum-likelihood trees can eat as much computer time
+# Building maximum-likelihood trees can eat as much computer time
-# as you can throw at it. Calculating a tree of 48 APSES domains
+# as you can throw at it. Calculating a tree of 48 APSES domains
-# with default parameters of Rproml() runs for more than half a day
+# with default parameters of Rproml() runs for more than half a day
-# on my computer. But we have only twelve sequences here, so the
+# on my computer. But we have only twelve sequences here, so the
-# process will take us about 5 to 15 minutes. Run this, and anjoy a good cup
+# process will take us about 5 to 15 minutes. Run this, and enjoy a good cup
-# of coffee while you are waiting.
+# of coffee while you are waiting.
-
+
-apsTree <- Rphylip::Rproml(apsIn, path=PROMLPATH)
+apsTree <- Rphylip::Rproml(apsIn, path=PROMLPATH)
-
+
-# A quick first look:
+# A quick first look:
-
+
-plot(apsTree)
+plot(apsTree)
-
+
-# save your tree:
+# save your tree:
-saveRDS(apsTree, file = "data/APSEStreeRproml.rds")
+saveRDS(apsTree, file = "data/APSEStreeRproml.rds")
-
+
-# If this did not work, ask for advice.
+# If this did not work, ask for advice.
-
+
-
+
-
+
-
+
-# [END]
+# [END]
--- a/BIN-PPI-Analysis.R
+++ b/BIN-PPI-Analysis.R
@ -1,323 +1,323 @@
-# tocID <- "BIN-PPI-Analysis.R"
+# tocID <- "BIN-PPI-Analysis.R"
-#
+#
-#
+#
-# Purpose:  A Bioinformatics Course:
+# Purpose:  A Bioinformatics Course:
-#              R code accompanying the BIN-PPI-Analysis unit.
+#              R code accompanying the BIN-PPI-Analysis unit.
-#
+#
-# Version:   1.4
+# Version:   1.4
-#
+#
-# Date:     2017-08  -  2020-10
+# Date:     2017-08  -  2020-10
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions:
+# Versions:
-#           1.4    Update vector ID's for betweenness centrality.
+#           1.4    Update vector ID's for betweenness centrality.
-#           1.3    Bugfix: called the wrong function on ENSPsel in l. 220
+#           1.3    Bugfix: called the wrong function on ENSPsel in l. 220
-#           1.2    2020 Updates; Rewrite for new STRINg V11;
+#           1.2    2020 Updates; Rewrite for new STRING V11;
-#                  Deprecate save()/load() for saveRDS()/readRDS()
+#                  Deprecate save()/load() for saveRDS()/readRDS()
-#           1.1    Change from require() to requireNamespace(),
+#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout,
+#                      use <package>::<function>() idiom throughout,
-#                      use Biocmanager:: not biocLite()
+#                      use BiocManager:: not biocLite()
-#           1.0    First live version
+#           1.0    First live version
-#           0.1    First code copied from 2016 material.
+#           0.1    First code copied from 2016 material.
-#
+#
-# TODO:
+# TODO:
-#
+#
-#
+#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
+#
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
+# going on. That's not how it works ...
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC>
+#TOC>
-#TOC>   Section  Title                                           Line
+#TOC>   Section  Title                                           Line
-#TOC> ---------------------------------------------------------------
+#TOC> ---------------------------------------------------------------
-#TOC>   1        Setup and data                                    50
+#TOC>   1        Setup and data                                    50
-#TOC>   2        Functional Edges in the Human Proteome            86
+#TOC>   2        Functional Edges in the Human Proteome            86
-#TOC>   2.1        Cliques                                        129
+#TOC>   2.1        Cliques                                        129
-#TOC>   2.2        Communities                                    170
+#TOC>   2.2        Communities                                    170
-#TOC>   2.3        Betweenness Centrality                         184
+#TOC>   2.3        Betweenness Centrality                         184
-#TOC>   3        biomaRt                                          231
+#TOC>   3        biomaRt                                          231
-#TOC>   4        Task for submission                              302
+#TOC>   4        Task for submission                              302
-#TOC>
+#TOC>
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  Setup and data  ======================================================
+# =    1  Setup and data  ======================================================
-
+
-
+
-# Not surprisingly, the analysis of PPI networks needs iGraph:
+# Not surprisingly, the analysis of PPI networks needs iGraph:
-
+
-if (! requireNamespace("igraph", quietly = TRUE)) {
+if (! requireNamespace("igraph", quietly = TRUE)) {
-  install.packages("igraph")
+  install.packages("igraph")
-}
+}
-# Package information:
+# Package information:
-#  library(help = igraph)       # basic information
+#  library(help = igraph)       # basic information
-#  browseVignettes("igraph")    # available vignettes
+#  browseVignettes("igraph")    # available vignettes
-#  data(package = "igraph")     # available datasets
+#  data(package = "igraph")     # available datasets
-
+
-# In order for you to explore some real, biological networks, I give you a
+# In order for you to explore some real, biological networks, I give you a
-# dataframe of functional relationships of human proteins that I have downloaded
+# dataframe of functional relationships of human proteins that I have downloaded
-# from the STRING database. The full table has 8.5 million records, here is a
+# from the STRING database. The full table has 8.5 million records, here is a
-# subset of records with combined confidence scores > 980
+# subset of records with combined confidence scores > 980
-
+
-# The selected set of edges with a confidence of > 964 is a dataframe with about
+# The selected set of edges with a confidence of > 964 is a dataframe with about
-# 50,000 edges and 8,400 unique proteins. Incidentaly, that's about the size of
+# 50,000 edges and 8,400 unique proteins. Incidentally, that's about the size of
-# a fungal proteome. You can load the saved dataframe here (To read more about
+# a fungal proteome. You can load the saved dataframe here (To read more about
-# what the scores mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).
+# what the scores mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).
-
+
-STRINGedges <- readRDS("./data/STRINGedges.rds")
+STRINGedges <- readRDS("./data/STRINGedges.rds")
-
+
-head(STRINGedges)
+head(STRINGedges)
-
+
-# Note that STRING has appended the tax-ID for Homo sapiens - 9606 - to the
+# Note that STRING has appended the tax-ID for Homo sapiens - 9606 - to the
-# Ensemble transcript identifiers that start with ENSP. We'll remove them:
+# Ensembl protein identifiers that start with ENSP. We'll remove them:
-
+
-STRINGedges$a <- gsub("^9606\\.", "", STRINGedges$a)
+STRINGedges$a <- gsub("^9606\\.", "", STRINGedges$a)
-STRINGedges$b <- gsub("^9606\\.", "", STRINGedges$b)
+STRINGedges$b <- gsub("^9606\\.", "", STRINGedges$b)
-
+
-head(STRINGedges)
+head(STRINGedges)
-
+
-
+
-# =    2  Functional Edges in the Human Proteome  ==============================
+# =    2  Functional Edges in the Human Proteome  ==============================
-
+
-
+
-# There are many possibilities to explore interesting aspects of biological
+# There are many possibilities to explore interesting aspects of biological
-# networks, we will keep with some very simple procedures here but you have
+# networks, we will keep with some very simple procedures here but you have
-# to be aware that this is barely scratching the surface of possibilities.
+# to be aware that this is barely scratching the surface of possibilities.
-# However, once the network exists in your computer, it is comparatively
+# However, once the network exists in your computer, it is comparatively
-# easy to find information online about the many, many options to analyze.
+# easy to find information online about the many, many options to analyze.
-
+
-
+
-# Make a graph from this dataframe
+# Make a graph from this dataframe
-?igraph::graph_from_data_frame
+?igraph::graph_from_data_frame
-
+
-gSTR <- igraph::graph_from_data_frame(STRINGedges, directed = FALSE)
+gSTR <- igraph::graph_from_data_frame(STRINGedges, directed = FALSE)
-
+
-# CAUTION you DON'T want to plot a graph with 8,000 nodes and 50,000 edges -
+# CAUTION you DON'T want to plot a graph with 8,000 nodes and 50,000 edges -
-# layout of such large graphs is possible, but requires specialized code. Google
+# layout of such large graphs is possible, but requires specialized code. Google
-# for <layout large graphs> if you are curious. Also, consider what one can
+# for <layout large graphs> if you are curious. Also, consider what one can
-# really learn from plotting such a graph ...
+# really learn from plotting such a graph ...
-
+
-# Of course simple computations on this graph are reasonably fast:
+# Of course simple computations on this graph are reasonably fast:
-
+
-compSTR <- igraph::components(gSTR)
+compSTR <- igraph::components(gSTR)
-summary(compSTR) # our graph is fully connected!
+summary(compSTR) # our graph is fully connected!
-
+
-hist(log(igraph::degree(gSTR)), col="#FEE0AF")
+hist(log(igraph::degree(gSTR)), col="#FEE0AF")
-# this actually does look rather scale-free
+# this actually does look rather scale-free
-
+
-(freqRank <- table(igraph::degree(gSTR)))
+(freqRank <- table(igraph::degree(gSTR)))
-plot(log10(as.numeric(names(freqRank)) + 1),
+plot(log10(as.numeric(names(freqRank)) + 1),
-     log10(as.numeric(freqRank)), type = "b",
+     log10(as.numeric(freqRank)), type = "b",
-     pch = 21, bg = "#FEE0AF",
+     pch = 21, bg = "#FEE0AF",
-     xlab = "log(Rank)", ylab = "log(frequency)",
+     xlab = "log(Rank)", ylab = "log(frequency)",
-     main = "8,400 nodes from the human functional interaction network")
+     main = "8,400 nodes from the human functional interaction network")
-
+
-# This looks very scale-free indeed.
+# This looks very scale-free indeed.
-
+
-(regressionLine <- lm(log10(as.numeric(freqRank)) ~
+(regressionLine <- lm(log10(as.numeric(freqRank)) ~
-                      log10(as.numeric(names(freqRank)) + 1)))
+                      log10(as.numeric(names(freqRank)) + 1)))
-abline(regressionLine, col = "firebrick")
+abline(regressionLine, col = "firebrick")
-
+
-# Now explore some more:
+# Now explore some more:
-
+
-# ==   2.1  Cliques  ===========================================================
+# ==   2.1  Cliques  ===========================================================
-
+
-# Let's find the largest cliques. Remember: a clique is a fully connected
+# Let's find the largest cliques. Remember: a clique is a fully connected
-# subgraph, i.e. a subgraph in which every node is connected to every other.
+# subgraph, i.e. a subgraph in which every node is connected to every other.
-# Biological complexes often appear as cliques in interaction graphs.
+# Biological complexes often appear as cliques in interaction graphs.
-
+
-igraph::clique_num(gSTR)
+igraph::clique_num(gSTR)
-# The largest clique has 81 members.
+# The largest clique has 81 members.
-
+
-(C <- igraph::largest_cliques(gSTR)[[1]])
+(C <- igraph::largest_cliques(gSTR)[[1]])
-
+
-# Pick one of the proteins and find out what this fully connected cluster of 81
+# Pick one of the proteins and find out what this fully connected cluster of 81
-# proteins is (you can simply Google for any of the IDs). Is this expected?
+# proteins is (you can simply Google for any of the IDs). Is this expected?
-
+
-# Plot this ...
+# Plot this ...
-R <- igraph::induced_subgraph(gSTR, C) # a graph from a selected set of vertices
+R <- igraph::induced_subgraph(gSTR, C) # a graph from a selected set of vertices
-
+
-# color the vertices along a color spectrum
+# color the vertices along a color spectrum
-vCol <- rainbow(igraph::gorder(R)) # "order" of a graph == number of nodes
+vCol <- rainbow(igraph::gorder(R)) # "order" of a graph == number of nodes
-
+
-# color the edges to have the same color as the originating node
+# color the edges to have the same color as the originating node
-eCol <- character()
+eCol <- character()
-for (i in seq_along(vCol)) {
+for (i in seq_along(vCol)) {
-  eCol <- c(eCol, rep(vCol[i], igraph::gorder(R)))
+  eCol <- c(eCol, rep(vCol[i], igraph::gorder(R)))
-}
+}
-
+
-oPar <- par(mar= rep(0,4)) # Turn margins off
+oPar <- par(mar= rep(0,4)) # Turn margins off
-plot(R,
+plot(R,
-     layout = igraph::layout_in_circle(R),
+     layout = igraph::layout_in_circle(R),
-     vertex.size = 3,
+     vertex.size = 3,
-     vertex.color = vCol,
+     vertex.color = vCol,
-     edge.color = eCol,
+     edge.color = eCol,
-     edge.width = 0.1,
+     edge.width = 0.1,
-     vertex.label = NA)
+     vertex.label = NA)
-par(oPar)
+par(oPar)
-
+
-# ... well: remember: a clique means every node is connected to every other
+# ... well: remember: a clique means every node is connected to every other
-# node. We have 81 * 81 = 6,561 edges. This is what a matrix model of PPI
+# node. We have 81 * 80 / 2 = 3,240 edges. This is what a matrix model of PPI
-# networks looks like for large complexes.
+# networks looks like for large complexes.
-
+
-
+
-# ==   2.2  Communities  =======================================================
+# ==   2.2  Communities  =======================================================
-
+
-set.seed(112358)                       # set RNG seed for repeatable randomness
+set.seed(112358)                       # set RNG seed for repeatable randomness
-gSTRclusters <- igraph::cluster_infomap(gSTR)
+gSTRclusters <- igraph::cluster_infomap(gSTR)
-set.seed(NULL)                         # reset the RNG
+set.seed(NULL)                         # reset the RNG
-
+
-igraph::modularity(gSTRclusters) # ... measures how separated the different
+igraph::modularity(gSTRclusters) # ... measures how separated the different
-                                 # membership types are from each other
+                                 # membership types are from each other
-tMem <- table(igraph::membership(gSTRclusters))
+tMem <- table(igraph::membership(gSTRclusters))
-length(tMem)  # About 700 communities identified
+length(tMem)  # About 700 communities identified
-hist(tMem, breaks = 50, col = "skyblue")  # most clusters are small ...
+hist(tMem, breaks = 50, col = "skyblue")  # most clusters are small ...
-range(tMem) # ... but one has > 200 members
+range(tMem) # ... but one has > 200 members
-
+
-
+
-# ==   2.3  Betweenness Centrality  ============================================
+# ==   2.3  Betweenness Centrality  ============================================
-
+
-# Let's find the nodes with the 10 - highest betweenness centralities.
+# Let's find the nodes with the 10 - highest betweenness centralities.
-#
+#
-BC <- igraph::centr_betw(gSTR)
+BC <- igraph::centr_betw(gSTR)
-
+
-# remember: BC$res contains the results
+# remember: BC$res contains the results
-head(BC$res)
+head(BC$res)
-
+
-BC$res[1]   # betweenness centrality of node 1 in the graph ...
+BC$res[1]   # betweenness centrality of node 1 in the graph ...
-# ... which one is node 1?
+# ... which one is node 1?
-igraph::V(gSTR)[1]
+igraph::V(gSTR)[1]
-
+
-# to get the ten-highest nodes, we simply label the elements of BC with their
+# to get the ten-highest nodes, we simply label the elements of BC with their
-# index ...
+# index ...
-names(BC$res) <- as.character(1:length(BC$res))
+names(BC$res) <- as.character(1:length(BC$res))
-
+
-# ... and then we sort:
+# ... and then we sort:
-sBC <- sort(BC$res, decreasing = TRUE)
+sBC <- sort(BC$res, decreasing = TRUE)
-head(sBC)
+head(sBC)
-
+
-# This ordered vector means: node 3 has the highest betweenness centrality,
+# This ordered vector means: node 3 has the highest betweenness centrality,
-# node 721 has the second highest, etc.
+# node 721 has the second highest, etc.
-
+
-(BCsel <- as.numeric(names(sBC)[1:10]))
+(BCsel <- as.numeric(names(sBC)[1:10]))
-
+
-# We can use the first ten labels to subset the nodes in gSTR and fetch the
+# We can use the first ten labels to subset the nodes in gSTR and fetch the
-# IDs...
+# IDs...
-(ENSPsel <- names(igraph::V(gSTR)[BCsel]))
+(ENSPsel <- names(igraph::V(gSTR)[BCsel]))
-
+
-# Task:
+# Task:
-# =====
+# =====
-# IMPORTANT, IF YOU INTEND TO SUBMIT YOUR ANALYSIS FOR CREDIT
+# IMPORTANT, IF YOU INTEND TO SUBMIT YOUR ANALYSIS FOR CREDIT
-# We are going to use these IDs to produce some output for a submitted task:
+# We are going to use these IDs to produce some output for a submitted task:
-# therefore I need you to execute the following line, note the "seal" that this
+# therefore I need you to execute the following line, note the "seal" that this
-# returns, and not change myENSPsel later:
+# returns, and not change myENSPsel later:
-
+
-myENSPsel <- selectENSP(ENSPsel)
+myENSPsel <- selectENSP(ENSPsel)
-
+
-#  Next, to find what these proteins are...
+#  Next, to find what these proteins are...
-
+
-# We could now Google for all of these IDs to learn more about them. But really,
+# We could now Google for all of these IDs to learn more about them. But really,
-# googling for IDs one after the other, that would be lame. Let's instead use
+# googling for IDs one after the other, that would be lame. Let's instead use
-# the very, very useful biomaRt package to translate these Ensemble IDs into
+# the very, very useful biomaRt package to translate these Ensembl IDs into
-# gene symbols.
+# gene symbols.
-
+
-
+
-# =    3  biomaRt  =============================================================
+# =    3  biomaRt  =============================================================
-
+
-
+
-# IDs are just labels, but for _bio_informatics we need to learn more about the
+# IDs are just labels, but for _bio_informatics we need to learn more about the
-# biological function of the genes or proteins that we retrieve via graph data
+# biological function of the genes or proteins that we retrieve via graph data
-# mining. biomaRt is the tool of choice. It's a package distributed by the
+# mining. biomaRt is the tool of choice. It's a package distributed by the
-# bioconductor project. This here is not a biomaRt tutorial (that's for another
+# bioconductor project. This here is not a biomaRt tutorial (that's for another
-# day), simply a few lines of sample code to get you started on the specific use
+# day), simply a few lines of sample code to get you started on the specific use
-# case of retrieving descriptions for ensembl protein IDs.
+# case of retrieving descriptions for ensembl protein IDs.
-
+
-if (! requireNamespace("BiocManager", quietly = TRUE)) {
+if (! requireNamespace("BiocManager", quietly = TRUE)) {
-  install.packages("BiocManager")
+  install.packages("BiocManager")
-}
+}
-if (! requireNamespace("biomaRt", quietly = TRUE)) {
+if (! requireNamespace("biomaRt", quietly = TRUE)) {
-  BiocManager::install("biomaRt")
+  BiocManager::install("biomaRt")
-}
+}
-# Package information:
+# Package information:
-#  library(help = biomaRt)       # basic information
+#  library(help = biomaRt)       # basic information
-#  browseVignettes("biomaRt")    # available vignettes
+#  browseVignettes("biomaRt")    # available vignettes
-#  data(package = "biomaRt")     # available datasets
+#  data(package = "biomaRt")     # available datasets
-
+
-# define which dataset to use ... this takes a while for download
+# define which dataset to use ... this takes a while for download
-myMart <- biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl")
+myMart <- biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl")
-
+
-# what filters are defined?
+# what filters are defined?
-( filters <- biomaRt::listFilters(myMart) )
+( filters <- biomaRt::listFilters(myMart) )
-
+
-
+
-# and what attributes can we filter for?
+# and what attributes can we filter for?
-( attributes <- biomaRt::listAttributes(myMart) )
+( attributes <- biomaRt::listAttributes(myMart) )
-
+
-
+
-# Soooo many options - let's look for the correct name of filters that are
+# Soooo many options - let's look for the correct name of filters that are
-# useful for ENSP IDs ...
+# useful for ENSP IDs ...
-filters[grep("ENSP", filters$description), ]
+filters[grep("ENSP", filters$description), ]
-
+
-# ... and the correct attribute names for gene symbols and descriptions ...
+# ... and the correct attribute names for gene symbols and descriptions ...
-attributes[grep("symbol", attributes$description, ignore.case = TRUE), ]
+attributes[grep("symbol", attributes$description, ignore.case = TRUE), ]
-attributes[grep("description", attributes$description, ignore.case = TRUE), ]
+attributes[grep("description", attributes$description, ignore.case = TRUE), ]
-
+
-
+
-# ... so we can put this together: here is a syntax example:
+# ... so we can put this together: here is a syntax example:
-biomaRt::getBM(filters = "ensembl_peptide_id",
+biomaRt::getBM(filters = "ensembl_peptide_id",
-               attributes = c("hgnc_symbol",
+               attributes = c("hgnc_symbol",
-                              "wikigene_description",
+                              "wikigene_description",
-                              "interpro_description",
+                              "interpro_description",
-                              "phenotype_description"),
+                              "phenotype_description"),
-               values = "ENSP00000000442",
+               values = "ENSP00000000442",
-               mart = myMart)
+               mart = myMart)
-
+
-# A simple loop will now get us the information for our 10 most central genes
+# A simple loop will now get us the information for our 10 most central genes
-# from the human subset of STRING.
+# from the human subset of STRING.
-
+
-CPdefs <- list()  # Since we don't know how many matches one of our queries
+CPdefs <- list()  # Since we don't know how many matches one of our queries
-# will return, we'll put the result dataframes into a list.
+# will return, we'll put the result dataframes into a list.
-
+
-for (ID in myENSPsel) {
+for (ID in myENSPsel) {
-  CPdefs[[ID]] <- biomaRt::getBM(filters = "ensembl_peptide_id",
+  CPdefs[[ID]] <- biomaRt::getBM(filters = "ensembl_peptide_id",
-                                 attributes = c("hgnc_symbol",
+                                 attributes = c("hgnc_symbol",
-                                                "wikigene_description",
+                                                "wikigene_description",
-                                                "interpro_description",
+                                                "interpro_description",
-                                                "phenotype_description"),
+                                                "phenotype_description"),
-                                 values = ID,
+                                 values = ID,
-                                 mart = myMart)
+                                 mart = myMart)
-}
+}
-
+
-
+
-# So what are the proteins with the ten highest betweenness centralities?
+# So what are the proteins with the ten highest betweenness centralities?
-#  ... are you surprised? (I am! Really.)
+#  ... are you surprised? (I am! Really.)
-
+
-
+
-# =    4  Task for submission  =================================================
+# =    4  Task for submission  =================================================
-
+
-# Write a loop that will go through your personalized list of Ensemble IDs and
+# Write a loop that will go through your personalized list of Ensembl IDs and
-#    for each ID:
+#    for each ID:
-#    --  print the ID,
+#    --  print the ID,
-#    --  print the first row's HGNC symbol,
+#    --  print the first row's HGNC symbol,
-#    --  print the first row's wikigene description.
+#    --  print the first row's wikigene description.
-#    --  print the first row's phenotype.
+#    --  print the first row's phenotype.
-#
+#
-# Write your thoughts about this group of genes.
+# Write your thoughts about this group of genes.
-#
+#
-# (Hint, you can structure your loop in the same way as the loop that
+# (Hint, you can structure your loop in the same way as the loop that
-# created CPdefs. )
+# created CPdefs. )
-
+
-# Submit the "seal" for your ENSP vector, the ENSP vector itself, the R code
+# Submit the "seal" for your ENSP vector, the ENSP vector itself, the R code
-# for this loop and its output into your report if you are submitting
+# for this loop and its output into your report if you are submitting
-# anything for credit for this unit. Please read the requirements carefully.
+# anything for credit for this unit. Please read the requirements carefully.
-
+
-
+
-
+
-
+
-# [END]
+# [END]
--- a/BIN-SEQA-Composition.R
+++ b/BIN-SEQA-Composition.R
@ -1,252 +1,252 @@
-# tocID <- "BIN-SEQA-Composition.R"
+# tocID <- "BIN-SEQA-Composition.R"
-#
+#
-# Purpose: A Bioinformatics Course:
+# Purpose: A Bioinformatics Course:
-#              R code accompanying the BIN-SEQA-Comparison unit
+#              R code accompanying the BIN-SEQA-Composition unit
-#
+#
-# Version: 1.2
+# Version: 1.2
-#
+#
-# Date:    2017-11  -  2020-09
+# Date:    2017-11  -  2020-09
-# Author:  Boris Steipe (boris.steipe@utoronto.ca)
+# Author:  Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-#           1.2    2020 Maintenance
+#           1.2    2020 Maintenance
-#           1.1    Change from require() to requireNamespace(),
+#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout,
+#                      use <package>::<function>() idiom throughout,
-#                      use Biocmanager:: not biocLite()
+#                      use BiocManager:: not biocLite()
-# Versions:
+# Versions:
-#           1.0    First live version 2017
+#           1.0    First live version 2017
-#           0.1    First code copied from BCH441_A03_makeYFOlist.R
+#           0.1    First code copied from BCH441_A03_makeYFOlist.R
-#
+#
-# TODO:
+# TODO:
-#
+#
-#
+#
-# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
+# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
-#
+#
-# DO NOT SIMPLY  source()  THESE FILES!
+# DO NOT SIMPLY  source()  THESE FILES!
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-#  going on. That's not how it works ...
+#  going on. That's not how it works ...
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC> 
+#TOC> 
-#TOC>   Section  Title                                      Line
+#TOC>   Section  Title                                      Line
-#TOC> ----------------------------------------------------------
+#TOC> ----------------------------------------------------------
-#TOC>   1        Preparation                                  48
+#TOC>   1        Preparation                                  48
-#TOC>   2        Aggregate properties                         69
+#TOC>   2        Aggregate properties                         69
-#TOC>   3        Sequence Composition Enrichment             113
+#TOC>   3        Sequence Composition Enrichment             113
-#TOC>   3.1        Barplot, and side-by-side barplot         136
+#TOC>   3.1        Barplot, and side-by-side barplot         136
-#TOC>   3.2        Plotting ratios                           171
+#TOC>   3.2        Plotting ratios                           171
-#TOC>   3.3        Plotting log ratios                       188
+#TOC>   3.3        Plotting log ratios                       188
-#TOC>   3.4        Sort by frequency                         204
+#TOC>   3.4        Sort by frequency                         204
-#TOC>   3.5        Color by amino acid type                  221
+#TOC>   3.5        Color by amino acid type                  221
-#TOC> 
+#TOC> 
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  Preparation  =========================================================
+# =    1  Preparation  =========================================================
-
+
-if (! requireNamespace("seqinr", quietly = TRUE)) {
+if (! requireNamespace("seqinr", quietly = TRUE)) {
-  install.packages("seqinr")
+  install.packages("seqinr")
-}
+}
-# Package information:
+# Package information:
-#  library(help = seqinr)       # basic information
+#  library(help = seqinr)       # basic information
-#  browseVignettes("seqinr")    # available vignettes
+#  browseVignettes("seqinr")    # available vignettes
-#  data(package = "seqinr")     # available datasets
+#  data(package = "seqinr")     # available datasets
-
+
-# Load a reference sequence to work with:
+# Load a reference sequence to work with:
-
+
-# If you have done the BIN-Storing_data unit:
+# If you have done the BIN-Storing_data unit:
-   source("makeProteinDB.R")
+   source("makeProteinDB.R")
-   sel <- which(myDB$protein$name == sprintf("MBP1_%s", biCode(MYSPE)))
+   sel <- which(myDB$protein$name == sprintf("MBP1_%s", biCode(MYSPE)))
-   mySeq <- myDB$protein$sequence[sel]
+   mySeq <- myDB$protein$sequence[sel]
-
+
-# If not, use the yeast Mbp1 sequence:
+# If not, use the yeast Mbp1 sequence:
-   mySeq <- dbSanitizeSequence(fromJSON("./data/MBP1_SACCE.json")$sequence)
+   mySeq <- dbSanitizeSequence(fromJSON("./data/MBP1_SACCE.json")$sequence)
-
+
-
+
-# =    2  Aggregate properties  ================================================
+# =    2  Aggregate properties  ================================================
-
+
-
+
-# Let's try a simple function from seqinr: computing the pI of the sequence
+# Let's try a simple function from seqinr: computing the pI of the sequence
-?seqinr::computePI
+?seqinr::computePI
-
+
-# This takes as input a vector of upper-case AA codes
+# This takes as input a vector of upper-case AA codes
-
+
-# We can use the function strsplit() to split the string
+# We can use the function strsplit() to split the string
-# into single characters
+# into single characters
-
+
-(s <- strsplit(mySeq, "")) # splitting on the empty spring
+(s <- strsplit(mySeq, "")) # splitting on the empty string
-                           # splits into single characters
+                           # splits into single characters
-s <- unlist(s)             # strsplit() returns a list! Why?
+s <- unlist(s)             # strsplit() returns a list! Why?
-                           # (But we don't need a list now...)
+                           # (But we don't need a list now...)
-
+
-# Alternatively, seqinr provides
+# Alternatively, seqinr provides
-# the function s2c() to convert strings into
+# the function s2c() to convert strings into
-# character vectors (and c2s to convert them back).
+# character vectors (and c2s to convert them back).
-
+
-seqinr::s2c(mySeq)
+seqinr::s2c(mySeq)
-
+
-
+
-seqinr::computePI(seqinr::s2c(mySeq))  # isoelectric point
+seqinr::computePI(seqinr::s2c(mySeq))  # isoelectric point
-seqinr::pmw(seqinr::s2c(mySeq))        # molecular weight
+seqinr::pmw(seqinr::s2c(mySeq))        # molecular weight
-seqinr::AAstat(seqinr::s2c(mySeq))     # This also plots the distribution of
+seqinr::AAstat(seqinr::s2c(mySeq))     # This also plots the distribution of
-                                       # values along the sequence
+                                       # values along the sequence
-
+
-# A true Labor of Love has gone into the
+# A true Labor of Love has gone into the
-# compilation of the "aaindex" data:
+# compilation of the "aaindex" data:
-
+
-?seqinr::aaindex
+?seqinr::aaindex
-data(aaindex, package = "seqinr")  # "attach" the dataset - i.e. make it
+data(aaindex, package = "seqinr")  # "attach" the dataset - i.e. make it
-                                   # accessible as an R object
+                                   # accessible as an R object
-
+
-length(aaindex)  # no seqinr:: needed for the dataset since we just
+length(aaindex)  # no seqinr:: needed for the dataset since we just
-                 # "attached" it with data()
+                 # "attached" it with data()
-
+
-# Here are all the index descriptions
+# Here are all the index descriptions
-for (i in 1:length(aaindex)) {
+for (i in 1:length(aaindex)) {
-  cat(paste(i, ": ", aaindex[[i]]$D, "\n", sep=""))
+  cat(paste(i, ": ", aaindex[[i]]$D, "\n", sep=""))
-}
+}
-
+
-
+
-# =    3  Sequence Composition Enrichment  =====================================
+# =    3  Sequence Composition Enrichment  =====================================
-
+
-
+
-# Lets use one of the indices to calculate and plot amino-acid
+# Let's use one of the indices to calculate and plot amino-acid
-# composition enrichment:
+# composition enrichment:
-aaindex[[459]]$D
+aaindex[[459]]$D
-
+
-#
+#
-# Let's construct an enrichment plot to compare average frequencies
+# Let's construct an enrichment plot to compare average frequencies
-# with the amino acid counts in our sequence.
+# with the amino acid counts in our sequence.
-
+
-(refData <- aaindex[[459]]$I)                # reference frequencies in %
+(refData <- aaindex[[459]]$I)                # reference frequencies in %
-names(refData) <- seqinr::a(names(refData))  # change names to single-letter
+names(refData) <- seqinr::a(names(refData))  # change names to single-letter
-                                             # code using seqinr's "a()" function
+                                             # code using seqinr's "a()" function
-sum(refData)
+sum(refData)
-refData        # ... in %
+refData        # ... in %
-
+
-
+
-# tabulate the amino acid counts in mySeq
+# tabulate the amino acid counts in mySeq
-(obsData <- table(seqinr::s2c(mySeq)))        # counts
+(obsData <- table(seqinr::s2c(mySeq)))        # counts
-(obsData <- 100 * (obsData / sum(obsData)))   # frequencies
+(obsData <- 100 * (obsData / sum(obsData)))   # frequencies
-
+
-
+
-# ==   3.1  Barplot, and side-by-side barplot  =================================
+# ==   3.1  Barplot, and side-by-side barplot  =================================
-
+
-barplot(obsData, col = "#CCCCCC", cex.names = 0.7)
+barplot(obsData, col = "#CCCCCC", cex.names = 0.7)
-abline(h = 100/20, col="#BB0000")
+abline(h = 100/20, col="#BB0000")
-
+
-barplot(refData, col = "#BB0000", cex.names = 0.7)
+barplot(refData, col = "#BB0000", cex.names = 0.7)
-abline(h = 100/20, col="#555555")
+abline(h = 100/20, col="#555555")
-
+
-# Ok: first problem - the values in obsData are in alphabetical order. But the
+# Ok: first problem - the values in obsData are in alphabetical order. But the
-# values in refData are in alphabetical order of amino acid name: alanine,
+# values in refData are in alphabetical order of amino acid name: alanine,
-# arginine, asparagine, aspartic acid ... A, R, N, D, E ... you will see this
+# arginine, asparagine, aspartic acid ... A, R, N, D, C ... you will see this
-# order a lot - one of the old biochemistry tropes in the field. So we need to
+# order a lot - one of the old biochemistry tropes in the field. So we need to
-# re-order one of the vectors to match the other. That's easy though:
+# re-order one of the vectors to match the other. That's easy though:
-refData
+refData
-(refData <- refData[names(obsData)])
+(refData <- refData[names(obsData)])
-
+
-barplot(refData, col = "#BB0000", cex.names = 0.7)
+barplot(refData, col = "#BB0000", cex.names = 0.7)
-abline(h = 100/20, col="#555555")
+abline(h = 100/20, col="#555555")
-
+
-# To compare the values, we want to see them in a barplot, side-by-side ...
+# To compare the values, we want to see them in a barplot, side-by-side ...
-barplot(rbind(obsData, refData),
+barplot(rbind(obsData, refData),
-        ylim = c(0, 12),
+        ylim = c(0, 12),
-        beside = TRUE,
+        beside = TRUE,
-        col = c("#CCCCCC", "#BB0000"),
+        col = c("#CCCCCC", "#BB0000"),
-        cex.names = 0.7)
+        cex.names = 0.7)
-abline(h = 100/20, col="#00000044")
+abline(h = 100/20, col="#00000044")
-
+
-# ... and add a legend
+# ... and add a legend
-legend (x = 1, y = 12,
+legend (x = 1, y = 12,
-        legend = c("mySeq", "Average composition"),
+        legend = c("mySeq", "Average composition"),
-        fill = c("#CCCCCC", "#BB0000"),
+        fill = c("#CCCCCC", "#BB0000"),
-        cex = 0.7,
+        cex = 0.7,
-        bty = "n")
+        bty = "n")
-
+
-
+
-# ==   3.2  Plotting ratios  ===================================================
+# ==   3.2  Plotting ratios  ===================================================
-
+
-# To better compare the values, we'll calculate ratios between
+# To better compare the values, we'll calculate ratios between
-# obsData and refData
+# obsData and refData
-
+
-barplot(obsData / refData,
+barplot(obsData / refData,
-        col = "#CCCCCC",
+        col = "#CCCCCC",
-        ylab = "Sequence / Average",
+        ylab = "Sequence / Average",
-        ylim = c(0, 2.5),
+        ylim = c(0, 2.5),
-        cex.names = 0.7)
+        cex.names = 0.7)
-abline(h = 1, col="#BB0000")
+abline(h = 1, col="#BB0000")
-abline(h = c(1/2, 2), lty = 2, col="#BB000055")
+abline(h = c(1/2, 2), lty = 2, col="#BB000055")
-
+
-# ... but  ratios are not very good here, since the difference in height on the
+# ... but  ratios are not very good here, since the difference in height on the
-# plot now depends on the order we compare in: ratios of 1/2 and 2 (dotted
+# plot now depends on the order we compare in: ratios of 1/2 and 2 (dotted
-# lines) are exactly the same fold-difference !
+# lines) are exactly the same fold-difference !
-
+
-# ==   3.3  Plotting log ratios  ===============================================
+# ==   3.3  Plotting log ratios  ===============================================
-
+
-# A better way to display this
+# A better way to display this
-# is to plot log(ratios).
+# is to plot log(ratios).
-
+
-barplot(log(obsData / refData),
+barplot(log(obsData / refData),
-        col = "#CCCCCC",
+        col = "#CCCCCC",
-        ylab = "log(Sequence / Average)",
+        ylab = "log(Sequence / Average)",
-        ylim = log(c(1/3, 3)),
+        ylim = log(c(1/3, 3)),
-        cex.names = 0.7)
+        cex.names = 0.7)
-abline(h = log(1), col="#BB0000")
+abline(h = log(1), col="#BB0000")
-abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055")
+abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055")
-
+
-# Note how the two-fold difference lines are now the same distance from the
+# Note how the two-fold difference lines are now the same distance from the
-# line of equal ratio.
+# line of equal ratio.
-
+
-# ==   3.4  Sort by frequency  =================================================
+# ==   3.4  Sort by frequency  =================================================
-
+
-barplot(sort(log(obsData / refData), decreasing = TRUE),
+barplot(sort(log(obsData / refData), decreasing = TRUE),
-        ylim = log(c(1/3, 3)),
+        ylim = log(c(1/3, 3)),
-        col = "#CCCCCC",
+        col = "#CCCCCC",
-        ylab = "log(Sequence / Average)",
+        ylab = "log(Sequence / Average)",
-        cex.names = 0.7)
+        cex.names = 0.7)
-abline(h = log(1), col="#BB0000")
+abline(h = log(1), col="#BB0000")
-abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055")
+abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055")
-
+
-yTxt <- log(0.9)
+yTxt <- log(0.9)
-arrows(4, yTxt, 0, yTxt, length = 0.07)
+arrows(4, yTxt, 0, yTxt, length = 0.07)
-text(5.5, yTxt, "Enriched", cex = 0.7)
+text(5.5, yTxt, "Enriched", cex = 0.7)
-yTxt <- log(1.1)
+yTxt <- log(1.1)
-arrows(20, yTxt, 24, yTxt, length = 0.07)
+arrows(20, yTxt, 24, yTxt, length = 0.07)
-text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7)
+text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7)
-
+
-# ==   3.5  Color by amino acid type  ==========================================
+# ==   3.5  Color by amino acid type  ==========================================
-
+
-# Color the bars by amino acid type. Use AACOLS , defined in the .utilities.R
+# Color the bars by amino acid type. Use AACOLS , defined in the .utilities.R
-# script, or define your own.
+# script, or define your own.
-
+
-barplot(rep(1, 20), names.arg = names(AACOLS), col = AACOLS, cex.names = 0.5)
+barplot(rep(1, 20), names.arg = names(AACOLS), col = AACOLS, cex.names = 0.5)
-
+
-lR <- sort(log(obsData / refData), decreasing = TRUE)
+lR <- sort(log(obsData / refData), decreasing = TRUE)
-barplot(lR,
+barplot(lR,
-        ylim = log(c(1/3, 3)),
+        ylim = log(c(1/3, 3)),
-        col = AACOLS[names(lR)],
+        col = AACOLS[names(lR)],
-        ylab = "log(Sequence / Average)",
+        ylab = "log(Sequence / Average)",
-        cex.names = 0.7)
+        cex.names = 0.7)
-abline(h = log(1), col="#00000055")
+abline(h = log(1), col="#00000055")
-abline(h = log(c(1/2, 2)), lty = 2, col="#00000033")
+abline(h = log(c(1/2, 2)), lty = 2, col="#00000033")
-
+
-yTxt <- log(0.9)
+yTxt <- log(0.9)
-arrows(4, yTxt, 0, yTxt, length = 0.07)
+arrows(4, yTxt, 0, yTxt, length = 0.07)
-text(5.5, yTxt, "Enriched", cex = 0.7)
+text(5.5, yTxt, "Enriched", cex = 0.7)
-yTxt <- log(1.1)
+yTxt <- log(1.1)
-arrows(20, yTxt, 24, yTxt, length = 0.07)
+arrows(20, yTxt, 24, yTxt, length = 0.07)
-text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7)
+text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7)
-
+
-
+
-# Task:
+# Task:
-#   Interpret this plot. (Can you?) Which types of amino acids are enriched?
+#   Interpret this plot. (Can you?) Which types of amino acids are enriched?
-#   Depleted?
+#   Depleted?
-
+
-
+
-
+
-
+
-# [END]
+# [END]
--- a/BIN-Sequence.R
+++ b/BIN-Sequence.R
@ -1,394 +1,394 @@
-# tocID <- "BIN-Sequence.R"
+# tocID <- "BIN-Sequence.R"
-#
+#
-# Purpose:  A Bioinformatics Course:
+# Purpose:  A Bioinformatics Course:
-#              R code accompanying the BIN-Sequence unit.
+#              R code accompanying the BIN-Sequence unit.
-#
+#
-# Version:  1.5
+# Version:  1.5
-#
+#
-# Date:     2017-09  - 2020-09
+# Date:     2017-09  - 2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions:
+# Versions:
-#           1.5    2020 Updates
+#           1.5    2020 Updates
-#           1.4    Change from require() to requireNamespace(),
+#           1.4    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout,
+#                      use <package>::<function>() idiom throughout,
-#                      use Biocmanager:: not biocLite()
+#                      use BiocManager:: not biocLite()
-#           1.3    Update set.seed() usage
+#           1.3    Update set.seed() usage
-#           1.2    Removed irrelevant task. How did that even get in there? smh
+#           1.2    Removed irrelevant task. How did that even get in there? smh
-#           1.1    Add chartr()
+#           1.1    Add chartr()
-#           1.0    First live version 2017.
+#           1.0    First live version 2017.
-#
+#
-# TODO:
+# TODO:
-#
+#
-#
+#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
+#
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
+# going on. That's not how it works ...
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC>
+#TOC>
-#TOC>   Section  Title                                Line
+#TOC>   Section  Title                                Line
-#TOC> ----------------------------------------------------
+#TOC> ----------------------------------------------------
-#TOC>   1        Prepare                                63
+#TOC>   1        Prepare                                63
-#TOC>   2        Storing Sequence                       80
+#TOC>   2        Storing Sequence                       80
-#TOC>   3        String properties                     109
+#TOC>   3        String properties                     109
-#TOC>   4        Substrings                            116
+#TOC>   4        Substrings                            116
-#TOC>   5        Creating strings: sprintf()           137
+#TOC>   5        Creating strings: sprintf()           137
-#TOC>   6        Changing strings                      172
+#TOC>   6        Changing strings                      172
-#TOC>   6.1.1          Changing case                   174
+#TOC>   6.1.1          Changing case                   174
-#TOC>   6.1.2          Reverse                         179
+#TOC>   6.1.2          Reverse                         179
-#TOC>   6.1.3          Change characters               183
+#TOC>   6.1.3          Change characters               183
-#TOC>   6.1.4          Substitute characters           211
+#TOC>   6.1.4          Substitute characters           211
-#TOC>   6.2        stringi and stringr                 231
+#TOC>   6.2        stringi and stringr                 231
-#TOC>   6.3        dbSanitizeSequence()                241
+#TOC>   6.3        dbSanitizeSequence()                241
-#TOC>   7        Permuting and sampling                253
+#TOC>   7        Permuting and sampling                253
-#TOC>   7.1        Permutations                        260
+#TOC>   7.1        Permutations                        260
-#TOC>   7.2        Sampling                            306
+#TOC>   7.2        Sampling                            306
-#TOC>   7.2.1          Equiprobable characters         308
+#TOC>   7.2.1          Equiprobable characters         308
-#TOC>   7.2.2          Defined probability vector      350
+#TOC>   7.2.2          Defined probability vector      350
-#TOC>
+#TOC>
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  Prepare  =============================================================
+# =    1  Prepare  =============================================================
-
+
-# Much basic sequence handling is supported by the Bioconductor package
+# Much basic sequence handling is supported by the Bioconductor package
-# Biostrings.
+# Biostrings.
-
+
-if (! requireNamespace("BiocManager", quietly = TRUE)) {
+if (! requireNamespace("BiocManager", quietly = TRUE)) {
-  install.packages("BiocManager")
+  install.packages("BiocManager")
-}
+}
-if (! requireNamespace("Biostrings", quietly = TRUE)) {
+if (! requireNamespace("Biostrings", quietly = TRUE)) {
-  BiocManager::install("Biostrings")
+  BiocManager::install("Biostrings")
-}
+}
-# Package information:
+# Package information:
-#  library(help = Biostrings)       # basic information
+#  library(help = Biostrings)       # basic information
-#  browseVignettes("Biostrings")    # available vignettes
+#  browseVignettes("Biostrings")    # available vignettes
-#  data(package = "Biostrings")     # available datasets
+#  data(package = "Biostrings")     # available datasets
-
+
-
+
-# =    2  Storing Sequence  ====================================================
+# =    2  Storing Sequence  ====================================================
-
+
-
+
-# Sequences can be represented and stored as vectors of single characters ...
+# Sequences can be represented and stored as vectors of single characters ...
-(v <- c("D", "I", "V", "M", "T", "Q"))
+(v <- c("D", "I", "V", "M", "T", "Q"))
-
+
-# ... as strings ...
+# ... as strings ...
-(s <- "DIVMTQ")
+(s <- "DIVMTQ")
-
+
-# ... or as more complex objects with rich metadata e.g. as a Biostrings
+# ... or as more complex objects with rich metadata e.g. as a Biostrings
-# DNAstring, RNAstring, AAString, etc.
+# DNAString, RNAString, AAString, etc.
-(a <- Biostrings::AAString("DIVMTQ"))
+(a <- Biostrings::AAString("DIVMTQ"))
-
+
-# ... and all of these representations can be interconverted:
+# ... and all of these representations can be interconverted:
-
+
-# string to vector ...
+# string to vector ...
-unlist(strsplit(s, ""))
+unlist(strsplit(s, ""))
-
+
-# vector to string ...
+# vector to string ...
-paste(v, sep = "", collapse = "")
+paste(v, sep = "", collapse = "")
-
+
-# ... and AAstring to plain string.
+# ... and AAString to plain string.
-as.character(a)
+as.character(a)
-
+
-# Since operations with character vectors trivially follow all other vector
+# Since operations with character vectors trivially follow all other vector
-# conventions and syntax, and we will look at Biostrings methods in more
+# conventions and syntax, and we will look at Biostrings methods in more
-# detail in a later unit, we will focus on basic strings in the following.
+# detail in a later unit, we will focus on basic strings in the following.
-
+
-
+
-# =    3  String properties  ===================================================
+# =    3  String properties  ===================================================
-
+
-
+
-length(s) # why ???
+length(s) # why ???
-nchar(s)  # Aha!
+nchar(s)  # Aha!
-
+
-
+
-# =    4  Substrings  ==========================================================
+# =    4  Substrings  ==========================================================
-
+
-# Use the substr() function
+# Use the substr() function
-substr(s, 2, 4)
+substr(s, 2, 4)
-
+
-# or the similar substring()
+# or the similar substring()
-substring(s, 2, 4)
+substring(s, 2, 4)
-
+
-# Note: both functions are vectorized (i.e. they operate on vectors
+# Note: both functions are vectorized (i.e. they operate on vectors
-# of arguments, you don't need to loop over input)...
+# of arguments, you don't need to loop over input)...
-myBiCodes <- c("HOMSA", "MUSMU", "FUGRU", "XENLA")
+myBiCodes <- c("HOMSA", "MUSMU", "FUGRU", "XENLA")
-substr(   myBiCodes, 1, 3)
+substr(   myBiCodes, 1, 3)
-substring(myBiCodes, 1, 3)
+substring(myBiCodes, 1, 3)
-
+
-# ... however only substring() will also use vectors for start and stop
+# ... however only substring() will also use vectors for start and stop
-s <- "gatattgtgatgacccagtaa"       # a DNA sequence
+s <- "gatattgtgatgacccagtaa"       # a DNA sequence
-(vI <- seq(1, nchar(s), by = 3))   # an index vector
+(vI <- seq(1, nchar(s), by = 3))   # an index vector
-substr(   s, vI, vI+2)             # ... returns only the first nucleotide triplet
+substr(   s, vI, vI+2)             # ... returns only the first nucleotide triplet
-substring(s, vI, vI+2)             # ... returns all triplets
+substring(s, vI, vI+2)             # ... returns all triplets
-
+
-
+
-# =    5  Creating strings: sprintf()  =========================================
+# =    5  Creating strings: sprintf()  =========================================
-
+
-
+
-# Sprintf is a very smart, very powerful function and has cognates in all
+# Sprintf is a very smart, very powerful function and has cognates in all
-# other programming languages. It has a bit of a  learning curve, but this is
+# other programming languages. It has a bit of a  learning curve, but this is
-# totally worth it:
+# totally worth it:
-# the function takes a format string, and a list of other arguments. It returns
+# the function takes a format string, and a list of other arguments. It returns
-# a formatted string. Here are some examples - watch carefully for sprintf()
+# a formatted string. Here are some examples - watch carefully for sprintf()
-# calls elsewhere in the code.
+# calls elsewhere in the code.
-
+
-sprintf("Just a string.")
+sprintf("Just a string.")
-sprintf("A string and the number %d.", 5)
+sprintf("A string and the number %d.", 5)
-sprintf("More numbers: %d ate %d.", 7, 9) # Sorry
+sprintf("More numbers: %d ate %d.", 7, 9) # Sorry
-sprintf("Pi is ~ %1.2f ...", pi)
+sprintf("Pi is ~ %1.2f ...", pi)
-sprintf("or more accurately ~ %1.11f.", pi)
+sprintf("or more accurately ~ %1.11f.", pi)
-x <- "bottles of beer"
+x <- "bottles of beer"
-N <- 99
+N <- 99
-sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.",
+sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.",
-        N, x, N, x, "one down, and pass it around", N - 1, x)
+        N, x, N, x, "one down, and pass it around", N - 1, x)
-
+
-# Note that in the last example, the value of the string was displayed with
+# Note that in the last example, the value of the string was displayed with
-# R's usual print-formatting function and therefore the line-break "\n" did
+# R's usual print-formatting function and therefore the line-break "\n" did
-# not actually break the line. To have line breaks, tabs etc, you need to use
+# not actually break the line. To have line breaks, tabs etc, you need to use
-# cat() to display the string:
+# cat() to display the string:
-
+
-for (i in N:(N-4)) {
+for (i in N:(N-4)) {
-  cat(sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.\n\n",
+  cat(sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.\n\n",
-              i, x, i, x, "one down, and pass it around", i - 1, x))
+              i, x, i, x, "one down, and pass it around", i - 1, x))
-}
+}
-
+
-# sprintf() is vectorized: if one of its parameters is a vector, it
+# sprintf() is vectorized: if one of its parameters is a vector, it
-# will generate one output string for each of the vector's elements:
+# will generate one output string for each of the vector's elements:
-cat(sprintf("\n%s fish", c("one", "two", "red", "blue")))
+cat(sprintf("\n%s fish", c("one", "two", "red", "blue")))
-
+
-
+
-# =    6  Changing strings  ====================================================
+# =    6  Changing strings  ====================================================
-
+
-# ===   6.1.1  Changing case
+# ===   6.1.1  Changing case
-tolower(s)
+tolower(s)
-toupper(tolower(s))
+toupper(tolower(s))
-
+
-
+
-# ===   6.1.2  Reverse
+# ===   6.1.2  Reverse
-# (This used to work in Biostrings, apparently it doesn't work anymore. Why?)
+# (This used to work in Biostrings, apparently it doesn't work anymore. Why?)
-# Biostrings::str_rev(s)
+# Biostrings::str_rev(s)
-# The following works, of course, but awkward:
+# The following works, of course, but awkward:
-s
+s
-paste0(rev(unlist(strsplit(s, ""))), collapse = "")
+paste0(rev(unlist(strsplit(s, ""))), collapse = "")
-
+
-# reverse complement
+# reverse complement
-COMP <- c("t", "g", "c", "a")
+COMP <- c("t", "g", "c", "a")
-names(COMP) <- c("a", "c", "g", "t")     # mapping the complement via names
+names(COMP) <- c("a", "c", "g", "t")     # mapping the complement via names
-s
+s
-paste0(COMP[rev(unlist(strsplit(s, "")))], collapse = "")
+paste0(COMP[rev(unlist(strsplit(s, "")))], collapse = "")
-
+
-
+
-# ===   6.1.3  Change characters
+# ===   6.1.3  Change characters
-# chartr(old, new, x) maps all characters in x that appear in "old" to the
+# chartr(old, new, x) maps all characters in x that appear in "old" to the
-# correpsonding character in "new." Kind of like the COMP vector above ...
+# corresponding character in "new." Kind of like the COMP vector above ...
-
+
-chartr("aeio", "uuuu", "We hold these truths to be self-evident ...")
+chartr("aeio", "uuuu", "We hold these truths to be self-evident ...")
-
+
-# One could implement toupper() and tolower() with this - remember that R has
+# One could implement toupper() and tolower() with this - remember that R has
-# character vectors of uppercase and lowercase letters as language constants.
+# character vectors of uppercase and lowercase letters as language constants.
-chartr(paste0(letters, collapse = ""),
+chartr(paste0(letters, collapse = ""),
-       paste0(LETTERS, collapse = ""),
+       paste0(LETTERS, collapse = ""),
-       "Twinkle, twinkle little star, how I wonder what you are.")
+       "Twinkle, twinkle little star, how I wonder what you are.")
-
+
-# One amusing way to use the function  is for a reversible substitution
+# One amusing way to use the function is for a reversible substitution
-# cypher.
+# cypher.
-alBet <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .,;:?0123456789"
+alBet <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .,;:?0123456789"
-set.seed(112358)                       # set RNG seed for repeatable randomness
+set.seed(112358)                       # set RNG seed for repeatable randomness
-( myCypher <- paste0(sample(unlist(strsplit(alBet, ""))), collapse = "") )
+( myCypher <- paste0(sample(unlist(strsplit(alBet, ""))), collapse = "") )
-set.seed(NULL)                         # reset the RNG
+set.seed(NULL)                         # reset the RNG
-
+
-# encode ...
+# encode ...
-(x <- chartr(alBet, myCypher, "... seven for a secret, never to be told."))
+(x <- chartr(alBet, myCypher, "... seven for a secret, never to be told."))
-
+
-# decode ...
+# decode ...
-chartr(myCypher, alBet, x)
+chartr(myCypher, alBet, x)
-# (Nb. substitution cyphers are easy to crack!)
+# (Nb. substitution cyphers are easy to crack!)
-
+
-
+
-# ===   6.1.4  Substitute characters
+# ===   6.1.4  Substitute characters
-# gsub can change lengths.
+# gsub can change lengths.
-#   Example: implementing the binary Fibonacci sequence:
+#   Example: implementing the binary Fibonacci sequence:
-#   0 -> 1; 1 -> 10 , in three nested gsub() statements
+#   0 -> 1; 1 -> 10 , in three nested gsub() statements
-( s <- 1 )
+( s <- 1 )
-( s <- gsub("2", "10", gsub("0", "1", gsub("1", "2", s))) )
+( s <- gsub("2", "10", gsub("0", "1", gsub("1", "2", s))) )
-
+
-# Iterate this line a few times ...
+# Iterate this line a few times ...
-#
+#
-# cf. http://www.maths.surrey.ac.uk/hosted-sites/R.Knott/Fibonacci/fibrab.html
+# cf. http://www.maths.surrey.ac.uk/hosted-sites/R.Knott/Fibonacci/fibrab.html
-# for the features of the sequence.
+# for the features of the sequence.
-
+
-# I use gsub() often to delete unwanted characters ...
+# I use gsub() often to delete unwanted characters ...
-# ... select something, and substitute the empty string for it.
+# ... select something, and substitute the empty string for it.
-(s <- gsub("-", "", s))
+(s <- gsub("-", "", s))
-
+
-# For example: clean up a sequence
+# For example: clean up a sequence
-# copy/paste from UniProt
+# copy/paste from UniProt
-(s <- "        10         20         30         40         50
+(s <- "        10         20         30         40         50
-MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")
+MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")
-
+
-
+
-# remove numbers
+# remove numbers
-(s <- gsub("[0-9]", "", s))
+(s <- gsub("[0-9]", "", s))
-
+
-# remove "whitespace" (spaces, tabs, line breaks)...
+# remove "whitespace" (spaces, tabs, line breaks)...
-(s <- gsub("\\s", "", s))
+(s <- gsub("\\s", "", s))
-
+
-# ==   6.2  stringi and stringr  ===============================================
+# ==   6.2  stringi and stringr  ===============================================
-
+
-# But there are also specialized functions eg. to remove leading/trailing
+# But there are also specialized functions eg. to remove leading/trailing
-# whitespace which may be important to sanitize user input etc. Have a look at
+# whitespace which may be important to sanitize user input etc. Have a look at
-# the function descriptions for the stringr and the stringi package. stringr is
+# the function descriptions for the stringr and the stringi package. stringr is
-# part of the tidyverse, and for the most part a wrapper for stringi functions.
+# part of the tidyverse, and for the most part a wrapper for stringi functions.
-# https://github.com/tidyverse/stringr
+# https://github.com/tidyverse/stringr
-
+
-
+
-
+
-# ==   6.3  dbSanitizeSequence()  ==============================================
+# ==   6.3  dbSanitizeSequence()  ==============================================
-
+
-# In our learning units, we use a function dbSanitizeSequence() to clean up
+# In our learning units, we use a function dbSanitizeSequence() to clean up
-# sequences that may be copy/pasted from Web-sources
+# sequences that may be copy/pasted from Web-sources
-
+
-cat( s <- ">FASTA header will be removed
+cat( s <- ">FASTA header will be removed
-10         20         30         40         50
+10         20         30         40         50
-MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR " )
+MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR " )
-
+
-dbSanitizeSequence(s)
+dbSanitizeSequence(s)
-
+
-
+
-# =    7  Permuting and sampling  ==============================================
+# =    7  Permuting and sampling  ==============================================
-
+
-
+
-# An important aspect of working with strings is generating random strings
+# An important aspect of working with strings is generating random strings
-# with given statistical properties: reference items to evaluate significance.
+# with given statistical properties: reference items to evaluate significance.
-
+
-
+
-# ==   7.1  Permutations  ======================================================
+# ==   7.1  Permutations  ======================================================
-
+
-
+
-# One way to produce such reference items is to permute a string. A permuted
+# One way to produce such reference items is to permute a string. A permuted
-# string has the same composition as the original, but all positional
+# string has the same composition as the original, but all positional
-# information is lost. The sample() function can be used to permute:
+# information is lost. The sample() function can be used to permute:
-
+
-# This is the sequence of the ompA secretion signal
+# This is the sequence of the ompA secretion signal
-(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
+(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
-
+
-(x <- sample(s, length(s)))  # permuted
+(x <- sample(s, length(s)))  # permuted
-
+
-# Here's a small example how such permuted strings may be useful. As you look
+# Here's a small example how such permuted strings may be useful. As you look
-# at the ompA sequence, you suspect that the two lysines near the +-charged
+# at the ompA sequence, you suspect that the two lysines near the +-charged
-# N-terminus may not be accidental, but selected for a positively charged
+# N-terminus may not be accidental, but selected for a positively charged
-# N-terminus. What is the chance that such a sequence has two lysines close to
+# N-terminus. What is the chance that such a sequence has two lysines close to
-# the N-terminus simply by chance? Or put differently: what is the average
+# the N-terminus simply by chance? Or put differently: what is the average
-# distance of two lysines in such a sequence to the N-terminus. First, we
+# distance of two lysines in such a sequence to the N-terminus? First, we
-# need an expression that measures the distance. A simple use of the which()
+# need an expression that measures the distance. A simple use of the which()
-# function will do just fine.
+# function will do just fine.
-
+
-which(s == "K")        # shows they are in position 2 and 3, so ...
+which(s == "K")        # shows they are in position 2 and 3, so ...
-mean(which(s == "K"))  # ... gives us the average, and ...
+mean(which(s == "K"))  # ... gives us the average, and ...
-mean(which(x == "K"))  # ... gives us the average of the permuted sequence.
+mean(which(x == "K"))  # ... gives us the average of the permuted sequence.
-
+
-# So what does the distribution look like? Lets do 10,000 trials.
+# So what does the distribution look like? Let's do 10,000 trials.
-
+
-(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
+(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
-N <- 10000
+N <- 10000
-d <- numeric(N)
+d <- numeric(N)
-
+
-set.seed(112358)                       # set RNG seed for repeatable randomness
+set.seed(112358)                       # set RNG seed for repeatable randomness
-for (i in 1:N) {
+for (i in 1:N) {
-  d[i] <- mean(which(sample(s, length(s)) == "K"))
+  d[i] <- mean(which(sample(s, length(s)) == "K"))
-}
+}
-set.seed(NULL)                         # reset the RNG
+set.seed(NULL)                         # reset the RNG
-
+
-hist(d, breaks = 20)
+hist(d, breaks = 20)
-abline(v = 2.5, lwd = 2, col = "firebrick")
+abline(v = 2.5, lwd = 2, col = "firebrick")
-sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the
+sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the
-              # N-terminus or more. That's just below the signifcance
+              # N-terminus or more. That's just below the significance
-              # threshold of 5 %. It's a trend, but to be sure we are looking
+              # threshold of 5 %. It's a trend, but to be sure we are looking
-              # at a biological effect we would need to see more
+              # at a biological effect we would need to see more
-              # sequences.
+              # sequences.
-
+
-
+
-# ==   7.2  Sampling  ==========================================================
+# ==   7.2  Sampling  ==========================================================
-
+
-# ===   7.2.1  Equiprobable characters
+# ===   7.2.1  Equiprobable characters
-
+
-# Assume you need a large random-nucleotide string for some statistical model.
+# Assume you need a large random-nucleotide string for some statistical model.
-# How to create such a string? sample() can easily create it:
+# How to create such a string? sample() can easily create it:
-
+
-nuc <- c("A", "C", "G", "T")
+nuc <- c("A", "C", "G", "T")
-N <- 100
+N <- 100
-
+
-set.seed(16818)                        # set RNG seed for repeatable randomness
+set.seed(16818)                        # set RNG seed for repeatable randomness
-v <- sample(nuc, N, replace = TRUE)
+v <- sample(nuc, N, replace = TRUE)
-set.seed(NULL)                         # reset the RNG
+set.seed(NULL)                         # reset the RNG
-
+
-(mySeq <- paste(v, collapse = ""))
+(mySeq <- paste(v, collapse = ""))
-
+
-# What's the GC content?
+# What's the GC content?
-table(v)
+table(v)
-sum(table(v)[c("G", "C")]) # 51 is close to expected
+sum(table(v)[c("G", "C")]) # 51 is close to expected
-
+
-# What's the number of CpG motifs? Easy to check with the stringi
+# What's the number of CpG motifs? Easy to check with the stringi
-# stri_match_all() function
+# stri_match_all() function
-
+
-if (! requireNamespace("stringi", quietly = TRUE)) {
+if (! requireNamespace("stringi", quietly = TRUE)) {
-  install.packages("stringi")
+  install.packages("stringi")
-}
+}
-# Package information:
+# Package information:
-#  library(help = stringi)       # basic information
+#  library(help = stringi)       # basic information
-#  browseVignettes("stringi")    # available vignettes
+#  browseVignettes("stringi")    # available vignettes
-#  data(package = "stringi")     # available datasets
+#  data(package = "stringi")     # available datasets
-
+
-
+
-(x <- stringi::stri_match_all(mySeq, regex = "CG"))
+(x <- stringi::stri_match_all(mySeq, regex = "CG"))
-length(unlist(x))
+length(unlist(x))
-
+
-# Now you could compare that number with yeast DNA sequences, and determine
+# Now you could compare that number with yeast DNA sequences, and determine
-# whether there are more or less CpG motifs than expected by chance.
+# whether there are more or less CpG motifs than expected by chance.
-# (cf. https://en.wikipedia.org/wiki/CpG_site)
+# (cf. https://en.wikipedia.org/wiki/CpG_site)
-# But hold on: is that a fair comparison? sample() gives us all four nucleotides
+# But hold on: is that a fair comparison? sample() gives us all four nucleotides
-# with the same probability. But the yeast genomic DNA GC content is only
+# with the same probability. But the yeast genomic DNA GC content is only
-# 38%. So you would expect fewer CpG motifs based on the statistical properties
+# 38%. So you would expect fewer CpG motifs based on the statistical properties
-# of the smaller number of Cs and Gs - before biology even comes into play. How
+# of the smaller number of Cs and Gs - before biology even comes into play. How
-# do we account for that?
+# do we account for that?
-
+
-# ===   7.2.2  Defined probability vector
+# ===   7.2.2  Defined probability vector
-
+
-# This is where we need to know how to create samples with specific probability
+# This is where we need to know how to create samples with specific probability
-# distributions. A crude hack would be to create a sampling source vector with
+# distributions. A crude hack would be to create a sampling source vector with
-# 19 C, 19 G, 31 A and 31 T
+# 19 C, 19 G, 31 A and 31 T
-c(rep("C", 19), rep("G", 19), rep(c("A"), 31), rep(c("T"), 31))
+c(rep("C", 19), rep("G", 19), rep(c("A"), 31), rep(c("T"), 31))
-# ... but that doesn't scale if the numeric accuracy needs to be higher.
+# ... but that doesn't scale if the numeric accuracy needs to be higher.
-#
+#
-# However sample() has an argument that takes care of that: you can explicitly
+# However sample() has an argument that takes care of that: you can explicitly
-# specify the probabilities with which each element of the the sampling vector
+# specify the probabilities with which each element of the sampling vector
-# should be chosen:
+# should be chosen:
-
+
-nuc <- c("A", "C", "G", "T")
+nuc <- c("A", "C", "G", "T")
-N <- 100
+N <- 100
-myProb <- c(0.31, 0.19, 0.19, 0.31)    # sampling probabilities
+myProb <- c(0.31, 0.19, 0.19, 0.31)    # sampling probabilities
-
+
-set.seed(16818)                       # set RNG seed for repeatable randomness
+set.seed(16818)                       # set RNG seed for repeatable randomness
-v <- sample(nuc, N, prob = myProb, replace = TRUE)
+v <- sample(nuc, N, prob = myProb, replace = TRUE)
-set.seed(NULL)                         # reset the RNG
+set.seed(NULL)                         # reset the RNG
-
+
-(mySeq <- paste(v, collapse = ""))
+(mySeq <- paste(v, collapse = ""))
-
+
-# What's the GC content?
+# What's the GC content?
-table(v)
+table(v)
-sum(table(v)[c("G", "C")]) # Close to expected
+sum(table(v)[c("G", "C")]) # Close to expected
-
+
-# What's the number of CpG motifs?
+# What's the number of CpG motifs?
-(x <- stringi::stri_match_all(mySeq, regex = "CG"))
+(x <- stringi::stri_match_all(mySeq, regex = "CG"))
-# ... not a single one in this case.
+# ... not a single one in this case.
-
+
-
+
-
+
-# [END]
+# [END]
--- a/BIN-Storing_data.R
+++ b/BIN-Storing_data.R
--- a/FND-Genetic_code.R
+++ b/FND-Genetic_code.R
@ -1,349 +1,349 @@
-# tocID <- "FND-Genetic_code.R"
+# tocID <- "FND-Genetic_code.R"
-#
+#
-# Purpose:  A Bioinformatics Course:
+# Purpose:  A Bioinformatics Course:
-#              R code accompanying the FND-Genetic_code unit.
+#              R code accompanying the FND-Genetic_code unit.
-#
+#
-# Version:  1.2
+# Version:  1.2
-#
+#
-# Date:     2017  10  -  2019  01
+# Date:     2017  10  -  2019  01
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions:
+# Versions:
-#           1.2    2020 Maintenance
+#           1.2    2020 Maintenance
-#           1.1    Change from require() to requireNamespace(),
+#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout,
+#                      use <package>::<function>() idiom throughout,
-#                      use Biocmanager:: not biocLite()
+#                      use BiocManager:: not biocLite()
-#           1.0.1  Comment on "incomplete final line" warning in FASTA
+#           1.0.1  Comment on "incomplete final line" warning in FASTA
-#           1.0    First live version
+#           1.0    First live version
-#
+#
-# TODO:
+# TODO:
-#
+#
-#
+#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
+#
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
+# going on. That's not how it works ...
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC>
+#TOC>
-#TOC>   Section  Title                                            Line
+#TOC>   Section  Title                                            Line
-#TOC> ----------------------------------------------------------------
+#TOC> ----------------------------------------------------------------
-#TOC>   1        Storing the genetic code                           45
+#TOC>   1        Storing the genetic code                           45
-#TOC>   1.1        Genetic code in Biostrings                       63
+#TOC>   1.1        Genetic code in Biostrings                       63
-#TOC>   2        Working with the genetic code                      94
+#TOC>   2        Working with the genetic code                      94
-#TOC>   2.1        Translate a sequence.                           129
+#TOC>   2.1        Translate a sequence.                           129
-#TOC>   3        An alternative representation: 3D array           212
+#TOC>   3        An alternative representation: 3D array           212
-#TOC>   3.1        Print a Genetic code table                      246
+#TOC>   3.1        Print a Genetic code table                      246
-#TOC>   4        Tasks                                             272
+#TOC>   4        Tasks                                             272
-#TOC>
+#TOC>
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  Storing the genetic code  ============================================
+# =    1  Storing the genetic code  ============================================
-
+
-# The genetic code maps trinucleotide codons to amino acids. To store it, we
+# The genetic code maps trinucleotide codons to amino acids. To store it, we
-# need some mechanism to associate the two representations. The most
+# need some mechanism to associate the two representations. The most
-# convenient way to do that is a "named vector" which holds the amino acid
+# convenient way to do that is a "named vector" which holds the amino acid
-# code and assigns the codons as names to its elements.
+# code and assigns the codons as names to its elements.
-
+
-x <- c("M", "H", "H", "*", "*", "*")
+x <- c("M", "H", "H", "*", "*", "*")
-names(x) <- c("ATG", "CAC", "CAT", "TAA", "TAG", "TGA")
+names(x) <- c("ATG", "CAC", "CAT", "TAA", "TAG", "TGA")
-x
+x
-
+
-# Then we can access the vector by the codon as name, and retrieve the
+# Then we can access the vector by the codon as name, and retrieve the
-# amino acid ...
+# amino acid ...
-
+
-x["ATG"]
+x["ATG"]
-x["CAC"]
+x["CAC"]
-x["TAA"]
+x["TAA"]
-
+
-# ... or the names of elements, to retrieve the codon(s)
+# ... or the names of elements, to retrieve the codon(s)
-names(x)[x == "M"]
+names(x)[x == "M"]
-names(x)[x == "H"]
+names(x)[x == "H"]
-names(x)[x == "*"]
+names(x)[x == "*"]
-
+
-
+
-# ==   1.1  Genetic code in Biostrings  ========================================
+# ==   1.1  Genetic code in Biostrings  ========================================
-
+
-# Coveniently, the standard genetic code as well as its alternatives are
+# Conveniently, the standard genetic code as well as its alternatives are
-# available in the Bioconductor "Biostrings" package:
+# available in the Bioconductor "Biostrings" package:
-
+
-
+
-if (! requireNamespace("BiocManager", quietly = TRUE)) {
+if (! requireNamespace("BiocManager", quietly = TRUE)) {
-  install.packages("BiocManager")
+  install.packages("BiocManager")
-}
+}
-if (! requireNamespace("Biostrings", quietly = TRUE)) {
+if (! requireNamespace("Biostrings", quietly = TRUE)) {
-  BiocManager::install("Biostrings")
+  BiocManager::install("Biostrings")
-}
+}
-# Package information:
+# Package information:
-#  library(help = Biostrings)       # basic information
+#  library(help = Biostrings)       # basic information
-#  browseVignettes("Biostrings")    # available vignettes
+#  browseVignettes("Biostrings")    # available vignettes
-#  data(package = "Biostrings")     # available datasets
+#  data(package = "Biostrings")     # available datasets
-
+
-
+
-# The standard genetic code vector
+# The standard genetic code vector
-Biostrings::GENETIC_CODE
+Biostrings::GENETIC_CODE
-
+
-# The table of genetic codes. This information corresponds to this page
+# The table of genetic codes. This information corresponds to this page
-# at the NCBI:
+# at the NCBI:
-# https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes
+# https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes
-Biostrings::GENETIC_CODE_TABLE
+Biostrings::GENETIC_CODE_TABLE
-
+
-# Most of the alternative codes are mitochondrial codes. The id of the
+# Most of the alternative codes are mitochondrial codes. The id of the
-# Alternative Yeast Nuclear code is "12"
+# Alternative Yeast Nuclear code is "12"
-Biostrings::getGeneticCode("12")  # Alternative Yeast Nuclear
+Biostrings::getGeneticCode("12")  # Alternative Yeast Nuclear
-
+
-
+
-# =    2  Working with the genetic code  =======================================
+# =    2  Working with the genetic code  =======================================
-
+
-# We'll use Biostrings::GENETIC_CODE a lot in this script, so we'll assign it
+# We'll use Biostrings::GENETIC_CODE a lot in this script, so we'll assign it
-# to a "local" variable, rather than retrieving it from the package all the
+# to a "local" variable, rather than retrieving it from the package all the
-# time.
+# time.
-
+
-GC <- Biostrings::GENETIC_CODE
+GC <- Biostrings::GENETIC_CODE
-
+
-# This is a named vector of characters ...
+# This is a named vector of characters ...
-
+
-str(GC)
+str(GC)
-
+
-# ... which also stores the alternative initiation codons TTG and CTG in
+# ... which also stores the alternative initiation codons TTG and CTG in
-# an attribute of the vector. (Alternative initiation codons sometimes are
+# an attribute of the vector. (Alternative initiation codons sometimes are
-# used instead of ATG to intiate translation, if translation is not initiated
+# used instead of ATG to initiate translation, if translation is not initiated
-# at ATG thses are still translated with fMet.)
+# at ATG these are still translated with fMet.)
-
+
-attr(GC, "alt_init_codons")
+attr(GC, "alt_init_codons")
-
+
-# But the key to use this vector is in the "names" which we use for subsetting
+# But the key to use this vector is in the "names" which we use for subsetting
-# the list of amino acids in whatever way we need.
+# the list of amino acids in whatever way we need.
-names(GC)
+names(GC)
-
+
-# The translation of "TGG" ...
+# The translation of "TGG" ...
-GC["TGG"]
+GC["TGG"]
-
+
-# All stop codons
+# All stop codons
-names(GC)[GC == "*"]
+names(GC)[GC == "*"]
-
+
-# All start codons
+# All start codons
-names(GC)[GC == "M"] # ... or
+names(GC)[GC == "M"] # ... or
-c(names(GC)[GC == "M"],
+c(names(GC)[GC == "M"],
-  attr(GC, "alt_init_codons"))
+  attr(GC, "alt_init_codons"))
-
+
-
+
-# ==   2.1  Translate a sequence.  =============================================
+# ==   2.1  Translate a sequence.  =============================================
-
+
-
+
-# I have provided a gene sequence in the data directory:
+# I have provided a gene sequence in the data directory:
-# S288C_YDL056W_MBP1_coding.fsa is the yeast Mbp1 FASTA sequence.
+# S288C_YDL056W_MBP1_coding.fsa is the yeast Mbp1 FASTA sequence.
-
+
-# read it
+# read it
-mbp1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
+mbp1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
-
+
-# You will notice that this generates a Warning message:
+# You will notice that this generates a Warning message:
-#      Warning message:
+#      Warning message:
-#        In readLines("./data/S288C_YDL056W_MBP1_coding.fsa") :
+#        In readLines("./data/S288C_YDL056W_MBP1_coding.fsa") :
-#        incomplete final line found on './data/S288C_YDL056W_MBP1_coding.fsa'
+#        incomplete final line found on './data/S288C_YDL056W_MBP1_coding.fsa'
-
+
-# The reason for this is that the last character of the file is the letter "A"
+# The reason for this is that the last character of the file is the letter "A"
-# and not a "\n" line break. This file is exactly how it was sent from the
+# and not a "\n" line break. This file is exactly how it was sent from the
-# NCBI server; I think good, defensive programming practice would have been to
+# NCBI server; I think good, defensive programming practice would have been to
-# include some kind of an end-marker in the file, like a final "\n". This helps
+# include some kind of an end-marker in the file, like a final "\n". This helps
-# us recognize an incomplete transmission. Let's parse the actual sequence from
+# us recognize an incomplete transmission. Let's parse the actual sequence from
-# the file, and then check for completeness.
+# the file, and then check for completeness.
-
+
-
+
-head(mbp1)
+head(mbp1)
-
+
-# drop the first line (header)
+# drop the first line (header)
-mbp1 <- mbp1[-1]
+mbp1 <- mbp1[-1]
-head(mbp1)
+head(mbp1)
-
+
-# concatenate it all to a single string
+# concatenate it all to a single string
-mbp1 <- paste(mbp1, sep = "", collapse = "")
+mbp1 <- paste(mbp1, sep = "", collapse = "")
-
+
-# how long is it?
+# how long is it?
-nchar(mbp1)
+nchar(mbp1)
-
+
-# how many codons?
+# how many codons?
-nchar(mbp1)/3
+nchar(mbp1)/3
-
+
-# That looks correct for the 833 aa sequence plus 1 stop codon. This gives us a
+# That looks correct for the 833 aa sequence plus 1 stop codon. This gives us a
-# first verification that the file we read is complete, the nucleotides of a
+# first verification that the file we read is complete, the nucleotides of a
-# complete ORF should be divisible by 3.
+# complete ORF should be divisible by 3.
-
+
-# Extract the codons. There are many ways to split a long string into chunks
+# Extract the codons. There are many ways to split a long string into chunks
-# of three characters. Here we use the Biostrings  codons()  function. codons()
+# of three characters. Here we use the Biostrings  codons()  function. codons()
-# requires an object of type DNAstring - a special kind of string with
+# requires an object of type DNAString - a special kind of string with
-# attributes that are useful for Biostrings. Thus we convert the sequence first
+# attributes that are useful for Biostrings. Thus we convert the sequence first
-# with DNAstring(), then split it up, then convert it into a plain
+# with DNAString(), then split it up, then convert it into a plain
-# character vector.
+# character vector.
-mbp1Codons <- as.character(Biostrings::codons(Biostrings::DNAString(mbp1)))
+mbp1Codons <- as.character(Biostrings::codons(Biostrings::DNAString(mbp1)))
-
+
-head(mbp1Codons)
+head(mbp1Codons)
-
+
-# now translate each codon
+# now translate each codon
-
+
-mbp1AA <- character(834)
+mbp1AA <- character(834)
-for (i in seq_along(mbp1Codons)) {
+for (i in seq_along(mbp1Codons)) {
-  mbp1AA[i] <- GC[mbp1Codons[i]]
+  mbp1AA[i] <- GC[mbp1Codons[i]]
-}
+}
-
+
-head(mbp1Codons)
+head(mbp1Codons)
-head(mbp1AA)
+head(mbp1AA)
-
+
-tail(mbp1Codons)
+tail(mbp1Codons)
-tail(mbp1AA) # Note the stop!
+tail(mbp1AA) # Note the stop!
-
+
-# The TAA "ochre" stop codon is our second verification that the nucleotide
+# The TAA "ochre" stop codon is our second verification that the nucleotide
-# sequence is complete: a stop codon can't appear internally in an ORF.
+# sequence is complete: a stop codon can't appear internally in an ORF.
-
+
-# We can work with the mbp1AA vector, for example to tabulate the
+# We can work with the mbp1AA vector, for example to tabulate the
-# amino acid frequencies:
+# amino acid frequencies:
-table(mbp1AA)
+table(mbp1AA)
-sort(table(mbp1AA), decreasing = TRUE)
+sort(table(mbp1AA), decreasing = TRUE)
-
+
-# Or we can paste all elements together into a single string. But let's remove
+# Or we can paste all elements together into a single string. But let's remove
-# the stop, it's not actually a part of the sequence. To remove the last element
+# the stop, it's not actually a part of the sequence. To remove the last element
-# of a vector, re-assign it with a vector minus the index of the last element:
+# of a vector, re-assign it with a vector minus the index of the last element:
-mbp1AA <- mbp1AA[-(length(mbp1AA))]
+mbp1AA <- mbp1AA[-(length(mbp1AA))]
-tail(mbp1AA) # Note the stop is gone!
+tail(mbp1AA) # Note the stop is gone!
-
+
-# paste it together, collapsing the elements using an empty string as the
+# paste it together, collapsing the elements using an empty string as the
-# separation-character (i.e.: nothing)
+# separation-character (i.e.: nothing)
-(Mbp1 <- paste(mbp1AA, sep = "", collapse = ""))
+(Mbp1 <- paste(mbp1AA, sep = "", collapse = ""))
-
+
-
+
-# =    3  An alternative representation: 3D array  =============================
+# =    3  An alternative representation: 3D array  =============================
-
+
-
+
-# We don't use 3D arrays often - usually just 2D tables and data frames, so
+# We don't use 3D arrays often - usually just 2D tables and data frames, so
-# here is a good opportunity to review the syntax of 3D arrays with a
+# here is a good opportunity to review the syntax of 3D arrays with a
-# genetic code cube:
+# genetic code cube:
-
+
-# Initialize, using A G C T as the names of the elements in each dimension
+# Initialize, using A G C T as the names of the elements in each dimension
-cCube <- array(data     = character(64),
+cCube <- array(data     = character(64),
-               dim      = c(4, 4, 4),
+               dim      = c(4, 4, 4),
-               dimnames = list(c("A", "G", "C", "T"),
+               dimnames = list(c("A", "G", "C", "T"),
-                               c("A", "G", "C", "T"),
+                               c("A", "G", "C", "T"),
-                               c("A", "G", "C", "T")))
+                               c("A", "G", "C", "T")))
-
+
-# fill it with amino acid codes using three nested loops
+# fill it with amino acid codes using three nested loops
-for (i in 1:4) {
+for (i in 1:4) {
-  for (j in 1:4) {
+  for (j in 1:4) {
-    for (k in 1:4) {
+    for (k in 1:4) {
-      myCodon <- paste(dimnames(cCube)[[1]][i],
+      myCodon <- paste(dimnames(cCube)[[1]][i],
-                       dimnames(cCube)[[2]][j],
+                       dimnames(cCube)[[2]][j],
-                       dimnames(cCube)[[3]][k],
+                       dimnames(cCube)[[3]][k],
-                       sep = "",
+                       sep = "",
-                       collapse = "")
+                       collapse = "")
-      cCube[i, j, k] <- GC[myCodon]
+      cCube[i, j, k] <- GC[myCodon]
-    }
+    }
-  }
+  }
-}
+}
-
+
-# confirm
+# confirm
-cCube["A", "T", "G"] # methionine
+cCube["A", "T", "G"] # methionine
-cCube["T", "T", "T"] # phenylalanine
+cCube["T", "T", "T"] # phenylalanine
-cCube["T", "A", "G"] # stop (amber)
+cCube["T", "A", "G"] # stop (amber)
-
+
-
+
-
+
-# ==   3.1  Print a Genetic code table  ========================================
+# ==   3.1  Print a Genetic code table  ========================================
-
+
-
+
-# The data structure of our cCube is well suited to print a table. In the
+# The data structure of our cCube is well suited to print a table. In the
-# "standard" way to print the genetic code, we write codons with the same
+# "standard" way to print the genetic code, we write codons with the same
-# second nucleotide in columns, and arrange rows in blocks of same
+# second nucleotide in columns, and arrange rows in blocks of same
-# first nucleotide, varying the third nucleotide fastest. This maximizes the
+# first nucleotide, varying the third nucleotide fastest. This maximizes the
-# similarity of adjacent amino acids in the table if we print the
+# similarity of adjacent amino acids in the table if we print the
-# nucleotides in the order T C A G. It's immediately obvious that the code
+# nucleotides in the order T C A G. It's immediately obvious that the code
-# is not random: the universal genetic code is exceptionally error tolerant in
+# is not random: the universal genetic code is exceptionally error tolerant in
-# the sense that mutations (or single-nucleotide translation errors) are likely
+# the sense that mutations (or single-nucleotide translation errors) are likely
-# to result in an amino acid with similar biophysical properties as the
+# to result in an amino acid with similar biophysical properties as the
-# original.
+# original.
-
+
-nuc <- c("T", "C", "A", "G")
+nuc <- c("T", "C", "A", "G")
-
+
-# (calling variables f, s, t to indicate first, second, and third position ...)
+# (calling variables f, s, t to indicate first, second, and third position ...)
-for (f in nuc) {      # first varies in blocks
+for (f in nuc) {      # first varies in blocks
-  for (t in nuc) {    # third varies in columns
+  for (t in nuc) {    # third varies in columns
-    for (s in nuc) {  # second varies in rows
+    for (s in nuc) {  # second varies in rows
-      cat(sprintf("%s%s%s: %s   ", f, s, t, cCube[f, s, t]))
+      cat(sprintf("%s%s%s: %s   ", f, s, t, cCube[f, s, t]))
-    }
+    }
-    cat("\n")
+    cat("\n")
-  }
+  }
-  cat("\n")
+  cat("\n")
-}
+}
-
+
-
+
-# =    4  Tasks  ===============================================================
+# =    4  Tasks  ===============================================================
-
+
-
+
-# Task: What do you need to change to print the table with U instead
+# Task: What do you need to change to print the table with U instead
-#         of T? Try it.
+#         of T? Try it.
-
+
-
+
-# Task: Point mutations are more often transitions (purine -> purine;
+# Task: Point mutations are more often transitions (purine -> purine;
-#         pyrimidine -> pyrimidine) than transversions (purine -> pyrimidine;
+#         pyrimidine -> pyrimidine) than transversions (purine -> pyrimidine;
-#         pyrimidine -> purine), even though twice as many transversions
+#         pyrimidine -> purine), even though twice as many transversions
-#         are possible in the code. This is most likely due to a deamination /
+#         are possible in the code. This is most likely due to a deamination /
-#         tautomerization process that favours C -> T changes. If the code
+#         tautomerization process that favours C -> T changes. If the code
-#         indeed minimizes the effect of mutations, you would expect that
+#         indeed minimizes the effect of mutations, you would expect that
-#         codons that differ by a transition code for more similar amino acids
+#         codons that differ by a transition code for more similar amino acids
-#         than codons that differ by a transversion. Is that true? List the set
+#         than codons that differ by a transversion. Is that true? List the set
-#         of all amino acid pairs that are encoded by codons with a C -> T
+#         of all amino acid pairs that are encoded by codons with a C -> T
-#         transition. Then list the set of amino acid pairs with a C -> A
+#         transition. Then list the set of amino acid pairs with a C -> A
-#         transversion. Which set of pairs is more similar?
+#         transversion. Which set of pairs is more similar?
-
+
-
+
-# Task: How many stop codons do the two mbp1-gene derived amino acid sequences
+# Task: How many stop codons do the two mbp1-gene derived amino acid sequences
-#         have if you translate them in the 2nd or the 3rd frame?
+#         have if you translate them in the 2nd or the 3rd frame?
-
+
-
+
-# Task: How does the amino acid composition change if you translate the mbp1
+# Task: How does the amino acid composition change if you translate the mbp1
-#         gene with the Alternative Yeast Nuclear code that is used by the
+#         gene with the Alternative Yeast Nuclear code that is used by the
-#         "GTC clade" of fungi?
+#         "GTC clade" of fungi?
-#         (cf. https://en.wikipedia.org/wiki/Alternative_yeast_nuclear_code )
+#         (cf. https://en.wikipedia.org/wiki/Alternative_yeast_nuclear_code )
-
+
-# Solution:
+# Solution:
-
+
-    # Fetch the code
+    # Fetch the code
-    Biostrings::GENETIC_CODE_TABLE
+    Biostrings::GENETIC_CODE_TABLE
-    Biostrings::GENETIC_CODE_TABLE$name[Biostrings::GENETIC_CODE_TABLE$id=="12"]
+    Biostrings::GENETIC_CODE_TABLE$name[Biostrings::GENETIC_CODE_TABLE$id=="12"]
-    altYcode <- Biostrings::getGeneticCode("12")
+    altYcode <- Biostrings::getGeneticCode("12")
-
+
-    # what's the difference?
+    # what's the difference?
-    (delta <- which(Biostrings::GENETIC_CODE != altYcode))
+    (delta <- which(Biostrings::GENETIC_CODE != altYcode))
-
+
-    Biostrings::GENETIC_CODE[delta]
+    Biostrings::GENETIC_CODE[delta]
-    altYcode[delta]
+    altYcode[delta]
-
+
-    # translate
+    # translate
-    altYAA <- character(834)
+    altYAA <- character(834)
-    for (i in seq_along(mbp1Codons)) {
+    for (i in seq_along(mbp1Codons)) {
-      altYAA[i] <- altYcode[mbp1Codons[i]]
+      altYAA[i] <- altYcode[mbp1Codons[i]]
-    }
+    }
-
+
-    table(mbp1AA)
+    table(mbp1AA)
-    table(altYAA)
+    table(altYAA)
-
+
-# Task: The genetic code has significant redundancy, i.e. there are up to six
+# Task: The genetic code has significant redundancy, i.e. there are up to six
-#         codons that code for the same amino acid. Write code that lists how
+#         codons that code for the same amino acid. Write code that lists how
-#         many amino acids are present how often i.e. it should tell you that
+#         many amino acids are present how often i.e. it should tell you that
-#         two amino acids are encoded only with a single codon, three amino
+#         two amino acids are encoded only with a single codon, three amino
-#         acids have six codons, etc. Solution below, but don't peek. There
+#         acids have six codons, etc. Solution below, but don't peek. There
-#         are many possible ways to do this.
+#         are many possible ways to do this.
-#
+#
-#
+#
-# Solution:
+# Solution:
-( x <- table(table(Biostrings::GENETIC_CODE)) )
+( x <- table(table(Biostrings::GENETIC_CODE)) )
-
+
-# confirm
+# confirm
-sum(x * as.numeric(names(x)))
+sum(x * as.numeric(names(x)))
-
+
-
+
-
+
-# [END]
+# [END]
--- a/FND-MAT-Graphs_and_networks.R
+++ b/FND-MAT-Graphs_and_networks.R
--- a/FND-STA-Information_theory.R
+++ b/FND-STA-Information_theory.R
@ -1,224 +1,224 @@
-# tocID <- "FND-STA-Information_theory.R"
+# tocID <- "FND-STA-Information_theory.R"
-#
+#
-# ==============================================================================
+# ==============================================================================
-#
+#
-# Purpose:  A Bioinformatics Course:
+# Purpose:  A Bioinformatics Course:
-#              R code accompanying the FND-STA-Information_theory unit.
+#              R code accompanying the FND-STA-Information_theory unit.
-#
+#
-# Version:  0.2.1
+# Version:  0.2.1
-#
+#
-# Date:     2017 - 2021
+# Date:     2017 - 2021
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions:
+# Versions:
-#           0.2.1  Maintenance
+#           0.2.1  Maintenance
-#           0.2    Under development
+#           0.2    Under development
-#           0.1    First code copied from 2016 material.
+#           0.1    First code copied from 2016 material.
-#
+#
-#
+#
-# TODO:
+# TODO:
-#
+#
-#
+#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
+#
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
+# going on. That's not how it works ...
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC> 
+#TOC> 
-#TOC>   Section  Title                  Line
+#TOC>   Section  Title                  Line
-#TOC> --------------------------------------
+#TOC> --------------------------------------
-#TOC>   1        ___Section___            39
+#TOC>   1        ___Section___            39
-#TOC> 
+#TOC> 
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  ___Section___  =======================================================
+# =    1  ___Section___  =======================================================
-
+
-# What level of information is "significant"
+# What level of information is "significant"
-
+
-# Assume the background distribution is the database frequencies of
+# Assume the background distribution is the database frequencies of
-# amino acids:
+# amino acids:
-
+
-AAref <- numeric()  # Uniprot frequencies October 2017, slightly adjusted to
+AAref <- numeric()  # Uniprot frequencies October 2017, slightly adjusted to
-# sum to 1.0
+# sum to 1.0
-AAref["A"] <- 0.0904
+AAref["A"] <- 0.0904
-AAref["C"] <- 0.0123
+AAref["C"] <- 0.0123
-AAref["D"] <- 0.0545
+AAref["D"] <- 0.0545
-AAref["E"] <- 0.0617
+AAref["E"] <- 0.0617
-AAref["F"] <- 0.0394
+AAref["F"] <- 0.0394
-AAref["G"] <- 0.0724
+AAref["G"] <- 0.0724
-AAref["H"] <- 0.0221
+AAref["H"] <- 0.0221
-AAref["I"] <- 0.0573
+AAref["I"] <- 0.0573
-AAref["K"] <- 0.0504
+AAref["K"] <- 0.0504
-AAref["L"] <- 0.0986
+AAref["L"] <- 0.0986
-AAref["M"] <- 0.0240
+AAref["M"] <- 0.0240
-AAref["N"] <- 0.0392
+AAref["N"] <- 0.0392
-AAref["P"] <- 0.0486
+AAref["P"] <- 0.0486
-AAref["Q"] <- 0.0381
+AAref["Q"] <- 0.0381
-AAref["R"] <- 0.0570
+AAref["R"] <- 0.0570
-AAref["S"] <- 0.0673
+AAref["S"] <- 0.0673
-AAref["T"] <- 0.0558
+AAref["T"] <- 0.0558
-AAref["V"] <- 0.0686
+AAref["V"] <- 0.0686
-AAref["W"] <- 0.0129
+AAref["W"] <- 0.0129
-AAref["Y"] <- 0.0294
+AAref["Y"] <- 0.0294
-sum(AAref)
+sum(AAref)
-
+
-# Function to calculate Shannon entropy
+# Function to calculate Shannon entropy
-H <- function(pmf) {
+H <- function(pmf) {
-  # Calculate Shannon entropy
+  # Calculate Shannon entropy
-  # Parameters:
+  # Parameters:
-  #   pmf (numeric) probability mass function: a vector of states and
+  #   pmf (numeric) probability mass function: a vector of states and
-  #                 associated probabilities. Each element of
+  #                 associated probabilities. Each element of
-  #                 pmf must be in (0, 1] and sum(pmf) must be 1.
+  #                 pmf must be in (0, 1] and sum(pmf) must be 1.
-  # Value:
+  # Value:
-  #   Shannon entropy in bits.
+  #   Shannon entropy in bits.
-  # Examples:
+  # Examples:
-  #   H(c(A=0.25, C=0.25, G=0.25, T=0.25))  # 2 bits entropy in a random
+  #   H(c(A=0.25, C=0.25, G=0.25, T=0.25))  # 2 bits entropy in a random
-  #                                         # nucleotide sequence
+  #                                         # nucleotide sequence
-  #   H(1)     # If all elements are the same, entropy is zero
+  #   H(1)     # If all elements are the same, entropy is zero
-  #
+  #
-  if (any(pmf <= 0 | pmf > 1) || isFALSE(all.equal(1.0, sum(pmf)))) {
+  if (any(pmf <= 0 | pmf > 1) || isFALSE(all.equal(1.0, sum(pmf)))) {
-    stop("Input is not a discrete probability distribution.")
+    stop("Input is not a discrete probability distribution.")
-  }
+  }
-  H <- -sum(pmf * (log(pmf) / log(2)))
+  H <- -sum(pmf * (log(pmf) / log(2)))
-  return(H)
+  return(H)
-}
+}
-
+
-# Why use all.equal()? Exact comparisons with floating point numbers are
+# Why use all.equal()? Exact comparisons with floating point numbers are
-# brittle. Consider for example:
+# brittle. Consider for example:
-1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6 == 1
+1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6 == 1
-print(1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6, digits = 22) # 0.9999999999999998889777
+print(1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6, digits = 22) # 0.9999999999999998889777
-# all.equal() tests for _near_ equality with tolerance of ~ 1.5e-8
+# all.equal() tests for _near_ equality with tolerance of ~ 1.5e-8
-
+
-
+
-
+
-# Entropy of the database frequencies (in bits):
+# Entropy of the database frequencies (in bits):
-(Href <- H(AAref))
+(Href <- H(AAref))
-
+
-# for comparison: entropy if all amino acids are equiprobable
+# for comparison: entropy if all amino acids are equiprobable
-H(rep(0.05, 20))
+H(rep(0.05, 20))
-
+
-
+
-# Set up a simulation to estimate the distribution of Information values
+# Set up a simulation to estimate the distribution of Information values
-# from random sequences drawn from AAref. This is the distribution for the
+# from random sequences drawn from AAref. This is the distribution for the
-# statistical null hypothesis:
+# statistical null hypothesis:
-nObs <- 15                      # number of observations (e.g aligned sequences)
+nObs <- 15                      # number of observations (e.g aligned sequences)
-# nObs <- 80
+# nObs <- 80
-nTrials <- 10000                # number of trials
+nTrials <- 10000                # number of trials
-IObs <- numeric(nTrials)        # vector to store Information in each trial
+IObs <- numeric(nTrials)        # vector to store Information in each trial
-simCounts <- numeric(20)        # vector to tabulate our information ...
+simCounts <- numeric(20)        # vector to tabulate our information ...
-names(simCounts) <- names(AAref)# ... with the names of AAref
+names(simCounts) <- names(AAref)# ... with the names of AAref
-
+
-
+
-for (i in 1:nTrials) {  # simulate ...
+for (i in 1:nTrials) {  # simulate ...
-
+
-  # sample AAref letters, nObs times, with the probabilities of AAref:
+  # sample AAref letters, nObs times, with the probabilities of AAref:
-  AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE)
+  AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE)
-
+
-  x <- table(AAobs)                            # table simulated observations
+  x <- table(AAobs)                            # table simulated observations
-  simCounts[1:20] <- rep(0, length(simCounts)) # initialize simCounts to 0
+  simCounts[1:20] <- rep(0, length(simCounts)) # initialize simCounts to 0
-  simCounts[names(x)] <- x                     # overwrite with observed counts
+  simCounts[names(x)] <- x                     # overwrite with observed counts
-  simCounts <- simCounts + 0.5                 # add Jeffreys' pseudocounts
+  simCounts <- simCounts + 0.5                 # add Jeffreys' pseudocounts
-  Hobs <- H(simCounts/sum(simCounts))          # counts to frequency, calc. H
+  Hobs <- H(simCounts/sum(simCounts))          # counts to frequency, calc. H
-  IObs[i] <- Href - Hobs                       # store information
+  IObs[i] <- Href - Hobs                       # store information
-}
+}
-
+
-# evaluate
+# evaluate
-hist(IObs, col = "#C9F4E3", xlim = c(-0.2, 1.0), breaks = 25)
+hist(IObs, col = "#C9F4E3", xlim = c(-0.2, 1.0), breaks = 25)
-abline(v = quantile(IObs, c(0.05, 0.95)), col = "#AA00CC")
+abline(v = quantile(IObs, c(0.05, 0.95)), col = "#AA00CC")
-
+
-# The purple lines are drawn at the 5% quantiles of the Iobs distributions -
+# The purple lines are drawn at the 5% quantiles of the Iobs distributions -
-# i.e. an actual observation that lies outside the purple lines is deemed
+# i.e. an actual observation that lies outside the purple lines is deemed
-# "significant"(1)(2). Of course, this is only true to the degree that the
+# "significant"(1)(2). Of course, this is only true to the degree that the
-# database frequencies are a valid model for the null-hypothesis on the
+# database frequencies are a valid model for the null-hypothesis on the
-# sequence position we are considering here.
+# sequence position we are considering here.
-
+
-#  (1) If we use 5% quantiles, this means a value is significantly larger
+#  (1) If we use 5% quantiles, this means a value is significantly larger
-#      than expected, and we ignore cases when the value is < 0; if we
+#      than expected, and we ignore cases when the value is < 0; if we
-#      consider both smaller and larger values, we need to use 2.5% quantiles,
+#      consider both smaller and larger values, we need to use 2.5% quantiles,
-#      since 5% of all observations lie outside the 0.025 and 0.975
+#      since 5% of all observations lie outside the 0.025 and 0.975
-#      quantiles.
+#      quantiles.
-#
+#
-#  (2) For an actual observation of counts, we calculate its observed
+#  (2) For an actual observation of counts, we calculate its observed
-#      _empirical_p_Value_ as (nCounts + 1)/(nTotal + 1).
+#      _empirical_p_Value_ as (nCounts + 1)/(nTotal + 1).
-
+
-
+
-# You can probably now appreciate that information is a bit of a shortcut for
+# You can probably now appreciate that information is a bit of a shortcut for
-# biological sequences, and does not really take the different inherent
+# biological sequences, and does not really take the different inherent
-# frequencies based on the character of the amino acids into account. For
+# frequencies based on the character of the amino acids into account. For
-# example, L is the most frequent and C is the least frequent, but if we have an
+# example, L is the most frequent and C is the least frequent, but if we have an
-# alignment of 1000 sequences and we see that the frequencies for L and C are
+# alignment of 1000 sequences and we see that the frequencies for L and C are
-# swapped, that would be _very_ surprising - nevertheless, the information would
+# swapped, that would be _very_ surprising - nevertheless, the information would
-# be 0. In order to take that into account, we should actually compute
+# be 0. In order to take that into account, we should actually compute
-# Kullback-Leibler divergences.
+# Kullback-Leibler divergences.
-
+
-
+
-# Swap C and L frequencies
+# Swap C and L frequencies
-p <- AAref
+p <- AAref
-q <- AAref
+q <- AAref
-q["L"] <- AAref["C"]
+q["L"] <- AAref["C"]
-q["C"] <- AAref["L"]
+q["C"] <- AAref["L"]
-H(p)
+H(p)
-H(q)
+H(q)
-
+
-KLdiv <- function(p, q) {
+KLdiv <- function(p, q) {
-  # p and q are two pmfs of discrete probability distributions
+  # p and q are two pmfs of discrete probability distributions
-  # with the same outcomes, which are nowhere 0.
+  # with the same outcomes, which are nowhere 0.
-  # Value:  Kullback-Leibler divergence  sum(p * log( p / q )).
+  # Value:  Kullback-Leibler divergence  sum(p * log( p / q )).
-
+
-  if (length(p) != length(q)) {
+  if (length(p) != length(q)) {
-    stop("PANIC: input vector lengths differ!")
+    stop("PANIC: input vector lengths differ!")
-  }
+  }
-  if (any(c((p == 0), (q == 0)))) {
+  if (any(c((p == 0), (q == 0)))) {
-    stop("PANIC: 0's found in input vectors!")
+    stop("PANIC: 0's found in input vectors!")
-  }
+  }
-
+
-  return(sum(p * log( p / q )))
+  return(sum(p * log( p / q )))
-}
+}
-
+
-KLdiv(p, p)
+KLdiv(p, p)
-KLdiv(p, q)
+KLdiv(p, q)
-
+
-
+
-nObs <- 15                      # number of observations (e.g aligned sequences)
+nObs <- 15                      # number of observations (e.g aligned sequences)
-# nObs <- 80
+# nObs <- 80
-nTrials <- 10000                # number of trials
+nTrials <- 10000                # number of trials
-KLdivObs <- numeric(nTrials)        # vector to store KL divergence per trial
+KLdivObs <- numeric(nTrials)        # vector to store KL divergence per trial
-simCounts <- numeric(20)        # vector to tabulate our information ...
+simCounts <- numeric(20)        # vector to tabulate our information ...
-names(simCounts) <- names(AAref)# ... with the names of AAref
+names(simCounts) <- names(AAref)# ... with the names of AAref
-
+
-
+
-for (i in 1:nTrials) {  # simulate ...
+for (i in 1:nTrials) {  # simulate ...
-
+
-  # sample AAref letters, nObs times, with the probabilities of AAref:
+  # sample AAref letters, nObs times, with the probabilities of AAref:
-  AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE)
+  AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE)
-
+
-  x <- table(AAobs)                            # table simulated observations
+  x <- table(AAobs)                            # table simulated observations
-  simCounts[1:20] <- rep(0, length(simCounts)) # initialize simCounts to 0
+  simCounts[1:20] <- rep(0, length(simCounts)) # initialize simCounts to 0
-  simCounts[names(x)] <- x                     # overwrite with observed counts
+  simCounts[names(x)] <- x                     # overwrite with observed counts
-  simCounts <- simCounts + 0.5                 # add Jeffreys' pseudocounts
+  simCounts <- simCounts + 0.5                 # add Jeffreys' pseudocounts
-  simCounts <- simCounts/sum(simCounts)        # counts to frequency
+  simCounts <- simCounts/sum(simCounts)        # counts to frequency
-  KLdivObs[i] <- sum(simCounts * log( simCounts / AAref )) # store KLdiv
+  KLdivObs[i] <- sum(simCounts * log( simCounts / AAref )) # store KLdiv
-}
+}
-
+
-# evaluate
+# evaluate
-hist(KLdivObs, col = "#C9F4E3", breaks = 25)
+hist(KLdivObs, col = "#C9F4E3", breaks = 25)
-abline(v = quantile(KLdivObs, c(0.05, 0.95)), col = "#AA00CC")
+abline(v = quantile(KLdivObs, c(0.05, 0.95)), col = "#AA00CC")
-quantile(KLdivObs, 0.992)
+quantile(KLdivObs, 0.992)
-
+
-# Running the simulation with KL does not give a fundamentally
+# Running the simulation with KL does not give a fundamentally
-# different behaviour - since we are just randomly sampling. But KL would be
+# different behaviour - since we are just randomly sampling. But KL would be
-# more sensitive in case there is biological selection, where the sampling is no
+# more sensitive in case there is biological selection, where the sampling is no
-# longer random. If I run the same simulation, with nObs <- 80 but calculating
+# longer random. If I run the same simulation, with nObs <- 80 but calculating
-# KLdiv instead of information, I get a 5% quantile at 0.15 - but the C/L
+# KLdiv instead of information, I get a 5% quantile at 0.15 - but the C/L
-# frequency swap gives me a KL divergence of 0.18 - this is significant at p =
+# frequency swap gives me a KL divergence of 0.18 - this is significant at p =
-# 0.008 - (remember, Information is 0 in this case). So that's actually quite a
+# 0.008 - (remember, Information is 0 in this case). So that's actually quite a
-# nice addition to the toolbox.
+# nice addition to the toolbox.
-
+
-
+
-# [END]
+# [END]
--- a/FND-STA-Probability_distribution.R
+++ b/FND-STA-Probability_distribution.R
--- a/FND-STA-Significance.R
+++ b/FND-STA-Significance.R
@ -1,351 +1,351 @@
-# tocID <- "FND-STA-Significance.R"
+# tocID <- "FND-STA-Significance.R"
-#
+#
-#
+#
-# Purpose:  A Bioinformatics Course:
+# Purpose:  A Bioinformatics Course:
-#              R code accompanying the FND-STA-Significance unit.
+#              R code accompanying the FND-STA-Significance unit.
-#
+#
-# Version:  1.3
+# Version:  1.3
-#
+#
-# Date:     2017-09  - 2020-09
+# Date:     2017-09  - 2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions:
+# Versions:
-#           1.3    2020 Maintenance. Add sample solution.
+#           1.3    2020 Maintenance. Add sample solution.
-#           1.2    Update set.seed() usage
+#           1.2    Update set.seed() usage
-#           1.1    Corrected treatment of empirical p-value
+#           1.1    Corrected treatment of empirical p-value
-#           1.0    First contents
+#           1.0    First contents
-#
+#
-# TODO:
+# TODO:
-#
+#
-#
+#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
+#
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
+# going on. That's not how it works ...
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC> 
+#TOC> 
-#TOC>   Section  Title                                              Line
+#TOC>   Section  Title                                              Line
-#TOC> ------------------------------------------------------------------
+#TOC> ------------------------------------------------------------------
-#TOC>   1        Significance and p-value                             49
+#TOC>   1        Significance and p-value                             49
-#TOC>   1.1        Significance levels                                60
+#TOC>   1.1        Significance levels                                60
-#TOC>   1.2        probability and p-value                            77
+#TOC>   1.2        probability and p-value                            77
-#TOC>   1.2.1          p-value illustrated                           109
+#TOC>   1.2.1          p-value illustrated                           109
-#TOC>   2        One- or two-sided                                   165
+#TOC>   2        One- or two-sided                                   165
-#TOC>   3        Significance by integration                         209
+#TOC>   3        Significance by integration                         209
-#TOC>   4        Significance by simulation or permutation           215
+#TOC>   4        Significance by simulation or permutation           215
-#TOC>   5        Final tasks                                         327
+#TOC>   5        Final tasks                                         327
-#TOC>   6        Sample solutions                                    336
+#TOC>   6        Sample solutions                                    336
-#TOC>   6.1                                                          338
+#TOC>   6.1                                                          338
-#TOC>   6.2                                                          342
+#TOC>   6.2                                                          342
-#TOC>   6.3                                                          346
+#TOC>   6.3                                                          346
-#TOC> 
+#TOC> 
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  Significance and p-value  ============================================
+# =    1  Significance and p-value  ============================================
-
+
-# The idea of the probability of an event has a precise mathematical
+# The idea of the probability of an event has a precise mathematical
-# interpretation, but how is it useful to know the probability? Usually we are
+# interpretation, but how is it useful to know the probability? Usually we are
-# interested in whether we should accept or reject a hypothesis based on the
+# interested in whether we should accept or reject a hypothesis based on the
-# observations we have. A rational way to do this is to say: if the probability
+# observations we have. A rational way to do this is to say: if the probability
-# of observing the data is very small under the null-hypothesis, then we will
+# of observing the data is very small under the null-hypothesis, then we will
-# assume the observation is due to something other than the null-hypothesis. But
+# assume the observation is due to something other than the null-hypothesis. But
-# what do we mean by the "probability of our observation"? And what is "very
+# what do we mean by the "probability of our observation"? And what is "very
-# small"?
+# small"?
-
+
-# ==   1.1  Significance levels  ===============================================
+# ==   1.1  Significance levels  ===============================================
-
+
-# A "very small" probability is purely a matter of convention - a cultural
+# A "very small" probability is purely a matter of convention - a cultural
-# convention. In the biomedical field we usually call probabilities of less then
-# convention. In the biomedical field we usually call probabilities of less than
+# convention. In the biomedical field we usually call probabilities of less than
+# 0.05 (5%) small enough to reject the null-hypothesis. Thus we call
-# observations with a probability of less than 0.05 "significant" and if we want
+# observations with a probability of less than 0.05 "significant" and if we want
-# to highlight this in text or in a graph, we often mark them with an asterisk
+# to highlight this in text or in a graph, we often mark them with an asterisk
-# (*). Also we often call observations with a probability of less than 0.01
+# (*). Also we often call observations with a probability of less than 0.01
-# "highly significant" and mark them with two asterisks (**). But there is no
+# "highly significant" and mark them with two asterisks (**). But there is no
-# special significance in these numbers, the cutoff point for significance could
+# special significance in these numbers, the cutoff point for significance could
-# also be 0.0498631, or 0.03, or 1/(pi^3). 0.05 is just the value that the
+# also be 0.0498631, or 0.03, or 1/(pi^3). 0.05 is just the value that the
-# British statistician Ronald Fisher happened to propose for this purpose in
+# British statistician Ronald Fisher happened to propose for this purpose in
-# 1925. Incidentally, Fisher later recommended to use different cutoffs for
+# 1925. Incidentally, Fisher later recommended to use different cutoffs for
-# different purposes (cf.
+# different purposes (cf.
-# https://en.wikipedia.org/wiki/Statistical_significance).
+# https://en.wikipedia.org/wiki/Statistical_significance).
-
+
-
+
-# ==   1.2  probability and p-value  ===========================================
+# ==   1.2  probability and p-value  ===========================================
-
+
-# But what do we even mean by the probability of an observation?
+# But what do we even mean by the probability of an observation?
-# Assume I am drawing samples from a normal distribution with a mean of 0 and a
+# Assume I am drawing samples from a normal distribution with a mean of 0 and a
-# standard deviation of 1. The sample I get is ...
+# standard deviation of 1. The sample I get is ...
-
+
-set.seed(sqrt(5))
+set.seed(sqrt(5))
-x <- rnorm(1)
+x <- rnorm(1)
-set.seed(NULL)
+set.seed(NULL)
-
+
-print(x, digits = 22)
+print(x, digits = 22)
-# [1] -0.8969145466249813791748
+# [1] -0.8969145466249813791748
-
+
-# So what's the probability of that number? Obviously, the probability of
+# So what's the probability of that number? Obviously, the probability of
-# getting exactly this number is very, very, very small. But also obviously,
+# getting exactly this number is very, very, very small. But also obviously,
-# this does not mean that observing this number is in any way significant - we
+# this does not mean that observing this number is in any way significant - we
-# always observe some number. That's not what we mean in this case. There are
+# always observe some number. That's not what we mean in this case. There are
-# several implicit assumptions when we speak of the probability of an
+# several implicit assumptions when we speak of the probability of an
-# observation:
+# observation:
-
+
-# 1: the observation can be compared to a probability distribution;
+# 1: the observation can be compared to a probability distribution;
-# 2: that distribution can be integrated between any specific value
+# 2: that distribution can be integrated between any specific value
-#      and its upper and lower bounds (or +- infinity).
+#      and its upper and lower bounds (or +- infinity).
-
+
-# Then what we really mean by the probability of an observation in the context
+# Then what we really mean by the probability of an observation in the context
-# of that distribution is: the probability of observing that value, or a value
+# of that distribution is: the probability of observing that value, or a value
-# more extreme than the one we have. We call this the p-value. Note that we are
+# more extreme than the one we have. We call this the p-value. Note that we are
-# not talking about an individual number anymore, we are talking about the area
+# not talking about an individual number anymore, we are talking about the area
-# under the curve between our observation and the upper (or lower) bound of the
+# under the curve between our observation and the upper (or lower) bound of the
-# curve, as a fraction of the whole.
+# curve, as a fraction of the whole.
-
+
-
+
-# ===   1.2.1  p-value illustrated                      
+# ===   1.2.1  p-value illustrated                      
-
+
-# Let's illustrate. First we draw a million random values from our
+# Let's illustrate. First we draw a million random values from our
-# standard, normal distribution:
+# standard, normal distribution:
-
+
-N <- 1e6                             # one million
+N <- 1e6                             # one million
-set.seed(112358)                     # set RNG seed for repeatable randomness
+set.seed(112358)                     # set RNG seed for repeatable randomness
-r <- rnorm(N)                        # N values from a normal distribution
+r <- rnorm(N)                        # N values from a normal distribution
-set.seed(NULL)                       # reset the RNG
+set.seed(NULL)                       # reset the RNG
-
+
-# Let's see what the distribution looks like:
+# Let's see what the distribution looks like:
-
+
-(h <- hist(r))
+(h <- hist(r))
-
+
-# The histogram details are now available in the list h -  e.g. h$counts
+# The histogram details are now available in the list h -  e.g. h$counts
-
+
-# Where is the value we have drawn previously?
+# Where is the value we have drawn previously?
-abline(v = x, col = "#EE0000")
+abline(v = x, col = "#EE0000")
-
+
-# How many values are smaller?
+# How many values are smaller?
-sum(r < x)
+sum(r < x)
-
+
-# Let's color the bars:
+# Let's color the bars:
-#    first, make a vector of red and green colors for the bars with breaks
+#    first, make a vector of red and green colors for the bars with breaks
-#    smaller and larger then x, white for the bar that contains x ...
-#    smaller and larger than x, white for the bar that contains x ...
+#    smaller and larger than x, white for the bar that contains x ...
+hCol <- rep("#EE000044", sum(h$breaks < x) - 1)
-hCol <- c(hCol, "#FFFFFFFF")
+hCol <- c(hCol, "#FFFFFFFF")
-hCol <- c(hCol, rep("#00EE0044", sum(h$breaks > x) - 1))
+hCol <- c(hCol, rep("#00EE0044", sum(h$breaks > x) - 1))
-# ... then plot the histogram, with colored bars ...
+# ... then plot the histogram, with colored bars ...
-hist(r, col = hCol)
+hist(r, col = hCol)
-# ... add two colored rectangles into the white bar ...
+# ... add two colored rectangles into the white bar ...
-idx <- sum(h$breaks < x)
+idx <- sum(h$breaks < x)
-xMin <- h$breaks[idx]
+xMin <- h$breaks[idx]
-xMax <- h$breaks[idx + 1]
+xMax <- h$breaks[idx + 1]
-y <- h$counts[idx]
+y <- h$counts[idx]
-rect(xMin, 0, x, y, col = "#EE000044", border = TRUE)
+rect(xMin, 0, x, y, col = "#EE000044", border = TRUE)
-rect(x, 0, xMax, y, col = "#00EE0044", border = TRUE)
+rect(x, 0, xMax, y, col = "#00EE0044", border = TRUE)
-# ... and a red line for our observation.
+# ... and a red line for our observation.
-abline(v = x, col = "#EE0000", lwd = 2)
+abline(v = x, col = "#EE0000", lwd = 2)
-
+
-# The p-value of our observation is the red area as a fraction of the
+# The p-value of our observation is the red area as a fraction of the
-# whole histogram (red + green).
+# whole histogram (red + green).
-
+
-
+
-# Task:
+# Task:
-#    Explain how the expression sum(r < x) works to give us a count of values
+#    Explain how the expression sum(r < x) works to give us a count of values
-#    with the property we are looking for. E.g., examine -4:4 < x
+#    with the property we are looking for. E.g., examine -4:4 < x
-
+
-# Task:
+# Task:
-#    Write an expression to estimate the probability that a value
+#    Write an expression to estimate the probability that a value
-#    drawn from the vector r is less-or-equal to x. The result you get
+#    drawn from the vector r is less-or-equal to x. The result you get
-#    will depend on the exact values that went into the vector r but it should
+#    will depend on the exact values that went into the vector r but it should
-#    be close to 0.185  That expression is the p-value associated with x.
+#    be close to 0.185  That expression is the p-value associated with x.
-#    (Sample solution 6.1)
+#    (Sample solution 6.1)
-
+
-
+
-# =    2  One- or two-sided  ===================================================
+# =    2  One- or two-sided  ===================================================
-
+
-# The shape of our histogram confirms that the rnorm() function has returned
+# The shape of our histogram confirms that the rnorm() function has returned
-# values that appear distributed according to a normal distribution. In a normal
+# values that appear distributed according to a normal distribution. In a normal
-# distribution, readily available tables tell us that 5% of the values (i.e. our
+# distribution, readily available tables tell us that 5% of the values (i.e. our
-# significance level) lie 1.96 (or approximately 2) standard deviations away
+# significance level) lie 1.96 (or approximately 2) standard deviations away
-# from the mean. Is this the case here? How many values in our vector r are
+# from the mean. Is this the case here? How many values in our vector r are
-# larger than 1.96?
+# larger than 1.96?
-
+
-sum(r > 1.96)
+sum(r > 1.96)
-# [1] 24589
+# [1] 24589
-
+
-# Wait - that's about 2.5% of 1,000,000, not 5% as expected. Why?
+# Wait - that's about 2.5% of 1,000,000, not 5% as expected. Why?
-
+
-# The answer is: we have to be careful with two-sided distributions. 2 standard
+# The answer is: we have to be careful with two-sided distributions. 2 standard
-# deviations away from the mean means either larger or smaller than 1.96 . This
+# deviations away from the mean means either larger or smaller than 1.96 . This
-# can give rise to errors. If we are simply are interested in outliers, no
-# can give rise to errors. If we are simply interested in outliers, no
+# can give rise to errors. If we are simply interested in outliers, no
+# matter larger or smaller, then the 1.96 SD cutoff for significance is correct.
-# But if we are specifically interested in, say, larger values, because a
+# But if we are specifically interested in, say, larger values, because a
-# smaller value is not meaningful, then the significance cutoff, expressed as
+# smaller value is not meaningful, then the significance cutoff, expressed as
-# standard deviations, is relaxed. We can use the quantile function to see what
+# standard deviations, is relaxed. We can use the quantile function to see what
-# the cutoff values are:
+# the cutoff values are:
-
+
-quantile(r)
+quantile(r)
-quantile(r, probs = c(0.025, 0.975)) # for the symmetric 2.5% boundaries
+quantile(r, probs = c(0.025, 0.975)) # for the symmetric 2.5% boundaries
-# close to ± 1.96, as expected
+# close to ± 1.96, as expected
-quantile(r, probs = 0.95) # for the single 5% boundary
+quantile(r, probs = 0.95) # for the single 5% boundary
-# close to 1.64 . Check counts to confirm:
+# close to 1.64 . Check counts to confirm:
-sum(r > quantile(r, probs = 0.95))
+sum(r > quantile(r, probs = 0.95))
-# [1] 50000
+# [1] 50000
-# which is 5%, as expected.
+# which is 5%, as expected.
-
+
-# Task:
+# Task:
-# Use abline() to add the p = 0.05 boundary for smaller values to the histogram.
+# Use abline() to add the p = 0.05 boundary for smaller values to the histogram.
-# (Sample solution 6.2)
+# (Sample solution 6.2)
-
+
-# To summarize: when we evaluate the significance of an event, we divide a
+# To summarize: when we evaluate the significance of an event, we divide a
-# probability distribution into two parts at the point where the event was
+# probability distribution into two parts at the point where the event was
-# observed. We then ask whether the integral over the more extreme part is less
+# observed. We then ask whether the integral over the more extreme part is less
-# or more than 5% of the whole. If it is less, we deem the event to be
+# or more than 5% of the whole. If it is less, we deem the event to be
-# significant.
+# significant.
-#
+#
-
+
-
+
-# =    3  Significance by integration  =========================================
+# =    3  Significance by integration  =========================================
-
+
-# If the underlying probability distribution can be analytically or numerically
+# If the underlying probability distribution can be analytically or numerically
-# integrated, the siginificance of an observation can be directly computed.
-# integrated, the significance of an observation can be directly computed.
+# integrated, the significance of an observation can be directly computed.
+
-
+
-# =    4  Significance by simulation or permutation  ===========================
+# =    4  Significance by simulation or permutation  ===========================
-
+
-# But whether the integration is correct, or relies on assumptions that may not
+# But whether the integration is correct, or relies on assumptions that may not
-# be warranted for biological data, can be a highly technical question.
+# be warranted for biological data, can be a highly technical question.
-# Fortunately, we can often simply run a simulation, a random resampling, or a
+# Fortunately, we can often simply run a simulation, a random resampling, or a
-# permutation and then count the number of outcomes, just as we did with our
+# permutation and then count the number of outcomes, just as we did with our
-# rnorm() samples. We call this an empirical p-value. (Actually, the "empirical
+# rnorm() samples. We call this an empirical p-value. (Actually, the "empirical
-# p-value" is defined as (Nobs + 1) / (N + 1).  )
+# p-value" is defined as (Nobs + 1) / (N + 1).  )
-
+
-# Here is an example. Assume you have a protein sequence and
+# Here is an example. Assume you have a protein sequence and
-# you speculate that positively charged residues are close to negatively charged
+# you speculate that positively charged residues are close to negatively charged
-# residues to balance charge locally. A statistic that would capture this is the
+# residues to balance charge locally. A statistic that would capture this is the
-# mean minimum distance between all D,E residues and the closest R,K,H
+# mean minimum distance between all D,E residues and the closest R,K,H
-# residue. Let's compute this for the sequence of yeast Mbp1.
+# residue. Let's compute this for the sequence of yeast Mbp1.
-
+
-MBP1 <- paste0("MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK",
+MBP1 <- paste0("MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK",
-               "ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA",
+               "ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA",
-               "SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR",
+               "SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR",
-               "KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ",
+               "KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ",
-               "QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS",
+               "QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS",
-               "PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY",
+               "PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY",
-               "FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS",
+               "FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS",
-               "IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP",
+               "IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP",
-               "SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT",
+               "SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT",
-               "ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP",
+               "ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP",
-               "VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK",
+               "VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK",
-               "IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR",
+               "IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR",
-               "QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK",
+               "QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK",
-               "IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA")
+               "IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA")
-
+
-# first we split this string into individual characters:
+# first we split this string into individual characters:
-v <- unlist(strsplit(MBP1, ""))
+v <- unlist(strsplit(MBP1, ""))
-
+
-# and find the positions of our charged residues
+# and find the positions of our charged residues
-
+
-ED  <- grep("[ED]", v)
+ED  <- grep("[ED]", v)
-RKH <- grep("[RKH]", v)
+RKH <- grep("[RKH]", v)
-
+
-sep <- numeric(length(ED)) # this vector will hold the distances
+sep <- numeric(length(ED)) # this vector will hold the distances
-for (i in seq_along(ED)) {
+for (i in seq_along(ED)) {
-  sep[i] <- min(abs(RKH - ED[i]))
+  sep[i] <- min(abs(RKH - ED[i]))
-}
+}
-
+
-# Task: read and explain this bit of code
+# Task: read and explain this bit of code
-
+
-# Now that sep is computed, what does it look like?
+# Now that sep is computed, what does it look like?
-
+
-table(sep)  # these are the minimum distances
+table(sep)  # these are the minimum distances
-# 24 of D,E residues are adjacent to R,K,H;
+# 24 of D,E residues are adjacent to R,K,H;
-# the longest separation is 28 residues.
+# the longest separation is 28 residues.
-
+
-# What is the mean separation?
+# What is the mean separation?
-mean(sep)
+mean(sep)
-
+
-# The value is 4.1 . Is this significant? Honestly, I would be hard pressed
+# The value is 4.1 . Is this significant? Honestly, I would be hard pressed
-# to solve this analytically. But by permutation it's soooo easy.
+# to solve this analytically. But by permutation it's soooo easy.
-
+
-# First, we combine what we have done above into a function:
+# First, we combine what we have done above into a function:
-
+
-chSep <- function(v) {
+chSep <- function(v) {
-  # computes the mean minimum separation of oppositely charged residues
+  # computes the mean minimum separation of oppositely charged residues
-  # Parameter: v (char) a vector of amino acids in the one-letter code
+  # Parameter: v (char) a vector of amino acids in the one-letter code
-  # Value: msep (numeric) mean minimum separation
+  # Value: msep (numeric) mean minimum separation
-
+
-  ED  <- grep("[EDed]", v)
+  ED  <- grep("[EDed]", v)
-  RKH <- grep("[RKHrkh]", v)
+  RKH <- grep("[RKHrkh]", v)
-
+
-  sep <- numeric(length(ED))
+  sep <- numeric(length(ED))
-  for (i in seq_along(ED)) {
+  for (i in seq_along(ED)) {
-    sep[i] <- min(abs(RKH - ED[i]))
+    sep[i] <- min(abs(RKH - ED[i]))
-  }
+  }
-  return(mean(sep))
+  return(mean(sep))
-}
+}
-
+
-# Execute the function to define it.
+# Execute the function to define it.
-
+
-# Confirm that the function gives the same result as the number we
+# Confirm that the function gives the same result as the number we
-# calculated above:
+# calculated above:
-chSep(v)
+chSep(v)
-
+
-# Now we can produce a random permutation of v, and recalculate
+# Now we can produce a random permutation of v, and recalculate
-
+
-set.seed(pi)                       # set RNG seed for repeatable randomness
+set.seed(pi)                       # set RNG seed for repeatable randomness
-w <- sample(v, length(v))          # This shuffles the vector v. Memorize this
+w <- sample(v, length(v))          # This shuffles the vector v. Memorize this
-                                   # code paradigm. It is very useful.
+                                   # code paradigm. It is very useful.
-set.seed(NULL)                     # reset the RNG
+set.seed(NULL)                     # reset the RNG
-
+
-
+
-
+
-chSep(w)
+chSep(w)
-# 3.773 ... that's actually less than what we had before.
+# 3.773 ... that's actually less than what we had before.
-
+
-# Let's do this 10000 times and record the results (takes a few seconds):
+# Let's do this 10000 times and record the results (takes a few seconds):
-
+
-N <- 10000
+N <- 10000
-chs <- numeric(N)
+chs <- numeric(N)
-for (i in 1:N) {
+for (i in 1:N) {
-  chs[i] <- chSep(sample(v, length(v))) # charge
+  chs[i] <- chSep(sample(v, length(v))) # charge
-}
+}
-
+
-hist(chs, breaks = 50)
+hist(chs, breaks = 50)
-abline(v = chSep(v), col = "#EE0000")
+abline(v = chSep(v), col = "#EE0000")
-
+
-# Contrary to our expectations, the actual observed mean minimum charge
+# Contrary to our expectations, the actual observed mean minimum charge
-# separation seems to be larger than what we observe in randomly permuted
+# separation seems to be larger than what we observe in randomly permuted
-# sequences. But is this significant? Your task to find out.
+# sequences. But is this significant? Your task to find out.
-
+
-# Task:
+# Task:
-# Calculate the empirical p-value for chsep(v)
-# Calculate the empirical p-value for chSep(v)
+# Calculate the empirical p-value for chSep(v)
+# (Sample solution 6.3)
-
+
-
+
-# =    5  Final tasks  =========================================================
+# =    5  Final tasks  =========================================================
-
+
-# From chs, compute the empirical p-value of a mean minimum charge separation to
+# From chs, compute the empirical p-value of a mean minimum charge separation to
-#   be larger or equal to the value observed for the yeast MBP1 sequence. Note
+#   be larger or equal to the value observed for the yeast MBP1 sequence. Note
-#   the result in your journal. Is it significant? Also note the result of
+#   the result in your journal. Is it significant? Also note the result of
-#   the following expression for validation:
+#   the following expression for validation:
-seal(sum(chs))
+seal(sum(chs))
-
+
-
+
-# =    6  Sample solutions  ====================================================
+# =    6  Sample solutions  ====================================================
-
+
-# ==   6.1    ==================================================================
+# ==   6.1    ==================================================================
-#
+#
-sum(r <= x) / length(r)
+sum(r <= x) / length(r)
-
+
-# ==   6.2    ==================================================================
+# ==   6.2    ==================================================================
-#
+#
-abline(v = quantile(r, probs = c(0.05)))
+abline(v = quantile(r, probs = c(0.05)))
-
+
-# ==   6.3    ==================================================================
+# ==   6.3    ==================================================================
-#
+#
-( x <- (sum(chs >= chSep(v)) + 1) / (length(chs) + 1) )
+( x <- (sum(chs >= chSep(v)) + 1) / (length(chs) + 1) )
-
+
-
+
-# [END]
+# [END]
--- a/README.md
+++ b/README.md
@ -1,3 +1,3 @@
-# BCH441-WORK-ABC-units
+# BCH441-WORK-ABC-units
-
+
 This is a fork of the project [ABC-units](https://github.com/hyginn/ABC-units) designed for BCH441. This setup allows changes to be committed here but updates pushed to the original repository can be fetched and pulled to keep up to date.
--- a/RPR-Biostrings.R
+++ b/RPR-Biostrings.R
@ -1,245 +1,245 @@
-# tocID <- "RPR-Biostrings.R"
+# tocID <- "RPR-Biostrings.R"
-#
+#
-# Purpose:  A Bioinformatics Course:
+# Purpose:  A Bioinformatics Course:
-#              R code accompanying the RPR-Biostrings unit.
+#              R code accompanying the RPR-Biostrings unit.
-#
+#
-# Version:  1.2
+# Version:  1.2
-#
+#
-# Date:     2017-10  -  2020-09
+# Date:     2017-10  -  2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions:
+# Versions:
-#           1.2    2020 Updates
+#           1.2    2020 Updates
-#           1.1    Change from require() to requireNamespace(),
+#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout,
+#                      use <package>::<function>() idiom throughout,
-#                      use Biocmanager:: not biocLite()
+#                      use BiocManager:: not biocLite()
-#           1.0    2017 Revisions
+#           1.0    2017 Revisions
-#           0.1    First code copied from 2016 material.
+#           0.1    First code copied from 2016 material.
-#
+#
-#
+#
-# TODO:
+# TODO:
-#
+#
-#
+#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
+#
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
+# going on. That's not how it works ...
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC> 
+#TOC> 
-#TOC>   Section  Title                                             Line
+#TOC>   Section  Title                                             Line
-#TOC> -----------------------------------------------------------------
+#TOC> -----------------------------------------------------------------
-#TOC>   1        The Biostrings:: Package                            56
+#TOC>   1        The Biostrings:: Package                            56
-#TOC>   2        Getting Data into Biostrings:: Objects              88
+#TOC>   2        Getting Data into Biostrings:: Objects              88
-#TOC>   3        Working with Biostrings:: Objects                  110
+#TOC>   3        Working with Biostrings:: Objects                  110
-#TOC>   3.1        Properties                                       127
+#TOC>   3.1        Properties                                       127
-#TOC>   3.2        Subsetting                                       168
+#TOC>   3.2        Subsetting                                       168
-#TOC>   3.3        Operators                                        180
+#TOC>   3.3        Operators                                        180
-#TOC>   3.4        Transformations                                  187
+#TOC>   3.4        Transformations                                  187
-#TOC>   4        Getting Data out of Biostrings:: Objects           194
+#TOC>   4        Getting Data out of Biostrings:: Objects           194
-#TOC>   5        More                                               203
+#TOC>   5        More                                               203
-#TOC>   5.1        Views                                            205
+#TOC>   5.1        Views                                            205
-#TOC>   5.2        Iranges                                          219
+#TOC>   5.2        Iranges                                          219
-#TOC>   5.3        StringSets                                       225
+#TOC>   5.3        StringSets                                       225
-#TOC> 
+#TOC> 
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# This is a very brief introduction to the Biostrings:: package, other units will
+# This is a very brief introduction to the Biostrings:: package, other units will
-# be using more of the Biostrings:: functions.
+# be using more of the Biostrings:: functions.
-
+
-
+
-# =    1  The Biostrings:: Package  ============================================
+# =    1  The Biostrings:: Package  ============================================
-
+
-
+
-# First, we install and load the Biostrings:: package from bioconductor (if we
+# First, we install and load the Biostrings:: package from bioconductor (if we
-# haven't done so already).
+# haven't done so already).
-
+
-if (! requireNamespace("BiocManager", quietly = TRUE)) {
+if (! requireNamespace("BiocManager", quietly = TRUE)) {
-  install.packages("BiocManager")
+  install.packages("BiocManager")
-}
+}
-if (! requireNamespace("Biostrings", quietly = TRUE)) {
+if (! requireNamespace("Biostrings", quietly = TRUE)) {
-  BiocManager::install("Biostrings")
+  BiocManager::install("Biostrings")
-}
+}
-# Examine the package information:
+# Examine the package information:
-library(help = Biostrings)       # basic information
+library(help = Biostrings)       # basic information
-browseVignettes("Biostrings")    # available vignettes
+browseVignettes("Biostrings")    # available vignettes
-data(package = "Biostrings")     # available datasets
+data(package = "Biostrings")     # available datasets
-
+
-
+
-# At its core, Biostrings:: objects are "classes" of type XString (you can think
+# At its core, Biostrings:: objects are "classes" of type XString (you can think
-# of a "class" in R as a special kind of list), that can take on particular
+# of a "class" in R as a special kind of list), that can take on particular
-# flavours for RNA, DNA or amino acid sequence information.
+# flavours for RNA, DNA or amino acid sequence information.
-
+
-class(Biostrings::RNAString("AUG"))
+class(Biostrings::RNAString("AUG"))
-class(Biostrings::DNAString("ATG"))
+class(Biostrings::DNAString("ATG"))
-class(Biostrings::AAString("M"))
+class(Biostrings::AAString("M"))
-
+
-# An essential property of Biostrings:: objects is that they only allow letters
+# An essential property of Biostrings:: objects is that they only allow letters
-# from the applicable IUPAC alphabet:
+# from the applicable IUPAC alphabet:
-Biostrings::RNAString("AUG")
+Biostrings::RNAString("AUG")
-Biostrings::DNAString("AUG")  # Error! No "U" in IUPAC DNA codes
+Biostrings::DNAString("AUG")  # Error! No "U" in IUPAC DNA codes
-
+
-
+
-# =    2  Getting Data into Biostrings:: Objects  ==============================
+# =    2  Getting Data into Biostrings:: Objects  ==============================
-
+
-
+
-# Example: read FASTA. Extract sequence. Convert to DNAString object.
+# Example: read FASTA. Extract sequence. Convert to DNAString object.
-rawSeq <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
+rawSeq <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
-rawSeq <- dbSanitizeSequence(rawSeq)
+rawSeq <- dbSanitizeSequence(rawSeq)
-biosDNAseq <- Biostrings::DNAString(rawSeq) # converts the nucleotide sequence
+biosDNAseq <- Biostrings::DNAString(rawSeq) # converts the nucleotide sequence
-                                            # into an object of class DNAstring
+                                            # into an object of class DNAString
-
+
-# Multi FASTA files can be read directly as a "XStringSet) ...
+# Multi FASTA files can be read directly as an "XStringSet" ...
-rawMFAfile <- "./data/S288C_YDL056W_MBP1_coding.fsa"
+rawMFAfile <- "./data/S288C_YDL056W_MBP1_coding.fsa"
-(biosDNASet <- Biostrings::readDNAStringSet(rawMFAfile))
+(biosDNASet <- Biostrings::readDNAStringSet(rawMFAfile))
-
+
-# ... and if you subset one sequence from the set, you get an XString object
+# ... and if you subset one sequence from the set, you get an XString object
-# back again.
+# back again.
-(Xseq <- biosDNASet[[1]])
+(Xseq <- biosDNASet[[1]])
-
+
-biosDNAseq == Xseq           # the comparison evaluates to TRUE ...
+biosDNAseq == Xseq           # the comparison evaluates to TRUE ...
-identical(biosDNAseq, Xseq)  # ... and indeed the objects are deemed identical.
+identical(biosDNAseq, Xseq)  # ... and indeed the objects are deemed identical.
-
+
-
+
-
+
-# =    3  Working with Biostrings:: Objects  ===================================
+# =    3  Working with Biostrings:: Objects  ===================================
-
+
-# Biostrings:: is a highly engineered package that is tightly integrated into
+# Biostrings:: is a highly engineered package that is tightly integrated into
-# the Bioconductor world - unfortunately that brings with it a somewhat
+# the Bioconductor world - unfortunately that brings with it a somewhat
-# undesirable level of computational overhead and dependencies. Using the
+# undesirable level of computational overhead and dependencies. Using the
-# package as we normally do - i.e. calling required functions with their
+# package as we normally do - i.e. calling required functions with their
-# explicit package prefix is therefore not advisable. There are generics
+# explicit package prefix is therefore not advisable. There are generics
-# that won't be propery dispatched. If you only need a small number of
+# that won't be properly dispatched. If you only need a small number of
-# functions for a very specific context, you will probably get away with
+# functions for a very specific context, you will probably get away with
-# Biostrings::<function>() - but even in the demonstration code of this script
+# Biostrings::<function>() - but even in the demonstration code of this script
-# not everything works out of the box. We'll therefore load the library,
+# not everything works out of the box. We'll therefore load the library,
-# but we'll (redundantly) use the prefix anyway so as to emphasize where
+# but we'll (redundantly) use the prefix anyway so as to emphasize where
-# the functions come from.
+# the functions come from.
-
+
-library(Biostrings)
+library(Biostrings)
-
+
-
+
-# ==   3.1  Properties  ========================================================
+# ==   3.1  Properties  ========================================================
-str(rawSeq)
+str(rawSeq)
-str(biosDNAseq)
+str(biosDNAseq)
-
+
-length(rawSeq)       # ... is 1: one string only. To get the number of
+length(rawSeq)       # ... is 1: one string only. To get the number of
-                     # characters in a string, you need nchar().
+                     # characters in a string, you need nchar().
-length(biosDNAseq)   # but the length of a "Bstring" is the number of elements
+length(biosDNAseq)   # but the length of a "Bstring" is the number of elements
-nchar(rawSeq)
+nchar(rawSeq)
-nchar(biosDNAseq)    # ... but nchar() works too.
+nchar(biosDNAseq)    # ... but nchar() works too.
-
+
-(uL <- Biostrings::uniqueLetters(biosDNAseq))
+(uL <- Biostrings::uniqueLetters(biosDNAseq))
-
+
-# Count frequencies - with strings, you would strsplit() into a character
+# Count frequencies - with strings, you would strsplit() into a character
-# vector and then use table(). biost
+# vector and then use table(). Biostrings:: provides alphabetFrequency().
-Biostrings::alphabetFrequency(biosDNAseq)
+Biostrings::alphabetFrequency(biosDNAseq)
-
+
-# letterFrequency() works with a defined alphabet - such as what uniqueLetters()
+# letterFrequency() works with a defined alphabet - such as what uniqueLetters()
-# returns.
+# returns.
-Biostrings::letterFrequency(biosDNAseq, uL)
+Biostrings::letterFrequency(biosDNAseq, uL)
-sum(Biostrings::letterFrequency(biosDNAseq, c("G", "C"))) /
+sum(Biostrings::letterFrequency(biosDNAseq, c("G", "C"))) /
-  length(biosDNAseq) # GC contents
+  length(biosDNAseq) # GC contents
-
+
-Biostrings::dinucleotideFrequency(biosDNAseq)
+Biostrings::dinucleotideFrequency(biosDNAseq)
-barplot(sort(Biostrings::dinucleotideFrequency(biosDNAseq)), cex.names = 0.5)
+barplot(sort(Biostrings::dinucleotideFrequency(biosDNAseq)), cex.names = 0.5)
-
+
-(triNuc <- Biostrings::trinucleotideFrequency(biosDNAseq))
+(triNuc <- Biostrings::trinucleotideFrequency(biosDNAseq))
-barplot(sort(triNuc), col="#4499EE33")
+barplot(sort(triNuc), col="#4499EE33")
-triNuc[triNuc == max(triNuc)]
+triNuc[triNuc == max(triNuc)]
-triNuc[triNuc == min(triNuc)]
+triNuc[triNuc == min(triNuc)]
-max(triNuc) / min(triNuc)  # AAA is more than 13 times as frequent as CGT
+max(triNuc) / min(triNuc)  # AAA is more than 13 times as frequent as CGT
-
+
-# compare to a shuffled sequence:
+# compare to a shuffled sequence:
-(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))
+(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))
-barplot(sort(triNuc), col="#EEEE4433", add = TRUE)
+barplot(sort(triNuc), col="#EEEE4433", add = TRUE)
-max(triNuc)
+max(triNuc)
-# Interpret this plot.
+# Interpret this plot.
-(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))
+(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))
-barplot(sort(triNuc), col="#EEEE4433")
+barplot(sort(triNuc), col="#EEEE4433")
-max(triNuc)
+max(triNuc)
-
+
-
+
-# ==   3.2  Subsetting  ========================================================
+# ==   3.2  Subsetting  ========================================================
-
+
-# Subsetting any XString object works as expected:
+# Subsetting any XString object works as expected:
-biosDNAseq[4:15]
+biosDNAseq[4:15]
-
+
-# ... well - maybe not expected, because rawSeq[4:15] would not work.
+# ... well - maybe not expected, because rawSeq[4:15] would not work.
-
+
-# Alternatively to the "[" operator, use the subseq() function - especially for
+# Alternatively to the "[" operator, use the subseq() function - especially for
-# long sequences. This is far more efficient.
+# long sequences. This is far more efficient.
-Biostrings::subseq(biosDNAseq, start = 1, end = 30)
+Biostrings::subseq(biosDNAseq, start = 1, end = 30)
-
+
-
+
-# ==   3.3  Operators  =========================================================
+# ==   3.3  Operators  =========================================================
-
+
-# RNAstring() and DNAstring() objects compare U and T as equals!
+# RNAString and DNAString objects compare U and T as equals!
-  Biostrings::RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") ==
+  Biostrings::RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") ==
-  Biostrings::DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT")
+  Biostrings::DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT")
-
+
-
+
-# ==   3.4  Transformations  ===================================================
+# ==   3.4  Transformations  ===================================================
-
+
-biosDNAseq[4:15]
+biosDNAseq[4:15]
-Biostrings::reverseComplement(biosDNAseq[4:15])
+Biostrings::reverseComplement(biosDNAseq[4:15])
-Biostrings::translate(biosDNAseq[4:15])
+Biostrings::translate(biosDNAseq[4:15])
-
+
-
+
-# =    4  Getting Data out of Biostrings:: Objects  ============================
+# =    4  Getting Data out of Biostrings:: Objects  ============================
-
+
-# If you need a character object, use toString():
+# If you need a character object, use toString():
-
+
-Biostrings::toString(biosDNAseq[4:15])
+Biostrings::toString(biosDNAseq[4:15])
-
+
-# saveRDS() and readRDS() works like on all other R objects.
+# saveRDS() and readRDS() work like on all other R objects.
-
+
-
+
-# =    5  More  ================================================================
+# =    5  More  ================================================================
-
+
-# ==   5.1  Views  =============================================================
+# ==   5.1  Views  =============================================================
-
+
-# Biostring "Views" are objects that store multiple substrings of one
+# Biostring "Views" are objects that store multiple substrings of one
-# Biostring object.
+# Biostring object.
-
+
-(myView <- Biostrings::Views(biosDNAseq,
+(myView <- Biostrings::Views(biosDNAseq,
-                             start = c(1, 19, 37),
+                             start = c(1, 19, 37),
-                             end = c(15, 30, 45)))
+                             end = c(15, 30, 45)))
-
+
-# Views are convenient to store feature annotations
+# Views are convenient to store feature annotations
-names(myView) <- c("Feature-A", "Feature-B", "Feature-C")
+names(myView) <- c("Feature-A", "Feature-B", "Feature-C")
-cat(sprintf("\n%s\t(%d)\t%s", names(myView), width(myView), myView ))
+cat(sprintf("\n%s\t(%d)\t%s", names(myView), width(myView), myView ))
-
+
-
+
-# ==   5.2  Iranges  ===========================================================
+# ==   5.2  Iranges  ===========================================================
-
+
-# Biostrings:: Iranges are like Views with a common start point. These can be
+# Biostrings:: Iranges are like Views with a common start point. These can be
-# useful for feature annotations. Instead of start/end you store start/width.
+# useful for feature annotations. Instead of start/end you store start/width.
-
+
-
+
-# ==   5.3  StringSets  ========================================================
+# ==   5.3  StringSets  ========================================================
-
+
-# Biostring "StringSets" store multiple sequences.
+# Biostring "StringSets" store multiple sequences.
-#
+#
-ompA <- Biostrings::AAString("MKKTAIAIAVALAGFATVAQA")
+ompA <- Biostrings::AAString("MKKTAIAIAVALAGFATVAQA")
-sample(ompA) # sample can work directly on a Biostring object to shuffle it
+sample(ompA) # sample can work directly on a Biostring object to shuffle it
-
+
-x <- Biostrings::toString(ompA)
+x <- Biostrings::toString(ompA)
-for (i in 2:10) {
+for (i in 2:10) {
-  x[i] <- Biostrings::toString(sample(ompA))
+  x[i] <- Biostrings::toString(sample(ompA))
-}
+}
-shuffledPeptideSet <- Biostrings::AAStringSet(x)
+shuffledPeptideSet <- Biostrings::AAStringSet(x)
-names(shuffledPeptideSet) <- c("ompA", paste("shuffle.", 1:9, sep=""))
+names(shuffledPeptideSet) <- c("ompA", paste("shuffle.", 1:9, sep=""))
-shuffledPeptideSet
+shuffledPeptideSet
-
+
-length(shuffledPeptideSet)
+length(shuffledPeptideSet)
-Biostrings::width(shuffledPeptideSet)
+Biostrings::width(shuffledPeptideSet)
-Biostrings::alphabetFrequency(shuffledPeptideSet)
+Biostrings::alphabetFrequency(shuffledPeptideSet)
-
+
-
+
-# [END]
+# [END]
--- a/RPR-ChimeraX_remote.R
+++ b/RPR-ChimeraX_remote.R
@ -1,165 +1,165 @@
-# tocID <- "RPR-ChimeraX_remote.R"
+# tocID <- "RPR-ChimeraX_remote.R"
-#
+#
-# Purpose:  A Bioinformatics Course:
+# Purpose:  A Bioinformatics Course:
-#              R code demonstrating remote scripting of ChimeraX.
+#              R code demonstrating remote scripting of ChimeraX.
-#
+#
-# Version:  1.0.1
+# Version:  1.0.1
-#
+#
-# Date:     2020-09
+# Date:     2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions:
+# Versions:
-#           1.0.1  2021 Minimal updates
+#           1.0.1  2021 Minimal updates
-#           1.0    First ABC units version
+#           1.0    First ABC units version
-#
+#
-#
+#
-# TODO:
+# TODO:
-#    %-encode and escape quotes, or just pass-through?
+#    %-encode and escape quotes, or just pass-through?
-#
+#
-#
+#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
+#
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
+# going on. That's not how it works ...
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC> 
+#TOC> 
-#TOC>   Section  Title                                  Line
+#TOC>   Section  Title                                  Line
-#TOC> ------------------------------------------------------
+#TOC> ------------------------------------------------------
-#TOC>   1        ChimeraX REMOTE SCRIPTING                41
+#TOC>   1        ChimeraX REMOTE SCRIPTING                41
-#TOC>   1.1        Defining a Port                        59
+#TOC>   1.1        Defining a Port                        59
-#TOC>   1.2        Open ChimeraX                          81
+#TOC>   1.2        Open ChimeraX                          81
-#TOC>   2        WORKED EXAMPLE: SUPERPOSITION           113
+#TOC>   2        WORKED EXAMPLE: SUPERPOSITION           113
-#TOC> 
+#TOC> 
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  ChimeraX REMOTE SCRIPTING  ===========================================
+# =    1  ChimeraX REMOTE SCRIPTING  ===========================================
-
+
-
+
-# One of the cool features of ChimeraX is that it can be driven by Python code,
+# One of the cool features of ChimeraX is that it can be driven by Python code,
-# both within a running session and through Python scripts. What I find even
+# both within a running session and through Python scripts. What I find even
-# cooler though is that ChimeraX can be driven from any programming language via
+# cooler though is that ChimeraX can be driven from any programming language via
-# its remote control function that can listen to commands sent from any other
+# its remote control function that can listen to commands sent from any other
-# application. The interface that is used here is the standard REST (method) -
+# application. The interface that is used here is the standard REST (method) -
-# the GET and POST verbs that ubiquitously underly the communication of clients
+# the GET and POST verbs that ubiquitously underlie the communication of clients
-# and servers on the Web.
+# and servers on the Web.
-
+
-# In order to establish the communication between this script and ChimeraX, all
+# In order to establish the communication between this script and ChimeraX, all
-# we need to do is:
+# we need to do is:
-#  - open ChimeraX;
+#  - open ChimeraX;
-#  - tell it to listen on a specific "port";
+#  - tell it to listen on a specific "port";
-#  - send commands to that port via httr::
+#  - send commands to that port via httr::
-
+
-
+
-# ==   1.1  Defining a Port  ===================================================
+# ==   1.1  Defining a Port  ===================================================
-
+
-# The httr:: package needs to be available
+# The httr:: package needs to be available
-
+
-if (! requireNamespace("httr", quietly = TRUE)) {
+if (! requireNamespace("httr", quietly = TRUE)) {
-  install.packages("httr")
+  install.packages("httr")
-}
+}
-# Package information:
+# Package information:
-#  library(help = httr)       # basic information
+#  library(help = httr)       # basic information
-#  browseVignettes("httr")    # available vignettes
+#  browseVignettes("httr")    # available vignettes
-#  data(package = "httr")     # available datasets
+#  data(package = "httr")     # available datasets
-
+
-# We need to think od a port. Any available port number between 49152-65535 is
+# We need to think of a port. Any available port number between 49152-65535 is
-# fine. We'll choose 61803 because that's the fractional part of the golden
+# fine. We'll choose 61803 because that's the fractional part of the golden
-# ratio. But one could choose another.
+# ratio. But one could choose another.
-
+
-CXPORT <- 61803
+CXPORT <- 61803
-
+
-# Check that our current version of R supports sockets (default since V 3.3)
+# Check that our current version of R supports sockets (default since V 3.3)
-capabilities("sockets")   # MUST be TRUE. If not, don't continue.
+capabilities("sockets")   # MUST be TRUE. If not, don't continue.
-
+
-
+
-# ==   1.2  Open ChimeraX  =====================================================
+# ==   1.2  Open ChimeraX  =====================================================
-
+
-#  - Open a fresh, new session of recently updated version of ChimeraX
+#  - Open a fresh, new session of a recently updated version of ChimeraX
-#  - type:
+#  - type:
-#
+#
-#       remotecontrol rest start port 61803
+#       remotecontrol rest start port 61803
-#
+#
-#    ... or whatever the value of CXPORT is.
+#    ... or whatever the value of CXPORT is.
-
+
-# Now watch what happens in ChimeraX when you execute the following line:
+# Now watch what happens in ChimeraX when you execute the following line:
-( x <- httr::GET("http://127.0.0.1:61803/run?command=open+1BM8") )
+( x <- httr::GET("http://127.0.0.1:61803/run?command=open+1BM8") )
-
+
-# The .utilities.R script includes the function CX(), based on this principle,
+# The .utilities.R script includes the function CX(), based on this principle,
-# through which you can send commands to ChimeraX
+# through which you can send commands to ChimeraX
-
+
-CX("camera sbs")
+CX("camera sbs")
-CX("lighting soft")
+CX("lighting soft")
-CX("color sequential #1 & protein target abc palette powderblue:orchid:white")
+CX("color sequential #1 & protein target abc palette powderblue:orchid:white")
-
+
-# The command echos Chimera's response if the parameter "quietly" is
+# The command echoes Chimera's response if the parameter "quietly" is
-# FALSE (default), and we can silence output with quietly = TRUE :
+# FALSE (default), and we can silence output with quietly = TRUE :
-CX("info models #1 attribute num_residues")
+CX("info models #1 attribute num_residues")
-CX("info models #1 attribute num_residues", quietly = TRUE)
+CX("info models #1 attribute num_residues", quietly = TRUE)
-
+
-# Either way, the command also returns Chimera's responses "invisibly";
+# Either way, the command also returns Chimera's responses "invisibly";
-# i.e. we can use the results by assigning the output to a variable:
+# i.e. we can use the results by assigning the output to a variable:
-hBonds <- CX("hbonds #1 & protein makePseudobonds false log true", quietly=TRUE)
+hBonds <- CX("hbonds #1 & protein makePseudobonds false log true", quietly=TRUE)
-x <- read.table(file = textConnection(hBonds), skip = 9,
+x <- read.table(file = textConnection(hBonds), skip = 9,
-                blank.lines.skip = TRUE, fill = TRUE)
+                blank.lines.skip = TRUE, fill = TRUE)
-hist(x[,13], main="H-bonds", xlab="D···A (Å)", ylab="counts", col="#c9dcff")
+hist(x[,13], main="H-bonds", xlab="D···A (Å)", ylab="counts", col="#c9dcff")
-
+
-
+
-# =    2  WORKED EXAMPLE: SUPERPOSITION  =======================================
+# =    2  WORKED EXAMPLE: SUPERPOSITION  =======================================
-
+
-# We superimpose the 1BM8 structure with the 1DUX crystal structure to be able
+# We superimpose the 1BM8 structure with the 1DUX crystal structure to be able
-# to explore possible DNA binding regions in 1BM8
+# to explore possible DNA binding regions in 1BM8
-
+
-# The model for 1BM8 is already open as model 1  (#1)
+# The model for 1BM8 is already open as model 1  (#1)
-CX("hide #1 cartoons")        # hide model 1 cartoon representation
+CX("hide #1 cartoons")        # hide model 1 cartoon representation
-CX("open 1DUX")               # assume this is opened as model #2
+CX("open 1DUX")               # assume this is opened as model #2
-CX("hide #2")                 # hide everything ...
+CX("hide #2")                 # hide everything ...
-CX("select #2/C")             # chain c (protein)
+CX("select #2/C")             # chain c (protein)
-CX("show sel cartoons")       # ... and show cartoons of chain c (protein)
+CX("show sel cartoons")       # ... and show cartoons of chain c (protein)
-CX("color sequential sel target c palette steelblue:darkmagenta")
+CX("color sequential sel target c palette steelblue:darkmagenta")
-CX("view #2/C")               # re-center the display
+CX("view #2/C")               # re-center the display
-CX("cofr #2/C:62@CA")         # set pivot to an interface residue
+CX("cofr #2/C:62@CA")         # set pivot to an interface residue
-CX("select #2/A,B & nucleic-acid") # chains A, B are the cognate DNA
+CX("select #2/A,B & nucleic-acid") # chains A, B are the cognate DNA
-CX("style sel stick")
+CX("style sel stick")
-CX("show sel target ab")      # show atoms/bonds
+CX("show sel target ab")      # show atoms/bonds
-CX("color sequential #2/A & nucleic-acid target ab palette teal:lightcyan")
+CX("color sequential #2/A & nucleic-acid target ab palette teal:lightcyan")
-CX("color sequential #2/B & nucleic-acid target ab palette teal:lightcyan")
+CX("color sequential #2/B & nucleic-acid target ab palette teal:lightcyan")
-CX("surface sel enclose sel") # compute joint accessible surface of both chains
+CX("surface sel enclose sel") # compute joint accessible surface of both chains
-CX("transparency 50")
+CX("transparency 50")
-CX("select clear")
+CX("select clear")
-
+
-# Now superimpose the 1BM8 chain onto 1DUX chain C
+# Now superimpose the 1BM8 chain onto 1DUX chain C
-CX("show #1 cartoons")
+CX("show #1 cartoons")
-CX("matchmaker #1/A to #2/C pairing ss")  # the actual superposition
+CX("matchmaker #1/A to #2/C pairing ss")  # the actual superposition
-
+
-# study the general layout, and the position of the 1mb8 secondary structure
+# study the general layout, and the position of the 1BM8 secondary structure
-# elements relative to 1DUX
+# elements relative to 1DUX
-
+
-# Let's examine side chain orientations in more detail
+# Let's examine side chain orientations in more detail
-CX("hide #2/C cartoons")  # hide the 1DUX protein
+CX("hide #2/C cartoons")  # hide the 1DUX protein
-
+
-# select all residues in 1BM8 that are within 3.5 A of the DNA chains (a, b)
+# select all residues in 1BM8 that are within 3.5 A of the DNA chains (a, b)
-CX("select zone #2/A,B 3.5 #1 & protein residues true")
+CX("select zone #2/A,B 3.5 #1 & protein residues true")
-CX("~select sel & H")  # de-select H atoms
+CX("~select sel & H")  # de-select H atoms
-CX("show sel target ab")
+CX("show sel target ab")
-CX("size stickRadius 0.4")
+CX("size stickRadius 0.4")
-CX("select clear")
+CX("select clear")
-
+
-# The overall architecture of the Mbp1 APSES domain is a good match for the Elk
+# The overall architecture of the Mbp1 APSES domain is a good match for the Elk
-# transcription factor binding mode; the detailed conformations of side chains
+# transcription factor binding mode; the detailed conformations of side chains
-# would need to change only to a minor degree. There is a very significant
+# would need to change only to a minor degree. There is a very significant
-# degree of structural similarity; remarkable, given that the DNA is not the
+# degree of structural similarity; remarkable, given that the DNA is not the
-# target sequence of the Mbp1 transcription factor, AND the 1MB8 structure was
+# target sequence of the Mbp1 transcription factor, AND the 1BM8 structure was
-# determined without a DNA ligand.
+# determined without a DNA ligand.
-
+
-CX("remotecontrol rest stop")  # release the socket
+CX("remotecontrol rest stop")  # release the socket
-# Done.
+# Done.
-
+
-
+
-
+
-# [END]
+# [END]
--- a/RPR-FASTA.R
+++ b/RPR-FASTA.R
@ -1,322 +1,322 @@
-# tocID <- "RPR-FASTA.R"
+# tocID <- "RPR-FASTA.R"
-#
+#
-#
+#
-# Purpose:  A Bioinformatics Course:
+# Purpose:  A Bioinformatics Course:
-#              R code accompanying the RPR-FASTA unit.
+#              R code accompanying the RPR-FASTA unit.
-#
+#
-# Version:  1.1.2
+# Version:  1.1.2
-#
+#
-# Date:     2017-10  -  2021-09
+# Date:     2017-10  -  2021-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions:
+# Versions:
-#           1.1.2  style update
+#           1.1.2  style update
-#           1.1.1  bugfix - wrong function name
+#           1.1.1  bugfix - wrong function name
-#           1.1    2020 Maintenance. Rewrite validation logic. Add data
+#           1.1    2020 Maintenance. Rewrite validation logic. Add data
-#                  to utilities. Define AACOLS
+#                  to utilities. Define AACOLS
-#           1.0    New unit.
+#           1.0    New unit.
-#
+#
-#
+#
-# TODO: Make a simple solution first, then extend it to error checking, and
+# TODO: Make a simple solution first, then extend it to error checking, and
-#       to handle .mfa files.
+#       to handle .mfa files.
-#
+#
-#
+#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
+#
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
+# going on. That's not how it works ...
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC>
+#TOC>
-#TOC>   Section  Title                                 Line
+#TOC>   Section  Title                                 Line
-#TOC> -----------------------------------------------------
+#TOC> -----------------------------------------------------
-#TOC>   1        Reading and validating FASTA            45
+#TOC>   1        Reading and validating FASTA            45
-#TOC>   1.1        Validating FASTA                      81
+#TOC>   1.1        Validating FASTA                      81
-#TOC>   2        Parsing FASTA                          227
+#TOC>   2        Parsing FASTA                          227
-#TOC>   3        Interpreting FASTA                     247
+#TOC>   3        Interpreting FASTA                     247
-#TOC>   4        Writing FASTA                          274
+#TOC>   4        Writing FASTA                          274
-#TOC>
+#TOC>
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  Reading and validating FASTA  ========================================
+# =    1  Reading and validating FASTA  ========================================
-
+
-# FASTA is a text based format, structured in lines that are separated by
+# FASTA is a text based format, structured in lines that are separated by
-# line-feed or paragraph-break characters. Which one of these is used, depends
+# line-feed or paragraph-break characters. Which one of these is used, depends
-# on your operating system. But R's readLines() function knows how to handle
+# on your operating system. But R's readLines() function knows how to handle
-# these correctly, accross platforms. Don't try to read such files "by hand".
+# these correctly, across platforms. Don't try to read such files "by hand".
-# Here is the yeast Mbp1 gene, via SGD.
+# Here is the yeast Mbp1 gene, via SGD.
-
+
-file.show("./data/S288C_YDL056W_MBP1_coding.fsa")
+file.show("./data/S288C_YDL056W_MBP1_coding.fsa")
-faMBP1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
+faMBP1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
-
+
-# The warning is generated because the programmer at the NCBI who implemented
+# The warning is generated because the programmer at the NCBI who implemented
-# the code to write this FASTA file neglected to place a line-break character
+# the code to write this FASTA file neglected to place a line-break character
-# after the last sequence character. While this is not technically incorrect,
+# after the last sequence character. While this is not technically incorrect,
-# it is poor practice: the resulting file can't be distinguished from one that
+# it is poor practice: the resulting file can't be distinguished from one that
-# has been truncated in transmission.
+# has been truncated in transmission.
-
+
-head(faMBP1)
+head(faMBP1)
-
+
-# Note that there are NO line-break characters ("\n") at the end of these
+# Note that there are NO line-break characters ("\n") at the end of these
-# strings, even though they were present in the original file. readLines()
+# strings, even though they were present in the original file. readLines()
-# has "consumed" these characters while reading - but every single line is in
+# has "consumed" these characters while reading - but every single line is in
-# a vector of its own.
+# a vector of its own.
-
+
-tail(faMBP1)
+tail(faMBP1)
-
+
-# Also note that the last line has fewer characters - this means readLines()
+# Also note that the last line has fewer characters - this means readLines()
-# imported the whole line, despite it not being terminated by "\n".
+# imported the whole line, despite it not being terminated by "\n".
-
+
-# It's very straightforward to work with such data, for example by collapsing
+# It's very straightforward to work with such data, for example by collapsing
-# everything except the first line into a single string ...
+# everything except the first line into a single string ...
-
+
-f <- c(faMBP1[1], paste(faMBP1[-1], sep = "", collapse = ""))
+f <- c(faMBP1[1], paste(faMBP1[-1], sep = "", collapse = ""))
-
+
-f[1]
+f[1]
-nchar(f[2])
+nchar(f[2])
-
+
-# ==   1.1  Validating FASTA  ==================================================
+# ==   1.1  Validating FASTA  ==================================================
-
+
-# The code above is making the assumption that everything from line 2 until
+# The code above is making the assumption that everything from line 2 until
-#  the end IS sequence, the whole sequence and nothing but sequence.
+#  the end IS sequence, the whole sequence and nothing but sequence.
-#  That assumption can break down in many ways:
+#  That assumption can break down in many ways:
-#
+#
-#  - there could be more than one header line. The specification says otherwise,
+#  - there could be more than one header line. The specification says otherwise,
-#       but some older files use multiple, consecutive header lines. You don't
+#       but some older files use multiple, consecutive header lines. You don't
-#       want that to end up in your sequence.
+#       want that to end up in your sequence.
-#  - this could be not a FASTA file at all. It could be raw sequence, a
+#  - this could be not a FASTA file at all. It could be raw sequence, a
-#       different sequence file format, or a wholly different file altogether.
+#       different sequence file format, or a wholly different file altogether.
-#       If you look at the file, you can immediately tell, but if you are
+#       If you look at the file, you can immediately tell, but if you are
-#       reading the file in a complex workflow, your could easily import wrong
+#       reading the file in a complex workflow, you could easily import wrong
-#       data into your analysis.
+#       data into your analysis.
-#  - there could be more than one sequence in the file. Such Multi-FASTA files
+#  - there could be more than one sequence in the file. Such Multi-FASTA files
-#       occur commonly, as downloads of ORFs from genome regions or other
+#       occur commonly, as downloads of ORFs from genome regions or other
-#       sets of genes or proteins, or as the input / output for multiple
+#       sets of genes or proteins, or as the input / output for multiple
-#       sequence alignment programs.
+#       sequence alignment programs.
-#
+#
-# Data "from the wild" can (and usually does) have the most unexpected
+# Data "from the wild" can (and usually does) have the most unexpected
-# variations and it is really, really important to be clear about the
+# variations and it is really, really important to be clear about the
-# assumptions that you are making. It is possible to "fix" things, according
+# assumptions that you are making. It is possible to "fix" things, according
-# to the "Robustness Principle" :
+# to the "Robustness Principle" :
-#      "Be conservative in what you send,
+#      "Be conservative in what you send,
-#       be liberal in what you accept".
+#       be liberal in what you accept".
-#       (cf. https://en.wikipedia.org/wiki/Robustness_principle )
+#       (cf. https://en.wikipedia.org/wiki/Robustness_principle )
-# ... but if you think about this, that's actually a really poor idea,
+# ... but if you think about this, that's actually a really poor idea,
-# which is much more likely to dilute standards, make unwarranted
+# which is much more likely to dilute standards, make unwarranted
-# assumptions, and allow errors to pass silently and corrupt data.
+# assumptions, and allow errors to pass silently and corrupt data.
-#
+#
-# Let's discard this principle on the trash-heap of
+# Let's discard this principle on the trash-heap of
-# things-that-sound-like-a-good-idea-but-aren't. What we do instead is test,
+# things-that-sound-like-a-good-idea-but-aren't. What we do instead is test,
-# identify problems, and follow the principle: "crash early, crash often". Of
+# identify problems, and follow the principle: "crash early, crash often". Of
-# course I can write code that would reformat any possible input as a FASTA
+# course I can write code that would reformat any possible input as a FASTA
-# file - but what good will it do me if it parses the file I receive
+# file - but what good will it do me if it parses the file I receive
-# from a server into FASTA format like:
+# from a server into FASTA format like:
-#
+#
-#   >404- Page Not Found</title</head>
+#   >404- Page Not Found</title</head>
-#   dyh-PagentfndhpThepageyreqesteddesnteistnthisserverCheckthe
+#   dyh-PagentfndhpThepageyreqesteddesnteistnthisserverCheckthe
-#   spellingrcntacttheadministratrsdyhtml
+#   spellingrcntacttheadministratrsdyhtml
-#
+#
-# Therefore, we write ourselves a FASTA checker that will enforce the following:
+# Therefore, we write ourselves a FASTA checker that will enforce the following:
-#   (1) a FASTA file contains one or more sequences separated by zero or
+#   (1) a FASTA file contains one or more sequences separated by zero or
-#       more empty lines
+#       more empty lines
-#   (2) a sequence contains one header line followed by
+#   (2) a sequence contains one header line followed by
-#       one or more sequence lines
+#       one or more sequence lines
-#   (3) a sequence line contains one or more uppercase or lowercase single
+#   (3) a sequence line contains one or more uppercase or lowercase single
-#       letter amino acid codes, hyphens (gap character), or * (stop).
+#       letter amino acid codes, hyphens (gap character), or * (stop).
-#
+#
-#   Anything else should generate an error.
+#   Anything else should generate an error.
-
+
-#   (Case 1): Header(s) exist
+#   (Case 1): Header(s) exist
-fX <- c("ABC",
+fX <- c("ABC",
-        "defghi",
+        "defghi",
-        "klmnpq")
+        "klmnpq")
-sel <- grepl("^>", fX)  # "^>" is a regular expression that
+sel <- grepl("^>", fX)  # "^>" is a regular expression that
-                        # means: the exact character ">" at the
+                        # means: the exact character ">" at the
-                        # beginning ("^") of the line.
+                        # beginning ("^") of the line.
-if ( ! any(sel) ) { stop("no header lines in input.") }
+if ( ! any(sel) ) { stop("no header lines in input.") }
-
+
-
+
-#   (Case 2) No adjacent header lines
+#   (Case 2) No adjacent header lines
-fX <- c(">ABC",
+fX <- c(">ABC",
-        ">123",
+        ">123",
-        "defghi",
+        "defghi",
-        "klmnpq")
+        "klmnpq")
-sel <- grepl("^>", fX)
+sel <- grepl("^>", fX)
-sel <- sel[- length(sel)] & sel[-1] # comparing shifted vectors
+sel <- sel[- length(sel)] & sel[-1] # comparing shifted vectors
-if ( any(sel)) { stop("adjacent header lines in input.") }
+if ( any(sel)) { stop("adjacent header lines in input.") }
-
+
-#   (Case 3.1) all sequence lines contain only valid characters
+#   (Case 3.1) all sequence lines contain only valid characters
-#              (constants for valid characters AAVALID, NUCVALID, and NUCAMBIG
+#              (constants for valid characters AAVALID, NUCVALID, and NUCAMBIG
-#               are defined with the .utilities.R script)
+#               are defined with the .utilities.R script)
-AAVALID
+AAVALID
-fX <- c(">ABC",
+fX <- c(">ABC",
-        "def ;-) ghi",
+        "def ;-) ghi",
-        "klmnpq")
+        "klmnpq")
-myRegex <- sprintf("[^%s]", AAVALID)  # NOT a valid character
+myRegex <- sprintf("[^%s]", AAVALID)  # NOT a valid character
-sel <- ! grepl("^>", fX)              # NOT headers
+sel <- ! grepl("^>", fX)              # NOT headers
-if (any(grepl(myRegex, fX[sel]))) {
+if (any(grepl(myRegex, fX[sel]))) {
-  stop("invalid chracter(s) outside of header lines.")
+  stop("invalid chracter(s) outside of header lines.")
-}
+}
-
+
-#   (Case 3.2) all headers are followed directly by
+#   (Case 3.2) all headers are followed directly by
-#              at least one letter of sequence
+#              at least one letter of sequence
-fX <- c(">ABC",
+fX <- c(">ABC",
-        "",
+        "",
-        ">123",
+        ">123",
-        "defghi",
+        "defghi",
-        "klmnpq")
+        "klmnpq")
-sel <- grep("^>", fX) + 1             # indexes of headers + 1
+sel <- grep("^>", fX) + 1             # indexes of headers + 1
-myRegex <- sprintf("[%s]+", AAVALID)  # at least one valid character
+myRegex <- sprintf("[%s]+", AAVALID)  # at least one valid character
-if (! all(grepl(myRegex, fX[sel]))) {
+if (! all(grepl(myRegex, fX[sel]))) {
-  stop("a header has no adjacent sequence.")
+  stop("a header has no adjacent sequence.")
-}
+}
-# Ah, you might ask - couldn't we just have dropped all empty lines, and
+# Ah, you might ask - couldn't we just have dropped all empty lines, and
-# then caught this in Case 2? No - for two reasons: we would still miss headers
+# then caught this in Case 2? No - for two reasons: we would still miss headers
-# at the end of file, and, we would have changed the line numbering - and
+# at the end of file, and, we would have changed the line numbering - and
-# ideally our "production" function will create information about where the
+# ideally our "production" function will create information about where the
-# error is to be found.
+# error is to be found.
-
+
-
+
-# Now combine this into a function ...
+# Now combine this into a function ...
-
+
-val <- function(fa) {
+val <- function(fa) {
-
+
-  if ( ! any(grepl("^>", fa)) ) {
+  if ( ! any(grepl("^>", fa)) ) {
-    stop("no header lines in input.")
+    stop("no header lines in input.")
-  }
+  }
-
+
-  sel <- grepl("^>", fa)
+  sel <- grepl("^>", fa)
-  if ( any(sel[- length(sel)] & sel[-1])) {
+  if ( any(sel[- length(sel)] & sel[-1])) {
-    stop("adjacent header lines in input.")
+    stop("adjacent header lines in input.")
-  }
+  }
-
+
-  sel <- ! grepl("^>", fa)
+  sel <- ! grepl("^>", fa)
-  if ( any(grepl(sprintf("[^%s]", AAVALID), fa[sel]))) {
+  if ( any(grepl(sprintf("[^%s]", AAVALID), fa[sel]))) {
-    stop("invalid chracter(s) outside of header lines.")
+    stop("invalid chracter(s) outside of header lines.")
-  }
+  }
-
+
-  sel <- grep("^>", fa) + 1
+  sel <- grep("^>", fa) + 1
-  if (! all(grepl(sprintf("[%s]+", AAVALID), fa[sel]))) {
+  if (! all(grepl(sprintf("[%s]+", AAVALID), fa[sel]))) {
-    stop("a header has no adjacent sequence.")
+    stop("a header has no adjacent sequence.")
-  }
+  }
-
+
-  return(invisible(NULL))
+  return(invisible(NULL))
-}
+}
-
+
-# Here is an example
+# Here is an example
-FA <- c(">head1",
+FA <- c(">head1",
-        "acdef",
+        "acdef",
-        "ghi",
+        "ghi",
-        "",
+        "",
-        ">head2",
+        ">head2",
-        "kl",
+        "kl",
-        ">head3",
+        ">head3",
-        "mn",
+        "mn",
-        "pqrs")
+        "pqrs")
-val(FA)     # ... should not create an error
+val(FA)     # ... should not create an error
-
+
-
+
-# A somewhat more elaborate validateFA() function was loaded with the
+# A somewhat more elaborate validateFA() function was loaded with the
-# ./utilities.R script. It needs a bit more bookkeeping, since NCBI multi-
+# ./.utilities.R script. It needs a bit more bookkeeping, since NCBI multi-
-# fasta files have space-characters in their spacer lines. Try it ...
+# fasta files have space-characters in their spacer lines. Try it ...
-validateFA(FA)
+validateFA(FA)
-
+
-# =    2  Parsing FASTA  =======================================================
+# =    2  Parsing FASTA  =======================================================
-
+
-# Once we have validated our assumptions about our input, it's quite
+# Once we have validated our assumptions about our input, it's quite
-# painless to parse it. I have put this together as a function and the function
+# painless to parse it. I have put this together as a function and the function
-# gets loaded from ./.utilities.R
+# gets loaded from ./.utilities.R
-#
+#
-
+
-# Lets try this:
+# Let's try this:
-#   - the first 3 elements of faMBP1:
+#   - the first 3 elements of faMBP1:
-readFASTA(faMBP1[1:3])
+readFASTA(faMBP1[1:3])
-
+
-#   - a multi FASTA file of aligned APSES domain sequences:
+#   - a multi FASTA file of aligned APSES domain sequences:
-
+
-refAPSES <- readFASTA("./data/refAPSES.mfa")
+refAPSES <- readFASTA("./data/refAPSES.mfa")
-
+
-# Subset the sequence with "P39678" in the header
+# Subset the sequence with "P39678" in the header
-refAPSES[grep("P39678", refAPSES$head) ,]
+refAPSES[grep("P39678", refAPSES$head) ,]
-
+
-
+
-
+
-# =    3  Interpreting FASTA  ==================================================
+# =    3  Interpreting FASTA  ==================================================
-
+
-
+
-# FASTA files are straightforward to interpret - just one thing may be of note:
+# FASTA files are straightforward to interpret - just one thing may be of note:
-# when working with strings, we can use substr(<string>, <start>, <stop>) to
+# when working with strings, we can use substr(<string>, <start>, <stop>) to
-# extract substrings, but more often we expand the string into a vector of
+# extract substrings, but more often we expand the string into a vector of
-# single characters with strsplit(<string>, ""). strsplit() returns a list,
+# single characters with strsplit(<string>, ""). strsplit() returns a list,
-# to accommodate that <string> could be a vector of many elements, therefore
+# to accommodate that <string> could be a vector of many elements, therefore
-# we usually unlist() the result if we use it only on a single string.
+# we usually unlist() the result if we use it only on a single string.
-
+
-# Example: How many positive charged residues in "MBP1_SACCE"?
+# Example: How many positively charged residues in "MBP1_SACCE"?
-
+
-s <- unlist(strsplit(refAPSES$seq[grep("MBP1_SACCE", refAPSES$head)], ""))
+s <- unlist(strsplit(refAPSES$seq[grep("MBP1_SACCE", refAPSES$head)], ""))
-s
+s
-
+
-sum(grepl("[HKR]", s)) # 20 (+) charged residues. grepl() returns TRUE and FALSE
+sum(grepl("[HKR]", s)) # 20 (+) charged residues. grepl() returns TRUE and FALSE
-                       # for the characters, sum() coerces to 1 and 0
+                       # for the characters, sum() coerces to 1 and 0
-                       # respectively, and that gives us the result.
+                       # respectively, and that gives us the result.
-
+
-100 * sum(grepl("[HKR]", s)) / length(s) # in percent: 20.2 %
+100 * sum(grepl("[HKR]", s)) / length(s) # in percent: 20.2 %
-
+
-# residue distribution
+# residue distribution
-x <- factor(s, levels = names(AACOLS))
+x <- factor(s, levels = names(AACOLS))
-pie(table(x)[names(AACOLS)], col = AACOLS)
+pie(table(x)[names(AACOLS)], col = AACOLS)
-
+
-
+
-
+
-# =    4  Writing FASTA  =======================================================
+# =    4  Writing FASTA  =======================================================
-
+
-
+
-# Writing FASTA files is mostly just the reverse of reading, with one
+# Writing FASTA files is mostly just the reverse of reading, with one
-# twist: we need to break the long sequence string into chunks of the desired
+# twist: we need to break the long sequence string into chunks of the desired
-# width. The FASTA specification calls for a maximum of 120 characters per line,
+# width. The FASTA specification calls for a maximum of 120 characters per line,
-# but writing out much less than that is common, since it allows to comfortably
+# but writing out much less than that is common, since it allows to comfortably
-# view lines on the console, or printing them on a sheet of paper (do we still
+# view lines on the console, or print them on a sheet of paper (do we still
-# do that actually?). How do we break a string into chunks? A combination of
+# do that actually?). How do we break a string into chunks? A combination of
-# seq(<from>, <to>, <by>) with substring(<string>, <start>, <stop>) will work
+# seq(<from>, <to>, <by>) with substring(<string>, <start>, <stop>) will work
-# nicely. (Note that substring() is vectorized, whereas substr() is not!) As we
+# nicely. (Note that substring() is vectorized, whereas substr() is not!) As we
-# loop through our FASTA object in memory, we can build the output by c()'ing
+# loop through our FASTA object in memory, we can build the output by c()'ing
-# blocks of header + sequence to each other. For VERY large objects this might
+# blocks of header + sequence to each other. For VERY large objects this might
-# be slow - in that case, we might want to precalculate the size of the output
+# be slow - in that case, we might want to precalculate the size of the output
-# object. But that's more of a hypothetical consideration.
+# object. But that's more of a hypothetical consideration.
-
+
-( s <- refAPSES$seq[2] )
+( s <- refAPSES$seq[2] )
-nchar(s)
+nchar(s)
-w <- 30     # width of chunk
+w <- 30     # width of chunk
-(starts <- seq(1, nchar(s), by = w))      # starting index of chunk
+(starts <- seq(1, nchar(s), by = w))      # starting index of chunk
-(ends <- c((starts - 1)[-1], nchar(s)))   # ending index of chunk
+(ends <- c((starts - 1)[-1], nchar(s)))   # ending index of chunk
-
+
-# Task: Is this safe? What happens if nchar(s) is shorter than w?
+# Task: Is this safe? What happens if nchar(s) is shorter than w?
-#       What happens if nchar(s) is an exact multiple of w?
+#       What happens if nchar(s) is an exact multiple of w?
-
+
-substring(s, starts, ends)
+substring(s, starts, ends)
-# confirm that the output contains the first and last residue, and both
+# confirm that the output contains the first and last residue, and both
-# residues adjacent to the breaks
+# residues adjacent to the breaks
-
+
-# As always, the function has been defined in ".utilities.R" for to use
+# As always, the function has been defined in ".utilities.R" for you to use
-# any time...  type   writeFASTA  to examine it.
+# any time...  type   writeFASTA  to examine it.
-
+
-# Let's try this...
+# Let's try this...
-
+
-writeFASTA(refAPSES, width = 40)
+writeFASTA(refAPSES, width = 40)
-
+
-# roundtrip for validation: write refAPSES with a different format,
+# roundtrip for validation: write refAPSES with a different format,
-# read it back in - the new dataframe must be identical
+# read it back in - the new dataframe must be identical
-# to the original dataframe.
+# to the original dataframe.
-fname <- tempfile()
+fname <- tempfile()
-writeFASTA(refAPSES, fn = fname, width = 30)
+writeFASTA(refAPSES, fn = fname, width = 30)
-identical(refAPSES, readFASTA(fname))
+identical(refAPSES, readFASTA(fname))
-
+
-# ...works for me  :-)
+# ...works for me  :-)
-
+
-
+
-# [END]
+# [END]
--- a/RPR-GEO2R.R
+++ b/RPR-GEO2R.R
--- a/RPR-Genetic_code_optimality.R
+++ b/RPR-Genetic_code_optimality.R
@ -1,385 +1,385 @@
-# tocID <- "RPR-Genetic_code_optimality.R"
+# tocID <- "RPR-Genetic_code_optimality.R"
-#
+#
-# Purpose:  A Bioinformatics Course:
+# Purpose:  A Bioinformatics Course:
-#              R code accompanying the RPR-Genetic_code_optimality unit.
+#              R code accompanying the RPR-Genetic_code_optimality unit.
-#
+#
-# Version:  1.3
+# Version:  1.3
-#
+#
-# Date:     2017-10  -  2020-09
+# Date:     2017-10  -  2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions:
+# Versions:
-#           1.3    2020 Maintenance
+#           1.3    2020 Maintenance
-#           1.2    Change from require() to requireNamespace(),
+#           1.2    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout,
+#                      use <package>::<function>() idiom throughout,
-#                      use BiocManager:: not biocLite()
+#                      use BiocManager:: not biocLite()
-#           1.1      Update set.seed() usage
+#           1.1      Update set.seed() usage
-#           1.0.1    Fixed two bugs discovered by Suan Chin Yeo.
+#           1.0.1    Fixed two bugs discovered by Suan Chin Yeo.
-#           1.0      New material.
+#           1.0      New material.
-#
+#
-#
+#
-# TODO:
+# TODO:
-#
+#
-#
+#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
+#
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
+# going on. That's not how it works ...
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC> 
+#TOC> 
-#TOC>   Section  Title                                          Line
+#TOC>   Section  Title                                          Line
-#TOC> --------------------------------------------------------------
+#TOC> --------------------------------------------------------------
-#TOC>   1        Designing a computational experiment             58
+#TOC>   1        Designing a computational experiment             58
-#TOC>   2        Setting up the tools                             74
+#TOC>   2        Setting up the tools                             74
-#TOC>   2.1        Natural and alternative genetic codes          77
+#TOC>   2.1        Natural and alternative genetic codes          77
-#TOC>   2.2        Effect of mutations                           135
+#TOC>   2.2        Effect of mutations                           135
-#TOC>   2.2.1          reverse-translate                         146
+#TOC>   2.2.1          reverse-translate                         146
-#TOC>   2.2.2          Randomly mutate                           171
+#TOC>   2.2.2          Randomly mutate                           171
-#TOC>   2.2.3          Forward- translate                        196
+#TOC>   2.2.3          Forward- translate                        196
-#TOC>   2.2.4          measure effect                            213
+#TOC>   2.2.4          measure effect                            213
-#TOC>   3        Run the experiment                              267
+#TOC>   3        Run the experiment                              267
-#TOC>   4        Task solutions                                  363
+#TOC>   4        Task solutions                                  363
-#TOC> 
+#TOC> 
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# This unit demonstrates R code to simulate alternate genetic codes and evaluate
+# This unit demonstrates R code to simulate alternate genetic codes and evaluate
-# their robustness to code changes. The approaches are quite simple and you
+# their robustness to code changes. The approaches are quite simple and you
-# will be able to come up with obvious refinements; the point of this code is to
+# will be able to come up with obvious refinements; the point of this code is to
-# demonstrate some R programming techniques, in preparation for more
+# demonstrate some R programming techniques, in preparation for more
-# sophisticated questions later.
+# sophisticated questions later.
-
+
-
+
-# =    1  Designing a computational experiment  ================================
+# =    1  Designing a computational experiment  ================================
-
+
-# Computational experiments are conducted like wet-lab experiments. We begin
+# Computational experiments are conducted like wet-lab experiments. We begin
-# with a hypothesis, then define the observables that relate to the hypothesis,
+# with a hypothesis, then define the observables that relate to the hypothesis,
-# then define the measures we apply to observations, and finally we interpret
+# then define the measures we apply to observations, and finally we interpret
-# our observations. If we want to learn something about the evolution of the
+# our observations. If we want to learn something about the evolution of the
-# genetic code ...
+# genetic code ...
-
+
-#  - we construct a hypothesis such as: the genetic code has evolved so as to
+#  - we construct a hypothesis such as: the genetic code has evolved so as to
-#      minimize the effect of mutations;
+#      minimize the effect of mutations;
-#  - we define the observables: the effect of mutations in
+#  - we define the observables: the effect of mutations in
-#      sequences, given the natural and possible alternative codes;
+#      sequences, given the natural and possible alternative codes;
-#  - we define the measures to quantify the effect of mutations;
+#  - we define the measures to quantify the effect of mutations;
-#  - then we compute alternatives and interpret the results.
+#  - then we compute alternatives and interpret the results.
-
+
-
+
-# =    2  Setting up the tools  ================================================
+# =    2  Setting up the tools  ================================================
-
+
-
+
-# ==   2.1  Natural and alternative genetic codes  =============================
+# ==   2.1  Natural and alternative genetic codes  =============================
-
+
-# Load genetic code tables from the Biostrings package
+# Load genetic code tables from the Biostrings package
-if (! requireNamespace("BiocManager", quietly = TRUE)) {
+if (! requireNamespace("BiocManager", quietly = TRUE)) {
-  install.packages("BiocManager")
+  install.packages("BiocManager")
-}
+}
-if (! requireNamespace("Biostrings", quietly = TRUE)) {
+if (! requireNamespace("Biostrings", quietly = TRUE)) {
-  BiocManager::install("Biostrings")
+  BiocManager::install("Biostrings")
-}
+}
-# Package information:
+# Package information:
-#  library(help = Biostrings)       # basic information
+#  library(help = Biostrings)       # basic information
-#  browseVignettes("Biostrings")    # available vignettes
+#  browseVignettes("Biostrings")    # available vignettes
-#  data(package = "Biostrings")     # available datasets
+#  data(package = "Biostrings")     # available datasets
-
+
-
+
-# There are many ways to generate alternative codes. The simplest way is to
+# There are many ways to generate alternative codes. The simplest way is to
-# randomly assign amino acids to codons. A more sophisticated way is to keep the
+# randomly assign amino acids to codons. A more sophisticated way is to keep the
-# redundancy of codons intact, since it may reflect some form of symmetry
+# redundancy of codons intact, since it may reflect some form of symmetry
-# breaking that ignores the third nucleotide of a codon for the most part;
+# breaking that ignores the third nucleotide of a codon for the most part;
-# therefore we only replace the amino acids of the existing code with random
+# therefore we only replace the amino acids of the existing code with random
-# others. Here are two functions that implement these two ideas about alternate
+# others. Here are two functions that implement these two ideas about alternate
-# codes.
+# codes.
-
+
-randomGC <- function(GC) {
+randomGC <- function(GC) {
-  # Return a genetic code with randomly assigned amino acids.
+  # Return a genetic code with randomly assigned amino acids.
-  # Parameters:
+  # Parameters:
-  #    GC   named chr  length-64 character vector of 20 amino acid one-letter
+  #    GC   named chr  length-64 character vector of 20 amino acid one-letter
-  #                       codes plus "*" (stop), named with the codon triplet.
+  #                       codes plus "*" (stop), named with the codon triplet.
-  # Value:  named chr  same vector with random amino acid assignments in which
+  # Value:  named chr  same vector with random amino acid assignments in which
-  #                       every amino acid and "*" is encoded at least once.
+  #                       every amino acid and "*" is encoded at least once.
-
+
-  aa <- unique(GC)                           # the amino acids in the input code
+  aa <- unique(GC)                           # the amino acids in the input code
-  GC[1:64] <- sample(aa, 64, replace = TRUE) # random code
+  GC[1:64] <- sample(aa, 64, replace = TRUE) # random code
-  while(length(unique(GC)) < length(aa)) {   # We could end up with a code that
+  while(length(unique(GC)) < length(aa)) {   # We could end up with a code that
-                                             # does not contain all amino acids,
+                                             # does not contain all amino acids,
-                                             # then we sample() again.
+                                             # then we sample() again.
-    GC[1:64] <- sample(aa, 64, replace = TRUE)
+    GC[1:64] <- sample(aa, 64, replace = TRUE)
-  }
+  }
-  return(GC)
+  return(GC)
-}
+}
-
+
-swappedGC <- function(GC) {
+swappedGC <- function(GC) {
-  # Return a genetic code with randomly swapped amino acids.
+  # Return a genetic code with randomly swapped amino acids.
-  # Parameters:
+  # Parameters:
-  #    GC   named chr  length-64 character vector of 20 amino acid one-letter
+  #    GC   named chr  length-64 character vector of 20 amino acid one-letter
-  #                       codes plus "*" (stop), named with the codon triplet.
+  #                       codes plus "*" (stop), named with the codon triplet.
-  # Value:  named chr  same vector with random amino acid assignments where the
+  # Value:  named chr  same vector with random amino acid assignments where the
-  #                       amino acids have been swapped.
+  #                       amino acids have been swapped.
-
+
-  aaOrig <- unique(GC)                       # the amino acids in the input code
+  aaOrig <- unique(GC)                       # the amino acids in the input code
-  aaSwap <- sample(aaOrig, length(aaOrig))   # shuffled
+  aaSwap <- sample(aaOrig, length(aaOrig))   # shuffled
-  names(aaSwap) <- aaOrig                    # name them after the original
+  names(aaSwap) <- aaOrig                    # name them after the original
-  GC[1:64] <- aaSwap[GC]                     # replace original with shuffled
+  GC[1:64] <- aaSwap[GC]                     # replace original with shuffled
-
+
-  return(GC)
+  return(GC)
-}
+}
-
+
-
+
-# ==   2.2  Effect of mutations  ===============================================
+# ==   2.2  Effect of mutations  ===============================================
-
+
-
+
-# To evaluate the effects of mutations we will do the following:
+# To evaluate the effects of mutations we will do the following:
-#   - we take an amino acid sequence (Mbp1 will do just nicely);
+#   - we take an amino acid sequence (Mbp1 will do just nicely);
-#   - we reverse-translate it into a nucleotide sequence;
+#   - we reverse-translate it into a nucleotide sequence;
-#   - we mutate it randomly;
+#   - we mutate it randomly;
-#   - we translate it back to amino acids;
+#   - we translate it back to amino acids;
-#   - we count the number of mutations and evaluate their severity.
+#   - we count the number of mutations and evaluate their severity.
-
+
-
+
-# ===   2.2.1  reverse-translate                    
+# ===   2.2.1  reverse-translate                    
-
+
-# To reverse-translate an amino acid vector, we randomly pick one of its
+# To reverse-translate an amino acid vector, we randomly pick one of its
-# codons from a genetic code, and assemble all codons to a sequence.
+# codons from a genetic code, and assemble all codons to a sequence.
-
+
-traRev <- function(s, GC) {
+traRev <- function(s, GC) {
-  # Parameters:
+  # Parameters:
-  #      s   chr   a sequence vector
+  #      s   chr   a sequence vector
-  #      GC  chr   a genetic code
+  #      GC  chr   a genetic code
-  # Value:
+  # Value:
-  #      A reverse-translated vector of codons
+  #      A reverse-translated vector of codons
-  vC <- character(length(s))
+  vC <- character(length(s))
-
+
-  for (i in seq_along(s)) {
+  for (i in seq_along(s)) {
-    codon <- names(GC)[GC == s[i]]   # get all codons for this AA
+    codon <- names(GC)[GC == s[i]]   # get all codons for this AA
-    if (length(codon) > 1) {         # if there's more than one ...
+    if (length(codon) > 1) {         # if there's more than one ...
-      codon <- sample(codon, 1)      # pick one at random ...
+      codon <- sample(codon, 1)      # pick one at random ...
-    }
+    }
-    vC[i] <- codon                   # store it
+    vC[i] <- codon                   # store it
-  }
+  }
-
+
-  return(vC)
+  return(vC)
-}
+}
-
+
-
+
-# ===   2.2.2  Randomly mutate                      
+# ===   2.2.2  Randomly mutate                      
-
+
-# To mutate, we split a codon into its three nucleotides, then randomly replace
+# To mutate, we split a codon into its three nucleotides, then randomly replace
-# one of the three with another nucleotide.
+# one of the three with another nucleotide.
-
+
-randMut <- function(vC) {
+randMut <- function(vC) {
-  # Parameter:
+  # Parameter:
-  #    vC   chr     a vector of codons
+  #    vC   chr     a vector of codons
-  # Value:  chr     a vector of codons with a single point mutation from vC
+  # Value:  chr     a vector of codons with a single point mutation from vC
-
+
-  nuc <- c("A", "C", "G", "T")
+  nuc <- c("A", "C", "G", "T")
-
+
-  for (i in seq_along(vC)) {
+  for (i in seq_along(vC)) {
-    triplet <- unlist(strsplit(vC[i], ""))         # split into three nucl.
+    triplet <- unlist(strsplit(vC[i], ""))         # split into three nucl.
-    iNuc <- sample(1:3, 1)                         # choose one of the three
+    iNuc <- sample(1:3, 1)                         # choose one of the three
-    mutNuc <- sample(nuc[nuc != triplet[iNuc]], 1) # chose a mutated nucleotide
+    mutNuc <- sample(nuc[nuc != triplet[iNuc]], 1) # chose a mutated nucleotide
-    triplet[iNuc] <- mutNuc                        # replace the original
+    triplet[iNuc] <- mutNuc                        # replace the original
-    vC[i] <- paste0(triplet, collapse = "")        # collapse it to a codon
+    vC[i] <- paste0(triplet, collapse = "")        # collapse it to a codon
-  }
+  }
-  return(vC)
+  return(vC)
-
+
-}
+}
-
+
-
+
-
+
-# ===   2.2.3  Forward- translate                   
+# ===   2.2.3  Forward- translate                   
-
+
-traFor <- function(vC, GC) {
+traFor <- function(vC, GC) {
-  # Parameters:
+  # Parameters:
-  #      vC   chr   a codon vector
+  #      vC   chr   a codon vector
-  #      GC   chr   a genetic code
+  #      GC   chr   a genetic code
-  # Value:
+  # Value:
-  #      A vector of amino acids
+  #      A vector of amino acids
-  vAA <- character(length(vC))
+  vAA <- character(length(vC))
-
+
-  for (i in seq_along(vC)) {
+  for (i in seq_along(vC)) {
-    vAA[i] <- GC[vC[i]]         # translate and store
+    vAA[i] <- GC[vC[i]]         # translate and store
-  }
+  }
-  return(vAA)
+  return(vAA)
-}
+}
-
+
-
+
-# ===   2.2.4  measure effect                       
+# ===   2.2.4  measure effect                       
-
+
-# How do we evaluate the effect of the mutation? We'll take a simple ad hoc
+# How do we evaluate the effect of the mutation? We'll take a simple ad hoc
-# approach: we divide amino acids into hydrophobic, hydrophilic, and neutral
+# approach: we divide amino acids into hydrophobic, hydrophilic, and neutral
-# categories, according to their free energy of transfer from water to octanol:
+# categories, according to their free energy of transfer from water to octanol:
-aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
+aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
-aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
+aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
-aaNeutral <- c("A", "H", "T", "S", "V", "G")
+aaNeutral <- c("A", "H", "T", "S", "V", "G")
-
+
-# Then we will penalize as follows:
+# Then we will penalize as follows:
-# Changes within one category: 0.1
+# Changes within one category: 0.1
-# Changes from hydrophobic or hydrophilic to neutral or back: 0.3
+# Changes from hydrophobic or hydrophilic to neutral or back: 0.3
-# Changes from hydrophobic to hydrophilic or back: 1.0
+# Changes from hydrophobic to hydrophilic or back: 1.0
-# Changes to stop-codon: 3.0  (NOTE: not currently applied by evalMut() below)
+# Changes to stop-codon: 3.0  (NOTE: not currently applied by evalMut() below)
-
+
-evalMut <- function(nat, mut) {
+evalMut <- function(nat, mut) {
-  # Evaluate severity of mutations between amino acid sequence vectors nat and
+  # Evaluate severity of mutations between amino acid sequence vectors nat and
-  # mut in an ad hoc approach based on hydrophobicity changes.
+  # mut in an ad hoc approach based on hydrophobicity changes.
-  aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
+  aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
-  aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
+  aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
-  aaNeutral <- c("A", "H", "T", "S", "V", "G")
+  aaNeutral <- c("A", "H", "T", "S", "V", "G")
-
+
-  penalties <- numeric(length(nat))
+  penalties <- numeric(length(nat))
-  lMut <- nat != mut    # logical TRUE for all mutated positions
+  lMut <- nat != mut    # logical TRUE for all mutated positions
-
+
-  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaHphobic)] <- 0.1
+  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaHphobic)] <- 0.1
-  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaHphilic)] <- 1.0
+  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaHphilic)] <- 1.0
-  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaNeutral)] <- 0.3
+  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaNeutral)] <- 0.3
-
+
-  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaHphobic)] <- 1.0
+  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaHphobic)] <- 1.0
-  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaHphilic)] <- 0.1
+  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaHphilic)] <- 0.1
-  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaNeutral)] <- 0.3
+  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaNeutral)] <- 0.3
-
+
-  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaHphobic)] <- 0.3
+  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaHphobic)] <- 0.3
-  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaHphilic)] <- 0.3
+  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaHphilic)] <- 0.3
-  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaNeutral)] <- 0.1
+  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaNeutral)] <- 0.1
-
+
-  return(sum(penalties))
+  return(sum(penalties))
-}
+}
-
+
-# A more sophisticated approach could take additional quantities into account,
+# A more sophisticated approach could take additional quantities into account,
-# such as charge, size, or flexibility - and it could add heuristics, such as:
+# such as charge, size, or flexibility - and it could add heuristics, such as:
-# proline is always bad in secondary structure, charged amino acids are terrible
+# proline is always bad in secondary structure, charged amino acids are terrible
-# in the folded core of a protein, replacing a small by a large amino acid in
+# in the folded core of a protein, replacing a small by a large amino acid in
-# the core is very disruptive ... etc.
+# the core is very disruptive ... etc.
-#
+#
-# For our experiment, we should not  use a mutation data matrix however:
+# For our experiment, we should not  use a mutation data matrix however:
-# empirical mutation probabilities are superbly suited to estimate evolutionary
+# empirical mutation probabilities are superbly suited to estimate evolutionary
-# relationships. Here however, as we are trying to evaluate effects of random
+# relationships. Here however, as we are trying to evaluate effects of random
-# mutations on genetic codes, our reasoning would be circular - we would
+# mutations on genetic codes, our reasoning would be circular - we would
-# discover that the natural genetic code is optimal ... because it is most
+# discover that the natural genetic code is optimal ... because it is most
-# similar to the natural genetic code. That would be Cargo Cult bioinformatics.
+# similar to the natural genetic code. That would be Cargo Cult bioinformatics.
-
+
-
+
-# =    3  Run the experiment  ==================================================
+# =    3  Run the experiment  ==================================================
-
+
-# Fetch the standard Genetic code from Biostrings::
+# Fetch the standard Genetic code from Biostrings::
-
+
-stdCode <- Biostrings::GENETIC_CODE
+stdCode <- Biostrings::GENETIC_CODE
-
+
-# Fetch the nucleotide sequence for MBP1:
+# Fetch the nucleotide sequence for MBP1:
-
+
-myDNA <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")[-1]
+myDNA <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")[-1]
-myDNA <- paste0(myDNA, collapse = "")
+myDNA <- paste0(myDNA, collapse = "")
-myDNA <- as.character(Biostrings::codons(Biostrings::DNAString(myDNA)))
+myDNA <- as.character(Biostrings::codons(Biostrings::DNAString(myDNA)))
-myDNA <- myDNA[-length(myDNA)]  # drop the stop codon
+myDNA <- myDNA[-length(myDNA)]  # drop the stop codon
-
+
-myAA <- traFor(myDNA, stdCode)
+myAA <- traFor(myDNA, stdCode)
-
+
-# Mutate and evaluate
+# Mutate and evaluate
-set.seed(112358)
+set.seed(112358)
-x <- randMut(myDNA)
+x <- randMut(myDNA)
-set.seed(NULL)
+set.seed(NULL)
-x <- traFor(x, stdCode)
+x <- traFor(x, stdCode)
-evalMut(myAA, x)  # 166.4
+evalMut(myAA, x)  # 166.4
-
+
-# Try this 200 times, and see how the values are distributed.
+# Try this 200 times, and see how the values are distributed.
-N <- 200
+N <- 200
-valSTDC <- numeric(N)
+valSTDC <- numeric(N)
-
+
-set.seed(112358)                   # set RNG seed for repeatable randomness
+set.seed(112358)                   # set RNG seed for repeatable randomness
-for (i in 1:N) {                   # this takes a few seconds ...
+for (i in 1:N) {                   # this takes a few seconds ...
-  x <- randMut(myDNA)              # mutate
+  x <- randMut(myDNA)              # mutate
-  x <- traFor(x, stdCode)     # translate
+  x <- traFor(x, stdCode)     # translate
-  valSTDC[i] <- evalMut(myAA, x)    # evaluate
+  valSTDC[i] <- evalMut(myAA, x)    # evaluate
-}
+}
-set.seed(NULL)                     # reset the RNG
+set.seed(NULL)                     # reset the RNG
-
+
-hist(valSTDC,
+hist(valSTDC,
-     breaks = 15,
+     breaks = 15,
-     col = "palegoldenrod",
+     col = "palegoldenrod",
-     xlim = c(0, 400),
+     xlim = c(0, 400),
-     ylim = c(0, N/4),
+     ylim = c(0, N/4),
-     main = "Standard vs. Synthetic Genetic Code",
+     main = "Standard vs. Synthetic Genetic Code",
-     xlab = "Mutation penalty")
+     xlab = "Mutation penalty")
-
+
-# This looks like a normal distribution. Let's assume the effect of mutations
+# This looks like a normal distribution. Let's assume the effect of mutations
-# under the standard genetic code is the mean of this distribution:
+# under the standard genetic code is the mean of this distribution:
-effectSTDC <- mean(valSTDC)  # 178.1
+effectSTDC <- mean(valSTDC)  # 178.1
-
+
-# Now we can look at the effects of alternate genetic codes:
+# Now we can look at the effects of alternate genetic codes:
-
+
-set.seed(112358)
+set.seed(112358)
-# choose a new code
+# choose a new code
-GC <- randomGC(stdCode)
+GC <- randomGC(stdCode)
-set.seed(NULL)
+set.seed(NULL)
-
+
-# reverse translate hypothetical sequence according to the new code
+# reverse translate hypothetical sequence according to the new code
-x <- traRev(myAA, GC)
+x <- traRev(myAA, GC)
-
+
-x <- randMut(x)        # randomly mutate hypothetical nucleotide sequence
+x <- randMut(x)        # randomly mutate hypothetical nucleotide sequence
-x <- traFor(x, GC)     # translate back, with the new code
+x <- traFor(x, GC)     # translate back, with the new code
-evalMut(myAA, x)       # evaluate mutation effects: 298.5
+evalMut(myAA, x)       # evaluate mutation effects: 298.5
-
+
-# That seems a fair bit higher than what we saw as "effectSTDC"
+# That seems a fair bit higher than what we saw as "effectSTDC"
-# Let's try with different genetic codes. 200 trials - but this time every trial
+# Let's try with different genetic codes. 200 trials - but this time every trial
-# is with a different, synthetic genetic code.
+# is with a different, synthetic genetic code.
-
+
-N <- 200
+N <- 200
-valXGC <- numeric(N)
+valXGC <- numeric(N)
-
+
-set.seed(1414214)                # set RNG seed for repeatable randomness
+set.seed(1414214)                # set RNG seed for repeatable randomness
-for (i in 1:N) {
+for (i in 1:N) {
-  GC <- randomGC(stdCode)   # Choose code
+  GC <- randomGC(stdCode)   # Choose code
-  x <- traRev(myAA, GC)          # reverse translate
+  x <- traRev(myAA, GC)          # reverse translate
-  x <- randMut(x)                # mutate
+  x <- randMut(x)                # mutate
-  x <- traFor(x, GC)             # translate
+  x <- traFor(x, GC)             # translate
-  valXGC[i] <- evalMut(myAA, x)  # evaluate
+  valXGC[i] <- evalMut(myAA, x)  # evaluate
-}
+}
-set.seed(NULL)                   # reset the RNG
+set.seed(NULL)                   # reset the RNG
-
+
-hist(valXGC,
+hist(valXGC,
-     col = "plum",
+     col = "plum",
-     breaks = 15,
+     breaks = 15,
-     add = TRUE)
+     add = TRUE)
-
+
-# These two distributions are very widely separated!
+# These two distributions are very widely separated!
-
+
-# Task: Perform the same experiment with the swapped genetic code.
+# Task: Perform the same experiment with the swapped genetic code.
-#       Compare the distributions. Interpret the result.
+#       Compare the distributions. Interpret the result.
-
+
-
+
-# These are simple experiments, under assumptions that can be refined in
+# These are simple experiments, under assumptions that can be refined in
-# meaningful ways. Yet, even those simple computational experiments show
+# meaningful ways. Yet, even those simple computational experiments show
-# that the Universal Genetic Code has features that one would predict if
+# that the Universal Genetic Code has features that one would predict if
-# it has evolved under selective pressure to minimize the effects of mutations.
+# it has evolved under selective pressure to minimize the effects of mutations.
-# Gradual change under mutation is beneficial to evolution, disruptive
+# Gradual change under mutation is beneficial to evolution, disruptive
-# change is not.
+# change is not.
-
+
-
+
-# =    4  Task solutions  ======================================================
+# =    4  Task solutions  ======================================================
-
+
-N <- 200
+N <- 200
-valSGC <- numeric(N)
+valSGC <- numeric(N)
-
+
-set.seed(2718282)                # set RNG seed for repeatable randomness
+set.seed(2718282)                # set RNG seed for repeatable randomness
-for (i in 1:N) {
+for (i in 1:N) {
-  GC <- swappedGC(stdCode)  # Choose code
+  GC <- swappedGC(stdCode)  # Choose code
-  x <- traRev(myAA, GC)          # reverse translate
+  x <- traRev(myAA, GC)          # reverse translate
-  x <- randMut(x)                # mutate
+  x <- randMut(x)                # mutate
-  x <- traFor(x, GC)             # translate
+  x <- traFor(x, GC)             # translate
-  valSGC[i] <- evalMut(myAA, x)  # evaluate
+  valSGC[i] <- evalMut(myAA, x)  # evaluate
-}
+}
-set.seed(NULL)                   # reset the RNG
+set.seed(NULL)                   # reset the RNG
-
+
-hist(valSGC,
+hist(valSGC,
-     col = "#6688FF88",
+     col = "#6688FF88",
-     breaks = 15,
+     breaks = 15,
-     add = TRUE)
+     add = TRUE)
-
+
-
+
-
+
-# [END]
+# [END]
--- a/RPR-Introduction.R
+++ b/RPR-Introduction.R
@ -1,50 +1,50 @@
-# tocID <- "RPR-Introduction.R"
+# tocID <- "RPR-Introduction.R"
-#
+#
-#
+#
-# Purpose: A Bioinformatics Course:
+# Purpose: A Bioinformatics Course:
-#              R code accompanying the RPR-Introduction unit
+#              R code accompanying the RPR-Introduction unit
-#
+#
-# Version: 1.0
+# Version: 1.0
-#
+#
-# Date:    2020-09-18
+# Date:    2020-09-18
-# Author:  Boris Steipe (boris.steipe@utoronto.ca)
+# Author:  Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# V 1.0    Updated workflow; live
+# V 1.0    Updated workflow; live
-# V 0.1    First code
+# V 0.1    First code
-#
+#
-# TODO:
+# TODO:
-#
+#
-#
+#
-# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
+# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
-#
+#
-# DO NOT SIMPLY  source()  THESE FILES!
+# DO NOT SIMPLY  source()  THESE FILES!
-
+
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-#  going on. That's not how it works ...
+#  going on. That's not how it works ...
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-# === TASK: Local script
+# === TASK: Local script
-#
+#
-# - Open the file myScript.R
+# - Open the file myScript.R
-#
+#
-# - Create a section header with a date.
+# - Create a section header with a date.
-# - Enter an R-expression that will produce the first 11 powers of 2 (starting
+# - Enter an R-expression that will produce the first 11 powers of 2 (starting
-#     from 0). Not a loop - a single expression. The first number you get must
+#     from 0). Not a loop - a single expression. The first number you get must
-#     be 1. The last number you get must be 1024.
+#     be 1. The last number you get must be 1024.
-#
+#
-# - Save the file in the myScripts folder, and close it.
+# - Save the file in the myScripts folder, and close it.
-#
+#
-# - Open the file again, select the expression and type Cmd+Enter (or Cmd+R)
+# - Open the file again, select the expression and type Cmd+Enter (or Cmd+R)
-#   to execute it.
+#   to execute it.
-#
+#
-# - Done
+# - Done
-
+
-# (This task is meant  to make sure that writing R expressions, saving
+# (This task is meant  to make sure that writing R expressions, saving
-#  them in scripts, opening script files and executing code in the file works
+#  them in scripts, opening script files and executing code in the file works
-#  for you. If there is an issue, get in touch.)
+#  for you. If there is an issue, get in touch.)
-
+
-
+
-
+
-# [END]
+# [END]
--- a/RPR-PROSITE_POST.R
+++ b/RPR-PROSITE_POST.R
@ -1,168 +1,168 @@
-# tocID <- "RPR-PROSITE_POST.R"
+# tocID <- "RPR-PROSITE_POST.R"
-#
+#
-# Purpose:  A Bioinformatics Course:
+# Purpose:  A Bioinformatics Course:
-#              R code accompanying the RPR-Scripting_data_downloads unit.
+#              R code accompanying the RPR-Scripting_data_downloads unit.
-#
+#
-# Version:  1.2
+# Version:  1.2
-#
+#
-# Date:     2017-10  -  2020-09
+# Date:     2017-10  -  2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions:
+# Versions:
-#           1.2    2020 Maintenance
+#           1.2    2020 Maintenance
-#           1.1    Change from require() to requireNamespace(),
+#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout,
+#                      use <package>::<function>() idiom throughout,
-#           1.0.1  Updates for slightly changed interfaces
+#           1.0.1  Updates for slightly changed interfaces
-#           1.0    First ABC units version
+#           1.0    First ABC units version
-#           0.1    First code copied from 2016 material.
+#           0.1    First code copied from 2016 material.
-#
+#
-#
+#
-# TODO:
+# TODO:
-#
+#
-#
+#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
+#
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
+# going on. That's not how it works ...
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC> 
+#TOC> 
-#TOC>   Section  Title                                                 Line
+#TOC>   Section  Title                                                 Line
-#TOC> ---------------------------------------------------------------------
+#TOC> ---------------------------------------------------------------------
-#TOC>   1        Constructing a POST command from a Web query            43
+#TOC>   1        Constructing a POST command from a Web query            43
-#TOC>   1.1        Task - fetchPrositeFeatures() function               148
+#TOC>   1.1        Task - fetchPrositeFeatures() function               148
-#TOC>   2        Task solutions                                         156
+#TOC>   2        Task solutions                                         156
-#TOC> 
+#TOC> 
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  Constructing a POST command from a Web query  ========================
+# =    1  Constructing a POST command from a Web query  ========================
-
+
-
+
-if (! requireNamespace("httr", quietly = TRUE)) {
+if (! requireNamespace("httr", quietly = TRUE)) {
-  install.packages("httr")
+  install.packages("httr")
-}
+}
-# Package information:
+# Package information:
-#  library(help = httr)       # basic information
+#  library(help = httr)       # basic information
-#  browseVignettes("httr")    # available vignettes
+#  browseVignettes("httr")    # available vignettes
-#  data(package = "httr")     # available datasets
+#  data(package = "httr")     # available datasets
-
+
-
+
-
+
-
+
-# We have reverse engineered the Web form for a ScanProsite request, and can
+# We have reverse engineered the Web form for a ScanProsite request, and can
-# construct a valid POST request from knowing the required field names. The POST
+# construct a valid POST request from knowing the required field names. The POST
-# command is similar to GET(), but we need an explicit request body that
+# command is similar to GET(), but we need an explicit request body that
-# contains a list of key/value pairs
+# contains a list of key/value pairs
-
+
-UniProtID <- "P39678"
+UniProtID <- "P39678"
-
+
-URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
+URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
-
+
-response <- httr::POST(URL,
+response <- httr::POST(URL,
-                       body = list(meta = "opt1",
+                       body = list(meta = "opt1",
-                                   meta1_protein = "opt1",
+                                   meta1_protein = "opt1",
-                                   seq = UniProtID,
+                                   seq = UniProtID,
-                                   skip = "on",
+                                   skip = "on",
-                                   output = "tabular"))
+                                   output = "tabular"))
-
+
-# Send off this request, and you should have a response in a few
+# Send off this request, and you should have a response in a few
-# seconds. Let's check the status first:
+# seconds. Let's check the status first:
-
+
-httr::status_code(response)  # If this is not 200, something went wrong and it
+httr::status_code(response)  # If this is not 200, something went wrong and it
-                             # makes no sense to continue. If this persists, ask
+                             # makes no sense to continue. If this persists, ask
-                             # on the Discussion Board what to do.
+                             # on the Discussion Board what to do.
-
+
-
+
-# The text contents of the response is available with the
+# The text contents of the response is available with the
-# content() function:
+# content() function:
-httr::content(response, "text")
+httr::content(response, "text")
-
+
-# ... should show you the same as the page contents that you have seen in the
+# ... should show you the same as the page contents that you have seen in the
-# browser. Now we need to extract the data from the page. For this simple
+# browser. Now we need to extract the data from the page. For this simple
-# example we can get away with using regular expressions, but in general we need
+# example we can get away with using regular expressions, but in general we need
-# a real XML parser to parse HTML. We'll cover that in a later unit. Here, we
+# a real XML parser to parse HTML. We'll cover that in a later unit. Here, we
-# strsplit() the response into individual lines, since each of our data elements
+# strsplit() the response into individual lines, since each of our data elements
-# is on its own line, and then capture the contents. The way Prosite has
+# is on its own line, and then capture the contents. The way Prosite has
-# formatted their HTML we can simply split on the "\\n" newline character - but
+# formatted their HTML we can simply split on the "\\n" newline character - but
-# they could write the same valid HTML without any newline-characters at all.
+# they could write the same valid HTML without any newline-characters at all.
-# Understand that we are working with a bit of a "hack" here: exploting
+# Understand that we are working with a bit of a "hack" here: exploiting
-# empirical assumptions rather than a formal specification. But sometimes quick
+# empirical assumptions rather than a formal specification. But sometimes quick
-# and dirty is fine, because quick.
+# and dirty is fine, because quick.
-
+
-lines <- unlist(strsplit(httr::content(response, "text"), "\\n"))
+lines <- unlist(strsplit(httr::content(response, "text"), "\\n"))
-head(lines)
+head(lines)
-
+
-# Now we define a query pattern for the lines we want:
+# Now we define a query pattern for the lines we want:
-# we can use the uID, bracketed by two "|" pipe
+# we can use the uID, bracketed by two "|" pipe
-# characters:
+# characters:
-
+
-patt <- sprintf("\\|%s\\|", UniProtID)
+patt <- sprintf("\\|%s\\|", UniProtID)
-
+
-# ... and select only the lines that match this
+# ... and select only the lines that match this
-# pattern:
+# pattern:
-
+
-( lines <- lines[grep(patt, lines)] )
+( lines <- lines[grep(patt, lines)] )
-
+
-# ... captures the three lines of output.
+# ... captures the three lines of output.
-
+
-# Now we break the lines apart into tokens: this is another application of
+# Now we break the lines apart into tokens: this is another application of
-# strsplit(), but this time we split either on "pipe" characters, "|" OR on tabs
+# strsplit(), but this time we split either on "pipe" characters, "|" OR on tabs
-# "\t". Look at the regex "\\t|\\|" in the strsplit() call:
+# "\t". Look at the regex "\\t|\\|" in the strsplit() call:
-
+
-unlist(strsplit(lines[1], "\\t|\\|"))
+unlist(strsplit(lines[1], "\\t|\\|"))
-
+
-# Its parts are (\\t)=tab (|)=or (\\|)=pipe. Both "t" and "|" need to be escaped
+# Its parts are (\\t)=tab (|)=or (\\|)=pipe. Both "t" and "|" need to be escaped
-# with a backslash. "t" has to be escaped because we want to match a tab (\t),
+# with a backslash. "t" has to be escaped because we want to match a tab (\t),
-# not the literal character "t". And "|" has to be escaped because we mean the
+# not the literal character "t". And "|" has to be escaped because we mean the
-# literal pipe character, not its metacharacter meaning OR. Thus sometimes the
+# literal pipe character, not its metacharacter meaning OR. Thus sometimes the
-# backslash turns a special meaning off, and sometimes it turns a special
+# backslash turns a special meaning off, and sometimes it turns a special
-# meaning on. Unfortunately there's no easy way to tell - you just need to
+# meaning on. Unfortunately there's no easy way to tell - you just need to
-# remember the characters - or have a reference handy. The metacharacters are
+# remember the characters - or have a reference handy. The metacharacters are
-# (){}[]^$?*+.|&-   ... and some of them have different meanings depending on
+# (){}[]^$?*+.|&-   ... and some of them have different meanings depending on
-# where in the regex they are.
+# where in the regex they are.
-
+
-# Let's put the tokens into named slots of a data frame
+# Let's put the tokens into named slots of a data frame
-
+
-features <- data.frame()
+features <- data.frame()
-for (line in lines) {
+for (line in lines) {
-  tokens <- unlist(strsplit(line, "\\t|\\|"))
+  tokens <- unlist(strsplit(line, "\\t|\\|"))
-  features <- rbind(features,
+  features <- rbind(features,
-                    data.frame(uID   =  tokens[2],
+                    data.frame(uID   =  tokens[2],
-                               start =  as.numeric(tokens[4]),
+                               start =  as.numeric(tokens[4]),
-                               end   =  as.numeric(tokens[5]),
+                               end   =  as.numeric(tokens[5]),
-                               psID  =  tokens[6],
+                               psID  =  tokens[6],
-                               psName = tokens[7],
+                               psName = tokens[7],
-                               psSeq  = tokens[11]))
+                               psSeq  = tokens[11]))
-}
+}
-features
+features
-
+
-#  This forms the base of a function that collects the features automatically
+#  This forms the base of a function that collects the features automatically
-#  from a PrositeScan result. You can write this!
+#  from a PrositeScan result. You can write this!
-
+
-
+
-# ==   1.1  Task - fetchPrositeFeatures() function  ============================
+# ==   1.1  Task - fetchPrositeFeatures() function  ============================
-
+
-
+
-# Task: write a function that takes as input a UniProt ID, fetches the
+# Task: write a function that takes as input a UniProt ID, fetches the
-# features it contains from ScanProsite and returns a data frame as given above, or
+# features it contains from ScanProsite and returns a data frame as given above, or
-# an empty data frame if there is an error.
+# an empty data frame if there is an error.
-
+
-
+
-# =    2  Task solutions  ======================================================
+# =    2  Task solutions  ======================================================
-
+
-
+
-# I have placed such a function into the ABC-dbUtilities.R script: look it up by
+# I have placed such a function into the ABC-dbUtilities.R script: look it up by
-# clicking on  dbFetchPrositeFeatures() in the Environment pane.
+# clicking on  dbFetchPrositeFeatures() in the Environment pane.
-
+
-# Test:
+# Test:
-dbFetchPrositeFeatures("Q5KMQ9")
+dbFetchPrositeFeatures("Q5KMQ9")
-
+
-
+
-
+
-
+
-# [END]
+# [END]
--- a/RPR-Pipe.R
+++ b/RPR-Pipe.R
@ -1,135 +1,135 @@
-# tocID <- "RPR-Pipe.R"
+# tocID <- "RPR-Pipe.R"
-#
+#
-# Purpose:  A Bioinformatics Course:
+# Purpose:  A Bioinformatics Course:
-#              Discussing pipe operators.
+#              Discussing pipe operators.
-#
+#
-# Version:  1.0
+# Version:  1.0
-#
+#
-# Date:     2021  10
+# Date:     2021  10
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions:
+# Versions:
-#           1.0    New code
+#           1.0    New code
-#
+#
-#
+#
-# TODO:
+# TODO:
-#   - find more interesting examples
+#   - find more interesting examples
-#
+#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
+#
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
+# going on. That's not how it works ...
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC>
+#TOC>
-#TOC>   Section  Title                            Line
+#TOC>   Section  Title                            Line
-#TOC> ------------------------------------------------
+#TOC> ------------------------------------------------
-#TOC>   1        Pipe  Concept                      41
+#TOC>   1        Pipe  Concept                      41
-#TOC>   2        Nested Expression                  73
+#TOC>   2        Nested Expression                  73
-#TOC>   3        magrittr:: Pipe                    78
+#TOC>   3        magrittr:: Pipe                    78
-#TOC>   4        Base R Pipe                        93
+#TOC>   4        Base R Pipe                        93
-#TOC>   5        Intermediate Assignment           108
+#TOC>   5        Intermediate Assignment           108
-#TOC>   6        Postscript                        127
+#TOC>   6        Postscript                        127
-#TOC>
+#TOC>
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  Pipe  Concept  =======================================================
+# =    1  Pipe  Concept  =======================================================
-
+
-# Pipes are actually an awesome idea for any code that implements a workflow -
+# Pipes are actually an awesome idea for any code that implements a workflow -
-# a sequence of operations, each of which transforms data in a specialized way.
+# a sequence of operations, each of which transforms data in a specialized way.
-#
+#
-# This principle is familiar from maths: chained functions. If have a function
+# This principle is familiar from maths: chained functions. If I have a function
-# y = f(x) and want to use those results as in z = g(y), I can just write
+# y = f(x) and want to use those results as in z = g(y), I can just write
-# z = g(f(x))
+# z = g(f(x))
-#
+#
-# On the unix command line, pipes were used from the very beginning, implemented
+# On the unix command line, pipes were used from the very beginning, implemented
-# with the "|" pipe character.
+# with the "|" pipe character.
-#
+#
-# In R, the magrittr package provided the %>% operator, and recently the |>
+# In R, the magrittr package provided the %>% operator, and recently the |>
-# operator has been introduced into base R.
+# operator has been introduced into base R.
-#
+#
-# However there are alternatives: intermediate assignment, and nested functions
+# However there are alternatives: intermediate assignment, and nested functions
-# that have always existed in base R anyway.
+# that have always existed in base R anyway.
-#
+#
-# Let us look at an example. In writing this, I found out that virtually
+# Let us look at an example. In writing this, I found out that virtually
-# ALL non-trivial examples I came up with don't translate well into this idiom
+# ALL non-trivial examples I came up with don't translate well into this idiom
-# at all. It is actually quite limited to simple filtering operations on
+# at all. It is actually quite limited to simple filtering operations on
-# data. A more interesting example might be added in the future, let me know if
+# data. A more interesting example might be added in the future, let me know if
-# you have a good idea.
+# you have a good idea.
-#
+#
-# A somewhat contrived example is to sort a list of files by the
+# A somewhat contrived example is to sort a list of files by the
-# length of the file names:
+# length of the file names:
-
+
-myFiles <- list.files(pattern = "\\.R$")
+myFiles <- list.files(pattern = "\\.R$")
-
+
-# nchar() gives the number of characters in a string, order() produces indices
+# nchar() gives the number of characters in a string, order() produces indices
-# that map an array to its sorted form.
+# that map an array to its sorted form.
-#
+#
-# =    2  Nested Expression  ===================================================
+# =    2  Nested Expression  ===================================================
-
+
-myFiles[order(nchar(myFiles))]
+myFiles[order(nchar(myFiles))]
-
+
-
+
-# =    3  magrittr:: Pipe  =====================================================
+# =    3  magrittr:: Pipe  =====================================================
-
+
-if (! requireNamespace("magrittr", quietly = TRUE)) {
+if (! requireNamespace("magrittr", quietly = TRUE)) {
-  install.packages("magrittr")
+  install.packages("magrittr")
-}
+}
-# Package information:
+# Package information:
-#  library(help = magrittr)       # basic information
+#  library(help = magrittr)       # basic information
-#  browseVignettes("magrittr")    # available vignettes
+#  browseVignettes("magrittr")    # available vignettes
-#  data(package = "magrittr")     # available datasets
+#  data(package = "magrittr")     # available datasets
-
+
-
+
-library(magrittr)
+library(magrittr)
-
+
-myFiles  %>% nchar %>% order %>% myFiles[.]
+myFiles  %>% nchar %>% order %>% myFiles[.]
-
+
-# =    4  Base R Pipe  =========================================================
+# =    4  Base R Pipe  =========================================================
-
+
-# Since version 4.1, base R now supports a pipe operator without the need
+# Since version 4.1, base R now supports a pipe operator without the need
-# to load a special package. Such an introductions of external functionality
+# to load a special package. Such an introduction of external functionality
-# into the language is very rare.
+# into the language is very rare.
-#
+#
-# Unfortunately it won't (yet) work with the '[' function, so we need to write
+# Unfortunately it won't (yet) work with the '[' function, so we need to write
-# an intermediate function for this example
+# an intermediate function for this example
-extract <- function(x, v) {
+extract <- function(x, v) {
-  return(v[x])
+  return(v[x])
-}
+}
-
+
-myFiles |> nchar() |> order() |> extract(myFiles)
+myFiles |> nchar() |> order() |> extract(myFiles)
-
+
-
+
-# =    5  Intermediate Assignment  =============================================
+# =    5  Intermediate Assignment  =============================================
-
+
-# So what's the problem? As you can see, the piped code may be concise and
+# So what's the problem? As you can see, the piped code may be concise and
-# expressive. But there is also a large amount of implicit assignment and
+# expressive. But there is also a large amount of implicit assignment and
-# processing going on and that is usually a bad idea because it makes code hard
+# processing going on and that is usually a bad idea because it makes code hard
-# to maintain. I am NOT a big fan of the nested syntax, but I don't think that
+# to maintain. I am NOT a big fan of the nested syntax, but I don't think that
-# replacing it with the pipe makes things much better. My preferred idiom is
+# replacing it with the pipe makes things much better. My preferred idiom is
-# to use intermediate assignments. Only then is it convenient to examine
+# to use intermediate assignments. Only then is it convenient to examine
-# the code step by step and validate every single step. And that is the most
+# the code step by step and validate every single step. And that is the most
-# important objective at all: no code is good if it does not compute
+# important objective of all: no code is good if it does not compute
-# correctly.
+# correctly.
-
+
-
+
-x <- nchar(myFiles)
+x <- nchar(myFiles)
-x <- order(x)
+x <- order(x)
-myFiles[x]
+myFiles[x]
-
+
-
+
-
+
-# =    6  Postscript  ==========================================================
+# =    6  Postscript  ==========================================================
-
+
-# I tried to write an example that strips all comments from a list of files, and
+# I tried to write an example that strips all comments from a list of files, and
-# another example that finds all files that were not yet updated this year
+# another example that finds all files that were not yet updated this year
-# (according to the "# Date: in the header). Neither examples can be well
+# (according to the "# Date:" in the header). Neither example can be well
-# written without intermediate assignments, or at least sapply() functions
+# written without intermediate assignments, or at least sapply() functions
-# that are not simpler at all than the intermediate assignment.
+# that are not simpler at all than the intermediate assignment.
-
+
-# [END]
+# [END]
--- a/RPR-RegEx.R
+++ b/RPR-RegEx.R
@ -1,180 +1,180 @@
-# tocID <- "RPR-RegEx.R"
+# tocID <- "RPR-RegEx.R"
-#
+#
-# Purpose: A Bioinformatics Course:
+# Purpose: A Bioinformatics Course:
-#              R code accompanying the RPR-RegEx unit
+#              R code accompanying the RPR-RegEx unit
-#
+#
-# Version: 1.0
+# Version: 1.0
-#
+#
-# Date:    2017-08  -  2020-09
+# Date:    2017-08  -  2020-09
-# Author:  Boris Steipe (boris.steipe@utoronto.ca)
+# Author:  Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# V 0.1    Maintenance 2020
+# V 0.1    Maintenance 2020
-# V 0.1    First code
+# V 0.1    First code
-#
+#
-# TODO:
+# TODO:
-#
+#
-#
+#
-# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
+# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
-#
+#
-# DO NOT SIMPLY  source()  THESE FILES!
+# DO NOT SIMPLY  source()  THESE FILES!
-#
+#
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-#  going on. That's not how it works ...
+#  going on. That's not how it works ...
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC>
+#TOC>
-#TOC>   Section  Title                                Line
+#TOC>   Section  Title                                Line
-#TOC> ----------------------------------------------------
+#TOC> ----------------------------------------------------
-#TOC>   1        A regex example                        41
+#TOC>   1        A regex example                        41
-#TOC>   2        Counting lines                        108
+#TOC>   2        Counting lines                        108
-#TOC>   2.1        Counting C-alpha atoms only         126
+#TOC>   2.1        Counting C-alpha atoms only         126
-#TOC>   3        Code Solutions                        142
+#TOC>   3        Code Solutions                        142
-#TOC>   3.1        Counting atoms                      144
+#TOC>   3.1        Counting atoms                      144
-#TOC>   3.2        Counting C-alpha records            160
+#TOC>   3.2        Counting C-alpha records            160
-#TOC>
+#TOC>
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  A regex example  =====================================================
+# =    1  A regex example  =====================================================
-
+
-# The canonical FASTA version of yeast Mbp1 at Uniprot
+# The canonical FASTA version of yeast Mbp1 at Uniprot
-s <- ">sp|P39678|MBP1_YEAST Transcription factor MBP1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) GN=MBP1 PE=1 SV=1
+s <- ">sp|P39678|MBP1_YEAST Transcription factor MBP1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) GN=MBP1 PE=1 SV=1
-MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK
+MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK
-ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA
+ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA
-SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR
+SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR
-KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ
+KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ
-QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS
+QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS
-PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY
+PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY
-FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS
+FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS
-IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP
+IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP
-SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT
+SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT
-ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP
+ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP
-VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK
+VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK
-IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR
+IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR
-QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK
+QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK
-IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"
+IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"
-
+
-nchar(s)
+nchar(s)
-# Must be 969
+# Must be 969
-
+
-# Task: Fetch the Uniprot ID by retrieving the first string that appears between
+# Task: Fetch the Uniprot ID by retrieving the first string that appears between
-# two vertical bars ("pipes") in the header record.
+# two vertical bars ("pipes") in the header record.
-#
+#
-
+
-# Develop the regular expression:
+# Develop the regular expression:
-                      # Just five characters returned, so we know we are using
+                      # Just five characters returned, so we know we are using
-patt <- "^>(.{5})"    # the right functions
+patt <- "^>(.{5})"    # the right functions
-regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
+regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
-
+
-patt <- "^>(.*)|"    # everything to the pipe character
+patt <- "^>(.*)|"    # everything to the pipe character
-regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
+regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
-
+
-# Ooops - "|" is a metacharacter - we must escape it
+# Ooops - "|" is a metacharacter - we must escape it
-
+
-patt <- "^>(.*)\|"    # using "\|"
+patt <- "^>(.*)\|"    # using "\|"
-# Ooops - that's not how we escape: must double the \ to send a literal
+# Ooops - that's not how we escape: must double the \ to send a literal
-# "\" plus the character "|" to the regex engine.
+# "\" plus the character "|" to the regex engine.
-
+
-patt <- "^>(.*)\\|"    # using "\\|"
+patt <- "^>(.*)\\|"    # using "\\|"
-regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
+regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
-
+
-# Good. Now let's first match everything that is not a "|", then match a "|"
+# Good. Now let's first match everything that is not a "|", then match a "|"
-patt <- "^>([^|]*)\\|"
+patt <- "^>([^|]*)\\|"
-regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
+regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
-
+
-# the same thing again, but capture the second match. And insist that there
+# the same thing again, but capture the second match. And insist that there
-# must be at least one character captured
+# must be at least one character captured
-
+
-patt <- "^>[^|]*\\|([^|]+)\\|"
+patt <- "^>[^|]*\\|([^|]+)\\|"
-# Analyze this pattern:
+# Analyze this pattern:
-#    ^           anchor the match at the beginning of the line
+#    ^           anchor the match at the beginning of the line
-#    >           ">" must be the first character
+#    >           ">" must be the first character
-#    [^|]*       all-characters-except-a-vertical-bar, 0 or more times because
+#    [^|]*       all-characters-except-a-vertical-bar, 0 or more times because
-#                  we don't know what other versions of the string "sp"
+#                  we don't know what other versions of the string "sp"
-#                  might appear. Note that within the brackets "|" is NOT a
+#                  might appear. Note that within the brackets "|" is NOT a
-#                  metacharacter.
+#                  metacharacter.
-#    \\|         "|" character: ouside of square brackets "|" is a metacharacter
+#    \\|         "|" character: outside of square brackets "|" is a metacharacter
-#                  and means "OR"; we need to escape it to match a literal "|".
+#                  and means "OR"; we need to escape it to match a literal "|".
-#    (           open parenthesis: capture what comes next ...
+#    (           open parenthesis: capture what comes next ...
-#       [^|]+    all-characters-except-a-vertical-bar, 1 or more times
+#       [^|]+    all-characters-except-a-vertical-bar, 1 or more times
-#    )           close parenthesis: stop capturing here
+#    )           close parenthesis: stop capturing here
-#    \\|           second "|" character, escaped
+#    \\|           second "|" character, escaped
-regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
+regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
-
+
-
+
-# =    2  Counting lines  ======================================================
+# =    2  Counting lines  ======================================================
-
+
-# Task: Write a function that returns the number of atoms in a PDB file. Call it
+# Task: Write a function that returns the number of atoms in a PDB file. Call it
-#       atomCount(). Sample data is here:
+#       atomCount(). Sample data is here:
-myPDB <- readLines("./data/0TST.pdb")
+myPDB <- readLines("./data/0TST.pdb")
-
+
-#       Specification:
+#       Specification:
-#       Read a file from its path given as the only argument.
+#       Read a file from its path given as the only argument.
-#       Return the number of lines in that file that begin with "ATOM  "
+#       Return the number of lines in that file that begin with "ATOM  "
-#       or with "HETATM".
+#       or with "HETATM".
-
+
-#       Try this. Write a function. Solution code is at the end of this file.
+#       Try this. Write a function. Solution code is at the end of this file.
-#       Don't peek.
+#       Don't peek.
-
+
-atomCount("./data/0TST.pdb")  # must return 6
+atomCount("./data/0TST.pdb")  # must return 6
-
+
-
+
-
+
-# ==   2.1  Counting C-alpha atoms only  =======================================
+# ==   2.1  Counting C-alpha atoms only  =======================================
-
+
-# Task: write a function based on the previous one that matches only CA records,
+# Task: write a function based on the previous one that matches only CA records,
-#       i.e. it can be used to count the number of amino acids. Don't get
+#       i.e. it can be used to count the number of amino acids. Don't get
-#       fooled by calcium atoms, or the string CA appearing elsewhere.
+#       fooled by calcium atoms, or the string CA appearing elsewhere.
-#       cf. https://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM
+#       cf. https://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM
-
+
-#       Specification:
+#       Specification:
-#       Read a file from its path given as the only argument.
+#       Read a file from its path given as the only argument.
-#       Return the number of lines in that file that have a C-alpha atom.
+#       Return the number of lines in that file that have a C-alpha atom.
-
+
-#       Try this. Solution code is at the end of this file. Don't peek.
+#       Try this. Solution code is at the end of this file. Don't peek.
-
+
-CAcount("./data/0TST.pdb")  # must return 1
+CAcount("./data/0TST.pdb")  # must return 1
-
+
-
+
-# =    3  Code Solutions  ======================================================
+# =    3  Code Solutions  ======================================================
-
+
-# ==   3.1  Counting atoms  ====================================================
+# ==   3.1  Counting atoms  ====================================================
-
+
-atomCount <- function(IN) {
+atomCount <- function(IN) {
-  # count the number of atoms in a PDB formatted file
+  # count the number of atoms in a PDB formatted file
-  # Parameters:
+  # Parameters:
-  #     IN  chr  path of the file to read
+  #     IN  chr  path of the file to read
-  # Value:
+  # Value:
-  #         numeric  number of lines that match "^ATOM  " or "^HETATM"
+  #         numeric  number of lines that match "^ATOM  " or "^HETATM"
-  # Note: the regex MUST be anchored to the beginning of the line, otherwise
+  # Note: the regex MUST be anchored to the beginning of the line, otherwise
-  # it might match somewhere in a comment!
+  # it might match somewhere in a comment!
-  x <- readLines(IN)
+  x <- readLines(IN)
-  patt <- "(^ATOM  )|(^HETATM)"
+  patt <- "(^ATOM  )|(^HETATM)"
-  return(length(grep(patt, x)))
+  return(length(grep(patt, x)))
-}
+}
-
+
-
+
-# ==   3.2  Counting C-alpha records  ==========================================
+# ==   3.2  Counting C-alpha records  ==========================================
-
+
-
+
-CAcount <- function(IN) {
+CAcount <- function(IN) {
-  # count the number of C-alpha atoms in a PDB formatted file
+  # count the number of C-alpha atoms in a PDB formatted file
-  # Parameters:
+  # Parameters:
-  #     IN  chr  path of the file to read
+  #     IN  chr  path of the file to read
-  # Value:
+  # Value:
-  #         numeric  number of lines that match " CA " in position 13 - 16 of
+  #         numeric  number of lines that match " CA " in position 13 - 16 of
-  #                  an ATOM record.
+  #                  an ATOM record.
-  # Note: the regex MUST be aligned into the right position, otherwise it
+  # Note: the regex MUST be aligned into the right position, otherwise it
-  #       might match Calcium records!
+  #       might match Calcium records!
-  x <- readLines(IN)
+  x <- readLines(IN)
-  patt <- "^ATOM  ...... CA "
+  patt <- "^ATOM  ...... CA "
-  return(length(grep(patt, x)))
+  return(length(grep(patt, x)))
-}
+}
-
+
-
+
-
+
-# [END]
+# [END]
--- a/RPR-SX-PDB.R
+++ b/RPR-SX-PDB.R
--- a/RPR-UniProt_GET.R
+++ b/RPR-UniProt_GET.R
@ -1,135 +1,135 @@
-# tocID <- "RPR-UniProt_GET.R"
+# tocID <- "RPR-UniProt_GET.R"
-#
+#
-# Purpose:  A Bioinformatics Course:
+# Purpose:  A Bioinformatics Course:
-#              R code accompanying the RPR-Scripting_data_downloads unit.
+#              R code accompanying the RPR-Scripting_data_downloads unit.
-#
+#
-# Version:  1.2
+# Version:  1.2
-#
+#
-# Date:     2017-10  -  2020-09
+# Date:     2017-10  -  2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions:
+# Versions:
-#           1.2    2020 Maintenance. Made dbFetchUniProtSeq() vector-safe and
+#           1.2    2020 Maintenance. Made dbFetchUniProtSeq() vector-safe and
-#                  added FASTA headers as attribute
+#                  added FASTA headers as attribute
-#           1.1    Change from require() to requireNamespace(),
+#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout
+#                      use <package>::<function>() idiom throughout
-#           1.0    First ABC units version
+#           1.0    First ABC units version
-#           0.1    First code copied from 2016 material.
+#           0.1    First code copied from 2016 material.
-#
+#
-#
+#
-# TODO:
+# TODO:
-#
+#
-#
+#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
+#
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
+# going on. That's not how it works ...
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC> 
+#TOC> 
-#TOC>   Section  Title                                      Line
+#TOC>   Section  Title                                      Line
-#TOC> ----------------------------------------------------------
+#TOC> ----------------------------------------------------------
-#TOC>   1        UniProt files via GET                        43
+#TOC>   1        UniProt files via GET                        43
-#TOC>   1.1        Task - fetchUniProtSeq() function         105
+#TOC>   1.1        Task - fetchUniProtSeq() function         105
-#TOC>   2        Task solutions                              118
+#TOC>   2        Task solutions                              118
-#TOC> 
+#TOC> 
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  UniProt files via GET  ===============================================
+# =    1  UniProt files via GET  ===============================================
-
+
-
+
-# Perhaps the simplest example of scripted download is to retrieve a protein
+# Perhaps the simplest example of scripted download is to retrieve a protein
-# FASTA sequence from UniProt. All we need is to construct an URL with the
+# FASTA sequence from UniProt. All we need is to construct an URL with the
-# correct UniProt ID.
+# correct UniProt ID.
-
+
-# An interface between R scripts and Web servers is provided by the httr::
+# An interface between R scripts and Web servers is provided by the httr::
-# package. This sends and receives information via the http protocol, just like
+# package. This sends and receives information via the http protocol, just like
-# a Web browser. Since this is a short and simple request, the GET verb is the
+# a Web browser. Since this is a short and simple request, the GET verb is the
-# right tool:
+# right tool:
-
+
-if (! requireNamespace("httr", quietly = TRUE)) {
+if (! requireNamespace("httr", quietly = TRUE)) {
-  install.packages("httr")
+  install.packages("httr")
-}
+}
-# Package information:
+# Package information:
-#  library(help = httr)       # basic information
+#  library(help = httr)       # basic information
-#  browseVignettes("httr")    # available vignettes
+#  browseVignettes("httr")    # available vignettes
-#  data(package = "httr")     # available datasets
+#  data(package = "httr")     # available datasets
-
+
-
+
-# The UniProt ID for Mbp1 is ...
+# The UniProt ID for Mbp1 is ...
-
+
-UniProtID <- "P39678"
+UniProtID <- "P39678"
-
+
-# and the base URL to retrieve data is  ...
+# and the base URL to retrieve data is  ...
-# http://www.uniprot.org/uniprot/ . We can construct a simple URL to
+# http://www.uniprot.org/uniprot/ . We can construct a simple URL to
-# retrieve a FASTA sequence:
+# retrieve a FASTA sequence:
-
+
-(URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID))
+(URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID))
-
+
-# the GET() function from httr will get the data.
+# the GET() function from httr will get the data.
-response <- httr::GET(URL)
+response <- httr::GET(URL)
-
+
-str(response) # the response object is a bit complex ...
+str(response) # the response object is a bit complex ...
-as.character(response) # ... but it is easy to pull out the data.
+as.character(response) # ... but it is easy to pull out the data.
-
+
-# to process  ...
+# to process  ...
-x <- as.character(response)
+x <- as.character(response)
-x <- strsplit(x, "\n")
+x <- strsplit(x, "\n")
-dbSanitizeSequence(x)
+dbSanitizeSequence(x)
-
+
-# Simple.
+# Simple.
-# But what happens if there is an error, e.g. the uniprot ID does not exist?
+# But what happens if there is an error, e.g. the uniprot ID does not exist?
-
+
-response <- httr::GET("http://www.uniprot.org/uniprot/X000000.fasta")
+response <- httr::GET("http://www.uniprot.org/uniprot/X000000.fasta")
-as.character(response)
+as.character(response)
-# this is a large HTML page that tells us the URL was not found. So we need to
+# this is a large HTML page that tells us the URL was not found. So we need to
-# check for errors.  The Right Way to do this is to evaluate the staus code that
+# check for errors.  The Right Way to do this is to evaluate the status code that
-# every Web server returns for every transaction.
+# every Web server returns for every transaction.
-#
+#
-httr::status_code(response)  # 404 == Page Not Found
+httr::status_code(response)  # 404 == Page Not Found
-
+
-# There are many possible codes, but the only code we will be happy with
+# There are many possible codes, but the only code we will be happy with
-# is 200 - oK.
+# is 200 - oK.
-# (cf. https://en.wikipedia.org/wiki/List_of_HTTP_status_codes )
+# (cf. https://en.wikipedia.org/wiki/List_of_HTTP_status_codes )
-
+
-URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID)
+URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID)
-response <- httr::GET(URL)
+response <- httr::GET(URL)
-httr::status_code(response)
+httr::status_code(response)
-
+
-
+
-# ==   1.1  Task - fetchUniProtSeq() function  =================================
+# ==   1.1  Task - fetchUniProtSeq() function  =================================
-
+
-# Task: write a function that
+# Task: write a function that
-#   - takes as input a vector of UniProt IDs,
+#   - takes as input a vector of UniProt IDs,
-#   - fetches the FASTA sequence for each
+#   - fetches the FASTA sequence for each
-#   - returns a vector of the same length as the input, where an element is:
+#   - returns a vector of the same length as the input, where an element is:
-#   -  ...  the sequence, if the query was successful
+#   -  ...  the sequence, if the query was successful
-#   -  ...  NA if there was an error
+#   -  ...  NA if there was an error
-#   - each element has the UniProt ID as the name()
+#   - each element has the UniProt ID as the name()
-#   - bonus: the output has an attribute "headers" that is a vector of the
+#   - bonus: the output has an attribute "headers" that is a vector of the
-#            FASTA headers ( cf. ?attr )
+#            FASTA headers ( cf. ?attr )
-
+
-
+
-# =    2  Task solutions  ======================================================
+# =    2  Task solutions  ======================================================
-
+
-
+
-# I have placed such a function - dbFetchUniProtSeq() - into
+# I have placed such a function - dbFetchUniProtSeq() - into
-# "./scripts/ABC-dbUtilities.R": look it up by clicking on  dbFetchUniProtSeq()
+# "./scripts/ABC-dbUtilities.R": look it up by clicking on  dbFetchUniProtSeq()
-# in the Environment pane.
+# in the Environment pane.
-
+
-# Test this:
+# Test this:
-( x <- dbFetchUniProtSeq("P39678") )
+( x <- dbFetchUniProtSeq("P39678") )
-names(x)[1]
+names(x)[1]
-attr(x, "headers")[1]
+attr(x, "headers")[1]
-x[1]
+x[1]
-cat(writeFASTA(data.frame(head = attr(x, "headers")[1], seq  =x[1]),
+cat(writeFASTA(data.frame(head = attr(x, "headers")[1], seq  =x[1]),
-               width = 40), sep = "\n")
+               width = 40), sep = "\n")
-
+
-
+
-
+
-# [END]
+# [END]
--- a/RPR-Unit_testing.R
+++ b/RPR-Unit_testing.R
@ -1,234 +1,234 @@
-# tocID <- "RPR-Unit_testing.R"
+# tocID <- "RPR-Unit_testing.R"
-#
+#
-# Purpose:  A Bioinformatics Course:
+# Purpose:  A Bioinformatics Course:
-#              R code accompanying the RPR-Unit_testing unit.
+#              R code accompanying the RPR-Unit_testing unit.
-#
+#
-# Version:  1.2
+# Version:  1.2
-#
+#
-# Date:     2017  10  -  2019  01
+# Date:     2017  10  -  2019  01
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions:
+# Versions:
-#           1.2    2020 Updates. Discuss local tests.
+#           1.2    2020 Updates. Discuss local tests.
-#           1.1    Change from require() to requireNamespace()
+#           1.1    Change from require() to requireNamespace()
-#           1.0    New code
+#           1.0    New code
-#
+#
-#
+#
-# TODO:
+# TODO:
-#
+#
-#
+#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
+#
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
+# going on. That's not how it works ...
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC> 
+#TOC> 
-#TOC>   Section  Title                             Line
+#TOC>   Section  Title                             Line
-#TOC> -------------------------------------------------
+#TOC> -------------------------------------------------
-#TOC>   1        Unit Tests with testthat            42
+#TOC>   1        Unit Tests with testthat            42
-#TOC>   2        Organizing your tests              165
+#TOC>   2        Organizing your tests              165
-#TOC>   2.1        Testing scripts                  189
+#TOC>   2.1        Testing scripts                  189
-#TOC>   2.2        Rethinking testing               202
+#TOC>   2.2        Rethinking testing               202
-#TOC>   3        Task solutions                     220
+#TOC>   3        Task solutions                     220
-#TOC> 
+#TOC> 
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  Unit Tests with testthat  ============================================
+# =    1  Unit Tests with testthat  ============================================
-
+
-# The testthat package supports writing and executing unit tests in many ways.
+# The testthat package supports writing and executing unit tests in many ways.
-
+
-if (! requireNamespace("testthat", quietly = TRUE)) {
+if (! requireNamespace("testthat", quietly = TRUE)) {
-  install.packages("testthat")
+  install.packages("testthat")
-}
+}
-# Package information:
+# Package information:
-#  library(help = testthat)       # basic information
+#  library(help = testthat)       # basic information
-#  browseVignettes("testthat")    # available vignettes
+#  browseVignettes("testthat")    # available vignettes
-#  data(package = "testthat")     # available datasets
+#  data(package = "testthat")     # available datasets
-
+
-# testthat is one of those packages that we either use A LOT in a script,
+# testthat is one of those packages that we either use A LOT in a script,
-# or not at all. Therefore it's more reasonable to depart from our usual
+# or not at all. Therefore it's more reasonable to depart from our usual
-# <package>::<function>() idiom, and load the entire library. In fact, if
+# <package>::<function>() idiom, and load the entire library. In fact, if
-# we author packages, it is common practice to load testthat in the part
+# we author packages, it is common practice to load testthat in the part
-# of the package that automates testing.
+# of the package that automates testing.
-
+
-library(testthat)
+library(testthat)
-
+
-# An atomic test consists of an expectation about the bahaviour of a function or
+# An atomic test consists of an expectation about the behaviour of a function or
-# the existence of an object. testthat provides a number of useful expectations:
+# the existence of an object. testthat provides a number of useful expectations:
-
+
-# At the most basic level, you can use expect_true() and expect_false():
+# At the most basic level, you can use expect_true() and expect_false():
-
+
-expect_true(file.exists("./data/S288C_YDL056W_MBP1_coding.fsa"))
+expect_true(file.exists("./data/S288C_YDL056W_MBP1_coding.fsa"))
-expect_true(file.exists("NO-SUCH-FILE.txt"))
+expect_true(file.exists("NO-SUCH-FILE.txt"))
-
+
-expect_false(is.integer(NA))
+expect_false(is.integer(NA))
-
+
-# More commonly, you will test for equality of an output with a given result.
+# More commonly, you will test for equality of an output with a given result.
-# But you need to consider what it means for two numbers to be "equal" on a
+# But you need to consider what it means for two numbers to be "equal" on a
-# digital computer. Consider:
+# digital computer. Consider:
-
+
-49*(1/49) == 1      # Surprised? Read FAQ 7.31
+49*(1/49) == 1      # Surprised? Read FAQ 7.31
-                    # https://cran.r-project.org/doc/FAQ/R-FAQ.html
+                    # https://cran.r-project.org/doc/FAQ/R-FAQ.html
-49*(1/49) - 1       # NOT zero (but almost)
+49*(1/49) - 1       # NOT zero (but almost)
-
+
-# This is really unpredictable ...
+# This is really unpredictable ...
-0.1 + 0.05 == 0.15
+0.1 + 0.05 == 0.15
-0.2 + 0.07 == 0.27
+0.2 + 0.07 == 0.27
-
+
-# It's easy to be caught on the wrong foot with numeric comparisons, therefore
+# It's easy to be caught on the wrong foot with numeric comparisons, therefore
-# R uses the function all.equal() to test whether two numbers are equal for
+# R uses the function all.equal() to test whether two numbers are equal for
-# practical puposes up to machine precision.
+# practical purposes up to machine precision.
-49*(1/49) == 1
+49*(1/49) == 1
-all.equal(49*(1/49), 1)
+all.equal(49*(1/49), 1)
-
+
-# The testthat function expect_equal() uses all.equal internally:
+# The testthat function expect_equal() uses all.equal internally:
-expect_equal(49*(1/49), 1)
+expect_equal(49*(1/49), 1)
-
+
-# ... which is reasonable, or, if things MUST be exactly the same ...
+# ... which is reasonable, or, if things MUST be exactly the same ...
-expect_identical(49*(1/49), 1)
+expect_identical(49*(1/49), 1)
-
+
-# ... but consider:
+# ... but consider:
-expect_identical(2, 2L) # one is typeof() "double", the other is integer"
+expect_identical(2, 2L) # one is typeof() "double", the other is "integer"
-
+
-# Some very useful expectations are expect_warning(), and expect_error(), for
+# Some very useful expectations are expect_warning(), and expect_error(), for
-# constructing tests that check for erroneous output:
+# constructing tests that check for erroneous output:
-
+
-as.integer(c("1", "2", "three"))
+as.integer(c("1", "2", "three"))
-expect_warning(as.integer(c("1", "2", "three"))) # Note that the warning is NOT
+expect_warning(as.integer(c("1", "2", "three"))) # Note that the warning is NOT
-                                                 # printed.
+                                                 # printed.
-1/"x"
+1/"x"
-expect_warning(1/"x")
+expect_warning(1/"x")
-expect_error(1/"x")      # Again: note that the error is NOT printed, as well
+expect_error(1/"x")      # Again: note that the error is NOT printed, and
-                         # code execution will continue.
+                         # code execution will continue.
-
+
-# Even better, you can check if the warning or error is what you expect it
+# Even better, you can check if the warning or error is what you expect it
-# to be - because it could actually have occured somewhere else in your code.
+# to be - because it could actually have occurred somewhere else in your code.
-
+
-v <- c("1", "x")
+v <- c("1", "x")
-log(v[1:2])
+log(v[1:2])
-expect_error(log(v[1:2]), "non-numeric argument to mathematical function")
+expect_error(log(v[1:2]), "non-numeric argument to mathematical function")
-expect_error(log(v[1:2]), "non-numeric") # We can abbreviate the error message.
+expect_error(log(v[1:2]), "non-numeric") # We can abbreviate the error message.
-expect_error(log(v[1,2]))                # This appears oK, but ...
+expect_error(log(v[1,2]))                # This appears oK, but ...
-expect_error(log(v[1,2]), "non-numeric") # ... it's actually a different error!
+expect_error(log(v[1,2]), "non-numeric") # ... it's actually a different error!
-
+
-# Producing unit tests simply means: we define a function, and then we check
+# Producing unit tests simply means: we define a function, and then we check
-# whether all test pass. Consider a function that is loaded on startup from
+# whether all tests pass. Consider a function that is loaded on startup from
-# the .utilities.R script:
+# the .utilities.R script:
-
+
-biCode
+biCode
-
+
-# We could test it like so:
+# We could test it like so:
-
+
-expect_equal(biCode(""), ".....")
+expect_equal(biCode(""), ".....")
-expect_equal(biCode(" "), ".....")
+expect_equal(biCode(" "), ".....")
-expect_equal(biCode("123 12"), ".....")
+expect_equal(biCode("123 12"), ".....")
-expect_equal(biCode("h sapiens"), "H..SA")
+expect_equal(biCode("h sapiens"), "H..SA")
-expect_equal(biCode("homo sapiens"), "HOMSA")
+expect_equal(biCode("homo sapiens"), "HOMSA")
-expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA")
+expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA")
-expect_equal(biCode(c("Phascolarctos cinereus", "Macropus rufus")),
+expect_equal(biCode(c("Phascolarctos cinereus", "Macropus rufus")),
-             c("PHACI", "MACRU"))
+             c("PHACI", "MACRU"))
-expect_error(biCode(), "argument \"s\" is missing, with no default")
+expect_error(biCode(), "argument \"s\" is missing, with no default")
-
+
-# The test_that() function allows to group related tests, include an informative
+# The test_that() function allows to group related tests, include an informative
-# message which test is being executed, and run a number of tests that are
+# message which test is being executed, and run a number of tests that are
-# passed to the function inside a code block - i.e. {...}
+# passed to the function inside a code block - i.e. {...}
-# test_that("<descriptive string>, {<code block>})
+# test_that("<descriptive string>", {<code block>})
-
+
-test_that("NA values are preserved", {
+test_that("NA values are preserved", {
-  # bicode() respects vector length: input and output must have the smae length.
+  # biCode() respects vector length: input and output must have the same length.
-  # Therefore NA's can't be simply skipped, bust must be properly passed
+  # Therefore NA's can't be simply skipped, but must be properly passed
-  # into output:
+  # into output:
-  expect_true(is.na((biCode(NA))))
+  expect_true(is.na((biCode(NA))))
-  expect_equal(biCode(c("first", NA, "last")),
+  expect_equal(biCode(c("first", NA, "last")),
-               c("FIRST", NA, "LAST."))
+               c("FIRST", NA, "LAST."))
-})
+})
-
+
-
+
-# Task: Write a function calcGC() that calculates GC content in a sequence.
+# Task: Write a function calcGC() that calculates GC content in a sequence.
-#       Hint: you could strsplit() the sequence into a vector, and count
+#       Hint: you could strsplit() the sequence into a vector, and count
-#       G's and C's; or you could use gsub("[AT]", "", <sequence>) to remove
+#       G's and C's; or you could use gsub("[AT]", "", <sequence>) to remove
-#       A's and T's, and use nchar() before and after to calculate the content
+#       A's and T's, and use nchar() before and after to calculate the content
-#       from the length difference.
+#       from the length difference.
-#       Then write tests that:
+#       Then write tests that:
-#          confirm that calcGC("AATT") is 0;
+#          confirm that calcGC("AATT") is 0;
-#          confirm that calcGC("ATGC") is 0.5;
+#          confirm that calcGC("ATGC") is 0.5;
-#          confirm that calcGC("AC")   is 0.5;
+#          confirm that calcGC("AC")   is 0.5;
-#          confirm that calcGC("CGCG") is 1;
+#          confirm that calcGC("CGCG") is 1;
-
+
-
+
-# =    2  Organizing your tests  ===============================================
+# =    2  Organizing your tests  ===============================================
-
+
-
+
-# Tests are only useful if they are actually executed and we need to make sure
+# Tests are only useful if they are actually executed and we need to make sure
-# there are no barriers to do that. The testthat package supports automatic
+# there are no barriers to do that. The testthat package supports automatic
-# execution of tests:
+# execution of tests:
-#  - put your tests into an R-script,
+#  - put your tests into an R-script,
-#  - save your tests in a file called "test_<my-function-name>.R"
+#  - save your tests in a file called "test_<my-function-name>.R"
-#  - execute the test with test_file("test_<my-function-name>.R") ...
+#  - execute the test with test_file("test_<my-function-name>.R") ...
-#  ... or, if you are working on a project ...
+#  ... or, if you are working on a project ...
-#  - place the file in a test-directory (e.g. the directory "test" in this
+#  - place the file in a test-directory (e.g. the directory "tests" in this
-#      project),
+#      project),
-#  - execute all your tests with test_dir("<my-test-directory>")
+#  - execute all your tests with test_dir("<my-test-directory>")
-
+
-# For example I have provided a "tests" directory with this project, and
+# For example I have provided a "tests" directory with this project, and
-# placed the file "test_biCode.R" inside.
+# placed the file "test_biCode.R" inside.
-file.show("./tests/test_biCode.R")
+file.show("./tests/test_biCode.R")
-
+
-# Execute the file ...
+# Execute the file ...
-test_file("./tests/test_biCode.R")
+test_file("./tests/test_biCode.R")
-
+
-# .. or execute all the test files in the directory:
+# .. or execute all the test files in the directory:
-test_dir("./tests")
+test_dir("./tests")
-
+
-# ==   2.1  Testing scripts  ===================================================
+# ==   2.1  Testing scripts  ===================================================
-
+
-# Scripts need special consideration since we do not necessarily source() them
+# Scripts need special consideration since we do not necessarily source() them
-# entirely. Therefore automated testing is not reasonable. What you can do
+# entirely. Therefore automated testing is not reasonable. What you can do
-# instead is to place a conditional block at the end of your script, that
+# instead is to place a conditional block at the end of your script, that
-# never gets executed - then you can manually execute the code in the block
+# never gets executed - then you can manually execute the code in the block
-# whenever you wish to test your functions. For example:
+# whenever you wish to test your functions. For example:
-
+
-if (FALSE) {
+if (FALSE) {
-  # ... your tests go here
+  # ... your tests go here
-
+
-}
+}
-
+
-# ==   2.2  Rethinking testing  ================================================
+# ==   2.2  Rethinking testing  ================================================
-
+
-# However, it is important to keep in mind that different objectives lead to
+# However, it is important to keep in mind that different objectives lead to
-# different ideas of what works best. There is never a "best" in and of itself,
+# different ideas of what works best. There is never a "best" in and of itself,
-# the question is always: "Best for what?" While automated unit testing is a
+# the question is always: "Best for what?" While automated unit testing is a
-# great way to assure the integrity of packages and larger software artefacts as
+# great way to assure the integrity of packages and larger software artefacts as
-# they are being developed, more loosely conceived aggregates of code - like the
+# they are being developed, more loosely conceived aggregates of code - like the
-# scripts for this course for example - have different objectives and in this
+# scripts for this course for example - have different objectives and in this
-# case I find the testthat approach to actually be inferior. The reason is its
+# case I find the testthat approach to actually be inferior. The reason is its
-# tendency to physically separate code and tests. Keeping assets, and functions
+# tendency to physically separate code and tests. Keeping assets, and functions
-# that operate on those assets separated is always poor design. I have found
+# that operate on those assets separated is always poor design. I have found
-# over time that a more stable approach is to move individual functions into
+# over time that a more stable approach is to move individual functions into
-# their individual scripts, all in one folder, one function (and its helpers)
+# their individual scripts, all in one folder, one function (and its helpers)
-# per file, and examples, demos and tests in an if (FALSE) { ... } block, as
+# per file, and examples, demos and tests in an if (FALSE) { ... } block, as
-# explained above.
+# explained above.
-
+
-
+
-
+
-# =    3  Task solutions  ======================================================
+# =    3  Task solutions  ======================================================
-
+
-calcGC <- function(s) {
+calcGC <- function(s) {
-  s <- gsub("[^agctAGCT]", "", s)
+  s <- gsub("[^agctAGCT]", "", s)
-  return(nchar(gsub("[atAT]", "", s)) / nchar(s))
+  return(nchar(gsub("[atAT]", "", s)) / nchar(s))
-}
+}
-
+
-expect_equal(calcGC("AATT"), 0)
+expect_equal(calcGC("AATT"), 0)
-expect_equal(calcGC("ATGC"), 0.5)
+expect_equal(calcGC("ATGC"), 0.5)
-expect_equal(calcGC("AC"),   0.5)
+expect_equal(calcGC("AC"),   0.5)
-expect_equal(calcGC("CGCG"), 1)
+expect_equal(calcGC("CGCG"), 1)
-
+
-
+
-
+
-# [END]
+# [END]
--- a/RPR-eUtils_XML.R
+++ b/RPR-eUtils_XML.R
@ -1,166 +1,166 @@
-# tocID <- "RPR-eUtils_XML.R"
+# tocID <- "RPR-eUtils_XML.R"
-#
+#
-# Purpose:  A Bioinformatics Course:
+# Purpose:  A Bioinformatics Course:
-#              R code accompanying the RPR-Scripting_data_downloads unit.
+#              R code accompanying the RPR-Scripting_data_downloads unit.
-#
+#
-# Version:  1.2.1
+# Version:  1.2.1
-#
+#
-# Date:     2017-10  -  2021-09
+# Date:     2017-10  -  2021-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions:
+# Versions:
-#           1.2.1  2021 Maintenance
+#           1.2.1  2021 Maintenance
-#           1.2    2020 Updates
+#           1.2    2020 Updates
-#           1.1    Change from require() to requireNamespace(),
+#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout
+#                      use <package>::<function>() idiom throughout
-#           1.0    First ABC units version
+#           1.0    First ABC units version
-#           0.1    First code copied from 2016 material.
+#           0.1    First code copied from 2016 material.
-#
+#
-#
+#
-# TODO:
+# TODO:
-#
+#
-#
+#
-# == DO NOT SIMPLY  source()  THIS FILE! =======================================
+# == DO NOT SIMPLY  source()  THIS FILE! =======================================
-#
+#
-# If there are portions you don't understand, use R's help system, Google for an
+# If there are portions you don't understand, use R's help system, Google for an
-# answer, or ask your instructor. Don't continue if you don't understand what's
+# answer, or ask your instructor. Don't continue if you don't understand what's
-# going on. That's not how it works ...
+# going on. That's not how it works ...
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC> 
+#TOC> 
-#TOC>   Section  Title                                       Line
+#TOC>   Section  Title                                       Line
-#TOC> -----------------------------------------------------------
+#TOC> -----------------------------------------------------------
-#TOC>   1        Working with NCBI eUtils                      43
+#TOC>   1        Working with NCBI eUtils                      43
-#TOC>   1.1        Task - fetchNCBItaxData() function         145
+#TOC>   1.1        Task - fetchNCBItaxData() function         145
-#TOC>   2        Task solutions                               152
+#TOC>   2        Task solutions                               152
-#TOC> 
+#TOC> 
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  Working with NCBI eUtils  ============================================
+# =    1  Working with NCBI eUtils  ============================================
-
+
-
+
-# To begin, we load the xml2 package that contains functions
+# To begin, we load the xml2 package that contains functions
-# we need to receive and parse html data. NCBI's eUtils send information in
+# we need to receive and parse html data. NCBI's eUtils send information in
-# XML format so we need to be able to parse XML.
+# XML format so we need to be able to parse XML.
-if (! requireNamespace("xml2", quietly=TRUE)) {
+if (! requireNamespace("xml2", quietly=TRUE)) {
-  install.packages("xml2")
+  install.packages("xml2")
-}
+}
-# Package information:
+# Package information:
-#  library(help = xml2)       # basic information
+#  library(help = xml2)       # basic information
-#  browseVignettes("xml2")    # available vignettes
+#  browseVignettes("xml2")    # available vignettes
-#  data(package = "xml2")     # available datasets
+#  data(package = "xml2")     # available datasets
-
+
-
+
-
+
-# We will walk through the process with the refSeqID
+# We will walk through the process with the refSeqID
-# of yeast Mbp1
+# of yeast Mbp1
-refSeqID <- "NP_010227"
+refSeqID <- "NP_010227"
-
+
-
+
-# First we build a query URL...
+# First we build a query URL...
-eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
-
+
-
+
-# Then we assemble a URL that will search for and get the
+# Then we assemble a URL that will search for and get the
-# unique, NCBI internal identifier,
+# unique, NCBI internal identifier,
-# for our refSeqID...
+# for our refSeqID...
-URL <- paste(eUtilsBase,
+URL <- paste(eUtilsBase,
-             "esearch.fcgi?",     # ...using the esearch program
+             "esearch.fcgi?",     # ...using the esearch program
-                                  # that finds an entry in an
+                                  # that finds an entry in an
-                                  # NCBI database
+                                  # NCBI database
-             "db=protein",
+             "db=protein",
-             "&term=", refSeqID,
+             "&term=", refSeqID,
-             sep="")
+             sep="")
-# Copy the URL and paste it into your browser to see
+# Copy the URL and paste it into your browser to see
-# what the response should look like.
+# what the response should look like.
-URL
+URL
-
+
-# To fetch a response in R, we use the function read_xml()
+# To fetch a response in R, we use the function read_xml()
-# with our URL as its argument.
+# with our URL as its argument.
-( myXML <- xml2::read_xml(URL) )
+( myXML <- xml2::read_xml(URL) )
-
+
-# This is XML. We can take the response apart into
+# This is XML. We can take the response apart into
-# its individual components with the as_list() function.
+# its individual components with the as_list() function.
-
+
-xml2::as_list(myXML)
+xml2::as_list(myXML)
-
+
-# Note how the XML "tree" is represented as a list of
+# Note how the XML "tree" is represented as a list of
-# lists of lists ...
+# lists of lists ...
-# If we know exactly what element we are looking for,
+# If we know exactly what element we are looking for,
-# we can extract it from this structure:
+# we can extract it from this structure:
-xml2::as_list(myXML)[["eSearchResult"]][["IdList"]][["Id"]][[1]]
+xml2::as_list(myXML)[["eSearchResult"]][["IdList"]][["Id"]][[1]]
-
+
-# But this is not very robust, it would break with the
+# But this is not very robust, it would break with the
-# slightest change that the NCBI makes to their data format -
+# slightest change that the NCBI makes to their data format -
-# and the NCBI changes things A LOT!
+# and the NCBI changes things A LOT!
-
+
-# Somewhat more robust is to specify the type of element
+# Somewhat more robust is to specify the type of element
-# we want - it's the text contained in an <Id>...</Id>
+# we want - it's the text contained in an <Id>...</Id>
-# element, and use the XPath XML parsing language to
+# element, and use the XPath XML parsing language to
-# retrieve it.
+# retrieve it.
-
+
-xml2::xml_find_all(myXML, "//Id") # returns a "node set"
+xml2::xml_find_all(myXML, "//Id") # returns a "node set"
-
+
-xml2::xml_text(xml2::xml_find_all(myXML, "//Id")) # returns the contents
+xml2::xml_text(xml2::xml_find_all(myXML, "//Id")) # returns the contents
-                                                  # of the node set
+                                                  # of the node set
-
+
-# We will need to do this more than once, so we write a function
+# We will need to do this more than once, so we write a function
-# for it...
+# for it...
-node2text <- function(doc, tag) {
+node2text <- function(doc, tag) {
-  # an extractor function for the contents of elements
+  # an extractor function for the contents of elements
-  # between given tags in an XML response.
+  # between given tags in an XML response.
-  # Contents of all matching elements is returned in
+  # Contents of all matching elements is returned in
-  # a vector of strings.
+  # a vector of strings.
-  path <- paste0("//", tag)
+  path <- paste0("//", tag)
-  nodes <- xml2::xml_find_all(doc, path)
+  nodes <- xml2::xml_find_all(doc, path)
-  return(xml2::xml_text(nodes))
+  return(xml2::xml_text(nodes))
-}
+}
-
+
-# using node2text() ...
+# using node2text() ...
-(GID <- node2text(myXML, "Id"))
+(GID <- node2text(myXML, "Id"))
-
+
-# The GI is the pivot for data requests at the
+# The GI is the pivot for data requests at the
-# NCBI.
+# NCBI.
-
+
-# Let's first get the associated data for this GI
+# Let's first get the associated data for this GI
-URL <- paste0(eUtilsBase,
+URL <- paste0(eUtilsBase,
-              "esummary.fcgi?",
+              "esummary.fcgi?",
-              "db=protein",
+              "db=protein",
-              "&id=",
+              "&id=",
-              GID,
+              GID,
-              "&version=2.0")
+              "&version=2.0")
-(myXML <- xml2::read_xml(URL))
+(myXML <- xml2::read_xml(URL))
-
+
-(taxID <- node2text(myXML, "TaxId"))
+(taxID <- node2text(myXML, "TaxId"))
-(organism <- node2text(myXML, "Organism"))
+(organism <- node2text(myXML, "Organism"))
-
+
-#  This forms the base of a function that gets taxonomy data
+#  This forms the base of a function that gets taxonomy data
-#  from an Entrez result. You can write this!
+#  from an Entrez result. You can write this!
-
+
-
+
-# ==   1.1  Task - fetchNCBItaxData() function  ================================
+# ==   1.1  Task - fetchNCBItaxData() function  ================================
-
+
-# Task: write a function that takes as input a RefSeq ID, fetches the taxonomy
+# Task: write a function that takes as input a RefSeq ID, fetches the taxonomy
-# information, returns a list with taxID and organism, if the operation is
+# information, returns a list with taxID and organism, if the operation is
-# successful, or a list of length 0 if there is an error.
+# successful, or a list of length 0 if there is an error.
-
+
-
+
-# =    2  Task solutions  ======================================================
+# =    2  Task solutions  ======================================================
-
+
-# I have placed such a function into the dbUtilities script: look it up by
+# I have placed such a function into the dbUtilities script: look it up by
-# clicking on  dbFetchNCBItaxData() in the Environment pane.
+# clicking on  dbFetchNCBItaxData() in the Environment pane.
-
+
-# Test:
+# Test:
-dbFetchNCBItaxData("XP_001837394")
+dbFetchNCBItaxData("XP_001837394")
-
+
-# Expected output:
+# Expected output:
-# ----------------
+# ----------------
-# taxID                         organism
+# taxID                         organism
-# 1 240176 Coprinopsis cinerea okayama7#130
+# 1 240176 Coprinopsis cinerea okayama7#130
-
+
-
+
-# [END]
+# [END]
--- a/data/0TST.pdb
+++ b/data/0TST.pdb
@ -1,10 +1,10 @@
-HEADER   TEST                                                 0TST      0TST   1
+HEADER   TEST                                                 0TST      0TST   1
-REMARK     A CATALOGUE OF ATOM AND HETATM RECORDS                       0TST   2
+REMARK     A CATALOGUE OF ATOM AND HETATM RECORDS                       0TST   2
-ATOM      1  N   GLY     1      -6.253  75.745  53.559  1.00 36.34      0TST   3
+ATOM      1  N   GLY     1      -6.253  75.745  53.559  1.00 36.34      0TST   3
-ATOM      2  CA  GLY     1      -5.789  75.223  52.264  1.00 44.94      0TST   4
+ATOM      2  CA  GLY     1      -5.789  75.223  52.264  1.00 44.94      0TST   4
-ATOM      3  C   GLY     1      -5.592  73.702  52.294  1.00 32.28      0TST   5
+ATOM      3  C   GLY     1      -5.592  73.702  52.294  1.00 32.28      0TST   5
-ATOM      4  O   GLY     1      -5.140  73.148  53.304  1.00 19.32      0TST   6
+ATOM      4  O   GLY     1      -5.140  73.148  53.304  1.00 19.32      0TST   6
-TER       5      GLY     1                                              0TST   7
+TER       5      GLY     1                                              0TST   7
-HETATM    6  O   HOH     1      -4.169  60.050  40.145  1.00  3.00      0TST   8
+HETATM    6  O   HOH     1      -4.169  60.050  40.145  1.00  3.00      0TST   8
-HETATM    7 CA   CA      1      -1.258 -71.579  50.253  1.00  3.00      0TST   9
+HETATM    7 CA   CA      1      -1.258 -71.579  50.253  1.00  3.00      0TST   9
-END                                                                     0TST  10
+END                                                                     0TST  10
--- a/data/1BM8.pdb
+++ b/data/1BM8.pdb
--- a/data/2F1C.fa
+++ b/data/2F1C.fa
@ -1,5 +1,5 @@
->2F1C:X|PDBID|CHAIN|SEQUENCE
+>2F1C:X|PDBID|CHAIN|SEQUENCE
-EERNDWHFNIGAMYEIENVEGYGEDMDGLAEPSVYFNAANGPWRIALAYYQEGPVDYSAGKRGTWFDRPELEVHYQFLEN
+EERNDWHFNIGAMYEIENVEGYGEDMDGLAEPSVYFNAANGPWRIALAYYQEGPVDYSAGKRGTWFDRPELEVHYQFLEN
-DDFSFGLTGGFRNYGYHYVDEPGKDTANMQRWKIAPDWDVKLTDDLRFNGWLSMYKFANDLNTTGYADTRVETETGLQYT
+DDFSFGLTGGFRNYGYHYVDEPGKDTANMQRWKIAPDWDVKLTDDLRFNGWLSMYKFANDLNTTGYADTRVETETGLQYT
-FNETVALRVNYYLERGFNMDDSRNNGEFSTQEIRAYLPLTLGNHSVTPYTRIGLDRWSNWDWQDDIEREGHDFNRVGLFY
+FNETVALRVNYYLERGFNMDDSRNNGEFSTQEIRAYLPLTLGNHSVTPYTRIGLDRWSNWDWQDDIEREGHDFNRVGLFY
 GYDFQNGLSVSLEYAFEWQDHDEGDSDKFHYAGVGVNYSFHHHHHH
--- a/data/3FG7.fa
+++ b/data/3FG7.fa
@ -1,6 +1,6 @@
->3FG7:A|PDBID|CHAIN|SEQUENCE
+>3FG7:A|PDBID|CHAIN|SEQUENCE
-MAEEHHHHHHHHLEVLFQGPGRPKTHTVGSVAKVEQVKFDATSMHVKPQVAAQQKMVDDGSGEVQVWRIENLELVPVDSK
+MAEEHHHHHHHHLEVLFQGPGRPKTHTVGSVAKVEQVKFDATSMHVKPQVAAQQKMVDDGSGEVQVWRIENLELVPVDSK
-WLGHFYGGDCYLLLYTYLIGEKQHYLLYVWQGSQASQDEITASAYQAVILDQKYNGEPVQIRVPMGKEPPHLMSIFKGRM
+WLGHFYGGDCYLLLYTYLIGEKQHYLLYVWQGSQASQDEITASAYQAVILDQKYNGEPVQIRVPMGKEPPHLMSIFKGRM
-VVYQGGTSRTNNLETGPSTRLFQVQGTGANNTKAFEVPARANFLNSNDVFVLKTQSCCYLWCGKGCSGDEREMAKMVADT
+VVYQGGTSRTNNLETGPSTRLFQVQGTGANNTKAFEVPARANFLNSNDVFVLKTQSCCYLWCGKGCSGDEREMAKMVADT
-ISRTEKQVVVEGQEPANFWMALGGKAPYANTKRLQEENLVITPRLFECSNKTGRFLATEIPDFNQDDLEEDDVFLLDVWD
+ISRTEKQVVVEGQEPANFWMALGGKAPYANTKRLQEENLVITPRLFECSNKTGRFLATEIPDFNQDDLEEDDVFLLDVWD
-QVFFWIGKHANEEEKKAAATTAQEYLKTHPSGRDPETPIIVVKQGHEPPTFTGWFLAWDPFKWSGIHVVPNLSPLSNN
+QVFFWIGKHANEEEKKAAATTAQEYLKTHPSGRDPETPIIVVKQGHEPPTFTGWFLAWDPFKWSGIHVVPNLSPLSNN
--- a/data/MBP1_SACCE.json
+++ b/data/MBP1_SACCE.json
@ -1,20 +1,20 @@
-[
+[
-  { "name" : "MBP1_SACCE",
+  { "name" : "MBP1_SACCE",
-    "RefSeqID" : "NP_010227",
+    "RefSeqID" : "NP_010227",
-    "UniProtID" : "P39678",
+    "UniProtID" : "P39678",
-    "taxonomyID" : 559292,
+    "taxonomyID" : 559292,
-    "sequence" : [
+    "sequence" : [
-       "MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLKETHEKVQGGF",
+       "MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLKETHEKVQGGF",
-       "GKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHASKVDRKKAIRSASTSAIMET",
+       "GKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHASKVDRKKAIRSASTSAIMET",
-       "KRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRRKLGVNLQRSQSDMGFPRPAIPNSSISTTQL",
+       "KRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRRKLGVNLQRSQSDMGFPRPAIPNSSISTTQL",
-       "PSIRSTMGPQSPTLGILEEERHDSRQQQPQQNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQ",
+       "PSIRSTMGPQSPTLGILEEERHDSRQQQPQQNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQ",
-       "QSSLIQTQQTESMATSVSSSPSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKV",
+       "QSSLIQTQQTESMATSVSSSPSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKV",
-       "NKYLSKLVDYFISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS",
+       "NKYLSKLVDYFISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS",
-       "IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTPSAVYYLDVVL",
+       "IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTPSAVYYLDVVL",
-       "SKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTTISNKEGLTANEIMNQQYEQM",
+       "SKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTTISNKEGLTANEIMNQQYEQM",
-       "MIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSPVSPSDYITYPSQIATNISRNIPNVVNSMKQ",
+       "MIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSPVSPSDYITYPSQIATNISRNIPNVVNSMKQ",
-       "MASIYNDLHEQHDNEIKSLQKTLKSISKTKIQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTK",
+       "MASIYNDLHEQHDNEIKSLQKTLKSISKTKIQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTK",
-       "KLRKRLIRYKRLIKQKLEYRQTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSS",
+       "KLRKRLIRYKRLIKQKLEYRQTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSS",
-       "LVKKFEDNAKIHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"]
+       "LVKKFEDNAKIHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"]
-  }
+  }
-]
+]
--- a/data/PTPN5_HSa_coding.fa
+++ b/data/PTPN5_HSa_coding.fa
@ -1,30 +1,30 @@
->PTPN5-201 cds:protein_coding (ENST00000358540.7)
+>PTPN5-201 cds:protein_coding (ENST00000358540.7)
-ATGAATTATGAGGGAGCCAGGAGTGAGAGAGAGAACCACGCTGCTGATGACTCCGAGGGA
+ATGAATTATGAGGGAGCCAGGAGTGAGAGAGAGAACCACGCTGCTGATGACTCCGAGGGA
-GGGGCCCTGGACATGTGCTGCAGTGAGAGGCTACCGGGTCTCCCCCAGCCGATAGTGATG
+GGGGCCCTGGACATGTGCTGCAGTGAGAGGCTACCGGGTCTCCCCCAGCCGATAGTGATG
-GAGGCACTGGACGAGGCTGAAGGGCTCCAGGACTCACAGAGAGAGATGCCGCCACCCCCT
+GAGGCACTGGACGAGGCTGAAGGGCTCCAGGACTCACAGAGAGAGATGCCGCCACCCCCT
-CCTCCCTCGCCGCCCTCAGATCCAGCTCAGAAGCCACCACCTCGAGGCGCTGGGAGCCAC
+CCTCCCTCGCCGCCCTCAGATCCAGCTCAGAAGCCACCACCTCGAGGCGCTGGGAGCCAC
-TCCCTCACTGTCAGGAGCAGCCTGTGCCTGTTCGCTGCCTCACAGTTCCTGCTTGCCTGT
+TCCCTCACTGTCAGGAGCAGCCTGTGCCTGTTCGCTGCCTCACAGTTCCTGCTTGCCTGT
-GGGGTGCTCTGGTTCAGCGGTTATGGCCACATCTGGTCACAGAACGCCACAAACCTCGTC
+GGGGTGCTCTGGTTCAGCGGTTATGGCCACATCTGGTCACAGAACGCCACAAACCTCGTC
-TCCTCTTTGCTGACGCTCCTGAAACAGCTGGAACCCACGGCCTGGCTTGACTCTGGGACG
+TCCTCTTTGCTGACGCTCCTGAAACAGCTGGAACCCACGGCCTGGCTTGACTCTGGGACG
-TGGGGAGTCCCCAGTCTGCTGCTGGTCTTTCTGTCCGTGGGCCTGGTCCTCGTTACCACC
+TGGGGAGTCCCCAGTCTGCTGCTGGTCTTTCTGTCCGTGGGCCTGGTCCTCGTTACCACC
-CTGGTGTGGCACCTCCTGAGGACACCCCCAGAGCCACCCACCCCACTGCCCCCTGAGGAC
+CTGGTGTGGCACCTCCTGAGGACACCCCCAGAGCCACCCACCCCACTGCCCCCTGAGGAC
-AGGCGCCAGTCAGTGAGCCGCCAGCCCTCCTTCACCTACTCAGAGTGGATGGAGGAGAAG
+AGGCGCCAGTCAGTGAGCCGCCAGCCCTCCTTCACCTACTCAGAGTGGATGGAGGAGAAG
-ATCGAGGATGACTTCCTGGACCTCGACCCGGTGCCCGAGACTCCTGTGTTTGATTGTGTG
+ATCGAGGATGACTTCCTGGACCTCGACCCGGTGCCCGAGACTCCTGTGTTTGATTGTGTG
-ATGGACATCAAGCCTGAGGCTGACCCCACCTCACTCACCGTCAAGTCCATGGGTCTGCAG
+ATGGACATCAAGCCTGAGGCTGACCCCACCTCACTCACCGTCAAGTCCATGGGTCTGCAG
-GAGAGGAGGGGTTCCAATGTCTCCCTGACCCTGGACATGTGCACTCCGGGCTGCAACGAG
+GAGAGGAGGGGTTCCAATGTCTCCCTGACCCTGGACATGTGCACTCCGGGCTGCAACGAG
-GAGGGCTTTGGCTATCTCATGTCCCCACGTGAGGAGTCCGCCCGCGAGTACCTGCTCAGC
+GAGGGCTTTGGCTATCTCATGTCCCCACGTGAGGAGTCCGCCCGCGAGTACCTGCTCAGC
-GCCTCCCGTGTCCTCCAAGCAGAAGAGCTTCATGAAAAGGCCCTGGACCCTTTCCTGCTG
+GCCTCCCGTGTCCTCCAAGCAGAAGAGCTTCATGAAAAGGCCCTGGACCCTTTCCTGCTG
-CAGGCGGAATTCTTTGAAATCCCCATGAACTTTGTGGATCCGAAAGAGTACGACATCCCT
+CAGGCGGAATTCTTTGAAATCCCCATGAACTTTGTGGATCCGAAAGAGTACGACATCCCT
-GGGCTGGTGCGGAAGAACCGGTACAAAACCATACTTCCCAACCCTCACAGCAGAGTGTGT
+GGGCTGGTGCGGAAGAACCGGTACAAAACCATACTTCCCAACCCTCACAGCAGAGTGTGT
-CTGACCTCACCAGACCCTGACGACCCTCTGAGTTCCTACATCAATGCCAACTACATCCGG
+CTGACCTCACCAGACCCTGACGACCCTCTGAGTTCCTACATCAATGCCAACTACATCCGG
-GGCTATGGTGGGGAGGAGAAGGTGTACATCGCCACTCAGGGACCCATCGTCAGCACGGTC
+GGCTATGGTGGGGAGGAGAAGGTGTACATCGCCACTCAGGGACCCATCGTCAGCACGGTC
-GCCGACTTCTGGCGCATGGTGTGGCAGGAGCACACGCCCATCATTGTCATGATCACCAAC
+GCCGACTTCTGGCGCATGGTGTGGCAGGAGCACACGCCCATCATTGTCATGATCACCAAC
-ATCGAGGAGATGAACGAGAAATGCACCGAGTATTGGCCGGAGGAGCAGGTGGCGTACGAC
+ATCGAGGAGATGAACGAGAAATGCACCGAGTATTGGCCGGAGGAGCAGGTGGCGTACGAC
-GGTGTTGAGATCACTGTGCAGAAAGTCATTCACACGGAGGATTACCGGCTGCGACTCATC
+GGTGTTGAGATCACTGTGCAGAAAGTCATTCACACGGAGGATTACCGGCTGCGACTCATC
-TCCCTCAAGAGTGGGACTGAGGAGCGAGGCCTGAAGCATTACTGGTTCACATCCTGGCCC
+TCCCTCAAGAGTGGGACTGAGGAGCGAGGCCTGAAGCATTACTGGTTCACATCCTGGCCC
-GACCAGAAGACCCCAGACCGGGCCCCCCCACTCCTGCACCTGGTGCGGGAGGTGGAGGAG
+GACCAGAAGACCCCAGACCGGGCCCCCCCACTCCTGCACCTGGTGCGGGAGGTGGAGGAG
-GCAGCCCAGCAGGAGGGGCCCCACTGTGCCCCCATCATCGTCCACTGCAGTGCAGGGATT
+GCAGCCCAGCAGGAGGGGCCCCACTGTGCCCCCATCATCGTCCACTGCAGTGCAGGGATT
-GGGAGGACCGGCTGCTTCATTGCCACCAGCATCTGCTGCCAGCAGCTGCGGCAGGAGGGT
+GGGAGGACCGGCTGCTTCATTGCCACCAGCATCTGCTGCCAGCAGCTGCGGCAGGAGGGT
-GTGGTGGACATCCTGAAGACCACGTGCCAGCTCCGTCAGGACAGGGGCGGCATGATCCAG
+GTGGTGGACATCCTGAAGACCACGTGCCAGCTCCGTCAGGACAGGGGCGGCATGATCCAG
-ACATGCGAGCAGTACCAGTTTGTGCACCACGTCATGAGCCTCTACGAAAAGCAGCTGTCC
+ACATGCGAGCAGTACCAGTTTGTGCACCACGTCATGAGCCTCTACGAAAAGCAGCTGTCC
-CACCAGTCCCCAGAATGA
+CACCAGTCCCCAGAATGA
--- a/data/RAB39B_HSa_coding.fa
+++ b/data/RAB39B_HSa_coding.fa
@ -1,12 +1,12 @@
->RAB39B cds:protein_coding (ENST00000369454.4)
+>RAB39B cds:protein_coding (ENST00000369454.4)
-ATGGAGGCCATCTGGCTGTACCAGTTCCGGCTCATTGTCATCGGGGATTCCACAGTGGGC
+ATGGAGGCCATCTGGCTGTACCAGTTCCGGCTCATTGTCATCGGGGATTCCACAGTGGGC
-AAGTCCTGCCTGATCCGCCGCTTCACCGAGGGTCGCTTTGCCCAGGTTTCTGACCCCACC
+AAGTCCTGCCTGATCCGCCGCTTCACCGAGGGTCGCTTTGCCCAGGTTTCTGACCCCACC
-GTGGGGGTGGATTTTTTCTCCCGCTTGGTGGAGATCGAGCCAGGAAAACGCATCAAGCTC
+GTGGGGGTGGATTTTTTCTCCCGCTTGGTGGAGATCGAGCCAGGAAAACGCATCAAGCTC
-CAGATCTGGGATACCGCGGGTCAAGAGAGGTTCAGATCCATCACTCGCGCCTACTACAGG
+CAGATCTGGGATACCGCGGGTCAAGAGAGGTTCAGATCCATCACTCGCGCCTACTACAGG
-AACTCAGTAGGTGGTCTTCTCTTATTTGACATTACCAACCGCAGGTCCTTCCAGAATGTC
+AACTCAGTAGGTGGTCTTCTCTTATTTGACATTACCAACCGCAGGTCCTTCCAGAATGTC
-CATGAGTGGTTAGAAGAGACCAAAGTACACGTTCAGCCCTACCAAATTGTATTTGTTCTG
+CATGAGTGGTTAGAAGAGACCAAAGTACACGTTCAGCCCTACCAAATTGTATTTGTTCTG
-GTGGGTCACAAGTGTGACCTGGATACACAGAGGCAAGTGACTCGCCACGAGGCCGAGAAA
+GTGGGTCACAAGTGTGACCTGGATACACAGAGGCAAGTGACTCGCCACGAGGCCGAGAAA
-CTGGCTGCTGCATACGGCATGAAGTACATTGAAACGTCAGCCCGAGATGCCATTAATGTG
+CTGGCTGCTGCATACGGCATGAAGTACATTGAAACGTCAGCCCGAGATGCCATTAATGTG
-GAGAAAGCCTTCACAGACCTGACAAGAGACATATATGAGCTGGTTAAAAGGGGGGAGATT
+GAGAAAGCCTTCACAGACCTGACAAGAGACATATATGAGCTGGTTAAAAGGGGGGAGATT
-ACAATCCAGGAGGGCTGGGAAGGGGTGAAGAGTGGATTTGTACCAAATGTGGTTCACTCT
+ACAATCCAGGAGGGCTGGGAAGGGGTGAAGAGTGGATTTGTACCAAATGTGGTTCACTCT
-TCAGAAGAGGTTGTCAAATCAGAGAGGAGATGTTTGTGCTAG
+TCAGAAGAGGTTGTCAAATCAGAGAGGAGATGTTTGTGCTAG
--- a/data/RandomPhobiaPage.txt
+++ b/data/RandomPhobiaPage.txt
@ -1,131 +1,131 @@
-
+
-
+
-```{css, echo = FALSE}
+```{css, echo = FALSE}
-
+
-.striped tr:nth-child(even) {
+.striped tr:nth-child(even) {
-  background: #eaf1ff;
+  background: #eaf1ff;
-}
+}
-.striped {
+.striped {
-  padding: 5px;
+  padding: 5px;
-}
+}
-```
+```
-<small>Random Phobias - .Rmd sample code for BCH441 at the University of Toronto. (c) Boris Steipe 2020</small>
+<small>Random Phobias - .Rmd sample code for BCH441 at the University of Toronto. (c) Boris Steipe 2020</small>
-
+
-
+
-```{r setup, include=FALSE}
+```{r setup, include=FALSE}
-knitr::opts_chunk$set(echo = TRUE)
+knitr::opts_chunk$set(echo = TRUE)
-```
+```
-
+
-## Phobias! ##
+## Phobias! ##
-We all have some, but we could always use more. How to know them all? With this code we access the [Wikipedia list of phobias](https://en.wikipedia.org/wiki/List_of_phobias), scrape the contents and assemble a dataframe. Then we write a function to retrieve a random phobia, which we can subsequently ponder on - either to delight in the fact that we don't have that fear, or to add to our daily quota of anxieties <small>(like our well-founded [fear of bad programming practice](http://xkcd.com/292/))</small>.
+We all have some, but we could always use more. How to know them all? With this code we access the [Wikipedia list of phobias](https://en.wikipedia.org/wiki/List_of_phobias), scrape the contents and assemble a dataframe. Then we write a function to retrieve a random phobia, which we can subsequently ponder on - either to delight in the fact that we don't have that fear, or to add to our daily quota of anxieties <small>(like our well-founded [fear of bad programming practice](http://xkcd.com/292/))</small>.
-
+
-To load the list, we will "screenscrape" the contents of Wikipedia's [List of Phobias](https://en.wikipedia.org/wiki/List_of_phobias). First, we install the `rvest` and `xml2` packages from CRAN, if we don't have them.
+To load the list, we will "screenscrape" the contents of Wikipedia's [List of Phobias](https://en.wikipedia.org/wiki/List_of_phobias). First, we install the `rvest` and `xml2` packages from CRAN, if we don't have them.
-```{r packages}
+```{r packages}
-if (! requireNamespace("rvest", quietly=TRUE)) {
+if (! requireNamespace("rvest", quietly=TRUE)) {
-  install.packages("rvest")
+  install.packages("rvest")
-}
+}
-if (! requireNamespace("xml2", quietly=TRUE)) {
+if (! requireNamespace("xml2", quietly=TRUE)) {
-  install.packages("xml2")
+  install.packages("xml2")
-}
+}
-```
+```
-As we customarily do, we avoid using the `library()` function to make the package contents accessible, but use the `package::` syntax instead. This makes our code more explicit and maintainable.
+As we customarily do, we avoid using the `library()` function to make the package contents accessible, but use the `package::` syntax instead. This makes our code more explicit and maintainable.
-
+
-`xml2` handles reading and parsing of documents. The `rvest` package was designed for screenscraping and has functions to make our life very easy: it accesses the response of an `xml2` query, looks for all HTML formatted tables, parses them with an XPATH expression and returns them as lists from which we can get data frames.
+`xml2` handles reading and parsing of documents. The `rvest` package was designed for screenscraping and has functions to make our life very easy: it accesses the response of an `xml2` query, looks for all HTML formatted tables, parses them with an XPATH expression and returns them as lists from which we can get data frames.
-
+
-```{r getPageData, cache=TRUE}
+```{r getPageData, cache=TRUE}
-webPage <- xml2::read_html("https://en.wikipedia.org/wiki/List_of_phobias")
+webPage <- xml2::read_html("https://en.wikipedia.org/wiki/List_of_phobias")
-allTables <- rvest::html_table(webPage, fill = TRUE)
+allTables <- rvest::html_table(webPage, fill = TRUE)
-```
+```
-
+
-There are ```r length(allTables)``` tables in the list, but the ones we are interested in are data frames with two columns named `Phobia` and `Condition`.
+There are ```r length(allTables)``` tables in the list, but the ones we are interested in are data frames with two columns named `Phobia` and `Condition`.
-
+
-```{r collateTables, cache=TRUE}
+```{r collateTables, cache=TRUE}
-phobiaTable <- data.frame(Phobia = character(), Condition = character())
+phobiaTable <- data.frame(Phobia = character(), Condition = character())
-for (i in seq_along(allTables)) {
+for (i in seq_along(allTables)) {
-  df <- allTables[[i]]
+  df <- allTables[[i]]
-  if (all(colnames(df) == c("Phobia", "Condition"))) {
+  if (all(colnames(df) == c("Phobia", "Condition"))) {
-    phobiaTable <- rbind(phobiaTable, df)
+    phobiaTable <- rbind(phobiaTable, df)
-  }
+  }
-}
+}
-```
+```
-
+
-Done, we collected ```r nrow(phobiaTable)``` phobias. Let's randomly select a few and print them.
+Done, we collected ```r nrow(phobiaTable)``` phobias. Let's randomly select a few and print them.
-
+
-<p>&nbsp;
+<p>&nbsp;
-<p>
+<p>
-
+
-```{r , ref.label="randRow", echo=FALSE}
+```{r , ref.label="randRow", echo=FALSE}
-```
+```
-
+
-**Table**: seven random phobias<br/>
+**Table**: seven random phobias<br/>
-```{r renderPhobiaTable, echo=FALSE, results='asis'}
+```{r renderPhobiaTable, echo=FALSE, results='asis'}
-sel <- sample(1:nrow(phobiaTable), 7)
+sel <- sample(1:nrow(phobiaTable), 7)
-knitr::kable(phobiaTable[sel, ], table.attr = "class=\"striped\"", format = "html")
+knitr::kable(phobiaTable[sel, ], table.attr = "class=\"striped\"", format = "html")
-```
+```
-
+
-<p>&nbsp;
+<p>&nbsp;
-<p>
+<p>
-To pick a single random phobia from the list, we take a (pseudo) random sample of size 1 from the number of rows in the `phobiaTable` object. Our function thus returns a random row from a matrix or dataframe, and it uses an optional argument: `seed`. This can either be Boolean `FALSE` (the default), or an integer that is used in R's `set.seed()` function.
+To pick a single random phobia from the list, we take a (pseudo) random sample of size 1 from the number of rows in the `phobiaTable` object. Our function thus returns a random row from a matrix or dataframe, and it uses an optional argument: `seed`. This can either be Boolean `FALSE` (the default), or an integer that is used in R's `set.seed()` function.
-
+
-```{r randRow}
+```{r randRow}
-randRow <- function(M, seed = FALSE) {
+randRow <- function(M, seed = FALSE) {
-  # Return a random row from a dataframe M.
+  # Return a random row from a dataframe M.
-  if (seed) {
+  if (seed) {
-    oldseed <- .Random.seed                # play nice and save the RNG state ...
+    oldseed <- .Random.seed                # play nice and save the RNG state ...
-    set.seed(as.integer(seed))
+    set.seed(as.integer(seed))
-  }
+  }
-  r <- M[sample(1:nrow(M), 1), ]           # fetch one random row
+  r <- M[sample(1:nrow(M), 1), ]           # fetch one random row
-  if (seed) { .Random.seed  <- oldseed }   # ... restore the RNG state
+  if (seed) { .Random.seed  <- oldseed }   # ... restore the RNG state
-  return(r)
+  return(r)
-}
+}
-```
+```
-<p>&nbsp;
+<p>&nbsp;
-<p>
+<p>
-With this useful tool we can ponder on our favourite phobia of the day. For today, let it be **`r randRow(phobiaTable, seed=1123581321)[2]`**, the `r randRow(phobiaTable, seed=1123581321)[1]`.
+With this useful tool we can ponder on our favourite phobia of the day. For today, let it be **`r randRow(phobiaTable, seed=1123581321)[2]`**, the `r randRow(phobiaTable, seed=1123581321)[1]`.
-
+
-_`r randRow(phobiaTable, seed=1123581321)[1]`_! Really!!? Awful.
+_`r randRow(phobiaTable, seed=1123581321)[1]`_! Really!!? Awful.
-
+
-<p>&nbsp;
+<p>&nbsp;
-<p>
+<p>
-
+
-Finally: let's plot a histogram of phobia name lengths just to illustrate plots. A little preprocessing is required, since some names collate synonyms, like _"Hypnophobia, somniphobia"_. We'll break these up.
+Finally: let's plot a histogram of phobia name lengths just to illustrate plots. A little preprocessing is required, since some names collate synonyms, like _"Hypnophobia, somniphobia"_. We'll break these up.
-
+
-```{r preProcess}
+```{r preProcess}
-
+
-# select only single-word phobias that end with "phobia"
+# select only single-word phobias that end with "phobia"
-sel <- ! grepl(" ", phobiaTable$Phobia) & grepl(".phobia$", phobiaTable$Phobia)
+sel <- ! grepl(" ", phobiaTable$Phobia) & grepl(".phobia$", phobiaTable$Phobia)
-names <- phobiaTable$Phobia[sel]
+names <- phobiaTable$Phobia[sel]
-
+
-# extract the ones we did _not_ select
+# extract the ones we did _not_ select
-x <- phobiaTable$Phobia[! sel]
+x <- phobiaTable$Phobia[! sel]
-# use strsplit() to split them apart and flatten the resulting list
+# use strsplit() to split them apart and flatten the resulting list
-x <- unlist(strsplit(x, ", "))
+x <- unlist(strsplit(x, ", "))
-x <- unlist(strsplit(x, " "))
+x <- unlist(strsplit(x, " "))
-x <- unlist(strsplit(x, "/"))
+x <- unlist(strsplit(x, "/"))
-# use the same selection as above, and append the result to our "names"
+# use the same selection as above, and append the result to our "names"
-sel <- ! grepl(" ", x) & grepl(".phobia$", x)
+sel <- ! grepl(" ", x) & grepl(".phobia$", x)
-names <- c(names, x[sel])
+names <- c(names, x[sel])
-
+
-```
+```
-
+
-Done, we collected ```r length(names)``` names for phobias. Here is a histogram of their lengths.
+Done, we collected ```r length(names)``` names for phobias. Here is a histogram of their lengths.
-
+
-```{r showHist}
+```{r showHist}
-
+
-x <- nchar(names)
+x <- nchar(names)
-pShort <- names[which(x == min(x))[1]]  # pull out the shortest name ...
+pShort <- names[which(x == min(x))[1]]  # pull out the shortest name ...
-pLong  <- names[which(x == max(x))[1]]  # ... and the longest name too.
+pLong  <- names[which(x == max(x))[1]]  # ... and the longest name too.
-hist(x,
+hist(x,
-     main = "Length of phobia-names",
+     main = "Length of phobia-names",
-     sub = sprintf("Shortest: %s (%d), Longest: %s (%d)",
+     sub = sprintf("Shortest: %s (%d), Longest: %s (%d)",
-                   pShort, nchar(pShort), pLong, nchar(pLong)),
+                   pShort, nchar(pShort), pLong, nchar(pLong)),
-     cex.sub = 0.8,
+     cex.sub = 0.8,
-     xlab = "name",
+     xlab = "name",
-     ylab = "counts",
+     ylab = "counts",
-     col ="#aef5ee")
+     col ="#aef5ee")
-
+
-```
+```
-
+
-That's all.
+That's all.
-
+
-<!-- [END] -->
+<!-- [END] -->
--- a/data/S288C_YDL056W_MBP1_coding.fsa
+++ b/data/S288C_YDL056W_MBP1_coding.fsa
@ -1,43 +1,43 @@
->MBP1 YDL056W SGDID:S000002214
+>MBP1 YDL056W SGDID:S000002214
-ATGTCTAACCAAATATACTCAGCGAGATATTCGGGGGTTGATGTTTATGAATTCATTCAT
+ATGTCTAACCAAATATACTCAGCGAGATATTCGGGGGTTGATGTTTATGAATTCATTCAT
-TCTACAGGATCTATCATGAAAAGGAAAAAGGATGATTGGGTCAATGCTACACATATTTTA
+TCTACAGGATCTATCATGAAAAGGAAAAAGGATGATTGGGTCAATGCTACACATATTTTA
-AAGGCCGCCAATTTTGCCAAGGCTAAAAGAACAAGGATTCTAGAGAAGGAAGTACTTAAG
+AAGGCCGCCAATTTTGCCAAGGCTAAAAGAACAAGGATTCTAGAGAAGGAAGTACTTAAG
-GAAACTCATGAAAAAGTTCAGGGTGGATTTGGTAAATATCAGGGTACATGGGTCCCACTG
+GAAACTCATGAAAAAGTTCAGGGTGGATTTGGTAAATATCAGGGTACATGGGTCCCACTG
-AACATAGCGAAACAACTGGCAGAAAAATTTAGTGTCTACGATCAGCTGAAACCGTTGTTC
+AACATAGCGAAACAACTGGCAGAAAAATTTAGTGTCTACGATCAGCTGAAACCGTTGTTC
-GACTTTACGCAAACAGATGGGTCTGCTTCTCCACCTCCTGCTCCAAAACATCACCATGCC
+GACTTTACGCAAACAGATGGGTCTGCTTCTCCACCTCCTGCTCCAAAACATCACCATGCC
-TCGAAGGTGGATAGGAAAAAGGCTATTAGAAGTGCAAGTACTTCCGCAATTATGGAAACA
+TCGAAGGTGGATAGGAAAAAGGCTATTAGAAGTGCAAGTACTTCCGCAATTATGGAAACA
-AAAAGAAACAACAAGAAAGCCGAGGAAAATCAATTTCAAAGCAGCAAAATATTGGGAAAT
+AAAAGAAACAACAAGAAAGCCGAGGAAAATCAATTTCAAAGCAGCAAAATATTGGGAAAT
-CCCACGGCTGCACCAAGGAAAAGAGGTAGACCGGTAGGATCTACGAGGGGAAGTAGGCGG
+CCCACGGCTGCACCAAGGAAAAGAGGTAGACCGGTAGGATCTACGAGGGGAAGTAGGCGG
-AAGTTAGGTGTCAATTTACAACGTTCTCAAAGTGATATGGGATTTCCTAGACCGGCGATA
+AAGTTAGGTGTCAATTTACAACGTTCTCAAAGTGATATGGGATTTCCTAGACCGGCGATA
-CCGAATTCTTCAATATCGACAACGCAACTTCCCTCTATTAGATCCACCATGGGACCACAA
+CCGAATTCTTCAATATCGACAACGCAACTTCCCTCTATTAGATCCACCATGGGACCACAA
-TCCCCTACATTGGGTATTCTGGAAGAAGAAAGGCACGATTCTCGACAGCAGCAGCCGCAA
+TCCCCTACATTGGGTATTCTGGAAGAAGAAAGGCACGATTCTCGACAGCAGCAGCCGCAA
-CAAAATAATTCTGCACAGTTCAAAGAAATTGATCTTGAGGACGGCTTATCAAGCGATGTG
+CAAAATAATTCTGCACAGTTCAAAGAAATTGATCTTGAGGACGGCTTATCAAGCGATGTG
-GAACCTTCACAACAATTACAACAAGTTTTTAATCAAAATACTGGATTTGTACCCCAACAA
+GAACCTTCACAACAATTACAACAAGTTTTTAATCAAAATACTGGATTTGTACCCCAACAA
-CAATCTTCCTTGATACAGACACAGCAAACAGAATCAATGGCCACGTCCGTATCTTCCTCT
+CAATCTTCCTTGATACAGACACAGCAAACAGAATCAATGGCCACGTCCGTATCTTCCTCT
-CCTTCATTACCTACGTCACCGGGCGATTTTGCCGATAGTAATCCATTTGAAGAGCGATTT
+CCTTCATTACCTACGTCACCGGGCGATTTTGCCGATAGTAATCCATTTGAAGAGCGATTT
-CCCGGTGGTGGAACATCTCCTATTATTTCCATGATCCCGCGTTATCCTGTAACTTCAAGG
+CCCGGTGGTGGAACATCTCCTATTATTTCCATGATCCCGCGTTATCCTGTAACTTCAAGG
-CCTCAAACATCGGATATTAATGATAAAGTTAACAAATACCTTTCAAAATTGGTTGATTAT
+CCTCAAACATCGGATATTAATGATAAAGTTAACAAATACCTTTCAAAATTGGTTGATTAT
-TTTATTTCCAATGAAATGAAGTCAAATAAGTCCCTACCACAAGTGTTATTGCACCCACCT
+TTTATTTCCAATGAAATGAAGTCAAATAAGTCCCTACCACAAGTGTTATTGCACCCACCT
-CCACACAGCGCTCCCTATATAGATGCTCCAATCGATCCAGAATTACATACTGCCTTCCAT
+CCACACAGCGCTCCCTATATAGATGCTCCAATCGATCCAGAATTACATACTGCCTTCCAT
-TGGGCTTGTTCTATGGGTAATTTACCAATTGCTGAGGCGTTGTACGAAGCCGGAACAAGT
+TGGGCTTGTTCTATGGGTAATTTACCAATTGCTGAGGCGTTGTACGAAGCCGGAACAAGT
-ATCAGATCGACAAATTCTCAAGGCCAAACTCCATTGATGAGAAGTTCCTTATTCCACAAT
+ATCAGATCGACAAATTCTCAAGGCCAAACTCCATTGATGAGAAGTTCCTTATTCCACAAT
-TCATACACTAGAAGAACTTTCCCTAGAATTTTCCAGCTACTGCACGAGACCGTATTTGAT
+TCATACACTAGAAGAACTTTCCCTAGAATTTTCCAGCTACTGCACGAGACCGTATTTGAT
-ATCGATTCGCAATCACAAACAGTAATTCACCATATTGTGAAACGAAAATCAACAACACCT
+ATCGATTCGCAATCACAAACAGTAATTCACCATATTGTGAAACGAAAATCAACAACACCT
-TCTGCAGTTTATTATCTTGATGTTGTGCTATCTAAGATCAAGGATTTTTCCCCACAGTAT
+TCTGCAGTTTATTATCTTGATGTTGTGCTATCTAAGATCAAGGATTTTTCCCCACAGTAT
-AGAATTGAATTACTTTTAAACACACAAGACAAAAATGGCGATACCGCACTTCATATTGCT
+AGAATTGAATTACTTTTAAACACACAAGACAAAAATGGCGATACCGCACTTCATATTGCT
-TCTAAAAATGGAGATGTTGTTTTTTTTAATACACTGGTCAAAATGGGTGCATTAACTACT
+TCTAAAAATGGAGATGTTGTTTTTTTTAATACACTGGTCAAAATGGGTGCATTAACTACT
-ATTTCCAATAAGGAAGGATTAACCGCCAATGAAATAATGAATCAACAATATGAGCAAATG
+ATTTCCAATAAGGAAGGATTAACCGCCAATGAAATAATGAATCAACAATATGAGCAAATG
-ATGATACAAAATGGTACAAATCAACATGTCAATTCTTCAAACACGGACTTGAATATCCAC
+ATGATACAAAATGGTACAAATCAACATGTCAATTCTTCAAACACGGACTTGAATATCCAC
-GTTAATACAAACAACATTGAAACGAAAAATGATGTTAATTCAATGGTAATCATGTCGCCT
+GTTAATACAAACAACATTGAAACGAAAAATGATGTTAATTCAATGGTAATCATGTCGCCT
-GTTTCTCCTTCGGATTACATAACCTATCCATCTCAAATTGCCACCAATATATCAAGAAAT
+GTTTCTCCTTCGGATTACATAACCTATCCATCTCAAATTGCCACCAATATATCAAGAAAT
-ATTCCAAATGTAGTGAATTCTATGAAGCAAATGGCTAGCATATACAACGATCTTCATGAA
+ATTCCAAATGTAGTGAATTCTATGAAGCAAATGGCTAGCATATACAACGATCTTCATGAA
-CAGCATGACAACGAAATAAAAAGTTTGCAAAAAACTTTAAAAAGCATTTCTAAGACGAAA
+CAGCATGACAACGAAATAAAAAGTTTGCAAAAAACTTTAAAAAGCATTTCTAAGACGAAA
-ATACAGGTAAGCCTAAAAACTTTAGAGGTATTGAAAGAGAGCAGTAAAGATGAAAACGGC
+ATACAGGTAAGCCTAAAAACTTTAGAGGTATTGAAAGAGAGCAGTAAAGATGAAAACGGC
-GAAGCTCAGACTAATGATGACTTCGAAATTTTATCTCGTCTACAAGAACAAAATACTAAG
+GAAGCTCAGACTAATGATGACTTCGAAATTTTATCTCGTCTACAAGAACAAAATACTAAG
-AAATTGAGAAAAAGGCTCATACGATACAAACGGTTGATAAAACAAAAGCTGGAATACAGG
+AAATTGAGAAAAAGGCTCATACGATACAAACGGTTGATAAAACAAAAGCTGGAATACAGG
-CAAACGGTTTTATTGAACAAATTAATAGAAGATGAAACTCAGGCTACCACCAATAACACA
+CAAACGGTTTTATTGAACAAATTAATAGAAGATGAAACTCAGGCTACCACCAATAACACA
-GTTGAGAAAGATAATAATACGCTGGAAAGGTTGGAATTGGCTCAAGAACTAACGATGTTG
+GTTGAGAAAGATAATAATACGCTGGAAAGGTTGGAATTGGCTCAAGAACTAACGATGTTG
-CAATTACAAAGGAAAAACAAATTGAGTTCCTTGGTGAAGAAATTTGAAGACAATGCCAAG
+CAATTACAAAGGAAAAACAAATTGAGTTCCTTGGTGAAGAAATTTGAAGACAATGCCAAG
-ATTCATAAATATAGACGGATTATCAGGGAAGGTACGGAAATGAATATTGAAGAAGTAGAT
+ATTCATAAATATAGACGGATTATCAGGGAAGGTACGGAAATGAATATTGAAGAAGTAGAT
-AGTTCGCTGGATGTAATACTACAGACATTGATAGCCAACAATAATAAAAATAAGGGCGCA
+AGTTCGCTGGATGTAATACTACAGACATTGATAGCCAACAATAATAAAAATAAGGGCGCA
 GAACAGATCATCACAATCTCAAACGCGAATAGTCATGCATAA
--- a/data/SGD_features.README.txt
+++ b/data/SGD_features.README.txt
@ -1,47 +1,47 @@
-SGD_features.tab
+SGD_features.tab
-
+
-The latest version of the SGD_features.tab file is based on Genome Version R64-2-1.
+The latest version of the SGD_features.tab file is based on Genome Version R64-2-1.
-
+
-The SGD_features.tab file is updated weekly (Saturday).
+The SGD_features.tab file is updated weekly (Saturday).
-
+
-NOTE: On 4 September 2004, the SGD_features.tab file replaced the previously
+NOTE: On 4 September 2004, the SGD_features.tab file replaced the previously
-used chromosomal_feature.tab file.
+used chromosomal_feature.tab file.
-
+
-File contents:
+File contents:
-
+
-1. Information on current chromosomal features in SGD, including Dubious ORFs. 
+1. Information on current chromosomal features in SGD, including Dubious ORFs. 
-Also contains coordinates of intron, exons, and other subfeatures that are located
+Also contains coordinates of intron, exons, and other subfeatures that are located
-within a chromosomal feature.
+within a chromosomal feature.
-
+
-2. The relationship between subfeatures and the feature in which they
+2. The relationship between subfeatures and the feature in which they
-are located is identified by the feature name in column #7 (parent
+are located is identified by the feature name in column #7 (parent
-feature). For example, the parent feature of the intron found in
+feature). For example, the parent feature of the intron found in
-ACT1/YFL039C will be YFL039C. The parent feature of YFL039C is
+ACT1/YFL039C will be YFL039C. The parent feature of YFL039C is
-chromosome 6.
+chromosome 6.
-
+
-3. The coordinates of all features are in chromosomal coordinates.
+3. The coordinates of all features are in chromosomal coordinates.
-
+
-
+
-Columns within SGD_features.tab:
+Columns within SGD_features.tab:
-
+
-1.   Primary SGDID (mandatory)
+1.   Primary SGDID (mandatory)
-2.   Feature type (mandatory)
+2.   Feature type (mandatory)
-3.   Feature qualifier (optional)
+3.   Feature qualifier (optional)
-4.   Feature name (optional)
+4.   Feature name (optional)
-5.   Standard gene name (optional)
+5.   Standard gene name (optional)
-6.   Alias (optional, multiples separated by |)
+6.   Alias (optional, multiples separated by |)
-7.   Parent feature name (optional)
+7.   Parent feature name (optional)
-8.   Secondary SGDID (optional, multiples separated by |)
+8.   Secondary SGDID (optional, multiples separated by |)
-9.   Chromosome (optional)
+9.   Chromosome (optional)
-10.  Start_coordinate (optional)
+10.  Start_coordinate (optional)
-11.  Stop_coordinate (optional)
+11.  Stop_coordinate (optional)
-12.  Strand (optional)
+12.  Strand (optional)
-13.  Genetic position (optional)
+13.  Genetic position (optional)
-14.  Coordinate version (optional)
+14.  Coordinate version (optional)
-15.  Sequence version (optional)
+15.  Sequence version (optional)
-16.  Description (optional)
+16.  Description (optional)
-
+
-Note that "chromosome 17" is the mitochondrial chromosome.
+Note that "chromosome 17" is the mitochondrial chromosome.
-
+
-The SGD_features.tab file is complemented by GFF3 file saccharomyces_cerevisiae.gff
+The SGD_features.tab file is complemented by GFF3 file saccharomyces_cerevisiae.gff
-
+
--- a/data/SGD_features.tab
+++ b/data/SGD_features.tab
--- a/data/Species.csv
+++ b/data/Species.csv
--- a/data/intogen-KRAS-distribution-data.tsv
+++ b/data/intogen-KRAS-distribution-data.tsv
@ -1,179 +1,179 @@
-MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT
+MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT
-93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000311936
+93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000311936
-93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000557334
+93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000557334
-93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000256078
+93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000256078
-93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000556131
+93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000556131
-86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000311936
+86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000311936
-86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000557334
+86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000557334
-86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000556131
+86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000556131
-86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000256078
+86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000256078
-72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000556131
+72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000556131
-72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000256078
+72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000256078
-72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000557334
+72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000557334
-72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000311936
+72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000311936
-63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000557334
+63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000557334
-63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000556131
+63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000556131
-63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000256078
+63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000256078
-63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000311936
+63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000311936
-36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000311936
+36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000311936
-36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000256078
+36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000256078
-36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000556131
+36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000556131
-36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000557334
+36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000557334
-24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000256078
+24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000256078
-24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000311936
+24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000311936
-24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000557334
+24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000557334
-24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000556131
+24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000556131
-23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000556131
+23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000556131
-23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000311936
+23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000311936
-23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000557334
+23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000557334
-23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000256078
+23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000256078
-16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000556131
+16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000556131
-16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000311936
+16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000311936
-16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000557334
+16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000557334
-16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000256078
+16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000256078
-13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000311936
+13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000311936
-13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000556131
+13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000556131
-13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000557334
+13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000557334
-13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000256078
+13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000256078
-11	+	missense_variant	25380275	11	T	11	G	Q/H	12	11	61	ENSG00000133703	ENST00000311936
+11	+	missense_variant	25380275	11	T	11	G	Q/H	12	11	61	ENSG00000133703	ENST00000311936
-11	+	missense_variant	25380275	11	T	11	G	Q/H	12	11	61	ENSG00000133703	ENST00000256078
+11	+	missense_variant	25380275	11	T	11	G	Q/H	12	11	61	ENSG00000133703	ENST00000256078
-10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000557334
+10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000557334
-10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000311936
+10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000311936
-10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000556131
+10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000556131
-10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000256078
+10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000256078
-9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000557334
+9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000557334
-9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000556131
+9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000556131
-9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000311936
+9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000311936
-9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000256078
+9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000256078
-7	+	missense_variant	25380276	7	T	7	A	Q/L	12	7	61	ENSG00000133703	ENST00000256078
+7	+	missense_variant	25380276	7	T	7	A	Q/L	12	7	61	ENSG00000133703	ENST00000256078
-7	+	missense_variant	25378562	7	C	7	T	A/T	12	7	146	ENSG00000133703	ENST00000256078
+7	+	missense_variant	25378562	7	C	7	T	A/T	12	7	146	ENSG00000133703	ENST00000256078
-7	+	missense_variant	25378562	7	C	7	T	A/T	12	7	146	ENSG00000133703	ENST00000311936
+7	+	missense_variant	25378562	7	C	7	T	A/T	12	7	146	ENSG00000133703	ENST00000311936
-7	+	missense_variant	25380276	7	T	7	A	Q/L	12	7	61	ENSG00000133703	ENST00000311936
+7	+	missense_variant	25380276	7	T	7	A	Q/L	12	7	61	ENSG00000133703	ENST00000311936
-5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000311936
+5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000311936
-5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000256078
+5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000256078
-5	+	missense_variant	25380276	5	T	5	C	Q/R	12	5	61	ENSG00000133703	ENST00000311936
+5	+	missense_variant	25380276	5	T	5	C	Q/R	12	5	61	ENSG00000133703	ENST00000311936
-5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000557334
+5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000557334
-5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000556131
+5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000556131
-5	+	missense_variant	25380276	5	T	5	C	Q/R	12	5	61	ENSG00000133703	ENST00000256078
+5	+	missense_variant	25380276	5	T	5	C	Q/R	12	5	61	ENSG00000133703	ENST00000256078
-4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000256078
+4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000256078
-4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000557334
+4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000557334
-4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000311936
+4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000311936
-4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000556131
+4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000556131
-3	+	missense_variant	25380277	3	G	3	T	Q/K	12	3	61	ENSG00000133703	ENST00000256078
+3	+	missense_variant	25380277	3	G	3	T	Q/K	12	3	61	ENSG00000133703	ENST00000256078
-3	+	missense_variant	25380275	3	T	3	A	Q/H	12	3	61	ENSG00000133703	ENST00000256078
+3	+	missense_variant	25380275	3	T	3	A	Q/H	12	3	61	ENSG00000133703	ENST00000256078
-3	+	missense_variant	25378647	3	T	3	G	K/N	12	3	117	ENSG00000133703	ENST00000256078
+3	+	missense_variant	25378647	3	T	3	G	K/N	12	3	117	ENSG00000133703	ENST00000256078
-3	+	missense_variant	25380275	3	T	3	A	Q/H	12	3	61	ENSG00000133703	ENST00000311936
+3	+	missense_variant	25380275	3	T	3	A	Q/H	12	3	61	ENSG00000133703	ENST00000311936
-3	+	missense_variant	25378647	3	T	3	G	K/N	12	3	117	ENSG00000133703	ENST00000311936
+3	+	missense_variant	25378647	3	T	3	G	K/N	12	3	117	ENSG00000133703	ENST00000311936
-3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000256078
+3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000256078
-3	-	missense_variant	25380275	3	A	3	C	Q/H	12	3	61	ENSG00000133703	ENST00000256078
+3	-	missense_variant	25380275	3	A	3	C	Q/H	12	3	61	ENSG00000133703	ENST00000256078
-3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000256078
+3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000256078
-3	+	missense_variant	25380277	3	G	3	T	Q/K	12	3	61	ENSG00000133703	ENST00000311936
+3	+	missense_variant	25380277	3	G	3	T	Q/K	12	3	61	ENSG00000133703	ENST00000311936
-3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000311936
+3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000311936
-3	-	missense_variant	25380275	3	A	3	C	Q/H	12	3	61	ENSG00000133703	ENST00000311936
+3	-	missense_variant	25380275	3	A	3	C	Q/H	12	3	61	ENSG00000133703	ENST00000311936
-3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000311936
+3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000311936
-3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000556131
+3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000556131
-3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000557334
+3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000557334
-3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000557334
+3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000557334
-3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000556131
+3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000556131
-2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000556131
+2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000556131
-2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000311936
+2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000311936
-2	-	missense_variant	25380276	2	A	2	G	Q/R	12	2	61	ENSG00000133703	ENST00000311936
+2	-	missense_variant	25380276	2	A	2	G	Q/R	12	2	61	ENSG00000133703	ENST00000311936
-2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000557334
+2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000557334
-2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000556131
+2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000556131
-2	-	missense_variant	25378562	2	G	2	A	A/T	12	2	146	ENSG00000133703	ENST00000311936
+2	-	missense_variant	25378562	2	G	2	A	A/T	12	2	146	ENSG00000133703	ENST00000311936
-2	-	missense_variant	25378562	2	G	2	A	A/T	12	2	146	ENSG00000133703	ENST00000256078
+2	-	missense_variant	25378562	2	G	2	A	A/T	12	2	146	ENSG00000133703	ENST00000256078
-2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000256078
+2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000256078
-2	-	missense_variant	25380276	2	A	2	G	Q/R	12	2	61	ENSG00000133703	ENST00000256078
+2	-	missense_variant	25380276	2	A	2	G	Q/R	12	2	61	ENSG00000133703	ENST00000256078
-2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000311936
+2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000311936
-2	+	missense_variant	25378561	2	G	2	A	A/V	12	2	146	ENSG00000133703	ENST00000311936
+2	+	missense_variant	25378561	2	G	2	A	A/V	12	2	146	ENSG00000133703	ENST00000311936
-2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000256078
+2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000256078
-2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000557334
+2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000557334
-2	+	missense_variant	25378561	2	G	2	A	A/V	12	2	146	ENSG00000133703	ENST00000256078
+2	+	missense_variant	25378561	2	G	2	A	A/V	12	2	146	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000557334
+1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000557334
-1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000557334
+1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000557334
-1	-	missense_variant	25362743	1	A	1	T	S/C	12	1	72	ENSG00000133703	ENST00000557334
+1	-	missense_variant	25362743	1	A	1	T	S/C	12	1	72	ENSG00000133703	ENST00000557334
-1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000557334
+1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000557334
-1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000557334
+1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000557334
-1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000557334
+1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000557334
-0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000557334
+0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000557334
-1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000557334
+1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000557334
-0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000557334
+0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000557334
-0	+	synonymous_variant	25380278	0	A	1	G	-	12	1	60	ENSG00000133703	ENST00000311936
+0	+	synonymous_variant	25380278	0	A	1	G	-	12	1	60	ENSG00000133703	ENST00000311936
-1	-	missense_variant	25378647	1	A	1	T	K/N	12	1	117	ENSG00000133703	ENST00000256078
+1	-	missense_variant	25378647	1	A	1	T	K/N	12	1	117	ENSG00000133703	ENST00000256078
-1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000256078
+1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000256078
-1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000256078
+1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25362743	1	A	1	G	C/R	12	1	185	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25362743	1	A	1	G	C/R	12	1	185	ENSG00000133703	ENST00000311936
-0	+	inframe_deletion	25362744	0	CTTTGT	1	-	-	12	1	183-184	ENSG00000133703	ENST00000311936
+0	+	inframe_deletion	25362744	0	CTTTGT	1	-	-	12	1	183-184	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25378557	1	C	1	G	K/N	12	1	147	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25378557	1	C	1	G	K/N	12	1	147	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25378562	1	C	1	G	A/P	12	1	146	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25378562	1	C	1	G	A/P	12	1	146	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25378562	1	C	1	T	A/T	12	1	146.0	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25378562	1	C	1	T	A/T	12	1	146.0	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25378594	1	C	1	G	R/T	12	1	135	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25378594	1	C	1	G	R/T	12	1	135	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25378645	1	C	1	G	C/S	12	1	118	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25378645	1	C	1	G	C/S	12	1	118	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25380240	1	C	1	A	R/M	12	1	73.0	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25380240	1	C	1	A	R/M	12	1	73.0	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25380254	1	C	1	A	R/S	12	1	68	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25380254	1	C	1	A	R/S	12	1	68	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25380271	1	C	1	T	E/K	12	1	63.0	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25380271	1	C	1	T	E/K	12	1	63.0	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25380274	1	C	1	T	E/K	12	1	62	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25380274	1	C	1	T	E/K	12	1	62	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25380275	1	T	1	G	Q/H	12	1	61.0	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25380275	1	T	1	G	Q/H	12	1	61.0	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25380282	1	G	1	C	A/G	12	1	59	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25380282	1	G	1	C	A/G	12	1	59	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25380271	1	C	1	T	E/K	12	1	63.0	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25380271	1	C	1	T	E/K	12	1	63.0	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25380274	1	C	1	T	E/K	12	1	62	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25380274	1	C	1	T	E/K	12	1	62	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25380275	1	T	1	G	Q/H	12	1	61.0	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25380275	1	T	1	G	Q/H	12	1	61.0	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25380277	1	GA	1	TT	GQ/GK	12	1	60-61	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25380277	1	GA	1	TT	GQ/GK	12	1	60-61	ENSG00000133703	ENST00000256078
-0	+	synonymous_variant	25380278	0	A	1	G	-	12	1	60	ENSG00000133703	ENST00000256078
+0	+	synonymous_variant	25380278	0	A	1	G	-	12	1	60	ENSG00000133703	ENST00000256078
-0	+	synonymous_variant	25380278	0	A	1	T	-	12	1	60	ENSG00000133703	ENST00000256078
+0	+	synonymous_variant	25380278	0	A	1	T	-	12	1	60	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25380282	1	G	1	T	A/E	12	1	59	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25380282	1	G	1	T	A/E	12	1	59	ENSG00000133703	ENST00000256078
-0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000256078
+0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000256078
-0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000256078
+0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000256078
-0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000256078
+0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25380277	1	GA	1	TT	GQ/GK	12	1	60-61	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25380277	1	GA	1	TT	GQ/GK	12	1	60-61	ENSG00000133703	ENST00000311936
-0	+	synonymous_variant	25380278	0	A	1	T	-	12	1	60	ENSG00000133703	ENST00000311936
+0	+	synonymous_variant	25380278	0	A	1	T	-	12	1	60	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25380240	1	C	1	A	R/M	12	1	73.0	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25380240	1	C	1	A	R/M	12	1	73.0	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25380282	1	G	1	C	A/G	12	1	59	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25380282	1	G	1	C	A/G	12	1	59	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000556131
+1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000556131
-0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000556131
+0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000556131
-1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000556131
+1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000556131
-0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000556131
+0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000556131
-1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000556131
+1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000556131
-1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000556131
+1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000556131
-1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000556131
+1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000556131
-1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000556131
+1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000556131
-1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000556131
+1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000556131
-1	+	missense_variant	25362743	1	A	1	G	C/R	12	1	72	ENSG00000133703	ENST00000557334
+1	+	missense_variant	25362743	1	A	1	G	C/R	12	1	72	ENSG00000133703	ENST00000557334
-0	+	inframe_deletion	25362744	0	CTTTGT	1	-	-	12	1	70-71	ENSG00000133703	ENST00000557334
+0	+	inframe_deletion	25362744	0	CTTTGT	1	-	-	12	1	70-71	ENSG00000133703	ENST00000557334
-1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000557334
+1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000557334
-1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000557334
+1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000557334
-0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000557334
+0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000557334
-1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000557334
+1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000557334
-0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000556131
+0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000556131
-1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000556131
+1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000556131
-1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000556131
+1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000556131
-1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25380282	1	G	1	T	A/E	12	1	59	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25380282	1	G	1	T	A/E	12	1	59	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000311936
-0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000311936
+0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000311936
-0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000311936
+0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000311936
-0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000311936
+0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000311936
-1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000311936
+1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000311936
+1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000311936
-1	-	missense_variant	25362743	1	A	1	T	S/C	12	1	185	ENSG00000133703	ENST00000311936
+1	-	missense_variant	25362743	1	A	1	T	S/C	12	1	185	ENSG00000133703	ENST00000311936
-1	-	missense_variant	25378647	1	A	1	T	K/N	12	1	117	ENSG00000133703	ENST00000311936
+1	-	missense_variant	25378647	1	A	1	T	K/N	12	1	117	ENSG00000133703	ENST00000311936
-1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000311936
+1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000311936
-1	+	missense_variant	25380254	1	C	1	A	R/S	12	1	68	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25380254	1	C	1	A	R/S	12	1	68	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25378645	1	C	1	G	C/S	12	1	118	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25378645	1	C	1	G	C/S	12	1	118	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25378594	1	C	1	G	R/T	12	1	135	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25378594	1	C	1	G	R/T	12	1	135	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25368454	1	C	1	T	R/Q	12	1	164	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25368454	1	C	1	T	R/Q	12	1	164	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25368473	1	T	1	C	T/A	12	1	158	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25368473	1	T	1	C	T/A	12	1	158	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25378557	1	C	1	G	K/N	12	1	147	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25378557	1	C	1	G	K/N	12	1	147	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25378562	1	C	1	G	A/P	12	1	146	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25378562	1	C	1	G	A/P	12	1	146	ENSG00000133703	ENST00000256078
-1	+	missense_variant	25378562	1	C	1	T	A/T	12	1	146.0	ENSG00000133703	ENST00000256078
+1	+	missense_variant	25378562	1	C	1	T	A/T	12	1	146.0	ENSG00000133703	ENST00000256078
--- a/data/intogen-OR1A1-distribution-data.tsv
+++ b/data/intogen-OR1A1-distribution-data.tsv
@ -1,49 +1,49 @@
-MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT
+MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT
-2	+	missense_variant	3119330	2	G	2	A	R/Q	17	2	139	ENSG00000172146	ENST00000304094
+2	+	missense_variant	3119330	2	G	2	A	R/Q	17	2	139	ENSG00000172146	ENST00000304094
-2	+	missense_variant	3119138	2	C	2	T	S/L	17	2	75	ENSG00000172146	ENST00000304094
+2	+	missense_variant	3119138	2	C	2	T	S/L	17	2	75	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119772	0	C	2	T	-	17	2	286	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119772	0	C	2	T	-	17	2	286	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119791	1	C	1	T	R/W	17	1	293	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119791	1	C	1	T	R/W	17	1	293	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119799	1	G	1	A	M/I	17	1	295	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119799	1	G	1	A	M/I	17	1	295	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119805	0	T	1	C	-	17	1	297	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119805	0	T	1	C	-	17	1	297	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119823	0	C	1	T	-	17	1	303	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119823	0	C	1	T	-	17	1	303	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119786	1	G	1	A	R/K	17	1	291	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119786	1	G	1	A	R/K	17	1	291	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119744	1	C	1	G	T/R	17	1	277	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119744	1	C	1	G	T/R	17	1	277	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119691	0	C	1	T	-	17	1	259	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119691	0	C	1	T	-	17	1	259	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119589	0	C	1	T	-	17	1	225	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119589	0	C	1	T	-	17	1	225	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119408	1	G	1	A	S/N	17	1	165	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119408	1	G	1	A	S/N	17	1	165	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119431	1	G	1	A	E/K	17	1	173	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119431	1	G	1	A	E/K	17	1	173	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119462	1	C	1	T	P/L	17	1	183	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119462	1	C	1	T	P/L	17	1	183	ENSG00000172146	ENST00000304094
-1	+	stop_gained	3119514	1	C	1	G	-	17	1	200	ENSG00000172146	ENST00000304094
+1	+	stop_gained	3119514	1	C	1	G	-	17	1	200	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119530	1	T	1	G	F/V	17	1	206	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119530	1	T	1	G	F/V	17	1	206	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119581	1	A	1	G	T/A	17	1	223	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119581	1	A	1	G	T/A	17	1	223	ENSG00000172146	ENST00000304094
-1	+	stop_gained	3119590	1	C	1	T	-	17	1	226	ENSG00000172146	ENST00000304094
+1	+	stop_gained	3119590	1	C	1	T	-	17	1	226	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119679	1	G	1	T	M/I	17	1	255	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119679	1	G	1	T	M/I	17	1	255	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119592	0	G	1	A	-	17	1	226	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119592	0	G	1	A	-	17	1	226	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119596	1	C	1	T	P/S	17	1	228	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119596	1	C	1	T	P/S	17	1	228	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119610	0	C	1	T	-	17	1	232	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119610	0	C	1	T	-	17	1	232	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119627	1	C	1	T	S/F	17	1	238	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119627	1	C	1	T	S/F	17	1	238	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119640	0	C	1	A	-	17	1	242	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119640	0	C	1	A	-	17	1	242	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119672	1	C	1	T	T/I	17	1	253	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119672	1	C	1	T	T/I	17	1	253	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119395	1	C	1	A	L/M	17	1	161	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119395	1	C	1	A	L/M	17	1	161	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119403	0	A	1	G	-	17	1	163	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119403	0	A	1	G	-	17	1	163	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119386	1	C	1	T	P/S	17	1	158	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119386	1	C	1	T	P/S	17	1	158	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119289	0	C	1	A	-	17	1	125	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119289	0	C	1	A	-	17	1	125	ENSG00000172146	ENST00000304094
-1	+	stop_gained	3118972	1	C	1	T	-	17	1	20	ENSG00000172146	ENST00000304094
+1	+	stop_gained	3118972	1	C	1	T	-	17	1	20	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3118978	1	G	1	A	E/K	17	1	22	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3118978	1	G	1	A	E/K	17	1	22	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3118986	1	A	1	C	E/D	17	1	24	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3118986	1	A	1	C	E/D	17	1	24	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119002	1	C	1	T	L/F	17	1	30	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119002	1	C	1	T	L/F	17	1	30	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119029	0	T	1	C	-	17	1	39	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119029	0	T	1	C	-	17	1	39	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119074	1	C	1	T	R/C	17	1	54	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119074	1	C	1	T	R/C	17	1	54	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119075	1	G	1	A	R/H	17	1	54	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119075	1	G	1	A	R/H	17	1	54	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119076	0	C	1	T	-	17	1	54	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119076	0	C	1	T	-	17	1	54	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119115	0	C	1	T	-	17	1	67	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119115	0	C	1	T	-	17	1	67	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119139	0	G	1	A	-	17	1	75	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119139	0	G	1	A	-	17	1	75	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3119187	0	C	1	T	-	17	1	91	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3119187	0	C	1	T	-	17	1	91	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119210	1	C	1	T	T/M	17	1	99	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119210	1	C	1	T	T/M	17	1	99	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119217	1	G	1	A	M/I	17	1	101	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119217	1	G	1	A	M/I	17	1	101	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119264	1	C	1	T	A/V	17	1	117	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119264	1	C	1	T	A/V	17	1	117	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3119269	1	G	1	A	A/T	17	1	119	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3119269	1	G	1	A	A/T	17	1	119	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3118961	1	G	1	A	G/E	17	1	16	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3118961	1	G	1	A	G/E	17	1	16	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3118956	0	C	1	A	-	17	1	14	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3118956	0	C	1	A	-	17	1	14	ENSG00000172146	ENST00000304094
-0	+	synonymous_variant	3118944	0	G	1	A	-	17	1	10	ENSG00000172146	ENST00000304094
+0	+	synonymous_variant	3118944	0	G	1	A	-	17	1	10	ENSG00000172146	ENST00000304094
-1	+	missense_variant	3118928	1	A	1	C	N/T	17	1	5	ENSG00000172146	ENST00000304094
+1	+	missense_variant	3118928	1	A	1	C	N/T	17	1	5	ENSG00000172146	ENST00000304094
--- a/data/intogen-PTPN11-distribution-data.tsv
+++ b/data/intogen-PTPN11-distribution-data.tsv
@ -1,113 +1,113 @@
-MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT
+MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT
-5	+	missense_variant	112926888	5	G	5	T	G/V	12	5	503	ENSG00000179295	ENST00000351677
+5	+	missense_variant	112926888	5	G	5	T	G/V	12	5	503	ENSG00000179295	ENST00000351677
-4	+	missense_variant	112926270	4	C	4	T	T/M	12	4	468	ENSG00000179295	ENST00000351677
+4	+	missense_variant	112926270	4	C	4	T	T/M	12	4	468	ENSG00000179295	ENST00000351677
-3	+	missense_variant	112888198	3	G	3	A	A/T	12	3	72	ENSG00000179295	ENST00000392597
+3	+	missense_variant	112888198	3	G	3	A	A/T	12	3	72	ENSG00000179295	ENST00000392597
-3	+	missense_variant	112888198	3	G	3	A	A/T	12	3	72	ENSG00000179295	ENST00000351677
+3	+	missense_variant	112888198	3	G	3	A	A/T	12	3	72	ENSG00000179295	ENST00000351677
-2	+	missense_variant	112926910	2	G	2	C	Q/H	12	2	510	ENSG00000179295	ENST00000351677
+2	+	missense_variant	112926910	2	G	2	C	Q/H	12	2	510	ENSG00000179295	ENST00000351677
-2	+	missense_variant	112926909	2	A	2	T	Q/L	12	2	510	ENSG00000179295	ENST00000351677
+2	+	missense_variant	112926909	2	A	2	T	Q/L	12	2	510	ENSG00000179295	ENST00000351677
-2	+	missense_variant	112926900	2	C	2	A	T/K	12	2	507	ENSG00000179295	ENST00000351677
+2	+	missense_variant	112926900	2	C	2	A	T/K	12	2	507	ENSG00000179295	ENST00000351677
-2	+	missense_variant	112891006	2	C	2	T	H/Y	12	2	114	ENSG00000179295	ENST00000392597
+2	+	missense_variant	112891006	2	C	2	T	H/Y	12	2	114	ENSG00000179295	ENST00000392597
-2	+	missense_variant	112888210	2	G	2	A	E/K	12	2	76	ENSG00000179295	ENST00000392597
+2	+	missense_variant	112888210	2	G	2	A	E/K	12	2	76	ENSG00000179295	ENST00000392597
-2	+	missense_variant	112888199	2	C	2	T	A/V	12	2	72	ENSG00000179295	ENST00000392597
+2	+	missense_variant	112888199	2	C	2	T	A/V	12	2	72	ENSG00000179295	ENST00000392597
-2	+	missense_variant	112888199	2	C	2	A	A/D	12	2	72	ENSG00000179295	ENST00000392597
+2	+	missense_variant	112888199	2	C	2	A	A/D	12	2	72	ENSG00000179295	ENST00000392597
-2	+	missense_variant	112891006	2	C	2	T	H/Y	12	2	114	ENSG00000179295	ENST00000351677
+2	+	missense_variant	112891006	2	C	2	T	H/Y	12	2	114	ENSG00000179295	ENST00000351677
-2	+	missense_variant	112888210	2	G	2	A	E/K	12	2	76	ENSG00000179295	ENST00000351677
+2	+	missense_variant	112888210	2	G	2	A	E/K	12	2	76	ENSG00000179295	ENST00000351677
-2	+	missense_variant	112888199	2	C	2	T	A/V	12	2	72	ENSG00000179295	ENST00000351677
+2	+	missense_variant	112888199	2	C	2	T	A/V	12	2	72	ENSG00000179295	ENST00000351677
-2	+	missense_variant	112888199	2	C	2	A	A/D	12	2	72	ENSG00000179295	ENST00000351677
+2	+	missense_variant	112888199	2	C	2	A	A/D	12	2	72	ENSG00000179295	ENST00000351677
-0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	82	ENSG00000179295	ENST00000530818
+0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	82	ENSG00000179295	ENST00000530818
-1	+	missense_variant	112910837	1	C	1	G	I/M	12	1	282	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112910837	1	C	1	G	I/M	12	1	282	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112910844	1	T	1	G	F/V	12	1	285.0	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112910844	1	T	1	G	F/V	12	1	285.0	ENSG00000179295	ENST00000392597
-0	+	synonymous_variant	112915507	0	A	1	G	-	12	1	302	ENSG00000179295	ENST00000392597
+0	+	synonymous_variant	112915507	0	A	1	G	-	12	1	302	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112915523	1	A	1	G	N/D	12	1	308	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112915523	1	A	1	G	N/D	12	1	308	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112915743	1	A	1	G	N/S	12	1	339	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112915743	1	A	1	G	N/S	12	1	339	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112919908	1	T	1	G	Y/D	12	1	375	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112919908	1	T	1	G	Y/D	12	1	375	ENSG00000179295	ENST00000392597
-1	+	frameshift_variant	112920002	1	-	1	T	-	12	1	406	ENSG00000179295	ENST00000392597
+1	+	frameshift_variant	112920002	1	-	1	T	-	12	1	406	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112924286	1	C	1	T	T/M	12	1	411	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112924286	1	C	1	T	T/M	12	1	411	ENSG00000179295	ENST00000392597
-1	+	stop_gained	112924308	1	C	1	A	-	12	1	418	ENSG00000179295	ENST00000392597
+1	+	stop_gained	112924308	1	C	1	A	-	12	1	418	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112924331	1	A	1	T	H/L	12	1	426	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112924331	1	A	1	T	H/L	12	1	426	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112924336	1	G	1	A	V/M	12	1	428	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112924336	1	G	1	A	V/M	12	1	428	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	26	ENSG00000179295	ENST00000530818
+1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	26	ENSG00000179295	ENST00000530818
-0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	34	ENSG00000179295	ENST00000530818
+0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	34	ENSG00000179295	ENST00000530818
-1	+	stop_gained	112893784	1	G	1	T	-	12	1	70	ENSG00000179295	ENST00000530818
+1	+	stop_gained	112893784	1	G	1	T	-	12	1	70	ENSG00000179295	ENST00000530818
-0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	74	ENSG00000179295	ENST00000530818
+0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	74	ENSG00000179295	ENST00000530818
-1	+	missense_variant	112910775	1	C	1	T	L/F	12	1	262	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112910775	1	C	1	T	L/F	12	1	262	ENSG00000179295	ENST00000392597
-0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	237	ENSG00000179295	ENST00000392597
+0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	237	ENSG00000179295	ENST00000392597
-0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	231	ENSG00000179295	ENST00000392597
+0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	231	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112888165	1	G	1	T	D/Y	12	1	61	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112888165	1	G	1	T	D/Y	12	1	61	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69.0	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69.0	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112888195	1	T	1	C	F/L	12	1	71	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112888195	1	T	1	C	F/L	12	1	71	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112888197	1	T	1	A	F/L	12	1	71	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112888197	1	T	1	A	F/L	12	1	71	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76.0	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76.0	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112891015	1	C	1	T	L/F	12	1	117	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112891015	1	C	1	T	L/F	12	1	117	ENSG00000179295	ENST00000392597
-0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	229	ENSG00000179295	ENST00000392597
+0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	229	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112891073	1	T	1	A	L/H	12	1	136	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112891073	1	T	1	A	L/H	12	1	136	ENSG00000179295	ENST00000392597
-0	+	synonymous_variant	112891116	0	T	1	C	-	12	1	150	ENSG00000179295	ENST00000392597
+0	+	synonymous_variant	112891116	0	T	1	C	-	12	1	150	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112891129	1	G	1	T	D/Y	12	1	155	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112891129	1	G	1	T	D/Y	12	1	155	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	181	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	181	ENSG00000179295	ENST00000392597
-0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	189	ENSG00000179295	ENST00000392597
+0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	189	ENSG00000179295	ENST00000392597
-1	+	stop_gained	112893784	1	G	1	T	-	12	1	225	ENSG00000179295	ENST00000392597
+1	+	stop_gained	112893784	1	G	1	T	-	12	1	225	ENSG00000179295	ENST00000392597
-0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	76	ENSG00000179295	ENST00000530818
+0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	76	ENSG00000179295	ENST00000530818
-1	+	missense_variant	112888163	1	G	1	T	G/V	12	1	60	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112888163	1	G	1	T	G/V	12	1	60	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112888165	1	G	1	A	D/N	12	1	61	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112888165	1	G	1	A	D/N	12	1	61	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112888162	1	G	1	C	G/R	12	1	60	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112888162	1	G	1	C	G/R	12	1	60	ENSG00000179295	ENST00000392597
-0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	237	ENSG00000179295	ENST00000351677
+0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	237	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112888165	1	G	1	T	D/Y	12	1	61	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112888165	1	G	1	T	D/Y	12	1	61	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69.0	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69.0	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112888195	1	T	1	C	F/L	12	1	71	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112888195	1	T	1	C	F/L	12	1	71	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112888197	1	T	1	A	F/L	12	1	71	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112888197	1	T	1	A	F/L	12	1	71	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76.0	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76.0	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112891015	1	C	1	T	L/F	12	1	117	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112891015	1	C	1	T	L/F	12	1	117	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112891073	1	T	1	A	L/H	12	1	136	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112891073	1	T	1	A	L/H	12	1	136	ENSG00000179295	ENST00000351677
-0	+	synonymous_variant	112891116	0	T	1	C	-	12	1	150	ENSG00000179295	ENST00000351677
+0	+	synonymous_variant	112891116	0	T	1	C	-	12	1	150	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112891129	1	G	1	T	D/Y	12	1	155	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112891129	1	G	1	T	D/Y	12	1	155	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	181	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	181	ENSG00000179295	ENST00000351677
-0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	189	ENSG00000179295	ENST00000351677
+0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	189	ENSG00000179295	ENST00000351677
-1	+	stop_gained	112893784	1	G	1	T	-	12	1	225	ENSG00000179295	ENST00000351677
+1	+	stop_gained	112893784	1	G	1	T	-	12	1	225	ENSG00000179295	ENST00000351677
-0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	229	ENSG00000179295	ENST00000351677
+0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	229	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112888165	1	G	1	A	D/N	12	1	61	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112888165	1	G	1	A	D/N	12	1	61	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112888163	1	G	1	T	G/V	12	1	60	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112888163	1	G	1	T	G/V	12	1	60	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112888162	1	G	1	C	G/R	12	1	60	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112888162	1	G	1	C	G/R	12	1	60	ENSG00000179295	ENST00000351677
-0	+	synonymous_variant	112888161	0	T	1	C	-	12	1	59	ENSG00000179295	ENST00000351677
+0	+	synonymous_variant	112888161	0	T	1	C	-	12	1	59	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112884103	1	G	1	A	G/D	12	1	13	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112884103	1	G	1	A	G/D	12	1	13	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112888139	1	C	1	G	T/S	12	1	52	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112888139	1	C	1	G	T/S	12	1	52	ENSG00000179295	ENST00000351677
-0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	231	ENSG00000179295	ENST00000351677
+0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	231	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112910775	1	C	1	T	L/F	12	1	262	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112910775	1	C	1	T	L/F	12	1	262	ENSG00000179295	ENST00000351677
-0	+	synonymous_variant	112888161	0	T	1	C	-	12	1	59	ENSG00000179295	ENST00000392597
+0	+	synonymous_variant	112888161	0	T	1	C	-	12	1	59	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112910837	1	C	1	G	I/M	12	1	282	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112910837	1	C	1	G	I/M	12	1	282	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112926887	1	G	1	C	G/R	12	1	503	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112926887	1	G	1	C	G/R	12	1	503	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112926908	1	C	1	G	Q/E	12	1	510.0	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112926908	1	C	1	G	Q/E	12	1	510.0	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112939963	1	G	1	C	G/R	12	1	539	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112939963	1	G	1	C	G/R	12	1	539	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112939970	1	A	1	T	E/V	12	1	541	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112939970	1	A	1	T	E/V	12	1	541	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112939981	1	A	1	C	I/L	12	1	545	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112939981	1	A	1	C	I/L	12	1	545	ENSG00000179295	ENST00000351677
-0	+	synonymous_variant	112939993	0	C	1	T	-	12	1	549	ENSG00000179295	ENST00000351677
+0	+	synonymous_variant	112939993	0	C	1	T	-	12	1	549	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112939999	1	G	1	A	D/N	12	1	551	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112939999	1	G	1	A	D/N	12	1	551	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112940012	1	G	1	A	G/E	12	1	555	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112940012	1	G	1	A	G/E	12	1	555	ENSG00000179295	ENST00000351677
-0	+	synonymous_variant	112940025	0	T	1	C	-	12	1	559	ENSG00000179295	ENST00000351677
+0	+	synonymous_variant	112940025	0	T	1	C	-	12	1	559	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112940027	1	T	1	C	L/P	12	1	560	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112940027	1	T	1	C	L/P	12	1	560	ENSG00000179295	ENST00000351677
-0	+	synonymous_variant	112940031	0	G	1	A	-	12	1	561	ENSG00000179295	ENST00000351677
+0	+	synonymous_variant	112940031	0	G	1	A	-	12	1	561	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112940036	1	G	1	T	C/F	12	1	563	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112940036	1	G	1	T	C/F	12	1	563	ENSG00000179295	ENST00000351677
-0	+	synonymous_variant	112940052	0	C	1	T	-	12	1	568	ENSG00000179295	ENST00000351677
+0	+	synonymous_variant	112940052	0	C	1	T	-	12	1	568	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112884103	1	G	1	A	G/D	12	1	13	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112884103	1	G	1	A	G/D	12	1	13	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112888139	1	C	1	G	T/S	12	1	52	ENSG00000179295	ENST00000392597
+1	+	missense_variant	112888139	1	C	1	G	T/S	12	1	52	ENSG00000179295	ENST00000392597
-1	+	missense_variant	112926885	1	C	1	T	S/L	12	1	502	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112926885	1	C	1	T	S/L	12	1	502	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112926884	1	T	1	C	S/P	12	1	502	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112926884	1	T	1	C	S/P	12	1	502	ENSG00000179295	ENST00000351677
-0	+	synonymous_variant	112926862	0	C	1	T	-	12	1	494	ENSG00000179295	ENST00000351677
+0	+	synonymous_variant	112926862	0	C	1	T	-	12	1	494	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112924286	1	C	1	T	T/M	12	1	411	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112924286	1	C	1	T	T/M	12	1	411	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112910844	1	T	1	G	F/V	12	1	285.0	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112910844	1	T	1	G	F/V	12	1	285.0	ENSG00000179295	ENST00000351677
-0	+	synonymous_variant	112915507	0	A	1	G	-	12	1	302	ENSG00000179295	ENST00000351677
+0	+	synonymous_variant	112915507	0	A	1	G	-	12	1	302	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112915523	1	A	1	G	N/D	12	1	308	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112915523	1	A	1	G	N/D	12	1	308	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112915743	1	A	1	G	N/S	12	1	339	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112915743	1	A	1	G	N/S	12	1	339	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112919908	1	T	1	G	Y/D	12	1	375	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112919908	1	T	1	G	Y/D	12	1	375	ENSG00000179295	ENST00000351677
-1	+	frameshift_variant	112920002	1	-	1	T	-	12	1	406	ENSG00000179295	ENST00000351677
+1	+	frameshift_variant	112920002	1	-	1	T	-	12	1	406	ENSG00000179295	ENST00000351677
-1	+	stop_gained	112924308	1	C	1	A	-	12	1	418	ENSG00000179295	ENST00000351677
+1	+	stop_gained	112924308	1	C	1	A	-	12	1	418	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112926852	1	C	1	T	P/L	12	1	491	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112926852	1	C	1	T	P/L	12	1	491	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112924331	1	A	1	T	H/L	12	1	426	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112924331	1	A	1	T	H/L	12	1	426	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112924336	1	G	1	A	V/M	12	1	428	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112924336	1	G	1	A	V/M	12	1	428	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112926248	1	G	1	A	A/T	12	1	461	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112926248	1	G	1	A	A/T	12	1	461	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112926249	1	C	1	G	A/G	12	1	461	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112926249	1	C	1	G	A/G	12	1	461	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112926291	1	TT	1	CA	L/P	12	1	475	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112926291	1	TT	1	CA	L/P	12	1	475	ENSG00000179295	ENST00000351677
-1	+	missense_variant	112926839	1	G	1	T	D/Y	12	1	487	ENSG00000179295	ENST00000351677
+1	+	missense_variant	112926839	1	G	1	T	D/Y	12	1	487	ENSG00000179295	ENST00000351677
--- a/data/refAPSES.mfa
+++ b/data/refAPSES.mfa
@ -1,39 +1,39 @@
->MBP1_ASPNI AN3154 XP_660758 Q5B8H6
+>MBP1_ASPNI AN3154 XP_660758 Q5B8H6
-VYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRI
+-VYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRI
-LEREVQKGVHEKVQGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDY
+LEREVQKGVHEKVQGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDY
-
+
->MBP1_BIPOR COCMIDRAFT_338 XP_007682304 W6ZM86
+>MBP1_BIPOR COCMIDRAFT_338 XP_007682304 W6ZM86
-KIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRI
+KIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRI
-LEREVQKGVHEKVQGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDY
+LEREVQKGVHEKVQGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDY
-
+
->MBP1_COPCI  - XP_001837394 A8NYC6
+>MBP1_COPCI  - XP_001837394 A8NYC6
-QIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRV
+QIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRV
-LEREVQKGEHEKVQGGYGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEF
+LEREVQKGEHEKVQGGYGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEF
-
+
->MBP1_CRYNE  - XP_569090 Q5KMQ9
+>MBP1_CRYNE  - XP_569090 Q5KMQ9
-DYVPTSVSPPPAPKHSVA--PPSKARRDKEKETGRTKATPSRTGPTSAAA
+DYVPTSVSPPPAPKHSVA--PPSKARRDKEKETGRTKATPSRTGPTSAAA
-LQAQAQLN-RAKMHDSTPDADASFRSFEERVSLTEDDSSSDTPSPVASV
+LQAQAQLN-RAKMHDSTPDADASFRSFEERVSLTEDDSSSDTPSPVASV
-
+
->MBP1_NEUCR Swi4 XP_955821 Q7RW59
+>MBP1_NEUCR Swi4 XP_955821 Q7RW59
-IYSATYSGIPVWEYQFGVDHVMRRRHDDWVNATHILKAAGFDKPARTRI
+-IYSATYSGIPVWEYQFGVDHVMRRRHDDWVNATHILKAAGFDKPARTRI
-LEREVQKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEF
+LEREVQKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEF
-
+
->MBP1_PUCGR PGTG_08863 XP_003327086 E3KED4
+>MBP1_PUCGR PGTG_08863 XP_003327086 E3KED4
-IYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRV
+-IYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRV
-LEREIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNF
+LEREIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNF
-
+
->MBP1_SACCE Mbp1 NP_010227 P39678
+>MBP1_SACCE Mbp1 NP_010227 P39678
-QIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRI
+QIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRI
-LEKEVLKETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDF
+LEKEVLKETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDF
-
+
->MBP1_SCHPO Res2 NP_593032 P41412
+>MBP1_SCHPO Res2 NP_593032 P41412
-VHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRV
+-VHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRV
-LERQVQIGAHEKVQGGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILS-
+LERQVQIGAHEKVQGGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILS-
-
+
->MBP1_USTMA UMAG_11222 XP_011392621 A0A0D1DP35
+>MBP1_USTMA UMAG_11222 XP_011392621 A0A0D1DP35
-IFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRV
+-IFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRV
-LEREIQKGIHEKVQGGYGKYQGTWIPLDVAIELAERYNIQGLLQPITSY
+LEREIQKGIHEKVQGGYGKYQGTWIPLDVAIELAERYNIQGLLQPITSY
-
+
->MBP1_WALME  - XP_006957051 I4YGC0
+>MBP1_WALME  - XP_006957051 I4YGC0
-IYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRI
+-IYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRI
-LEREVQKGTHEKVQGGYGKYQGTWIPMERSVELARQYRIELLLDPIINY
+LEREVQKGTHEKVQGGYGKYQGTWIPMERSVELARQYRIELLLDPIINY
--- a/data/refAPSES_PSI-BLAST.json
+++ b/data/refAPSES_PSI-BLAST.json
@ -1,490 +1,490 @@
-[
+[
-  { "name" : "68476_WALME",
+  { "name" : "68476_WALME",
-    "RefSeqID" : "XP_006957790",
+    "RefSeqID" : "XP_006957790",
-    "UniProtID" : "I4YDD8",
+    "UniProtID" : "I4YDD8",
-    "taxonomyID" : "671144",
+    "taxonomyID" : "671144",
-    "sequence" : [
+    "sequence" : [
-             "MKEEKEKTPPNNITGPPTPAQNILHSTPAAFGTAGTVGQGAGGFGSQLYQSPYVDSQQSVIGSPVTPAPLPKKATLKTPQ",
+             "MKEEKEKTPPNNITGPPTPAQNILHSTPAAFGTAGTVGQGAGGFGSQLYQSPYVDSQQSVIGSPVTPAPLPKKATLKTPQ",
-             "PRIYSAVYSGVGVYEAMIRGIAVMRRRADGYMNATQILKVAGVDKGRRTKILEREILAGLHEKIQGGYGKYQGTWIPFER",
+             "PRIYSAVYSGVGVYEAMIRGIAVMRRRADGYMNATQILKVAGVDKGRRTKILEREILAGLHEKIQGGYGKYQGTWIPFER",
-             "GRELALQYGCDHLLAPIFDFNPSVMQPSAGRSAKSPSKKRQNSIVLSPTQERHQSSIIALNTARASGIYVGGADDPNDDG",
+             "GRELALQYGCDHLLAPIFDFNPSVMQPSAGRSAKSPSKKRQNSIVLSPTQERHQSSIIALNTARASGIYVGGADDPNDDG",
-             "LSKKEKSPVKKSKYDEVPVNVSKRPYVPPPGTNAHILTRTQQSLTALFQQPTTNSDFIPEAVAILDTTSGALHPDLAIDE",
+             "LSKKEKSPVKKSKYDEVPVNVSKRPYVPPPGTNAHILTRTQQSLTALFQQPTTNSDFIPEAVAILDTTSGALHPDLAIDE",
-             "LGHTALHWAASLGRISNVQQLIKKGADMKRGNIEGETPLERSVLVNDNYDKKTFAYLLQELGSSIRVVDRTGRSILHHIA",
+             "LGHTALHWAASLGRISNVQQLIKKGADMKRGNIEGETPLERSVLVNDNYDKKTFAYLLQELGSSIRVVDRTGRSILHHIA",
-             "LIAAVNGRSMSAKYYMENVLEYIARYENGEFKSLVDLQDEHGDTALNISARVGNRNLVKMLVDAGANKTVVNKLGLKASD",
+             "LIAAVNGRSMSAKYYMENVLEYIARYENGEFKSLVDLQDEHGDTALNISARVGNRNLVKMLVDAGANKTVVNKLGLKASD",
-             "FGVEHETLNSVTGDEMLSNLQPPPPLNVDSSASVLENIHNLLNGITQQYTDETSGKNALLFEIQAELKQHSHELADVRKE",
+             "FGVEHETLNSVTGDEMLSNLQPPPPLNVDSSASVLENIHNLLNGITQQYTDETSGKNALLFEIQAELKQHSHELADVRKE",
-             "IQYWQNKATQMAEVDQKIKNINEAIENEKVQTWSLLGEANADKMEGIETSSSSNTSEIKIPTGDNEESLKQLRKLSKWLE",
+             "IQYWQNKATQMAEVDQKIKNINEAIENEKVQTWSLLGEANADKMEGIETSSSSNTSEIKIPTGDNEESLKQLRKLSKWLE",
-             "GTQKLTEERVASIDGLSASKEVKYKSIVSVCTGVPVNEVEGMLAQLLEAMESDANADLNKVQEFLAREC"]
+             "GTQKLTEERVASIDGLSASKEVKYKSIVSVCTGVPVNEVEGMLAQLLEAMESDANADLNKVQEFLAREC"]
-  },
+  },
-  { "name" : "00846_COPCI",
+  { "name" : "00846_COPCI",
-    "RefSeqID" : "XP_001831299",
+    "RefSeqID" : "XP_001831299",
-    "UniProtID" : "A8N8X1",
+    "UniProtID" : "A8N8X1",
-    "taxonomyID" : "240176",
+    "taxonomyID" : "240176",
-    "sequence" : [
+    "sequence" : [
-             "MQASTRPPGSNQPPVKIYNAVYSSVQVYECMVRGIAVMRRRNDSYVNATQILKVAGVDKGRRTKILEKEILPGKHEIVQG",
+             "MQASTRPPGSNQPPVKIYNAVYSSVQVYECMVRGIAVMRRRNDSYVNATQILKVAGVDKGRRTKILEKEILPGKHEIVQG",
-             "GYGKYQGTWIPLERGRDIAAQYGVAPLLSPLFDFQPSTNSLGALPVSTPGGTASPRPLSASSSYSSMGVAGQYIPSSIPS",
+             "GYGKYQGTWIPLERGRDIAAQYGVAPLLSPLFDFQPSTNSLGALPVSTPGGTASPRPLSASSSYSSMGVAGQYIPSSIPS",
-             "NLPPAPIMPGSALRLLNQGRAQGLFTPSTTSATLRPAGYHSPGPYGTSYAPSPQPQSSQTPPPGSGLKRNRSEAEVEGYH",
+             "NLPPAPIMPGSALRLLNQGRAQGLFTPSTTSATLRPAGYHSPGPYGTSYAPSPQPQSSQTPPPGSGLKRNRSEAEVEGYH",
-             "SQPHDVQMADAPPPNTASQPNEDNPSPAKRLRTDGSITTEPASSQGQWQQQQPLPYASQQRSGPGLSQLSGHNGHGSSRP",
+             "SQPHDVQMADAPPPNTASQPNEDNPSPAKRLRTDGSITTEPASSQGQWQQQQPLPYASQQRSGPGLSQLSGHNGHGSSRP",
-             "PSSLSAPNGNRPAHTNPEDQTRKTRFSSKPSMPRGMDPHMPFKDARRSALIALICHRDDPTSVIDLLREISADHLNPPSF",
+             "PSSLSAPNGNRPAHTNPEDQTRKTRFSSKPSMPRGMDPHMPFKDARRSALIALICHRDDPTSVIDLLREISADHLNPPSF",
-             "DVDTVLDDQGHTALHLAASMARTQTVDMLIQTGADMHRGNHLGETPLIRACLATPNSDQQSFATLVNYLHDSIWTLDTSK",
+             "DVDTVLDDQGHTALHLAASMARTQTVDMLIQTGADMHRGNHLGETPLIRACLATPNSDQQSFATLVNYLHDSIWTLDTSK",
-             "KSVVHHIVSLAGVKGRAVVARYYLDQIFYWIAQHEGGDFRSLVDLQDEHGDTAINIAARVGNRSLVRTLLDVGANRVLAN",
+             "KSVVHHIVSLAGVKGRAVVARYYLDQIFYWIAQHEGGDFRSLVDLQDEHGDTAINIAARVGNRSLVRTLLDVGANRVLAN",
-             "KLGLRPGDFGVETEELSSGLRAEDLISSLRTGPPAPVQKSQDVIADMTSMIQSLSTEFQAEIKSKQDSLDVTQAHLRAAT",
+             "KLGLRPGDFGVETEELSSGLRAEDLISSLRTGPPAPVQKSQDVIADMTSMIQSLSTEFQAEIKSKQDSLDVTQAHLRAAT",
-             "RELSEQRKQIQTWQARCGDLDQINQRVRNVEKAIAEEDMFDWTGRTELDGKDGKEKGGPAFAYRGSKSTMVGVGGSVDVS",
+             "RELSEQRKQIQTWQARCGDLDQINQRVRNVEKAIAEEDMFDWTGRTELDGKDGKEKGGPAFAYRGSKSTMVGVGGSVDVS",
-             "FSVESEPPLPTTDTAASLVKLRRLKMWHQRMEELVKGRLKGLQGASAEKEYQCKKIVALCTGIPLDKVEEMLDNLVIAVE",
+             "FSVESEPPLPTTDTAASLVKLRRLKMWHQRMEELVKGRLKGLQGASAEKEYQCKKIVALCTGIPLDKVEEMLDNLVIAVE",
-             "SEAQVVDIGRVSGFMQKVRDGII"]
+             "SEAQVVDIGRVSGFMQKVRDGII"]
-  },
+  },
-  { "name" : "8533_BIPOR",
+  { "name" : "8533_BIPOR",
-    "RefSeqID" : "XP_007691662",
+    "RefSeqID" : "XP_007691662",
-    "UniProtID" : "W6ZE71",
+    "UniProtID" : "W6ZE71",
-    "taxonomyID" : "930090",
+    "taxonomyID" : "930090",
-    "sequence" : [
+    "sequence" : [
-             "MSTSHSFPAASPSHQQSALYANSPHGHALMAAPAALNRSFSDMSAFHHHAMDKPQIYTAVYSGVSVYEMEVNRVAVMRRR",
+             "MSTSHSFPAASPSHQQSALYANSPHGHALMAAPAALNRSFSDMSAFHHHAMDKPQIYTAVYSGVSVYEMEVNRVAVMRRR",
-             "SDGWLNATQILKVAGVDKGKRTKVLEKEILTGEHEKVQGGYGKYQGTWINYRRGREFCRQYGVEDVLRPLLDYDITLDGS",
+             "SDGWLNATQILKVAGVDKGKRTKVLEKEILTGEHEKVQGGYGKYQGTWINYRRGREFCRQYGVEDVLRPLLDYDITLDGS",
-             "HAPGHAIETPTKEQAMAANRKRFYTQSIDGRTTTQNLTGTFFSNISSTATSALAAMNKVARLNSPAPRPSSSSQRRTSAT",
+             "HAPGHAIETPTKEQAMAANRKRFYTQSIDGRTTTQNLTGTFFSNISSTATSALAAMNKVARLNSPAPRPSSSSQRRTSAT",
-             "RPSQSQPPLASQDSFRTSSQQSITSEPSFAGHNGQTDSAYATAVDESQEPPRKRIRASHDDSYSQPTAADMSIHPLSSPT",
+             "RPSQSQPPLASQDSFRTSSQQSITSEPSFAGHNGQTDSAYATAVDESQEPPRKRIRASHDDSYSQPTAADMSIHPLSSPT",
-             "EPSESFDQHHPAQPITLADGDVPTALPPLPYPDTKQDEEKQAMLTDLFADQTRSDFTNHPAILHLSGPDLDMPIDNSSNT",
+             "EPSESFDQHHPAQPITLADGDVPTALPPLPYPDTKQDEEKQAMLTDLFADQTRSDFTNHPAILHLSGPDLDMPIDNSSNT",
-             "ALHWAATLARVSLIRLLVSKGANMFRGNASGQTALMSAVSVNNSLDHSCFPETLEILAPLIELRDSQGRTILHHIAVTCA",
+             "ALHWAATLARVSLIRLLVSKGANMFRGNASGQTALMSAVSVNNSLDHSCFPETLEILAPLIELRDSQGRTILHHIAVTCA",
-             "IKGRAASSKYYLEALLEYLVRSNIGGGQPPPFHDTSNHSKPIGLMRFMQEMVNARDKAGNTALNLAARIGNRNIISQLME",
+             "IKGRAASSKYYLEALLEYLVRSNIGGGQPPPFHDTSNHSKPIGLMRFMQEMVNARDKAGNTALNLAARIGNRNIISQLME",
-             "VQADPTIPNHKGTRPMDFGVGTDLGDGQGIITATSPTKAKAPLSKAEETSREIQPLMSGILQSASLQFTQEARLKQDAID",
+             "VQADPTIPNHKGTRPMDFGVGTDLGDGQGIITATSPTKAKAPLSKAEETSREIQPLMSGILQSASLQFTQEARLKQDAID",
-             "QTNELITQLSSQQKQEQQKLQTLRARLRQRQDRAKRISNLKRWLEPQRHMLSVNDGAIDLHDKKRIGYADTQGAGLLIKE",
+             "QTNELITQLSSQQKQEQQKLQTLRARLRQRQDRAKRISNLKRWLEPQRHMLSVNDGAIDLHDKKRIGYADTQGAGLLIKE",
-             "DDLPYELRQAGDHLDRRASDGPIYLSTSVPLDPSTLSQVSHQPQCQNFLLQQLPAASVLRQRIETYTATNTALLKRSRML",
+             "DDLPYELRQAGDHLDRRASDGPIYLSTSVPLDPSTLSQVSHQPQCQNFLLQQLPAASVLRQRIETYTATNTALLKRSRML",
-             "KEKDGQLEMMYRKVVSLCTKVEENRIEECLEGLVAALDSEEGEGVEVGRVREFLRKVEGVD"]
+             "KEKDGQLEMMYRKVVSLCTKVEENRIEECLEGLVAALDSEEGEGVEVGRVREFLRKVEGVD"]
-  },
+  },
-  { "name" : "PGTG_02039",
+  { "name" : "PGTG_02039",
-    "RefSeqID" : "XP_003320997",
+    "RefSeqID" : "XP_003320997",
-    "UniProtID" : "E3JX03",
+    "UniProtID" : "E3JX03",
-    "taxonomyID" : "418459",
+    "taxonomyID" : "418459",
-    "sequence" : [
+    "sequence" : [
-             "MAAHKTTNDIPVSSSHHINPESGTGTSSTQAFPIPNIKNNPHVYMAVYSSVPVYEMMVRGIGVMRRRSDSYMNATQILKV",
+             "MAAHKTTNDIPVSSSHHINPESGTGTSSTQAFPIPNIKNNPHVYMAVYSSVPVYEMMVRGIGVMRRRSDSYMNATQILKV",
-             "AGLDKSKRTRILEREIIQGEHEKIQGGYGRYQGTWVPFTRAQELATQLNVAQLLAPLFDYRPEPNSEVNIRSTNTKPSSS",
+             "AGLDKSKRTRILEREIIQGEHEKIQGGYGRYQGTWVPFTRAQELATQLNVAQLLAPLFDYRPEPNSEVNIRSTNTKPSSS",
-             "ASRANSHKTTLARQTSRQSLNEKRERSGDTTPLPHDPPEAGPSKRSRLNTPSRQSNGSANTPSSLIDHSHSAMDPDFIIP",
+             "ASRANSHKTTLARQTSRQSLNEKRERSGDTTPLPHDPPEAGPSKRSRLNTPSRQSNGSANTPSSLIDHSHSAMDPDFIIP",
-             "HSQSQPTAASQCTTSTFAPIHGATVEYPAGPSHLRKSNSSSRSHLEVALKAERNIHTLMALFSNPPDGDELESETHHENP",
+             "HSQSQPTAASQCTTSTFAPIHGATVEYPAGPSHLRKSNSSSRSHLEVALKAERNIHTLMALFSNPPDGDELESETHHENP",
-             "NSVAEVNEVLEDPELEIDTPIDEHCHTALHWASSLARLGLVRAFLRSGADVNRGNDVGETPLMRSTLVTNNFERESFNQL",
+             "NSVAEVNEVLEDPELEIDTPIDEHCHTALHWASSLARLGLVRAFLRSGADVNRGNDVGETPLMRSTLVTNNFERESFNQL",
-             "LELLHPSLWTLDNQDRTVLHHICLTASIKGRGESSRYYLECICEWIVNKHGAQFDSQLFDAVDLNGDTALNIAARVGNKH",
+             "LELLHPSLWTLDNQDRTVLHHICLTASIKGRGESSRYYLECICEWIVNKHGAQFDSQLFDAVDLNGDTALNIAARVGNKH",
-             "LVRMLLDVGADMTIGNNLGLKPIDFGVGAGETSASYTDDMISAPLRRNPTASAPARSSRDIITSITSSVNSLSEDFENEI",
+             "LVRMLLDVGADMTIGNNLGLKPIDFGVGAGETSASYTDDMISAPLRRNPTASAPARSSRDIITSITSSVNSLSEDFENEI",
-             "RSKTDRLESVRAQLMVATRQLTTQRRQLESLKHDLDERALLELRLKKLRMAIAEEDGFDWTGRSDLDGRPAQAGKLFEQN",
+             "RSKTDRLESVRAQLMVATRQLTTQRRQLESLKHDLDERALLELRLKKLRMAIAEEDGFDWTGRSDLDGRPAQAGKLFEQN",
-             "GIASTLAGLSASQIQLELEPDPFIPPENNQDSLVYLRRLEKWYVRVLSLLRERIGRMKGSNLEQEAKYLKVIGSFIGNTC",
+             "GIASTLAGLSASQIQLELEPDPFIPPENNQDSLVYLRRLEKWYVRVLSLLRERIGRMKGSNLEQEAKYLKVIGSFIGNTC",
-             "TNDLSSSGSSMTGRPANQTTSTTQEVPSRATQNVNPADIHDLESMDGHRRKVSTTDAVNKSHEFGRTRSELLKASMIDNK",
+             "TNDLSSSGSSMTGRPANQTTSTTQEVPSRATQNVNPADIHDLESMDGHRRKVSTTDAVNKSHEFGRTRSELLKASMIDNK",
-             "LLKQLMAAIESDGPELDLNRVAGFMQRVQSGSL"]
+             "LLKQLMAAIESDGPELDLNRVAGFMQRVQSGSL"]
-  },
+  },
-  { "name" : "MBPA_ASPNI",
+  { "name" : "MBPA_ASPNI",
-    "RefSeqID" : "XP_664319",
+    "RefSeqID" : "XP_664319",
-    "UniProtID" : "Q5AYB5",
+    "UniProtID" : "Q5AYB5",
-    "taxonomyID" : "227321",
+    "taxonomyID" : "227321",
-    "sequence" : [
+    "sequence" : [
-             "MTTSNHHQQRPSLSMSYSQGSIGSANGMSFSQSQMSSLNASQSVASTPRATPPPKSSQQSAMSFNYSNGLPNGARASFSG",
+             "MTTSNHHQQRPSLSMSYSQGSIGSANGMSFSQSQMSSLNASQSVASTPRATPPPKSSQQSAMSFNYSNGLPNGARASFSG",
-             "FEDMNGYGTMIYHEEFKPQIYRAVYSNVSVYEMEVNGVAVMKRRSDGWLNATQILKVAGVVKARRTKTLEKEIAAGEHEK",
+             "FEDMNGYGTMIYHEEFKPQIYRAVYSNVSVYEMEVNGVAVMKRRSDGWLNATQILKVAGVVKARRTKTLEKEIAAGEHEK",
-             "VQGGYGKYQGTWVNYQRGVELCREYHVEELLRPLLEYDMNPNGTAASGQDSLDTPTKEQAMAAQRKRLYSGMENRSMSQP",
+             "VQGGYGKYQGTWVNYQRGVELCREYHVEELLRPLLEYDMNPNGTAASGQDSLDTPTKEQAMAAQRKRLYSGMENRSMSQP",
-             "QQGTFFQNISRTAATAVNAMSKARFESPAARGGDSRRLSVIRKPSQQMGSQDAQPPFGSQQSFYSAASDSGFASNIPTNG",
+             "QQGTFFQNISRTAATAVNAMSKARFESPAARGGDSRRLSVIRKPSQQMGSQDAQPPFGSQQSFYSAASDSGFASNIPTNG",
-             "RYAPQDAMSFEQEEPMEPPRKRIRSSQAFSLPIDGTSMSMSEPTPTEPNDSFYQDMEPLHHIDEGRHGLDPLPPATTPER",
+             "RYAPQDAMSFEQEEPMEPPRKRIRSSQAFSLPIDGTSMSMSEPTPTEPNDSFYQDMEPLHHIDEGRHGLDPLPPATTPER",
-             "FQKMKLIMTLFLDKTTKDFSTHPALIQLSGEDLEVPLDEYRNNALHWAAMLARMPLVYALVKKGVNIARLNGAGETALQK",
+             "FQKMKLIMTLFLDKTTKDFSTHPALIQLSGEDLEVPLDEYRNNALHWAAMLARMPLVYALVKKGVNIARLNGAGETALQK",
-             "AVGTRNNLDYRSFPRLLQVLAPTIDMVDRSGRTILHHIAVMAATGHGGHVSAKHYLEALLEFIVRHGGTSLNQQSNGTAS",
+             "AVGTRNNLDYRSFPRLLQVLAPTIDMVDRSGRTILHHIAVMAATGHGGHVSAKHYLEALLEFIVRHGGTSLNQQSNGTAS",
-             "QPGMPLSNEVITLGRFISEIVNLRDDQGDTALNLAGRARSVLVPQLLEVGADPHIPNHTGLRPADYGVGVDMVDGSSQPA",
+             "QPGMPLSNEVITLGRFISEIVNLRDDQGDTALNLAGRARSVLVPQLLEVGADPHIPNHTGLRPADYGVGVDMVDGSSQPA",
-             "GSRSDTFLAQLAKTRKEILEATTAQVTAIVQETLGTFDKELAASLTSKQEKFDHWHAKIRESAKARQIEQKQLDELKRRS",
+             "GSRSDTFLAQLAKTRKEILEATTAQVTAIVQETLGTFDKELAASLTSKQEKFDHWHAKIRESAKARQIEQKQLDELKRRS",
-             "IDRTETSRRLKNLEKSSTDLLEAHKEILTNLGDTSKPVSLGDADQESGFEIAEFEALFPETFDPASGFSEAQIAYLRKLP",
+             "IDRTETSRRLKNLEKSSTDLLEAHKEILTNLGDTSKPVSLGDADQESGFEIAEFEALFPETFDPASGFSEAQIAYLRKLP",
-             "SAEILEQRVSCYRAFNKETLDEIDALRSKNVVLGQNYRRMVMACTGWSAEQVDEAAEGLTQCVKELNDNPVPEDEAIEIL",
+             "SAEILEQRVSCYRAFNKETLDEIDALRSKNVVLGQNYRRMVMACTGWSAEQVDEAAEGLTQCVKELNDNPVPEDEAIEIL",
-             "MRDRGQDW"]
+             "MRDRGQDW"]
-  },
+  },
-  { "name" : "05520_CRYNE",
+  { "name" : "05520_CRYNE",
-    "RefSeqID" : "XP_570545",
+    "RefSeqID" : "XP_570545",
-    "UniProtID" : "Q5KHS0",
+    "UniProtID" : "Q5KHS0",
-    "taxonomyID" : "214684",
+    "taxonomyID" : "214684",
-    "sequence" : [
+    "sequence" : [
-             "MEPPSNPIQPPVTPSHHSLLSAISPALSEQTPAPIHTLPPHLRPSIPQPHIAPPRPSSVQPTMEEQQRMHHIQQHQQQQH",
+             "MEPPSNPIQPPVTPSHHSLLSAISPALSEQTPAPIHTLPPHLRPSIPQPHIAPPRPSSVQPTMEEQQRMHHIQQHQQQQH",
-             "FQQQQNDENVFGSVMGAPGHVPGHEAPMSTQPKVYASVYSGVPVFEAMIRGISVMRRASDSWVNATQILKVAGVHKSART",
+             "FQQQQNDENVFGSVMGAPGHVPGHEAPMSTQPKVYASVYSGVPVFEAMIRGISVMRRASDSWVNATQILKVAGVHKSART",
-             "KILEKEVLNGIHEKIQGGYGKYQGTWVPLDRGRDLAEQYGVGSYLSSVFDFVPSASVIAALPVIRTGTPDRSGQQTPSGL",
+             "KILEKEVLNGIHEKIQGGYGKYQGTWVPLDRGRDLAEQYGVGSYLSSVFDFVPSASVIAALPVIRTGTPDRSGQQTPSGL",
-             "PGHPNQRVISPFANHGQTTPHMPPPQFIHQGNEQMMNLPPHPSSLAYPTQPKPYFSMPLQHTVGPQYDERHEGMTMTPTM",
+             "PGHPNQRVISPFANHGQTTPHMPPPQFIHQGNEQMMNLPPHPSSLAYPTQPKPYFSMPLQHTVGPQYDERHEGMTMTPTM",
-             "SMDGLAPPADIARMGFPYNPSDIYIDQYGQPHATYQASPYGKESGHPSKRQRSDAEGSYIESGAAVQQHVEQDEEADDGL",
+             "SMDGLAPPADIARMGFPYNPSDIYIDQYGQPHATYQASPYGKESGHPSKRQRSDAEGSYIESGAAVQQHVEQDEEADDGL",
-             "DNDSTASDDARDPPPLPSSMLLPHKPIRPKATPANGRIKSRLVQIFNVEGQVNLRSVFGLAPDQLPNFDIDMVIDDQGHS",
+             "DNDSTASDDARDPPPLPSSMLLPHKPIRPKATPANGRIKSRLVQIFNVEGQVNLRSVFGLAPDQLPNFDIDMVIDDQGHS",
-             "ALHWACALARLSIVQQLIELGADIHRGNYAGETPLIRAVLTSNHAEAGSFTDLLHLLSPSIRTLDHAYRTVLHHIALVAG",
+             "ALHWACALARLSIVQQLIELGADIHRGNYAGETPLIRAVLTSNHAEAGSFTDLLHLLSPSIRTLDHAYRTVLHHIALVAG",
-             "VKGRVPAARTYMASVLEWVAREQQANNTHSITNPPNPADRNELAPINLRTLVDVQDVHGDTALNVAARVGNKGLVGLLLD",
+             "VKGRVPAARTYMASVLEWVAREQQANNTHSITNPPNPADRNELAPINLRTLVDVQDVHGDTALNVAARVGNKGLVGLLLD",
-             "AGADKTRANKLGLRPENFGLEIEALKISNGEAVMANLKSEVSKPERKSRDVQKNIATIFESISSTFSSEMLAKQTKLNAT",
+             "AGADKTRANKLGLRPENFGLEIEALKISNGEAVMANLKSEVSKPERKSRDVQKNIATIFESISSTFSSEMLAKQTKLNAT",
-             "EASVRHATRALADKRQHLHRAQEKLATMQLFEQRSENVRRIMDAIAAGTLLTPAEFTGRTQTMHEKSTGQLPPLAFRHVP",
+             "EASVRHATRALADKRQHLHRAQEKLATMQLFEQRSENVRRIMDAIAAGTLLTPAEFTGRTQTMHEKSTGQLPPLAFRHVP",
-             "GLALDASSQSQLNGAPPSTPLSVEDQEDIALPERDDPECLVKLRRMALWEDRIAEVLEDKIRAMEGEGVDRAVKYRKLVS",
+             "GLALDASSQSQLNGAPPSTPLSVEDQEDIALPERDDPECLVKLRRMALWEDRIAEVLEDKIRAMEGEGVDRAVKYRKLVS",
-             "VCAKVPVDKVDSMLDGLVAAVESEGQGLDFSRASNFVNRIKATKS"]
+             "VCAKVPVDKVDSMLDGLVAAVESEGQGLDFSRASNFVNRIKATKS"]
-  },
+  },
-  { "name" : "RES1_SCHPO",
+  { "name" : "RES1_SCHPO",
-    "RefSeqID" : "NP_595496",
+    "RefSeqID" : "NP_595496",
-    "UniProtID" : "P33520",
+    "UniProtID" : "P33520",
-    "taxonomyID" : "284812",
+    "taxonomyID" : "284812",
-    "sequence" : [
+    "sequence" : [
-             "MYNDQIHKITYSGVEVFEYTINGFPLMKRCHDNWLNATQILKIAELDKPRRTRILEKFAQKGLHEKIQGGCGKYQGTWVP",
+             "MYNDQIHKITYSGVEVFEYTINGFPLMKRCHDNWLNATQILKIAELDKPRRTRILEKFAQKGLHEKIQGGCGKYQGTWVP",
-             "SERAVELAHEYNVFDLIQPLIEYSGSAFMPMSTFTPQSNRKPTEAYRRNSPVKKSFSRPSHSLLYPYTSSNNMTSTSRMS",
+             "SERAVELAHEYNVFDLIQPLIEYSGSAFMPMSTFTPQSNRKPTEAYRRNSPVKKSFSRPSHSLLYPYTSSNNMTSTSRMS",
-             "GIHDALSLQSDFTRSPDMPSDSFTGSLHDIKASPFSSNNYAQSLLDYFLLPNTTQPPDFVYDRPSDWDVNAGIDEDGHTA",
+             "GIHDALSLQSDFTRSPDMPSDSFTGSLHDIKASPFSSNNYAQSLLDYFLLPNTTQPPDFVYDRPSDWDVNAGIDEDGHTA",
-             "LHWAAAMGNLEMMHALLQAGANVVAVNYLQQTSLMRCVMFTMNYDLQTFEVVSELLQSAICMNDSFGQTVFHHIALLASS",
+             "LHWAAAMGNLEMMHALLQAGANVVAVNYLQQTSLMRCVMFTMNYDLQTFEVVSELLQSAICMNDSFGQTVFHHIALLASS",
-             "KSKMEAARYYMDILLQNLTATQSVDVAAQIINLQDDHGDTALLICARNGAKKCARLLLSFYASSSIPNNQGQYPTDFLSS",
+             "KSKMEAARYYMDILLQNLTATQSVDVAAQIINLQDDHGDTALLICARNGAKKCARLLLSFYASSSIPNNQGQYPTDFLSS",
-             "KDMSFPENDDSPLNSKIEDNLIDNLKYPQSLDDHLSSKKPISYFSNKLTHQTLPNVFTQLSELSKCHEASLAEKQLTYNL",
+             "KDMSFPENDDSPLNSKIEDNLIDNLKYPQSLDDHLSSKKPISYFSNKLTHQTLPNVFTQLSELSKCHEASLAEKQLTYNL",
-             "AMEALEQTVRETETCQRLWNERTNNDENYLVNQREDLIHQCKKFLHTLKTARYYLETVQLHQLKKYVTYFSQIWSTDELA",
+             "AMEALEQTVRETETCQRLWNERTNNDENYLVNQREDLIHQCKKFLHTLKTARYYLETVQLHQLKKYVTYFSQIWSTDELA",
-             "DISETKNLVGHDTKTNRSSLSSKHEVDLFTAENEAAREKLVEQLCSLQAQRKQKINEILNLLSMGMYNTINTDQSGS"]
+             "DISETKNLVGHDTKTNRSSLSSKHEVDLFTAENEAAREKLVEQLCSLQAQRKQKINEILNLLSMGMYNTINTDQSGS"]
-  },
+  },
-  { "name" : "CDC10_SCHPO",
+  { "name" : "CDC10_SCHPO",
-    "RefSeqID" : "NP_596132",
+    "RefSeqID" : "NP_596132",
-    "UniProtID" : "P01129",
+    "UniProtID" : "P01129",
-    "taxonomyID" : "284812",
+    "taxonomyID" : "284812",
-    "sequence" : [
+    "sequence" : [
-             "MASANFIRQFELGNDSFSYQKRPEDEPSQPLSNRNINKLNDSSTLKDSSSRIFINSQVLRDGRPVELYAVECSGMKYMEL",
+             "MASANFIRQFELGNDSFSYQKRPEDEPSQPLSNRNINKLNDSSTLKDSSSRIFINSQVLRDGRPVELYAVECSGMKYMEL",
-             "SCGDNVALRRCPDSYFNISQILRLAGTSSSENAKELDDIIESGDYENVDSKHPQIDGVWVPYDRAISIAKRYGVYEILQP",
+             "SCGDNVALRRCPDSYFNISQILRLAGTSSSENAKELDDIIESGDYENVDSKHPQIDGVWVPYDRAISIAKRYGVYEILQP",
-             "LISFNLDLFPKFSKQQQIESSSISKNLNTSSFNTRSPLRNHNFSNPSKSSKNGVHTINNMQSSPSPSSSFLLPLTQIDSQ",
+             "LISFNLDLFPKFSKQQQIESSSISKNLNTSSFNTRSPLRNHNFSNPSKSSKNGVHTINNMQSSPSPSSSFLLPLTQIDSQ",
-             "NVKRSNNYLSTSPPILEQRLKRHRIDVSDEDLHPSSQLNDNEASSLFPDTPRLNHSLSFVSLVSSLPPLDQNIMQDYHTS",
+             "NVKRSNNYLSTSPPILEQRLKRHRIDVSDEDLHPSSQLNDNEASSLFPDTPRLNHSLSFVSLVSSLPPLDQNIMQDYHTS",
-             "KDILTSIFLDVNFADSSALEAKLSDSLDLDVPIDELGHAALHWAAAVAKMPLLQALIHKGANPLRGNLTGETALMRSVLV",
+             "KDILTSIFLDVNFADSSALEAKLSDSLDLDVPIDELGHAALHWAAAVAKMPLLQALIHKGANPLRGNLTGETALMRSVLV",
-             "TNHLNQNSFGDLLDLLYASLPCTDRAGRTVVHHICLTAGIKGRGSASRYYLETLLNWAKKHASGNNGYMLKDFINYLNHQ",
+             "TNHLNQNSFGDLLDLLYASLPCTDRAGRTVVHHICLTAGIKGRGSASRYYLETLLNWAKKHASGNNGYMLKDFINYLNHQ",
-             "DKNGDTALNIAARIGNKNIVEVLMQAGASAYIPNRAGLSVANFGIFVENALKQPEDSKQTKVSLMSENLSSKEKTAVPPR",
+             "DKNGDTALNIAARIGNKNIVEVLMQAGASAYIPNRAGLSVANFGIFVENALKQPEDSKQTKVSLMSENLSSKEKTAVPPR",
-             "QKSRDIIASVTDVISSLDKDFQDEMAAKQSMIDSAYTQLRESTKKLSDLREQLHVSETQRTLFLELRQRCKNLMTSIEEQ",
+             "QKSRDIIASVTDVISSLDKDFQDEMAAKQSMIDSAYTQLRESTKKLSDLREQLHVSETQRTLFLELRQRCKNLMTSIEEQ",
-             "KSELSNLYESFDPNGIHDSLSLDADAPFTVNENNNKNLSIAELKFQVAAYERNEARLNELANKLWQRNSNIKSKCRRVVS",
+             "KSELSNLYESFDPNGIHDSLSLDADAPFTVNENNNKNLSIAELKFQVAAYERNEARLNELANKLWQRNSNIKSKCRRVVS",
-             "LCTGVDESRVDSLLESLLQAVESDGQQGEVDMGRVAGFLRVVKEHQA"]
+             "LCTGVDESRVDSLLESLLQAVESDGQQGEVDMGRVAGFLRVVKEHQA"]
-  },
+  },
-  { "name" : "05338_USTMA",
+  { "name" : "05338_USTMA",
-    "RefSeqID" : "XP_011392041",
+    "RefSeqID" : "XP_011392041",
-    "UniProtID" : "A0A0D1BWD8",
+    "UniProtID" : "A0A0D1BWD8",
-    "taxonomyID" : "237631",
+    "taxonomyID" : "237631",
-    "sequence" : [
+    "sequence" : [
-             "MPLNYFANQDQTASDTYAHEASSFPAPSSILTDTSKPLQPVQEVAASSLVDGVSFTSPHASIIHASKQSPRAASSLSFTT",
+             "MPLNYFANQDQTASDTYAHEASSFPAPSSILTDTSKPLQPVQEVAASSLVDGVSFTSPHASIIHASKQSPRAASSLSFTT",
-             "SALQRAGLLPANPNMSTTATSGTSAASESLQRVITQGTASAAAINGASTPAHSGPLTPAHLKNLTPAQANAALQNPVGNI",
+             "SALQRAGLLPANPNMSTTATSGTSAASESLQRVITQGTASAAAINGASTPAHSGPLTPAHLKNLTPAQANAALQNPVGNI",
-             "PTVYLATYSNVPVYEITVRGIAVMRRRGDGWLNATQILKIAGIEKTRRTKILEKSILTGEHEKIQGGYGKFQGTWIPLQR",
+             "PTVYLATYSNVPVYEITVRGIAVMRRRGDGWLNATQILKIAGIEKTRRTKILEKSILTGEHEKIQGGYGKFQGTWIPLQR",
-             "AQQVAAEYNVSHLLQPILEFDPATADQIPKLYQRKKPAASARNSSASAINDARGSTPSKIYSPAPASLGGPSQQPRFLSL",
+             "AQQVAAEYNVSHLLQPILEFDPATADQIPKLYQRKKPAASARNSSASAINDARGSTPSKIYSPAPASLGGPSQQPRFLSL",
-             "RPPKETHEQEISSAIFMPPGTAGLLSNGTFVDDRAASALAYPGPPAIPPGSTPAEQAALRSYNVYGYTPQGVPLPSSAAA",
+             "RPPKETHEQEISSAIFMPPGTAGLLSNGTFVDDRAASALAYPGPPAIPPGSTPAEQAALRSYNVYGYTPQGVPLPSSAAA",
-             "DGNGTEAAATAASTGAGKREASETDQDGASAAKRSRLTSPQQQRRDDGLLLGPSPVKDLNALGPAGGSLRAASAPRGHRI",
+             "DGNGTEAAATAASTGAGKREASETDQDGASAAKRSRLTSPQQQRRDDGLLLGPSPVKDLNALGPAGGSLRAASAPRGHRI",
-             "TVGPPDAAGRDGAVPRYADRALPPKPYDEGEKRMRDRLVSLFSDDGVLPGVSEATGAGASQSAADEDDDAYVAKLDSLLA",
+             "TVGPPDAAGRDGAVPRYADRALPPKPYDEGEKRMRDRLVSLFSDDGVLPGVSEATGAGASQSAADEDDDAYVAKLDSLLA",
-             "DLREKASLGGLGASGTDGPKATVDLITDDHGHTALHWASALCRVKLVRTLVARPPWQGGANIHAGNHAGETALHRSVLVT",
+             "DLREKASLGGLGASGTDGPKATVDLITDDHGHTALHWASALCRVKLVRTLVARPPWQGGANIHAGNHAGETALHRSVLVT",
-             "NSYDASSFPTLLNLLSSSLNTRDFKKRTVLHHISLVAALKGRAASARYYLACVLEHISAEKNSKYKGLIDAQDEDGETAL",
+             "NSYDASSFPTLLNLLSSSLNTRDFKKRTVLHHISLVAALKGRAASARYYLACVLEHISAEKNSKYKGLIDAQDEDGETAL",
-             "GIVARLGNASMVRMLLDVGARKDLANALGIRPSDWGIESSADGASLTPSQNDGTNTVASLPPLTAADLASQNPSDIISAL",
+             "GIVARLGNASMVRMLLDVGARKDLANALGIRPSDWGIESSADGASLTPSQNDGTNTVASLPPLTAADLASQNPSDIISAL",
-             "TRPAQVPVMKSSDVRDQLSSTLDDLQSSFERELKEKQDAVSTVQSHLQAATRDLAARRKTVSAAQAKLAEKDEARQRVQN",
+             "TRPAQVPVMKSSDVRDQLSSTLDDLQSSFERELKEKQDAVSTVQSHLQAATRDLAARRKTVSAAQAKLAEKDEARQRVQN",
-             "LRRAIVAQLGLEEADADLSLEQLVEEAANAASAAPADKSADKMDIDGAEDVKPVRASNLETLIDDILSFDTIQSDLKAVG",
+             "LRRAIVAQLGLEEADADLSLEQLVEEAANAASAAPADKSADKMDIDGAEDVKPVRASNLETLIDDILSFDTIQSDLKAVG",
-             "TSAVTQEVVEQDELVRLRWLVSFYQSSCDELSSTISELEDSSAKKESQCQQVVAICANIPQDKVESMLDELLTAMESDGP",
+             "TSAVTQEVVEQDELVRLRWLVSFYQSSCDELSSTISELEDSSAKKESQCQQVVAICANIPQDKVESMLDELLTAMESDGP",
-             "DVDLARVANFMQKVGKTRENGDQPGVGAQLSSSTSLSTAVSSGGTAASSVVPAVERDGEDAKPDA"]
+             "DVDLARVANFMQKVGKTRENGDQPGVGAQLSSSTSLSTAVSSGGTAASSVVPAVERDGEDAKPDA"]
-  },
+  },
-  { "name" : "SWI4_SACCE",
+  { "name" : "SWI4_SACCE",
-    "RefSeqID" : "NP_011036",
+    "RefSeqID" : "NP_011036",
-    "UniProtID" : "P25302",
+    "UniProtID" : "P25302",
-    "taxonomyID" : "559292",
+    "taxonomyID" : "559292",
-    "sequence" : [
+    "sequence" : [
-             "MPFDVLISNQKDNTNHQNITPISKSVLLAPHSNHPVIEIATYSETDVYECYIRGFETKIVMRRTKDDWINITQVFKIAQF",
+             "MPFDVLISNQKDNTNHQNITPISKSVLLAPHSNHPVIEIATYSETDVYECYIRGFETKIVMRRTKDDWINITQVFKIAQF",
-             "SKTKRTKILEKESNDMQHEKVQGGYGRFQGTWIPLDSAKFLVNKYEIIDPVVNSILTFQFDPNNPPPKRSKNSILRKTSP",
+             "SKTKRTKILEKESNDMQHEKVQGGYGRFQGTWIPLDSAKFLVNKYEIIDPVVNSILTFQFDPNNPPPKRSKNSILRKTSP",
-             "GTKITSPSSYNKTPRKKNSSSSTSATTTAANKKGKKNASINQPNPSPLQNLVFQTPQQFQVNSSMNIMNNNDNHTTMNFN",
+             "GTKITSPSSYNKTPRKKNSSSSTSATTTAANKKGKKNASINQPNPSPLQNLVFQTPQQFQVNSSMNIMNNNDNHTTMNFN",
-             "NDTRHNLINNISNNSNQSTIIQQQKSIHENSFNNNYSATQKPLQFFPIPTNLQNKNVALNNPNNNDSNSYSHNIDNVINS",
+             "NDTRHNLINNISNNSNQSTIIQQQKSIHENSFNNNYSATQKPLQFFPIPTNLQNKNVALNNPNNNDSNSYSHNIDNVINS",
-             "SNNNNNGNNNNLIIVPDGPMQSQQQQQHHHEYLTNNFNHSMMDSITNGNSKKRRKKLNQSNEQQFYNQQEKIQRHFKLMK",
+             "SNNNNNGNNNNLIIVPDGPMQSQQQQQHHHEYLTNNFNHSMMDSITNGNSKKRRKKLNQSNEQQFYNQQEKIQRHFKLMK",
-             "QPLLWQSFQNPNDHHNEYCDSNGSNNNNNTVASNGSSIEVFSSNENDNSMNMSSRSMTPFSAGNTSSQNKLENKMTDQEY",
+             "QPLLWQSFQNPNDHHNEYCDSNGSNNNNNTVASNGSSIEVFSSNENDNSMNMSSRSMTPFSAGNTSSQNKLENKMTDQEY",
-             "KQTILTILSSERSSDVDQALLATLYPAPKNFNINFEIDDQGHTPLHWATAMANIPLIKMLITLNANALQCNKLGFNCITK",
+             "KQTILTILSSERSSDVDQALLATLYPAPKNFNINFEIDDQGHTPLHWATAMANIPLIKMLITLNANALQCNKLGFNCITK",
-             "SIFYNNCYKENAFDEIISILKICLITPDVNGRLPFHYLIELSVNKSKNPMIIKSYMDSIILSLGQQDYNLLKICLNYQDN",
+             "SIFYNNCYKENAFDEIISILKICLITPDVNGRLPFHYLIELSVNKSKNPMIIKSYMDSIILSLGQQDYNLLKICLNYQDN",
-             "IGNTPLHLSALNLNFEVYNRLVYLGASTDILNLDNESPASIMNKFNTPAGGSNSRNNNTKADRKLARNLPQKNYYQQQQQ",
+             "IGNTPLHLSALNLNFEVYNRLVYLGASTDILNLDNESPASIMNKFNTPAGGSNSRNNNTKADRKLARNLPQKNYYQQQQQ",
-             "QQQPQNNVKIPKIIKTQHPDKEDSTADVNIAKTDSEVNESQYLHSNQPNSTNMNTIMEDLSNINSFVTSSVIKDIKSTPS",
+             "QQQPQNNVKIPKIIKTQHPDKEDSTADVNIAKTDSEVNESQYLHSNQPNSTNMNTIMEDLSNINSFVTSSVIKDIKSTPS",
-             "KILENSPILYRRRSQSISDEKEKAKDNENQVEKKKDPLNSVKTAMPSLESPSSLLPIQMSPLGKYSKPLSQQINKLNTKV",
+             "KILENSPILYRRRSQSISDEKEKAKDNENQVEKKKDPLNSVKTAMPSLESPSSLLPIQMSPLGKYSKPLSQQINKLNTKV",
-             "SSLQRIMGEEIKNLDNEVVETESSISNNKKRLITIAHQIEDAFDSVSNKTPINSISDLQSRIKETSSKLNSEKQNFIQSL",
+             "SSLQRIMGEEIKNLDNEVVETESSISNNKKRLITIAHQIEDAFDSVSNKTPINSISDLQSRIKETSSKLNSEKQNFIQSL",
-             "EKSQALKLATIVQDEESKVDMNTNSSSHPEKQEDEEPIPKSTSETSSPKNTKADAKFSNTVQESYDVNETLRLATELTIL",
+             "EKSQALKLATIVQDEESKVDMNTNSSSHPEKQEDEEPIPKSTSETSSPKNTKADAKFSNTVQESYDVNETLRLATELTIL",
-             "QFKRRMTTLKISEAKSKINSSVKLDKYRNLIGITIENIDSKLDDIEKDLRANA"]
+             "QFKRRMTTLKISEAKSKINSSVKLDKYRNLIGITIENIDSKLDDIEKDLRANA"]
-  },
+  },
-  { "name" : "SWI6_NEUCR",
+  { "name" : "SWI6_NEUCR",
-    "RefSeqID" : "XP_962967",
+    "RefSeqID" : "XP_962967",
-    "UniProtID" : "Q7SBG9",
+    "UniProtID" : "Q7SBG9",
-    "taxonomyID" : "367110",
+    "taxonomyID" : "367110",
-    "sequence" : [
+    "sequence" : [
-             "MQPPQLGGASQQSQPSSQQSFSMSQSSQSVYRQYTDPPNRLHNDHAVPTIYSATYSGVGVYEMEVNNVAVMRRQKDGWVN",
+             "MQPPQLGGASQQSQPSSQQSFSMSQSSQSVYRQYTDPPNRLHNDHAVPTIYSATYSGVGVYEMEVNNVAVMRRQKDGWVN",
-             "ATQILKVANIDKGRRTKILEKEIQIGEHEKVQGGYGKYQGTWIPFERGLEVCRQYGVEELLSKLLTHNRGQEGETGNVDT",
+             "ATQILKVANIDKGRRTKILEKEIQIGEHEKVQGGYGKYQGTWIPFERGLEVCRQYGVEELLSKLLTHNRGQEGETGNVDT",
-             "PTKEQAMAAQRKRMYNASSQENRGIGSTGTFFKNISSTASTAVAAISKARFDSPAPRNRSGPSRAPSFNRQSSMQDVADF",
+             "PTKEQAMAAQRKRMYNASSQENRGIGSTGTFFKNISSTASTAVAAISKARFDSPAPRNRSGPSRAPSFNRQSSMQDVADF",
-             "PNSQQSLVSTEYATQTQNADSGFGSQTTQPLAGDGLEQPPRKRQRVLTPARSFGGQTPGHQPLDPFNAGNIANGDSGSPT",
+             "PNSQQSLVSTEYATQTQNADSGFGSQTTQPLAGDGLEQPPRKRQRVLTPARSFGGQTPGHQPLDPFNAGNIANGDSGSPT",
-             "EPSNSFNYDQVTANDGDASYALGPLRPLPYENNADAEAKRGMLMGLFMDANGPEEAIQAALCNVSPQELDSPIDTQSHTA",
+             "EPSNSFNYDQVTANDGDASYALGPLRPLPYENNADAEAKRGMLMGLFMDANGPEEAIQAALCNVSPQELDSPIDTQSHTA",
-             "LHWAATLSRMPLLRALIHAGANPWRVNACGETALMRACTVTNSMENNTFPELLDLLGCTLDVTDDKGRTVLHHIAVTSAV",
+             "LHWAATLSRMPLLRALIHAGANPWRVNACGETALMRACTVTNSMENNTFPELLDLLGCTLDVTDDKGRTVLHHIAVTSAV",
-             "KGRHYASRYYLESLLEWVVRQGSAPSSQENGIGDRKGRRMGIARFMSEIVNAQDNSGDTALNVAARVGNRSIISQLLEVG",
+             "KGRHYASRYYLESLLEWVVRQGSAPSSQENGIGDRKGRRMGIARFMSEIVNAQDNSGDTALNVAARVGNRSIISQLLEVG",
-             "ADPTIPNRANLKPLDFGIGIADAETNDDPAQEKTGATTGSGHKSRETSDEVVRSITHLIGESASIFQNELKKKQESIDTL",
+             "ADPTIPNRANLKPLDFGIGIADAETNDDPAQEKTGATTGSGHKSRETSDEVVRSITHLIGESASIFQNELKKKQESIDTL",
-             "HSQLRVTSSQVGDARRTLESLQEKLKAQQLAKQKIVNFNRACEEEEQILIELEQRHGRLDVASANAWEMELESALEIVKT",
+             "HSQLRVTSSQVGDARRTLESLQEKLKAQQLAKQKIVNFNRACEEEEQILIELEQRHGRLDVASANAWEMELESALEIVKT",
-             "QSPKGLDPDSRPSLPSAAVLRARIKALRARSSKTRQAVAALQAQSKEKELKYRRLVSLCTRRPEIEVEALLDTLTRAVES",
+             "QSPKGLDPDSRPSLPSAAVLRARIKALRARSSKTRQAVAALQAQSKEKELKYRRLVSLCTRRPEIEVEALLDTLTRAVES",
-             "EKPELEIARVRRFLGGVEGVVH"]
+             "EKPELEIARVRRFLGGVEGVVH"]
-  },
+  },
-  { "name" : "15042_USTMA",
+  { "name" : "15042_USTMA",
-    "RefSeqID" : "XP_011388143",
+    "RefSeqID" : "XP_011388143",
-    "UniProtID" : "A0A0D1CVS5",
+    "UniProtID" : "A0A0D1CVS5",
-    "taxonomyID" : "237631",
+    "taxonomyID" : "237631",
-    "sequence" : [
+    "sequence" : [
-             "MSTASPLHHGHGNGSYANSPAPTGVTGRDAGVAAAAVADSAVRSGSVPASASGSAPGSASGSMYGEAHTQHHTGHHHYSA",
+             "MSTASPLHHGHGNGSYANSPAPTGVTGRDAGVAAAAVADSAVRSGSVPASASGSAPGSASGSMYGEAHTQHHTGHHHYSA",
-             "HHTHSHGALTSPVNGGHSSSWSPYGYPAAPVYGGSPSPYGHNAYSQYASGYGYANGTAHHVATAPTTPSATSTAYHTGVN",
+             "HHTHSHGALTSPVNGGHSSSWSPYGYPAAPVYGGSPSPYGHNAYSQYASGYGYANGTAHHVATAPTTPSATSTAYHTGVN",
-             "GMMMHHGQHAGYGYSSHHLGSHTPTHTHTHSSAYFMNGDGAHSHLNSSAHLTSPSYTTAPQYSTQLPLAGRHRVTTTLWE",
+             "GMMMHHGQHAGYGYSSHHLGSHTPTHTHTHSSAYFMNGDGAHSHLNSSAHLTSPSYTTAPQYSTQLPLAGRHRVTTTLWE",
-             "DEGTLCFQVDARGVCVARRHDNNMINGTKLLNVCGMSRGKRDGILKNEKERIVVKVGAMHLKGVWISFARAKQLAEQNGI",
+             "DEGTLCFQVDARGVCVARRHDNNMINGTKLLNVCGMSRGKRDGILKNEKERIVVKVGAMHLKGVWISFARAKQLAEQNGI",
-             "ADALYPLFEPNIQSFLYHPDNYPRTAAVIAAAQERQAQRQRAPGGQPSPGANGTSQAPPLMRANTTPSNGDTSTFSSGLS",
+             "ADALYPLFEPNIQSFLYHPDNYPRTAAVIAAAQERQAQRQRAPGGQPSPGANGTSQAPPLMRANTTPSNGDTSTFSSGLS",
-             "SLGSWTGSHDQGHASAPTTAQPSPSSMHNGATQMHMSLSNHGTASPTYAQSQQQQQQQQQQQQQQQQQQQQQQQQAYPMT",
+             "SLGSWTGSHDQGHASAPTTAQPSPSSMHNGATQMHMSLSNHGTASPTYAQSQQQQQQQQQQQQQQQQQQQQQQQQAYPMT",
-             "AAQQLARPSVGDRRQSAPISLNNSVGHAENPYGATNLGGAANGGLVNGARKVSGLKRSWNDADDLNGSAAASPTERDMQR",
+             "AAQQLARPSVGDRRQSAPISLNNSVGHAENPYGATNLGGAANGGLVNGARKVSGLKRSWNDADDLNGSAAASPTERDMQR",
-             "SGSGGSNGLKLDGDDLHSPDSSDDRLAKKTRGMPQRGGGATTAMPSMSTNMLMGVGNGSGIHHE"]
+             "SGSGGSNGLKLDGDDLHSPDSSDDRLAKKTRGMPQRGGGATTAMPSMSTNMLMGVGNGSGIHHE"]
-  },
+  },
-  { "name" : "04778_USTMA",
+  { "name" : "04778_USTMA",
-    "RefSeqID" : "XP_011391646",
+    "RefSeqID" : "XP_011391646",
-    "UniProtID" : "A0A0D1DQM4",
+    "UniProtID" : "A0A0D1DQM4",
-    "taxonomyID" : "237631",
+    "taxonomyID" : "237631",
-    "sequence" : [
+    "sequence" : [
-             "MNQAPLSATGVNFYISGPRPARLFPTPIHEFRKGKYATAGGESGFMTVFEYDVRGHTMMIDVDTSFVRFTSITQALGKNK",
+             "MNQAPLSATGVNFYISGPRPARLFPTPIHEFRKGKYATAGGESGFMTVFEYDVRGHTMMIDVDTSFVRFTSITQALGKNK",
-             "VNFGRLVKTCPALDPHITKLKGGYLSIQGTWLPFDLAKELSRRIAWEIRDHLVPLFGYDFPSTCLRPDSEGFGQLAIGMS",
+             "VNFGRLVKTCPALDPHITKLKGGYLSIQGTWLPFDLAKELSRRIAWEIRDHLVPLFGYDFPSTCLRPDSEGFGQLAIGMS",
-             "QKRARKRHNNGGPHQTSCYGPSLPISIELWQHSTDPLRDLGESSVVGGQAIEHVSAKNSAVQPCYGSSQPATFHYSKGYG",
+             "QKRARKRHNNGGPHQTSCYGPSLPISIELWQHSTDPLRDLGESSVVGGQAIEHVSAKNSAVQPCYGSSQPATFHYSKGYG",
-             "LESRPWYGQDYLESNSLESMWNSAQAGGGSVGLQVPISTCGATASPCLAAIGANGGSPILSSPPSSNASSSSNQSYTAAG",
+             "LESRPWYGQDYLESNSLESMWNSAQAGGGSVGLQVPISTCGATASPCLAAIGANGGSPILSSPPSSNASSSSNQSYTAAG",
-             "YGLMVPPTVPSHSVNSEAGANQAEGPTPIDGSRSYASLTAHGYATGYGDANASLSTWNDATHASTFTLHVHAHVHFQPPD",
+             "YGLMVPPTVPSHSVNSEAGANQAEGPTPIDGSRSYASLTAHGYATGYGDANASLSTWNDATHASTFTLHVHAHVHFQPPD",
-             "PESAQLFTIHDFGSDPFYAEQVERG"]
+             "PESAQLFTIHDFGSDPFYAEQVERG"]
-  },
+  },
-  { "name" : "STUA_ASPNI",
+  { "name" : "STUA_ASPNI",
-    "RefSeqID" : "XP_663440",
+    "RefSeqID" : "XP_663440",
-    "UniProtID" : "P36011",
+    "UniProtID" : "P36011",
-    "taxonomyID" : "227321",
+    "taxonomyID" : "227321",
-    "sequence" : [
+    "sequence" : [
-             "MASMNQPQPYMDVHSHLSSGQTYASHPATAGALTHYQYPQQPPVLQPTSTYGPASSYSQYPYPNSVASSQSVPPPTTSIS",
+             "MASMNQPQPYMDVHSHLSSGQTYASHPATAGALTHYQYPQQPPVLQPTSTYGPASSYSQYPYPNSVASSQSVPPPTTSIS",
-             "SQVPAQLLPLPVTNHPVPTHGYGNNSGTPMQGYVYDPTGQMAPPGAKPRVTATLWEDEGSLCYQVEAKGVCVARREDNGM",
+             "SQVPAQLLPLPVTNHPVPTHGYGNNSGTPMQGYVYDPTGQMAPPGAKPRVTATLWEDEGSLCYQVEAKGVCVARREDNGM",
-             "INGTKLLNVAGMTRGRRDGILKSEKVRNVVKIGPMHLKGVWIPFDRALEFANKEKITDLLYPLFVQHISNLLYHPANQNQ",
+             "INGTKLLNVAGMTRGRRDGILKSEKVRNVVKIGPMHLKGVWIPFDRALEFANKEKITDLLYPLFVQHISNLLYHPANQNQ",
-             "RNMTVPDSRRLEGPQPVVRTPQAQQPPSLHHHSLQTPVPSHMSQPGGRPSLDRAHTFPTPPARMNSSVPNTQPLSIDTSL",
+             "RNMTVPDSRRLEGPQPVVRTPQAQQPPSLHHHSLQTPVPSHMSQPGGRPSLDRAHTFPTPPARMNSSVPNTQPLSIDTSL",
-             "SNARSMPTTPATTPPGNNLQGMQSYQPQSGYDSKPYYSAAPSTHPQYAPQQPLPQQSMAQYGHSMPTSSYRDMAPPSSQR",
+             "SNARSMPTTPATTPPGNNLQGMQSYQPQSGYDSKPYYSAAPSTHPQYAPQQPLPQQSMAQYGHSMPTSSYRDMAPPSSQR",
-             "GSVTEIESDVKTERYGQGTVAKTEPEQEQEYAQPDSGYNTGRGSYYTTNPSVGGLAHDHSQLTPDMTGSPQQNGSGRMTP",
+             "GSVTEIESDVKTERYGQGTVAKTEPEQEQEYAQPDSGYNTGRGSYYTTNPSVGGLAHDHSQLTPDMTGSPQQNGSGRMTP",
-             "RTSNTAPQWAPGYTTPPRPAAASSLYNIVSDTRGTSGANGSTSDNYSVASNSGYSTGMNGSMGSNKRMRDDDDDRIVPPD",
+             "RTSNTAPQWAPGYTTPPRPAAASSLYNIVSDTRGTSGANGSTSDNYSVASNSGYSTGMNGSMGSNKRMRDDDDDRIVPPD",
-             "SRGEFDTKRRKTLTETPVGGPVGGVPLGLQPMKAGGSLISARR"]
+             "SRGEFDTKRRKTLTETPVGGPVGGVPLGLQPMKAGGSLISARR"]
-  },
+  },
-  { "name" : "STUA_NEUCR",
+  { "name" : "STUA_NEUCR",
-    "RefSeqID" : "XP_960837",
+    "RefSeqID" : "XP_960837",
-    "UniProtID" : "Q1K6U0",
+    "UniProtID" : "Q1K6U0",
-    "taxonomyID" : "367110",
+    "taxonomyID" : "367110",
-    "sequence" : [
+    "sequence" : [
-             "MNPNTPADVYYGQMSQGSSMPVTTVPSHSHYASQQPPPLLQPGSTYAHQYGTPQYGYANALSSPASIPPSLPPSMNSMAG",
+             "MNPNTPADVYYGQMSQGSSMPVTTVPSHSHYASQQPPPLLQPGSTYAHQYGTPQYGYANALSSPASIPPSLPPSMNSMAG",
-             "QSVLPLPGSGSMNPAVYASGGFDTTGQVAPPGMKPRVTATLWEDEGSLCFQVEARGICVARREDNAMINGTKLLNVAGMT",
+             "QSVLPLPGSGSMNPAVYASGGFDTTGQVAPPGMKPRVTATLWEDEGSLCFQVEARGICVARREDNAMINGTKLLNVAGMT",
-             "RGRRDGILKSEKVRHVVKIGPMHLKGVWIPFERALDFANKEKITELLYPLFVHNIGALLYHPTNQSRTSQVMAAAEQRRK",
+             "RGRRDGILKSEKVRHVVKIGPMHLKGVWIPFERALDFANKEKITELLYPLFVHNIGALLYHPTNQSRTSQVMAAAEQRRK",
-             "DSHGQLRGPPGLPSLQQHHHHHSMLPGPPSLPSHPSMGRPALDRAHTFPTPPTSASSVMGPMGNSDGYQWSQQSMSGTQG",
+             "DSHGQLRGPPGLPSLQQHHHHHSMLPGPPSLPSHPSMGRPALDRAHTFPTPPTSASSVMGPMGNSDGYQWSQQSMSGTQG",
-             "NSSLSLDTSLGSNARSMPSTPATTPPGSTIQSMQNYPPVSQSYESSRQMYQGQSAQQAQYQSQQHYSSQPQHQERPVYSQ",
+             "NSSLSLDTSLGSNARSMPSTPATTPPGSTIQSMQNYPPVSQSYESSRQMYQGQSAQQAQYQSQQHYSSQPQHQERPVYSQ",
-             "SSYIKNDMGPPSGRPTGQSNDASDSKPPTGMIHQGQGQSDPGTHAGSEEDDDANNEAEYTHDSGGYDANRGSYNYNTQAV",
+             "SSYIKNDMGPPSGRPTGQSNDASDSKPPTGMIHQGQGQSDPGTHAGSEEDDDANNEAEYTHDSGGYDANRGSYNYNTQAV",
-             "NSLPHDHGLAPEIGGSPHQAGSGRATPRTAAAPSSYYSAQGYHTPPRGQPSSSLYNVMSNERTGSNGTQGNEMYAGQADM",
+             "NSLPHDHGLAPEIGGSPHQAGSGRATPRTAAAPSSYYSAQGYHTPPRGQPSSSLYNVMSNERTGSNGTQGNEMYAGQADM",
-             "PSSLPNGYSAQPSVMNGSSGGLKRGRDDDDDGGRPTTSAPNLGPGMDMKRRKTMMDGGSLPSPTYTATIAQAAPSAIAAH",
+             "PSSLPNGYSAQPSVMNGSSGGLKRGRDDDDDGGRPTTSAPNLGPGMDMKRRKTMMDGGSLPSPTYTATIAQAAPSAIAAH",
-             "RRR"]
+             "RRR"]
-  },
+  },
-  { "name" : "PHD1_SACCE",
+  { "name" : "PHD1_SACCE",
-    "RefSeqID" : "NP_012881",
+    "RefSeqID" : "NP_012881",
-    "UniProtID" : "P36093",
+    "UniProtID" : "P36093",
-    "taxonomyID" : "559292",
+    "taxonomyID" : "559292",
-    "sequence" : [
+    "sequence" : [
-             "MYHVPEMRLHYPLVNTQSNAAITPTRSYDNTLPSFNELSHQSTINLPFVQRETPNAYANVAQLATSPTQAKSGYYCRYYA",
+             "MYHVPEMRLHYPLVNTQSNAAITPTRSYDNTLPSFNELSHQSTINLPFVQRETPNAYANVAQLATSPTQAKSGYYCRYYA",
-             "VPFPTYPQQPQSPYQQAVLPYATIPNSNFQPSSFPVMAVMPPEVQFDGSFLNTLHPHTELPPIIQNTNDTSVARPNNLKS",
+             "VPFPTYPQQPQSPYQQAVLPYATIPNSNFQPSSFPVMAVMPPEVQFDGSFLNTLHPHTELPPIIQNTNDTSVARPNNLKS",
-             "IAAASPTVTATTRTPGVSSTSVLKPRVITTMWEDENTICYQVEANGISVVRRADNNMINGTKLLNVTKMTRGRRDGILRS",
+             "IAAASPTVTATTRTPGVSSTSVLKPRVITTMWEDENTICYQVEANGISVVRRADNNMINGTKLLNVTKMTRGRRDGILRS",
-             "EKVREVVKIGSMHLKGVWIPFERAYILAQREQILDHLYPLFVKDIESIVDARKPSNKASLTPKSSPAPIKQEPSDNKHEI",
+             "EKVREVVKIGSMHLKGVWIPFERAYILAQREQILDHLYPLFVKDIESIVDARKPSNKASLTPKSSPAPIKQEPSDNKHEI",
-             "ATEIKPKSIDALSNGASTQGAGELPHLKINHIDTEAQTSRAKNELS"]
+             "ATEIKPKSIDALSNGASTQGAGELPHLKINHIDTEAQTSRAKNELS"]
-  },
+  },
-  { "name" : "08099_COPCI",
+  { "name" : "08099_COPCI",
-    "RefSeqID" : "XP_001836714",
+    "RefSeqID" : "XP_001836714",
-    "UniProtID" : "A8NVH3",
+    "UniProtID" : "A8NVH3",
-    "taxonomyID" : "240176",
+    "taxonomyID" : "240176",
-    "sequence" : [
+    "sequence" : [
-             "MSTGMLQETLQTTSASTSGTRFRPYASPNHQVTKGRYITSNDPRGYIPVYEYPLNGQWIMMDIDDGYILWTGIWKALGNS",
+             "MSTGMLQETLQTTSASTSGTRFRPYASPNHQVTKGRYITSNDPRGYIPVYEYPLNGQWIMMDIDDGYILWTGIWKALGNS",
-             "KADIVKMIDSQPDLAPLIRRVRGGYLKIQGTWMPYEVALKLSRRVAWPIRHDLVPLFGPTFPSTCLSPDQPGYGQVVASS",
+             "KADIVKMIDSQPDLAPLIRRVRGGYLKIQGTWMPYEVALKLSRRVAWPIRHDLVPLFGPTFPSTCLSPDQPGYGQVVASS",
-             "NVRRRARRNTQATAQPPREAHSNWTVMTPGPMVGLSFPHSQFSRPPLPPLAPTPARSPSDYAPSSHYGNQLDPQDARRYS",
+             "NVRRRARRNTQATAQPPREAHSNWTVMTPGPMVGLSFPHSQFSRPPLPPLAPTPARSPSDYAPSSHYGNQLDPQDARRYS",
-             "HSPYSPLASPPERKSSISSKALSLEIPPVRPSSSKAREDISLPPLKQPDGADPEMSPYALPPISALEDLRGVDTQDSAAV",
+             "HSPYSPLASPPERKSSISSKALSLEIPPVRPSSSKAREDISLPPLKQPDGADPEMSPYALPPISALEDLRGVDTQDSAAV",
-             "LRRLRLDDDYPSSSRSSTSQDSIWGRRHSLSAHSPHPRSSDNSRFQPYLSSRSYQDSTLKRSRSPAESYADRRRASDFSQ",
+             "LRRLRLDDDYPSSSRSSTSQDSIWGRRHSLSAHSPHPRSSDNSRFQPYLSSRSYQDSTLKRSRSPAESYADRRRASDFSQ",
-             "EDSTSAYSPISPATPNSSILSHSSFSDLKKLASSTDTRYNFPRISGRDWAPLKGDTDHIRSSYRSGPSPLELDSDSESSA",
+             "EDSTSAYSPISPATPNSSILSHSSFSDLKKLASSTDTRYNFPRISGRDWAPLKGDTDHIRSSYRSGPSPLELDSDSESSA",
-             "PHRPW"]
+             "PHRPW"]
-  },
+  },
-  { "name" : "68479_WALME",
+  { "name" : "68479_WALME",
-    "RefSeqID" : "XP_006957792",
+    "RefSeqID" : "XP_006957792",
-    "UniProtID" : "I4YDE0",
+    "UniProtID" : "I4YDE0",
-    "taxonomyID" : "671144",
+    "taxonomyID" : "671144",
-    "sequence" : [
+    "sequence" : [
-             "MTNKVQELWWEENKTRVWQVEVDNGNYVARRQDNDQINGTKLLNITKITRGKRDGILKNEKSRQVVKTGTITLKGVWIPF",
+             "MTNKVQELWWEENKTRVWQVEVDNGNYVARRQDNDQINGTKLLNITKITRGKRDGILKNEKSRQVVKTGTITLKGVWIPF",
-             "ERAIILARQFNIEQQLYPLFETNLGDYVENSIGSHQIKRKSLNNLMDSLTTNRELVSKRRSTVSTYNPATSAYVSPYGFS",
+             "ERAIILARQFNIEQQLYPLFETNLGDYVENSIGSHQIKRKSLNNLMDSLTTNRELVSKRRSTVSTYNPATSAYVSPYGFS",
-             "PQHCYQTEFEDMNQHSGEIQSGRPRNTSSASDWMTNWSTSSSSPVIPATPNTFSPVMNTFQSLALHSPPIPIPNYYYDSS",
+             "PQHCYQTEFEDMNQHSGEIQSGRPRNTSSASDWMTNWSTSSSSPVIPATPNTFSPVMNTFQSLALHSPPIPIPNYYYDSS",
-             "SSYFPSYHQKQQQQQVQMQMQMHTTASIGGDRQSNEYIQR"]
+             "SSYFPSYHQKQQQQQVQMQMQMHTTASIGGDRQSNEYIQR"]
-  },
+  },
-  { "name" : "11943_PUCGR",
+  { "name" : "11943_PUCGR",
-    "RefSeqID" : "XP_003330006",
+    "RefSeqID" : "XP_003330006",
-    "UniProtID" : "E3KMR2",
+    "UniProtID" : "E3KMR2",
-    "taxonomyID" : "418459",
+    "taxonomyID" : "418459",
-    "sequence" : [
+    "sequence" : [
-             "MAAAPTSSFLTSMSAQPPRTVQALVNEEVRAPPPVRLYPSQHRVSMTRYATSTDPRGYIPVFEYPLNGQYIMIDCETGMV",
+             "MAAAPTSSFLTSMSAQPPRTVQALVNEEVRAPPPVRLYPSQHRVSMTRYATSTDPRGYIPVFEYPLNGQYIMIDCETGMV",
-             "HFTGIWKALGHTKADVVKLVESDPTIAPYLRKVRGGYLKIQGTWLPFDTAQTLARRVAWQVRYDLVPLFGPDFPDTCLGP",
+             "HFTGIWKALGHTKADVVKLVESDPTIAPYLRKVRGGYLKIQGTWLPFDTAQTLARRVAWQVRYDLVPLFGPDFPDTCLGP",
-             "GEPGFGQLLLSAPKPRGRRGAKKAAAAPTVAHERTASPQDNRSQSRPGPYPSQESFGNRCSGRVEAVGAMNGYSPMLSQA",
+             "GEPGFGQLLLSAPKPRGRRGAKKAAAAPTVAHERTASPQDNRSQSRPGPYPSQESFGNRCSGRVEAVGAMNGYSPMLSQA",
-             "RYSPYTRAPVHRITQLEPLPSLIQPNQSCPHPTADSMYSSHYHQSPRQSMMTSHGAGPYGQQHLTGSTASGMQSTAPLPS",
+             "RYSPYTRAPVHRITQLEPLPSLIQPNQSCPHPTADSMYSSHYHQSPRQSMMTSHGAGPYGQQHLTGSTASGMQSTAPLPS",
-             "MRPHQAHQSENNFFETYRGPDSFEALSNKWLAPEVANPSLNDSGLLHGEGGCLPPLQYSNNPVLRNGPSGSPTNQYNFPN",
+             "MRPHQAHQSENNFFETYRGPDSFEALSNKWLAPEVANPSLNDSGLLHGEGGCLPPLQYSNNPVLRNGPSGSPTNQYNFPN",
-             "QIDSAHSSHHIDSNQTQHVHRHAGFPYESQHQSNFRHDLSTEEAAHHPASPSQQPPPSVTYDKAHNSEPQAGSQAANVTA",
+             "QIDSAHSSHHIDSNQTQHVHRHAGFPYESQHQSNFRHDLSTEEAAHHPASPSQQPPPSVTYDKAHNSEPQAGSQAANVTA",
-             "GCYAASGSNSTGNPAGSPGSHSSHVPKSPTPSSASTSTHMQNSHNPNSHRSPSNTLTNMSNNGGFNSNTQGEEAIQFSVL",
+             "GCYAASGSNSTGNPAGSPGSHSSHVPKSPTPSSASTSTHMQNSHNPNSHRSPSNTLTNMSNNGGFNSNTQGEEAIQFSVL",
-             "TSPAHLETSGPSENSIPPAQSSDSDWNPAQNTTGLSPSQAPRQ"]
+             "TSPAHLETSGPSENSIPPAQSSDSDWNPAQNTTGLSPSQAPRQ"]
-  },
+  },
-  { "name" : "03082_PUCGR",
+  { "name" : "03082_PUCGR",
-    "RefSeqID" : "XP_003321545",
+    "RefSeqID" : "XP_003321545",
-    "UniProtID" : "E3JYK1",
+    "UniProtID" : "E3JYK1",
-    "taxonomyID" : "418459",
+    "taxonomyID" : "418459",
-    "sequence" : [
+    "sequence" : [
-             "MILISPTRTLPSPRPIDTDPILNYRHIQPAAAAAAVGPWLGQNQHHHHHHDTLAKSPNITTAPATHSPSELSASPAPSAV",
+             "MILISPTRTLPSPRPIDTDPILNYRHIQPAAAAAAVGPWLGQNQHHHHHHDTLAKSPNITTAPATHSPSELSASPAPSAV",
-             "STGSSLLDPQSVPHIKIPHSSSPPAIMLPQPSSDDDSSTAEEEQPSAQSSNATLNTPTPHTNAPHQLDSHASSVGLYDLP",
+             "STGSSLLDPQSVPHIKIPHSSSPPAIMLPQPSSDDDSSTAEEEQPSAQSSNATLNTPTPHTNAPHQLDSHASSVGLYDLP",
-             "PTSSSAPTTSSSSSPFPSNVPSHQQPSPYSSSPHPNQEHHPHHPHHGNQFYQQSPPALHSPLQSAHHPQQSFDARPHSSL",
+             "PTSSSAPTTSSSSSPFPSNVPSHQQPSPYSSSPHPNQEHHPHHPHHGNQFYQQSPPALHSPLQSAHHPQQSFDARPHSSL",
-             "FAHQHYHSRPQSAPHSTSQFSLDPHVLAAAAANVEVKKWDEENTYYYQVAHKGVTVGRLKGSGLVNGTKLLNLAGISRGK",
+             "FAHQHYHSRPQSAPHSTSQFSLDPHVLAAAAANVEVKKWDEENTYYYQVAHKGVTVGRLKGSGLVNGTKLLNLAGISRGK",
-             "RDGILKNEKIRKVVKHGTMHLKGVWIAFDRAVFLAEQHSIADKIFPLLVVNLEHYVPIEPPLMAGGTKLGPGSLFHHHHP",
+             "RDGILKNEKIRKVVKHGTMHLKGVWIAFDRAVFLAEQHSIADKIFPLLVVNLEHYVPIEPPLMAGGTKLGPGSLFHHHHP",
-             "RHPRLLPQPIKFPPSTISLAPASANSFSSTGGWPSGPSSALPSIGYNEPFSAPPIPRSAATADTSPSIYEQAQFQYLNSA",
+             "RHPRLLPQPIKFPPSTISLAPASANSFSSTGGWPSGPSSALPSIGYNEPFSAPPIPRSAATADTSPSIYEQAQFQYLNSA",
-             "QANNPDLLERRHTLPNNSFHGYNSVPSFGSSQPPPPVSYSFHYNSTHVPGYPPRSSTAESATPNQFEYQSKNHNGNGNGD",
+             "QANNPDLLERRHTLPNNSFHGYNSVPSFGSSQPPPPVSYSFHYNSTHVPGYPPRSSTAESATPNQFEYQSKNHNGNGNGD",
-             "AAGSYPATLYHSQPAARPVSSTTAQPSPALNSAPLLLGDLSPGSSTQIVDHGAGDFRLSTGTSNGQVKQEGDDESCNEKR",
+             "AAGSYPATLYHSQPAARPVSSTTAQPSPALNSAPLLLGDLSPGSSTQIVDHGAGDFRLSTGTSNGQVKQEGDDESCNEKR",
-             "LIMEWNPSC"]
+             "LIMEWNPSC"]
-  },
+  },
-  { "name" : "SOK2_SACCE",
+  { "name" : "SOK2_SACCE",
-    "RefSeqID" : "NP_013729",
+    "RefSeqID" : "NP_013729",
-    "UniProtID" : "P53438",
+    "UniProtID" : "P53438",
-    "taxonomyID" : "559292",
+    "taxonomyID" : "559292",
-    "sequence" : [
+    "sequence" : [
-             "MPIGNPINTNDIKSNRMRQESNMSAVSNSESTIGQSTQQQQQQQQYLGQSVQPLMPVSYQYVVPEQWPYPQYYQQPQSQS",
+             "MPIGNPINTNDIKSNRMRQESNMSAVSNSESTIGQSTQQQQQQQQYLGQSVQPLMPVSYQYVVPEQWPYPQYYQQPQSQS",
-             "QQQLQSQPQMYQVQESFQSSGSDSNASNPPSTSVGVPSNATATALPNGSAITTKKSNNSTNISNNVPYYYYFPQMQAQQS",
+             "QQQLQSQPQMYQVQESFQSSGSDSNASNPPSTSVGVPSNATATALPNGSAITTKKSNNSTNISNNVPYYYYFPQMQAQQS",
-             "MAYSYPQAYYYYPANGDGTTNGATPSVTSNQVQNPNLEKTYSTFEQQQQHQQQQQLQAQTYPAQPPKIGNAFSKFSKSGP",
+             "MAYSYPQAYYYYPANGDGTTNGATPSVTSNQVQNPNLEKTYSTFEQQQQHQQQQQLQAQTYPAQPPKIGNAFSKFSKSGP",
-             "PSDSSSGSMSPNSNRTSRNSNSISSLAQQPPMSNYPQPSTYQYPGFHKTSSIPNSHSPIPPRSLTTPTQGPTSQNGPLSY",
+             "PSDSSSGSMSPNSNRTSRNSNSISSLAQQPPMSNYPQPSTYQYPGFHKTSSIPNSHSPIPPRSLTTPTQGPTSQNGPLSY",
-             "NLPQVGLLPPQQQQQVSPLYDGNSITPPVKPSTDQETYLTANRHGVSDQQYDSMAKTMNSFQTTTIRHPMPLIATTNATG",
+             "NLPQVGLLPPQQQQQVSPLYDGNSITPPVKPSTDQETYLTANRHGVSDQQYDSMAKTMNSFQTTTIRHPMPLIATTNATG",
-             "SNTSGTSASIIRPRVTTTMWEDEKTLCYQVEANGISVVRRADNDMVNGTKLLNVTKMTRGRRDGILKAEKIRHVVKIGSM",
+             "SNTSGTSASIIRPRVTTTMWEDEKTLCYQVEANGISVVRRADNDMVNGTKLLNVTKMTRGRRDGILKAEKIRHVVKIGSM",
-             "HLKGVWIPFERALAIAQREKIADYLYPLFIRDIQSVLKQNNPSNDSSSSSSSTGIKSISPRTYYQPINNYQNPNGPSNIS",
+             "HLKGVWIPFERALAIAQREKIADYLYPLFIRDIQSVLKQNNPSNDSSSSSSSTGIKSISPRTYYQPINNYQNPNGPSNIS",
-             "AAQLTYSSMNLNNKIIPNNSIPAVSTIAAGEKPLKKCTMPNSNQLEGHTITNLQTLSATMPMKQQLMGNIASPLSYPRNA",
+             "AAQLTYSSMNLNNKIIPNNSIPAVSTIAAGEKPLKKCTMPNSNQLEGHTITNLQTLSATMPMKQQLMGNIASPLSYPRNA",
-             "TMNSASTLGITPADSKPLTPSPTTTNTNQSSESNVGSIHTGITLPRVESESASHSKWSKEADSGNTVPDNQTLKEPRSSQ",
+             "TMNSASTLGITPADSKPLTPSPTTTNTNQSSESNVGSIHTGITLPRVESESASHSKWSKEADSGNTVPDNQTLKEPRSSQ",
-             "LPISALTSTDTDKIKTSTSDEATQPNEPSEAEPVKESESSKSQVDGAGDVSNEEIAADDTKKQEK"]
+             "LPISALTSTDTDKIKTSTSDEATQPNEPSEAEPVKESESSKSQVDGAGDVSNEEIAADDTKKQEK"]
-  },
+  },
-  { "name" : "14426_COPCI",
+  { "name" : "14426_COPCI",
-    "RefSeqID" : "XP_002911429",
+    "RefSeqID" : "XP_002911429",
-    "UniProtID" : "D6RMB0",
+    "UniProtID" : "D6RMB0",
-    "taxonomyID" : "240176",
+    "taxonomyID" : "240176",
-    "sequence" : [
+    "sequence" : [
-             "MTARPPLPLRHANPSLRDGNATIPPVKYQILSCQGKDILVGRLKIDTTDGGHAFILRRFDTQAISLTTMFRAAFPTASEA",
+             "MTARPPLPLRHANPSLRDGNATIPPVKYQILSCQGKDILVGRLKIDTTDGGHAFILRRFDTQAISLTTMFRAAFPTASEA",
-             "EEKDEINYVKANFDLFGNNGSSKEPHITRLAGTWVNRDTAGQLAHDYNMVDLINTMVEAEPDPNGQYRRSNKSAQNNNPP",
+             "EEKDEINYVKANFDLFGNNGSSKEPHITRLAGTWVNRDTAGQLAHDYNMVDLINTMVEAEPDPNGQYRRSNKSAQNNNPP",
-             "TNAPEPTPATNVHATRSPAKQSPKPPSKTLPTPSPGSGDAQPPAPKRRREGSPATFTSGIPVASSPAVPKTPGPRRSTRT",
+             "TNAPEPTPATNVHATRSPAKQSPKPPSKTLPTPSPGSGDAQPPAPKRRREGSPATFTSGIPVASSPAVPKTPGPRRSTRT",
-             "KSPAPSRVPQPLTATKPRSRASVAPPSPKKRPVDLPKSSPIKAEEDTAVEDNVAGNELYAQDISEQKKLIADLKAAASSK",
+             "KSPAPSRVPQPLTATKPRSRASVAPPSPKKRPVDLPKSSPIKAEEDTAVEDNVAGNELYAQDISEQKKLIADLKAAASSK",
-             "KPADTVKEDDDQQMEEEGQGPSKLKRIRQDEEKPLQFEFKEPEREERQIATNRRVGRFDMQPERKSLAWGIAAFAFGMTA",
+             "KPADTVKEDDDQQMEEEGQGPSKLKRIRQDEEKPLQFEFKEPEREERQIATNRRVGRFDMQPERKSLAWGIAAFAFGMTA",
-             "ITYLPNFL"]
+             "ITYLPNFL"]
-  },
+  },
-  { "name" : "BQT4_SCHPO",
+  { "name" : "BQT4_SCHPO",
-    "RefSeqID" : "NP_596166",
+    "RefSeqID" : "NP_596166",
-    "UniProtID" : "O60158",
+    "UniProtID" : "O60158",
-    "taxonomyID" : "284812",
+    "taxonomyID" : "284812",
-    "sequence" : [
+    "sequence" : [
-             "MTENEKSRSLPAERNPLYKDDTLDHTPLIPKCRAQVIEFPDGPATFVRLKCTNPESKVPHFLMRMAKDSSISATSMFRSA",
+             "MTENEKSRSLPAERNPLYKDDTLDHTPLIPKCRAQVIEFPDGPATFVRLKCTNPESKVPHFLMRMAKDSSISATSMFRSA",
-             "FPKATQEEEDLEMRWIRDNLNPIEDKRVAGLWVPPADALALAKDYSMTPFINALLEASSTPSTYATPSRPTAQKSETSEG",
+             "FPKATQEEEDLEMRWIRDNLNPIEDKRVAGLWVPPADALALAKDYSMTPFINALLEASSTPSTYATPSRPTAQKSETSEG",
-             "EPESSTSATTTSVARRTRQRLAEHLENSKKTILQHDNKEEDKEIHSEENETKDEIKSEKKEPEIKKQEGGSSTEKVGQPS",
+             "EPESSTSATTTSVARRTRQRLAEHLENSKKTILQHDNKEEDKEIHSEENETKDEIKSEKKEPEIKKQEGGSSTEKVGQPS",
-             "SSDDKAKGSTSKDQPSEEEEKTSDIQDRKIKTPIKPSLLGKIRSSVNKGMTDVASQVNRGMTDVASQVNKGVNGVASQVN",
+             "SSDDKAKGSTSKDQPSEEEEKTSDIQDRKIKTPIKPSLLGKIRSSVNKGMTDVASQVNRGMTDVASQVNKGVNGVASQVN",
-             "KGMNGVANQVNKGVTGVASQVRKPVGKLEKKFENLEKSIGDTLKSSIRSSPKSKKRSREDFEENEDYNAMVPVKRSRITK",
+             "KGMNGVANQVNKGVTGVASQVRKPVGKLEKKFENLEKSIGDTLKSSIRSSPKSKKRSREDFEENEDYNAMVPVKRSRITK",
-             "LESEVYYEKRKVRALGGIAIGLGVGAILPFLF"]
+             "LESEVYYEKRKVRALGGIAIGLGVGAILPFLF"]
-  },
+  },
-  { "name" : "PGTG_05590",
+  { "name" : "PGTG_05590",
-    "RefSeqID" : "XP_003323688",
+    "RefSeqID" : "XP_003323688",
-    "UniProtID" : "E3K4V4",
+    "UniProtID" : "E3K4V4",
-    "taxonomyID" : "418459",
+    "taxonomyID" : "418459",
-    "sequence" : [
+    "sequence" : [
-             "MPKSSSCCEPEQKQSIPTNANPISAGGAGLDIRLAGMRSAHATLRGCSFSPYMVTQHPPLRDSVNRNKQQPTNNSTNPYT",
+             "MPKSSSCCEPEQKQSIPTNANPISAGGAGLDIRLAGMRSAHATLRGCSFSPYMVTQHPPLRDSVNRNKQQPTNNSTNPYT",
-             "KKASRMSQTNLYKSNNPPNLPQDEFNQTLVNYQGKLRSIRIQDININGHTITIARIKIPSPEKLSSHLIKRFDTNAISAS",
+             "KKASRMSQTNLYKSNNPPNLPQDEFNQTLVNYQGKLRSIRIQDININGHTITIARIKIPSPEKLSSHLIKRFDTNAISAS",
-             "SFFRSAFPHSTEEEEAIQMRYLHQIYDTHTAGAVEFGSARKLTGVWVPIENAAELAEVYGLTRFAEPLLAFPNPKENPRS",
+             "SFFRSAFPHSTEEEEAIQMRYLHQIYDTHTAGAVEFGSARKLTGVWVPIENAAELAEVYGLTRFAEPLLAFPNPKENPRS",
-             "PTGTKIGGEDESSTTQTPKASQQSKLTGQISVTRSSKRSRAGPLSFGNTSPSSFSLNSFNKPPTETNKSGTHDDSKSTND",
+             "PTGTKIGGEDESSTTQTPKASQQSKLTGQISVTRSSKRSRAGPLSFGNTSPSSFSLNSFNKPPTETNKSGTHDDSKSTND",
-             "ENDEKPASPTDRVAGRGARNSPSKKPTTVDENHEHTEHEDHQLIGTDELAQRAKQEALKLVSELKNSQPCTQSSLESPTN",
+             "ENDEKPASPTDRVAGRGARNSPSKKPTTVDENHEHTEHEDHQLIGTDELAQRAKQEALKLVSELKNSQPCTQSSLESPTN",
-             "TLETELTRTTSPAKSNKVTRKRSSDEVSFEGEEQGEDEDEERTADETATHRSFLPKLLWRKSAAQAHPNSKKHKRTQLGG",
+             "TLETELTRTTSPAKSNKVTRKRSSDEVSFEGEEQGEDEDEERTADETATHRSFLPKLLWRKSAAQAHPNSKKHKRTQLGG",
-             "GGSSSSSSKSFVPLLTNSATPSVDDSSSTHNPNKRNLAIAGIVIAGAAA"]
+             "GGSSSSSSKSFVPLLTNSATPSVDDSSSTHNPNKRNLAIAGIVIAGAAA"]
-  },
+  },
-  { "name" : "06560_NEUCR",
+  { "name" : "06560_NEUCR",
-    "RefSeqID" : "XP_962267",
+    "RefSeqID" : "XP_962267",
-    "UniProtID" : "Q7S9H5",
+    "UniProtID" : "Q7S9H5",
-    "taxonomyID" : "367110",
+    "taxonomyID" : "367110",
-    "sequence" : [
+    "sequence" : [
-             "MAQVARHLPARRNPLMLEDVPSHTDLASRRRLGQTQLTPRMVTAVPGAEVDPSSLLAFDYAHLRAPLPKGIVSGIFKSSP",
+             "MAQVARHLPARRNPLMLEDVPSHTDLASRRRLGQTQLTPRMVTAVPGAEVDPSSLLAFDYAHLRAPLPKGIVSGIFKSSP",
-             "PSYFLMRRSQDGYISATGMFKATFPYASQEEEEAERKYIKSIPTTSSEETAGNVWIPPEQALILAEEYQITPWIRALLDP",
+             "PSYFLMRRSQDGYISATGMFKATFPYASQEEEEAERKYIKSIPTTSSEETAGNVWIPPEQALILAEEYQITPWIRALLDP",
-             "SDIAVTATDSSAPKQIAPPPKFFGAQPPLVAPTPPTTRSTRSRPSSRRSSSPAKSTTTSKRGTTPRNTKRTVTTEASATT",
+             "SDIAVTATDSSAPKQIAPPPKFFGAQPPLVAPTPPTTRSTRSRPSSRRSSSPAKSTTTSKRGTTPRNTKRTVTTEASATT",
-             "VTTTATATAVPSAETPATSFADSQAPTLINGEIPTSTPINTVPVTKIQTTEAELKVESIEKEPVVVLEPIEEEPKIKVRV",
+             "VTTTATATAVPSAETPATSFADSQAPTLINGEIPTSTPINTVPVTKIQTTEAELKVESIEKEPVVVLEPIEEEPKIKVRV",
-             "DEDVKLDKDGEEVKHTKVELEVPLMAGEPPSKEEARKMIEEAKAMVEAAVKADAEAAAALVEASKAGAEDEKAEDEAKAE",
+             "DEDVKLDKDGEEVKHTKVELEVPLMAGEPPSKEEARKMIEEAKAMVEAAVKADAEAAAALVEASKAGAEDEKAEDEAKAE",
-             "TEATKEEEADSKGKRKAEKISVDEDEKAADEAEQPRQAKRVKTEAELRKDRIRKRAYLGLTATFAVGALGALLPIITPYV",
+             "TEATKEEEADSKGKRKAEKISVDEDEKAADEAEQPRQAKRVKTEAELRKDRIRKRAYLGLTATFAVGALGALLPIITPYV",
-             "ANVL"]
+             "ANVL"]
-  },
+  },
-  { "name" : "81480_BIPOR",
+  { "name" : "81480_BIPOR",
-    "RefSeqID" : "XP_007682909",
+    "RefSeqID" : "XP_007682909",
-    "UniProtID" : "W6ZKJ4",
+    "UniProtID" : "W6ZKJ4",
-    "taxonomyID" : "930090",
+    "taxonomyID" : "930090",
-    "sequence" : [
+    "sequence" : [
-             "MVVDRVLPERKNPLLEPTDSTSIEILIERRRLGQTNLGVKAGVSGIANATKPENMGTFDYAHLRVPLPKDLTGSGIFSRN",
+             "MVVDRVLPERKNPLLEPTDSTSIEILIERRRLGQTNLGVKAGVSGIANATKPENMGTFDYAHLRVPLPKDLTGSGIFSRN",
-             "RMSAFPESYFLMRRSSDGYISATGMFKAAFPWASLQEEDLERKYQKTFPSAGDEEVAGSVWIAPEEALALSEEYSMRHWI",
+             "RMSAFPESYFLMRRSSDGYISATGMFKAAFPWASLQEEDLERKYQKTFPSAGDEEVAGSVWIAPEEALALSEEYSMRHWI",
-             "EALLDPAPIEKGGKDKSNAAIQMPPRFDVANAQPATLPTFGFRQTRARSARSVSPSKAMTPGRKYATPRKGRSTRSAMKP",
+             "EALLDPAPIEKGGKDKSNAAIQMPPRFDVANAQPATLPTFGFRQTRARSARSVSPSKAMTPGRKYATPRKGRSTRSAMKP",
-             "DATHADDMFRPIEAVTPSTALQNSIARRIAPAETIASSIEGEVKEVEQEVKAALDAEKKPEPELEVQEGTVHIEVKQTVE",
+             "DATHADDMFRPIEAVTPSTALQNSIARRIAPAETIASSIEGEVKEVEQEVKAALDAEKKPEPELEVQEGTVHIEVKQTVE",
-             "TNGDTEKTSTSVTVDVPHDHAALPEPEDPTAMIEEAKRMVAEAQKLEGGSPSVTRSSKRGIEEVLDEEDLADERLNKLAK",
+             "TNGDTEKTSTSVTVDVPHDHAALPEPEDPTAMIEEAKRMVAEAQKLEGGSPSVTRSSKRGIEEVLDEEDLADERLNKLAK",
-             "KAYTTEQKMTKEKVTRRALVGLGVMAAIGTAFQYFV"]
+             "KAYTTEQKMTKEKVTRRALVGLGVMAAIGTAFQYFV"]
-  },
+  },
-  { "name" : "01622_ASPNI",
+  { "name" : "01622_ASPNI",
-    "RefSeqID" : "XP_657766",
+    "RefSeqID" : "XP_657766",
-    "UniProtID" : "Q5BH18",
+    "UniProtID" : "Q5BH18",
-    "taxonomyID" : "227321",
+    "taxonomyID" : "227321",
-    "sequence" : [
+    "sequence" : [
-             "MVRSLPKKNNPFVTPDAAPPYEELLMRRRLGKTNLAVKPTQVGTSNATKPENLGPFEYAHLRAPLPKDLKGSEIFPSHSP",
+             "MVRSLPKKNNPFVTPDAAPPYEELLMRRRLGKTNLAVKPTQVGTSNATKPENLGPFEYAHLRAPLPKDLKGSEIFPSHSP",
-             "QQHPETYFLMRRSKDGYVSATGMFKIAFPWAKLEEERSEREYLKTRPETSEDEIAGNVWISPVLALELAAEYKMYDWVRA",
+             "QQHPETYFLMRRSKDGYVSATGMFKIAFPWAKLEEERSEREYLKTRPETSEDEIAGNVWISPVLALELAAEYKMYDWVRA",
-             "LLDPTEIIQSPSSAKKQITPPPKFELPPIQAPEALVPSSRTRSRRSASPSKKAGTPRKPRQTKAQKEAAVAATNEANATL",
+             "LLDPTEIIQSPSSAKKQITPPPKFELPPIQAPEALVPSSRTRSRRSASPSKKAGTPRKPRQTKAQKEAAVAATNEANATL",
-             "QSALDDTVSNADGEINGDVLPSVEDKREPETSPVKGKKAAAKAKKQAVSEEDQEDKVKIEIKSDAAEGSDVQAAQTTISV",
+             "QSALDDTVSNADGEINGDVLPSVEDKREPETSPVKGKKAAAKAKKQAVSEEDQEDKVKIEIKSDAAEGSDVQAAQTTISV",
-             "EMPISLPEAPSAEDTQEMIAKAKEMVKEAVKLQQEPAESSATAKKRGAEEAELGEEEEDEETKTLRTKRAKVLEEKLKRE",
+             "EMPISLPEAPSAEDTQEMIAKAKEMVKEAVKLQQEPAESSATAKKRGAEEAELGEEEEDEETKTLRTKRAKVLEEKLKRE",
-             "RVRNRALMGVTAAFALAKPALVLLEA"]
+             "RVRNRALMGVTAAFALAKPALVLLEA"]
-  },
+  },
-  { "name" : "05405_ASPNI",
+  { "name" : "05405_ASPNI",
-    "RefSeqID" : "XP_663009",
+    "RefSeqID" : "XP_663009",
-    "UniProtID" : "Q5B225",
+    "UniProtID" : "Q5B225",
-    "taxonomyID" : "227321",
+    "taxonomyID" : "227321",
-    "sequence" : [
+    "sequence" : [
-             "MASIQFLLNPLPSLPSSDRCPLPTPSPTISSSTAMLRSPRQKKQKMAKDAPIFQRGKPRGEVRYPPYEDRDGKFSCQHQD",
+             "MASIQFLLNPLPSLPSSDRCPLPTPSPTISSSTAMLRSPRQKKQKMAKDAPIFQRGKPRGEVRYPPYEDRDGKFSCQHQD",
-             "FRIHPLGNIADYPRHIPYNSDKKSFQERTGRESFEVFQYTFQLPGEEKQWTVMWDYNIGLVRTTHLFKCNDYSKTTPAKM",
+             "FRIHPLGNIADYPRHIPYNSDKKSFQERTGRESFEVFQYTFQLPGEEKQWTVMWDYNIGLVRTTHLFKCNDYSKTTPAKM",
-             "LNQNPGLRDICHSITGGALAAQGYWMPYEAAKAIAATFCWKIRFALTPLFGDNFPDLCIHPDDRARFGRMVIDPGIVRIA",
+             "LNQNPGLRDICHSITGGALAAQGYWMPYEAAKAIAATFCWKIRFALTPLFGDNFPDLCIHPDDRARFGRMVIDPGIVRIA",
-             "TEKANLYRMLELRCSTTNSLRADYVLRPSSAPDIDRTDPNLERDRVALGRHILPKSHRHHHHRSKTSPSTNTSLVGYGSS",
+             "TEKANLYRMLELRCSTTNSLRADYVLRPSSAPDIDRTDPNLERDRVALGRHILPKSHRHHHHRSKTSPSTNTSLVGYGSS",
-             "PEVEYYSCGTEPYCVSPESPIRSSFTPVNTPRSTDIYPSSSSTNFLRSPHELLASLSSSASIARARIERASKISGARVIP",
+             "PEVEYYSCGTEPYCVSPESPIRSSFTPVNTPRSTDIYPSSSSTNFLRSPHELLASLSSSASIARARIERASKISGARVIP",
-             "SSVPSNVTSITTKGRDNTGHSALMEESDIDADAETDSGHEHDLDFELSSSDESSTSSTVSSSTSSASLGFAANSRNRPYR",
+             "SSVPSNVTSITTKGRDNTGHSALMEESDIDADAETDSGHEHDLDFELSSSDESSTSSTVSSSTSSASLGFAANSRNRPYR",
-             "DDDEPHRDTDEEMVDYRAPKRIATAGARDRRWGRGRRVIHQEHSDIETSRRARKHAQRSSNARLVCEMTAAHALISLLHD",
+             "DDDEPHRDTDEEMVDYRAPKRIATAGARDRRWGRGRRVIHQEHSDIETSRRARKHAQRSSNARLVCEMTAAHALISLLHD",
-             "ATGSDVDVDTHNRLECGRSPDGGVKNNLKGSYFGIRLNHNPSTESGQKRRRASA"]
+             "ATGSDVDVDTHNRLECGRSPDGGVKNNLKGSYFGIRLNHNPSTESGQKRRRASA"]
-  },
+  },
-  { "name" : "105954_BIPOR",
+  { "name" : "105954_BIPOR",
-    "RefSeqID" : "XP_007691967",
+    "RefSeqID" : "XP_007691967",
-    "UniProtID" : "W6Z1H5",
+    "UniProtID" : "W6Z1H5",
-    "taxonomyID" : "930090",
+    "taxonomyID" : "930090",
-    "sequence" : [
+    "sequence" : [
-             "MNIQDLLNPSCGDRHDHRRSESATPPSRPVAILPALRRQKIPKDAPIFSEGNRTVGIVNFAPHEAGNDEELLAQHCRFQI",
+             "MNIQDLLNPSCGDRHDHRRSESATPPSRPVAILPALRRQKIPKDAPIFSEGNRTVGIVNFAPHEAGNDEELLAQHCRFQI",
-             "YPLGEISRKGVRHIPYNSDKKDFLEKTGRDAFEMFQYTYKLPGEDKPYVVVWDYNVGLVRMTPFFKSCKYSKTIPAKTLR",
+             "YPLGEISRKGVRHIPYNSDKKDFLEKTGRDAFEMFQYTYKLPGEDKPYVVVWDYNVGLVRMTPFFKSCKYSKTIPAKTLR",
-             "ENPGLKDISYSITGGALVCQGYWIPYQAARAIAATFCYDIRWALTPVFGNDFPSICLTPDDPSFAKFVIDPAIVRYCTEE",
+             "ENPGLKDISYSITGGALVCQGYWIPYQAARAIAATFCYDIRWALTPVFGNDFPSICLTPDDPSFAKFVIDPAIVRYCTEE",
-             "TTKFRELGSAYEVHRPVAPTQVEAPTSRSDQPLSTSIVRQRRARPIDIESGYGTDTERNDRCLFSPEVSPRTRFTPINRP",
+             "TTKFRELGSAYEVHRPVAPTQVEAPTSRSDQPLSTSIVRQRRARPIDIESGYGTDTERNDRCLFSPEVSPRTRFTPINRP",
-             "RSPYSPRTAESSFVSSPVSIRAPPGLHTPTSTPYEHSGEVFRAKRSHSKVAFCEHPADEAVIRPPTAATVDSAHGCEMCV",
+             "RSPYSPRTAESSFVSSPVSIRAPPGLHTPTSTPYEHSGEVFRAKRSHSKVAFCEHPADEAVIRPPTAATVDSAHGCEMCV",
-             "GDDNHSHLDMDAAEMLLSLRTADSAMPPSKRTRRGS"]
+             "GDDNHSHLDMDAAEMLLSLRTADSAMPPSKRTRRGS"]
-  },
+  },
-  { "name" : "69819_WALME",
+  { "name" : "69819_WALME",
-    "RefSeqID" : "XP_006959479",
+    "RefSeqID" : "XP_006959479",
-    "UniProtID" : "I4Y911",
+    "UniProtID" : "I4Y911",
-    "taxonomyID" : "671144",
+    "taxonomyID" : "671144",
-    "sequence" : [
+    "sequence" : [
-             "MTSPGLPKDFNELLDKSEIPSPKWQQITRDDRPITIARLKLPHPREKHTFILRRYDCNGISFGSLFKAAYPYATDEEEKI",
+             "MTSPGLPKDFNELLDKSEIPSPKWQQITRDDRPITIARLKLPHPREKHTFILRRYDCNGISFGSLFKAAYPYATDEEEKI",
-             "ESGFVKKNYDVTLVPTEEYQERKLAKLAGFWIPIAIAEELGQRYAMAEYVDALAKADTPDLTDFKKRSSNRQTSEDIKSS",
+             "ESGFVKKNYDVTLVPTEEYQERKLAKLAGFWIPIAIAEELGQRYAMAEYVDALAKADTPDLTDFKKRSSNRQTSEDIKSS",
-             "PAKAQASLESPAKSASKIPTPTKNPAPRRSARHQSRSPSPSPLTHNLTPGKKKAKKAPKEAVIEESVEETIVVDKKESPL",
+             "PAKAQASLESPAKSASKIPTPTKNPAPRRSARHQSRSPSPSPLTHNLTPGKKKAKKAPKEAVIEESVEETIVVDKKESPL",
-             "KKALNDDQVLADIERAKDLVDDIKQSKNLSQSSPVKVVKEEVLETIQPSVSTESLEGEGKRKRELEDETGNEIKVVSFGQ",
+             "KKALNDDQVLADIERAKDLVDDIKQSKNLSQSSPVKVVKEEVLETIQPSVSTESLEGEGKRKRELEDETGNEIKVVSFGQ",
-             "NPPANPEEIQQRPVVQRRGVAAAVGAFALGVGFAASNILPRFLF"]
+             "NPPANPEEIQQRPVVQRRGVAAAVGAFALGVGFAASNILPRFLF"]
-  },
+  },
-  { "name" : "02840_CRYNE",
+  { "name" : "02840_CRYNE",
-    "RefSeqID" : "XP_568872",
+    "RefSeqID" : "XP_568872",
-    "UniProtID" : "Q5KM59",
+    "UniProtID" : "Q5KM59",
-    "taxonomyID" : "214684",
+    "taxonomyID" : "214684",
-    "sequence" : [
+    "sequence" : [
-             "MSHPAADAPPPYPGTTDDAQYDLTPLPHTANRPRLPEDKRNPHLNNLPEDTKIVKFQTIVRENKEIVVGRIKVPTENANG",
+             "MSHPAADAPPPYPGTTDDAQYDLTPLPHTANRPRLPEDKRNPHLNNLPEDTKIVKFQTIVRENKEIVVGRIKVPTENANG",
-             "THHAFILRRYDTNAISLTTMYKVAFPSATEEEEKREMDWVKSSFDTRGTNGGRDSEVVRLAGQWVSRNLAIHIAPAYNLV",
+             "THHAFILRRYDTNAISLTTMYKVAFPSATEEEEKREMDWVKSSFDTRGTNGGRDSEVVRLAGQWVSRNLAIHIAPAYNLV",
-             "QLVAALSRAVPDPNVAYRKSQRSQAAADELARTKAKQSQAPSSVPAISNVPVRKPQAAIPSMATEISSPASKRQRKDSVT",
+             "QLVAALSRAVPDPNVAYRKSQRSQAAADELARTKAKQSQAPSSVPAISNVPVRKPQAAIPSMATEISSPASKRQRKDSVT",
-             "EASGSATQTITEAQPSADTSETDDTRHITIEATTTITSPSGANVDMDAEIEQAKQLVKDLRQEIQLRNEAGDSLEDQGVA",
+             "EASGSATQTITEAQPSADTSETDDTRHITIEATTTITSPSGANVDMDAEIEQAKQLVKDLRQEIQLRNEAGDSLEDQGVA",
-             "VADDVRGVKRGKHEDEAVVISGGAGGKDRVVRTNKRIPQTAGGDVGQRFGWGAFVFSIGLGASLTLFSQYASSLL"]
+             "VADDVRGVKRGKHEDEAVVISGGAGGKDRVVRTNKRIPQTAGGDVGQRFGWGAFVFSIGLGASLTLFSQYASSLL"]
-  },
+  },
-  { "name" : "11055_USTMA",
+  { "name" : "11055_USTMA",
-    "RefSeqID" : "XP_011390537",
+    "RefSeqID" : "XP_011390537",
-    "UniProtID" : "A0A0D1DZM8",
+    "UniProtID" : "A0A0D1DZM8",
-    "taxonomyID" : "237631",
+    "taxonomyID" : "237631",
-    "sequence" : [
+    "sequence" : [
-             "MPAAASARKSTPTRKSTPRRARSSSVTSNASTGVPASPSASPRKTKKQKEAAAAAAAAVAAAAATAEQVNDDESDLLRPK",
+             "MPAAASARKSTPTRKSTPRRARSSSVTSNASTGVPASPSASPRKTKKQKEAAAAAAAAVAAAAATAEQVNDDESDLLRPK",
-             "LPTKRNPRLKEVDEAVVKLQIIKREGHNIIIGRVKLPTVNGQDHAFLLKRFDTNAMAASSMFRLAFPFADGTAEAAEMRF",
+             "LPTKRNPRLKEVDEAVVKLQIIKREGHNIIIGRVKLPTVNGQDHAFLLKRFDTNAMAASSMFRLAFPFADGTAEAAEMRF",
-             "LDTKYDTNRANGGYIVEEVKVPETPKKRGRTRKTAENSKKESTPDTESVSADKQIRVLPEGSTGVRLQGTWIPAEDAIEV",
+             "LDTKYDTNRANGGYIVEEVKVPETPKKRGRTRKTAENSKKESTPDTESVSADKQIRVLPEGSTGVRLQGTWIPAEDAIEV",
-             "AEDYGIAKYALALIHATAEHAEDGGAPILTSEPVAEVKTPRKRQRVSAAAATASDTPDSPQLVQRVTRLENADGSISKVR",
+             "AEDYGIAKYALALIHATAEHAEDGGAPILTSEPVAEVKTPRKRQRVSAAAATASDTPDSPQLVQRVTRLENADGSISKVR",
-             "VESTLEAPSSNGVPVALSQAEIEEQIAQAKALAAGIQQSITAGSGSASTRGQKRRAVNDRPTAEIDPLADDEDYSESGRV",
+             "VESTLEAPSSNGVPVALSQAEIEEQIAQAKALAAGIQQSITAGSGSASTRGQKRRAVNDRPTAEIDPLADDEDYSESGRV",
-             "VRAFRRGTRVARRRPIATTAGAVAAAGAVGAGALAWVSGGNPEVAIQTLQASMQSIGLQNLQNLGLQNLQQIGTQLGAHL",
+             "VRAFRRGTRVARRRPIATTAGAVAAAGAVGAGALAWVSGGNPEVAIQTLQASMQSIGLQNLQNLGLQNLQQIGTQLGAHL",
-             "ASILPW"]
+             "ASILPW"]
-  },
+  },
-  { "name" : "XBP1_NEUCR",
+  { "name" : "XBP1_NEUCR",
-    "RefSeqID" : "XP_962373",
+    "RefSeqID" : "XP_962373",
-    "UniProtID" : "Q7S9W7",
+    "UniProtID" : "Q7S9W7",
-    "taxonomyID" : "367110",
+    "taxonomyID" : "367110",
-    "sequence" : [
+    "sequence" : [
-             "MLNQNPGLKDIAYSITGGAIKAQGYWMPYACAKAVCATFCYQIAGALIPLFGPDFPSECISPGEPRYGIMIIKPELISDT",
+             "MLNQNPGLKDIAYSITGGAIKAQGYWMPYACAKAVCATFCYQIAGALIPLFGPDFPSECISPGEPRYGIMIIKPELISDT",
-             "MRKAQELYRRYGNWGGGCTSSSPARRPLRTASSGSQERHHHHPYPNQEHLDHQQQQQRTVCSRRCPAEENSCVDARPQLR",
+             "MRKAQELYRRYGNWGGGCTSSSPARRPLRTASSGSQERHHHHPYPNQEHLDHQQQQQRTVCSRRCPAEENSCVDARPQLR",
-             "GISAPMPPAGEWTPPLLRSSAGRPRPVMPTSTHSSISYPERAPHRSAWTAVNHQPPNNSLDRYSLKRPLPSNEPDESVSH",
+             "GISAPMPPAGEWTPPLLRSSAGRPRPVMPTSTHSSISYPERAPHRSAWTAVNHQPPNNSLDRYSLKRPLPSNEPDESVSH",
-             "SNWPSRSQAPNPWLTAIPRSPRKTSSSPWASQPGSASRSRAGSIDSMASQHPQGLPSPSLILSSPSSSMVSLSSSNSPSP",
+             "SNWPSRSQAPNPWLTAIPRSPRKTSSSPWASQPGSASRSRAGSIDSMASQHPQGLPSPSLILSSPSSSMVSLSSSNSPSP",
-             "RPQLPPISQLCSLPVPSGRRRLPNGRPSRVGGDATSSHSRQDHSTCGAYQFSAGYQRALTPPSSTSAPMHWRSQRRPSLQ",
+             "RPQLPPISQLCSLPVPSGRRRLPNGRPSRVGGDATSSHSRQDHSTCGAYQFSAGYQRALTPPSSTSAPMHWRSQRRPSLQ",
-             "DQHEHEHIEDTQPRRIAVEANMECGDDNESHLHLPLPLPRTSSSASIVADKNANDTTSDNSSSRNFNSASIGSGRDDGQT",
+             "DQHEHEHIEDTQPRRIAVEANMECGDDNESHLHLPLPLPRTSSSASIVADKNANDTTSDNSSSRNFNSASIGSGRDDGQT",
-             "SLAARKTAALTLLHLRQQEEEKEAAAAAAAAAAAAYSSTKRPESPSSSLSSPVSPPPTSGQPSPTLSAVVTATNLRRGTT",
+             "SLAARKTAALTLLHLRQQEEEKEAAAAAAAAAAAAYSSTKRPESPSSSLSSPVSPPPTSGQPSPTLSAVVTATNLRRGTT",
-             "TATATAVIDTTEPLAPPPSPSSNYLGSPISTSIASSSSSFSPSTSCNGTRENSVVANEMTRYAGQEADAGGPRHCNGDAD",
+             "TATATAVIDTTEPLAPPPSPSSNYLGSPISTSIASSSSSFSPSTSCNGTRENSVVANEMTRYAGQEADAGGPRHCNGDAD",
-             "DEGDYEHEQQYRRKRRRLLLVGRAKSF"]
+             "DEGDYEHEQQYRRKRRRLLLVGRAKSF"]
-  },
+  },
-  { "name" : "XBP1_SACCE",
+  { "name" : "XBP1_SACCE",
-    "RefSeqID" : "NP_012165",
+    "RefSeqID" : "NP_012165",
-    "UniProtID" : "P40489",
+    "UniProtID" : "P40489",
-    "taxonomyID" : "559292",
+    "taxonomyID" : "559292",
-    "sequence" : [
+    "sequence" : [
-             "MKYPAFSINSDTVHLTDNPLDDYQRLYLVSVLDRDSPPASFSAGLNIRKVNYKSSIAAQFTHPNFIISARDAGNGEEAAA",
+             "MKYPAFSINSDTVHLTDNPLDDYQRLYLVSVLDRDSPPASFSAGLNIRKVNYKSSIAAQFTHPNFIISARDAGNGEEAAA",
-             "QNVLNCFEYQFPNLQTIQSLVHEQTLLSQLASSATPHSALHLHDKNILMGKIILPSRSNKTPVSASPTKQEKKALSTASR",
+             "QNVLNCFEYQFPNLQTIQSLVHEQTLLSQLASSATPHSALHLHDKNILMGKIILPSRSNKTPVSASPTKQEKKALSTASR",
-             "ENATSSLTKNQQFKLTKMDHNLINDKLINPNNCVIWSHDSGYVFMTGIWRLYQDVMKGLINLPRGDSVSTSQQQFFCKAE",
+             "ENATSSLTKNQQFKLTKMDHNLINDKLINPNNCVIWSHDSGYVFMTGIWRLYQDVMKGLINLPRGDSVSTSQQQFFCKAE",
-             "FEKILSFCFYNHSSFTSEESSSVLLSSSTSSPPKRRTSTGSTFLDANASSSSTSSTQANNYIDFHWNNIKPELRDLICQS",
+             "FEKILSFCFYNHSSFTSEESSSVLLSSSTSSPPKRRTSTGSTFLDANASSSSTSSTQANNYIDFHWNNIKPELRDLICQS",
-             "YKDFLINELGPDQIDLPNLNPANFTKRIRGGYIKIQGTWLPMEISRLLCLRFCFPIRYFLVPIFGPDFPKDCESWYLAHQ",
+             "YKDFLINELGPDQIDLPNLNPANFTKRIRGGYIKIQGTWLPMEISRLLCLRFCFPIRYFLVPIFGPDFPKDCESWYLAHQ",
-             "NVTFASSTTGAGAATAATAAANTSTNFTSTAVARPRQKPRPRPRQRSTSMSHSKAQKLVIEDALPSFDSFVENLGLSSND",
+             "NVTFASSTTGAGAATAATAAANTSTNFTSTAVARPRQKPRPRPRQRSTSMSHSKAQKLVIEDALPSFDSFVENLGLSSND",
-             "KNFIKKNSKRQKSSTYTSQTSSPIGPRDPTVQILSNLASFYNTHGHRYSYPGNIYIPQQRYSLPPPNQLSSPQRQLNYTY",
+             "KNFIKKNSKRQKSSTYTSQTSSPIGPRDPTVQILSNLASFYNTHGHRYSYPGNIYIPQQRYSLPPPNQLSSPQRQLNYTY",
-             "DHIHPVPSQYQSPRHYNVPSSPIAPAPPTFPQPYGDDHYHFLKYASEVYKQQNQRPAHNTNTNMDTSFSPRANNSLNNFK",
+             "DHIHPVPSQYQSPRHYNVPSSPIAPAPPTFPQPYGDDHYHFLKYASEVYKQQNQRPAHNTNTNMDTSFSPRANNSLNNFK",
-             "FKTNSKQ"]
+             "FKTNSKQ"]
-  }
+  }
-]
+]
--- a/data/refAnnotations.json
+++ b/data/refAnnotations.json
@ -1,116 +1,116 @@
-[
+[
-  {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"},
+  {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"},
-  {"pName" : "MBP1_SACCE", "fName" : "KilA-N", "start" : "22", "end" : "105"},
+  {"pName" : "MBP1_SACCE", "fName" : "KilA-N", "start" : "22", "end" : "105"},
-  {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "108", "end" : "122"},
+  {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "108", "end" : "122"},
-  {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "236", "end" : "241"},
+  {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "236", "end" : "241"},
-  {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "279", "end" : "307"},
+  {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "279", "end" : "307"},
-  {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "700", "end" : "717"},
+  {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "700", "end" : "717"},
-  {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "394", "end" : "423"},
+  {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "394", "end" : "423"},
-  {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "427", "end" : "463"},
+  {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "427", "end" : "463"},
-  {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "512", "end" : "541"},
+  {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "512", "end" : "541"},
-  {"pName" : "MBP1_SACCE", "fName" : "Swi6 fold", "start" : "381", "end" : "547"},
+  {"pName" : "MBP1_SACCE", "fName" : "Swi6 fold", "start" : "381", "end" : "547"},
-  {"pName" : "MBP1_SACCE", "fName" : "coiled coil", "start" : "633", "end" : "655"},
+  {"pName" : "MBP1_SACCE", "fName" : "coiled coil", "start" : "633", "end" : "655"},
-
+
-  {"pName" : "MBP1_ASPNI", "fName" : "APSES fold", "start" : "9", "end" : "106"},
+  {"pName" : "MBP1_ASPNI", "fName" : "APSES fold", "start" : "9", "end" : "106"},
-  {"pName" : "MBP1_ASPNI", "fName" : "KilA-N", "start" : "26", "end" : "109"},
+  {"pName" : "MBP1_ASPNI", "fName" : "KilA-N", "start" : "26", "end" : "109"},
-  {"pName" : "MBP1_ASPNI", "fName" : "low complexity", "start" : "529", "end" : "534"},
+  {"pName" : "MBP1_ASPNI", "fName" : "low complexity", "start" : "529", "end" : "534"},
-  {"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "260", "end" : "289"},
+  {"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "260", "end" : "289"},
-  {"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "381", "end" : "413"},
+  {"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "381", "end" : "413"},
-  {"pName" : "MBP1_ASPNI", "fName" : "Swi6 fold", "start" : "193", "end" : "402"},
+  {"pName" : "MBP1_ASPNI", "fName" : "Swi6 fold", "start" : "193", "end" : "402"},
-  {"pName" : "MBP1_ASPNI", "fName" : "coiled coil", "start" : "509", "end" : "572"},
+  {"pName" : "MBP1_ASPNI", "fName" : "coiled coil", "start" : "509", "end" : "572"},
-
+
-  {"pName" : "MBP1_BIPOR", "fName" : "APSES fold", "start" : "8", "end" : "106"},
+  {"pName" : "MBP1_BIPOR", "fName" : "APSES fold", "start" : "8", "end" : "106"},
-  {"pName" : "MBP1_BIPOR", "fName" : "KilA-N", "start" : "26", "end" : "109"},
+  {"pName" : "MBP1_BIPOR", "fName" : "KilA-N", "start" : "26", "end" : "109"},
-  {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "134", "end" : "152"},
+  {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "134", "end" : "152"},
-  {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "267", "end" : "278"},
+  {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "267", "end" : "278"},
-  {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "670", "end" : "685"},
+  {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "670", "end" : "685"},
-  {"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "266", "end" : "295"},
+  {"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "266", "end" : "295"},
-  {"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "387", "end" : "416"},
+  {"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "387", "end" : "416"},
-  {"pName" : "MBP1_BIPOR", "fName" : "Swi6 fold", "start" : "253", "end" : "421"},
+  {"pName" : "MBP1_BIPOR", "fName" : "Swi6 fold", "start" : "253", "end" : "421"},
-  {"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "659", "end" : "681"},
+  {"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "659", "end" : "681"},
-  {"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "500", "end" : "590"},
+  {"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "500", "end" : "590"},
-
+
-  {"pName" : "MBP1_NEUCR", "fName" : "APSES fold", "start" : "14", "end" : "114"},
+  {"pName" : "MBP1_NEUCR", "fName" : "APSES fold", "start" : "14", "end" : "114"},
-  {"pName" : "MBP1_NEUCR", "fName" : "KilA-N", "start" : "34", "end" : "117"},
+  {"pName" : "MBP1_NEUCR", "fName" : "KilA-N", "start" : "34", "end" : "117"},
-  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "130", "end" : "141"},
+  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "130", "end" : "141"},
-  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "253", "end" : "266"},
+  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "253", "end" : "266"},
-  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "514", "end" : "525"},
+  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "514", "end" : "525"},
-  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "554", "end" : "564"},
+  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "554", "end" : "564"},
-  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "601", "end" : "618"},
+  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "601", "end" : "618"},
-  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "620", "end" : "629"},
+  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "620", "end" : "629"},
-  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "636", "end" : "652"},
+  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "636", "end" : "652"},
-  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "658", "end" : "672"},
+  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "658", "end" : "672"},
-  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "725", "end" : "735"},
+  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "725", "end" : "735"},
-  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "752", "end" : "771"},
+  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "752", "end" : "771"},
-  {"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "268", "end" : "297"},
+  {"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "268", "end" : "297"},
-  {"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "390", "end" : "419"},
+  {"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "390", "end" : "419"},
-  {"pName" : "MBP1_NEUCR", "fName" : "Swi6 fold", "start" : "270", "end" : "426"},
+  {"pName" : "MBP1_NEUCR", "fName" : "Swi6 fold", "start" : "270", "end" : "426"},
-  {"pName" : "MBP1_NEUCR", "fName" : "coiled coil", "start" : "500", "end" : "550"},
+  {"pName" : "MBP1_NEUCR", "fName" : "coiled coil", "start" : "500", "end" : "550"},
-
+
-  {"pName" : "MBP1_SCHPO", "fName" : "APSES fold", "start" : "8", "end" : "104"},
+  {"pName" : "MBP1_SCHPO", "fName" : "APSES fold", "start" : "8", "end" : "104"},
-  {"pName" : "MBP1_SCHPO", "fName" : "KilA-N", "start" : "25", "end" : "113"},
+  {"pName" : "MBP1_SCHPO", "fName" : "KilA-N", "start" : "25", "end" : "113"},
-  {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "111", "end" : "125"},
+  {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "111", "end" : "125"},
-  {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "136", "end" : "145"},
+  {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "136", "end" : "145"},
-  {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "176", "end" : "191"},
+  {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "176", "end" : "191"},
-  {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "422", "end" : "447"},
+  {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "422", "end" : "447"},
-  {"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "247", "end" : "276"},
+  {"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "247", "end" : "276"},
-  {"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "368", "end" : "397"},
+  {"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "368", "end" : "397"},
-  {"pName" : "MBP1_SCHPO", "fName" : "Swi6 fold", "start" : "234", "end" : "400"},
+  {"pName" : "MBP1_SCHPO", "fName" : "Swi6 fold", "start" : "234", "end" : "400"},
-  {"pName" : "MBP1_SCHPO", "fName" : "coiled coil", "start" : "457", "end" : "538"},
+  {"pName" : "MBP1_SCHPO", "fName" : "coiled coil", "start" : "457", "end" : "538"},
-
+
-  {"pName" : "MBP1_COPCI", "fName" : "APSES fold", "start" : "5", "end" : "103"},
+  {"pName" : "MBP1_COPCI", "fName" : "APSES fold", "start" : "5", "end" : "103"},
-  {"pName" : "MBP1_COPCI", "fName" : "KilA-N", "start" : "23", "end" : "106"},
+  {"pName" : "MBP1_COPCI", "fName" : "KilA-N", "start" : "23", "end" : "106"},
-  {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "170", "end" : "191"},
+  {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "170", "end" : "191"},
-  {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "435", "end" : "450"},
+  {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "435", "end" : "450"},
-  {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "611", "end" : "626"},
+  {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "611", "end" : "626"},
-  {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "270", "end" : "299"},
+  {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "270", "end" : "299"},
-  {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "389", "end" : "418"},
+  {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "389", "end" : "418"},
-  {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "474", "end" : "509"},
+  {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "474", "end" : "509"},
-  {"pName" : "MBP1_COPCI", "fName" : "Swi6 fold", "start" : "257", "end" : "429"},
+  {"pName" : "MBP1_COPCI", "fName" : "Swi6 fold", "start" : "257", "end" : "429"},
-  {"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "500", "end" : "570"},
+  {"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "500", "end" : "570"},
-  {"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "651", "end" : "678"},
+  {"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "651", "end" : "678"},
-
+
-  {"pName" : "MBP1_CRYNE", "fName" : "APSES fold", "start" : "16", "end" : "114"},
+  {"pName" : "MBP1_CRYNE", "fName" : "APSES fold", "start" : "16", "end" : "114"},
-  {"pName" : "MBP1_CRYNE", "fName" : "KilA-N", "start" : "34", "end" : "117"},
+  {"pName" : "MBP1_CRYNE", "fName" : "KilA-N", "start" : "34", "end" : "117"},
-  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "66", "end" : "85"},
+  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "66", "end" : "85"},
-  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "413", "end" : "423"},
+  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "413", "end" : "423"},
-  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "633", "end" : "644"},
+  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "633", "end" : "644"},
-  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "697", "end" : "709"},
+  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "697", "end" : "709"},
-  {"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "477", "end" : "506"},
+  {"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "477", "end" : "506"},
-  {"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "618", "end" : "647"},
+  {"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "618", "end" : "647"},
-  {"pName" : "MBP1_CRYNE", "fName" : "Swi6 fold", "start" : "452", "end" : "663"},
+  {"pName" : "MBP1_CRYNE", "fName" : "Swi6 fold", "start" : "452", "end" : "663"},
-
+
-  {"pName" : "MBP1_PUCGR", "fName" : "APSES fold", "start" : "90", "end" : "187"},
+  {"pName" : "MBP1_PUCGR", "fName" : "APSES fold", "start" : "90", "end" : "187"},
-  {"pName" : "MBP1_PUCGR", "fName" : "KilA-N", "start" : "107", "end" : "190"},
+  {"pName" : "MBP1_PUCGR", "fName" : "KilA-N", "start" : "107", "end" : "190"},
-  {"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "208", "end" : "227"},
+  {"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "208", "end" : "227"},
-  {"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "273", "end" : "291"},
+  {"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "273", "end" : "291"},
-  {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "442", "end" : "271"},
+  {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "442", "end" : "271"},
-  {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "475", "end" : "509"},
+  {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "475", "end" : "509"},
-  {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "561", "end" : "590"},
+  {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "561", "end" : "590"},
-  {"pName" : "MBP1_PUCGR", "fName" : "Swi6 fold", "start" : "429", "end" : "601"},
+  {"pName" : "MBP1_PUCGR", "fName" : "Swi6 fold", "start" : "429", "end" : "601"},
-  {"pName" : "MBP1_PUCGR", "fName" : "coiled coil", "start" : "827", "end" : "863"},
+  {"pName" : "MBP1_PUCGR", "fName" : "coiled coil", "start" : "827", "end" : "863"},
-
+
-  {"pName" : "MBP1_USTMA", "fName" : "APSES fold", "start" : "7", "end" : "104"},
+  {"pName" : "MBP1_USTMA", "fName" : "APSES fold", "start" : "7", "end" : "104"},
-  {"pName" : "MBP1_USTMA", "fName" : "KilA-N", "start" : "24", "end" : "107"},
+  {"pName" : "MBP1_USTMA", "fName" : "KilA-N", "start" : "24", "end" : "107"},
-  {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "106", "end" : "116"},
+  {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "106", "end" : "116"},
-  {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "161", "end" : "183"},
+  {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "161", "end" : "183"},
-  {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "666", "end" : "681"},
+  {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "666", "end" : "681"},
-  {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "688", "end" : "700"},
+  {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "688", "end" : "700"},
-  {"pName" : "MBP1_USTMA", "fName" : "AT hook", "start" : "134", "end" : "146"},
+  {"pName" : "MBP1_USTMA", "fName" : "AT hook", "start" : "134", "end" : "146"},
-  {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "245", "end" : "274"},
+  {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "245", "end" : "274"},
-  {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "278", "end" : "314"},
+  {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "278", "end" : "314"},
-  {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "364", "end" : "393"},
+  {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "364", "end" : "393"},
-  {"pName" : "MBP1_USTMA", "fName" : "Swi6 fold", "start" : "232", "end" : "404"},
+  {"pName" : "MBP1_USTMA", "fName" : "Swi6 fold", "start" : "232", "end" : "404"},
-  {"pName" : "MBP1_USTMA", "fName" : "coiled coil", "start" : "590", "end" : "618"},
+  {"pName" : "MBP1_USTMA", "fName" : "coiled coil", "start" : "590", "end" : "618"},
-
+
-  {"pName" : "MBP1_WALME", "fName" : "APSES fold", "start" : "6", "end" : "103"},
+  {"pName" : "MBP1_WALME", "fName" : "APSES fold", "start" : "6", "end" : "103"},
-  {"pName" : "MBP1_WALME", "fName" : "KilA-N", "start" : "23", "end" : "106"},
+  {"pName" : "MBP1_WALME", "fName" : "KilA-N", "start" : "23", "end" : "106"},
-  {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "149", "end" : "162"},
+  {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "149", "end" : "162"},
-  {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "171", "end" : "188"},
+  {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "171", "end" : "188"},
-  {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "618", "end" : "628"},
+  {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "618", "end" : "628"},
-  {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "634", "end" : "660"},
+  {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "634", "end" : "660"},
-  {"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "250", "end" : "279"},
+  {"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "250", "end" : "279"},
-  {"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "369", "end" : "398"},
+  {"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "369", "end" : "398"},
-  {"pName" : "MBP1_WALME", "fName" : "Swi6 fold", "start" : "237", "end" : "409"},
+  {"pName" : "MBP1_WALME", "fName" : "Swi6 fold", "start" : "237", "end" : "409"},
-  {"pName" : "MBP1_WALME", "fName" : "coiled coil", "start" : "461", "end" : "585"}
+  {"pName" : "MBP1_WALME", "fName" : "coiled coil", "start" : "461", "end" : "585"}
-]
+]
--- a/data/refFeatures.json
+++ b/data/refFeatures.json
@ -1,47 +1,47 @@
-[
+[
-  { "name" : "APSES fold",
+  { "name" : "APSES fold",
-    "description " : "DNA binding domain by similarity to structure",
+    "description " : "DNA binding domain by similarity to structure",
-    "sourceDB" : "PDB",
+    "sourceDB" : "PDB",
-    "accession" : "1BM8_A_1_99"},
+    "accession" : "1BM8_A_1_99"},
-
+
-  { "name" : "KilA-N",
+  { "name" : "KilA-N",
-    "description " : "DNA binding domain by Pfam annotation",
+    "description " : "DNA binding domain by Pfam annotation",
-    "sourceDB" : "Pfam",
+    "sourceDB" : "Pfam",
-    "accession" : "PF04383"},
+    "accession" : "PF04383"},
-
+
-  { "name" : "AT hook",
+  { "name" : "AT hook",
-    "description " : "DNA interaction motif by SMART annotation",
+    "description " : "DNA interaction motif by SMART annotation",
-    "sourceDB" : "SMART",
+    "sourceDB" : "SMART",
-    "accession" : null},
+    "accession" : null},
-
+
-  { "name" : "low complexity",
+  { "name" : "low complexity",
-    "description " : "SEG annotation by SMART",
+    "description " : "SEG annotation by SMART",
-    "sourceDB" : "SMART",
+    "sourceDB" : "SMART",
-    "accession" : null},
+    "accession" : null},
-
+
-  { "name" : "Ankyrin fold",
+  { "name" : "Ankyrin fold",
-    "description " : "Ankyrin domain by SMART annotation",
+    "description " : "Ankyrin domain by SMART annotation",
-    "sourceDB" : "SMART",
+    "sourceDB" : "SMART",
-    "accession" : "SM00248"},
+    "accession" : "SM00248"},
-
+
-  { "name" : "Swi6 fold",
+  { "name" : "Swi6 fold",
-    "description " : "Swi6 fold by similarity to structure",
+    "description " : "Swi6 fold by similarity to structure",
-    "sourceDB" : "PDB",
+    "sourceDB" : "PDB",
-    "accession" : "1SW6_B"},
+    "accession" : "1SW6_B"},
-
+
-  { "name" : "coiled coil",
+  { "name" : "coiled coil",
-    "description " : "Coiled coil by SMART annotation",
+    "description " : "Coiled coil by SMART annotation",
-    "sourceDB" : "SMART",
+    "sourceDB" : "SMART",
-    "accession" : null},
+    "accession" : null},
-
+
-  { "name" : "McInerny 2011",
+  { "name" : "McInerny 2011",
-    "description " : "Yeast cell cycle review",
+    "description " : "Yeast cell cycle review",
-    "sourceDB" : "PubMed",
+    "sourceDB" : "PubMed",
-    "accession" : "21310294"}
+    "accession" : "21310294"}
-]
+]
-
+
-
+
-
+
-
+
-
+
-
+
--- a/data/refMBP1Proteins.json
+++ b/data/refMBP1Proteins.json
@ -1,155 +1,155 @@
-[
+[
-  { "name" : "MBP1_SCHPO",
+  { "name" : "MBP1_SCHPO",
-    "RefSeqID" : "NP_593032",
+    "RefSeqID" : "NP_593032",
-    "UniProtID" : "P41412",
+    "UniProtID" : "P41412",
-    "taxonomyID" : 284812,
+    "taxonomyID" : 284812,
-    "sequence" : [
+    "sequence" : [
-       "MAPRSSAVHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRVLERQVQIGAHEKVQ",
+       "MAPRSSAVHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRVLERQVQIGAHEKVQ",
-       "GGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILSLDIDEGKAIAPKKKQTKQKKPSVRGRRGRKPSSLSS",
+       "GGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILSLDIDEGKAIAPKKKQTKQKKPSVRGRRGRKPSSLSS",
-       "STLHSVNEKQPNSSISPTIESSMNKVNLPGAEEQVSATPLPASPNALLSPNDNTIKPVEELGMLEAPLDK",
+       "STLHSVNEKQPNSSISPTIESSMNKVNLPGAEEQVSATPLPASPNALLSPNDNTIKPVEELGMLEAPLDK",
-       "YEESLLDFFLHPEEGRIPSFLYSPPPDFQVNSVIDDDGHTSLHWACSMGHIEMIKLLLRANADIGVCNRL",
+       "YEESLLDFFLHPEEGRIPSFLYSPPPDFQVNSVIDDDGHTSLHWACSMGHIEMIKLLLRANADIGVCNRL",
-       "SQTPLMRSVIFTNNYDCQTFGQVLELLQSTIYAVDTNGQSIFHHIVQSTSTPSKVAAAKYYLDCILEKLI",
+       "SQTPLMRSVIFTNNYDCQTFGQVLELLQSTIYAVDTNGQSIFHHIVQSTSTPSKVAAAKYYLDCILEKLI",
-       "SIQPFENVVRLVNLQDSNGDTSLLIAARNGAMDCVNSLLSYNANPSIPNRQRRTASEYLLEADKKPHSLL",
+       "SIQPFENVVRLVNLQDSNGDTSLLIAARNGAMDCVNSLLSYNANPSIPNRQRRTASEYLLEADKKPHSLL",
-       "QSNSNASHSAFSFSGISPAIISPSCSSHAFVKAIPSISSKFSQLAEEYESQLREKEEDLIRANRLKQDTL",
+       "QSNSNASHSAFSFSGISPAIISPSCSSHAFVKAIPSISSKFSQLAEEYESQLREKEEDLIRANRLKQDTL",
-       "NEISRTYQELTFLQKNNPTYSQSMENLIREAQETYQQLSKRLLIWLEARQIFDLERSLKPHTSLSISFPS",
+       "NEISRTYQELTFLQKNNPTYSQSMENLIREAQETYQQLSKRLLIWLEARQIFDLERSLKPHTSLSISFPS",
-       "DFLKKEDGLSLNNDFKKPACNNVTNSDEYEQLINKLTSLQASRKKDTLYIRKLYEELGIDDTVNSYRRLI",
+       "DFLKKEDGLSLNNDFKKPACNNVTNSDEYEQLINKLTSLQASRKKDTLYIRKLYEELGIDDTVNSYRRLI",
-       "AMSCGINPEDLSLEILDAVEEALTREK"]
+       "AMSCGINPEDLSLEILDAVEEALTREK"]
-  },
+  },
-  { "name" : "MBP1_ASPNI",
+  { "name" : "MBP1_ASPNI",
-    "RefSeqID" : "XP_660758",
+    "RefSeqID" : "XP_660758",
-    "UniProtID" : "Q5B8H6",
+    "UniProtID" : "Q5B8H6",
-    "taxonomyID" : 227321,
+    "taxonomyID" : 227321,
-    "sequence" : [
+    "sequence" : [
-       "MAAVDFSNVYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRILEREVQKGVHEKV",
+       "MAAVDFSNVYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRILEREVQKGVHEKV",
-       "QGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDYVAGDRSPPPAPKHTSAASKPRAPKINKRVVKEDV",
+       "QGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDYVAGDRSPPPAPKHTSAASKPRAPKINKRVVKEDV",
-       "FSAVNHHRSMGPPSFHHEHYDVNTGLDEDESIEQATLESSSMIADEDMISMSQNGPYSSRKRKRGINEVA",
+       "FSAVNHHRSMGPPSFHHEHYDVNTGLDEDESIEQATLESSSMIADEDMISMSQNGPYSSRKRKRGINEVA",
-       "AMSLSEQEHILYGDQLLDYFMTVGDAPEATRIPPPQPPANFQVDRPIDDSGNTALHWACAMGDLEIVKDL",
+       "AMSLSEQEHILYGDQLLDYFMTVGDAPEATRIPPPQPPANFQVDRPIDDSGNTALHWACAMGDLEIVKDL",
-       "LRRGADMKALSIHEETPLVRAVLFTNNYEKRTFPALLDLLLDTISFRDWFGATLFHHIAQTTKSKGKWKS",
+       "LRRGADMKALSIHEETPLVRAVLFTNNYEKRTFPALLDLLLDTISFRDWFGATLFHHIAQTTKSKGKWKS",
-       "SRYYCEVALEKLRTTFSPEEVDLLLSCQDSVGDTAVLVAARNGVFRLVDLLLSRCPRAGDLVNKRGETAS",
+       "SRYYCEVALEKLRTTFSPEEVDLLLSCQDSVGDTAVLVAARNGVFRLVDLLLSRCPRAGDLVNKRGETAS",
-       "SIMQRAHLAERDIPPPPSSITMGNDHIDGEVGAPTSLEPQSVTLHHESSPATAQLLSQIGAIMAEASRKL",
+       "SIMQRAHLAERDIPPPPSSITMGNDHIDGEVGAPTSLEPQSVTLHHESSPATAQLLSQIGAIMAEASRKL",
-       "TSSYGAAKPSQKDSDDVANPEALYEQLEQDRQKIRRQYDALAAKEAAEESSDAQLGRYEQMRDNYESLLE",
+       "TSSYGAAKPSQKDSDDVANPEALYEQLEQDRQKIRRQYDALAAKEAAEESSDAQLGRYEQMRDNYESLLE",
-       "QIQRARLKERLASTPVPTQTAVIGSSSPEQDRLLTTFQLSRALCSEQKIRRAAVKELAQQRADAGVSTKF",
+       "QIQRARLKERLASTPVPTQTAVIGSSSPEQDRLLTTFQLSRALCSEQKIRRAAVKELAQQRADAGVSTKF",
-       "DVHRKLVALATGLKEEELDPMAAELAETLEFDRMNGKGVGPESPEADHKDSASLPFPGPVVSVDA"]
+       "DVHRKLVALATGLKEEELDPMAAELAETLEFDRMNGKGVGPESPEADHKDSASLPFPGPVVSVDA"]
-  },
+  },
-  { "name" : "MBP1_BIPOR",
+  { "name" : "MBP1_BIPOR",
-    "RefSeqID" : "XP_007682304",
+    "RefSeqID" : "XP_007682304",
-    "UniProtID" : "W6ZM86",
+    "UniProtID" : "W6ZM86",
-    "taxonomyID" : 930090,
+    "taxonomyID" : 930090,
-    "sequence" : [
+    "sequence" : [
-       "MPPAPDGKIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRILEREVQKGVHEKV",
+       "MPPAPDGKIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRILEREVQKGVHEKV",
-       "QGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDYVPGDRSPPPAPKHATAASNRMKPPRQTAAAVAAA",
+       "QGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDYVPGDRSPPPAPKHATAASNRMKPPRQTAAAVAAA",
-       "AVAAAAAAAAVANHNALMSNSRSQASEDPYENSQRSQIYREDTPDNETVISESMLGDADLMDMSQYSADG",
+       "AVAAAAAAAAVANHNALMSNSRSQASEDPYENSQRSQIYREDTPDNETVISESMLGDADLMDMSQYSADG",
-       "NRKRKRGMDQMSLLDQQHQIWADQLLDYFMLLDHEAAVSWPEPPPSINLDRPIDEKGHAAMHWAAAMGDV",
+       "NRKRKRGMDQMSLLDQQHQIWADQLLDYFMLLDHEAAVSWPEPPPSINLDRPIDEKGHAAMHWAAAMGDV",
-       "GVVKELIHRGARLDCLSNNLETPLMRAVMFTNNFDKETMPSMVKIFQQTVHRTDWFGSTVFHHIAATTSS",
+       "GVVKELIHRGARLDCLSNNLETPLMRAVMFTNNFDKETMPSMVKIFQQTVHRTDWFGSTVFHHIAATTSS",
-       "SNKYVCARWYLDCIINKLSETWIPEEVTRLLNAADQNGDTAIMIAARNGARKCVRSLLGRNVAVDIPNKK",
+       "SNKYVCARWYLDCIINKLSETWIPEEVTRLLNAADQNGDTAIMIAARNGARKCVRSLLGRNVAVDIPNKK",
-       "GETADDLIRELNQRRRMHGRTRQASSSPFAPAPEHRLNGHVPHFDGGPLMSVPVPSMAVRESVQYRSQTA",
+       "GETADDLIRELNQRRRMHGRTRQASSSPFAPAPEHRLNGHVPHFDGGPLMSVPVPSMAVRESVQYRSQTA",
-       "SHLMTKVAPTLLEKCEELATAYEAELQEKEAEFFDAERVVKRRQAELEAVRKQVAELQSMSKGLHIDLND",
+       "SHLMTKVAPTLLEKCEELATAYEAELQEKEAEFFDAERVVKRRQAELEAVRKQVAELQSMSKGLHIDLND",
-       "EEAERQQEDELRLLVEEAESLLEIEQKAELRRLCSSMPQQNSDSSPVDITEKMRLALLLHRAQLERRELV",
+       "EEAERQQEDELRLLVEEAESLLEIEQKAELRRLCSSMPQQNSDSSPVDITEKMRLALLLHRAQLERRELV",
-       "REVVGNLSVAGMSEKQGTYKKLIAKALGEREEDVESMLPEILQELEEAETQERAEGLDGSPV"]
+       "REVVGNLSVAGMSEKQGTYKKLIAKALGEREEDVESMLPEILQELEEAETQERAEGLDGSPV"]
-  },
+  },
-  { "name" : "MBP1_NEUCR",
+  { "name" : "MBP1_NEUCR",
-    "RefSeqID" : "XP_955821",
+    "RefSeqID" : "XP_955821",
-    "UniProtID" : "Q7RW59",
+    "UniProtID" : "Q7RW59",
-    "taxonomyID" : 367110,
+    "taxonomyID" : 367110,
-    "sequence" : [
+    "sequence" : [
-       "MVKENVGGNPEPGIYSATYSGIPVWEYQFGVDLKEHVMRRRHDDWVNATHILKAAGFDKPARTRILEREV",
+       "MVKENVGGNPEPGIYSATYSGIPVWEYQFGVDLKEHVMRRRHDDWVNATHILKAAGFDKPARTRILEREV",
-       "QKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEFQPGNESPPPAPRHASKPKAPKVKPAV",
+       "QKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEFQPGNESPPPAPRHASKPKAPKVKPAV",
-       "PTWGSKSAKNANPPQPGTFLPPGRKGLPAQAPDYNDADTHMHDDDTPDNLTVASASYMAEDDRYDHSHFS",
+       "PTWGSKSAKNANPPQPGTFLPPGRKGLPAQAPDYNDADTHMHDDDTPDNLTVASASYMAEDDRYDHSHFS",
-       "TGHRKRKRDELIEDMTEQQHAVYGDELLDYFLLSRNEQPAVRPDPPPNFKPDWPIDNERHTCLHWASAMG",
+       "TGHRKRKRDELIEDMTEQQHAVYGDELLDYFLLSRNEQPAVRPDPPPNFKPDWPIDNERHTCLHWASAMG",
-       "DVDVMRQLKKFGASLDAQNVRGETPFMRAVNFTNCFEKQTFPQVMKELFSTIDCRDLSGCTVIHHAAVMK",
+       "DVDVMRQLKKFGASLDAQNVRGETPFMRAVNFTNCFEKQTFPQVMKELFSTIDCRDLSGCTVIHHAAVMK",
-       "IGRVNSQSCSRYYLDIILNRLQETHHPEFVQQLLDAQDNDGNTAVHLAAMRDARKCIRALLGRGASTDIP",
+       "IGRVNSQSCSRYYLDIILNRLQETHHPEFVQQLLDAQDNDGNTAVHLAAMRDARKCIRALLGRGASTDIP",
-       "NKQGIRAEELIKELNASISKSRSNLPQRSSSPFAPDTQRHDAFHEAISESMVTSRKNSQPNYSSDAANTV",
+       "NKQGIRAEELIKELNASISKSRSNLPQRSSSPFAPDTQRHDAFHEAISESMVTSRKNSQPNYSSDAANTV",
-       "QNRITPLVLQKLKDLTATYDSEFKEKDDAEKEARRILNKTQSELKALTASIDDYNSRLDTDDVAAKTAAE",
+       "QNRITPLVLQKLKDLTATYDSEFKEKDDAEKEARRILNKTQSELKALTASIDDYNSRLDTDDVAAKTAAE",
-       "MATARHKVLAFVTHQNRISVQEAVKQELAALDRANAVTNGTSTKSKSSSPSKKPKLSPIPDQKDKPPKDE",
+       "MATARHKVLAFVTHQNRISVQEAVKQELAALDRANAVTNGTSTKSKSSSPSKKPKLSPIPDQKDKPPKDE",
-       "NETESEAEHPDPPAAQAHQQQPGPSSQDTEVEDQDREEEEDDYTHRLSLAAELRSILQEQRSAENDYVEA",
+       "NETESEAEHPDPPAAQAHQQQPGPSSQDTEVEDQDREEEEDDYTHRLSLAAELRSILQEQRSAENDYVEA",
-       "RGMLGTGERIDKYKHLLMSCLPPDEQENLEENLEEMIKLMEQEDESVTDLPAGAVGGGGGGNAADGSGGG",
+       "RGMLGTGERIDKYKHLLMSCLPPDEQENLEENLEEMIKLMEQEDESVTDLPAGAVGGGGGGNAADGSGGG",
-       "GQPSNGRRESVLPALRGGNGDGEMSRRGSRTAAAAAAQVDGEREINGRAGAERTERIQEIAAV"]
+       "GQPSNGRRESVLPALRGGNGDGEMSRRGSRTAAAAAAQVDGEREINGRAGAERTERIQEIAAV"]
-  },
+  },
-  { "name" : "MBP1_COPCI",
+  { "name" : "MBP1_COPCI",
-    "RefSeqID" : "XP_001837394",
+    "RefSeqID" : "XP_001837394",
-    "UniProtID" : "A8NYC6",
+    "UniProtID" : "A8NYC6",
-    "taxonomyID" : 240176,
+    "taxonomyID" : 240176,
-    "sequence" : [
+    "sequence" : [
-       "MPEAQIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRVLEREVQKGEHEKVQGG",
+       "MPEAQIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRVLEREVQKGEHEKVQGG",
-       "YGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEFTPAAKSPPLAPKHLVATAGNRPVRKPLTTDLSAAVIN",
+       "YGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEFTPAAKSPPLAPKHLVATAGNRPVRKPLTTDLSAAVIN",
-       "TRSTRKQVADGVGEESDHDTHSLRGSEDGSMTPSPSEASSSSRTPSPIHSPGTYHSNGLDGPSSGGRNRY",
+       "TRSTRKQVADGVGEESDHDTHSLRGSEDGSMTPSPSEASSSSRTPSPIHSPGTYHSNGLDGPSSGGRNRY",
-       "RQSNDRYDEDDDASRHNGMGDPRSYGDQILEYFISDTNQIPPILITPPPDFDPNMAIDDDGHTSLHWACA",
+       "RQSNDRYDEDDDASRHNGMGDPRSYGDQILEYFISDTNQIPPILITPPPDFDPNMAIDDDGHTSLHWACA",
-       "MGRIRIVKLLLSAGADIFKVNKAGQTALMRSVMFANNYDVRKFPELYELLHRSTLNIDNSNRTVFHHVVD",
+       "MGRIRIVKLLLSAGADIFKVNKAGQTALMRSVMFANNYDVRKFPELYELLHRSTLNIDNSNRTVFHHVVD",
-       "VAMSKGKTHAARYYMETILTRLADYPKELADVINFQDEDGETALTMAARCRSKRLVKLLIDHGADPKINN",
+       "VAMSKGKTHAARYYMETILTRLADYPKELADVINFQDEDGETALTMAARCRSKRLVKLLIDHGADPKINN",
-       "HDGKNAEDYILEDERFRSSPAPSSRVAAMSYRNAQVAYPPPGAPSTYSFAPANHDRPPLHYSAAAQKAST",
+       "HDGKNAEDYILEDERFRSSPAPSSRVAAMSYRNAQVAYPPPGAPSTYSFAPANHDRPPLHYSAAAQKAST",
-       "RCVNDMASMLDSLAASFDQELRDKERDMAQAQALLTNIQAEILESQRTVLQLRQQAEGLSQAKQRLADLE",
+       "RCVNDMASMLDSLAASFDQELRDKERDMAQAQALLTNIQAEILESQRTVLQLRQQAEGLSQAKQRLADLE",
-       "NALQDKMGRRYRLGFEKWIKDEETREKVIRDAANGDLVLTPATTSYTVDEDGDSDSGSNGDKNKGKRKAQ",
+       "NALQDKMGRRYRLGFEKWIKDEETREKVIRDAANGDLVLTPATTSYTVDEDGDSDSGSNGDKNKGKRKAQ",
-       "VQQEEVSDLVELYSNIPTDPEELRKQCEALREEVSQSRKRRKAMFDELVTFQAEAGTSGRMSDYRRLIAA",
+       "VQQEEVSDLVELYSNIPTDPEELRKQCEALREEVSQSRKRRKAMFDELVTFQAEAGTSGRMSDYRRLIAA",
-       "GCGGLEPLEIDSVLGMLLETLEAEDPSSTSATWSGSKGQQTG"]
+       "GCGGLEPLEIDSVLGMLLETLEAEDPSSTSATWSGSKGQQTG"]
-  },
+  },
-  { "name" : "MBP1_CRYNE",
+  { "name" : "MBP1_CRYNE",
-    "RefSeqID" : "XP_569090",
+    "RefSeqID" : "XP_569090",
-    "UniProtID" : "Q5KMQ9",
+    "UniProtID" : "Q5KMQ9",
-    "taxonomyID" : 214684,
+    "taxonomyID" : 214684,
-    "sequence" : [
+    "sequence" : [
-       "MGKKVIASGGDNGPNTIYKATYSGVPVYEMVCRDVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREV",
+       "MGKKVIASGGDNGPNTIYKATYSGVPVYEMVCRDVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREV",
-       "QKGEHEKVQGGYGKYQGTWIPIERGLALAKQYGVEDILRPIIDYVPTSVSPPPAPKHSVAPPSKARRDKE",
+       "QKGEHEKVQGGYGKYQGTWIPIERGLALAKQYGVEDILRPIIDYVPTSVSPPPAPKHSVAPPSKARRDKE",
-       "KETGRTKATPSRTGPTSAAALQAQAQLNRAKMHDSTPDADASFRSFEERVSLTPEDDSSSDTPSPVASVM",
+       "KETGRTKATPSRTGPTSAAALQAQAQLNRAKMHDSTPDADASFRSFEERVSLTPEDDSSSDTPSPVASVM",
-       "TDQDMEVDKMGMHMSMPNVTLSQNMEELGAGSRKRSAAMMMEDEDQFGQLRSIRGNSAVHTPHGTPRHLG",
+       "TDQDMEVDKMGMHMSMPNVTLSQNMEELGAGSRKRSAAMMMEDEDQFGQLRSIRGNSAVHTPHGTPRHLG",
-       "IGMPPEPIGPEQYTDIILNYFVSETSQIPSILVSPPHDFDPNAPIDDDGHTALHWACAMGRVRVVKLLLT",
+       "IGMPPEPIGPEQYTDIILNYFVSETSQIPSILVSPPHDFDPNAPIDDDGHTALHWACAMGRVRVVKLLLT",
-       "AGASIFAGNNAEQTPLMRSVMFSNNYDMRKFPELYELLHRSTLNIDKQNRTVFHHIANLALTKGKTHAAK",
+       "AGASIFAGNNAEQTPLMRSVMFSNNYDMRKFPELYELLHRSTLNIDKQNRTVFHHIANLALTKGKTHAAK",
-       "YYMETILARLADYPQELADVINFQDEEGETALTIAARARSRRLVKALLDHGANPKIKNRDSRSAEDYILE",
+       "YYMETILARLADYPQELADVINFQDEEGETALTIAARARSRRLVKALLDHGANPKIKNRDSRSAEDYILE",
-       "DERFRSSPVPAPNGGIGKASTSAAAEKPLFAPQLYFSEAARLCGGQALTDITSHMQSLARSFDAELQGKE",
+       "DERFRSSPVPAPNGGIGKASTSAAAEKPLFAPQLYFSEAARLCGGQALTDITSHMQSLARSFDAELQGKE",
-       "RDILQAKALLTNIHTEVTENGRSITAITNQAAPLEEKRRELEALQASLKTRVKDALKKGYIGWLEGELVR",
+       "RDILQAKALLTNIHTEVTENGRSITAITNQAAPLEEKRRELEALQASLKTRVKDALKKGYIGWLEGELVR",
-       "EQRWENGELEGNEEEKAAVQALRDVPTGGQEVVQAEEEKLRWEIEEKRKRRAMFVEKFVRAQTEAGTSEQ",
+       "EQRWENGELEGNEEEKAAVQALRDVPTGGQEVVQAEEEKLRWEIEEKRKRRAMFVEKFVRAQTEAGTSEQ",
-       "IAKYRKLVSAGLGGVSTNEVDELMNQLLEGLEEENDNQVYNTTAGESGPSSWVQ"]
+       "IAKYRKLVSAGLGGVSTNEVDELMNQLLEGLEEENDNQVYNTTAGESGPSSWVQ"]
-  },
+  },
-  { "name" : "MBP1_PUCGR",
+  { "name" : "MBP1_PUCGR",
-    "RefSeqID" : "XP_003327086",
+    "RefSeqID" : "XP_003327086",
-    "UniProtID" : "E3KED4",
+    "UniProtID" : "E3KED4",
-    "taxonomyID" : 418459,
+    "taxonomyID" : 418459,
-    "sequence" : [
+    "sequence" : [
-       "MAYGGSIQPLRPPSRESATLHLHQPDLTVTSPPLSLTHCPPCVYSHFTHTPTSLIVIQVSLHSLLDQETY",
+       "MAYGGSIQPLRPPSRESATLHLHQPDLTVTSPPLSLTHCPPCVYSHFTHTPTSLIVIQVSLHSLLDQETY",
-       "HLLPSRSPPTVSVRMGTTTIYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRVLE",
+       "HLLPSRSPPTVSVRMGTTTIYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRVLE",
-       "REIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNFQPSSNESPPLAPKHVTALSTRVK",
+       "REIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNFQPSSNESPPLAPKHVTALSTRVK",
-       "VSKVSAASAARAARAVVPSLPSTSGLGGRNTNNSWSNFDSDNEPGLPPAASSRESNGNWATQSKLARSSN",
+       "VSKVSAASAARAARAVVPSLPSTSGLGGRNTNNSWSNFDSDNEPGLPPAASSRESNGNWATQSKLARSSN",
-       "LARARANINNSHPEDLPVPAPDQLQASPLPSMQTADPENDNSLTPSELSLPSRTPSPIEDLPLTVNTASS",
+       "LARARANINNSHPEDLPVPAPDQLQASPLPSMQTADPENDNSLTPSELSLPSRTPSPIEDLPLTVNTASS",
-       "QSTRNKGKSRDLPDDEDLSRGQKRKYDTSLVEDTSYSDGADDQYINGNPSNAASAKYAKLILDYFVSESS",
+       "QSTRNKGKSRDLPDDEDLSRGQKRKYDTSLVEDTSYSDGADDQYINGNPSNAASAKYAKLILDYFVSESS",
-       "QIPNFLNDPPSDFDPNVVIDDDGHTALHWACAMGRIKIIKLLLTCGADIFRANNAGQTALMRAVMFTNNH",
+       "QIPNFLNDPPSDFDPNVVIDDDGHTALHWACAMGRIKIIKLLLTCGADIFRANNAGQTALMRAVMFTNNH",
-       "DLRTFPELFESFSGSVINIDRTDRTVFHYVIDIALTKGKVPAARYYLETILSQLSEYPKELIDILNFQDE",
+       "DLRTFPELFESFSGSVINIDRTDRTVFHYVIDIALTKGKVPAARYYLETILSQLSEYPKELIDILNFQDE",
-       "DGETALTLAARCRSKKLVKILLDHGANPKTANRDGKSAEDYILEDDKFRALSPTPCSSGPIRQLDQNSPG",
+       "DGETALTLAARCRSKKLVKILLDHGANPKTANRDGKSAEDYILEDDKFRALSPTPCSSGPIRQLDQNSPG",
-       "GTSNRSDFVDLVDPVPIDSNLIPQRSPNASPPHYSETGQRVTKQLLPEVTSMIELLATTFDTELQDKERD",
+       "GTSNRSDFVDLVDPVPIDSNLIPQRSPNASPPHYSETGQRVTKQLLPEVTSMIELLATTFDTELQDKERD",
-       "LDHAVGLLSNIEKEYLEGQRKILNYERMLSDFGEKKLALGDLEKELNDKLGKRYRFGWEKYVRDEEERAR",
+       "LDHAVGLLSNIEKEYLEGQRKILNYERMLSDFGEKKLALGDLEKELNDKLGKRYRFGWEKYVRDEEERAR",
-       "RITEQRSKYLQELSIEDRKLLDSSNLRFADPSKQEVLMKLQADERENSDLLNLIRTNSTDVESECDLLRE",
+       "RITEQRSKYLQELSIEDRKLLDSSNLRFADPSKQEVLMKLQADERENSDLLNLIRTNSTDVESECDLLRE",
-       "SVQKLSEERERLFKEFINLSSENTGGENEEDDGANHTSANTSRLNNYRKLISLGCGGIGLDEVDEVIESL",
+       "SVQKLSEERERLFKEFINLSSENTGGENEEDDGANHTSANTSRLNNYRKLISLGCGGIGLDEVDEVIESL",
-       "NEGIDVNELNDNGFLTEQDEELGNHQNYHNIHTQGR"]
+       "NEGIDVNELNDNGFLTEQDEELGNHQNYHNIHTQGR"]
-  },
+  },
-  { "name" : "MBP1_USTMA",
+  { "name" : "MBP1_USTMA",
-    "RefSeqID" : "XP_011392621",
+    "RefSeqID" : "XP_011392621",
-    "UniProtID" : "A0A0D1DP35",
+    "UniProtID" : "A0A0D1DP35",
-    "taxonomyID" : 237631,
+    "taxonomyID" : 237631,
-    "sequence" : [
+    "sequence" : [
-       "MSGDKTIFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRVLEREIQKGIHEKVQG",
+       "MSGDKTIFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRVLEREIQKGIHEKVQG",
-       "GYGKYQGTWIPLDVAIELAERYNIQGLLQPITSYVPSAADSPPPAPKHTISTSNRSKKIIPADPGALGRS",
+       "GYGKYQGTWIPLDVAIELAERYNIQGLLQPITSYVPSAADSPPPAPKHTISTSNRSKKIIPADPGALGRS",
-       "RRATSIETESEVIGAAPNNVSEGSMSPSPSDISSSSRTPSPLPADRAHPLHANHALAGYNGRDANNHARY",
+       "RRATSIETESEVIGAAPNNVSEGSMSPSPSDISSSSRTPSPLPADRAHPLHANHALAGYNGRDANNHARY",
-       "ADIILDYFVTENTTVPSLLINPPPDFNPDMSIDDDEHTALHWACAMGRIRVVKLLLSAGADIFRVNSNQQ",
+       "ADIILDYFVTENTTVPSLLINPPPDFNPDMSIDDDEHTALHWACAMGRIRVVKLLLSAGADIFRVNSNQQ",
-       "TALMRATMFSNNYDLRKFPELFELLHRSILNIDRNDRTVFHHVVDLALSRGKPHAARYYMETMINRLADY",
+       "TALMRATMFSNNYDLRKFPELFELLHRSILNIDRNDRTVFHHVVDLALSRGKPHAARYYMETMINRLADY",
-       "GDQLADILNFQDDEGETPLTMAARARSKRLVRLLLEHGADPKIRNKEGKNAEDYIIEDERFRSSPSRTGP",
+       "GDQLADILNFQDDEGETPLTMAARARSKRLVRLLLEHGADPKIRNKEGKNAEDYIIEDERFRSSPSRTGP",
-       "AGIELGADGLPVLPTSSLHTSEAGQRTAGRAVTLMSNLLHSLADSYDSEINTAEKKLTQAHGLLKQIQTE",
+       "AGIELGADGLPVLPTSSLHTSEAGQRTAGRAVTLMSNLLHSLADSYDSEINTAEKKLTQAHGLLKQIQTE",
-       "IEDSAKVAEALHHEAQGVDEERKRVDSLQLALKHAINKRARDDLERRWSEGKQAIKRARLQAGLEPGALS",
+       "IEDSAKVAEALHHEAQGVDEERKRVDSLQLALKHAINKRARDDLERRWSEGKQAIKRARLQAGLEPGALS",
-       "TSNATNAPATGDQKSKDDAKSLIEALPAGTNVKTAIAELRKQLSQVQANKTELVDKFVARAREQGTGRTM",
+       "TSNATNAPATGDQKSKDDAKSLIEALPAGTNVKTAIAELRKQLSQVQANKTELVDKFVARAREQGTGRTM",
-       "AAYRRLIAAGCGGIAPDEVDAVVGVLCELLQESHTGARAGAGGERDDRARDVAMMLKGAGAAALAANAGA",
+       "AAYRRLIAAGCGGIAPDEVDAVVGVLCELLQESHTGARAGAGGERDDRARDVAMMLKGAGAAALAANAGA",
-       "P"]
+       "P"]
-  },
+  },
-  { "name" : "MBP1_WALME",
+  { "name" : "MBP1_WALME",
-    "RefSeqID" : "XP_006957051",
+    "RefSeqID" : "XP_006957051",
-    "UniProtID" : "I4YGC0",
+    "UniProtID" : "I4YGC0",
-    "taxonomyID" : 671144,
+    "taxonomyID" : 671144,
-    "sequence" : [
+    "sequence" : [
-       "MSAPPIYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRILEREVQKGTHEKVQGG",
+       "MSAPPIYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRILEREVQKGTHEKVQGG",
-       "YGKYQGTWIPMERSVELARQYRIELLLDPIINYLPGPQSPPLAPKHATNVGSRARKSTAPAAQTLPSTSK",
+       "YGKYQGTWIPMERSVELARQYRIELLLDPIINYLPGPQSPPLAPKHATNVGSRARKSTAPAAQTLPSTSK",
-       "VFHPLSSTKHPAKLAAATNAKAEISDGEDASIPSSPSFKSNSSRTPSPIRINARKRKLEDEATIPSSAID",
+       "VFHPLSSTKHPAKLAAATNAKAEISDGEDASIPSSPSFKSNSSRTPSPIRINARKRKLEDEATIPSSAID",
-       "GSISYEDIILDYFISESTQIPALLIHPPSDFNPNMSIDDEGHTAMHWACAMGKVRVVKLLLSAGADIFRV",
+       "GSISYEDIILDYFISESTQIPALLIHPPSDFNPNMSIDDEGHTAMHWACAMGKVRVVKLLLSAGADIFRV",
-       "NHSEQTALMRSVMFSNNYDIRKFPQLYELLHRSTLNLDKHDRTVLHHIVDLALTKSKTHAARYYMECVLS",
+       "NHSEQTALMRSVMFSNNYDIRKFPQLYELLHRSTLNLDKHDRTVLHHIVDLALTKSKTHAARYYMECVLS",
-       "KLANYPDELADVINFQDDEGESALTLAARARSKRLVKLLLEHGADSKLPNKDGKTAEDYILEDERFRQSP",
+       "KLANYPDELADVINFQDDEGESALTLAARARSKRLVKLLLEHGADSKLPNKDGKTAEDYILEDERFRQSP",
-       "LLNSNHLRLHPPDTSIYAPPAHLFNSETSQNIANTSMSSVANLLESLAQSYDKEITQKERDYQQAQVILR",
+       "LLNSNHLRLHPPDTSIYAPPAHLFNSETSQNIANTSMSSVANLLESLAQSYDKEITQKERDYQQAQVILR",
-       "NIKTDIVEAKSNIEKMTIDSSEFEHLKHKLRELEMKLEEHSNDVYNKGWEEYSRNVDDPAIDAPSDNVQE",
+       "NIKTDIVEAKSNIEKMTIDSSEFEHLKHKLRELEMKLEEHSNDVYNKGWEEYSRNVDDPAIDAPSDNVQE",
-       "ECASLRNKIKDLQEKRISSMQELIKRQKEVGTGKKMSEYRKLISVGCGIPTTEIDAVLEMLLESLESENA",
+       "ECASLRNKIKDLQEKRISSMQELIKRQKEVGTGKKMSEYRKLISVGCGIPTTEIDAVLEMLLESLESENA",
-       "NKKAALASGISGALSSTSSAPSQATTSAPTGVATPGAPVPASSEKAGLLPPAPVMQ"]
+       "NKKAALASGISGALSSTSSAPSQATTSAPTGVATPGAPVPASSEKAGLLPPAPVMQ"]
-  }
+  }
-]
+]
--- a/data/refTaxonomy.json
+++ b/data/refTaxonomy.json
@ -1,22 +1,22 @@
-[
+[
-  { "ID" : 227321,
+  { "ID" : 227321,
-    "species" : "Aspergillus nidulans FGSC A4"},
+    "species" : "Aspergillus nidulans FGSC A4"},
-  { "ID" : 930090,
+  { "ID" : 930090,
-    "species" : "Bipolaris oryzae ATCC 44560"},
+    "species" : "Bipolaris oryzae ATCC 44560"},
-  { "ID" : 240176,
+  { "ID" : 240176,
-    "species" : "Coprinopsis cinerea okayama7#130"},
+    "species" : "Coprinopsis cinerea okayama7#130"},
-  { "ID" : 214684,
+  { "ID" : 214684,
-    "species" : "Cryptococcus neoformans var. neoformans JEC21"},
+    "species" : "Cryptococcus neoformans var. neoformans JEC21"},
-  { "ID" : 367110,
+  { "ID" : 367110,
-    "species" : "Neurospora crassa OR74A"},
+    "species" : "Neurospora crassa OR74A"},
-  { "ID" : 418459,
+  { "ID" : 418459,
-    "species" : "Puccinia graminis f. sp. tritici CRL 75-36-700-3"},
+    "species" : "Puccinia graminis f. sp. tritici CRL 75-36-700-3"},
-  { "ID" : 559292,
+  { "ID" : 559292,
-    "species" : "Saccharomyces cerevisiae S288C"},
+    "species" : "Saccharomyces cerevisiae S288C"},
-  { "ID" : 284812,
+  { "ID" : 284812,
-    "species" : "Schizosaccharomyces pombe 972h-"},
+    "species" : "Schizosaccharomyces pombe 972h-"},
-  { "ID" : 237631,
+  { "ID" : 237631,
-    "species" : "Ustilago maydis 521"},
+    "species" : "Ustilago maydis 521"},
-  { "ID" : 671144,
+  { "ID" : 671144,
-    "species" : "Wallemia mellicola CBS 633.66"}
+    "species" : "Wallemia mellicola CBS 633.66"}
-]
+]
--- a/data/referenceDomainAnnotations.txt
+++ b/data/referenceDomainAnnotations.txt
@ -1,115 +1,115 @@
-ID	protein.ID	feature.ID	start	end	note
+ID	protein.ID	feature.ID	start	end	note
-# MBP1_SACCE
+# MBP1_SACCE
-NA	ref_pro_4	ref_ftr_1	4	102	APSES fold
+NA	ref_pro_4	ref_ftr_1	4	102	APSES fold
-NA	ref_pro_4	ref_ftr_2	22	105	KilA-N
+NA	ref_pro_4	ref_ftr_2	22	105	KilA-N
-NA	ref_pro_4	ref_ftr_4	108	122	low complexity
+NA	ref_pro_4	ref_ftr_4	108	122	low complexity
-NA	ref_pro_4	ref_ftr_4	236	241	low complexity
+NA	ref_pro_4	ref_ftr_4	236	241	low complexity
-NA	ref_pro_4	ref_ftr_4	279	307	low complexity
+NA	ref_pro_4	ref_ftr_4	279	307	low complexity
-NA	ref_pro_4	ref_ftr_4	700	717	low complexity
+NA	ref_pro_4	ref_ftr_4	700	717	low complexity
-NA	ref_pro_4	ref_ftr_4	700	717	low complexity
+NA	ref_pro_4	ref_ftr_4	700	717	low complexity
-NA	ref_pro_4	ref_ftr_5	394	423	Ankyrin
+NA	ref_pro_4	ref_ftr_5	394	423	Ankyrin
-NA	ref_pro_4	ref_ftr_5	427	463	Ankyrin
+NA	ref_pro_4	ref_ftr_5	427	463	Ankyrin
-NA	ref_pro_4	ref_ftr_5	512	541	Ankyrin
+NA	ref_pro_4	ref_ftr_5	512	541	Ankyrin
-NA	ref_pro_4	ref_ftr_6	381	547	Swi6 fold
+NA	ref_pro_4	ref_ftr_6	381	547	Swi6 fold
-NA	ref_pro_4	ref_ftr_7	633	655	coiled coil
+NA	ref_pro_4	ref_ftr_7	633	655	coiled coil
-# MBP1_ASPNI
+# MBP1_ASPNI
-NA	ref_pro_1	ref_ftr_1	9	106	APSES fold
+NA	ref_pro_1	ref_ftr_1	9	106	APSES fold
-NA	ref_pro_1	ref_ftr_2	26	109	KilA-N
+NA	ref_pro_1	ref_ftr_2	26	109	KilA-N
-NA	ref_pro_1	ref_ftr_4	529	534	low complexity
+NA	ref_pro_1	ref_ftr_4	529	534	low complexity
-NA	ref_pro_1	ref_ftr_5	260	289	Ankyrin
+NA	ref_pro_1	ref_ftr_5	260	289	Ankyrin
-NA	ref_pro_1	ref_ftr_5	381	413	Ankyrin
+NA	ref_pro_1	ref_ftr_5	381	413	Ankyrin
-NA	ref_pro_1	ref_ftr_6	193	402	Swi6 fold
+NA	ref_pro_1	ref_ftr_6	193	402	Swi6 fold
-NA	ref_pro_1	ref_ftr_7	509	572	coiled coil
+NA	ref_pro_1	ref_ftr_7	509	572	coiled coil
-# MBP1_BIPOR
+# MBP1_BIPOR
-NA	ref_pro_2	ref_ftr_1	8	106	APSES fold
+NA	ref_pro_2	ref_ftr_1	8	106	APSES fold
-NA	ref_pro_2	ref_ftr_2	26	109	KilA-N
+NA	ref_pro_2	ref_ftr_2	26	109	KilA-N
-NA	ref_pro_2	ref_ftr_4	134	152	low complexity
+NA	ref_pro_2	ref_ftr_4	134	152	low complexity
-NA	ref_pro_2	ref_ftr_4	267	278	low complexity
+NA	ref_pro_2	ref_ftr_4	267	278	low complexity
-NA	ref_pro_2	ref_ftr_4	670	685	low complexity
+NA	ref_pro_2	ref_ftr_4	670	685	low complexity
-NA	ref_pro_2	ref_ftr_5	266	295	Ankyrin
+NA	ref_pro_2	ref_ftr_5	266	295	Ankyrin
-NA	ref_pro_2	ref_ftr_5	387	416	Ankyrin
+NA	ref_pro_2	ref_ftr_5	387	416	Ankyrin
-NA	ref_pro_2	ref_ftr_6	253	421	Swi6 fold
+NA	ref_pro_2	ref_ftr_6	253	421	Swi6 fold
-NA	ref_pro_2	ref_ftr_7	659	681	coiled coil
+NA	ref_pro_2	ref_ftr_7	659	681	coiled coil
-NA	ref_pro_2	ref_ftr_7	500	590	coiled coil
+NA	ref_pro_2	ref_ftr_7	500	590	coiled coil
-# MBP1_NEUCR
+# MBP1_NEUCR
-NA	ref_pro_3	ref_ftr_1	14	114	APSES fold
+NA	ref_pro_3	ref_ftr_1	14	114	APSES fold
-NA	ref_pro_3	ref_ftr_2	34	117	KilA-N
+NA	ref_pro_3	ref_ftr_2	34	117	KilA-N
-NA	ref_pro_3	ref_ftr_4	130	141	low complexity
+NA	ref_pro_3	ref_ftr_4	130	141	low complexity
-NA	ref_pro_3	ref_ftr_4	253	266	low complexity
+NA	ref_pro_3	ref_ftr_4	253	266	low complexity
-NA	ref_pro_3	ref_ftr_4	514	525	low complexity
+NA	ref_pro_3	ref_ftr_4	514	525	low complexity
-NA	ref_pro_3	ref_ftr_4	554	564	low complexity
+NA	ref_pro_3	ref_ftr_4	554	564	low complexity
-NA	ref_pro_3	ref_ftr_4	601	618	low complexity
+NA	ref_pro_3	ref_ftr_4	601	618	low complexity
-NA	ref_pro_3	ref_ftr_4	620	629	low complexity
+NA	ref_pro_3	ref_ftr_4	620	629	low complexity
-NA	ref_pro_3	ref_ftr_4	636	652	low complexity
+NA	ref_pro_3	ref_ftr_4	636	652	low complexity
-NA	ref_pro_3	ref_ftr_4	658	672	low complexity
+NA	ref_pro_3	ref_ftr_4	658	672	low complexity
-NA	ref_pro_3	ref_ftr_4	725	735	low complexity
+NA	ref_pro_3	ref_ftr_4	725	735	low complexity
-NA	ref_pro_3	ref_ftr_4	752	771	low complexity
+NA	ref_pro_3	ref_ftr_4	752	771	low complexity
-NA	ref_pro_3	ref_ftr_5	268	297	Ankyrin
+NA	ref_pro_3	ref_ftr_5	268	297	Ankyrin
-NA	ref_pro_3	ref_ftr_5	390	419	Ankyrin
+NA	ref_pro_3	ref_ftr_5	390	419	Ankyrin
-NA	ref_pro_3	ref_ftr_6	270	426	Swi6 fold
+NA	ref_pro_3	ref_ftr_6	270	426	Swi6 fold
-NA	ref_pro_3	ref_ftr_7	500	550	coiled coil
+NA	ref_pro_3	ref_ftr_7	500	550	coiled coil
-# MBP1_SCHPO
+# MBP1_SCHPO
-NA	ref_pro_5	ref_ftr_1	8	104	APSES fold
+NA	ref_pro_5	ref_ftr_1	8	104	APSES fold
-NA	ref_pro_5	ref_ftr_2	25	113	KilA-N
+NA	ref_pro_5	ref_ftr_2	25	113	KilA-N
-NA	ref_pro_5	ref_ftr_4	111	125	low complexity
+NA	ref_pro_5	ref_ftr_4	111	125	low complexity
-NA	ref_pro_5	ref_ftr_4	136	145	low complexity
+NA	ref_pro_5	ref_ftr_4	136	145	low complexity
-NA	ref_pro_5	ref_ftr_4	176	191	low complexity
+NA	ref_pro_5	ref_ftr_4	176	191	low complexity
-NA	ref_pro_5	ref_ftr_4	422	447	low complexity
+NA	ref_pro_5	ref_ftr_4	422	447	low complexity
-NA	ref_pro_5	ref_ftr_5	247	276	Ankyrin
+NA	ref_pro_5	ref_ftr_5	247	276	Ankyrin
-NA	ref_pro_5	ref_ftr_5	368	397	Ankyrin
+NA	ref_pro_5	ref_ftr_5	368	397	Ankyrin
-NA	ref_pro_5	ref_ftr_6	234	400	Swi6 fold
+NA	ref_pro_5	ref_ftr_6	234	400	Swi6 fold
-NA	ref_pro_5	ref_ftr_7	457	538	coiled coil
+NA	ref_pro_5	ref_ftr_7	457	538	coiled coil
-# MBP1_COPCI
+# MBP1_COPCI
-NA	ref_pro_6	ref_ftr_1	5	103	APSES fold
+NA	ref_pro_6	ref_ftr_1	5	103	APSES fold
-NA	ref_pro_6	ref_ftr_2	23	106	KilA-N
+NA	ref_pro_6	ref_ftr_2	23	106	KilA-N
-NA	ref_pro_6	ref_ftr_4	170	191	low complexity
+NA	ref_pro_6	ref_ftr_4	170	191	low complexity
-NA	ref_pro_6	ref_ftr_4	435	450	low complexity
+NA	ref_pro_6	ref_ftr_4	435	450	low complexity
-NA	ref_pro_6	ref_ftr_4	611	626	low complexity
+NA	ref_pro_6	ref_ftr_4	611	626	low complexity
-NA	ref_pro_6	ref_ftr_5	270	299	Ankyrin
+NA	ref_pro_6	ref_ftr_5	270	299	Ankyrin
-NA	ref_pro_6	ref_ftr_5	389	418	Ankyrin
+NA	ref_pro_6	ref_ftr_5	389	418	Ankyrin
-NA	ref_pro_6	ref_ftr_5	474	509	Ankyrin
+NA	ref_pro_6	ref_ftr_5	474	509	Ankyrin
-NA	ref_pro_6	ref_ftr_6	257	429	Swi6 fold
+NA	ref_pro_6	ref_ftr_6	257	429	Swi6 fold
-NA	ref_pro_6	ref_ftr_7	500	570	coiled coil
+NA	ref_pro_6	ref_ftr_7	500	570	coiled coil
-NA	ref_pro_6	ref_ftr_7	651	678	coiled coil
+NA	ref_pro_6	ref_ftr_7	651	678	coiled coil
-# MBP1_CRYNE
+# MBP1_CRYNE
-NA	ref_pro_7	ref_ftr_1	113	211	APSES fold
+NA	ref_pro_7	ref_ftr_1	113	211	APSES fold
-NA	ref_pro_7	ref_ftr_2	131	215	KilA-N
+NA	ref_pro_7	ref_ftr_2	131	215	KilA-N
-NA	ref_pro_7	ref_ftr_4	66	85	low complexity
+NA	ref_pro_7	ref_ftr_4	66	85	low complexity
-NA	ref_pro_7	ref_ftr_4	413	423	low complexity
+NA	ref_pro_7	ref_ftr_4	413	423	low complexity
-NA	ref_pro_7	ref_ftr_4	633	644	low complexity
+NA	ref_pro_7	ref_ftr_4	633	644	low complexity
-NA	ref_pro_7	ref_ftr_4	697	709	low complexity
+NA	ref_pro_7	ref_ftr_4	697	709	low complexity
-NA	ref_pro_7	ref_ftr_5	477	506	Ankyrin
+NA	ref_pro_7	ref_ftr_5	477	506	Ankyrin
-NA	ref_pro_7	ref_ftr_5	618	647	Ankyrin
+NA	ref_pro_7	ref_ftr_5	618	647	Ankyrin
-NA	ref_pro_7	ref_ftr_6	452	663	Swi6 fold
+NA	ref_pro_7	ref_ftr_6	452	663	Swi6 fold
-# MBP1_PUCGR
+# MBP1_PUCGR
-NA	ref_pro_8	ref_ftr_1	90	187	APSES fold
+NA	ref_pro_8	ref_ftr_1	90	187	APSES fold
-NA	ref_pro_8	ref_ftr_2	107	190	KilA-N
+NA	ref_pro_8	ref_ftr_2	107	190	KilA-N
-NA	ref_pro_8	ref_ftr_4	208	227	low complexity
+NA	ref_pro_8	ref_ftr_4	208	227	low complexity
-NA	ref_pro_8	ref_ftr_4	273	291	low complexity
+NA	ref_pro_8	ref_ftr_4	273	291	low complexity
-NA	ref_pro_8	ref_ftr_5	442	471	Ankyrin
+NA	ref_pro_8	ref_ftr_5	442	471	Ankyrin
-NA	ref_pro_8	ref_ftr_5	475	509	Ankyrin
+NA	ref_pro_8	ref_ftr_5	475	509	Ankyrin
-NA	ref_pro_8	ref_ftr_5	561	590	Ankyrin
+NA	ref_pro_8	ref_ftr_5	561	590	Ankyrin
-NA	ref_pro_8	ref_ftr_6	429	601	Swi6 fold
+NA	ref_pro_8	ref_ftr_6	429	601	Swi6 fold
-NA	ref_pro_8	ref_ftr_7	827	863	coiled coil
+NA	ref_pro_8	ref_ftr_7	827	863	coiled coil
-# MBP1_USTMA
+# MBP1_USTMA
-NA	ref_pro_9	ref_ftr_1	7	104	APSES fold
+NA	ref_pro_9	ref_ftr_1	7	104	APSES fold
-NA	ref_pro_9	ref_ftr_2	24	107	KilA-N
+NA	ref_pro_9	ref_ftr_2	24	107	KilA-N
-NA	ref_pro_9	ref_ftr_4	106	116	low complexity
+NA	ref_pro_9	ref_ftr_4	106	116	low complexity
-NA	ref_pro_9	ref_ftr_4	161	183	low complexity
+NA	ref_pro_9	ref_ftr_4	161	183	low complexity
-NA	ref_pro_9	ref_ftr_4	657	672	low complexity
+NA	ref_pro_9	ref_ftr_4	657	672	low complexity
-NA	ref_pro_9	ref_ftr_4	776	796	low complexity
+NA	ref_pro_9	ref_ftr_4	776	796	low complexity
-NA	ref_pro_9	ref_ftr_5	245	274	Ankyrin
+NA	ref_pro_9	ref_ftr_5	245	274	Ankyrin
-NA	ref_pro_9	ref_ftr_5	355	384	Ankyrin
+NA	ref_pro_9	ref_ftr_5	355	384	Ankyrin
-NA	ref_pro_9	ref_ftr_6	232	395	Swi6 fold
+NA	ref_pro_9	ref_ftr_6	232	395	Swi6 fold
-NA	ref_pro_9	ref_ftr_7	581	609	coiled coil
+NA	ref_pro_9	ref_ftr_7	581	609	coiled coil
-# MBP1_WALME
+# MBP1_WALME
-NA	ref_pro_10	ref_ftr_1	6	103	APSES fold
+NA	ref_pro_10	ref_ftr_1	6	103	APSES fold
-NA	ref_pro_10	ref_ftr_2	23	106	KilA-N
+NA	ref_pro_10	ref_ftr_2	23	106	KilA-N
-NA	ref_pro_10	ref_ftr_4	149	162	low complexity
+NA	ref_pro_10	ref_ftr_4	149	162	low complexity
-NA	ref_pro_10	ref_ftr_4	171	188	low complexity
+NA	ref_pro_10	ref_ftr_4	171	188	low complexity
-NA	ref_pro_10	ref_ftr_4	618	628	low complexity
+NA	ref_pro_10	ref_ftr_4	618	628	low complexity
-NA	ref_pro_10	ref_ftr_4	634	660	low complexity
+NA	ref_pro_10	ref_ftr_4	634	660	low complexity
-NA	ref_pro_10	ref_ftr_5	250	279	Ankyrin
+NA	ref_pro_10	ref_ftr_5	250	279	Ankyrin
-NA	ref_pro_10	ref_ftr_5	369	398	Ankyrin
+NA	ref_pro_10	ref_ftr_5	369	398	Ankyrin
-NA	ref_pro_10	ref_ftr_6	237	409	Swi6 fold
+NA	ref_pro_10	ref_ftr_6	237	409	Swi6 fold
-NA	ref_pro_10	ref_ftr_7	461	585	coiled coil
+NA	ref_pro_10	ref_ftr_7	461	585	coiled coil
--- a/functionTemplate.R
+++ b/functionTemplate.R
@ -1,37 +1,37 @@
-# functionTemplate.R
+# functionTemplate.R
-#
+#
-# Purpose:  (General)
+# Purpose:  (General)
-#
+#
-# ToDo:
+# ToDo:
-# Notes:
+# Notes:
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-myFunction <- function(a, b=1) {
+myFunction <- function(a, b=1) {
-	# Purpose:
+	# Purpose:
-	#     Describe ...
+	#     Describe ...
-    # Version:
+    # Version:
-    # Date:
+    # Date:
-    # Author:
+    # Author:
-    #
+    #
-    # Parameters:
+    # Parameters:
-	#     a: ...
+	#     a: ...
-	#     b: ...
+	#     b: ...
-	# Value:
+	# Value:
-	#     result: ...
+	#     result: ...
-	# Example: <example invocation>
+	# Example: <example invocation>
-
+
-	# code ...
+	# code ...
-
+
-	return(result)
+	return(result)
-}
+}
-
+
-
+
-# ====  TESTS  =================================================================
+# ====  TESTS  =================================================================
-# Enter your function tests here...
+# Enter your function tests here...
-
+
-if (FALSE) {
+if (FALSE) {
-  # test ...
+  # test ...
-}
+}
-
+
-# [END]
+# [END]
--- a/myScripts/.myProfile.R
+++ b/myScripts/.myProfile.R
@ -1,21 +1,21 @@
-# .myProfile.R
+# .myProfile.R
-# This contains information which the course framework needs from time to time
+# This contains information which the course framework needs from time to time
-# to personalize assignments, validate submissions etc. Make sure that
+# to personalize assignments, validate submissions etc. Make sure that
-# the information correctly matches our official records.
+# the information correctly matches our official records.
-# myEMail          char      A string with your eMail address. Use your official
+# myEMail          char      A string with your eMail address. Use your official
-#                            UofT eMail address.
+#                            UofT eMail address.
-# myStudentNumber  numeric   Your UofT student number. Take care to have this
+# myStudentNumber  numeric   Your UofT student number. Take care to have this
-#                            correct.
+#                            correct.
-#
+#
-# NOTE:
+# NOTE:
-# After you have updated this script, move the file to your "myScripts" folder.
+# After you have updated this script, move the file to your "myScripts" folder.
-# Utility scripts will look for it on the path: "./myScripts/.myProfile.R"
+# Utility scripts will look for it on the path: "./myScripts/.myProfile.R"
-#
+#
-# ==============================================================================
+# ==============================================================================
-# options(stringsAsFactors = FALSE)
+# options(stringsAsFactors = FALSE)
-
+
-myEMail <- "yh.deng@mail.utoronto.ca"        # e.g. "u.franklin@utoronto.ca"
+myEMail <- "yh.deng@mail.utoronto.ca"        # e.g. "u.franklin@utoronto.ca"
-myStudentNumber <- 1005845285  # e.g. 1003141592
+myStudentNumber <- 1005845285  # e.g. 1003141592
-MYSPE <- "Cutaneotrichosporon oleaginosum" 
+MYSPE <- "Cutaneotrichosporon oleaginosum" 
-
+
-# [END]
+# [END]
--- a/myScripts/ABC-INT-Mutation_impact-code.R
+++ b/myScripts/ABC-INT-Mutation_impact-code.R
@ -1,54 +1,51 @@
-myFA <-             readFASTA("data/RAB39B_HSa_coding.fa")
+gen_mutations <- function(seq, N) {
-myFA <- rbind(myFA, readFASTA("data/PTPN5_HSa_coding.fa"))
+  sealKey() # See: http://steipe.biochemistry.utoronto.ca/abc/index.php/BCH441_Code_submisson_instructions
-myFA <- rbind(myFA, readFASTA("data/PTPN11_HSa_coding.fa"))
+  stats <- c()
-myFA <- rbind(myFA, readFASTA("data/KRAS_HSa_coding.fa"))
+  stats <- cbind(stats, c(0, 0, 0))
-rownames(myFA)<-c("RAB39B", "PTPN5", "PTPN11", "KRAS") # Assign row names
+  rownames(stats) <- c("silent", "missense", "nonsense")
-
+  colnames(stats) <- c("occurrences")
-gen_mutations <- function(seq, N) {
+  # Actual function
-  stats <- c()
+  for (i in 1:N) {
-  stats <- cbind(stats, c(0, 0, 0))
+    original_seq <- Biostrings::DNAString(seq)
-  rownames(stats) <- c("silent", "missense", "nonsense")
+    aa_seq <- Biostrings::translate(original_seq, no.init.codon = TRUE)
-  colnames(stats) <- c("occurrences")
+
-  # Actual function
+    mut_seq <- Biostrings::DNAString(seq)
-  for (i in 1:217) {
+    mut_index <- sample(1:length(original_seq), 1, replace = TRUE)
-    # select index for mutation
+    possible_mutations <- Biostrings::DNA_BASES
-    working_seq <- Biostrings::DNAString(seq)
+    possible_mutations <- possible_mutations[possible_mutations != as.character(unlist(original_seq[mut_index]))]
-    aa_seq <- Biostrings::translate(working_seq, no.init.codon = TRUE)
+    mut_seq <- Biostrings::replaceLetterAt(mut_seq, mut_index, sample(possible_mutations, 1, replace = TRUE))
-    mut_action <- sample(c("ins", "del", "sub"), 1, TRUE)
+    mut_aa <- Biostrings::translate(mut_seq, no.init.codon = TRUE)
-    mut_seq <- Biostrings::DNAString(seq)
+
-    if (mut_action == "sub") {
+
-      mut_index <- sample(1:length(working_seq), 1, replace = TRUE)
+    term_aa <- regexpr(pattern = "\\*", aa_seq)
-      possible_mutations <- Biostrings::DNA_BASES
+    term_mut_aa <- as.integer(regexpr(pattern = "\\*", mut_aa))
-      possible_mutations <- possible_mutations[possible_mutations != as.character(unlist(working_seq[mut_index]))]
+    if ((term_aa == -1 && term_mut_aa != -1) || (term_mut_aa != -1 && term_mut_aa < term_aa)) {
-      mut_change <- sample(possible_mutations, 1, replace = TRUE)
+      stats["nonsense", "occurrences"] <- 1 + stats["nonsense", "occurrences"]
-      mut_seq <- Biostrings::replaceLetterAt(mut_seq, mut_index, mut_change)
+    } else if (mut_aa == aa_seq) {
-    } else if (mut_action == "ins") {
+      stats["silent", "occurrences"] <- 1 + stats["silent", "occurrences"]
-      mut_index <- sample(1:length(working_seq) - 2, 1, replace = TRUE)
+    } else {
-      possible_mutations <- Biostrings::DNA_BASES
+      stats["missense", "occurrences"] <- 1 + stats["missense", "occurrences"]
-      mut_seq <- Biostrings::DNAString(paste(substring(working_seq, 1, mut_index - 1), sample(possible_mutations, 1), substring(working_seq, mut_index), sep = ""))
+    }
-    } else {
+  }
-      mut_index <- sample(1:length(working_seq), 1, replace = TRUE)
+  sealKey()
-      mut_seq <- mut_seq[-mut_index]
+  return(stats)
-    }
+}
-    mut_seq <- Biostrings::DNAString(substring(mut_seq, 1, length(mut_seq) - (length(mut_seq) %% 3)))
+
-    mut_aa <- Biostrings::translate(mut_seq, no.init.codon = TRUE)
+gen_mutations("ATGATGATGATGATGATG", 1000)
-
+gen_mutations("CCCCCCCCCCCCCCCCCC", 500)
-    # Note: we need silent, nonsense, and missense
+gen_mutations("TATTACTATTACTATTAC", 500)
-    mut_aa_stop <- match("*", Biostrings::as.matrix(mut_aa))
+gen_mutations("TGGTGGTGGTGGTGGTGGTGGTGG", 500)
-    aa_seq_stop <- match("*", Biostrings::as.matrix(aa_seq))
+gen_mutations("TGTTGTTGTTGTTGTTGTTGTTGT", 500)
-    if (!is.na(mut_aa_stop) & (is.na(aa_seq_stop) | mut_aa_stop < aa_seq_stop)) {
+gen_mutations("TGTTGTTGTTGTTGTTGTTGTTGA", 500)
-      stats["nonsense", "occurrences"] <- 1 + stats["nonsense", "occurrences"]
+
-    } else if (mut_aa == aa_seq) {
+
-      stats["silent", "occurrences"] <- 1 + stats["silent", "occurrences"]
+myFA <-             readFASTA("data/RAB39B_HSa_coding.fa")
-    } else {
+myFA <- rbind(myFA, readFASTA("data/PTPN5_HSa_coding.fa"))
-      stats["missense", "occurrences"] <- 1 + stats["missense", "occurrences"]
+myFA <- rbind(myFA, readFASTA("data/PTPN11_HSa_coding.fa"))
-    }
+myFA <- rbind(myFA, readFASTA("data/KRAS_HSa_coding.fa"))
-  }
+rownames(myFA)<-c("RAB39B", "PTPN5", "PTPN11", "KRAS") # Assign row names
-  return(stats)
+
-}
+gen_mutations(myFA["RAB39B", 2], 10000)
-N_test <- 1200
+gen_mutations(myFA["PTPN5", 2], 10000)
-gen_mutations("ATGATGATGATGATGATG", N_test)
+gen_mutations(myFA["PTPN11", 2], 10000)
-gen_mutations("CCCCCCCCCCCCCCCCCC", N_test)
+gen_mutations(myFA["KRAS", 2], 10000)
 gen_mutations("TATTACTATTACTATTAC", N_test)
 gen_mutations("TGGTGGTGGTGGTGGTGGTGGTGG", N_test)
 gen_mutations("TGTTGTTGTTGTTGTTGTTGTTGT", N_test)
--- a/myScripts/BIN-Storing_data.R
+++ b/myScripts/BIN-Storing_data.R
@ -1,41 +1,41 @@
-# ==   1.3  Task: submit for credit (part 1/2)  ================================
+# ==   1.3  Task: submit for credit (part 1/2)  ================================
-# == Submission - Code to add another philosopher to the datamodel:
+# == Submission - Code to add another philosopher to the datamodel:
-
+
-pID <- autoincrement(philDB$person)
+pID <- autoincrement(philDB$person)
-immanuelKant <- data.frame(id = pID,
+immanuelKant <- data.frame(id = pID,
-                           name = "Immanuel Kant",
+                           name = "Immanuel Kant",
-                           born = "1724",
+                           born = "1724",
-                           died = "1804",
+                           died = "1804",
-                           school = "Enlightenment Philosophy")
+                           school = "Enlightenment Philosophy")
-philDB$person <- rbind(philDB$person, immanuelKant)
+philDB$person <- rbind(philDB$person, immanuelKant)
-
+
-bID = autoincrement(philDB$books)
+bID = autoincrement(philDB$books)
-immanuelKantWork <- data.frame(id = bID,
+immanuelKantWork <- data.frame(id = bID,
-                               title = "Critique of Pure Reason",
+                               title = "Critique of Pure Reason",
-                               published = "1781")
+                               published = "1781")
-philDB$books <- rbind(philDB$books, immanuelKantWork)
+philDB$books <- rbind(philDB$books, immanuelKantWork)
-philDB$works <- rbind(philDB$works, data.frame(id = autoincrement(philDB$works), personID = pID, bookID = bID))
+philDB$works <- rbind(philDB$works, data.frame(id = autoincrement(philDB$works), personID = pID, bookID = bID))
-
+
-bID = autoincrement(philDB$books)
+bID = autoincrement(philDB$books)
-immanuelKantWork <- data.frame(id = bID,
+immanuelKantWork <- data.frame(id = bID,
-                               title = "Critique of Judgement",
+                               title = "Critique of Judgement",
-                               published = "1790")
+                               published = "1790")
-philDB$books <- rbind(philDB$books, immanuelKantWork)
+philDB$books <- rbind(philDB$books, immanuelKantWork)
-philDB$works <- rbind(philDB$works, data.frame(id = autoincrement(philDB$works), personID = pID, bookID = bID))
+philDB$works <- rbind(philDB$works, data.frame(id = autoincrement(philDB$works), personID = pID, bookID = bID))
-
+
-# == Submission: Code to list the philosophical schools in alphabetical order as well as their respective books in alphabetical order.
+# == Submission: Code to list the philosophical schools in alphabetical order as well as their respective books in alphabetical order.
-
+
-schools <- unique(philDB$person$school)
+schools <- unique(philDB$person$school)
-schools <- sort(schools)
+schools <- sort(schools)
-
+
-for (s in schools) {
+for (s in schools) {
-  cat(sprintf("%s\n", s))
+  cat(sprintf("%s\n", s))
-  authors = which(philDB$person$school == s)
+  authors = which(philDB$person$school == s)
-  for (author in authors) {
+  for (author in authors) {
-    works = which(philDB$works$personID == author)
+    works = which(philDB$works$personID == author)
-    for (work in works) {
+    for (work in works) {
-      bookId = which(philDB$books$id == philDB$works$bookID[work])
+      bookId = which(philDB$books$id == philDB$works$bookID[work])
-      cat(sprintf("\t%s - (%s)\n", philDB$books$title[bookId], philDB$books$published[bookId]))
+      cat(sprintf("\t%s - (%s)\n", philDB$books$title[bookId], philDB$books$published[bookId]))
-    }
+    }
-  }
+  }
 }
--- a/myScripts/CUTOLTaxonomy.json
+++ b/myScripts/CUTOLTaxonomy.json
@ -1,4 +1,4 @@
-[{
+[{
-	"ID": 879819,
+	"ID": 879819,
-	"species": "Cutaneotrichosporon oleaginosum"}
+	"species": "Cutaneotrichosporon oleaginosum"}
-]
+]
--- a/myScripts/MBP1_CUTOL.json
+++ b/myScripts/MBP1_CUTOL.json
@ -1,19 +1,19 @@
-[
+[
-  { "name" : "MBP1_CUTOL",
+  { "name" : "MBP1_CUTOL",
-    "RefSeqID" : "XP_018278493.1",
+    "RefSeqID" : "XP_018278493.1",
-    "UniProtID" : "A0A0J0XLN0",
+    "UniProtID" : "A0A0J0XLN0",
-    "taxonomyID" : 879819,
+    "taxonomyID" : 879819,
-    "sequence" : [
+    "sequence" : [
-       "MGKKAAAAGDGGPNTIYKATYSGVPVFEFICRNVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREVQ",
+       "MGKKAAAAGDGGPNTIYKATYSGVPVFEFICRNVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREVQ",
-       "KGEHEKVQGGYGKYQGTWVPIERGLALAKQYNVEDLLRPIIDFVPRESVSPPPAPKHAVAPPTKRNKEPK",
+       "KGEHEKVQGGYGKYQGTWVPIERGLALAKQYNVEDLLRPIIDFVPRESVSPPPAPKHAVAPPTKRNKEPK",
-       "PKEGLVPIKSAGVLSGTGRHQTPDSVGEDVESEVMDDMSESQTPSPLNGTSLLPAVDERSIDGMDIDGFS",
+       "PKEGLVPIKSAGVLSGTGRHQTPDSVGEDVESEVMDDMSESQTPSPLNGTSLLPAVDERSIDGMDIDGFS",
-       "MMNGGGHARKRSAAMMDDEDEYEQLKRARGNSAVHTPPPPGQSPRYGGMQHPLTQDEYNDIVLNYFVSEA",
+       "MMNGGGHARKRSAAMMDDEDEYEQLKRARGNSAVHTPPPPGQSPRYGGMQHPLTQDEYNDIVLNYFVSEA",
-       "TQIPAVMTNPPYNWDPNGIIDDDHHTALHWAAAMGRTRVIKLLLSAGARIFDKNNLDQTPLMRSVMFTNN",
+       "TQIPAVMTNPPYNWDPNGIIDDDHHTALHWAAAMGRTRVIKLLLSAGARIFDKNNLDQTPLMRSVMFTNN",
-       "YDLRKFPEVFELLHRSTLNIDKNNRTVFHHIANLALYKGKTHAARYYMEVILSRLADYPQELADVINFAD",
+       "YDLRKFPEVFELLHRSTLNIDKNNRTVFHHIANLALYKGKTHAARYYMEVILSRLADYPQELADVINFAD",
-       "EDGETALTLAARARSKRIVKALLDHGADPKLRNRDHKSAEDYILEDERFRSSPDVMLNRTQPSAAPRNPT",
+       "EDGETALTLAARARSKRIVKALLDHGADPKLRNRDHKSAEDYILEDERFRSSPDVMLNRTQPSAAPRNPT",
-       "SLGAAVFSQGLPPQLYNSEAARLASGPHSSDILQQMQALARSFEAEKLNKERDVLEAKAMLTSIHTEVND",
+       "SLGAAVFSQGLPPQLYNSEAARLASGPHSSDILQQMQALARSFEAEKLNKERDVLEAKAMLTSIHTEVND",
-       "AGRTLHNLGEQMKPLEAKQGELDGLVERLQSKLQKDLARGARKWKAADEGRENRWKNGDDPSQAGEDYSD",
+       "AGRTLHNLGEQMKPLEAKQGELDGLVERLQSKLQKDLARGARKWKAADEGRENRWKNGDDPSQAGEDYSD",
-       "LPELTAIPDNAEAEEERLRGEIEKMRARRGELVTRLVKAQTQTGTTDKMAQYRRLITAGCGGDINPGEID",
+       "LPELTAIPDNAEAEEERLRGEIEKMRARRGELVTRLVKAQTQTGTTDKMAQYRRLITAGCGGDINPGEID",
-       "DIVGQLLDMLENEAQSGRPAPPPQAAPSWVTS"]
+       "DIVGQLLDMLENEAQSGRPAPPPQAAPSWVTS"]
-  }
+  }
-]
+]
--- a/myScripts/README-myScripts.txt
+++ b/myScripts/README-myScripts.txt
@ -1,8 +1,8 @@
-README - myScripts folder:
+README - myScripts folder:
-==========================
+==========================
-
+
-The "myScripts" folder is a place to keep your personal files
+The "myScripts" folder is a place to keep your personal files
-safe. No files will be submitted into this folder on the GitHub, master
+safe. No files will be submitted into this folder on the GitHub master
-copy. Thefore everything you put into this folder is safe from being
+copy. Therefore everything you put into this folder is safe from being
-inadvertently overwritten by a file with the same name that would be
+inadvertently overwritten by a file with the same name that would be
-downloaded in a GitHub "pull" request.
+downloaded in a GitHub "pull" request.
--- a/myScripts/makeProteinDB.R
+++ b/myScripts/makeProteinDB.R
@ -1,4 +1,4 @@
-source("./scripts/ABC-createRefDB.R")
+source("./scripts/ABC-createRefDB.R")
-
+
-myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./myScripts/MBP1_CUTOL.json"))
+myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./myScripts/MBP1_CUTOL.json"))
-myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./myScripts/CUTOLtaxonomy.json"))
+myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./myScripts/CUTOLtaxonomy.json"))
--- a/myScripts/myScript.R
+++ b/myScripts/myScript.R
@ -1,38 +1,38 @@
-# myScript.R
+# myScript.R
-#
+#
-# --- As you work with this file, you can delete the instructions below --------
+# --- As you work with this file, you can delete the instructions below --------
-# Write your notes and code experiments into this document. Save it
+# Write your notes and code experiments into this document. Save it
-# from time to time - however I recommend that you do not _commit_
+# from time to time - however I recommend that you do not _commit_
-# your saved version.
+# your saved version.
-#
+#
-# As long as you do not _commit_ this script to version control,
+# As long as you do not _commit_ this script to version control,
-# you can _pull_ updated versions of the entire project from GitHub
+# you can _pull_ updated versions of the entire project from GitHub
-# by using the RStudio version control interface. However, once
+# by using the RStudio version control interface. However, once
-# you _commit_ any file in your local version, RStudio will require
+# you _commit_ any file in your local version, RStudio will require
-# you to resolve conflicts before you can _pull_ updates.
+# you to resolve conflicts before you can _pull_ updates.
-# --- As you work with this file, you can delete the instructions above --------
+# --- As you work with this file, you can delete the instructions above --------
-#
+#
-## Purpose: <...>
+## Purpose: <...>
-#
+#
-# Version: <...>
+# Version: <...>
-#
+#
-# Date:    <...>
+# Date:    <...>
-# Author:  <Name> (<namee@mail.utoronto.ca>)
+# Author:  <Name> (<namee@mail.utoronto.ca>)
-#
+#
-# Versions:
+# Versions:
-#
+#
-#   <number>    <Features>
+#   <number>    <Features>
-#
+#
-# TODO:
+# TODO:
-#   <...>
+#   <...>
-#
+#
-# ====================================================================
+# ====================================================================
-
+
-
+
-
+
-
+
-
+
-
+
-
+
-# [END]
+# [END]
-
+
--- a/plottingReference.R
+++ b/plottingReference.R
--- a/scriptTemplate.R
+++ b/scriptTemplate.R
@ -1,75 +1,75 @@
-# scriptTemplate.R
+# scriptTemplate.R
-#
+#
-# Purpose:
+# Purpose:
-# Version:
+# Version:
-# Date:
+# Date:
-# Author:
+# Author:
-#
+#
-# Input:
+# Input:
-# Output:
+# Output:
-# Dependencies:
+# Dependencies:
-#
+#
-# ToDo:
+# ToDo:
-# Notes:
+# Notes:
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-setwd("<your/project/directory>")
+setwd("<your/project/directory>")
-
+
-# ====  PARAMETERS  ============================================================
+# ====  PARAMETERS  ============================================================
-# Define and explain all parameters. No "magic numbers" in your code below.
+# Define and explain all parameters. No "magic numbers" in your code below.
-
+
-
+
-
+
-# ====  PACKAGES  ==============================================================
+# ====  PACKAGES  ==============================================================
-# Check that required packages have been installed. Install if needed.
+# Check that required packages have been installed. Install if needed.
-
+
-if (! requireNamespace("seqinr", quietly=TRUE)) {
+if (! requireNamespace("seqinr", quietly=TRUE)) {
-  install.packages("seqinr")
+  install.packages("seqinr")
-}
+}
-# Package information:
+# Package information:
-#  library(help = seqinr)       # basic information
+#  library(help = seqinr)       # basic information
-#  browseVignettes("seqinr")    # available vignettes
+#  browseVignettes("seqinr")    # available vignettes
-#  data(package = "seqinr")     # available datasets
+#  data(package = "seqinr")     # available datasets
-
+
-# Note: use package functions with the :: operator - eg.
+# Note: use package functions with the :: operator - eg.
-# seqinr::aaa("K")
+# seqinr::aaa("K")
-
+
-
+
-
+
-# ====  FUNCTIONS  =============================================================
+# ====  FUNCTIONS  =============================================================
-
+
-# Define functions or source external files
+# Define functions or source external files
-source("<myUtilityFunctionsScript.R>")
+source("<myUtilityFunctionsScript.R>")
-
+
-myFunction <- function(a, b=1) {
+myFunction <- function(a, b=1) {
-	# Purpose:
+	# Purpose:
-	#     Describe ...
+	#     Describe ...
-	# Parameters:
+	# Parameters:
-	#     a: ...
+	#     a: ...
-	#     b: ...
+	#     b: ...
-	# Value:
+	# Value:
-	#     result: ...
+	#     result: ...
-
+
-	# code ...
+	# code ...
-
+
-	return(result)
+	return(result)
-}
+}
-
+
-
+
-
+
-# ====  PROCESS  ===============================================================
+# ====  PROCESS  ===============================================================
-# Enter the step-by-step process of your project here. Strive to write your
+# Enter the step-by-step process of your project here. Strive to write your
-# code so that you can simply run this entire file and re-create all
+# code so that you can simply run this entire file and re-create all
-# intermediate results.
+# intermediate results.
-
+
-
+
-
+
-
+
-
+
-
+
-# ====  TESTS  =================================================================
+# ====  TESTS  =================================================================
-# Enter your function tests here...
+# Enter your function tests here...
-
+
-
+
-# [END]
+# [END]
--- a/scripts/ABC-createRefDB.R
+++ b/scripts/ABC-createRefDB.R
@ -1,30 +1,30 @@
-# ABC-createRefDB.R
+# ABC-createRefDB.R
-#
+#
-# Create a reference protein database for Mbp1-like proteins
+# Create a reference protein database for Mbp1-like proteins
-#
+#
-# Boris Steipe for ABC learning units
+# Boris Steipe for ABC learning units
-#
+#
-# For the species, see:
+# For the species, see:
-# http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
+# http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
-#
+#
-# For the data model, see
+# For the data model, see
-# https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0
+# https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0
-# For the schema, see dbInit() in ./scripts/ABC-dbUtilities.R
+# For the schema, see dbInit() in ./scripts/ABC-dbUtilities.R
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-myDB <- dbInit()
+myDB <- dbInit()
-
+
-myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/MBP1_SACCE.json"))
+myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/MBP1_SACCE.json"))
-myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refMBP1Proteins.json"))
+myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refMBP1Proteins.json"))
-myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refAPSES_PSI-BLAST.json"))
+myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refAPSES_PSI-BLAST.json"))
-
+
-myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./data/refTaxonomy.json"))
+myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./data/refTaxonomy.json"))
-
+
-myDB <- dbAddFeature(myDB, jsonlite::fromJSON("./data/refFeatures.json"))
+myDB <- dbAddFeature(myDB, jsonlite::fromJSON("./data/refFeatures.json"))
-
+
-myDB <- dbAddAnnotation( myDB, jsonlite::fromJSON("./data/refAnnotations.json"))
+myDB <- dbAddAnnotation( myDB, jsonlite::fromJSON("./data/refAnnotations.json"))
-
+
-
+
-# [END]
+# [END]
--- a/scripts/ABC-dbUtilities.R
+++ b/scripts/ABC-dbUtilities.R
--- a/scripts/ABC-makeMYSPElist.R
+++ b/scripts/ABC-makeMYSPElist.R
@ -1,443 +1,443 @@
-# tocID <- "scripts/ABC-makeMYSPElist.R"
+# tocID <- "scripts/ABC-makeMYSPElist.R"
-#
+#
-# Purpose:  Create a list of genome sequenced fungi with protein annotations and
+# Purpose:  Create a list of genome sequenced fungi with protein annotations and
-#               Mbp1 homologues.
+#               Mbp1 homologues.
-#
+#
-# Version: 1.4
+# Version: 1.4
-#
+#
-# Date:    2016  09  -  2021  09
+# Date:    2016  09  -  2021  09
-# Author:  Boris Steipe (boris.steipe@utoronto.ca)
+# Author:  Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions
+# Versions
-#          1.4    New retrieval logic
+#          1.4    New retrieval logic
-#          1.3    Rewrite to change datasource. NCBI has not been updated
+#          1.3    Rewrite to change datasource. NCBI has not been updated
-#                   since 2012. Use ensembl fungi as initial source.
+#                   since 2012. Use ensembl fungi as initial source.
-#          1.2    Change from require() to requireNamespace()
+#          1.2    Change from require() to requireNamespace()
-#          1.1.2  Moved BLAST.R to ./scripts directory
+#          1.1.2  Moved BLAST.R to ./scripts directory
-#          1.1    Update 2017
+#          1.1    Update 2017
-#          1.0    First code 2016
+#          1.0    First code 2016
-#
+#
-# TODO:
+# TODO:
-#
+#
-# ==============================================================================
+# ==============================================================================
-#
+#
-# DO NOT  source()  THIS FILE!
+# DO NOT  source()  THIS FILE!
-#
+#
-# This file is code I provide for your deeper understanding of a process and
+# This file is code I provide for your deeper understanding of a process and
-# to provide you with useful sample code. It is not actually necessary for
+# to provide you with useful sample code. It is not actually necessary for
-# you to run this code, but I encourage you to read it carefully and discuss
+# you to run this code, but I encourage you to read it carefully and discuss
-# if there are parts you don't understand.
+# if there are parts you don't understand.
-#
+#
-# Run the commands that interact with the NCBI servers only if you want to
+# Run the commands that interact with the NCBI servers only if you want to
-# experiment specifically with the code and/or parameters. I have commented out
+# experiment specifically with the code and/or parameters. I have commented out
-# those parts. If you only want to study the general workflow, just load()
+# those parts. If you only want to study the general workflow, just load()
-# the respective intermediate results.
+# the respective intermediate results.
-#
+#
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC> 
+#TOC> 
-#TOC>   Section  Title                                    Line
+#TOC>   Section  Title                                    Line
-#TOC> --------------------------------------------------------
+#TOC> --------------------------------------------------------
-#TOC>   1        The strategy                               55
+#TOC>   1        The strategy                               55
-#TOC>   2        PACKAGES AND INITIALIZATIONS               67
+#TOC>   2        PACKAGES AND INITIALIZATIONS               67
-#TOC>   3        ENSEMBL FUNGI                              75
+#TOC>   3        ENSEMBL FUNGI                              75
-#TOC>   3.1        Import                                   78
+#TOC>   3.1        Import                                   78
-#TOC>   4        BLAST SEARCH                              155
+#TOC>   4        BLAST SEARCH                              155
-#TOC>   4.1        find homologous proteins                161
+#TOC>   4.1        find homologous proteins                161
-#TOC>   4.2        Identify species in "hits"              192
+#TOC>   4.2        Identify species in "hits"              192
-#TOC>   5        MERGE ENSEMBL AND BLAST RESULTS           282
+#TOC>   5        MERGE ENSEMBL AND BLAST RESULTS           282
-#TOC>   6        STUDENT NUMBERS                           375
+#TOC>   6        STUDENT NUMBERS                           375
-#TOC> 
+#TOC> 
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  The strategy  ========================================================
+# =    1  The strategy  ========================================================
-
+
-# This script will create a list of "MYSPE" species and save it in an R object
+# This script will create a list of "MYSPE" species and save it in an R object
-# MYSPEspecies that is stored in the data subdirectory of this project from
+# MYSPEspecies that is stored in the data subdirectory of this project from
-# where it can be loaded. The strategy is as follows: we download a list of
+# where it can be loaded. The strategy is as follows: we download a list of
-# annotated fungal genomes from ensembl.fungi. All these are genome-sequenced
+# annotated fungal genomes from ensembl.fungi. All these are genome-sequenced
-# species that have been annotated.
+# species that have been annotated.
-# Next we perform a BLAST search, to identify fungal species that have
+# Next we perform a BLAST search, to identify fungal species that have
-# genes that are homologous to yeast MBP1.
+# genes that are homologous to yeast MBP1.
-#
+#
-# ...
+# ...
-
+
-# =    2  PACKAGES AND INITIALIZATIONS  ========================================
+# =    2  PACKAGES AND INITIALIZATIONS  ========================================
-
+
-# httr provides interfaces to Webservers on the Internet
+# httr provides interfaces to Webservers on the Internet
-if (! requireNamespace("httr", quietly = TRUE)) {
+if (! requireNamespace("httr", quietly = TRUE)) {
-  install.packages("httr")
+  install.packages("httr")
-}
+}
-
+
-
+
-# =    3  ENSEMBL FUNGI  =======================================================
+# =    3  ENSEMBL FUNGI  =======================================================
-
+
-
+
-# ==   3.1  Import  ============================================================
+# ==   3.1  Import  ============================================================
-
+
-# Navigate to https://fungi.ensembl.org and click on the link to the full
+# Navigate to https://fungi.ensembl.org and click on the link to the full
-# list of all species: https://fungi.ensembl.org/species.html
+# list of all species: https://fungi.ensembl.org/species.html
-# On the page, click on the spreadsheet symbol top right and choose
+# On the page, click on the spreadsheet symbol top right and choose
-# "download whole table". The file will be named  "Species.csv", in your
+# "download whole table". The file will be named  "Species.csv", in your
-# usual downloads folder. Move it to the data folder, and read it.
+# usual downloads folder. Move it to the data folder, and read it.
-
+
-sDat <- read.csv("./data/Species.csv")
+sDat <- read.csv("./data/Species.csv")
-str(sDat)
+str(sDat)
-
+
-# The most obvious way to partition these is according to Classification ...
+# The most obvious way to partition these is according to Classification ...
-# (poking around a bit in the UniProt taxonomy database shows that the
+# (poking around a bit in the UniProt taxonomy database shows that the
-#  classification used here is the taxonomic rank of "order").
+#  classification used here is the taxonomic rank of "order").
-# how many classifications do we have?
+# how many classifications do we have?
-length(unique(sDat$Classification))  # 66
+length(unique(sDat$Classification))  # 66
-
+
-# To have a good set for the class, we should have about 100.
+# To have a good set for the class, we should have about 100.
-# Let's see for which of these we can find Mbp1 homologues.
+# Let's see for which of these we can find Mbp1 homologues.
-# First, we'll keep only the colums for name, classification, and taxID, and
+# First, we'll keep only the columns for name, classification, and taxID, and
-# drop the rest ...
+# drop the rest ...
-sDat <- sDat[ , c("Name", "Classification", "Taxon.ID")]
+sDat <- sDat[ , c("Name", "Classification", "Taxon.ID")]
-colnames(sDat) <- c("name", "order", "taxID")
+colnames(sDat) <- c("name", "order", "taxID")
-
+
-# Next, we make an extra column: genus - the first part of the binomial name.
+# Next, we make an extra column: genus - the first part of the binomial name.
-# We'll use the gsub() function, and for that we need a "regular expression"
+# We'll use the gsub() function, and for that we need a "regular expression"
-# that matches to all characters from the first blank to the end of the string:
+# that matches to all characters from the first blank to the end of the string:
-myPatt <- "\\s.*$"  # one whitespace (\\s) ...
+myPatt <- "\\s.*$"  # one whitespace (\\s) ...
-                    # followed by any character (.) 0..n times (*) ...
+                    # followed by any character (.) 0..n times (*) ...
-                    # until the end of the string
+                    # until the end of the string
-
+
-# using gsub() we substitue all matching characters with the empty string "" -
+# using gsub() we substitute all matching characters with the empty string "" -
-# this deletes the matching characters
+# this deletes the matching characters
-# Test this:
+# Test this:
-gsub(myPatt, "", "Genus")                      # one word: unchanged
+gsub(myPatt, "", "Genus")                      # one word: unchanged
-gsub(myPatt, "", "gEnus species")              # two words: return only first
+gsub(myPatt, "", "gEnus species")              # two words: return only first
-gsub(myPatt, "", "geNus species strain 123")   # many words: return only first
+gsub(myPatt, "", "geNus species strain 123")   # many words: return only first
-
+
-# apply this to the "name" column and add the result as a separate column
+# apply this to the "name" column and add the result as a separate column
-# called "genus"
+# called "genus"
-sDat$genus <- gsub(myPatt, "", sDat$name)
+sDat$genus <- gsub(myPatt, "", sDat$name)
-
+
-# what do we get?
+# what do we get?
-c(head(unique(sDat$genus)),
+c(head(unique(sDat$genus)),
-  tail(unique(sDat$genus)))  # inspect the first and last few. Note that there
+  tail(unique(sDat$genus)))  # inspect the first and last few. Note that there
-                             # is a problem that we have to keep in mind.
+                             # is a problem that we have to keep in mind.
-                             # (Always inspect your results!)
+                             # (Always inspect your results!)
-# Drop all rows for which the genus contains special chracters -
+# Drop all rows for which the genus contains special characters -
-# like "[Candida]"
+# like "[Candida]"
-sDat <- sDat[ ! grepl("[^a-zA-Z]", sDat$genus) , ]
+sDat <- sDat[ ! grepl("[^a-zA-Z]", sDat$genus) , ]
-
+
-length(table(sDat$genus))    # how many genus?
+length(table(sDat$genus))    # how many genus?
-hist(table(sDat$genus), col = "#E9F4FF")      # Distribution ...
+hist(table(sDat$genus), col = "#E9F4FF")      # Distribution ...
-                                              # most genus have very few, but
+                                              # most genus have very few, but
-                                              # some have very many species.
+                                              # some have very many species.
-sort(table(sDat$genus), decreasing = TRUE)[1:10]  # Top ten...
+sort(table(sDat$genus), decreasing = TRUE)[1:10]  # Top ten...
-
+
-# We should have at least one species from each taxonomic order, but we can
+# We should have at least one species from each taxonomic order, but we can
-# add a few genus until we have about 100 validated species.
+# add a few genus until we have about 100 validated species.
-
+
-# Let's add a column for species, by changing our regular expression a bit,
+# Let's add a column for species, by changing our regular expression a bit,
-# using ^ (start of string), \\S (NOT a whitespace),
+# using ^ (start of string), \\S (NOT a whitespace),
-# and + (one or more matches), capturing the match (...), and returning
+# and + (one or more matches), capturing the match (...), and returning
-# it as the substitution (\\1) ...
+# it as the substitution (\\1) ...
-
+
-myPatt <- "^(\\S+\\s\\S+)\\s.*$"
+myPatt <- "^(\\S+\\s\\S+)\\s.*$"
-sDat$species <- gsub(myPatt, "\\1", sDat$name)
+sDat$species <- gsub(myPatt, "\\1", sDat$name)
-
+
-# And we reorder the columns, just for aesthetics:
+# And we reorder the columns, just for aesthetics:
-sDat <- sDat[ , c("name", "species", "genus", "order", "taxID")]
+sDat <- sDat[ , c("name", "species", "genus", "order", "taxID")]
-
+
-# Final check:
+# Final check:
-any(grepl("[^a-zA-Z -]", sDat$species)) # FALSE means no special characters
+any(grepl("[^a-zA-Z -]", sDat$species)) # FALSE means no special characters
-
+
-#
+#
-# Now we check which of these have Mbp1 homologues ...
+# Now we check which of these have Mbp1 homologues ...
-
+
-# =    4  BLAST SEARCH  ========================================================
+# =    4  BLAST SEARCH  ========================================================
-
+
-
+
-# We run a BLAST search to find all proteins related to yeast Mbp1 in any
+# We run a BLAST search to find all proteins related to yeast Mbp1 in any
-# fungus. With the results, we'll annotate our sDat table.
+# fungus. With the results, we'll annotate our sDat table.
-
+
-# ==   4.1  find homologous proteins  ==========================================
+# ==   4.1  find homologous proteins  ==========================================
-#
+#
-# Use BLAST to fetch proteins related to Mbp1 and identify the species that
+# Use BLAST to fetch proteins related to Mbp1 and identify the species that
-# contain them.
+# contain them.
-
+
-# Scripting against NCBI APIs is not exactly enjoyable - there is usually a fair
+# Scripting against NCBI APIs is not exactly enjoyable - there is usually a fair
-# amount of error handling involved that is not supported by the API in a
+# amount of error handling involved that is not supported by the API in a
-# principled way but requires rather ad hoc solutions. The code I threw together
+# principled way but requires rather ad hoc solutions. The code I threw together
-# to make a BLAST interface (demo-quality, not research-quality) is in the file
+# to make a BLAST interface (demo-quality, not research-quality) is in the file
-# ./scripts/BLAST.R Feel encouraged to study how this works. It's a pretty
+# ./scripts/BLAST.R Feel encouraged to study how this works. It's a pretty
-# standard task of communicating with servers and parsing responses - everyday
+# standard task of communicating with servers and parsing responses - everyday
-# fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST
+# fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST
-# parser in currently available packages.
+# parser in currently available packages.
-#
+#
-# DON'T use this for BLAST searches unless you have read the NCBI policy
+# DON'T use this for BLAST searches unless you have read the NCBI policy
-# for automated tasks. If you indicriminately pound on the NCBI's BLAST
+# for automated tasks. If you indiscriminately pound on the NCBI's BLAST
-# server, they will blacklist your IP-address. See:
+# server, they will blacklist your IP-address. See:
-# https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
+# https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
-#
+#
-# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
+# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
-# BLASThits <- BLAST("NP_010227",                  # Yeast Mbp1 RefSeq ID
+# BLASThits <- BLAST("NP_010227",                  # Yeast Mbp1 RefSeq ID
-#                    db = "refseq_protein",        # database to search in
+#                    db = "refseq_protein",        # database to search in
-#                    nHits = 3000,                 # 945 hits in 2020
+#                    nHits = 3000,                 # 945 hits in 2020
-#                    E = 0.01,                     #
+#                    E = 0.01,                     #
-#                    limits = "txid4751[ORGN]")    # = fungi
+#                    limits = "txid4751[ORGN]")    # = fungi
-# saveRDS(BLASThits, file="data/BLASThits.rds")
+# saveRDS(BLASThits, file="data/BLASThits.rds")
-#
+#
-# NO NEED TO ACTUALLY RUN THIS:you can load the results from the data directory
+# NO NEED TO ACTUALLY RUN THIS: you can load the results from the data directory
-#
+#
-BLASThits <- readRDS(file = "data/BLASThits.rds")
+BLASThits <- readRDS(file = "data/BLASThits.rds")
-
+
-# ==   4.2  Identify species in "hits"  ========================================
+# ==   4.2  Identify species in "hits"  ========================================
-
+
-# This is a very big list that can't be usefully analyzed manually. Here
+# This is a very big list that can't be usefully analyzed manually. Here
-# we are only interested in the species names that it contains.
+# we are only interested in the species names that it contains.
-
+
-# How many hits in the list?
+# How many hits in the list?
-length(BLASThits$hits)      # 1,134
+length(BLASThits$hits)      # 1,134
-
+
-# Let's look at a hit somewhere down the list
+# Let's look at a hit somewhere down the list
-str(BLASThits$hit[[277]])
+str(BLASThits$hit[[277]])
-
+
-# A fair amount of parsing has gone into the BLAST.R code to prepare the results
+# A fair amount of parsing has gone into the BLAST.R code to prepare the results
-# in a useful way. The species information is in the $species element of every
+# in a useful way. The species information is in the $species element of every
-# hit.
+# hit.
-
+
-# Run a loop to extract all the species names into a vector. We subset ...
+# Run a loop to extract all the species names into a vector. We subset ...
-# Blasthits$hits                 ... the list of hits, from which we choose ...
+# BLASThits$hits                 ... the list of hits, from which we choose ...
-# Blasthits$hits[[i]]            ... the i-th hit, and get ...
+# BLASThits$hits[[i]]            ... the i-th hit, and get ...
-# Blasthits$hits[[i]]$species    ... the species element from that.
+# BLASThits$hits[[i]]$species    ... the species element from that.
-# Subsetting FTW.
+# Subsetting FTW.
-
+
-BLASTspecies <- character()
+BLASTspecies <- character()
-for (i in seq_along(BLASThits$hits)) {
+for (i in seq_along(BLASThits$hits)) {
-    BLASTspecies[i] <- BLASThits$hits[[i]]$species
+    BLASTspecies[i] <- BLASThits$hits[[i]]$species
-}
+}
-
+
-# You can confirm that BLASTspecies has the expected size.
+# You can confirm that BLASTspecies has the expected size.
-length(BLASTspecies)
+length(BLASTspecies)
-
+
-# if we delete some of these later on, we still want to remember which hit
+# if we delete some of these later on, we still want to remember which hit
-# they came from. Thus we name() the elements with their index, which is the
+# they came from. Thus we name() the elements with their index, which is the
-# same as the index of the hit in BLASThits
+# same as the index of the hit in BLASThits
-names(BLASTspecies) <- 1:length(BLASTspecies)
+names(BLASTspecies) <- 1:length(BLASTspecies)
-
+
-
+
-# let's plot the distribution of E-values
+# let's plot the distribution of E-values
-eVals <- numeric()
+eVals <- numeric()
-for (i in seq_along(BLASThits$hits)) {
+for (i in seq_along(BLASThits$hits)) {
-  eVals[i] <- BLASThits$hits[[i]]$E
+  eVals[i] <- BLASThits$hits[[i]]$E
-}
+}
-range(eVals)
+range(eVals)
-sum(eVals == 0)
+sum(eVals == 0)
-
+
-# let's plot the log of all values > 0 to see how they are distributed
+# let's plot the log of all values > 0 to see how they are distributed
-# plotting only one vectyor of numbers plots their index as x, and
+# plotting only one vector of numbers plots their index as x, and
-# their value as y ...
+# their value as y ...
-plot(log(eVals[eVals > 0]), col = "#CC0000")
+plot(log(eVals[eVals > 0]), col = "#CC0000")
-
+
-# This is very informative: I would suspect that the first ten or so are
+# This is very informative: I would suspect that the first ten or so are
-# virtually identical to the yeast protein, then we have about 800 hits with
+# virtually identical to the yeast protein, then we have about 800 hits with
-# decreasing similarity, and then about 200 more that may actually be false
+# decreasing similarity, and then about 200 more that may actually be false
-# positives. Also - we plotted them by index, that means the table is SORTED:
+# positives. Also - we plotted them by index, that means the table is SORTED:
-# Lower E-values strictly come before higher E-values.
+# Lower E-values strictly come before higher E-values.
-
+
-# Again, some species appear more than once, e.g. ...
+# Again, some species appear more than once, e.g. ...
-sum(BLASTspecies == "Saccharomyces cerevisiae")
+sum(BLASTspecies == "Saccharomyces cerevisiae")
-
+
-# ... corresponding to the five homologous gene sequences (paralogues) of yeast.
+# ... corresponding to the five homologous gene sequences (paralogues) of yeast.
-
+
-# Therefore we remove duplicates. Removing duplicates will leave the FIRST
+# Therefore we remove duplicates. Removing duplicates will leave the FIRST
-# in a list alone, and only remove the SUBSEQUENT ones. Which means, from each
+# in a list alone, and only remove the SUBSEQUENT ones. Which means, from each
-# species, we will retain only the protein that has the highest similarity
+# species, we will retain only the protein that has the highest similarity
-# to yeast Mbp1, not any of its more distant paralogues.
+# to yeast Mbp1, not any of its more distant paralogues.
-sel <- ! duplicated(BLASTspecies)
+sel <- ! duplicated(BLASTspecies)
-BLASTspecies <- BLASTspecies[sel]
+BLASTspecies <- BLASTspecies[sel]
-
+
-length(BLASTspecies)
+length(BLASTspecies)
-# i.e. we got rid of about two thirds of the hits.
+# i.e. we got rid of about two thirds of the hits.
-tail(BLASTspecies)  # see how the names are useful!
+tail(BLASTspecies)  # see how the names are useful!
-                    # again - there are some special characters ...
+                    # again - there are some special characters ...
-                    # what are they?
+                    # what are they?
-BLASTspecies[grep("[^a-zA-Z ]", BLASTspecies)]
+BLASTspecies[grep("[^a-zA-Z ]", BLASTspecies)]
-
+
-# remove the brackets ...
+# remove the brackets ...
-BLASTspecies <- gsub("\\[|\\]", "", BLASTspecies)
+BLASTspecies <- gsub("\\[|\\]", "", BLASTspecies)
-# drop any new duplicates ...
+# drop any new duplicates ...
-BLASTspecies <- BLASTspecies[ ! duplicated(BLASTspecies)]
+BLASTspecies <- BLASTspecies[ ! duplicated(BLASTspecies)]
-
+
-# check the number again:
+# check the number again:
-length(BLASTspecies)
+length(BLASTspecies)
-# Think a bit about this: what may be the biological reason to find that
+# Think a bit about this: what may be the biological reason to find that
-# on average, in 388 fungi across the entire phylogenetic tree, we have
+# on average, in 388 fungi across the entire phylogenetic tree, we have
-# three sequences that are homologous to yeast Mbp1?
+# three sequences that are homologous to yeast Mbp1?
-
+
-# Let's look at the distribution of E-values in this selection (Subsetting FTW):
+# Let's look at the distribution of E-values in this selection (Subsetting FTW):
-# we plot all values that are TRUE in the vector "sel" that we created above,
+# we plot all values that are TRUE in the vector "sel" that we created above,
-# AND greater than 0
+# AND greater than 0
-plot(log(eVals[sel & eVals > 0]), col = "#00CC00")
+plot(log(eVals[sel & eVals > 0]), col = "#00CC00")
-
+
-
+
-# =    5  MERGE ENSEMBL AND BLAST RESULTS  =====================================
+# =    5  MERGE ENSEMBL AND BLAST RESULTS  =====================================
-
+
-# Next we add the blast result to our sDat dataframe. We'll store the index,
+# Next we add the blast result to our sDat dataframe. We'll store the index,
-# the E-value, and the Query-bounds from which we can estimate which domains
+# the E-value, and the Query-bounds from which we can estimate which domains
-# of Mbp1 are actually covered by the hit. (True orthologues MUST align with
+# of Mbp1 are actually covered by the hit. (True orthologues MUST align with
-# Mbp1's N-terminal APSES domain.)
+# Mbp1's N-terminal APSES domain.)
-#
+#
-# First we pull the hits we wanted from the BLASTspecies:
+# First we pull the hits we wanted from the BLASTspecies:
-iHits <- as.numeric(names(BLASTspecies))
+iHits <- as.numeric(names(BLASTspecies))
-length(iHits)     # one index for each TRUE in sel
+length(iHits)     # one index for each TRUE in sel
-
+
-# add columns to sDat
+# add columns to sDat
-l <- nrow(sDat)
+l <- nrow(sDat)
-sDat$iHit   <- numeric(l)  # index of the hit in the BLAST results
+sDat$iHit   <- numeric(l)  # index of the hit in the BLAST results
-sDat$eVal   <- numeric(l)  # E-value of the hit
+sDat$eVal   <- numeric(l)  # E-value of the hit
-sDat$lAli   <- numeric(l)  # length of the aligned region
+sDat$lAli   <- numeric(l)  # length of the aligned region
-
+
-# extract and merge
+# extract and merge
-for (iHit in iHits) {
+for (iHit in iHits) {
-  thisSp <- BLASThits$hits[[iHit]]$species
+  thisSp <- BLASThits$hits[[iHit]]$species
-  sel <- sDat$species == thisSp
+  sel <- sDat$species == thisSp
-
+
-  sDat$iHit[sel]   <- iHit
+  sDat$iHit[sel]   <- iHit
-  sDat$eVal[sel]   <- BLASThits$hits[[iHit]]$E
+  sDat$eVal[sel]   <- BLASThits$hits[[iHit]]$E
-  sDat$lAli[sel]   <- BLASThits$hits[[iHit]]$lengthAli
+  sDat$lAli[sel]   <- BLASThits$hits[[iHit]]$lengthAli
-}
+}
-
+
-# Are all reference species accounted for?
+# Are all reference species accounted for?
-selA <- sDat$iHit != 0                 # all rows which matched to a BLAST hit
+selA <- sDat$iHit != 0                 # all rows which matched to a BLAST hit
-REFspecies %in% sDat$species[selA]     # yes, all there
+REFspecies %in% sDat$species[selA]     # yes, all there
-
+
-selB <- sDat$species %in% REFspecies   # all rows which have one of REF species
+selB <- sDat$species %in% REFspecies   # all rows which have one of REF species
-
+
-sum(selA & selB)   # How many rows?
+sum(selA & selB)   # How many rows?
-
+
-# sDat of course includes all duplicates. Some may be multiply sequenced, some
+# sDat of course includes all duplicates. Some may be multiply sequenced, some
-# may be different strains. We'll use the same strategy as before and keep
+# may be different strains. We'll use the same strategy as before and keep
-# only the best hit: order the rows by E-value, then drop all rows which
+# only the best hit: order the rows by E-value, then drop all rows which
-# are duplicated.
+# are duplicated.
-
+
-
+
-# drop all rows without BLAST hits ...
+# drop all rows without BLAST hits ...
-sDat <- sDat[ ! (sDat$iHit == 0) , ]
+sDat <- sDat[ ! (sDat$iHit == 0) , ]
-
+
-# order sDat by E-value ...
+# order sDat by E-value ...
-sDat <- sDat[order(sDat$eVal, decreasing = FALSE) , ]
+sDat <- sDat[order(sDat$eVal, decreasing = FALSE) , ]
-
+
-# drop all rows with duplicated species ...
+# drop all rows with duplicated species ...
-sDat <- sDat[ ! duplicated(sDat$species) , ]
+sDat <- sDat[ ! duplicated(sDat$species) , ]
-
+
-# Lets look at the E-values ...
+# Let's look at the E-values ...
-plot(log(sDat$eVal[sDat$eVal > 0]), col = "#00CC00")
+plot(log(sDat$eVal[sDat$eVal > 0]), col = "#00CC00")
-
+
-# and alignment lengths ...
+# and alignment lengths ...
-plot(sDat$lAli, col = "#00DDAA")
+plot(sDat$lAli, col = "#00DDAA")
-
+
-# How many ...
+# How many ...
-length(unique(sDat$name))
+length(unique(sDat$name))
-length(unique(sDat$species))
+length(unique(sDat$species))
-length(unique(sDat$genus))
+length(unique(sDat$genus))
-length(unique(sDat$order))
+length(unique(sDat$order))
-
+
-# I need an extra species for admin purposes later on ...
+# I need an extra species for admin purposes later on ...
-sel <- grep("Sporothrix schenckii", sDat$species)
+sel <- grep("Sporothrix schenckii", sDat$species)
-SPOSCdat <- sDat[sel, ]
+SPOSCdat <- sDat[sel, ]
-sDat <- sDat[-sel, ]
+sDat <- sDat[-sel, ]
-
+
-# To get the final dataset, we remove the reference species with their
+# To get the final dataset, we remove the reference species with their
-# entire orders ...
+# entire orders ...
-REForders <- unique(sDat$order[sDat$species %in% REFspecies])
+REForders <- unique(sDat$order[sDat$species %in% REFspecies])
-sel <- sDat$order %in% REForders
+sel <- sDat$order %in% REForders
-REFdat <- sDat[sel , ]
+REFdat <- sDat[sel , ]
-sDat   <- sDat[ ! sel , ]
+sDat   <- sDat[ ! sel , ]
-
+
-# REFdat should now contain only the REFspecies ...
+# REFdat should now contain only the REFspecies ...
-( REFdat <- REFdat[REFdat$species %in% REFspecies , ] )
+( REFdat <- REFdat[REFdat$species %in% REFspecies , ] )
-
+
-# ... but all of them
+# ... but all of them
-sum(REFspecies %in% REFdat$species)
+sum(REFspecies %in% REFdat$species)
-
+
-# ... and we have enough left in sDat to prune sDat to unique genus
+# ... and we have enough left in sDat to prune sDat to unique genus
-sDat <- sDat[ ! duplicated(sDat$genus) , ]
+sDat <- sDat[ ! duplicated(sDat$genus) , ]
-nrow(sDat)   # 84
+nrow(sDat)   # 84
-
+
-# I add back "Sporothrix schenckii" ...
+# I add back "Sporothrix schenckii" ...
-sDat <- rbind(SPOSCdat, sDat)
+sDat <- rbind(SPOSCdat, sDat)
-
+
-# ... and save for future use.
+# ... and save for future use.
-# saveRDS(sDat, file = "data/sDat.rds")
+# saveRDS(sDat, file = "data/sDat.rds")
-# saveRDS(REFdat, file = "data/REFdat.rds")
+# saveRDS(REFdat, file = "data/REFdat.rds")
-
+
-
+
-
+
-# =    6  STUDENT NUMBERS  =====================================================
+# =    6  STUDENT NUMBERS  =====================================================
-#
+#
-# An asymmetric function to retrieve a MYSPE species
+# An asymmetric function to retrieve a MYSPE species
-#
+#
-sDat <- readRDS(file = "data/sDat.rds")
+sDat <- readRDS(file = "data/sDat.rds")
-
+
-students <- read.csv("../BCH441-2021-students.csv")
+students <- read.csv("../BCH441-2021-students.csv")
-sN <- students$Integration.ID
+sN <- students$Integration.ID
-sN <- sN[! is.na(sN)]
+sN <- sN[! is.na(sN)]
-sN <- as.character(sN)
+sN <- as.character(sN)
-sN <- c("1003141593", sN)  # will map to  "Sporothrix schenckii"
+sN <- c("1003141593", sN)  # will map to  "Sporothrix schenckii"
-
+
-set.seed(112358)
+set.seed(112358)
-theseSpecies <- sDat[sample(1:nrow(sDat)), ]
+theseSpecies <- sDat[sample(1:nrow(sDat)), ]
-all(sort(theseSpecies$name) == sort(sDat$name))
+all(sort(theseSpecies$name) == sort(sDat$name))
-nrow((theseSpecies))
+nrow((theseSpecies))
-(iX <- grep("Sporothrix schenckii", theseSpecies$name))
+(iX <- grep("Sporothrix schenckii", theseSpecies$name))
-theseSpecies <- rbind(theseSpecies[iX, ], theseSpecies[-iX, ])
+theseSpecies <- rbind(theseSpecies[iX, ], theseSpecies[-iX, ])
-rndMin <-  992000000
+rndMin <-  992000000
-rndMax <- 1020000000
+rndMax <- 1020000000
-N <- 10000
+N <- 10000
-keys <- as.character(sample(rndMin:rndMax, N + 1000))
+keys <- as.character(sample(rndMin:rndMax, N + 1000))
-keys <- keys[! (keys %in% sN)]
+keys <- keys[! (keys %in% sN)]
-keys <- keys[1:N]
+keys <- keys[1:N]
-keys[1:length(sN)] <- sN
+keys[1:length(sN)] <- sN
-
+
-nRep <- floor(N/nrow(theseSpecies))
+nRep <- floor(N/nrow(theseSpecies))
-MYSPEdat <- theseSpecies
+MYSPEdat <- theseSpecies
-for(i in 1:nRep) {
+for(i in 1:nRep) {
-  MYSPEdat <- rbind(MYSPEdat, theseSpecies)
+  MYSPEdat <- rbind(MYSPEdat, theseSpecies)
-}
+}
-MYSPEdat <- MYSPEdat[1:N, ]
+MYSPEdat <- MYSPEdat[1:N, ]
-for (i in 1:N) {
+for (i in 1:N) {
-  rownames(MYSPEdat)[i] <- digest::digest(keys[i], algo = "md5")
+  rownames(MYSPEdat)[i] <- digest::digest(keys[i], algo = "md5")
-}
+}
-set.seed(NULL)
+set.seed(NULL)
-MYSPEdat <- MYSPEdat[sample(1:N), ]
+MYSPEdat <- MYSPEdat[sample(1:N), ]
-
+
-# saveRDS(MYSPEdat, file = "data/MYSPEdat.rds")
+# saveRDS(MYSPEdat, file = "data/MYSPEdat.rds")
-
+
-# === validate
+# === validate
-x <- character()
+x <- character()
-for (n in sN) {
+for (n in sN) {
-  sp <- getMYSPE(n)
+  sp <- getMYSPE(n)
-  if (length(sp) != 1) {
+  if (length(sp) != 1) {
-    stop(print(as.character(n)))
+    stop(print(as.character(n)))
-  } else {
+  } else {
-    x <- c(x, sp)
+    x <- c(x, sp)
-  }
+  }
-}
+}
-
+
-# === species for late-comers
+# === species for late-comers
-y <- unique(MYSPEdat$species)
+y <- unique(MYSPEdat$species)
-print(y[!(y %in% x)])
+print(y[!(y %in% x)])
-
+
-
+
-# === validate
+# === validate
-l <- length(sN)
+l <- length(sN)
-sp <- character(l)
+sp <- character(l)
-for(i in 1:l) {
+for(i in 1:l) {
-  sp[i] <- getMYSPE(sN[i])
+  sp[i] <- getMYSPE(sN[i])
-}
+}
-any(duplicated(sp))
+any(duplicated(sp))
-length(unique(sp))
+length(unique(sp))
-which(! sDat$species %in% sp)  # these can be assigned to late-comers
+which(! sDat$species %in% sp)  # these can be assigned to late-comers
-
+
-# Done.
+# Done.
-
+
-# [END]
+# [END]
--- a/scripts/ABC-makeSTRINGedges.R
+++ b/scripts/ABC-makeSTRINGedges.R
@ -1,168 +1,168 @@
-# tocID <- "scripts/ABC-makeSTRINGedges.R"
+# tocID <- "scripts/ABC-makeSTRINGedges.R"
-#
+#
-# Create a subnetwork of high-confidence human STRING edges.
+# Create a subnetwork of high-confidence human STRING edges.
-#
+#
-# Notes:
+# Notes:
-#
+#
-#      The large source- datafile is NOT posted to github. If you want to
+#      The large source- datafile is NOT posted to github. If you want to
-#      experiment with the original data, download it and place it into your
+#      experiment with the original data, download it and place it into your
-#      local  ./data  directory.
+#      local  ./data  directory.
-#
+#
-#      STRING data source:
+#      STRING data source:
-#        Download page:
+#        Download page:
-# https://string-db.org/cgi/download.pl?species_text=Homo+sapiens
+# https://string-db.org/cgi/download.pl?species_text=Homo+sapiens
-#        Data: (127.6 Mb)
+#        Data: (127.6 Mb)
-# https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz
+# https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz
-#
+#
-# Version:  1.0
+# Version:  1.0
-#
+#
-# Date:     2020-09
+# Date:     2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions:
+# Versions:
-#           1.0    Rewrite
+#           1.0    Rewrite
-#
+#
-# TODO:
+# TODO:
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC> 
+#TOC> 
-#TOC>   Section  Title                             Line
+#TOC>   Section  Title                             Line
-#TOC> -------------------------------------------------
+#TOC> -------------------------------------------------
-#TOC>   1        Initialize                          44
+#TOC>   1        Initialize                          44
-#TOC>   2        Read STRING Data                    51
+#TOC>   2        Read STRING Data                    51
-#TOC>   3        Define cutoff and subset            63
+#TOC>   3        Define cutoff and subset            63
-#TOC>   4        Drop  duplicates                   103
+#TOC>   4        Drop  duplicates                   103
-#TOC>   5        Simple statistics                  127
+#TOC>   5        Simple statistics                  127
-#TOC>   6        Write to file                      160
+#TOC>   6        Write to file                      160
-#TOC> 
+#TOC> 
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  Initialize  ==========================================================
+# =    1  Initialize  ==========================================================
-
+
-if (! requireNamespace("readr", quietly = TRUE)) {
+if (! requireNamespace("readr", quietly = TRUE)) {
-  install.packages("readr")
+  install.packages("readr")
-}
+}
-
+
-
+
-# =    2  Read STRING Data  ====================================================
+# =    2  Read STRING Data  ====================================================
-
+
-# Read STRING Data (needs to be downloaded from database, see URL in Notes)
+# Read STRING Data (needs to be downloaded from database, see URL in Notes)
-# The .gz compressed version is 127.6MB, the uncompressed version is probably
+# The .gz compressed version is 127.6MB, the uncompressed version is probably
-# 848 Mb. Fortunately readr:: can read from compressed
+# 848 Mb. Fortunately readr:: can read from compressed
-# files, and does so automatically, based on the file extension.
+# files, and does so automatically, based on the file extension.
-( fn <- file.path("~", "9606.protein.links.full.v11.0.txt.gz") )
+( fn <- file.path("~", "9606.protein.links.full.v11.0.txt.gz") )
-STR <- readr::read_delim(fn, delim = " ")
+STR <- readr::read_delim(fn, delim = " ")
-nrow(STR)  #  11,759,454 rows
+nrow(STR)  #  11,759,454 rows
-head(STR)
+head(STR)
-
+
-
+
-# =    3  Define cutoff and subset  ============================================
+# =    3  Define cutoff and subset  ============================================
-
+
-# approximate distribution of combined_score
+# approximate distribution of combined_score
-hist(sample(STR$combined_score, 10000), breaks = 50, col = "#6699FF")
+hist(sample(STR$combined_score, 10000), breaks = 50, col = "#6699FF")
-
+
-# Let's table the counts >= 850 and plot them for better resolution.
+# Let's table the counts >= 850 and plot them for better resolution.
-
+
-myTb <- table(STR$combined_score[STR$combined_score >= 850])
+myTb <- table(STR$combined_score[STR$combined_score >= 850])
-is.unsorted(as.integer(names(myTb)))  # Good - they are all in order
+is.unsorted(as.integer(names(myTb)))  # Good - they are all in order
-
+
-plot(myTb, type = "b", cex = 0.5, col = "#BB0000")
+plot(myTb, type = "b", cex = 0.5, col = "#BB0000")
-myTb[myTb == max(myTb)]  # Apparently there is an algorithmic effect that
+myTb[myTb == max(myTb)]  # Apparently there is an algorithmic effect that
-                         # frequently assigns a combined score of 0.900
+                         # frequently assigns a combined score of 0.900
-
+
-# Let's plot these counts as cumulative sums, in reverse order, scaled
+# Let's plot these counts as cumulative sums, in reverse order, scaled
-# as combined scores.
+# as combined scores.
-myX <- 1 - (1:length(myTb)) / 1000   # x-values, decreasing
+myX <- 1 - (1:length(myTb)) / 1000   # x-values, decreasing
-plot(myX,
+plot(myX,
-     cumsum(myTb[length(myTb):1]),   # cumulative sum, decreasing
+     cumsum(myTb[length(myTb):1]),   # cumulative sum, decreasing
-     xlim = c(1.0, 0.85),            # reverse x-axis
+     xlim = c(1.0, 0.85),            # reverse x-axis
-     type = "l",
+     type = "l",
-     main = "STRING interactions for 9606 (top 600,000)",
+     main = "STRING interactions for 9606 (top 600,000)",
-     xlab = "combined_score",
+     xlab = "combined_score",
-     ylab = "cumulative counts",
+     ylab = "cumulative counts",
-     col = "#CC0000")
+     col = "#CC0000")
-abline(h = seq(50000, sum(myTb), by = 50000), lwd = 0.5, col = "#DDDDFF")
+abline(h = seq(50000, sum(myTb), by = 50000), lwd = 0.5, col = "#DDDDFF")
-
+
-# What's the cutoff for 100,000 edges?
+# What's the cutoff for 100,000 edges?
-which(cumsum(myTb[length(myTb):1]) >= 100000)[1] # p = 0.964
+which(cumsum(myTb[length(myTb):1]) >= 100000)[1] # p = 0.964
-
+
-# confirm
+# confirm
-sum(STR$combined_score >= 964) # 101,348
+sum(STR$combined_score >= 964) # 101,348
-abline(v = 0.964, lwd = 0.5, col = "#DDDDFF")
+abline(v = 0.964, lwd = 0.5, col = "#DDDDFF")
-
+
-# subset the table, and use only the protein IDs and the combined_score
+# subset the table, and use only the protein IDs and the combined_score
-STR <- STR[STR$combined_score >= 964,
+STR <- STR[STR$combined_score >= 964,
-            c("protein1", "protein2", "combined_score")]
+            c("protein1", "protein2", "combined_score")]
-colnames(STR) <- c("a", "b", "score")
+colnames(STR) <- c("a", "b", "score")
-
+
-
+
-# =    4  Drop  duplicates  ====================================================
+# =    4  Drop  duplicates  ====================================================
-
+
-# identify duplicate interactions by creating keys in a defined alphabetical
+# identify duplicate interactions by creating keys in a defined alphabetical
-# sort order, then checking for  duplicated().
+# sort order, then checking for  duplicated().
-# e.g  if we have (X:U, U:X), we change U:X to X:U and now find that
+# e.g  if we have (X:U, U:X), we change U:X to X:U and now find that
-# (X:U, X:U) has a duplicate.
+# (X:U, X:U) has a duplicate.
-
+
-AB <- STR$a < STR$b        # logical vector: genes we need to swap
+AB <- STR$a < STR$b        # logical vector: genes we need to swap
-tmp <- STR$b               # copy column b
+tmp <- STR$b               # copy column b
-STR$b[AB] <- STR$a[AB]     # copy a's into b
+STR$b[AB] <- STR$a[AB]     # copy a's into b
-STR$a[AB] <- tmp[AB]       # copy tmp's into a
+STR$a[AB] <- tmp[AB]       # copy tmp's into a
-all(STR$a >= STR$b)        # confirm: TRUE
+all(STR$a >= STR$b)        # confirm: TRUE
-
+
-# now, make combined keys, like this:
+# now, make combined keys, like this:
-paste0(STR$a[1:10], ":", STR$b[1:10])
+paste0(STR$a[1:10], ":", STR$b[1:10])
-
+
-tmp <- paste0(STR$a, ":", STR$b)
+tmp <- paste0(STR$a, ":", STR$b)
-sum(duplicated(tmp)) # That's half of them ... i.e. STRING reports
+sum(duplicated(tmp)) # That's half of them ... i.e. STRING reports
-                     # both a:b and b:a !
+                     # both a:b and b:a !
-
+
-# drop all duplicated interactions from tmp
+# drop all duplicated interactions from tmp
-STR <- STR[ ! duplicated(tmp), ]   # 50,674 interactions remain
+STR <- STR[ ! duplicated(tmp), ]   # 50,674 interactions remain
-
+
-
+
-# =    5  Simple statistics  ===================================================
+# =    5  Simple statistics  ===================================================
-
+
-# how many unique genes?
+# how many unique genes?
-length(unique(c(STR$a, STR$b)))   # 8,445
+length(unique(c(STR$a, STR$b)))   # 8,445
-
+
-# how many self-edges?
+# how many self-edges?
-sum(STR$a == STR$b)  # none
+sum(STR$a == STR$b)  # none
-
+
-# log(rank) / log(frequency)
+# log(rank) / log(frequency)
-myTbl <- table(c(STR$a, STR$b))
+myTbl <- table(c(STR$a, STR$b))
-myTbl <- myTbl[order(myTbl, decreasing = TRUE)]
+myTbl <- myTbl[order(myTbl, decreasing = TRUE)]
-
+
-hist(myTbl, breaks = 40, col = "#FFEEBB")
+hist(myTbl, breaks = 40, col = "#FFEEBB")
-
+
-# number of singletons
+# number of singletons
-sum(myTbl == 1) # almost a quarter
+sum(myTbl == 1) # almost a quarter
-
+
-# maximum?
+# maximum?
-myTbl[which(myTbl == max(myTbl))]  # 9606.ENSP00000360532: 465
+myTbl[which(myTbl == max(myTbl))]  # 9606.ENSP00000360532: 465
-                                   # Google: CDC5L
+                                   # Google: CDC5L
-
+
-# Zipf-plot
+# Zipf-plot
-plot(log(1:length(myTbl)), log(as.numeric(myTbl)),
+plot(log(1:length(myTbl)), log(as.numeric(myTbl)),
-     type = "b", cex = 0.7,
+     type = "b", cex = 0.7,
-     main = "STRINGedges - degrees",
+     main = "STRINGedges - degrees",
-     xlab = "log(rank)",
+     xlab = "log(rank)",
-     ylab = "log(frequency)",
+     ylab = "log(frequency)",
-     col = "#FFBB88")
+     col = "#FFBB88")
-
+
-sprintf("Average number of interactions: %5.2f",
+sprintf("Average number of interactions: %5.2f",
-         nrow(STR) / length(unique(c(STR$a, STR$b))))
+         nrow(STR) / length(unique(c(STR$a, STR$b))))
-
+
-
+
-# =    6  Write to file  =======================================================
+# =    6  Write to file  =======================================================
-
+
-saveRDS(STR, file = "./data/STRINGedges.rds")
+saveRDS(STR, file = "./data/STRINGedges.rds")
-
+
-# STRINGedges <- readRDS("./data/STRINGedges.rds")  # use this to restore the
+# STRINGedges <- readRDS("./data/STRINGedges.rds")  # use this to restore the
-                                                    # object when needed
+                                                    # object when needed
-
+
-
+
-# [END]
+# [END]
--- a/scripts/ABC-makeScCCnet.R
+++ b/scripts/ABC-makeScCCnet.R
@ -1,167 +1,167 @@
-# tocID <- "scripts/ABC-makeScCCnet.R"
+# tocID <- "scripts/ABC-makeScCCnet.R"
-#
+#
-# Create a subnetwork of high-confidence yeast genes with a "mitotic cell cycle"
+# Create a subnetwork of high-confidence yeast genes with a "mitotic cell cycle"
-# GOSlim annotation.
+# GOSlim annotation.
-#
+#
-# Boris Steipe for ABC learning units
+# Boris Steipe for ABC learning units
-#
+#
-# Notes:
+# Notes:
-#
+#
-#      The large source- datafiles are NOT posted to github. If you want to
+#      The large source- datafiles are NOT posted to github. If you want to
-#      experiment with your own code, download them and place them into your
+#      experiment with your own code, download them and place them into your
-#      local  ./data  directory.
+#      local  ./data  directory.
-#
+#
-#      STRING data source:
+#      STRING data source:
-#        Download page:
+#        Download page:
-# https://string-db.org/cgi/download.pl?species_text=Saccharomyces+cerevisiae
+# https://string-db.org/cgi/download.pl?species_text=Saccharomyces+cerevisiae
-#        Data: (20.1 mb)
+#        Data: (20.1 mb)
-# https://stringdb-static.org/download/protein.links.full.v11.0/4932.protein.links.full.v11.0.txt.gz
+# https://stringdb-static.org/download/protein.links.full.v11.0/4932.protein.links.full.v11.0.txt.gz
-#
+#
-#      GOSlim data source: (Note: this has moved from GO to SGD)
+#      GOSlim data source: (Note: this has moved from GO to SGD)
-#        Info page: https://www.yeastgenome.org/downloads
+#        Info page: https://www.yeastgenome.org/downloads
-#        Info page: http://sgd-archive.yeastgenome.org/curation/literature/
+#        Info page: http://sgd-archive.yeastgenome.org/curation/literature/
-#        Data: (3 mb)
+#        Data: (3 mb)
-# http://sgd-archive.yeastgenome.org/curation/literature/go_slim_mapping.tab
+# http://sgd-archive.yeastgenome.org/curation/literature/go_slim_mapping.tab
-#
+#
-#
+#
-# Version:  1.2
+# Version:  1.2
-#
+#
-# Date:     2017-10  -  2020-09
+# Date:     2017-10  -  2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
+#
-# Versions:
+# Versions:
-#           1.2    2020 Update. GO Slim Yeast mow at SGD
+#           1.2    2020 Update. GO Slim Yeast now at SGD
-#           1.1    Change from require() to requireNamespace(),
+#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout
+#                      use <package>::<function>() idiom throughout
-#           1.0    First code copied from 2016 material.
+#           1.0    First code copied from 2016 material.
-#
+#
-# TODO:
+# TODO:
-#
+#
-# ==============================================================================
+# ==============================================================================
-# SRCDIR <- "./instructor"
+# SRCDIR <- "./instructor"
-
+
-
+
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-#TOC> 
+#TOC> 
-#TOC>   Section  Title                                           Line
+#TOC>   Section  Title                                           Line
-#TOC> ---------------------------------------------------------------
+#TOC> ---------------------------------------------------------------
-#TOC>   1        INITIALIZE                                        58
+#TOC>   1        INITIALIZE                                        58
-#TOC>   2        STRING FUNCTIONAL INTERACTION DATA                66
+#TOC>   2        STRING FUNCTIONAL INTERACTION DATA                66
-#TOC>   3        GOSlim FUNCTIONAL ANNOTATIONS                     96
+#TOC>   3        GOSlim FUNCTIONAL ANNOTATIONS                     96
-#TOC>   3.1        Intersect interactions and annotations         122
+#TOC>   3.1        Intersect interactions and annotations         122
-#TOC>   4        DEFINE THE CELL-CYCLE NETWORK                    128
+#TOC>   4        DEFINE THE CELL-CYCLE NETWORK                    128
-#TOC> 
+#TOC> 
-#TOC> ==========================================================================
+#TOC> ==========================================================================
-
+
-
+
-# =    1  INITIALIZE  ==========================================================
+# =    1  INITIALIZE  ==========================================================
-
+
-SRCDIR <- "./data"
+SRCDIR <- "./data"
-if (! requireNamespace("readr", quietly = TRUE)) {
+if (! requireNamespace("readr", quietly = TRUE)) {
-  install.packages("readr")
+  install.packages("readr")
-}
+}
-
+
-
+
-# =    2  STRING FUNCTIONAL INTERACTION DATA  ==================================
+# =    2  STRING FUNCTIONAL INTERACTION DATA  ==================================
-
+
-# Read STRING Data (needs to be downloaded from database, see URL in Notes)
+# Read STRING Data (needs to be downloaded from database, see URL in Notes)
-# The .gz compressed version is 20MB, the uncompressed versioj is 110MB -
+# The .gz compressed version is 20MB, the uncompressed version is 110MB -
-# really not necessary to uncompress since readr:: can read from compressed
+# really not necessary to uncompress since readr:: can read from compressed
-# files, and does so automatically, based on the file extension.
+# files, and does so automatically, based on the file extension.
-( fn <- file.path(SRCDIR, "4932.protein.links.full.v11.0.txt.gz") )
+( fn <- file.path(SRCDIR, "4932.protein.links.full.v11.0.txt.gz") )
-STR <- readr::read_delim(fn, delim = " ")
+STR <- readr::read_delim(fn, delim = " ")
-
+
-# Subset only IDs and combined_score column
+# Subset only IDs and combined_score column
-STR <- STR[ , c("protein1", "protein2", "combined_score")]
+STR <- STR[ , c("protein1", "protein2", "combined_score")]
-
+
-# head(STR)
+# head(STR)
-# sum(STR$combined_score > 909)  # 100270 edges
+# sum(STR$combined_score > 909)  # 100270 edges
-# subset for 100,000 highest confidence edges
+# subset for 100,000 highest confidence edges
-STR <- STR[(STR$combined_score > 909), ]
+STR <- STR[(STR$combined_score > 909), ]
-head(STR)
+head(STR)
-
+
-# IDs are formatted like 4932.YAL005C ... drop the "4932." prefix
+# IDs are formatted like 4932.YAL005C ... drop the "4932." prefix
-STR$protein1 <- gsub("^4932\\.", "", STR$protein1)
+STR$protein1 <- gsub("^4932\\.", "", STR$protein1)
-STR$protein2 <- gsub("^4932\\.", "", STR$protein2)
+STR$protein2 <- gsub("^4932\\.", "", STR$protein2)
-head(STR)
+head(STR)
-
+
-# get a vector of gene names in this list
+# get a vector of gene names in this list
-myIntxGenes <- unique(c(STR$protein1, STR$protein2))  # yeast systematic gene
+myIntxGenes <- unique(c(STR$protein1, STR$protein2))  # yeast systematic gene
-                                                      # names
+                                                      # names
-length(myIntxGenes)
+length(myIntxGenes)
-sample(myIntxGenes, 10)  # choose 10 at random (sanity check)
+sample(myIntxGenes, 10)  # choose 10 at random (sanity check)
-
+
-
+
-# =    3  GOSlim FUNCTIONAL ANNOTATIONS  =======================================
+# =    3  GOSlim FUNCTIONAL ANNOTATIONS  =======================================
-#
+#
-# Read GOSlim data  (needs to be downloaded from database, see URL in Notes)
+# Read GOSlim data  (needs to be downloaded from database, see URL in Notes)
-( fn <- file.path(SRCDIR, "go_slim_mapping.tab") )
+( fn <- file.path(SRCDIR, "go_slim_mapping.tab") )
-
+
-Gsl <- readr::read_tsv(fn,
+Gsl <- readr::read_tsv(fn,
-                       col_names = c("ID",
+                       col_names = c("ID",
-                                     "name",
+                                     "name",
-                                     "SGDId",
+                                     "SGDId",
-                                     "Ontology",
+                                     "Ontology",
-                                     "termName",
+                                     "termName",
-                                     "termID",
+                                     "termID",
-                                     "status"))
+                                     "status"))
-
+
-head(Gsl)
+head(Gsl)
-
+
-# What cell cycle names does it contain?
+# What cell cycle names does it contain?
-myGslTermNames <- unique(Gsl$termName)  # 169 unique terms
+myGslTermNames <- unique(Gsl$termName)  # 169 unique terms
-myGslTermNames[grep("cycle", myGslTermNames)]
+myGslTermNames[grep("cycle", myGslTermNames)]
-# [1] "regulation of cell cycle"  "mitotic cell cycle"  "meiotic cell cycle"
+# [1] "regulation of cell cycle"  "mitotic cell cycle"  "meiotic cell cycle"
-
+
-# Choose "mitotic cell cycle" as the GOslim term to subset with
+# Choose "mitotic cell cycle" as the GOslim term to subset with
-
+
-scCCgenes <- unique(Gsl$ID[Gsl$termName == "mitotic cell cycle"])
+scCCgenes <- unique(Gsl$ID[Gsl$termName == "mitotic cell cycle"])
-length(scCCgenes)  # 324 genes annotated to that term
+length(scCCgenes)  # 324 genes annotated to that term
-
+
-# ==   3.1  Intersect interactions and annotations  ============================
+# ==   3.1  Intersect interactions and annotations  ============================
-
+
-sum(scCCgenes %in% myIntxGenes)  # 307 of these have high-confidence
+sum(scCCgenes %in% myIntxGenes)  # 307 of these have high-confidence
-#                                # functional interactions
+#                                # functional interactions
-
+
-
+
-# =    4  DEFINE THE CELL-CYCLE NETWORK  =======================================
+# =    4  DEFINE THE CELL-CYCLE NETWORK  =======================================
-#
+#
-# Define scCCnet ... the S. Cervisiae Cell Cycle network
+# Define scCCnet ... the S. cerevisiae Cell Cycle network
-# Subset all rows for which BOTH genes are in the GOslim cell cycle set
+# Subset all rows for which BOTH genes are in the GOslim cell cycle set
-#
+#
-scCCnet <- STR[(STR$protein1 %in% scCCgenes) &
+scCCnet <- STR[(STR$protein1 %in% scCCgenes) &
-               (STR$protein2 %in% scCCgenes), ]
+               (STR$protein2 %in% scCCgenes), ]
-
+
-# How many genes are there?
+# How many genes are there?
-length(unique(c(scCCnet$protein1, scCCnet$protein2)))  #283
+length(unique(c(scCCnet$protein1, scCCnet$protein2)))  #283
-
+
-# Each edge is listed twice - now remove duplicates.
+# Each edge is listed twice - now remove duplicates.
-
+
-# Step 1: make a vector: sort two names so the fiRst one is alphabetically
+# Step 1: make a vector: sort two names so the first one is alphabetically
-#         smaller Than the second one. This brings the two names into a defined
+#         smaller than the second one. This brings the two names into a defined
-#         order. Then concatenate them with a "." - the resulting string
+#         order. Then concatenate them with a "." - the resulting string
-#         is always the same, for any order. E.g. c("A", "B") gives "A.B"
+#         is always the same, for any order. E.g. c("A", "B") gives "A.B"
-#         and c("B", "A") also gives "A.B". This identifies duplicates.
+#         and c("B", "A") also gives "A.B". This identifies duplicates.
-
+
-x <- apply(cbind(scCCnet$protein1, scCCnet$protein2),
+x <- apply(cbind(scCCnet$protein1, scCCnet$protein2),
-           1,
+           1,
-           FUN = function(x) { return(paste(sort(x), collapse = ".")) })
+           FUN = function(x) { return(paste(sort(x), collapse = ".")) })
-head(x) # "YAL016W.YGR040W" "YAL016W.YOR014W" "YAL016W.YDL188C" ... etc.
+head(x) # "YAL016W.YGR040W" "YAL016W.YOR014W" "YAL016W.YDL188C" ... etc.
-
+
-sum(duplicated(x))  # 1453
+sum(duplicated(x))  # 1453
-
+
-# Step 2: drop all rows that contain duplicates in x
+# Step 2: drop all rows that contain duplicates in x
-scCCnet <- scCCnet[! duplicated(x), ]
+scCCnet <- scCCnet[! duplicated(x), ]
-
+
-# Confirm we didn't loose genes
+# Confirm we didn't lose genes
-length(unique(c(scCCnet$protein1, scCCnet$protein2)))  # 283, no change
+length(unique(c(scCCnet$protein1, scCCnet$protein2)))  # 283, no change
-nrow(scCCnet)
+nrow(scCCnet)
-# Network has 283 nodes, 1453 edges
+# Network has 283 nodes, 1453 edges
-
+
-saveRDS(scCCnet, file = "./data/scCCnet.rds")
+saveRDS(scCCnet, file = "./data/scCCnet.rds")
-
+
-# scCCnet <- readRDS("./data/scCCnet.rds")   # <<<- use this to restore the
+# scCCnet <- readRDS("./data/scCCnet.rds")   # <<<- use this to restore the
-                                             #      object when needed
+                                             #      object when needed
-
+
-
+
-# [END]
+# [END]
--- a/scripts/ABC-writeALN.R
+++ b/scripts/ABC-writeALN.R
@ -1,135 +1,135 @@
-# tocID <- "scripts/ABC-writeALN.R"
+# tocID <- "scripts/ABC-writeALN.R"
-#
+#
-# ToDo:    calculate consensus line
+# ToDo:    calculate consensus line
-#          append sequence numbers
+#          append sequence numbers
-# Notes:
+# Notes:
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-writeALN <- function(ali,
+writeALN <- function(ali,
-                     range,
+                     range,
-                     note = "",
+                     note = "",
-                     myCon = stdout(),
+                     myCon = stdout(),
-                     blockWidth = 60) {
+                     blockWidth = 60) {
-  # Purpose:
+  # Purpose:
-  #     Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or
+  #     Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or
-  #     a file in multi-FASTA format.
+  #     a file in multi-FASTA format.
-  # Version: 2.0
+  # Version: 2.0
-  # Date:    2017 10
+  # Date:    2017 10
-  # Author:  Boris Steipe
+  # Author:  Boris Steipe
-  #
+  #
-  # Parameters:
+  # Parameters:
-  #     ali             MsaAAMultipleAlignment or AAStringSet or character
+  #     ali             MsaAAMultipleAlignment or AAStringSet or character
-  #                       vector.
+  #                       vector.
-  #     range      num  a two-integer vector of start and end positions if
+  #     range      num  a two-integer vector of start and end positions if
-  #                       only a range of the MSA should be written, e.g.
+  #                       only a range of the MSA should be written, e.g.
-  #                       a domain. Defaults to the full alignment length.
+  #                       a domain. Defaults to the full alignment length.
-  #     note       chr  a vector of character that is appended to the name
+  #     note       chr  a vector of character that is appended to the name
-  #                       of a sequence in the FASTA header. Recycling of
+  #                       of a sequence in the FASTA header. Recycling of
-  #                       shorter vectors applies, thus a vector of length one
+  #                       shorter vectors applies, thus a vector of length one
-  #                       is added to all headers.
+  #                       is added to all headers.
-  #     myCon           a connection (cf. the con argument for writeLines).
+  #     myCon           a connection (cf. the con argument for writeLines).
-  #                       Defaults to stdout()
+  #                       Defaults to stdout()
-  #     blockWidth int  width of sequence block. Default 80 characters.
+  #     blockWidth int  width of sequence block. Default 80 characters.
-  # Value:
+  # Value:
-  #     NA   the function is invoked for its side effect of printing an
+  #     NA   the function is invoked for its side effect of printing an
-  #          alignment to stdout() or file.
+  #          alignment to stdout() or file.
-
+
-  blockWidth <- as.integer(blockWidth)
+  blockWidth <- as.integer(blockWidth)
-  if (is.na(blockWidth)) {
+  if (is.na(blockWidth)) {
-    stop("PANIC: parameter \"blockWidth\" must be numeric.")
+    stop("PANIC: parameter \"blockWidth\" must be numeric.")
-  }
+  }
-  if (blockWidth < 1) {
+  if (blockWidth < 1) {
-    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
+    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
-  }
+  }
-  if (blockWidth > 60) {
+  if (blockWidth > 60) {
-    warning("Programs that read CLUSTAL format might not expect blockWidth > 60.")
+    warning("Programs that read CLUSTAL format might not expect blockWidth > 60.")
-  }
+  }
-
+
-  # Extract the raw data from the objects depending on their respective class
+  # Extract the raw data from the objects depending on their respective class
-  # and put it into a named vector of strings.
+  # and put it into a named vector of strings.
-
+
-  # Extract XStringSet from MsaXMultipleAlignment ...
+  # Extract XStringSet from MsaXMultipleAlignment ...
-  if (class(ali) == "MsaAAMultipleAlignment" |
+  if (class(ali) == "MsaAAMultipleAlignment" |
-      class(ali) == "MsaDNAMultipleAlignment" |
+      class(ali) == "MsaDNAMultipleAlignment" |
-      class(ali) == "MsaRNAMultipleAlignment") {
+      class(ali) == "MsaRNAMultipleAlignment") {
-      ali <- ali@unmasked
+      ali <- ali@unmasked
-  }
+  }
-
+
-  # Process XStringSet
+  # Process XStringSet
-  if (class(ali) == "AAStringSet" |
+  if (class(ali) == "AAStringSet" |
-      class(ali) == "DNAStringSet" |
+      class(ali) == "DNAStringSet" |
-      class(ali) == "RNAStringSet") {
+      class(ali) == "RNAStringSet") {
-    sSet <- as.character(ali) # we use as.character(), not toString() thus
+    sSet <- as.character(ali) # we use as.character(), not toString() thus
-                              # we don't _have_ to load Biostrings
+                              # we don't _have_ to load Biostrings
-  } else if (class(ali) == "character") {
+  } else if (class(ali) == "character") {
-    sSet <- ali
+    sSet <- ali
-  } else {
+  } else {
-    stop(paste("Input object of class",
+    stop(paste("Input object of class",
-               class(ali),
+               class(ali),
-               "can't be handled by this function."))
+               "can't be handled by this function."))
-  }
+  }
-
+
-  if (missing(range)) {
+  if (missing(range)) {
-    range <- 1
+    range <- 1
-    range[2] <- max(nchar(sSet))
+    range[2] <- max(nchar(sSet))
-  } else {
+  } else {
-    range <- as.integer(range)
+    range <- as.integer(range)
-    if(length(range) != 2 ||
+    if(length(range) != 2 ||
-       any(is.na(range)) ||
+       any(is.na(range)) ||
-       range[1] > range[2] ||
+       range[1] > range[2] ||
-       range[1] < 1) {
+       range[1] < 1) {
-      stop("PANIC: \"range\" parameter must contain valid start and end index.")
+      stop("PANIC: \"range\" parameter must contain valid start and end index.")
-    }
+    }
-  }
+  }
-
+
-  # Right-pad any sequence with "-" that is shorter than ranges[2]
+  # Right-pad any sequence with "-" that is shorter than ranges[2]
-    for (i in seq_along(sSet)) {
+    for (i in seq_along(sSet)) {
-      if (nchar(sSet[i]) < range[2]) {
+      if (nchar(sSet[i]) < range[2]) {
-        sSet[i] <- paste0(sSet[i],
+        sSet[i] <- paste0(sSet[i],
-                          paste0(rep("-", range[2] - nchar(sSet[i])),
+                          paste0(rep("-", range[2] - nchar(sSet[i])),
-                                 collapse = ""))
+                                 collapse = ""))
-      }
+      }
-    }
+    }
-
+
-  # Right-pad sequence names
+  # Right-pad sequence names
-  sNames <- names(sSet)
+  sNames <- names(sSet)
-  len <- max(nchar(sNames)) + 2 # longest name plus two spaces
+  len <- max(nchar(sNames)) + 2 # longest name plus two spaces
-  for (i in seq_along(sNames)) {
+  for (i in seq_along(sNames)) {
-    sNames[i] <- paste0(sNames[i],
+    sNames[i] <- paste0(sNames[i],
-                      paste0(rep(" ", len - nchar(sNames[i])),
+                      paste0(rep(" ", len - nchar(sNames[i])),
-                             collapse = ""))
+                             collapse = ""))
-  }
+  }
-
+
-
+
-  # Process each sequence
+  # Process each sequence
-  txt <- paste0("CLUSTAL W format. ", note)
+  txt <- paste0("CLUSTAL W format. ", note)
-  txt[2] <- ""
+  txt[2] <- ""
-
+
-  iStarts <- seq(range[1], range[2], by = blockWidth)
+  iStarts <- seq(range[1], range[2], by = blockWidth)
-  iEnds <- c((iStarts[-1] - 1), range[2])
+  iEnds <- c((iStarts[-1] - 1), range[2])
-
+
-  for (i in seq_along(iStarts)) {
+  for (i in seq_along(iStarts)) {
-    for (j in seq_along(sSet)) {
+    for (j in seq_along(sSet)) {
-      txt <- c(txt,
+      txt <- c(txt,
-               paste0(sNames[j], substring(sSet[j], iStarts[i], iEnds[i])))
+               paste0(sNames[j], substring(sSet[j], iStarts[i], iEnds[i])))
-    }
+    }
-    txt <- c(txt, "")  # append a blank consenus line
+    txt <- c(txt, "")  # append a blank consenus line
-    txt <- c(txt, "")  # append a separator line
+    txt <- c(txt, "")  # append a separator line
-  }
+  }
-
+
-  writeLines(txt, con= myCon)
+  writeLines(txt, con= myCon)
-
+
-}
+}
-
+
-# ====  TESTS  =================================================================
+# ====  TESTS  =================================================================
-# Enter your function tests here...
+# Enter your function tests here...
-
+
-if (FALSE) {
+if (FALSE) {
-  # test ...
+  # test ...
-}
+}
-
+
-
+
-
+
-# [END]
+# [END]
--- a/scripts/ABC-writeMFA.R
+++ b/scripts/ABC-writeMFA.R
@ -1,121 +1,121 @@
-# ABC-writeMFA.R
+# ABC-writeMFA.R
-#
+#
-# ToDo:
+# ToDo:
-# Notes:  2.1  bugfix: empty notes caused superfluous blank after header.
+# Notes:  2.1  bugfix: empty notes caused superfluous blank after header.
-#
+#
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-writeMFA <- function(ali,
+writeMFA <- function(ali,
-                     range,
+                     range,
-                     note = "",
+                     note = "",
-                     myCon = stdout(),
+                     myCon = stdout(),
-                     blockWidth = 80) {
+                     blockWidth = 80) {
-  # Purpose:
+  # Purpose:
-  #     Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or
+  #     Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or
-  #     a file in multi-FASTA format.
+  #     a file in multi-FASTA format.
-  # Version: 2.1
+  # Version: 2.1
-  # Date:    2017  10
+  # Date:    2017  10
-  # Author:  Boris Steipe
+  # Author:  Boris Steipe
-  #
+  #
-  # Parameters:
+  # Parameters:
-  #     ali             MsaAAMultipleAlignment or AAStringSet or character
+  #     ali             MsaAAMultipleAlignment or AAStringSet or character
-  #                       vector
+  #                       vector
-  #     range      num  a two-integer vector of start and end positions if
+  #     range      num  a two-integer vector of start and end positions if
-  #                       only a range of the MSA should be written, e.g.
+  #                       only a range of the MSA should be written, e.g.
-  #                       a domain. Defaults to the full sequence length.
+  #                       a domain. Defaults to the full sequence length.
-  #     note       chr  a vector of character that is appended to the name
+  #     note       chr  a vector of character that is appended to the name
-  #                       of a sequence in the FASTA header. Recycling of
+  #                       of a sequence in the FASTA header. Recycling of
-  #                       shorter vectors applies, thus a vector of length one
+  #                       shorter vectors applies, thus a vector of length one
-  #                       is added to all headers.
+  #                       is added to all headers.
-  #     myCon           a connection (cf. the con argument for writeLines).
+  #     myCon           a connection (cf. the con argument for writeLines).
-  #                       Defaults to stdout()
+  #                       Defaults to stdout()
-  #     blockWidth int  width of sequence block. Default 80 characters.
+  #     blockWidth int  width of sequence block. Default 80 characters.
-  # Value:
+  # Value:
-  #     NA   the function is invoked for its side effect of printing an
+  #     NA   the function is invoked for its side effect of printing an
-  #          alignment to stdout() or file.
+  #          alignment to stdout() or file.
-
+
-  blockWidth <- as.integer(blockWidth)
+  blockWidth <- as.integer(blockWidth)
-  if (is.na(blockWidth)) {
+  if (is.na(blockWidth)) {
-    stop("PANIC: parameter \"blockWidth\" must be numeric.")
+    stop("PANIC: parameter \"blockWidth\" must be numeric.")
-  }
+  }
-  if (! blockWidth > 0){
+  if (! blockWidth > 0){
-    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
+    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
-  }
+  }
-
+
-  # Extract the raw data from the objects depending on their respective class
+  # Extract the raw data from the objects depending on their respective class
-  # and put it into a named vector of strings.
+  # and put it into a named vector of strings.
-
+
-  # Extract XStringSet from MsaXMultipleAlignment ...
+  # Extract XStringSet from MsaXMultipleAlignment ...
-  if (class(ali) == "MsaAAMultipleAlignment" |
+  if (class(ali) == "MsaAAMultipleAlignment" |
-      class(ali) == "MsaDNAMultipleAlignment" |
+      class(ali) == "MsaDNAMultipleAlignment" |
-      class(ali) == "MsaRNAMultipleAlignment") {
+      class(ali) == "MsaRNAMultipleAlignment") {
-      ali <- ali@unmasked
+      ali <- ali@unmasked
-  }
+  }
-
+
-  # Process XStringSet
+  # Process XStringSet
-  if (class(ali) == "AAStringSet" |
+  if (class(ali) == "AAStringSet" |
-      class(ali) == "DNAStringSet" |
+      class(ali) == "DNAStringSet" |
-      class(ali) == "RNAStringSet") {
+      class(ali) == "RNAStringSet") {
-    sSet <- as.character(ali) # we use as.character(), not toString() thus
+    sSet <- as.character(ali) # we use as.character(), not toString() thus
-                              # we don't _have_ to load Biostrings
+                              # we don't _have_ to load Biostrings
-  } else if (class(ali) == "character") {
+  } else if (class(ali) == "character") {
-    sSet <- ali
+    sSet <- ali
-  } else {
+  } else {
-    stop(paste("Input object of class",
+    stop(paste("Input object of class",
-               class(ali),
+               class(ali),
-               "can't be handled by this function."))
+               "can't be handled by this function."))
-  }
+  }
-
+
-  if (missing(range)) {
+  if (missing(range)) {
-    range <- 1
+    range <- 1
-    range[2] <- max(nchar(sSet))
+    range[2] <- max(nchar(sSet))
-  } else {
+  } else {
-    range <- as.integer(range)
+    range <- as.integer(range)
-    if(length(range) != 2 ||
+    if(length(range) != 2 ||
-       any(is.na(range)) ||
+       any(is.na(range)) ||
-       range[1] > range[2] ||
+       range[1] > range[2] ||
-       range[1] < 1) {
+       range[1] < 1) {
-      stop("PANIC: \"range\" parameter must contain valid start and end index.")
+      stop("PANIC: \"range\" parameter must contain valid start and end index.")
-    }
+    }
-  }
+  }
-
+
-  # Process each sequence
+  # Process each sequence
-  txt <- character()
+  txt <- character()
-  if (note != "") {  # construct header line
+  if (note != "") {  # construct header line
-    headers <- paste(names(sSet), note)
+    headers <- paste(names(sSet), note)
-  } else {
+  } else {
-    headers <- names(sSet)
+    headers <- names(sSet)
-  }
+  }
-
+
-  for (i in seq_along(sSet)) {
+  for (i in seq_along(sSet)) {
-
+
-    # output FASTA header
+    # output FASTA header
-    txt <- c(txt, sprintf(">%s", headers[i]))
+    txt <- c(txt, sprintf(">%s", headers[i]))
-
+
-    # output the sequence in blocks of blockWidth per line ...
+    # output the sequence in blocks of blockWidth per line ...
-    iStarts <- seq(range[1], range[2], by = blockWidth)
+    iStarts <- seq(range[1], range[2], by = blockWidth)
-    iEnds <- c((iStarts[-1] - 1), range[2])
+    iEnds <- c((iStarts[-1] - 1), range[2])
-
+
-    thisSeq <- substring(sSet[i], iStarts, iEnds)  # collect all blocks
+    thisSeq <- substring(sSet[i], iStarts, iEnds)  # collect all blocks
-    thisSeq <- thisSeq[! nchar(thisSeq) == 0]      # drop empty blocks
+    thisSeq <- thisSeq[! nchar(thisSeq) == 0]      # drop empty blocks
-    txt <- c(txt, thisSeq)
+    txt <- c(txt, thisSeq)
-
+
-    txt <- c(txt, "")  # append an empty line for readability
+    txt <- c(txt, "")  # append an empty line for readability
-  }
+  }
-
+
-  writeLines(txt, con = myCon)
+  writeLines(txt, con = myCon)
-
+
-}
+}
-
+
-# ====  TESTS  =================================================================
+# ====  TESTS  =================================================================
-# Enter your function tests here...
+# Enter your function tests here...
-
+
-if (FALSE) {
+if (FALSE) {
-  # test ...
+  # test ...
-}
+}
-
+
-
+
-
+
-# [END]
+# [END]
--- a/scripts/BLAST.R
+++ b/scripts/BLAST.R
@ -1,384 +1,384 @@
-# BLAST.R
+# BLAST.R
-#
+#
-# Purpose: Send off one BLAST search and return parsed list of results
+# Purpose: Send off one BLAST search and return parsed list of results
-#          This script uses the BLAST URL-API
+#          This script uses the BLAST URL-API
-#          (Application Programming Interface) at the NCBI.
+#          (Application Programming Interface) at the NCBI.
-#          Read about the constraints here:
+#          Read about the constraints here:
-#          https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
+#          https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
-#
+#
-#
+#
-# Version: 3.2
+# Version: 3.2
-# Date:    2016 09 - 2020 09
+# Date:    2016 09 - 2020 09
-# Author:  Boris Steipe
+# Author:  Boris Steipe
-#
+#
-# Versions:
+# Versions:
-#    3.2   2020 updates
+#    3.2   2020 updates
-#    3.1   Change from require() to requireNamespace(),
+#    3.1   Change from require() to requireNamespace(),
-#          use <package>::<function>() idiom throughout
+#          use <package>::<function>() idiom throughout
-#    3.0   parsing logic had not been fully implemented; Fixed.
+#    3.0   parsing logic had not been fully implemented; Fixed.
-#    2.1   bugfix in BLAST(), bug was blanking non-split deflines;
+#    2.1   bugfix in BLAST(), bug was blanking non-split deflines;
-#          refactored parseBLASTalignment() to handle lists with multiple hits.
+#          refactored parseBLASTalignment() to handle lists with multiple hits.
-#    2.0   Completely rewritten because the interface completely changed.
+#    2.0   Completely rewritten because the interface completely changed.
-#          Code adpated in part from NCBI Perl sample code:
+#          Code adpated in part from NCBI Perl sample code:
-#          $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
+#          $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
-#    1.0   first version posted for BCH441 2016, based on BLAST - API
+#    1.0   first version posted for BCH441 2016, based on BLAST - API
-#
+#
-# ToDo:    Return the organism/strain name in the output, and propagate
+# ToDo:    Return the organism/strain name in the output, and propagate
-#          into MYSPE selection script.
+#          into MYSPE selection script.
-#
+#
-# Notes:   This is somewhat pedestrian, but apparently there are currently
+# Notes:   This is somewhat pedestrian, but apparently there are currently
-#          no R packages that contain such code.
+#          no R packages that contain such code.
-#
+#
-# ==============================================================================
+# ==============================================================================
-
+
-
+
-if (! requireNamespace("httr", quietly = TRUE)) {
+if (! requireNamespace("httr", quietly = TRUE)) {
-  install.packages("httr")
+  install.packages("httr")
-}
+}
-
+
-
+
-BLAST <- function(Q,
+BLAST <- function(Q,
-                  db = "refseq_protein",
+                  db = "refseq_protein",
-                  nHits = 30,
+                  nHits = 30,
-                  E = 0.1,
+                  E = 0.1,
-                  limits = "",
+                  limits = "",
-                  rid = "",
+                  rid = "",
-                  query = "",
+                  query = "",
-                  quietly = FALSE,
+                  quietly = FALSE,
-                  myTimeout = 120) {
+                  myTimeout = 120) {
-    # Purpose:
+    # Purpose:
-    #     Basic BLAST search
+    #     Basic BLAST search
-    #
+    #
-    # Parameters:
+    # Parameters:
-    #     Q: query - either a valid ID or a sequence
+    #     Q: query - either a valid ID or a sequence
-    #     db: "refseq_protein" by default,
+    #     db: "refseq_protein" by default,
-    #         other legal values include: "nr", "pdb", "swissprot" ...
+    #         other legal values include: "nr", "pdb", "swissprot" ...
-    #     nHits: number of hits to maximally return
+    #     nHits: number of hits to maximally return
-    #     E: E-value cutoff. Do not return hits whose score would be expected
+    #     E: E-value cutoff. Do not return hits whose score would be expected
-    #        to occur E or more times in a database of random sequence.
+    #        to occur E or more times in a database of random sequence.
-    #     limits: a valid ENTREZ filter
+    #     limits: a valid ENTREZ filter
-    #     rid: a request ID - to retrieve earlier search results
+    #     rid: a request ID - to retrieve earlier search results
-    #     query: the actual query string (needed when retrieving results
+    #     query: the actual query string (needed when retrieving results
-    #            with an rid)
+    #            with an rid)
-    #     quietly: controls printing of wait-time progress bar
+    #     quietly: controls printing of wait-time progress bar
-    #     timeout: how much longer _after_ rtoe to wait for a result
+    #     timeout: how much longer _after_ rtoe to wait for a result
-    #              before giving up (seconds)
+    #              before giving up (seconds)
-    # Value:
+    # Value:
-    #     result: list of process status or resulting hits, and some metadata
+    #     result: list of process status or resulting hits, and some metadata
-
+
-
+
-    EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done
+    EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done
-
+
-    results <- list()
+    results <- list()
-    results$query = query
+    results$query = query
-    results$rid <- rid
+    results$rid <- rid
-    results$rtoe <- 0
+    results$rtoe <- 0
-
+
-    if (rid == "") {  # If no rid is available, spawn a search.
+    if (rid == "") {  # If no rid is available, spawn a search.
-                      # Else, proceed directly to retrieval.
+                      # Else, proceed directly to retrieval.
-
+
-      # prepare query, GET(), and parse rid and rtoe from BLAST server response
+      # prepare query, GET(), and parse rid and rtoe from BLAST server response
-      results$query <- paste0("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
+      results$query <- paste0("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
-                              "?",
+                              "?",
-                              "CMD=Put",
+                              "CMD=Put",
-                              "&PROGRAM=", "blastp",
+                              "&PROGRAM=", "blastp",
-                              "&QUERY=", URLencode(Q),
+                              "&QUERY=", URLencode(Q),
-                              "&DATABASE=", db,
+                              "&DATABASE=", db,
-                              "&MATRIX=", "BLOSUM62",
+                              "&MATRIX=", "BLOSUM62",
-                              "&EXPECT=", as.character(E),
+                              "&EXPECT=", as.character(E),
-                              "&HITLIST_SIZE=", as.character(nHits),
+                              "&HITLIST_SIZE=", as.character(nHits),
-                              "&ALIGNMENTS=", as.character(nHits),
+                              "&ALIGNMENTS=", as.character(nHits),
-                              "&FORMAT_TYPE=Text")
+                              "&FORMAT_TYPE=Text")
-
+
-      if (limits != "") {
+      if (limits != "") {
-        results$query <- paste0(
+        results$query <- paste0(
-          results$query,
+          results$query,
-          "&ENTREZ_QUERY=", limits)
+          "&ENTREZ_QUERY=", limits)
-      }
+      }
-
+
-      # send it off ...
+      # send it off ...
-      response <- httr::GET(results$query)
+      response <- httr::GET(results$query)
-      if (httr::http_status(response)$category != "Success" ) {
+      if (httr::http_status(response)$category != "Success" ) {
-        stop(sprintf("PANIC: Can't send query. BLAST server status error: %s",
+        stop(sprintf("PANIC: Can't send query. BLAST server status error: %s",
-                     httr::http_status(response)$message))
+                     httr::http_status(response)$message))
-      }
+      }
-
+
-      txt <- httr::content(response, "text", encoding = "UTF-8")
+      txt <- httr::content(response, "text", encoding = "UTF-8")
-
+
-      patt <- "RID = (\\w+)" # match the request id
+      patt <- "RID = (\\w+)" # match the request id
-      results$rid  <- regmatches(txt, regexec(patt,  txt))[[1]][2]
+      results$rid  <- regmatches(txt, regexec(patt,  txt))[[1]][2]
-
+
-      patt <- "RTOE = (\\d+)" # match the expected completion time
+      patt <- "RTOE = (\\d+)" # match the expected completion time
-      results$rtoe <- as.numeric(regmatches(txt, regexec(patt, txt))[[1]][2])
+      results$rtoe <- as.numeric(regmatches(txt, regexec(patt, txt))[[1]][2])
-
+
-      # Now we wait ...
+      # Now we wait ...
-      if (quietly) {
+      if (quietly) {
-        Sys.sleep(results$rtoe)
+        Sys.sleep(results$rtoe)
-      } else {
+      } else {
-        cat(sprintf("BLAST is processing %s:\n", results$rid))
+        cat(sprintf("BLAST is processing %s:\n", results$rid))
-        waitTimer(results$rtoe)
+        waitTimer(results$rtoe)
-      }
+      }
-
+
-    } # done sending query and retrieving rid, rtoe
+    } # done sending query and retrieving rid, rtoe
-
+
-    # Enter an infinite loop to check for result availability
+    # Enter an infinite loop to check for result availability
-    checkStatus <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
+    checkStatus <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
-                         "?",
+                         "?",
-                         "CMD=Get",
+                         "CMD=Get",
-                         "&RID=", results$rid,
+                         "&RID=", results$rid,
-                         "&FORMAT_TYPE=Text",
+                         "&FORMAT_TYPE=Text",
-                         "&FORMAT_OBJECT=SearchInfo",
+                         "&FORMAT_OBJECT=SearchInfo",
-                         sep = "")
+                         sep = "")
-
+
-    while (TRUE) {
+    while (TRUE) {
-      # Check whether the result is ready
+      # Check whether the result is ready
-      response <- httr::GET(checkStatus)
+      response <- httr::GET(checkStatus)
-      if (httr::http_status(response)$category != "Success" ) {
+      if (httr::http_status(response)$category != "Success" ) {
-        stop(sprintf("PANIC: Can't check status. BLAST server status error: %s",
+        stop(sprintf("PANIC: Can't check status. BLAST server status error: %s",
-                     httr::http_status(response)$message))
+                     httr::http_status(response)$message))
-      }
+      }
-
+
-      txt <- httr::content(response, "text", encoding = "UTF-8")
+      txt <- httr::content(response, "text", encoding = "UTF-8")
-
+
-      if (length(grep("Status=WAITING",  txt)) > 0) {
+      if (length(grep("Status=WAITING",  txt)) > 0) {
-        myTimeout <- myTimeout - EXTRAWAIT
+        myTimeout <- myTimeout - EXTRAWAIT
-
+
-        if (myTimeout <= 0) { # abort
+        if (myTimeout <= 0) { # abort
-          cat("BLAST search not concluded before timeout. Aborting.\n")
+          cat("BLAST search not concluded before timeout. Aborting.\n")
-          cat(sprintf("%s  BLASThits <- BLAST(rid=\"%s\")\n",
+          cat(sprintf("%s  BLASThits <- BLAST(rid=\"%s\")\n",
-                      "Trying checking back later with >",
+                      "Trying checking back later with >",
-                      results$rid))
+                      results$rid))
-          return(results)
+          return(results)
-        }
+        }
-
+
-        if (quietly) {
+        if (quietly) {
-          Sys.sleep(EXTRAWAIT)
+          Sys.sleep(EXTRAWAIT)
-        } else {
+        } else {
-          cat(sprintf("Status: Waiting. Wait %d more seconds (max. %d more)",
+          cat(sprintf("Status: Waiting. Wait %d more seconds (max. %d more)",
-                      EXTRAWAIT,
+                      EXTRAWAIT,
-                      myTimeout))
+                      myTimeout))
-          waitTimer(EXTRAWAIT)
+          waitTimer(EXTRAWAIT)
-          next
+          next
-        }
+        }
-
+
-      } else if (length(grep("Status=FAILED",  txt)) > 0) {
+      } else if (length(grep("Status=FAILED",  txt)) > 0) {
-          cat("BLAST search returned status \"FAILED\". Aborting.\n")
+          cat("BLAST search returned status \"FAILED\". Aborting.\n")
-          return(results)
+          return(results)
-
+
-      } else if (length(grep("Status=UNKNOWN",  txt)) > 0) {
+      } else if (length(grep("Status=UNKNOWN",  txt)) > 0) {
-          cat("BLAST search returned status \"UNKNOWN\".\n")
+          cat("BLAST search returned status \"UNKNOWN\".\n")
-          cat("This probably means the rid has expired. Aborting.\n")
+          cat("This probably means the rid has expired. Aborting.\n")
-          return(results)
+          return(results)
-
+
-      } else if (length(grep("Status=READY",  txt)) > 0) {  # Done
+      } else if (length(grep("Status=READY",  txt)) > 0) {  # Done
-
+
-          if (length(grep("ThereAreHits=yes",  txt)) == 0) {  # No hits
+          if (length(grep("ThereAreHits=yes",  txt)) == 0) {  # No hits
-            cat("BLAST search ready but no hits found. Aborting.\n")
+            cat("BLAST search ready but no hits found. Aborting.\n")
-            return(results)
+            return(results)
-
+
-          } else {
+          } else {
-            break  # done ... retrieve search result
+            break  # done ... retrieve search result
-          }
+          }
-      }
+      }
-    } # end result-check loop
+    } # end result-check loop
-
+
-    # retrieve results from BLAST server
+    # retrieve results from BLAST server
-    retrieve <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
+    retrieve <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
-                      "?",
+                      "?",
-                      "&CMD=Get",
+                      "&CMD=Get",
-                      "&RID=", results$rid,
+                      "&RID=", results$rid,
-                      "&FORMAT_TYPE=Text",
+                      "&FORMAT_TYPE=Text",
-                      sep = "")
+                      sep = "")
-
+
-    response <- httr::GET(retrieve)
+    response <- httr::GET(retrieve)
-    if (httr::http_status(response)$category != "Success" ) {
+    if (httr::http_status(response)$category != "Success" ) {
-      stop(sprintf("PANIC: Can't retrieve. BLAST server status error: %s",
+      stop(sprintf("PANIC: Can't retrieve. BLAST server status error: %s",
-                   httr::http_status(response)$message))
+                   httr::http_status(response)$message))
-    }
+    }
-
+
-    txt <- httr::content(response, "text", encoding = "UTF-8")
+    txt <- httr::content(response, "text", encoding = "UTF-8")
-
+
-    # txt contains the whole set of results. Process:
+    # txt contains the whole set of results. Process:
-
+
-    # First, we strsplit() on linebreaks:
+    # First, we strsplit() on linebreaks:
-    txt <- unlist(strsplit(txt, "\n"))
+    txt <- unlist(strsplit(txt, "\n"))
-
+
-    # The alignments range from the first line that begins with ">" ...
+    # The alignments range from the first line that begins with ">" ...
-    iFirst <- grep("^>", txt)[1]
+    iFirst <- grep("^>", txt)[1]
-
+
-    # ... to the last line that begins with "Sbjct"
+    # ... to the last line that begins with "Sbjct"
-    x <- grep("^Sbjct", txt)
+    x <- grep("^Sbjct", txt)
-    iLast <- x[length(x)]
+    iLast <- x[length(x)]
-
+
-    # Get the alignments block
+    # Get the alignments block
-    txt <- txt[iFirst:iLast]
+    txt <- txt[iFirst:iLast]
-
+
-    # Drop empty lines
+    # Drop empty lines
-    txt <- txt[!(nchar(txt) == 0)]
+    txt <- txt[!(nchar(txt) == 0)]
-
+
-    # A line that ends "]" but does not begin ">" seems to be a split
+    # A line that ends "]" but does not begin ">" seems to be a split
-    # defline ... eg.
+    # defline ... eg.
-    #  [1] ">XP_013349208.1 AUEXF2481DRAFT_695809 [Aureobasidium subglaciale "
+    #  [1] ">XP_013349208.1 AUEXF2481DRAFT_695809 [Aureobasidium subglaciale "
-    #  [2] "EXF-2481]"
+    #  [2] "EXF-2481]"
-    #  Merge these lines to the preceding lines and delete them.
+    #  Merge these lines to the preceding lines and delete them.
-    #
+    #
-    x <- which(grepl("]$", txt) & !(grepl("^>", txt)))
+    x <- which(grepl("]$", txt) & !(grepl("^>", txt)))
-    if (length(x) > 0) {
+    if (length(x) > 0) {
-      txt[x-1] <- paste0(txt[x-1], txt[x])
+      txt[x-1] <- paste0(txt[x-1], txt[x])
-      txt <- txt[-x]
+      txt <- txt[-x]
-    }
+    }
-
+
-    # Special case: there may be multiple deflines when the BLAST hit is to
+    # Special case: there may be multiple deflines when the BLAST hit is to
-    # redundant, identical sequences. Keep only the first instance.
+    # redundant, identical sequences. Keep only the first instance.
-    iKeep <- ! grepl("^>", txt)
+    iKeep <- ! grepl("^>", txt)
-    x <- rle(iKeep)
+    x <- rle(iKeep)
-    x$positions <- cumsum(x$lengths)
+    x$positions <- cumsum(x$lengths)
-    i <- which(x$lengths > 1 & x$values == FALSE)
+    i <- which(x$lengths > 1 & x$values == FALSE)
-    if (length(i) > 0) {
+    if (length(i) > 0) {
-      firsts <- x$positions[i] - x$lengths[i] + 1
+      firsts <- x$positions[i] - x$lengths[i] + 1
-      iKeep[firsts] <- TRUE
+      iKeep[firsts] <- TRUE
-      txt <- txt[iKeep]
+      txt <- txt[iKeep]
-    }
+    }
-
+
-    # After this preprocessing the following should be true:
+    # After this preprocessing the following should be true:
-    # - Every alignment block begins with a defline in which the
+    # - Every alignment block begins with a defline in which the
-    #   first character is ">"
+    #   first character is ">"
-    # - There is only one defline in each block.
+    # - There is only one defline in each block.
-    # - Lines are not split.
+    # - Lines are not split.
-
+
-    # Make a dataframe of first and last indices of alignment blocks
+    # Make a dataframe of first and last indices of alignment blocks
-    x <- grep("^>", txt)
+    x <- grep("^>", txt)
-    blocks <- data.frame(iFirst = x,
+    blocks <- data.frame(iFirst = x,
-                         iLast  = c((x[-1] - 1), length(txt)))
+                         iLast  = c((x[-1] - 1), length(txt)))
-
+
-    # Build the hits list by parsing the blocks
+    # Build the hits list by parsing the blocks
-    results$hits <- list()
+    results$hits <- list()
-
+
-    for (i in seq_len(nrow(blocks))) {
+    for (i in seq_len(nrow(blocks))) {
-      thisBlock <- txt[blocks$iFirst[i]:blocks$iLast[i]]
+      thisBlock <- txt[blocks$iFirst[i]:blocks$iLast[i]]
-      results$hits[[i]] <- parseBLASTalignment(thisBlock)
+      results$hits[[i]] <- parseBLASTalignment(thisBlock)
-    }
+    }
-
+
-    return(results)
+    return(results)
-}
+}
-
+
-parseBLASTalignment <- function(hit) {
+parseBLASTalignment <- function(hit) {
-  # Parse data from a character vector containing a BLAST hit
+  # Parse data from a character vector containing a BLAST hit
-  # Parameters:
+  # Parameters:
-  #    hit  char   one BLAST hit as char vector
+  #    hit  char   one BLAST hit as char vector
-  # Value:
+  # Value:
-  #          list   $def          chr   defline
+  #          list   $def          chr   defline
-  #                 $accession    chr   accession number
+  #                 $accession    chr   accession number
-  #                 $organism     chr   complete organism definition
+  #                 $organism     chr   complete organism definition
-  #                 $species      chr   binomial species
+  #                 $species      chr   binomial species
-  #                 $E            num   E value
+  #                 $E            num   E value
-  #                 $lengthAli    num   length of the alignment
+  #                 $lengthAli    num   length of the alignment
-  #                 $nIdentitites num   number of identities
+  #                 $nIdentitites num   number of identities
-  #                 $nGaps        num   number of gaps
+  #                 $nGaps        num   number of gaps
-  #                 $Qbounds      num   2-element vector of query start-end
+  #                 $Qbounds      num   2-element vector of query start-end
-  #                 $Sbounds      num   2-element vector of subject start-end
+  #                 $Sbounds      num   2-element vector of subject start-end
-  #                 $Qseq         chr   query sequence
+  #                 $Qseq         chr   query sequence
-  #                 $midSeq       chr   midline string
+  #                 $midSeq       chr   midline string
-  #                 $Sseq         chr   subject sequence
+  #                 $Sseq         chr   subject sequence
-
+
-  getToken <- function(patt, v) {
+  getToken <- function(patt, v) {
-    # get the first token identified by pattern patt in character vector v
+    # get the first token identified by pattern patt in character vector v
-    v <- v[grep(patt, v)]
+    v <- v[grep(patt, v)]
-    if (length(v) > 1) { v <- v[1] }
+    if (length(v) > 1) { v <- v[1] }
-    if (length(v) == 0) { token <- NA
+    if (length(v) == 0) { token <- NA
-    } else {
+    } else {
-      token <- regmatches(v, regexec(patt, v))[[1]][2] }
+      token <- regmatches(v, regexec(patt, v))[[1]][2] }
-    return(token)
+    return(token)
-  }
+  }
-
+
-  h <- list()
+  h <- list()
-
+
-  # FASTA defline
+  # FASTA defline
-  h$def <- hit[1]
+  h$def <- hit[1]
-
+
-  # accesion number (ID), use the first if there are several, separated by "|"
+  # accesion number (ID), use the first if there are several, separated by "|"
-  patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
+  patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
-  h$accession <-  regmatches(h$def, regexec(patt, h$def))[[1]][2]
+  h$accession <-  regmatches(h$def, regexec(patt, h$def))[[1]][2]
-
+
-  # organism
+  # organism
-  patt <- "\\[(.+)]"
+  patt <- "\\[(.+)]"
-  h$organism <-  regmatches(h$def, regexec(patt, h$def))[[1]][2]
+  h$organism <-  regmatches(h$def, regexec(patt, h$def))[[1]][2]
-
+
-  # species
+  # species
-  x <- unlist(strsplit(h$organism, "\\s+"))
+  x <- unlist(strsplit(h$organism, "\\s+"))
-  if (length(x) >= 2) {
+  if (length(x) >= 2) {
-    h$species <- paste(x[1], x[2])
+    h$species <- paste(x[1], x[2])
-  } else if (length(x) == 1) {
+  } else if (length(x) == 1) {
-    h$species <- paste(x[1], "sp.")
+    h$species <- paste(x[1], "sp.")
-  } else {
+  } else {
-    h$species <- NA
+    h$species <- NA
-  }
+  }
-
+
-  # E-value
+  # E-value
-  h$E <- as.numeric(getToken("Expect\\s*=(.+?), Method", hit))
+  h$E <- as.numeric(getToken("Expect\\s*=(.+?), Method", hit))
-
+
-  # length of alignment
+  # length of alignment
-  h$lengthAli <- as.numeric(getToken("^\\s*Length\\s*=(.+)$", hit))
+  h$lengthAli <- as.numeric(getToken("^\\s*Length\\s*=(.+)$", hit))
-
+
-  # number of identities
+  # number of identities
-  h$nIdentities <- as.numeric(getToken("^\\s*Identities\\s*=(.+?)/", hit))
+  h$nIdentities <- as.numeric(getToken("^\\s*Identities\\s*=(.+?)/", hit))
-
+
-  # number of gaps
+  # number of gaps
-  h$nGaps <- as.numeric(getToken("\\s*Gaps\\s*=(.+?)/", hit))
+  h$nGaps <- as.numeric(getToken("\\s*Gaps\\s*=(.+?)/", hit))
-
+
-  # split up alignment section
+  # split up alignment section
-  idx <- grep("^Query ", hit)
+  idx <- grep("^Query ", hit)
-  Que <- hit[idx]
+  Que <- hit[idx]
-  Mid <- hit[idx + 1]
+  Mid <- hit[idx + 1]
-  Sbj <- hit[idx + 2]
+  Sbj <- hit[idx + 2]
-
+
-  # first and last positions
+  # first and last positions
-  h$Qbounds <- c(start = 0, end = 0)
+  h$Qbounds <- c(start = 0, end = 0)
-  h$Qbounds[1] <- as.numeric(getToken("^Query\\s*(\\d+)", Que[1]))
+  h$Qbounds[1] <- as.numeric(getToken("^Query\\s*(\\d+)", Que[1]))
-  h$Qbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Que[length(Que)]))
+  h$Qbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Que[length(Que)]))
-
+
-  h$Sbounds <- c(start = 0, end = 0)
+  h$Sbounds <- c(start = 0, end = 0)
-  h$Sbounds[1] <- as.numeric(getToken("^Sbjct\\s*(\\d+)", Sbj[1]))
+  h$Sbounds[1] <- as.numeric(getToken("^Sbjct\\s*(\\d+)", Sbj[1]))
-  h$Sbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Sbj[length(Sbj)]))
+  h$Sbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Sbj[length(Sbj)]))
-
+
-  # aligned sequences
+  # aligned sequences
-  for (i in seq_along(Que)) {
+  for (i in seq_along(Que)) {
-    patt <- ("^\\s*Query\\s*\\d+\\s*([A-Za-z-]+)") # capture aligned string
+    patt <- ("^\\s*Query\\s*\\d+\\s*([A-Za-z-]+)") # capture aligned string
-    m <- regexec(patt, Que[i])
+    m <- regexec(patt, Que[i])
-    iFirst <- m[[1]][2]
+    iFirst <- m[[1]][2]
-    iLast <- iFirst + attr(m[[1]], which = "match.length")[2] - 1
+    iLast <- iFirst + attr(m[[1]], which = "match.length")[2] - 1
-    Que[i] <- substring(Que[i], iFirst, iLast)
+    Que[i] <- substring(Que[i], iFirst, iLast)
-    Mid[i] <- substring(Mid[i], iFirst, iLast)
+    Mid[i] <- substring(Mid[i], iFirst, iLast)
-    Sbj[i] <- substring(Sbj[i], iFirst, iLast)
+    Sbj[i] <- substring(Sbj[i], iFirst, iLast)
-  }
+  }
-
+
-  h$Qseq   <- paste0(Que, collapse = "")
+  h$Qseq   <- paste0(Que, collapse = "")
-  h$midSeq <- paste0(Mid, collapse = "")
+  h$midSeq <- paste0(Mid, collapse = "")
-  h$Sseq   <- paste0(Sbj, collapse = "")
+  h$Sseq   <- paste0(Sbj, collapse = "")
-
+
-  return(h)
+  return(h)
-}
+}
-
+
-
+
-# ==== TESTS ===================================================================
+# ==== TESTS ===================================================================
-
+
-if (FALSE) {
+if (FALSE) {
-  # define query:
+  # define query:
-  q   <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain
+  q   <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain
-               "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
+               "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
-               "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
+               "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
-               sep="")
+               sep="")
-  # or ...
+  # or ...
-  q <- "NP_010227" # refseq ID
+  q <- "NP_010227" # refseq ID
-
+
-  test <- BLAST(q,
+  test <- BLAST(q,
-                nHits = 100,
+                nHits = 100,
-                E = 0.001,
+                E = 0.001,
-                rid = "",
+                rid = "",
-                limits = "txid4751[ORGN]")  # Fungi
+                limits = "txid4751[ORGN]")  # Fungi
-  str(test)
+  str(test)
-  length(test$hits)
+  length(test$hits)
-}
+}
-
+
-# [END]
+# [END]
-
+
--- a/tests/test_biCode.R
+++ b/tests/test_biCode.R
@ -1,32 +1,32 @@
-# test_biCode.R
+# test_biCode.R
-#
+#
-
+
-context("biCode() utility function tests")  # A set of tests for some
+context("biCode() utility function tests")  # A set of tests for some
-                                            # functionality
+                                            # functionality
-
+
-test_that("expected input is processed correctly", {  # Related expectations
+test_that("expected input is processed correctly", {  # Related expectations
-  expect_equal(biCode("homo sapiens"), "HOMSA")
+  expect_equal(biCode("homo sapiens"), "HOMSA")
-  expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA")
+  expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA")
-  expect_equal(biCode(c("Phascolarctos cinereus", "Macropus rufus")),
+  expect_equal(biCode(c("Phascolarctos cinereus", "Macropus rufus")),
-               c("PHACI", "MACRU"))
+               c("PHACI", "MACRU"))
-})
+})
-
+
-test_that("unexpected input is managed", {
+test_that("unexpected input is managed", {
-  expect_equal(biCode(""), ".....")
+  expect_equal(biCode(""), ".....")
-  expect_equal(biCode(" "), ".....")
+  expect_equal(biCode(" "), ".....")
-  expect_equal(biCode("123 12"), ".....")
+  expect_equal(biCode("123 12"), ".....")
-  expect_equal(biCode("h sapiens"), "H..SA")
+  expect_equal(biCode("h sapiens"), "H..SA")
-})
+})
-
+
-test_that("NA values are preserved", {
+test_that("NA values are preserved", {
-  expect_true(is.na((biCode(NA))))
+  expect_true(is.na((biCode(NA))))
-  expect_equal(biCode(c("first", NA, "last")),
+  expect_equal(biCode(c("first", NA, "last")),
-               c("FIRST", NA, "LAST."))
+               c("FIRST", NA, "LAST."))
-})
+})
-
+
-test_that("Missing argument throws an error", {
+test_that("Missing argument throws an error", {
-  expect_error(biCode(), "argument \"s\" is missing, with no default")
+  expect_error(biCode(), "argument \"s\" is missing, with no default")
-})
+})
-
+
-
+
-# [END]
+# [END]
`@ -1,3 +1,3 @@`
	`# BCH441-WORK-ABC-units`	`# BCH441-WORK-ABC-units`

	`This is a fork of the project [ABC-units](https://github.com/hyginn/ABC-units) designed for BCH441. This setup allows changes to be committed here but updates pushed to the original repository can be fetched and pulled to keep up to date.`	`This is a fork of the project [ABC-units](https://github.com/hyginn/ABC-units) designed for BCH441. This setup allows changes to be committed here but updates pushed to the original repository can be fetched and pulled to keep up to date.`