Line termination change and old code.

2021-11-16 00:31:48 -05:00
parent b1e00f52f7
commit affe00f6fb
86 changed files with 37873 additions and 37876 deletions
--- a/scripts/ABC-createRefDB.R
+++ b/scripts/ABC-createRefDB.R
@@ -1,30 +1,30 @@
-# ABC-createRefDB.R
-#
-# Create a reference protein database for Mbp1-like proteins
-#
-# Boris Steipe for ABC learning units
-#
-# For the species, see:
-# http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
-#
-# For the data model, see
-# https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0
-# For the schema, see dbInit() in ./scripts/ABC-dbUtilities.R
-#
-# ==============================================================================
-
-
-myDB <- dbInit()
-
-myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/MBP1_SACCE.json"))
-myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refMBP1Proteins.json"))
-myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refAPSES_PSI-BLAST.json"))
-
-myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./data/refTaxonomy.json"))
-
-myDB <- dbAddFeature(myDB, jsonlite::fromJSON("./data/refFeatures.json"))
-
-myDB <- dbAddAnnotation( myDB, jsonlite::fromJSON("./data/refAnnotations.json"))
-
-
-# [END]
+# ABC-createRefDB.R
+#
+# Create a reference protein database for Mbp1-like proteins
+#
+# Boris Steipe for ABC learning units
+#
+# For the species, see:
+# http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
+#
+# For the data model, see
+# https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0
+# For the schema, see dbInit() in ./scripts/ABC-dbUtilities.R
+#
+# ==============================================================================
+
+
+myDB <- dbInit()
+
+myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/MBP1_SACCE.json"))
+myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refMBP1Proteins.json"))
+myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refAPSES_PSI-BLAST.json"))
+
+myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./data/refTaxonomy.json"))
+
+myDB <- dbAddFeature(myDB, jsonlite::fromJSON("./data/refFeatures.json"))
+
+myDB <- dbAddAnnotation( myDB, jsonlite::fromJSON("./data/refAnnotations.json"))
+
+
+# [END]
--- a/scripts/ABC-dbUtilities.R
+++ b/scripts/ABC-dbUtilities.R
--- a/scripts/ABC-makeMYSPElist.R
+++ b/scripts/ABC-makeMYSPElist.R
@@ -1,443 +1,443 @@
-# tocID <- "scripts/ABC-makeMYSPElist.R"
-#
-# Purpose:  Create a list of genome sequenced fungi with protein annotations and
-#               Mbp1 homologues.
-#
-# Version: 1.4
-#
-# Date:    2016  09  -  2021  09
-# Author:  Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions
-#          1.4    New retrieval logic
-#          1.3    Rewrite to change datasource. NCBI has not been updated
-#                   since 2012. Use ensembl fungi as initial source.
-#          1.2    Change from require() to requireNamespace()
-#          1.1.2  Moved BLAST.R to ./scripts directory
-#          1.1    Update 2017
-#          1.0    First code 2016
-#
-# TODO:
-#
-# ==============================================================================
-#
-# DO NOT  source()  THIS FILE!
-#
-# This file is code I provide for your deeper understanding of a process and
-# to provide you with useful sample code. It is not actually necessary for
-# you to run this code, but I encourage you to read it carefully and discuss
-# if there are parts you don't understand.
-#
-# Run the commands that interact with the NCBI servers only if you want to
-# experiment specifically with the code and/or parameters. I have commented out
-# those parts. If you only want to study the general workflow, just load()
-# the respective intermediate results.
-#
-
-
-#TOC> ==========================================================================
-#TOC> 
-#TOC>   Section  Title                                    Line
-#TOC> --------------------------------------------------------
-#TOC>   1        The strategy                               55
-#TOC>   2        PACKAGES AND INITIALIZATIONS               67
-#TOC>   3        ENSEMBL FUNGI                              75
-#TOC>   3.1        Import                                   78
-#TOC>   4        BLAST SEARCH                              155
-#TOC>   4.1        find homologous proteins                161
-#TOC>   4.2        Identify species in "hits"              192
-#TOC>   5        MERGE ENSEMBL AND BLAST RESULTS           282
-#TOC>   6        STUDENT NUMBERS                           375
-#TOC> 
-#TOC> ==========================================================================
-
-
-# =    1  The strategy  ========================================================
-
-# This script will create a list of "MYSPE" species and save it in an R object
-# MYSPEspecies that is stored in the data subdirectory of this project from
-# where it can be loaded. The strategy is as follows: we download a list of
-# annotated fungal genomes from ensembl.fungi. All these are genome-sequenced
-# species that have been annotated.
-# Next we perform a BLAST search, to identify fungal species that have
-# genes that are homologous to yeast MBP1.
-#
-# ...
-
-# =    2  PACKAGES AND INITIALIZATIONS  ========================================
-
-# httr provides interfaces to Webservers on the Internet
-if (! requireNamespace("httr", quietly = TRUE)) {
-  install.packages("httr")
-}
-
-
-# =    3  ENSEMBL FUNGI  =======================================================
-
-
-# ==   3.1  Import  ============================================================
-
-# Navigate to https://fungi.ensembl.org and click on the link to the full
-# list of all species: https://fungi.ensembl.org/species.html
-# On the page, click on the spreadsheet symbol top right and choose
-# "download whole table". The file will be named  "Species.csv", in your
-# usual downloads folder. Move it to the data folder, and read it.
-
-sDat <- read.csv("./data/Species.csv")
-str(sDat)
-
-# The most obvious way to partition these is according to Classification ...
-# (poking around a bit in the UniProt taxonomy database shows that the
-#  classification used here is the taxonomic rank of "order").
-# how many classifications do we have?
-length(unique(sDat$Classification))  # 66
-
-# To have a good set for the class, we should have about 100.
-# Let's see for which of these we can find Mbp1 homologues.
-# First, we'll keep only the colums for name, classification, and taxID, and
-# drop the rest ...
-sDat <- sDat[ , c("Name", "Classification", "Taxon.ID")]
-colnames(sDat) <- c("name", "order", "taxID")
-
-# Next, we make an extra column: genus - the first part of the binomial name.
-# We'll use the gsub() function, and for that we need a "regular expression"
-# that matches to all characters from the first blank to the end of the string:
-myPatt <- "\\s.*$"  # one whitespace (\\s) ...
-                    # followed by any character (.) 0..n times (*) ...
-                    # until the end of the string
-
-# using gsub() we substitue all matching characters with the empty string "" -
-# this deletes the matching characters
-# Test this:
-gsub(myPatt, "", "Genus")                      # one word: unchanged
-gsub(myPatt, "", "gEnus species")              # two words: return only first
-gsub(myPatt, "", "geNus species strain 123")   # many words: return only first
-
-# apply this to the "name" column and add the result as a separate column
-# called "genus"
-sDat$genus <- gsub(myPatt, "", sDat$name)
-
-# what do we get?
-c(head(unique(sDat$genus)),
-  tail(unique(sDat$genus)))  # inspect the first and last few. Note that there
-                             # is a problem that we have to keep in mind.
-                             # (Always inspect your results!)
-# Drop all rows for which the genus contains special chracters -
-# like "[Candida]"
-sDat <- sDat[ ! grepl("[^a-zA-Z]", sDat$genus) , ]
-
-length(table(sDat$genus))    # how many genus?
-hist(table(sDat$genus), col = "#E9F4FF")      # Distribution ...
-                                              # most genus have very few, but
-                                              # some have very many species.
-sort(table(sDat$genus), decreasing = TRUE)[1:10]  # Top ten...
-
-# We should have at least one species from each taxonomic order, but we can
-# add a few genus until we have about 100 validated species.
-
-# Let's add a column for species, by changing our regular expression a bit,
-# using ^ (start of string), \\S (NOT a whitespace),
-# and + (one or more matches), capturing the match (...), and returning
-# it as the substitution (\\1) ...
-
-myPatt <- "^(\\S+\\s\\S+)\\s.*$"
-sDat$species <- gsub(myPatt, "\\1", sDat$name)
-
-# And we reorder the columns, just for aesthetics:
-sDat <- sDat[ , c("name", "species", "genus", "order", "taxID")]
-
-# Final check:
-any(grepl("[^a-zA-Z -]", sDat$species)) # FALSE means no special characters
-
-#
-# Now we check which of these have Mbp1 homologues ...
-
-# =    4  BLAST SEARCH  ========================================================
-
-
-# We run a BLAST search to find all proteins related to yeast Mbp1 in any
-# fungus. With the results, we'll annotate our sDat table.
-
-# ==   4.1  find homologous proteins  ==========================================
-#
-# Use BLAST to fetch proteins related to Mbp1 and identify the species that
-# contain them.
-
-# Scripting against NCBI APIs is not exactly enjoyable - there is usually a fair
-# amount of error handling involved that is not supported by the API in a
-# principled way but requires rather ad hoc solutions. The code I threw together
-# to make a BLAST interface (demo-quality, not research-quality) is in the file
-# ./scripts/BLAST.R Feel encouraged to study how this works. It's a pretty
-# standard task of communicating with servers and parsing responses - everyday
-# fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST
-# parser in currently available packages.
-#
-# DON'T use this for BLAST searches unless you have read the NCBI policy
-# for automated tasks. If you indicriminately pound on the NCBI's BLAST
-# server, they will blacklist your IP-address. See:
-# https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
-#
-# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
-# BLASThits <- BLAST("NP_010227",                  # Yeast Mbp1 RefSeq ID
-#                    db = "refseq_protein",        # database to search in
-#                    nHits = 3000,                 # 945 hits in 2020
-#                    E = 0.01,                     #
-#                    limits = "txid4751[ORGN]")    # = fungi
-# saveRDS(BLASThits, file="data/BLASThits.rds")
-#
-# NO NEED TO ACTUALLY RUN THIS:you can load the results from the data directory
-#
-BLASThits <- readRDS(file = "data/BLASThits.rds")
-
-# ==   4.2  Identify species in "hits"  ========================================
-
-# This is a very big list that can't be usefully analyzed manually. Here
-# we are only interested in the species names that it contains.
-
-# How many hits in the list?
-length(BLASThits$hits)      # 1,134
-
-# Let's look at a hit somewhere down the list
-str(BLASThits$hit[[277]])
-
-# A fair amount of parsing has gone into the BLAST.R code to prepare the results
-# in a useful way. The species information is in the $species element of every
-# hit.
-
-# Run a loop to extract all the species names into a vector. We subset ...
-# Blasthits$hits                 ... the list of hits, from which we choose ...
-# Blasthits$hits[[i]]            ... the i-th hit, and get ...
-# Blasthits$hits[[i]]$species    ... the species element from that.
-# Subsetting FTW.
-
-BLASTspecies <- character()
-for (i in seq_along(BLASThits$hits)) {
-    BLASTspecies[i] <- BLASThits$hits[[i]]$species
-}
-
-# You can confirm that BLASTspecies has the expected size.
-length(BLASTspecies)
-
-# if we delete some of these later on, we still want to remember which hit
-# they came from. Thus we name() the elements with their index, which is the
-# same as the index of the hit in BLASThits
-names(BLASTspecies) <- 1:length(BLASTspecies)
-
-
-# let's plot the distribution of E-values
-eVals <- numeric()
-for (i in seq_along(BLASThits$hits)) {
-  eVals[i] <- BLASThits$hits[[i]]$E
-}
-range(eVals)
-sum(eVals == 0)
-
-# let's plot the log of all values > 0 to see how they are distributed
-# plotting only one vectyor of numbers plots their index as x, and
-# their value as y ...
-plot(log(eVals[eVals > 0]), col = "#CC0000")
-
-# This is very informative: I would suspect that the first ten or so are
-# virtually identical to the yeast protein, then we have about 800 hits with
-# decreasing similarity, and then about 200 more that may actually be false
-# positives. Also - we plotted them by index, that means the table is SORTED:
-# Lower E-values strictly come before higher E-values.
-
-# Again, some species appear more than once, e.g. ...
-sum(BLASTspecies == "Saccharomyces cerevisiae")
-
-# ... corresponding to the five homologous gene sequences (paralogues) of yeast.
-
-# Therefore we remove duplicates. Removing duplicates will leave the FIRST
-# in a list alone, and only remove the SUBSEQUENT ones. Which means, from each
-# species, we will retain only the protein that has the highest similarity
-# to yeast Mbp1, not any of its more distant paralogues.
-sel <- ! duplicated(BLASTspecies)
-BLASTspecies <- BLASTspecies[sel]
-
-length(BLASTspecies)
-# i.e. we got rid of about two thirds of the hits.
-tail(BLASTspecies)  # see how the names are useful!
-                    # again - there are some special characters ...
-                    # what are they?
-BLASTspecies[grep("[^a-zA-Z ]", BLASTspecies)]
-
-# remove the brackets ...
-BLASTspecies <- gsub("\\[|\\]", "", BLASTspecies)
-# drop any new duplicates ...
-BLASTspecies <- BLASTspecies[ ! duplicated(BLASTspecies)]
-
-# check the number again:
-length(BLASTspecies)
-# Think a bit about this: what may be the biological reason to find that
-# on average, in 388 fungi across the entire phylogenetic tree, we have
-# three sequences that are homologous to yeast Mbp1?
-
-# Let's look at the distribution of E-values in this selection (Subsetting FTW):
-# we plot all values that are TRUE in the vector "sel" that we created above,
-# AND greater than 0
-plot(log(eVals[sel & eVals > 0]), col = "#00CC00")
-
-
-# =    5  MERGE ENSEMBL AND BLAST RESULTS  =====================================
-
-# Next we add the blast result to our sDat dataframe. We'll store the index,
-# the E-value, and the Query-bounds from which we can estimate which domains
-# of Mbp1 are actually covered by the hit. (True orthologues MUST align with
-# Mbp1's N-terminal APSES domain.)
-#
-# First we pull the hits we wanted from the BLASTspecies:
-iHits <- as.numeric(names(BLASTspecies))
-length(iHits)     # one index for each TRUE in sel
-
-# add columns to sDat
-l <- nrow(sDat)
-sDat$iHit   <- numeric(l)  # index of the hit in the BLAST results
-sDat$eVal   <- numeric(l)  # E-value of the hit
-sDat$lAli   <- numeric(l)  # length of the aligned region
-
-# extract and merge
-for (iHit in iHits) {
-  thisSp <- BLASThits$hits[[iHit]]$species
-  sel <- sDat$species == thisSp
-
-  sDat$iHit[sel]   <- iHit
-  sDat$eVal[sel]   <- BLASThits$hits[[iHit]]$E
-  sDat$lAli[sel]   <- BLASThits$hits[[iHit]]$lengthAli
-}
-
-# Are all reference species accounted for?
-selA <- sDat$iHit != 0                 # all rows which matched to a BLAST hit
-REFspecies %in% sDat$species[selA]     # yes, all there
-
-selB <- sDat$species %in% REFspecies   # all rows which have one of REF species
-
-sum(selA & selB)   # How many rows?
-
-# sDat of course includes all duplicates. Some may be multiply sequenced, some
-# may be different strains. We'll use the same strategy as before and keep
-# only the best hit: order the rows by E-value, then drop all rows which
-# are duplicated.
-
-
-# drop all rows without BLAST hits ...
-sDat <- sDat[ ! (sDat$iHit == 0) , ]
-
-# order sDat by E-value ...
-sDat <- sDat[order(sDat$eVal, decreasing = FALSE) , ]
-
-# drop all rows with duplicated species ...
-sDat <- sDat[ ! duplicated(sDat$species) , ]
-
-# Lets look at the E-values ...
-plot(log(sDat$eVal[sDat$eVal > 0]), col = "#00CC00")
-
-# and alignment lengths ...
-plot(sDat$lAli, col = "#00DDAA")
-
-# How many ...
-length(unique(sDat$name))
-length(unique(sDat$species))
-length(unique(sDat$genus))
-length(unique(sDat$order))
-
-# I need an extra species for admin purposes later on ...
-sel <- grep("Sporothrix schenckii", sDat$species)
-SPOSCdat <- sDat[sel, ]
-sDat <- sDat[-sel, ]
-
-# To get the final dataset, we remove the reference species with their
-# entire orders ...
-REForders <- unique(sDat$order[sDat$species %in% REFspecies])
-sel <- sDat$order %in% REForders
-REFdat <- sDat[sel , ]
-sDat   <- sDat[ ! sel , ]
-
-# REFdat should now contain only the REFspecies ...
-( REFdat <- REFdat[REFdat$species %in% REFspecies , ] )
-
-# ... but all of them
-sum(REFspecies %in% REFdat$species)
-
-# ... and we have enough left in sDat to prune sDat to unique genus
-sDat <- sDat[ ! duplicated(sDat$genus) , ]
-nrow(sDat)   # 84
-
-# I add back "Sporothrix schenckii" ...
-sDat <- rbind(SPOSCdat, sDat)
-
-# ... and save for future use.
-# saveRDS(sDat, file = "data/sDat.rds")
-# saveRDS(REFdat, file = "data/REFdat.rds")
-
-
-
-# =    6  STUDENT NUMBERS  =====================================================
-#
-# An asymmetric function to retrieve a MYSPE species
-#
-sDat <- readRDS(file = "data/sDat.rds")
-
-students <- read.csv("../BCH441-2021-students.csv")
-sN <- students$Integration.ID
-sN <- sN[! is.na(sN)]
-sN <- as.character(sN)
-sN <- c("1003141593", sN)  # will map to  "Sporothrix schenckii"
-
-set.seed(112358)
-theseSpecies <- sDat[sample(1:nrow(sDat)), ]
-all(sort(theseSpecies$name) == sort(sDat$name))
-nrow((theseSpecies))
-(iX <- grep("Sporothrix schenckii", theseSpecies$name))
-theseSpecies <- rbind(theseSpecies[iX, ], theseSpecies[-iX, ])
-rndMin <-  992000000
-rndMax <- 1020000000
-N <- 10000
-keys <- as.character(sample(rndMin:rndMax, N + 1000))
-keys <- keys[! (keys %in% sN)]
-keys <- keys[1:N]
-keys[1:length(sN)] <- sN
-
-nRep <- floor(N/nrow(theseSpecies))
-MYSPEdat <- theseSpecies
-for(i in 1:nRep) {
-  MYSPEdat <- rbind(MYSPEdat, theseSpecies)
-}
-MYSPEdat <- MYSPEdat[1:N, ]
-for (i in 1:N) {
-  rownames(MYSPEdat)[i] <- digest::digest(keys[i], algo = "md5")
-}
-set.seed(NULL)
-MYSPEdat <- MYSPEdat[sample(1:N), ]
-
-# saveRDS(MYSPEdat, file = "data/MYSPEdat.rds")
-
-# === validate
-x <- character()
-for (n in sN) {
-  sp <- getMYSPE(n)
-  if (length(sp) != 1) {
-    stop(print(as.character(n)))
-  } else {
-    x <- c(x, sp)
-  }
-}
-
-# === species for late-comers
-y <- unique(MYSPEdat$species)
-print(y[!(y %in% x)])
-
-
-# === validate
-l <- length(sN)
-sp <- character(l)
-for(i in 1:l) {
-  sp[i] <- getMYSPE(sN[i])
-}
-any(duplicated(sp))
-length(unique(sp))
-which(! sDat$species %in% sp)  # these can be assigned to late-comers
-
-# Done.
-
-# [END]
+# tocID <- "scripts/ABC-makeMYSPElist.R"
+#
+# Purpose:  Create a list of genome sequenced fungi with protein annotations and
+#               Mbp1 homologues.
+#
+# Version: 1.4
+#
+# Date:    2016  09  -  2021  09
+# Author:  Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions
+#          1.4    New retrieval logic
+#          1.3    Rewrite to change datasource. NCBI has not been updated
+#                   since 2012. Use ensembl fungi as initial source.
+#          1.2    Change from require() to requireNamespace()
+#          1.1.2  Moved BLAST.R to ./scripts directory
+#          1.1    Update 2017
+#          1.0    First code 2016
+#
+# TODO:
+#
+# ==============================================================================
+#
+# DO NOT  source()  THIS FILE!
+#
+# This file is code I provide for your deeper understanding of a process and
+# to provide you with useful sample code. It is not actually necessary for
+# you to run this code, but I encourage you to read it carefully and discuss
+# if there are parts you don't understand.
+#
+# Run the commands that interact with the NCBI servers only if you want to
+# experiment specifically with the code and/or parameters. I have commented out
+# those parts. If you only want to study the general workflow, just load()
+# the respective intermediate results.
+#
+
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                                    Line
+#TOC> --------------------------------------------------------
+#TOC>   1        The strategy                               55
+#TOC>   2        PACKAGES AND INITIALIZATIONS               67
+#TOC>   3        ENSEMBL FUNGI                              75
+#TOC>   3.1        Import                                   78
+#TOC>   4        BLAST SEARCH                              155
+#TOC>   4.1        find homologous proteins                161
+#TOC>   4.2        Identify species in "hits"              192
+#TOC>   5        MERGE ENSEMBL AND BLAST RESULTS           282
+#TOC>   6        STUDENT NUMBERS                           375
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  The strategy  ========================================================
+
+# This script will create a list of "MYSPE" species and save it in an R object
+# MYSPEspecies that is stored in the data subdirectory of this project from
+# where it can be loaded. The strategy is as follows: we download a list of
+# annotated fungal genomes from ensembl.fungi. All these are genome-sequenced
+# species that have been annotated.
+# Next we perform a BLAST search, to identify fungal species that have
+# genes that are homologous to yeast MBP1.
+#
+# ...
+
+# =    2  PACKAGES AND INITIALIZATIONS  ========================================
+
+# httr provides interfaces to Webservers on the Internet
+if (! requireNamespace("httr", quietly = TRUE)) {
+  install.packages("httr")
+}
+
+
+# =    3  ENSEMBL FUNGI  =======================================================
+
+
+# ==   3.1  Import  ============================================================
+
+# Navigate to https://fungi.ensembl.org and click on the link to the full
+# list of all species: https://fungi.ensembl.org/species.html
+# On the page, click on the spreadsheet symbol top right and choose
+# "download whole table". The file will be named  "Species.csv", in your
+# usual downloads folder. Move it to the data folder, and read it.
+
+sDat <- read.csv("./data/Species.csv")
+str(sDat)
+
+# The most obvious way to partition these is according to Classification ...
+# (poking around a bit in the UniProt taxonomy database shows that the
+#  classification used here is the taxonomic rank of "order").
+# how many classifications do we have?
+length(unique(sDat$Classification))  # 66
+
+# To have a good set for the class, we should have about 100.
+# Let's see for which of these we can find Mbp1 homologues.
+# First, we'll keep only the colums for name, classification, and taxID, and
+# drop the rest ...
+sDat <- sDat[ , c("Name", "Classification", "Taxon.ID")]
+colnames(sDat) <- c("name", "order", "taxID")
+
+# Next, we make an extra column: genus - the first part of the binomial name.
+# We'll use the gsub() function, and for that we need a "regular expression"
+# that matches to all characters from the first blank to the end of the string:
+myPatt <- "\\s.*$"  # one whitespace (\\s) ...
+                    # followed by any character (.) 0..n times (*) ...
+                    # until the end of the string
+
+# using gsub() we substitue all matching characters with the empty string "" -
+# this deletes the matching characters
+# Test this:
+gsub(myPatt, "", "Genus")                      # one word: unchanged
+gsub(myPatt, "", "gEnus species")              # two words: return only first
+gsub(myPatt, "", "geNus species strain 123")   # many words: return only first
+
+# apply this to the "name" column and add the result as a separate column
+# called "genus"
+sDat$genus <- gsub(myPatt, "", sDat$name)
+
+# what do we get?
+c(head(unique(sDat$genus)),
+  tail(unique(sDat$genus)))  # inspect the first and last few. Note that there
+                             # is a problem that we have to keep in mind.
+                             # (Always inspect your results!)
+# Drop all rows for which the genus contains special chracters -
+# like "[Candida]"
+sDat <- sDat[ ! grepl("[^a-zA-Z]", sDat$genus) , ]
+
+length(table(sDat$genus))    # how many genus?
+hist(table(sDat$genus), col = "#E9F4FF")      # Distribution ...
+                                              # most genus have very few, but
+                                              # some have very many species.
+sort(table(sDat$genus), decreasing = TRUE)[1:10]  # Top ten...
+
+# We should have at least one species from each taxonomic order, but we can
+# add a few genus until we have about 100 validated species.
+
+# Let's add a column for species, by changing our regular expression a bit,
+# using ^ (start of string), \\S (NOT a whitespace),
+# and + (one or more matches), capturing the match (...), and returning
+# it as the substitution (\\1) ...
+
+myPatt <- "^(\\S+\\s\\S+)\\s.*$"
+sDat$species <- gsub(myPatt, "\\1", sDat$name)
+
+# And we reorder the columns, just for aesthetics:
+sDat <- sDat[ , c("name", "species", "genus", "order", "taxID")]
+
+# Final check:
+any(grepl("[^a-zA-Z -]", sDat$species)) # FALSE means no special characters
+
+#
+# Now we check which of these have Mbp1 homologues ...
+
+# =    4  BLAST SEARCH  ========================================================
+
+
+# We run a BLAST search to find all proteins related to yeast Mbp1 in any
+# fungus. With the results, we'll annotate our sDat table.
+
+# ==   4.1  find homologous proteins  ==========================================
+#
+# Use BLAST to fetch proteins related to Mbp1 and identify the species that
+# contain them.
+
+# Scripting against NCBI APIs is not exactly enjoyable - there is usually a fair
+# amount of error handling involved that is not supported by the API in a
+# principled way but requires rather ad hoc solutions. The code I threw together
+# to make a BLAST interface (demo-quality, not research-quality) is in the file
+# ./scripts/BLAST.R Feel encouraged to study how this works. It's a pretty
+# standard task of communicating with servers and parsing responses - everyday
+# fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST
+# parser in currently available packages.
+#
+# DON'T use this for BLAST searches unless you have read the NCBI policy
+# for automated tasks. If you indicriminately pound on the NCBI's BLAST
+# server, they will blacklist your IP-address. See:
+# https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
+#
+# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
+# BLASThits <- BLAST("NP_010227",                  # Yeast Mbp1 RefSeq ID
+#                    db = "refseq_protein",        # database to search in
+#                    nHits = 3000,                 # 945 hits in 2020
+#                    E = 0.01,                     #
+#                    limits = "txid4751[ORGN]")    # = fungi
+# saveRDS(BLASThits, file="data/BLASThits.rds")
+#
+# NO NEED TO ACTUALLY RUN THIS:you can load the results from the data directory
+#
+BLASThits <- readRDS(file = "data/BLASThits.rds")
+
+# ==   4.2  Identify species in "hits"  ========================================
+
+# This is a very big list that can't be usefully analyzed manually. Here
+# we are only interested in the species names that it contains.
+
+# How many hits in the list?
+length(BLASThits$hits)      # 1,134
+
+# Let's look at a hit somewhere down the list
+str(BLASThits$hit[[277]])
+
+# A fair amount of parsing has gone into the BLAST.R code to prepare the results
+# in a useful way. The species information is in the $species element of every
+# hit.
+
+# Run a loop to extract all the species names into a vector. We subset ...
+# Blasthits$hits                 ... the list of hits, from which we choose ...
+# Blasthits$hits[[i]]            ... the i-th hit, and get ...
+# Blasthits$hits[[i]]$species    ... the species element from that.
+# Subsetting FTW.
+
+BLASTspecies <- character()
+for (i in seq_along(BLASThits$hits)) {
+    BLASTspecies[i] <- BLASThits$hits[[i]]$species
+}
+
+# You can confirm that BLASTspecies has the expected size.
+length(BLASTspecies)
+
+# if we delete some of these later on, we still want to remember which hit
+# they came from. Thus we name() the elements with their index, which is the
+# same as the index of the hit in BLASThits
+names(BLASTspecies) <- 1:length(BLASTspecies)
+
+
+# let's plot the distribution of E-values
+eVals <- numeric()
+for (i in seq_along(BLASThits$hits)) {
+  eVals[i] <- BLASThits$hits[[i]]$E
+}
+range(eVals)
+sum(eVals == 0)
+
+# let's plot the log of all values > 0 to see how they are distributed
+# plotting only one vectyor of numbers plots their index as x, and
+# their value as y ...
+plot(log(eVals[eVals > 0]), col = "#CC0000")
+
+# This is very informative: I would suspect that the first ten or so are
+# virtually identical to the yeast protein, then we have about 800 hits with
+# decreasing similarity, and then about 200 more that may actually be false
+# positives. Also - we plotted them by index, that means the table is SORTED:
+# Lower E-values strictly come before higher E-values.
+
+# Again, some species appear more than once, e.g. ...
+sum(BLASTspecies == "Saccharomyces cerevisiae")
+
+# ... corresponding to the five homologous gene sequences (paralogues) of yeast.
+
+# Therefore we remove duplicates. Removing duplicates will leave the FIRST
+# in a list alone, and only remove the SUBSEQUENT ones. Which means, from each
+# species, we will retain only the protein that has the highest similarity
+# to yeast Mbp1, not any of its more distant paralogues.
+sel <- ! duplicated(BLASTspecies)
+BLASTspecies <- BLASTspecies[sel]
+
+length(BLASTspecies)
+# i.e. we got rid of about two thirds of the hits.
+tail(BLASTspecies)  # see how the names are useful!
+                    # again - there are some special characters ...
+                    # what are they?
+BLASTspecies[grep("[^a-zA-Z ]", BLASTspecies)]
+
+# remove the brackets ...
+BLASTspecies <- gsub("\\[|\\]", "", BLASTspecies)
+# drop any new duplicates ...
+BLASTspecies <- BLASTspecies[ ! duplicated(BLASTspecies)]
+
+# check the number again:
+length(BLASTspecies)
+# Think a bit about this: what may be the biological reason to find that
+# on average, in 388 fungi across the entire phylogenetic tree, we have
+# three sequences that are homologous to yeast Mbp1?
+
+# Let's look at the distribution of E-values in this selection (Subsetting FTW):
+# we plot all values that are TRUE in the vector "sel" that we created above,
+# AND greater than 0
+plot(log(eVals[sel & eVals > 0]), col = "#00CC00")
+
+
+# =    5  MERGE ENSEMBL AND BLAST RESULTS  =====================================
+
+# Next we add the blast result to our sDat dataframe. We'll store the index,
+# the E-value, and the Query-bounds from which we can estimate which domains
+# of Mbp1 are actually covered by the hit. (True orthologues MUST align with
+# Mbp1's N-terminal APSES domain.)
+#
+# First we pull the hits we wanted from the BLASTspecies:
+iHits <- as.numeric(names(BLASTspecies))
+length(iHits)     # one index for each TRUE in sel
+
+# add columns to sDat
+l <- nrow(sDat)
+sDat$iHit   <- numeric(l)  # index of the hit in the BLAST results
+sDat$eVal   <- numeric(l)  # E-value of the hit
+sDat$lAli   <- numeric(l)  # length of the aligned region
+
+# extract and merge
+for (iHit in iHits) {
+  thisSp <- BLASThits$hits[[iHit]]$species
+  sel <- sDat$species == thisSp
+
+  sDat$iHit[sel]   <- iHit
+  sDat$eVal[sel]   <- BLASThits$hits[[iHit]]$E
+  sDat$lAli[sel]   <- BLASThits$hits[[iHit]]$lengthAli
+}
+
+# Are all reference species accounted for?
+selA <- sDat$iHit != 0                 # all rows which matched to a BLAST hit
+REFspecies %in% sDat$species[selA]     # yes, all there
+
+selB <- sDat$species %in% REFspecies   # all rows which have one of REF species
+
+sum(selA & selB)   # How many rows?
+
+# sDat of course includes all duplicates. Some may be multiply sequenced, some
+# may be different strains. We'll use the same strategy as before and keep
+# only the best hit: order the rows by E-value, then drop all rows which
+# are duplicated.
+
+
+# drop all rows without BLAST hits ...
+sDat <- sDat[ ! (sDat$iHit == 0) , ]
+
+# order sDat by E-value ...
+sDat <- sDat[order(sDat$eVal, decreasing = FALSE) , ]
+
+# drop all rows with duplicated species ...
+sDat <- sDat[ ! duplicated(sDat$species) , ]
+
+# Lets look at the E-values ...
+plot(log(sDat$eVal[sDat$eVal > 0]), col = "#00CC00")
+
+# and alignment lengths ...
+plot(sDat$lAli, col = "#00DDAA")
+
+# How many ...
+length(unique(sDat$name))
+length(unique(sDat$species))
+length(unique(sDat$genus))
+length(unique(sDat$order))
+
+# I need an extra species for admin purposes later on ...
+sel <- grep("Sporothrix schenckii", sDat$species)
+SPOSCdat <- sDat[sel, ]
+sDat <- sDat[-sel, ]
+
+# To get the final dataset, we remove the reference species with their
+# entire orders ...
+REForders <- unique(sDat$order[sDat$species %in% REFspecies])
+sel <- sDat$order %in% REForders
+REFdat <- sDat[sel , ]
+sDat   <- sDat[ ! sel , ]
+
+# REFdat should now contain only the REFspecies ...
+( REFdat <- REFdat[REFdat$species %in% REFspecies , ] )
+
+# ... but all of them
+sum(REFspecies %in% REFdat$species)
+
+# ... and we have enough left in sDat to prune sDat to unique genus
+sDat <- sDat[ ! duplicated(sDat$genus) , ]
+nrow(sDat)   # 84
+
+# I add back "Sporothrix schenckii" ...
+sDat <- rbind(SPOSCdat, sDat)
+
+# ... and save for future use.
+# saveRDS(sDat, file = "data/sDat.rds")
+# saveRDS(REFdat, file = "data/REFdat.rds")
+
+
+
+# =    6  STUDENT NUMBERS  =====================================================
+#
+# An asymmetric function to retrieve a MYSPE species
+#
+sDat <- readRDS(file = "data/sDat.rds")
+
+students <- read.csv("../BCH441-2021-students.csv")
+sN <- students$Integration.ID
+sN <- sN[! is.na(sN)]
+sN <- as.character(sN)
+sN <- c("1003141593", sN)  # will map to  "Sporothrix schenckii"
+
+set.seed(112358)
+theseSpecies <- sDat[sample(1:nrow(sDat)), ]
+all(sort(theseSpecies$name) == sort(sDat$name))
+nrow((theseSpecies))
+(iX <- grep("Sporothrix schenckii", theseSpecies$name))
+theseSpecies <- rbind(theseSpecies[iX, ], theseSpecies[-iX, ])
+rndMin <-  992000000
+rndMax <- 1020000000
+N <- 10000
+keys <- as.character(sample(rndMin:rndMax, N + 1000))
+keys <- keys[! (keys %in% sN)]
+keys <- keys[1:N]
+keys[1:length(sN)] <- sN
+
+nRep <- floor(N/nrow(theseSpecies))
+MYSPEdat <- theseSpecies
+for(i in 1:nRep) {
+  MYSPEdat <- rbind(MYSPEdat, theseSpecies)
+}
+MYSPEdat <- MYSPEdat[1:N, ]
+for (i in 1:N) {
+  rownames(MYSPEdat)[i] <- digest::digest(keys[i], algo = "md5")
+}
+set.seed(NULL)
+MYSPEdat <- MYSPEdat[sample(1:N), ]
+
+# saveRDS(MYSPEdat, file = "data/MYSPEdat.rds")
+
+# === validate
+x <- character()
+for (n in sN) {
+  sp <- getMYSPE(n)
+  if (length(sp) != 1) {
+    stop(print(as.character(n)))
+  } else {
+    x <- c(x, sp)
+  }
+}
+
+# === species for late-comers
+y <- unique(MYSPEdat$species)
+print(y[!(y %in% x)])
+
+
+# === validate
+l <- length(sN)
+sp <- character(l)
+for(i in 1:l) {
+  sp[i] <- getMYSPE(sN[i])
+}
+any(duplicated(sp))
+length(unique(sp))
+which(! sDat$species %in% sp)  # these can be assigned to late-comers
+
+# Done.
+
+# [END]
--- a/scripts/ABC-makeSTRINGedges.R
+++ b/scripts/ABC-makeSTRINGedges.R
@@ -1,168 +1,168 @@
-# tocID <- "scripts/ABC-makeSTRINGedges.R"
-#
-# Create a subnetwork of high-confidence human STRING edges.
-#
-# Notes:
-#
-#      The large source- datafile is NOT posted to github. If you want to
-#      experiment with the original data, download it and place it into your
-#      local  ./data  directory.
-#
-#      STRING data source:
-#        Download page:
-# https://string-db.org/cgi/download.pl?species_text=Homo+sapiens
-#        Data: (127.6 Mb)
-# https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz
-#
-# Version:  1.0
-#
-# Date:     2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           1.0    Rewrite
-#
-# TODO:
-#
-# ==============================================================================
-
-
-#TOC> ==========================================================================
-#TOC> 
-#TOC>   Section  Title                             Line
-#TOC> -------------------------------------------------
-#TOC>   1        Initialize                          44
-#TOC>   2        Read STRING Data                    51
-#TOC>   3        Define cutoff and subset            63
-#TOC>   4        Drop  duplicates                   103
-#TOC>   5        Simple statistics                  127
-#TOC>   6        Write to file                      160
-#TOC> 
-#TOC> ==========================================================================
-
-
-# =    1  Initialize  ==========================================================
-
-if (! requireNamespace("readr", quietly = TRUE)) {
-  install.packages("readr")
-}
-
-
-# =    2  Read STRING Data  ====================================================
-
-# Read STRING Data (needs to be downloaded from database, see URL in Notes)
-# The .gz compressed version is 127.6MB, the uncompressed version is probably
-# 848 Mb. Fortunately readr:: can read from compressed
-# files, and does so automatically, based on the file extension.
-( fn <- file.path("~", "9606.protein.links.full.v11.0.txt.gz") )
-STR <- readr::read_delim(fn, delim = " ")
-nrow(STR)  #  11,759,454 rows
-head(STR)
-
-
-# =    3  Define cutoff and subset  ============================================
-
-# approximate distribution of combined_score
-hist(sample(STR$combined_score, 10000), breaks = 50, col = "#6699FF")
-
-# Let's table the counts >= 850 and plot them for better resolution.
-
-myTb <- table(STR$combined_score[STR$combined_score >= 850])
-is.unsorted(as.integer(names(myTb)))  # Good - they are all in order
-
-plot(myTb, type = "b", cex = 0.5, col = "#BB0000")
-myTb[myTb == max(myTb)]  # Apparently there is an algorithmic effect that
-                         # frequently assigns a combined score of 0.900
-
-# Let's plot these counts as cumulative sums, in reverse order, scaled
-# as combined scores.
-myX <- 1 - (1:length(myTb)) / 1000   # x-values, decreasing
-plot(myX,
-     cumsum(myTb[length(myTb):1]),   # cumulative sum, decreasing
-     xlim = c(1.0, 0.85),            # reverse x-axis
-     type = "l",
-     main = "STRING interactions for 9606 (top 600,000)",
-     xlab = "combined_score",
-     ylab = "cumulative counts",
-     col = "#CC0000")
-abline(h = seq(50000, sum(myTb), by = 50000), lwd = 0.5, col = "#DDDDFF")
-
-# What's the cutoff for 100,000 edges?
-which(cumsum(myTb[length(myTb):1]) >= 100000)[1] # p = 0.964
-
-# confirm
-sum(STR$combined_score >= 964) # 101,348
-abline(v = 0.964, lwd = 0.5, col = "#DDDDFF")
-
-# subset the table, and use only the protein IDs and the combined_score
-STR <- STR[STR$combined_score >= 964,
-            c("protein1", "protein2", "combined_score")]
-colnames(STR) <- c("a", "b", "score")
-
-
-# =    4  Drop  duplicates  ====================================================
-
-# identify duplicate interactions by creating keys in a defined alphabetical
-# sort order, then checking for  duplicated().
-# e.g  if we have (X:U, U:X), we change U:X to X:U and now find that
-# (X:U, X:U) has a duplicate.
-
-AB <- STR$a < STR$b        # logical vector: genes we need to swap
-tmp <- STR$b               # copy column b
-STR$b[AB] <- STR$a[AB]     # copy a's into b
-STR$a[AB] <- tmp[AB]       # copy tmp's into a
-all(STR$a >= STR$b)        # confirm: TRUE
-
-# now, make combined keys, like this:
-paste0(STR$a[1:10], ":", STR$b[1:10])
-
-tmp <- paste0(STR$a, ":", STR$b)
-sum(duplicated(tmp)) # That's half of them ... i.e. STRING reports
-                     # both a:b and b:a !
-
-# drop all duplicated interactions from tmp
-STR <- STR[ ! duplicated(tmp), ]   # 50,674 interactions remain
-
-
-# =    5  Simple statistics  ===================================================
-
-# how many unique genes?
-length(unique(c(STR$a, STR$b)))   # 8,445
-
-# how many self-edges?
-sum(STR$a == STR$b)  # none
-
-# log(rank) / log(frequency)
-myTbl <- table(c(STR$a, STR$b))
-myTbl <- myTbl[order(myTbl, decreasing = TRUE)]
-
-hist(myTbl, breaks = 40, col = "#FFEEBB")
-
-# number of singletons
-sum(myTbl == 1) # almost a quarter
-
-# maximum?
-myTbl[which(myTbl == max(myTbl))]  # 9606.ENSP00000360532: 465
-                                   # Google: CDC5L
-
-# Zipf-plot
-plot(log(1:length(myTbl)), log(as.numeric(myTbl)),
-     type = "b", cex = 0.7,
-     main = "STRINGedges - degrees",
-     xlab = "log(rank)",
-     ylab = "log(frequency)",
-     col = "#FFBB88")
-
-sprintf("Average number of interactions: %5.2f",
-         nrow(STR) / length(unique(c(STR$a, STR$b))))
-
-
-# =    6  Write to file  =======================================================
-
-saveRDS(STR, file = "./data/STRINGedges.rds")
-
-# STRINGedges <- readRDS("./data/STRINGedges.rds")  # use this to restore the
-                                                    # object when needed
-
-
-# [END]
+# tocID <- "scripts/ABC-makeSTRINGedges.R"
+#
+# Create a subnetwork of high-confidence human STRING edges.
+#
+# Notes:
+#
+#      The large source- datafile is NOT posted to github. If you want to
+#      experiment with the original data, download it and place it into your
+#      local  ./data  directory.
+#
+#      STRING data source:
+#        Download page:
+# https://string-db.org/cgi/download.pl?species_text=Homo+sapiens
+#        Data: (127.6 Mb)
+# https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz
+#
+# Version:  1.0
+#
+# Date:     2020-09
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.0    Rewrite
+#
+# TODO:
+#
+# ==============================================================================
+
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                             Line
+#TOC> -------------------------------------------------
+#TOC>   1        Initialize                          44
+#TOC>   2        Read STRING Data                    51
+#TOC>   3        Define cutoff and subset            63
+#TOC>   4        Drop  duplicates                   103
+#TOC>   5        Simple statistics                  127
+#TOC>   6        Write to file                      160
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  Initialize  ==========================================================
+
+if (! requireNamespace("readr", quietly = TRUE)) {
+  install.packages("readr")
+}
+
+
+# =    2  Read STRING Data  ====================================================
+
+# Read STRING Data (needs to be downloaded from database, see URL in Notes)
+# The .gz compressed version is 127.6MB, the uncompressed version is probably
+# 848 Mb. Fortunately readr:: can read from compressed
+# files, and does so automatically, based on the file extension.
+( fn <- file.path("~", "9606.protein.links.full.v11.0.txt.gz") )
+STR <- readr::read_delim(fn, delim = " ")
+nrow(STR)  #  11,759,454 rows
+head(STR)
+
+
+# =    3  Define cutoff and subset  ============================================
+
+# approximate distribution of combined_score
+hist(sample(STR$combined_score, 10000), breaks = 50, col = "#6699FF")
+
+# Let's table the counts >= 850 and plot them for better resolution.
+
+myTb <- table(STR$combined_score[STR$combined_score >= 850])
+is.unsorted(as.integer(names(myTb)))  # Good - they are all in order
+
+plot(myTb, type = "b", cex = 0.5, col = "#BB0000")
+myTb[myTb == max(myTb)]  # Apparently there is an algorithmic effect that
+                         # frequently assigns a combined score of 0.900
+
+# Let's plot these counts as cumulative sums, in reverse order, scaled
+# as combined scores.
+myX <- 1 - (1:length(myTb)) / 1000   # x-values, decreasing
+plot(myX,
+     cumsum(myTb[length(myTb):1]),   # cumulative sum, decreasing
+     xlim = c(1.0, 0.85),            # reverse x-axis
+     type = "l",
+     main = "STRING interactions for 9606 (top 600,000)",
+     xlab = "combined_score",
+     ylab = "cumulative counts",
+     col = "#CC0000")
+abline(h = seq(50000, sum(myTb), by = 50000), lwd = 0.5, col = "#DDDDFF")
+
+# What's the cutoff for 100,000 edges?
+which(cumsum(myTb[length(myTb):1]) >= 100000)[1] # p = 0.964
+
+# confirm
+sum(STR$combined_score >= 964) # 101,348
+abline(v = 0.964, lwd = 0.5, col = "#DDDDFF")
+
+# subset the table, and use only the protein IDs and the combined_score
+STR <- STR[STR$combined_score >= 964,
+            c("protein1", "protein2", "combined_score")]
+colnames(STR) <- c("a", "b", "score")
+
+
+# =    4  Drop  duplicates  ====================================================
+
+# identify duplicate interactions by creating keys in a defined alphabetical
+# sort order, then checking for  duplicated().
+# e.g  if we have (X:U, U:X), we change U:X to X:U and now find that
+# (X:U, X:U) has a duplicate.
+
+AB <- STR$a < STR$b        # logical vector: genes we need to swap
+tmp <- STR$b               # copy column b
+STR$b[AB] <- STR$a[AB]     # copy a's into b
+STR$a[AB] <- tmp[AB]       # copy tmp's into a
+all(STR$a >= STR$b)        # confirm: TRUE
+
+# now, make combined keys, like this:
+paste0(STR$a[1:10], ":", STR$b[1:10])
+
+tmp <- paste0(STR$a, ":", STR$b)
+sum(duplicated(tmp)) # That's half of them ... i.e. STRING reports
+                     # both a:b and b:a !
+
+# drop all duplicated interactions from tmp
+STR <- STR[ ! duplicated(tmp), ]   # 50,674 interactions remain
+
+
+# =    5  Simple statistics  ===================================================
+
+# how many unique genes?
+length(unique(c(STR$a, STR$b)))   # 8,445
+
+# how many self-edges?
+sum(STR$a == STR$b)  # none
+
+# log(rank) / log(frequency)
+myTbl <- table(c(STR$a, STR$b))
+myTbl <- myTbl[order(myTbl, decreasing = TRUE)]
+
+hist(myTbl, breaks = 40, col = "#FFEEBB")
+
+# number of singletons
+sum(myTbl == 1) # almost a quarter
+
+# maximum?
+myTbl[which(myTbl == max(myTbl))]  # 9606.ENSP00000360532: 465
+                                   # Google: CDC5L
+
+# Zipf-plot
+plot(log(1:length(myTbl)), log(as.numeric(myTbl)),
+     type = "b", cex = 0.7,
+     main = "STRINGedges - degrees",
+     xlab = "log(rank)",
+     ylab = "log(frequency)",
+     col = "#FFBB88")
+
+sprintf("Average number of interactions: %5.2f",
+         nrow(STR) / length(unique(c(STR$a, STR$b))))
+
+
+# =    6  Write to file  =======================================================
+
+saveRDS(STR, file = "./data/STRINGedges.rds")
+
+# STRINGedges <- readRDS("./data/STRINGedges.rds")  # use this to restore the
+                                                    # object when needed
+
+
+# [END]
--- a/scripts/ABC-makeScCCnet.R
+++ b/scripts/ABC-makeScCCnet.R
@@ -1,167 +1,167 @@
-# tocID <- "scripts/ABC-makeScCCnet.R"
-#
-# Create a subnetwork of high-confidence yeast genes with a "mitotic cell cycle"
-# GOSlim annotation.
-#
-# Boris Steipe for ABC learning units
-#
-# Notes:
-#
-#      The large source- datafiles are NOT posted to github. If you want to
-#      experiment with your own code, download them and place them into your
-#      local  ./data  directory.
-#
-#      STRING data source:
-#        Download page:
-# https://string-db.org/cgi/download.pl?species_text=Saccharomyces+cerevisiae
-#        Data: (20.1 mb)
-# https://stringdb-static.org/download/protein.links.full.v11.0/4932.protein.links.full.v11.0.txt.gz
-#
-#      GOSlim data source: (Note: this has moved from GO to SGD)
-#        Info page: https://www.yeastgenome.org/downloads
-#        Info page: http://sgd-archive.yeastgenome.org/curation/literature/
-#        Data: (3 mb)
-# http://sgd-archive.yeastgenome.org/curation/literature/go_slim_mapping.tab
-#
-#
-# Version:  1.2
-#
-# Date:     2017-10  -  2020-09
-# Author:   Boris Steipe (boris.steipe@utoronto.ca)
-#
-# Versions:
-#           1.2    2020 Update. GO Slim Yeast mow at SGD
-#           1.1    Change from require() to requireNamespace(),
-#                      use <package>::<function>() idiom throughout
-#           1.0    First code copied from 2016 material.
-#
-# TODO:
-#
-# ==============================================================================
-# SRCDIR <- "./instructor"
-
-
-#TOC> ==========================================================================
-#TOC> 
-#TOC>   Section  Title                                           Line
-#TOC> ---------------------------------------------------------------
-#TOC>   1        INITIALIZE                                        58
-#TOC>   2        STRING FUNCTIONAL INTERACTION DATA                66
-#TOC>   3        GOSlim FUNCTIONAL ANNOTATIONS                     96
-#TOC>   3.1        Intersect interactions and annotations         122
-#TOC>   4        DEFINE THE CELL-CYCLE NETWORK                    128
-#TOC> 
-#TOC> ==========================================================================
-
-
-# =    1  INITIALIZE  ==========================================================
-
-SRCDIR <- "./data"
-if (! requireNamespace("readr", quietly = TRUE)) {
-  install.packages("readr")
-}
-
-
-# =    2  STRING FUNCTIONAL INTERACTION DATA  ==================================
-
-# Read STRING Data (needs to be downloaded from database, see URL in Notes)
-# The .gz compressed version is 20MB, the uncompressed versioj is 110MB -
-# really not necessary to uncompress since readr:: can read from compressed
-# files, and does so automatically, based on the file extension.
-( fn <- file.path(SRCDIR, "4932.protein.links.full.v11.0.txt.gz") )
-STR <- readr::read_delim(fn, delim = " ")
-
-# Subset only IDs and combined_score column
-STR <- STR[ , c("protein1", "protein2", "combined_score")]
-
-# head(STR)
-# sum(STR$combined_score > 909)  # 100270 edges
-# subset for 100,000 highest confidence edges
-STR <- STR[(STR$combined_score > 909), ]
-head(STR)
-
-# IDs are formatted like 4932.YAL005C ... drop the "4932." prefix
-STR$protein1 <- gsub("^4932\\.", "", STR$protein1)
-STR$protein2 <- gsub("^4932\\.", "", STR$protein2)
-head(STR)
-
-# get a vector of gene names in this list
-myIntxGenes <- unique(c(STR$protein1, STR$protein2))  # yeast systematic gene
-                                                      # names
-length(myIntxGenes)
-sample(myIntxGenes, 10)  # choose 10 at random (sanity check)
-
-
-# =    3  GOSlim FUNCTIONAL ANNOTATIONS  =======================================
-#
-# Read GOSlim data  (needs to be downloaded from database, see URL in Notes)
-( fn <- file.path(SRCDIR, "go_slim_mapping.tab") )
-
-Gsl <- readr::read_tsv(fn,
-                       col_names = c("ID",
-                                     "name",
-                                     "SGDId",
-                                     "Ontology",
-                                     "termName",
-                                     "termID",
-                                     "status"))
-
-head(Gsl)
-
-# What cell cycle names does it contain?
-myGslTermNames <- unique(Gsl$termName)  # 169 unique terms
-myGslTermNames[grep("cycle", myGslTermNames)]
-# [1] "regulation of cell cycle"  "mitotic cell cycle"  "meiotic cell cycle"
-
-# Choose "mitotic cell cycle" as the GOslim term to subset with
-
-scCCgenes <- unique(Gsl$ID[Gsl$termName == "mitotic cell cycle"])
-length(scCCgenes)  # 324 genes annotated to that term
-
-# ==   3.1  Intersect interactions and annotations  ============================
-
-sum(scCCgenes %in% myIntxGenes)  # 307 of these have high-confidence
-#                                # functional interactions
-
-
-# =    4  DEFINE THE CELL-CYCLE NETWORK  =======================================
-#
-# Define scCCnet ... the S. Cervisiae Cell Cycle network
-# Subset all rows for which BOTH genes are in the GOslim cell cycle set
-#
-scCCnet <- STR[(STR$protein1 %in% scCCgenes) &
-               (STR$protein2 %in% scCCgenes), ]
-
-# How many genes are there?
-length(unique(c(scCCnet$protein1, scCCnet$protein2)))  #283
-
-# Each edge is listed twice - now remove duplicates.
-
-# Step 1: make a vector: sort two names so the fiRst one is alphabetically
-#         smaller Than the second one. This brings the two names into a defined
-#         order. Then concatenate them with a "." - the resulting string
-#         is always the same, for any order. E.g. c("A", "B") gives "A.B"
-#         and c("B", "A") also gives "A.B". This identifies duplicates.
-
-x <- apply(cbind(scCCnet$protein1, scCCnet$protein2),
-           1,
-           FUN = function(x) { return(paste(sort(x), collapse = ".")) })
-head(x) # "YAL016W.YGR040W" "YAL016W.YOR014W" "YAL016W.YDL188C" ... etc.
-
-sum(duplicated(x))  # 1453
-
-# Step 2: drop all rows that contain duplicates in x
-scCCnet <- scCCnet[! duplicated(x), ]
-
-# Confirm we didn't loose genes
-length(unique(c(scCCnet$protein1, scCCnet$protein2)))  # 283, no change
-nrow(scCCnet)
-# Network has 283 nodes, 1453 edges
-
-saveRDS(scCCnet, file = "./data/scCCnet.rds")
-
-# scCCnet <- readRDS("./data/scCCnet.rds")   # <<<- use this to restore the
-                                             #      object when needed
-
-
-# [END]
+# tocID <- "scripts/ABC-makeScCCnet.R"
+#
+# Create a subnetwork of high-confidence yeast genes with a "mitotic cell cycle"
+# GOSlim annotation.
+#
+# Boris Steipe for ABC learning units
+#
+# Notes:
+#
+#      The large source- datafiles are NOT posted to github. If you want to
+#      experiment with your own code, download them and place them into your
+#      local  ./data  directory.
+#
+#      STRING data source:
+#        Download page:
+# https://string-db.org/cgi/download.pl?species_text=Saccharomyces+cerevisiae
+#        Data: (20.1 mb)
+# https://stringdb-static.org/download/protein.links.full.v11.0/4932.protein.links.full.v11.0.txt.gz
+#
+#      GOSlim data source: (Note: this has moved from GO to SGD)
+#        Info page: https://www.yeastgenome.org/downloads
+#        Info page: http://sgd-archive.yeastgenome.org/curation/literature/
+#        Data: (3 mb)
+# http://sgd-archive.yeastgenome.org/curation/literature/go_slim_mapping.tab
+#
+#
+# Version:  1.2
+#
+# Date:     2017-10  -  2020-09
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           1.2    2020 Update. GO Slim Yeast mow at SGD
+#           1.1    Change from require() to requireNamespace(),
+#                      use <package>::<function>() idiom throughout
+#           1.0    First code copied from 2016 material.
+#
+# TODO:
+#
+# ==============================================================================
+# SRCDIR <- "./instructor"
+
+
+#TOC> ==========================================================================
+#TOC> 
+#TOC>   Section  Title                                           Line
+#TOC> ---------------------------------------------------------------
+#TOC>   1        INITIALIZE                                        58
+#TOC>   2        STRING FUNCTIONAL INTERACTION DATA                66
+#TOC>   3        GOSlim FUNCTIONAL ANNOTATIONS                     96
+#TOC>   3.1        Intersect interactions and annotations         122
+#TOC>   4        DEFINE THE CELL-CYCLE NETWORK                    128
+#TOC> 
+#TOC> ==========================================================================
+
+
+# =    1  INITIALIZE  ==========================================================
+
+SRCDIR <- "./data"
+if (! requireNamespace("readr", quietly = TRUE)) {
+  install.packages("readr")
+}
+
+
+# =    2  STRING FUNCTIONAL INTERACTION DATA  ==================================
+
+# Read STRING Data (needs to be downloaded from database, see URL in Notes)
+# The .gz compressed version is 20MB, the uncompressed versioj is 110MB -
+# really not necessary to uncompress since readr:: can read from compressed
+# files, and does so automatically, based on the file extension.
+( fn <- file.path(SRCDIR, "4932.protein.links.full.v11.0.txt.gz") )
+STR <- readr::read_delim(fn, delim = " ")
+
+# Subset only IDs and combined_score column
+STR <- STR[ , c("protein1", "protein2", "combined_score")]
+
+# head(STR)
+# sum(STR$combined_score > 909)  # 100270 edges
+# subset for 100,000 highest confidence edges
+STR <- STR[(STR$combined_score > 909), ]
+head(STR)
+
+# IDs are formatted like 4932.YAL005C ... drop the "4932." prefix
+STR$protein1 <- gsub("^4932\\.", "", STR$protein1)
+STR$protein2 <- gsub("^4932\\.", "", STR$protein2)
+head(STR)
+
+# get a vector of gene names in this list
+myIntxGenes <- unique(c(STR$protein1, STR$protein2))  # yeast systematic gene
+                                                      # names
+length(myIntxGenes)
+sample(myIntxGenes, 10)  # choose 10 at random (sanity check)
+
+
+# =    3  GOSlim FUNCTIONAL ANNOTATIONS  =======================================
+#
+# Read GOSlim data  (needs to be downloaded from database, see URL in Notes)
+( fn <- file.path(SRCDIR, "go_slim_mapping.tab") )
+
+Gsl <- readr::read_tsv(fn,
+                       col_names = c("ID",
+                                     "name",
+                                     "SGDId",
+                                     "Ontology",
+                                     "termName",
+                                     "termID",
+                                     "status"))
+
+head(Gsl)
+
+# What cell cycle names does it contain?
+myGslTermNames <- unique(Gsl$termName)  # 169 unique terms
+myGslTermNames[grep("cycle", myGslTermNames)]
+# [1] "regulation of cell cycle"  "mitotic cell cycle"  "meiotic cell cycle"
+
+# Choose "mitotic cell cycle" as the GOslim term to subset with
+
+scCCgenes <- unique(Gsl$ID[Gsl$termName == "mitotic cell cycle"])
+length(scCCgenes)  # 324 genes annotated to that term
+
+# ==   3.1  Intersect interactions and annotations  ============================
+
+sum(scCCgenes %in% myIntxGenes)  # 307 of these have high-confidence
+#                                # functional interactions
+
+
+# =    4  DEFINE THE CELL-CYCLE NETWORK  =======================================
+#
+# Define scCCnet ... the S. Cervisiae Cell Cycle network
+# Subset all rows for which BOTH genes are in the GOslim cell cycle set
+#
+scCCnet <- STR[(STR$protein1 %in% scCCgenes) &
+               (STR$protein2 %in% scCCgenes), ]
+
+# How many genes are there?
+length(unique(c(scCCnet$protein1, scCCnet$protein2)))  #283
+
+# Each edge is listed twice - now remove duplicates.
+
+# Step 1: make a vector: sort two names so the fiRst one is alphabetically
+#         smaller Than the second one. This brings the two names into a defined
+#         order. Then concatenate them with a "." - the resulting string
+#         is always the same, for any order. E.g. c("A", "B") gives "A.B"
+#         and c("B", "A") also gives "A.B". This identifies duplicates.
+
+x <- apply(cbind(scCCnet$protein1, scCCnet$protein2),
+           1,
+           FUN = function(x) { return(paste(sort(x), collapse = ".")) })
+head(x) # "YAL016W.YGR040W" "YAL016W.YOR014W" "YAL016W.YDL188C" ... etc.
+
+sum(duplicated(x))  # 1453
+
+# Step 2: drop all rows that contain duplicates in x
+scCCnet <- scCCnet[! duplicated(x), ]
+
+# Confirm we didn't loose genes
+length(unique(c(scCCnet$protein1, scCCnet$protein2)))  # 283, no change
+nrow(scCCnet)
+# Network has 283 nodes, 1453 edges
+
+saveRDS(scCCnet, file = "./data/scCCnet.rds")
+
+# scCCnet <- readRDS("./data/scCCnet.rds")   # <<<- use this to restore the
+                                             #      object when needed
+
+
+# [END]
--- a/scripts/ABC-writeALN.R
+++ b/scripts/ABC-writeALN.R
@@ -1,135 +1,135 @@
-# tocID <- "scripts/ABC-writeALN.R"
-#
-# ToDo:    calculate consensus line
-#          append sequence numbers
-# Notes:
-#
-# ==============================================================================
-
-
-writeALN <- function(ali,
-                     range,
-                     note = "",
-                     myCon = stdout(),
-                     blockWidth = 60) {
-  # Purpose:
-  #     Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or
-  #     a file in multi-FASTA format.
-  # Version: 2.0
-  # Date:    2017 10
-  # Author:  Boris Steipe
-  #
-  # Parameters:
-  #     ali             MsaAAMultipleAlignment or AAStringSet or character
-  #                       vector.
-  #     range      num  a two-integer vector of start and end positions if
-  #                       only a range of the MSA should be written, e.g.
-  #                       a domain. Defaults to the full alignment length.
-  #     note       chr  a vector of character that is appended to the name
-  #                       of a sequence in the FASTA header. Recycling of
-  #                       shorter vectors applies, thus a vector of length one
-  #                       is added to all headers.
-  #     myCon           a connection (cf. the con argument for writeLines).
-  #                       Defaults to stdout()
-  #     blockWidth int  width of sequence block. Default 80 characters.
-  # Value:
-  #     NA   the function is invoked for its side effect of printing an
-  #          alignment to stdout() or file.
-
-  blockWidth <- as.integer(blockWidth)
-  if (is.na(blockWidth)) {
-    stop("PANIC: parameter \"blockWidth\" must be numeric.")
-  }
-  if (blockWidth < 1) {
-    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
-  }
-  if (blockWidth > 60) {
-    warning("Programs that read CLUSTAL format might not expect blockWidth > 60.")
-  }
-
-  # Extract the raw data from the objects depending on their respective class
-  # and put it into a named vector of strings.
-
-  # Extract XStringSet from MsaXMultipleAlignment ...
-  if (class(ali) == "MsaAAMultipleAlignment" |
-      class(ali) == "MsaDNAMultipleAlignment" |
-      class(ali) == "MsaRNAMultipleAlignment") {
-      ali <- ali@unmasked
-  }
-
-  # Process XStringSet
-  if (class(ali) == "AAStringSet" |
-      class(ali) == "DNAStringSet" |
-      class(ali) == "RNAStringSet") {
-    sSet <- as.character(ali) # we use as.character(), not toString() thus
-                              # we don't _have_ to load Biostrings
-  } else if (class(ali) == "character") {
-    sSet <- ali
-  } else {
-    stop(paste("Input object of class",
-               class(ali),
-               "can't be handled by this function."))
-  }
-
-  if (missing(range)) {
-    range <- 1
-    range[2] <- max(nchar(sSet))
-  } else {
-    range <- as.integer(range)
-    if(length(range) != 2 ||
-       any(is.na(range)) ||
-       range[1] > range[2] ||
-       range[1] < 1) {
-      stop("PANIC: \"range\" parameter must contain valid start and end index.")
-    }
-  }
-
-  # Right-pad any sequence with "-" that is shorter than ranges[2]
-    for (i in seq_along(sSet)) {
-      if (nchar(sSet[i]) < range[2]) {
-        sSet[i] <- paste0(sSet[i],
-                          paste0(rep("-", range[2] - nchar(sSet[i])),
-                                 collapse = ""))
-      }
-    }
-
-  # Right-pad sequence names
-  sNames <- names(sSet)
-  len <- max(nchar(sNames)) + 2 # longest name plus two spaces
-  for (i in seq_along(sNames)) {
-    sNames[i] <- paste0(sNames[i],
-                      paste0(rep(" ", len - nchar(sNames[i])),
-                             collapse = ""))
-  }
-
-
-  # Process each sequence
-  txt <- paste0("CLUSTAL W format. ", note)
-  txt[2] <- ""
-
-  iStarts <- seq(range[1], range[2], by = blockWidth)
-  iEnds <- c((iStarts[-1] - 1), range[2])
-
-  for (i in seq_along(iStarts)) {
-    for (j in seq_along(sSet)) {
-      txt <- c(txt,
-               paste0(sNames[j], substring(sSet[j], iStarts[i], iEnds[i])))
-    }
-    txt <- c(txt, "")  # append a blank consenus line
-    txt <- c(txt, "")  # append a separator line
-  }
-
-  writeLines(txt, con= myCon)
-
-}
-
-# ====  TESTS  =================================================================
-# Enter your function tests here...
-
-if (FALSE) {
-  # test ...
-}
-
-
-
-# [END]
+# tocID <- "scripts/ABC-writeALN.R"
+#
+# ToDo:    calculate consensus line
+#          append sequence numbers
+# Notes:
+#
+# ==============================================================================
+
+
+writeALN <- function(ali,
+                     range,
+                     note = "",
+                     myCon = stdout(),
+                     blockWidth = 60) {
+  # Purpose:
+  #     Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or
+  #     a file in multi-FASTA format.
+  # Version: 2.0
+  # Date:    2017 10
+  # Author:  Boris Steipe
+  #
+  # Parameters:
+  #     ali             MsaAAMultipleAlignment or AAStringSet or character
+  #                       vector.
+  #     range      num  a two-integer vector of start and end positions if
+  #                       only a range of the MSA should be written, e.g.
+  #                       a domain. Defaults to the full alignment length.
+  #     note       chr  a vector of character that is appended to the name
+  #                       of a sequence in the FASTA header. Recycling of
+  #                       shorter vectors applies, thus a vector of length one
+  #                       is added to all headers.
+  #     myCon           a connection (cf. the con argument for writeLines).
+  #                       Defaults to stdout()
+  #     blockWidth int  width of sequence block. Default 80 characters.
+  # Value:
+  #     NA   the function is invoked for its side effect of printing an
+  #          alignment to stdout() or file.
+
+  blockWidth <- as.integer(blockWidth)
+  if (is.na(blockWidth)) {
+    stop("PANIC: parameter \"blockWidth\" must be numeric.")
+  }
+  if (blockWidth < 1) {
+    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
+  }
+  if (blockWidth > 60) {
+    warning("Programs that read CLUSTAL format might not expect blockWidth > 60.")
+  }
+
+  # Extract the raw data from the objects depending on their respective class
+  # and put it into a named vector of strings.
+
+  # Extract XStringSet from MsaXMultipleAlignment ...
+  if (class(ali) == "MsaAAMultipleAlignment" |
+      class(ali) == "MsaDNAMultipleAlignment" |
+      class(ali) == "MsaRNAMultipleAlignment") {
+      ali <- ali@unmasked
+  }
+
+  # Process XStringSet
+  if (class(ali) == "AAStringSet" |
+      class(ali) == "DNAStringSet" |
+      class(ali) == "RNAStringSet") {
+    sSet <- as.character(ali) # we use as.character(), not toString() thus
+                              # we don't _have_ to load Biostrings
+  } else if (class(ali) == "character") {
+    sSet <- ali
+  } else {
+    stop(paste("Input object of class",
+               class(ali),
+               "can't be handled by this function."))
+  }
+
+  if (missing(range)) {
+    range <- 1
+    range[2] <- max(nchar(sSet))
+  } else {
+    range <- as.integer(range)
+    if(length(range) != 2 ||
+       any(is.na(range)) ||
+       range[1] > range[2] ||
+       range[1] < 1) {
+      stop("PANIC: \"range\" parameter must contain valid start and end index.")
+    }
+  }
+
+  # Right-pad any sequence with "-" that is shorter than ranges[2]
+    for (i in seq_along(sSet)) {
+      if (nchar(sSet[i]) < range[2]) {
+        sSet[i] <- paste0(sSet[i],
+                          paste0(rep("-", range[2] - nchar(sSet[i])),
+                                 collapse = ""))
+      }
+    }
+
+  # Right-pad sequence names
+  sNames <- names(sSet)
+  len <- max(nchar(sNames)) + 2 # longest name plus two spaces
+  for (i in seq_along(sNames)) {
+    sNames[i] <- paste0(sNames[i],
+                      paste0(rep(" ", len - nchar(sNames[i])),
+                             collapse = ""))
+  }
+
+
+  # Process each sequence
+  txt <- paste0("CLUSTAL W format. ", note)
+  txt[2] <- ""
+
+  iStarts <- seq(range[1], range[2], by = blockWidth)
+  iEnds <- c((iStarts[-1] - 1), range[2])
+
+  for (i in seq_along(iStarts)) {
+    for (j in seq_along(sSet)) {
+      txt <- c(txt,
+               paste0(sNames[j], substring(sSet[j], iStarts[i], iEnds[i])))
+    }
+    txt <- c(txt, "")  # append a blank consenus line
+    txt <- c(txt, "")  # append a separator line
+  }
+
+  writeLines(txt, con= myCon)
+
+}
+
+# ====  TESTS  =================================================================
+# Enter your function tests here...
+
+if (FALSE) {
+  # test ...
+}
+
+
+
+# [END]
--- a/scripts/ABC-writeMFA.R
+++ b/scripts/ABC-writeMFA.R
@@ -1,121 +1,121 @@
-# ABC-writeMFA.R
-#
-# ToDo:
-# Notes:  2.1  bugfix: empty notes caused superfluous blank after header.
-#
-#
-# ==============================================================================
-
-
-writeMFA <- function(ali,
-                     range,
-                     note = "",
-                     myCon = stdout(),
-                     blockWidth = 80) {
-  # Purpose:
-  #     Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or
-  #     a file in multi-FASTA format.
-  # Version: 2.1
-  # Date:    2017  10
-  # Author:  Boris Steipe
-  #
-  # Parameters:
-  #     ali             MsaAAMultipleAlignment or AAStringSet or character
-  #                       vector
-  #     range      num  a two-integer vector of start and end positions if
-  #                       only a range of the MSA should be written, e.g.
-  #                       a domain. Defaults to the full sequence length.
-  #     note       chr  a vector of character that is appended to the name
-  #                       of a sequence in the FASTA header. Recycling of
-  #                       shorter vectors applies, thus a vector of length one
-  #                       is added to all headers.
-  #     myCon           a connection (cf. the con argument for writeLines).
-  #                       Defaults to stdout()
-  #     blockWidth int  width of sequence block. Default 80 characters.
-  # Value:
-  #     NA   the function is invoked for its side effect of printing an
-  #          alignment to stdout() or file.
-
-  blockWidth <- as.integer(blockWidth)
-  if (is.na(blockWidth)) {
-    stop("PANIC: parameter \"blockWidth\" must be numeric.")
-  }
-  if (! blockWidth > 0){
-    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
-  }
-
-  # Extract the raw data from the objects depending on their respective class
-  # and put it into a named vector of strings.
-
-  # Extract XStringSet from MsaXMultipleAlignment ...
-  if (class(ali) == "MsaAAMultipleAlignment" |
-      class(ali) == "MsaDNAMultipleAlignment" |
-      class(ali) == "MsaRNAMultipleAlignment") {
-      ali <- ali@unmasked
-  }
-
-  # Process XStringSet
-  if (class(ali) == "AAStringSet" |
-      class(ali) == "DNAStringSet" |
-      class(ali) == "RNAStringSet") {
-    sSet <- as.character(ali) # we use as.character(), not toString() thus
-                              # we don't _have_ to load Biostrings
-  } else if (class(ali) == "character") {
-    sSet <- ali
-  } else {
-    stop(paste("Input object of class",
-               class(ali),
-               "can't be handled by this function."))
-  }
-
-  if (missing(range)) {
-    range <- 1
-    range[2] <- max(nchar(sSet))
-  } else {
-    range <- as.integer(range)
-    if(length(range) != 2 ||
-       any(is.na(range)) ||
-       range[1] > range[2] ||
-       range[1] < 1) {
-      stop("PANIC: \"range\" parameter must contain valid start and end index.")
-    }
-  }
-
-  # Process each sequence
-  txt <- character()
-  if (note != "") {  # construct header line
-    headers <- paste(names(sSet), note)
-  } else {
-    headers <- names(sSet)
-  }
-
-  for (i in seq_along(sSet)) {
-
-    # output FASTA header
-    txt <- c(txt, sprintf(">%s", headers[i]))
-
-    # output the sequence in blocks of blockWidth per line ...
-    iStarts <- seq(range[1], range[2], by = blockWidth)
-    iEnds <- c((iStarts[-1] - 1), range[2])
-
-    thisSeq <- substring(sSet[i], iStarts, iEnds)  # collect all blocks
-    thisSeq <- thisSeq[! nchar(thisSeq) == 0]      # drop empty blocks
-    txt <- c(txt, thisSeq)
-
-    txt <- c(txt, "")  # append an empty line for readability
-  }
-
-  writeLines(txt, con = myCon)
-
-}
-
-# ====  TESTS  =================================================================
-# Enter your function tests here...
-
-if (FALSE) {
-  # test ...
-}
-
-
-
-# [END]
+# ABC-writeMFA.R
+#
+# ToDo:
+# Notes:  2.1  bugfix: empty notes caused superfluous blank after header.
+#
+#
+# ==============================================================================
+
+
+writeMFA <- function(ali,
+                     range,
+                     note = "",
+                     myCon = stdout(),
+                     blockWidth = 80) {
+  # Purpose:
+  #     Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or
+  #     a file in multi-FASTA format.
+  # Version: 2.1
+  # Date:    2017  10
+  # Author:  Boris Steipe
+  #
+  # Parameters:
+  #     ali             MsaAAMultipleAlignment or AAStringSet or character
+  #                       vector
+  #     range      num  a two-integer vector of start and end positions if
+  #                       only a range of the MSA should be written, e.g.
+  #                       a domain. Defaults to the full sequence length.
+  #     note       chr  a vector of character that is appended to the name
+  #                       of a sequence in the FASTA header. Recycling of
+  #                       shorter vectors applies, thus a vector of length one
+  #                       is added to all headers.
+  #     myCon           a connection (cf. the con argument for writeLines).
+  #                       Defaults to stdout()
+  #     blockWidth int  width of sequence block. Default 80 characters.
+  # Value:
+  #     NA   the function is invoked for its side effect of printing an
+  #          alignment to stdout() or file.
+
+  blockWidth <- as.integer(blockWidth)
+  if (is.na(blockWidth)) {
+    stop("PANIC: parameter \"blockWidth\" must be numeric.")
+  }
+  if (! blockWidth > 0){
+    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
+  }
+
+  # Extract the raw data from the objects depending on their respective class
+  # and put it into a named vector of strings.
+
+  # Extract XStringSet from MsaXMultipleAlignment ...
+  if (class(ali) == "MsaAAMultipleAlignment" |
+      class(ali) == "MsaDNAMultipleAlignment" |
+      class(ali) == "MsaRNAMultipleAlignment") {
+      ali <- ali@unmasked
+  }
+
+  # Process XStringSet
+  if (class(ali) == "AAStringSet" |
+      class(ali) == "DNAStringSet" |
+      class(ali) == "RNAStringSet") {
+    sSet <- as.character(ali) # we use as.character(), not toString() thus
+                              # we don't _have_ to load Biostrings
+  } else if (class(ali) == "character") {
+    sSet <- ali
+  } else {
+    stop(paste("Input object of class",
+               class(ali),
+               "can't be handled by this function."))
+  }
+
+  if (missing(range)) {
+    range <- 1
+    range[2] <- max(nchar(sSet))
+  } else {
+    range <- as.integer(range)
+    if(length(range) != 2 ||
+       any(is.na(range)) ||
+       range[1] > range[2] ||
+       range[1] < 1) {
+      stop("PANIC: \"range\" parameter must contain valid start and end index.")
+    }
+  }
+
+  # Process each sequence
+  txt <- character()
+  if (note != "") {  # construct header line
+    headers <- paste(names(sSet), note)
+  } else {
+    headers <- names(sSet)
+  }
+
+  for (i in seq_along(sSet)) {
+
+    # output FASTA header
+    txt <- c(txt, sprintf(">%s", headers[i]))
+
+    # output the sequence in blocks of blockWidth per line ...
+    iStarts <- seq(range[1], range[2], by = blockWidth)
+    iEnds <- c((iStarts[-1] - 1), range[2])
+
+    thisSeq <- substring(sSet[i], iStarts, iEnds)  # collect all blocks
+    thisSeq <- thisSeq[! nchar(thisSeq) == 0]      # drop empty blocks
+    txt <- c(txt, thisSeq)
+
+    txt <- c(txt, "")  # append an empty line for readability
+  }
+
+  writeLines(txt, con = myCon)
+
+}
+
+# ====  TESTS  =================================================================
+# Enter your function tests here...
+
+if (FALSE) {
+  # test ...
+}
+
+
+
+# [END]
--- a/scripts/BLAST.R
+++ b/scripts/BLAST.R
@@ -1,384 +1,384 @@
-# BLAST.R
-#
-# Purpose: Send off one BLAST search and return parsed list of results
-#          This script uses the BLAST URL-API
-#          (Application Programming Interface) at the NCBI.
-#          Read about the constraints here:
-#          https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
-#
-#
-# Version: 3.2
-# Date:    2016 09 - 2020 09
-# Author:  Boris Steipe
-#
-# Versions:
-#    3.2   2020 updates
-#    3.1   Change from require() to requireNamespace(),
-#          use <package>::<function>() idiom throughout
-#    3.0   parsing logic had not been fully implemented; Fixed.
-#    2.1   bugfix in BLAST(), bug was blanking non-split deflines;
-#          refactored parseBLASTalignment() to handle lists with multiple hits.
-#    2.0   Completely rewritten because the interface completely changed.
-#          Code adpated in part from NCBI Perl sample code:
-#          $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
-#    1.0   first version posted for BCH441 2016, based on BLAST - API
-#
-# ToDo:    Return the organism/strain name in the output, and propagate
-#          into MYSPE selection script.
-#
-# Notes:   This is somewhat pedestrian, but apparently there are currently
-#          no R packages that contain such code.
-#
-# ==============================================================================
-
-
-if (! requireNamespace("httr", quietly = TRUE)) {
-  install.packages("httr")
-}
-
-
-BLAST <- function(Q,
-                  db = "refseq_protein",
-                  nHits = 30,
-                  E = 0.1,
-                  limits = "",
-                  rid = "",
-                  query = "",
-                  quietly = FALSE,
-                  myTimeout = 120) {
-    # Purpose:
-    #     Basic BLAST search
-    #
-    # Parameters:
-    #     Q: query - either a valid ID or a sequence
-    #     db: "refseq_protein" by default,
-    #         other legal values include: "nr", "pdb", "swissprot" ...
-    #     nHits: number of hits to maximally return
-    #     E: E-value cutoff. Do not return hits whose score would be expected
-    #        to occur E or more times in a database of random sequence.
-    #     limits: a valid ENTREZ filter
-    #     rid: a request ID - to retrieve earlier search results
-    #     query: the actual query string (needed when retrieving results
-    #            with an rid)
-    #     quietly: controls printing of wait-time progress bar
-    #     timeout: how much longer _after_ rtoe to wait for a result
-    #              before giving up (seconds)
-    # Value:
-    #     result: list of process status or resulting hits, and some metadata
-
-
-    EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done
-
-    results <- list()
-    results$query = query
-    results$rid <- rid
-    results$rtoe <- 0
-
-    if (rid == "") {  # If no rid is available, spawn a search.
-                      # Else, proceed directly to retrieval.
-
-      # prepare query, GET(), and parse rid and rtoe from BLAST server response
-      results$query <- paste0("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
-                              "?",
-                              "CMD=Put",
-                              "&PROGRAM=", "blastp",
-                              "&QUERY=", URLencode(Q),
-                              "&DATABASE=", db,
-                              "&MATRIX=", "BLOSUM62",
-                              "&EXPECT=", as.character(E),
-                              "&HITLIST_SIZE=", as.character(nHits),
-                              "&ALIGNMENTS=", as.character(nHits),
-                              "&FORMAT_TYPE=Text")
-
-      if (limits != "") {
-        results$query <- paste0(
-          results$query,
-          "&ENTREZ_QUERY=", limits)
-      }
-
-      # send it off ...
-      response <- httr::GET(results$query)
-      if (httr::http_status(response)$category != "Success" ) {
-        stop(sprintf("PANIC: Can't send query. BLAST server status error: %s",
-                     httr::http_status(response)$message))
-      }
-
-      txt <- httr::content(response, "text", encoding = "UTF-8")
-
-      patt <- "RID = (\\w+)" # match the request id
-      results$rid  <- regmatches(txt, regexec(patt,  txt))[[1]][2]
-
-      patt <- "RTOE = (\\d+)" # match the expected completion time
-      results$rtoe <- as.numeric(regmatches(txt, regexec(patt, txt))[[1]][2])
-
-      # Now we wait ...
-      if (quietly) {
-        Sys.sleep(results$rtoe)
-      } else {
-        cat(sprintf("BLAST is processing %s:\n", results$rid))
-        waitTimer(results$rtoe)
-      }
-
-    } # done sending query and retrieving rid, rtoe
-
-    # Enter an infinite loop to check for result availability
-    checkStatus <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
-                         "?",
-                         "CMD=Get",
-                         "&RID=", results$rid,
-                         "&FORMAT_TYPE=Text",
-                         "&FORMAT_OBJECT=SearchInfo",
-                         sep = "")
-
-    while (TRUE) {
-      # Check whether the result is ready
-      response <- httr::GET(checkStatus)
-      if (httr::http_status(response)$category != "Success" ) {
-        stop(sprintf("PANIC: Can't check status. BLAST server status error: %s",
-                     httr::http_status(response)$message))
-      }
-
-      txt <- httr::content(response, "text", encoding = "UTF-8")
-
-      if (length(grep("Status=WAITING",  txt)) > 0) {
-        myTimeout <- myTimeout - EXTRAWAIT
-
-        if (myTimeout <= 0) { # abort
-          cat("BLAST search not concluded before timeout. Aborting.\n")
-          cat(sprintf("%s  BLASThits <- BLAST(rid=\"%s\")\n",
-                      "Trying checking back later with >",
-                      results$rid))
-          return(results)
-        }
-
-        if (quietly) {
-          Sys.sleep(EXTRAWAIT)
-        } else {
-          cat(sprintf("Status: Waiting. Wait %d more seconds (max. %d more)",
-                      EXTRAWAIT,
-                      myTimeout))
-          waitTimer(EXTRAWAIT)
-          next
-        }
-
-      } else if (length(grep("Status=FAILED",  txt)) > 0) {
-          cat("BLAST search returned status \"FAILED\". Aborting.\n")
-          return(results)
-
-      } else if (length(grep("Status=UNKNOWN",  txt)) > 0) {
-          cat("BLAST search returned status \"UNKNOWN\".\n")
-          cat("This probably means the rid has expired. Aborting.\n")
-          return(results)
-
-      } else if (length(grep("Status=READY",  txt)) > 0) {  # Done
-
-          if (length(grep("ThereAreHits=yes",  txt)) == 0) {  # No hits
-            cat("BLAST search ready but no hits found. Aborting.\n")
-            return(results)
-
-          } else {
-            break  # done ... retrieve search result
-          }
-      }
-    } # end result-check loop
-
-    # retrieve results from BLAST server
-    retrieve <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
-                      "?",
-                      "&CMD=Get",
-                      "&RID=", results$rid,
-                      "&FORMAT_TYPE=Text",
-                      sep = "")
-
-    response <- httr::GET(retrieve)
-    if (httr::http_status(response)$category != "Success" ) {
-      stop(sprintf("PANIC: Can't retrieve. BLAST server status error: %s",
-                   httr::http_status(response)$message))
-    }
-
-    txt <- httr::content(response, "text", encoding = "UTF-8")
-
-    # txt contains the whole set of results. Process:
-
-    # First, we strsplit() on linebreaks:
-    txt <- unlist(strsplit(txt, "\n"))
-
-    # The alignments range from the first line that begins with ">" ...
-    iFirst <- grep("^>", txt)[1]
-
-    # ... to the last line that begins with "Sbjct"
-    x <- grep("^Sbjct", txt)
-    iLast <- x[length(x)]
-
-    # Get the alignments block
-    txt <- txt[iFirst:iLast]
-
-    # Drop empty lines
-    txt <- txt[!(nchar(txt) == 0)]
-
-    # A line that ends "]" but does not begin ">" seems to be a split
-    # defline ... eg.
-    #  [1] ">XP_013349208.1 AUEXF2481DRAFT_695809 [Aureobasidium subglaciale "
-    #  [2] "EXF-2481]"
-    #  Merge these lines to the preceding lines and delete them.
-    #
-    x <- which(grepl("]$", txt) & !(grepl("^>", txt)))
-    if (length(x) > 0) {
-      txt[x-1] <- paste0(txt[x-1], txt[x])
-      txt <- txt[-x]
-    }
-
-    # Special case: there may be multiple deflines when the BLAST hit is to
-    # redundant, identical sequences. Keep only the first instance.
-    iKeep <- ! grepl("^>", txt)
-    x <- rle(iKeep)
-    x$positions <- cumsum(x$lengths)
-    i <- which(x$lengths > 1 & x$values == FALSE)
-    if (length(i) > 0) {
-      firsts <- x$positions[i] - x$lengths[i] + 1
-      iKeep[firsts] <- TRUE
-      txt <- txt[iKeep]
-    }
-
-    # After this preprocessing the following should be true:
-    # - Every alignment block begins with a defline in which the
-    #   first character is ">"
-    # - There is only one defline in each block.
-    # - Lines are not split.
-
-    # Make a dataframe of first and last indices of alignment blocks
-    x <- grep("^>", txt)
-    blocks <- data.frame(iFirst = x,
-                         iLast  = c((x[-1] - 1), length(txt)))
-
-    # Build the hits list by parsing the blocks
-    results$hits <- list()
-
-    for (i in seq_len(nrow(blocks))) {
-      thisBlock <- txt[blocks$iFirst[i]:blocks$iLast[i]]
-      results$hits[[i]] <- parseBLASTalignment(thisBlock)
-    }
-
-    return(results)
-}
-
-parseBLASTalignment <- function(hit) {
-  # Parse data from a character vector containing a BLAST hit
-  # Parameters:
-  #    hit  char   one BLAST hit as char vector
-  # Value:
-  #          list   $def          chr   defline
-  #                 $accession    chr   accession number
-  #                 $organism     chr   complete organism definition
-  #                 $species      chr   binomial species
-  #                 $E            num   E value
-  #                 $lengthAli    num   length of the alignment
-  #                 $nIdentitites num   number of identities
-  #                 $nGaps        num   number of gaps
-  #                 $Qbounds      num   2-element vector of query start-end
-  #                 $Sbounds      num   2-element vector of subject start-end
-  #                 $Qseq         chr   query sequence
-  #                 $midSeq       chr   midline string
-  #                 $Sseq         chr   subject sequence
-
-  getToken <- function(patt, v) {
-    # get the first token identified by pattern patt in character vector v
-    v <- v[grep(patt, v)]
-    if (length(v) > 1) { v <- v[1] }
-    if (length(v) == 0) { token <- NA
-    } else {
-      token <- regmatches(v, regexec(patt, v))[[1]][2] }
-    return(token)
-  }
-
-  h <- list()
-
-  # FASTA defline
-  h$def <- hit[1]
-
-  # accesion number (ID), use the first if there are several, separated by "|"
-  patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
-  h$accession <-  regmatches(h$def, regexec(patt, h$def))[[1]][2]
-
-  # organism
-  patt <- "\\[(.+)]"
-  h$organism <-  regmatches(h$def, regexec(patt, h$def))[[1]][2]
-
-  # species
-  x <- unlist(strsplit(h$organism, "\\s+"))
-  if (length(x) >= 2) {
-    h$species <- paste(x[1], x[2])
-  } else if (length(x) == 1) {
-    h$species <- paste(x[1], "sp.")
-  } else {
-    h$species <- NA
-  }
-
-  # E-value
-  h$E <- as.numeric(getToken("Expect\\s*=(.+?), Method", hit))
-
-  # length of alignment
-  h$lengthAli <- as.numeric(getToken("^\\s*Length\\s*=(.+)$", hit))
-
-  # number of identities
-  h$nIdentities <- as.numeric(getToken("^\\s*Identities\\s*=(.+?)/", hit))
-
-  # number of gaps
-  h$nGaps <- as.numeric(getToken("\\s*Gaps\\s*=(.+?)/", hit))
-
-  # split up alignment section
-  idx <- grep("^Query ", hit)
-  Que <- hit[idx]
-  Mid <- hit[idx + 1]
-  Sbj <- hit[idx + 2]
-
-  # first and last positions
-  h$Qbounds <- c(start = 0, end = 0)
-  h$Qbounds[1] <- as.numeric(getToken("^Query\\s*(\\d+)", Que[1]))
-  h$Qbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Que[length(Que)]))
-
-  h$Sbounds <- c(start = 0, end = 0)
-  h$Sbounds[1] <- as.numeric(getToken("^Sbjct\\s*(\\d+)", Sbj[1]))
-  h$Sbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Sbj[length(Sbj)]))
-
-  # aligned sequences
-  for (i in seq_along(Que)) {
-    patt <- ("^\\s*Query\\s*\\d+\\s*([A-Za-z-]+)") # capture aligned string
-    m <- regexec(patt, Que[i])
-    iFirst <- m[[1]][2]
-    iLast <- iFirst + attr(m[[1]], which = "match.length")[2] - 1
-    Que[i] <- substring(Que[i], iFirst, iLast)
-    Mid[i] <- substring(Mid[i], iFirst, iLast)
-    Sbj[i] <- substring(Sbj[i], iFirst, iLast)
-  }
-
-  h$Qseq   <- paste0(Que, collapse = "")
-  h$midSeq <- paste0(Mid, collapse = "")
-  h$Sseq   <- paste0(Sbj, collapse = "")
-
-  return(h)
-}
-
-
-# ==== TESTS ===================================================================
-
-if (FALSE) {
-  # define query:
-  q   <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain
-               "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
-               "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
-               sep="")
-  # or ...
-  q <- "NP_010227" # refseq ID
-
-  test <- BLAST(q,
-                nHits = 100,
-                E = 0.001,
-                rid = "",
-                limits = "txid4751[ORGN]")  # Fungi
-  str(test)
-  length(test$hits)
-}
-
-# [END]
-
+# BLAST.R
+#
+# Purpose: Send off one BLAST search and return parsed list of results
+#          This script uses the BLAST URL-API
+#          (Application Programming Interface) at the NCBI.
+#          Read about the constraints here:
+#          https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
+#
+#
+# Version: 3.2
+# Date:    2016 09 - 2020 09
+# Author:  Boris Steipe
+#
+# Versions:
+#    3.2   2020 updates
+#    3.1   Change from require() to requireNamespace(),
+#          use <package>::<function>() idiom throughout
+#    3.0   parsing logic had not been fully implemented; Fixed.
+#    2.1   bugfix in BLAST(), bug was blanking non-split deflines;
+#          refactored parseBLASTalignment() to handle lists with multiple hits.
+#    2.0   Completely rewritten because the interface completely changed.
+#          Code adpated in part from NCBI Perl sample code:
+#          $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
+#    1.0   first version posted for BCH441 2016, based on BLAST - API
+#
+# ToDo:    Return the organism/strain name in the output, and propagate
+#          into MYSPE selection script.
+#
+# Notes:   This is somewhat pedestrian, but apparently there are currently
+#          no R packages that contain such code.
+#
+# ==============================================================================
+
+
+if (! requireNamespace("httr", quietly = TRUE)) {
+  install.packages("httr")
+}
+
+
+BLAST <- function(Q,
+                  db = "refseq_protein",
+                  nHits = 30,
+                  E = 0.1,
+                  limits = "",
+                  rid = "",
+                  query = "",
+                  quietly = FALSE,
+                  myTimeout = 120) {
+    # Purpose:
+    #     Basic BLAST search
+    #
+    # Parameters:
+    #     Q: query - either a valid ID or a sequence
+    #     db: "refseq_protein" by default,
+    #         other legal values include: "nr", "pdb", "swissprot" ...
+    #     nHits: number of hits to maximally return
+    #     E: E-value cutoff. Do not return hits whose score would be expected
+    #        to occur E or more times in a database of random sequence.
+    #     limits: a valid ENTREZ filter
+    #     rid: a request ID - to retrieve earlier search results
+    #     query: the actual query string (needed when retrieving results
+    #            with an rid)
+    #     quietly: controls printing of wait-time progress bar
+    #     timeout: how much longer _after_ rtoe to wait for a result
+    #              before giving up (seconds)
+    # Value:
+    #     result: list of process status or resulting hits, and some metadata
+
+
+    EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done
+
+    results <- list()
+    results$query = query
+    results$rid <- rid
+    results$rtoe <- 0
+
+    if (rid == "") {  # If no rid is available, spawn a search.
+                      # Else, proceed directly to retrieval.
+
+      # prepare query, GET(), and parse rid and rtoe from BLAST server response
+      results$query <- paste0("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
+                              "?",
+                              "CMD=Put",
+                              "&PROGRAM=", "blastp",
+                              "&QUERY=", URLencode(Q),
+                              "&DATABASE=", db,
+                              "&MATRIX=", "BLOSUM62",
+                              "&EXPECT=", as.character(E),
+                              "&HITLIST_SIZE=", as.character(nHits),
+                              "&ALIGNMENTS=", as.character(nHits),
+                              "&FORMAT_TYPE=Text")
+
+      if (limits != "") {
+        results$query <- paste0(
+          results$query,
+          "&ENTREZ_QUERY=", limits)
+      }
+
+      # send it off ...
+      response <- httr::GET(results$query)
+      if (httr::http_status(response)$category != "Success" ) {
+        stop(sprintf("PANIC: Can't send query. BLAST server status error: %s",
+                     httr::http_status(response)$message))
+      }
+
+      txt <- httr::content(response, "text", encoding = "UTF-8")
+
+      patt <- "RID = (\\w+)" # match the request id
+      results$rid  <- regmatches(txt, regexec(patt,  txt))[[1]][2]
+
+      patt <- "RTOE = (\\d+)" # match the expected completion time
+      results$rtoe <- as.numeric(regmatches(txt, regexec(patt, txt))[[1]][2])
+
+      # Now we wait ...
+      if (quietly) {
+        Sys.sleep(results$rtoe)
+      } else {
+        cat(sprintf("BLAST is processing %s:\n", results$rid))
+        waitTimer(results$rtoe)
+      }
+
+    } # done sending query and retrieving rid, rtoe
+
+    # Enter an infinite loop to check for result availability
+    checkStatus <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
+                         "?",
+                         "CMD=Get",
+                         "&RID=", results$rid,
+                         "&FORMAT_TYPE=Text",
+                         "&FORMAT_OBJECT=SearchInfo",
+                         sep = "")
+
+    while (TRUE) {
+      # Check whether the result is ready
+      response <- httr::GET(checkStatus)
+      if (httr::http_status(response)$category != "Success" ) {
+        stop(sprintf("PANIC: Can't check status. BLAST server status error: %s",
+                     httr::http_status(response)$message))
+      }
+
+      txt <- httr::content(response, "text", encoding = "UTF-8")
+
+      if (length(grep("Status=WAITING",  txt)) > 0) {
+        myTimeout <- myTimeout - EXTRAWAIT
+
+        if (myTimeout <= 0) { # abort
+          cat("BLAST search not concluded before timeout. Aborting.\n")
+          cat(sprintf("%s  BLASThits <- BLAST(rid=\"%s\")\n",
+                      "Trying checking back later with >",
+                      results$rid))
+          return(results)
+        }
+
+        if (quietly) {
+          Sys.sleep(EXTRAWAIT)
+        } else {
+          cat(sprintf("Status: Waiting. Wait %d more seconds (max. %d more)",
+                      EXTRAWAIT,
+                      myTimeout))
+          waitTimer(EXTRAWAIT)
+          next
+        }
+
+      } else if (length(grep("Status=FAILED",  txt)) > 0) {
+          cat("BLAST search returned status \"FAILED\". Aborting.\n")
+          return(results)
+
+      } else if (length(grep("Status=UNKNOWN",  txt)) > 0) {
+          cat("BLAST search returned status \"UNKNOWN\".\n")
+          cat("This probably means the rid has expired. Aborting.\n")
+          return(results)
+
+      } else if (length(grep("Status=READY",  txt)) > 0) {  # Done
+
+          if (length(grep("ThereAreHits=yes",  txt)) == 0) {  # No hits
+            cat("BLAST search ready but no hits found. Aborting.\n")
+            return(results)
+
+          } else {
+            break  # done ... retrieve search result
+          }
+      }
+    } # end result-check loop
+
+    # retrieve results from BLAST server
+    retrieve <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
+                      "?",
+                      "&CMD=Get",
+                      "&RID=", results$rid,
+                      "&FORMAT_TYPE=Text",
+                      sep = "")
+
+    response <- httr::GET(retrieve)
+    if (httr::http_status(response)$category != "Success" ) {
+      stop(sprintf("PANIC: Can't retrieve. BLAST server status error: %s",
+                   httr::http_status(response)$message))
+    }
+
+    txt <- httr::content(response, "text", encoding = "UTF-8")
+
+    # txt contains the whole set of results. Process:
+
+    # First, we strsplit() on linebreaks:
+    txt <- unlist(strsplit(txt, "\n"))
+
+    # The alignments range from the first line that begins with ">" ...
+    iFirst <- grep("^>", txt)[1]
+
+    # ... to the last line that begins with "Sbjct"
+    x <- grep("^Sbjct", txt)
+    iLast <- x[length(x)]
+
+    # Get the alignments block
+    txt <- txt[iFirst:iLast]
+
+    # Drop empty lines
+    txt <- txt[!(nchar(txt) == 0)]
+
+    # A line that ends "]" but does not begin ">" seems to be a split
+    # defline ... eg.
+    #  [1] ">XP_013349208.1 AUEXF2481DRAFT_695809 [Aureobasidium subglaciale "
+    #  [2] "EXF-2481]"
+    #  Merge these lines to the preceding lines and delete them.
+    #
+    x <- which(grepl("]$", txt) & !(grepl("^>", txt)))
+    if (length(x) > 0) {
+      txt[x-1] <- paste0(txt[x-1], txt[x])
+      txt <- txt[-x]
+    }
+
+    # Special case: there may be multiple deflines when the BLAST hit is to
+    # redundant, identical sequences. Keep only the first instance.
+    iKeep <- ! grepl("^>", txt)
+    x <- rle(iKeep)
+    x$positions <- cumsum(x$lengths)
+    i <- which(x$lengths > 1 & x$values == FALSE)
+    if (length(i) > 0) {
+      firsts <- x$positions[i] - x$lengths[i] + 1
+      iKeep[firsts] <- TRUE
+      txt <- txt[iKeep]
+    }
+
+    # After this preprocessing the following should be true:
+    # - Every alignment block begins with a defline in which the
+    #   first character is ">"
+    # - There is only one defline in each block.
+    # - Lines are not split.
+
+    # Make a dataframe of first and last indices of alignment blocks
+    x <- grep("^>", txt)
+    blocks <- data.frame(iFirst = x,
+                         iLast  = c((x[-1] - 1), length(txt)))
+
+    # Build the hits list by parsing the blocks
+    results$hits <- list()
+
+    for (i in seq_len(nrow(blocks))) {
+      thisBlock <- txt[blocks$iFirst[i]:blocks$iLast[i]]
+      results$hits[[i]] <- parseBLASTalignment(thisBlock)
+    }
+
+    return(results)
+}
+
+parseBLASTalignment <- function(hit) {
+  # Parse data from a character vector containing a BLAST hit
+  # Parameters:
+  #    hit  char   one BLAST hit as char vector
+  # Value:
+  #          list   $def          chr   defline
+  #                 $accession    chr   accession number
+  #                 $organism     chr   complete organism definition
+  #                 $species      chr   binomial species
+  #                 $E            num   E value
+  #                 $lengthAli    num   length of the alignment
+  #                 $nIdentitites num   number of identities
+  #                 $nGaps        num   number of gaps
+  #                 $Qbounds      num   2-element vector of query start-end
+  #                 $Sbounds      num   2-element vector of subject start-end
+  #                 $Qseq         chr   query sequence
+  #                 $midSeq       chr   midline string
+  #                 $Sseq         chr   subject sequence
+
+  getToken <- function(patt, v) {
+    # get the first token identified by pattern patt in character vector v
+    v <- v[grep(patt, v)]
+    if (length(v) > 1) { v <- v[1] }
+    if (length(v) == 0) { token <- NA
+    } else {
+      token <- regmatches(v, regexec(patt, v))[[1]][2] }
+    return(token)
+  }
+
+  h <- list()
+
+  # FASTA defline
+  h$def <- hit[1]
+
+  # accesion number (ID), use the first if there are several, separated by "|"
+  patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
+  h$accession <-  regmatches(h$def, regexec(patt, h$def))[[1]][2]
+
+  # organism
+  patt <- "\\[(.+)]"
+  h$organism <-  regmatches(h$def, regexec(patt, h$def))[[1]][2]
+
+  # species
+  x <- unlist(strsplit(h$organism, "\\s+"))
+  if (length(x) >= 2) {
+    h$species <- paste(x[1], x[2])
+  } else if (length(x) == 1) {
+    h$species <- paste(x[1], "sp.")
+  } else {
+    h$species <- NA
+  }
+
+  # E-value
+  h$E <- as.numeric(getToken("Expect\\s*=(.+?), Method", hit))
+
+  # length of alignment
+  h$lengthAli <- as.numeric(getToken("^\\s*Length\\s*=(.+)$", hit))
+
+  # number of identities
+  h$nIdentities <- as.numeric(getToken("^\\s*Identities\\s*=(.+?)/", hit))
+
+  # number of gaps
+  h$nGaps <- as.numeric(getToken("\\s*Gaps\\s*=(.+?)/", hit))
+
+  # split up alignment section
+  idx <- grep("^Query ", hit)
+  Que <- hit[idx]
+  Mid <- hit[idx + 1]
+  Sbj <- hit[idx + 2]
+
+  # first and last positions
+  h$Qbounds <- c(start = 0, end = 0)
+  h$Qbounds[1] <- as.numeric(getToken("^Query\\s*(\\d+)", Que[1]))
+  h$Qbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Que[length(Que)]))
+
+  h$Sbounds <- c(start = 0, end = 0)
+  h$Sbounds[1] <- as.numeric(getToken("^Sbjct\\s*(\\d+)", Sbj[1]))
+  h$Sbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Sbj[length(Sbj)]))
+
+  # aligned sequences
+  for (i in seq_along(Que)) {
+    patt <- ("^\\s*Query\\s*\\d+\\s*([A-Za-z-]+)") # capture aligned string
+    m <- regexec(patt, Que[i])
+    iFirst <- m[[1]][2]
+    iLast <- iFirst + attr(m[[1]], which = "match.length")[2] - 1
+    Que[i] <- substring(Que[i], iFirst, iLast)
+    Mid[i] <- substring(Mid[i], iFirst, iLast)
+    Sbj[i] <- substring(Sbj[i], iFirst, iLast)
+  }
+
+  h$Qseq   <- paste0(Que, collapse = "")
+  h$midSeq <- paste0(Mid, collapse = "")
+  h$Sseq   <- paste0(Sbj, collapse = "")
+
+  return(h)
+}
+
+
+# ==== TESTS ===================================================================
+
+if (FALSE) {
+  # define query:
+  q   <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain
+               "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
+               "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
+               sep="")
+  # or ...
+  q <- "NP_010227" # refseq ID
+
+  test <- BLAST(q,
+                nHits = 100,
+                E = 0.001,
+                rid = "",
+                limits = "txid4751[ORGN]")  # Fungi
+  str(test)
+  length(test$hits)
+}
+
+# [END]
+