From 6a565c2ab60dc8d9bee18c2bf6d0d611bc7192f3 Mon Sep 17 00:00:00 2001 From: hyginn Date: Mon, 25 Sep 2017 01:30:29 -0400 Subject: [PATCH] New unit --- BIN-Storing_data.R | 1119 ++++++++++++++++++++------------------------ 1 file changed, 517 insertions(+), 602 deletions(-) diff --git a/BIN-Storing_data.R b/BIN-Storing_data.R index b93bb67..fc1f0f1 100644 --- a/BIN-Storing_data.R +++ b/BIN-Storing_data.R @@ -3,11 +3,12 @@ # Purpose: A Bioinformatics Course: # R code accompanying the BIN-Storing_data unit # -# Version: 0.1 +# Version: 1.0 # -# Date: 2017 08 25 +# Date: 2017 09 23 # Author: Boris Steipe (boris.steipe@utoronto.ca) # +# V 1.0 First live version, complete rebuilt. Now using JSON data sources. # V 0.1 First code copied from BCH441_A03_makeYFOlist.R # # TODO: @@ -16,500 +17,549 @@ # == HOW TO WORK WITH LEARNING UNIT FILES ====================================== # # DO NOT SIMPLY source() THESE FILES! - +# # If there are portions you don't understand, use R's help system, Google for an # answer, or ask your instructor. Don't continue if you don't understand what's # going on. That's not how it works ... 
# # ============================================================================== + +#TOC> ========================================================================== +#TOC> +#TOC> Section Title Line +#TOC> ------------------------------------------------------------ +#TOC> 1 A Relational Datamodel in R: review 55 +#TOC> 1.1 Building a sample database structure 95 +#TOC> 1.1.1 completing the database 206 +#TOC> 1.2 Querying the database 241 +#TOC> 1.3 Task: submit for credit (part 1/2) 270 +#TOC> 2 Implementing the protein datamodel 282 +#TOC> 2.1 JSON formatted source data 308 +#TOC> 2.2 "Sanitizing" sequence data 343 +#TOC> 2.3 Create a protein table for our data model 363 +#TOC> 2.3.1 Initialize the database 365 +#TOC> 2.3.2 Add data 377 +#TOC> 2.4 Complete the database 397 +#TOC> 2.4.1 Examples of navigating the database 424 +#TOC> 2.5 Updating the database 456 +#TOC> 3 Add your own data 468 +#TOC> 3.1 Find a protein 476 +#TOC> 3.2 Put the information into JSON files 505 +#TOC> 3.3 Create an R script to create the database 522 +#TOC> 3.3.1 Check and validate 542 +#TOC> 3.4 Task: submit for credit (part 2/2) 583 +#TOC> +#TOC> ========================================================================== + -# ============================================================================== -# PART TWO: Implementing the (protein part of) the datamodel -# ============================================================================== +# = 1 A Relational Datamodel in R: review ================================= -# Every entity in our datmodel can be implemented by a dataframe. To keep things -# organized, we create a list, that contains the enity tables. Here is a -# definition for a dataframe that holds the protein data for yeast Mbp1 as -# described in the data model schema: +# A disclaimer at first: we are not building an industry-strength database at +# all here - but we are employing principles of such a database to keep common +# types of lab-data well organized. 
Don't think of this as emulating or even +# replacing a "real" database, but think of this as improving the ad hoc +# approaches we normally employ to store data in the lab. That does not mean +# such ad hoc approaches are necessarily bad - the best solution always depends +# on your objectives, the details of your tasks, and the context in which you +# are working. + +# The principle we follow in implementing a relational data model is to build a +# list of dataframes . This list is our "database": +# - Each _entity_ of the datamodel is a dataframe. In an SQL database, these +# would also be called "tables". In a spreadsheet this would be a "sheet". +# - Each instance of an entity, i.e. one stored _item_, is a row of the data +# frame. In an SQL database this would be a record. In a spreadsheet this is +# a row. +# - Each _attribute_ of an entity is is a column of the dataframe. In an SQL +# database this is a column, in a spreadsheet too. +# - This doesn't necessarily solve the question of how we will store and curate +# our source data - we will defer that to later. At first we talk only about +# data representation internal to our R session, where we need it for +# processing and analysis. + +# Lets review syntax for creating and accessing such a structure, a list of data +# frames. You'll have to be absolutely confident with this, or you'll get lost +# in all the later learning units. We'll start from a compact example, a tiny +# database of philosophers to keep things brief. That database will have three +# tables: person, works and book. Person stores biographical data, book stores +# books, and works is a join table associating persons with their work. You +# should already be familiar with "join tables" and why we need them. 
This is +# the structure: # -# === Creating two tables === -db <- list() # we'll call our database "db" and start with an empty list +# person: id, name, born, died, school +# book: id, title, published +# works: id, person$id, book$id -# Then add a dataframe with one row to the list -db$protein <- data.frame(ID = as.integer(1), - name = "Mbp1", - RefSeqID = "NP_010227", - UniProtID = "P39678", - taxonomy.ID = as.integer(4932), - sequence = "...", # just a placeholder - stringsAsFactors = FALSE) +# Perhaps draw out this schema to make things more clear. -str(db) +# == 1.1 Building a sample database structure ============================== -# Next, we create the taxonomy table and add it into db -db$taxonomy <- data.frame(ID = as.integer(4932), - species = "Saccharomyces cerevisiae", - stringsAsFactors = FALSE) +# Let's build this structure. -# Let's add a second protein: We create a one-row dataframe with the data, then we rbind() it to the existing data frame: -myRow <- data.frame(ID = as.integer(2), - name = "Res2", - RefSeqID = "NP_593032", - UniProtID = "P41412", - taxonomy.ID = as.integer(4896), - sequence = "...", # again, just a placeholder - stringsAsFactors = FALSE) -db$protein <- rbind(db$protein, myRow) +philDB <- list() # This is an empty list -myRow <- data.frame(ID = as.integer(4896), - species = "Schizosaccharomyces pombe", - stringsAsFactors = FALSE) -db$taxonomy <- rbind(db$taxonomy, myRow) +# This is a data frame that we initialize with two philosophers +x <- data.frame(id = c(1,2), + name = c("Laozi", "Martin Heidegger"), + born = c(NA, "1889"), + died = c("531 BCE", "1976"), + school = c("Daoism", "Phenomenology"), + stringsAsFactors = FALSE) +str(x) -str(db) +# Lets add the dataframe to the philDB list and call it "person" there. +philDB[["person"]] <- x +str(philDB) -# you can look at the contents of the tables in the usual way we would access -# elements from lists and dataframes. 
Here are some examples: -db$protein -db$protein$RefSeqID -db$protein[,"name"] -db$taxonomy -db$taxonomy$species -biCode(db$taxonomy$species) +# and let's remove x so we don't mix up things later. +rm(x) -# Here is an example to look up information in one table, -# based on a condition in another table: -# what is the species name for the protein -# whose name is "Mbp1"? +# We can address elements with the usual subsetting operators. I will use +# the $ operator for tables and columns, the [] operator for elements in +# columns. For example ... -# First, get the taxonomy.ID for the Mbp1 protein. This is -# the key we need for the taxonomy table. We find it in a cell in the -# table: db$protein[, ] -# is that row for which the value in -# the "name" column is Mbp1: +philDB$person$name[1] # Laozi -db$protein$name == "Mbp1" # TRUE FALSE - -# The is called "taxonomy.ID". Simply -# insert these two expressions in the square -# brackets. - -db$protein[db$protein$name == "Mbp1", "taxonomy.ID"] - -# Assign the taxonomy.ID value ... -x <- db$protein[db$protein$name == "Mbp1", "taxonomy.ID"] - -# ... and fetch the species_name value from db$taxonomy - -db$taxonomy[db$taxonomy$ID == x, "species"] +# task: Write an expression that returns all "school" entries from the +# person table. +# Let's now add another person. There are several ways to do this, the +# conceptually cleanest is to create a one-row dataframe with the data, and +# rbind() it to the existing dataframe. Doing this, we must take care that +# the data frame column names are identical. What happens if they are not? +# Let's find out: +(x <- data.frame(a=1:4, b=11:14)) +(y <- data.frame(a=6, c=17)) +rbind(x, y) -# === Updating the database ==================================================== +(y <- data.frame(a=6, b=17)) +rbind(x, y) -# Modifying a field +# All clear? That's good - this behaviour provides us with a sanity check on the +# operation. 
-# Here is code that modifies the sequence field in the protein table of the -# database: +(x <- data.frame(id = 2, + name = "Zhuangzi", + born = "369 BCE", + died = "286 BCE", + school = "Daoism")) + +# Add this to the "person" table in our database with rbind() ... +philDB$person <- rbind(philDB$person, x) + +# ... and examine the result: +str(philDB) + +# Now one thing you should note is that we had forgotten to declare +# stringsAsFactors = FALSE when we created x - but this did not damage +# the database. This is because the existing columns had type chr and the +# implicit coercion, e.g. ... +as.character(x$name) +# happened to do the right thing. Don't rely on that. The Right Way is to +# turn factors off, even when you are making just a single row. + +# But we made a serious error in our data! Did you spot it? # -mySeq <- " -1 msnqiysary sgvdvyefih stgsimkrkk ddwvnathil kaanfakakr trilekevlk -61 ethekvqggf gkyqgtwvpl niakqlaekf svydqlkplf dftqtdgsas pppapkhhha -121 skvdrkkair sastsaimet krnnkkaeen qfqsskilgn ptaaprkrgr pvgstrgsrr -181 klgvnlqrsq sdmgfprpai pnssisttql psirstmgpq sptlgileee rhdsrqqqpq -241 qnnsaqfkei dledglssdv epsqqlqqvf nqntgfvpqq qssliqtqqt esmatsvsss -301 pslptspgdf adsnpfeerf pgggtspiis miprypvtsr pqtsdindkv nkylsklvdy -361 fisnemksnk slpqvllhpp phsapyidap idpelhtafh wacsmgnlpi aealyeagts -421 irstnsqgqt plmrsslfhn sytrrtfpri fqllhetvfd idsqsqtvih hivkrksttp -481 savyyldvvl skikdfspqy rielllntqd kngdtalhia skngdvvffn tlvkmgaltt -541 isnkegltan eimnqqyeqm miqngtnqhv nssntdlnih vntnnietkn dvnsmvimsp -601 vspsdyityp sqiatnisrn ipnvvnsmkq masiyndlhe qhdneikslq ktlksisktk -661 iqvslktlev lkesskdeng eaqtnddfei lsrlqeqntk klrkrliryk rlikqkleyr -721 qtvllnklie detqattnnt vekdnntler lelaqeltml qlqrknklss lvkkfednak -781 ihkyrriire gtemnieevd ssldvilqtl iannnknkga eqiitisnan sha -// -" - - -str(db$protein) # before -db$protein$sequence[db$protein$name == "Mbp1"] <- dbSanitizeSequence(mySeq) -str(db$protein) # after - -# Analyze the expression ! 
Note how we specify an element of a vector (column) -# in a data frame in a list using a logical expression. And note how we assign -# the output (return value) of a function. As far as database coding goes this -# is pretty minimal - there is no error checking done at all. In particular: can -# we really guarantee that the name "Mbp1" is unique in the protein table? No! -# We never required it to be unique. This is a check we need to perform so -# frequently that we will encapsulate it in a function: - -dbConfirmUnique - -# try this: -dbConfirmUnique(c("TRUE", "FALSE")) -dbConfirmUnique(c(TRUE, FALSE)) -dbConfirmUnique(c(TRUE, TRUE)) -dbConfirmUnique(c(FALSE, FALSE)) - - - +# If not, look at ... +philDB$person$id +# ... does that look oK? # +# Absolutely not! id is the Primary Key in the table, and it has to be +# unique. How can we guarantee it to be unique? Certainly not when we +# enter it by hand. We need a function that generates a unique key. Here's +# a simple version, without any error-checking. It assumes that a column +# named "id" exists in the table, and that it holds the Primary Keys: +autoincrement <- function(table) { + return(max(table$id) + 1) +} + +#Try it: +autoincrement(philDB$person) + +# Once that is clear, let's remove the Zhuangzi entry and recreate it correctly. +# Many ways to remove, here we use a logical expression to select matching record(s), +# apply the results to subset the data frame, and overwrite the existing table +# with the new one. + +sel <- !(philDB$person$name == "Zhuangzi") # select ... +philDB$person <- philDB$person[sel, ] # ... and replace + +str(philDB) + +# Now let's add Zhuangzi with correct data. Note how we use the autoincrement +# function for the id + +x <- data.frame(id = autoincrement(philDB$person), + name = "Zhuangzi", + born = "369 BCE", + died = "286 BCE", + school = "Daoism", + stringsAsFactors = FALSE) +philDB$person <- rbind(philDB$person, x) +str(philDB) + +# So far so good. Be honest with yourself. 
If you didn't follow any of this, +# go back, re-read, play with it, and ask for help. This is essential. + + +# === 1.1.1 completing the database + + +# Next I'll add one more person, and create the other two tables: + +x <- data.frame(id = autoincrement(philDB$person), + name = "Kongzi", + born = "551 BCE", + died = "479 BCE", + school = "Confucianism", + stringsAsFactors = FALSE) +philDB$person <- rbind(philDB$person, x) + + +philDB[["books"]] <- data.frame(id = 1:5, + title = c("Zhuangzi", + "Analects", + "Being and Time", + "Daodejing", + "On the Way to Language"), + published = c("300 BCE", + "220 BCE", + "1927", + "530 BCE", + "1959"), + stringsAsFactors = FALSE) + +philDB[["works"]] <- data.frame(id = 1:5, + personID = c(3, 4, 2, 1, 2), + bookID = c(1, 2, 3, 4, 5), + stringsAsFactors = FALSE) + +str(philDB) + + +# == 1.2 Querying the database ============================================= + + +# To retrieve data, we need to subset tables, possibly based on conditions we +# find in other tables. Sometimes we can simply get the information, e.g. +# all names ... +philDB$person$name +# ... or all book titles ... +philDB$books$title + +# ... but sometimes we need to cross-reference information via join tables. Here +# is an example where we list authors and their works, sorted alphabetically by +# author: +(sel <- order(philDB$person$name)) # check out ?order and describe to + # someone you know what it does, so that + # you are sure you understand it. +(pID <- philDB$person$id[sel]) +sel <- numeric() # initialize the vector +for (ID in pID) { + sel <- which(philDB$works$personID == ID) # get all rows for which + # the condition is TRUE + cat(sprintf("%s: ", philDB$person$name[ID])) # output the person + cat(sprintf("\"%s\" ", philDB$books$title[sel])) # output the book + cat("\n") +} + +# Examine the intermediate results and trace the logic until this is clear. 
+ + +# == 1.3 Task: submit for credit (part 1/2) ================================ + + +# Write and submit code that adds another philosopher to the datamodel: +# Immanuel Kant, (1724 - 1804), Enlightenment Philosophy. +# Works: Critique of Pure Reason (1781), Critique of Judgement (1790) +# Write and submit code that lists the books in alphabetical order, +# followed by the author and the year of publishing. Format your output like: +# "Analects" - Kongzi (220 BCE) +# Show the result. + + +# = 2 Implementing the protein datamodel ================================== + + +# Working with the code above has probably illustrated a few concerns about +# curating data and storing it for analysis. In particular the join tables +# seem problematic - figuring out the correct IDs, it's easy to make +# mistakes. +# - Data needs to be captured in a human-readable form so it can be verified +# and validated; +# - Some aspects of the database should _never_ be done by hand because they +# errors are easy to make and hard to see. That essentially includes +# every operation that has to do with abstract, primary keys; +# - Elementary operations we need to support are: adding data, selecting +# data, modifying data and deleting data. + +# We will therefore construct our database in the following way: +# - For each table, we will keep the primary information in JSON files. There +# it is easy to read, edit if needed, and modify it. +# - We will use simple scripts to read the JSON data and assemble it in +# our database for further analysis. +# - I have constructed initial files for yeast Mbp1 and nine other reference +# species. +# - I have written a small number of utility functions to read those files +# and assemble them into a database. + + +# == 2.1 JSON formatted source data ======================================== + + +# Have a look at the structure of the yeast Mbp1 protein data: +file.edit("./data/MBP1_SACCE.json") + +# - The whole thing is an array: [ ... ]. 
This is not necessary for a single +# object, but we will have more objects in other files. And it's perfectly +# legal to have an array with a single element. +# - The data is formatted as "key" : "value" pairs inside an object { ... }. +# This keeps the association between data items and their semantics +# explicit. +# - All keys are strings and they are unique in the object. +# - Values are mostly single strings and integers ... +# - ... except for "sequence". That one is an array of strings. Why? This is to +# make it easier to format and maintain the data. JSON does not allow line +# breaks within strings, but the strings we copy/paste from Genbank or other +# sources might have line breaks, sequence numbers etc. So we need to +# sanitize the sequence at some point. But since we need to do that +# anyway, it is easier to see the whole sequence if we store it in chunks. + +# Let's load the "jsonlite" package and have a look at how it reads this data. + +if (! require("jsonlite", quietly = TRUE)) { + install.packages("jsonlite") + library(jsonlite) +} + +x <- fromJSON("./data/MBP1_SACCE.json") +str(x) + +x$name +unlist(x$sequence) + + +# == 2.2 "Sanitizing" sequence data ======================================== + + +# Examine the dbSanitizeSequence() function: + +dbSanitizeSequence + +# Try: + +dbSanitizeSequence(c("GAA", "ttc")) +dbSanitizeSequence("MsnQ00%0 I@#>YSary S + G1 V2DV3Y>") +x <- " 1 msnqiysary sgvdvyefih stgsimkrkk ddwvnathil kaanfakakr trilekevlk + 61 ethekvqggf gkyqgtwvpl niakqlaekf svydqlkplf dftqtdgsas pppapkhhha + 121 skvdrkkair sastsaimet krnnkkaeen qfqsskilgn ptaaprkrgr pvgstrgsrr +..." # copy/paste from Genbank + +dbSanitizeSequence(x) + + +# == 2.3 Create a protein table for our data model ========================= + +# === 2.3.1 Initialize the database + + +# The function dbInit contains all the code to return a list of empty +# data frames for our data model. 
+ +dbInit + +myDB <- dbInit() +str(myDB) + + +# === 2.3.2 Add data + + +# fromJSON() returns a dataframe that we can readily process to add data +# to our table. Have a look at the function to add protein entries: + +dbAddProtein + +myDB <- dbAddProtein(myDB, fromJSON("./data/MBP1_SACCE.json")) +str(myDB) + +# Lets check that the 833 amino acids of the yeast MBP1 sequence have +# safely arrived. Note the genral idiom we use here to retrieve the data: +# we define a boolean vector that satisfies a condition, then we subset +# a column with that vector. + +sel <- myDB$protein$name == "MBP1_SACCE" +nchar(myDB$protein$sequence[sel]) + + +# == 2.4 Complete the database ============================================= + + +# Completing the database with Mbp1 data and data for 9 other "reference" +# species is more of the same. I have assembled the code in a script +# "./scripts/ABC-createRefDB.R" - open it, check it out, and then source it. +# It's really very simple, just reading some prepared files of data I have +# formatted with JSON, and assembling the data in our data model. # -# Here is the update to the sequence field of Res2 but using our -# confirmUnique() function +# The code is also very simple and in particular there is no checking for errors +# or inconsistencies. Have a look: +# Totally straightforward ... 
+dbAddTaxonomy +dbAddFeature -mySeq <- " -1 maprssavhv avysgvevye cfikgvsvmr rrrdswlnat qilkvadfdk pqrtrvlerq -61 vqigahekvq ggygkyqgtw vpfqrgvdla tkykvdgims pilsldideg kaiapkkkqt -121 kqkkpsvrgr rgrkpsslss stlhsvnekq pnssisptie ssmnkvnlpg aeeqvsatpl -181 paspnallsp ndntikpvee lgmleapldk yeeslldffl hpeegripsf lyspppdfqv -241 nsvidddght slhwacsmgh iemiklllra nadigvcnrl sqtplmrsvi ftnnydcqtf -301 gqvlellqst iyavdtngqs ifhhivqsts tpskvaaaky yldcilekli siqpfenvvr -361 lvnlqdsngd tslliaarng amdcvnslls ynanpsipnr qrrtaseyll eadkkphsll -421 qsnsnashsa fsfsgispai ispscsshaf vkaipsissk fsqlaeeyes qlrekeedli -481 ranrlkqdtl neisrtyqel tflqknnpty sqsmenlire aqetyqqlsk rlliwlearq -541 ifdlerslkp htslsisfps dflkkedgls lnndfkkpac nnvtnsdeye qlinkltslq -601 asrkkdtlyi rklyeelgid dtvnsyrrli amscginped lsleildave ealtrek -" +# Just slightly more complex, since we need to match the protein or feature +# name in the JSON file with its internal ID, and, when doing that confirm +# that it CAN be matched and that the match is UNIQUE +dbAddAnnotation -db$protein$sequence[dbConfirmUnique(db$protein$name == "Res2")] <- dbSanitizeSequence(mySeq) -str(db$protein) - -# These expressions can get rather long, it's easier to read if we write: - -select <- dbConfirmUnique(db$protein$name == "Res2") -value <- dbSanitizeSequence(mySeq) -db$protein$sequence[select] <- value - -# ... and that's the code paradigm that we will adopt to update -# database fields (for now). - -# Adding a record - -# Adding a record is easy in principle, simply defining the values we get -# from the NCBI or EBI databases ... except for the ID field. That is a field -# we need to define internally, and once again, we'll use a small function -# to get this right. - -dbAutoincrement - -# After reading the code, predict the results -dbAutoincrement(1) # Hm. 
-dbAutoincrement(1L) -dbAutoincrement(1:4) -dbAutoincrement(c(1, 2, 3, 4)) -dbAutoincrement(c(1L, 2L, 3L, 4L)) -dbAutoincrement(c(1, "4", 9)) -dbAutoincrement(TRUE) - -# Therefore, here is sample code to add one entry to the protein table. -# -mySeq <- " -1 msgdktifka tysgvpvyec iinnvavmrr rsddwlnatq ilkvvgldkp qrtrvlerei -61 qkgihekvqg gygkyqgtwi pldvaielae ryniqgllqp itsyvpsaad spppapkhti -121 stsnrskkii padpgalgrs rratsietes evigaapnnv segsmspsps dissssrtps -181 plpadrahpl hanhalagyn grdannhary adiildyfvt enttvpslli npppdfnpdm -241 sidddehtal hwacamgrir vvklllsaga difrvnsnqq talmratmfs nnydlrkfpe -301 lfellhrsil nidrndrtvf hhvvdlalsr gkphaaryym etminrlady gdqladilnf -361 qddegetplt maararskrl vrlllehgad pkirnkegkn aedyiieder frsspsrtgp -421 agielgadgl pvlptsslht seagqrtagr avtlmsnllh sladsydsei ntaekkltqa -481 hgllkqiqte iedsakvaea lhheaqgvde erkrvdslql alkhainkra rddlerrwse -541 gkqaikrarl qaglepgals tsnatnapat gdqkskddak sliealpagt nvktaiaelr -601 kqlsqvqank telvdkfvar areqgtgrtm aayrrliaag cggiapdevd avvgvlcell -661 qeshtgarag aggerddrar dvammlkgag aaalaanaga p -" - -myRow <- data.frame(ID = dbAutoincrement(db$protein$ID), - name = "UMAG_1122", - RefSeqID = "XP_011392621", - UniProtID = "A0A0D1DP35", - taxonomy.ID = as.integer(5270), - sequence = dbSanitizeSequence(mySeq), - stringsAsFactors = FALSE) -db$protein <- rbind(db$protein, myRow) - -myRow <- data.frame(ID = as.integer(5270), - species = "Ustilago maydis", - stringsAsFactors = FALSE) -db$taxonomy <- rbind(db$taxonomy, myRow) - -# Here is a bit of code template for the same ... it won't execute of course but -# you can copy it into your own code script file to modify when you need to add -# your own protein. 
- -# ===== begin code template ===================================== -mySeq <- " -RAW SEQUENCE FROM NCBI RECORD -" - -myRow <- data.frame(ID = dbAutoincrement(db$protein$ID), - name = "NAME", - RefSeqID = "REFSEQ", - UniProtID = "UNIPROT", - taxonomy.ID = as.integer(TAX_ID), - sequence = dbSanitizeSequence(mySeq), - stringsAsFactors = FALSE) -db$protein <- rbind(db$protein, myRow) - -myRow <- data.frame(ID = as.integer(TAX_ID), - species = "BINOMIAL NAME", - stringsAsFactors = FALSE) -db$taxonomy <- rbind(db$taxonomy, myRow) - -# ===== end code template =========================================== - - - - -# deleting a record - -# This is simple code without error checking and of course we can make mistakes. -# Often we can just overwrite the offending field with correct data. But -# sometimes it will be easier (and more robust) to delete the erroneous entry -# and add the correct one. For example, if your code is in a script, and you -# realize the entry had an error, I would not "patch" the error in the script -# but delete the entry on the console command line, correct the script and -# execute the correct block. That way we fix the mistake at its source. -# -# Removing a row from a datframe is trivial: just overwrite the dataframe with -# a selection statement in which the unique selection of the offending row is -# inverted: - -# create an erroneous entry -myRow <- data.frame(ID = dbAutoincrement(db$protein$ID), - name = "nonesuch", - RefSeqID = "NP_000000", - UniProtID = "A123456", - taxonomy.ID = as.integer(999999), - sequence = dbSanitizeSequence("utter nonsense"), - stringsAsFactors = FALSE) -db$protein <- rbind(db$protein, myRow) - -# Make a logical vector that identifies it -select <- dbConfirmUnique(db$protein$name == "nonesuch") -select # the selection -!select # its logical inversion - -str(db) # before -db$protein <- db$protein[ ! 
select, ] # overwrite the table with a copy -# without the selected record -str(db) # after - -# Note: if you delete records "by hand" you need to be careful that you do -# not remove keys that are used as foreign keys in another table - if there -# are such dependecies, you need to update the other table(s) too. "Real" -# database systems include such dependencies in the creation instructions -# of the table schema: "on delete cascade ..." - -# -# ============================================================================== - - - -# ============================================================================== -# PART ONE: Database maintenance -# ============================================================================== - -# === Merging databases ======================================================== - -# From time to time, we will want to update the systems database - either -# because the schema has changed, e.g. we may include additional tables, or -# because I have added reference data: sequences, annotations and the like. The -# goal here is to make this as simple as possible, keeping all information you -# may have entered intact, and avoiding to duplicate entries. This is not quite -# as easy as one would hope. First of all, since we need to merge records from -# different databases - your own, and the reference - we can't assume that the -# primary keys are unique. You may have introduced the same proteinID that I -# used later for the reference database. Thus we need to update keys. But if we -# update primary keys, we need to make sure that every record that references a -# key as a foreign key is updated as well. And ensuring database integrity for -# "cascading updates" takes a _lot_ of care to guarantee correctness. You don't -# want to read through that code. I don't want to write it. And we should not -# end up pretending to build a real and proper database engine here! -# -# We'll do something simple instead. 
The sanest way is to create -# primary keys with "namespaces", ie. the keys will reflect the source of the -# information. This way, even though the databases are merged, the information -# sources can be identified. Still, it can be accessed through a common -# mechanism. A database merge with separate namespaces can simply proceed as -# follows: - -# -# for each table in the database -# merge the table from both source databases -# throw out any row with a duplicate ID -# -# In this way, if we merge your working database with the reference database -# all reference database records will be updated, and all your working records -# will remain. - -# And while we are adding semnatics to keys, we should also identify in which -# table they are being used. You may remember that I told you: keys should not -# try to express semantics? Here we have a slightly different situation: since -# we are not actually constructing an underlying database engine, we will often -# encounter keys that we use "manually". And in that case it is very helpful to -# make the key's domain explicit. - -# The code I wrote to implement this was loaded when you typed init(). - -# Let's have a brief look at the new dbAutoincrement() function. You'll see the -# code as usual when you execute the function name without its parentheses: - -dbAutoincrement - -# The default namespace is "my" - that's the namespace you use. I will use -# "ref" for all entries of the reference database. Thus you normally won't -# need to enter the namespace identifier - only I do that. - -refDB$protein$ID -dbAutoincrement(refDB$protein$ID) -dbAutoincrement(refDB$protein$ID, ns = "ref") - -# Once the namespaces are thus kept well apart, we won't overwrite each other's -# work. 
Merging two table becomes a breeze: - -tableUnion - -# Here is an example -this <- data.frame(ID = "ref_1", type = "one crow", stringsAsFactors = FALSE) -this <- rbind(this, data.frame(ID = dbAutoincrement(this$ID, ns="ref"), - type = "for", - stringsAsFactors = FALSE)) -this <- rbind(this, data.frame(ID = dbAutoincrement(this$ID, ns="ref"), - type = "sorrow", - stringsAsFactors = FALSE)) -that <- data.frame(ID = "my_1", type = "two crows", stringsAsFactors = FALSE) -that <- rbind(that, this[2:3, ]) # repeat some rows -that <- rbind(that, data.frame(ID = dbAutoincrement(that$ID), - type = "for ", - stringsAsFactors = FALSE)) -that <- rbind(that, data.frame(ID = dbAutoincrement(that$ID), - type = "joy ...", - stringsAsFactors = FALSE)) - -that -this - -rhyme <- tableUnion(this, that) -rhyme$type - -# Finally, we need to do this for all tables in the datamodel. Well, almost all: -# we introduce a field $version, through which we can ensure that the datamodels -# have the same schema. If they would have different schemas the merging would -# break. But we don't merge this single field. (Incidentally: a schema version -# is not "data", rather we call this "metadata": information _about_ the data.) -# So we exclude the "version" from the names of elements to merge. -# -# But how to iterate over all tables in a list by name? You might be tempted to -# try something like -# -# n <- names(db) -# myDB$n[1], myDB$n[2] etc. -# ... but NO! That's not how the $ operator works. -# -# The R community keeps particularly poignant comments from the R-help mailing -# list in a package called "fortunes", and fortune(312) reads: -# -# " The problem here is that the $ notation is a magical shortcut and like any -# other magic if used incorrectly is likely to do the programmatic equivalent -# of turning yourself into a toad. 
-# -# -- Greg Snow (in response to a user that wanted -# to access a column whose name is stored in y -# via x$y rather than x[[y]]) -# R-help (February 2012) - -# So in order to avoid turning into toads, we create a vector "tables", iterate -# over "table" elements from "tables" and use them as ref[[table]] ... etc. - -# === Putting data into the new schema ========================================= - -# Entering the YFO data into the new schema takes almost exactly -# the same syntax as the code you wrote for the last assignment. -# - -# === TASK: === -# Initialize myDB with the YFO homologue of yeast Mbp1 - -# First: initialize "myDB", this is the name of the R object that -# will contain data you collect. - -myDB <- dbInit() # This creates an empty database with the latest schema - -# Next: Add your protein and the YFO to the database. Copy the code-template -# below to your myCode.R file, edit it to replace the placeholder -# items with your data, and execute the code. Then continue below -# the code template. - - -# ===== begin code template: add a protein and an organism to the database ===== - -# == edit placeholder items! -myBinomial <- "" # Name, NOT biCode() -myTaxonomyId <- as.integer() -myProteinName <- "" # Name your protein "MBP1_" (where -# is a placeholder for the biCode() of YFO) -myProteinRefSeqID <- "" -myProteinUniProtID <- "" -mySeq <- " - -" - -# == create the protein entry -proteinRow <- data.frame(ID = dbAutoincrement(myDB$protein$ID), - name = myProteinName, - RefSeqID = myProteinRefSeqID, - UniProtID = myProteinUniProtID, - taxonomy.ID = myTaxonomyId, - sequence = dbSanitizeSequence(mySeq), - stringsAsFactors = FALSE) -myDB$protein <- rbind(myDB$protein, proteinRow) - -# == create the taxonomy entry -taxonomyRow <- data.frame(ID = myTaxonomyId, - species = myBinomial, - stringsAsFactors = FALSE) -myDB$taxonomy <- rbind(myDB$taxonomy, taxonomyRow) -# ===== end code template =========================================== - -# ... continue here. 
- -# myDB now contains one record each in two tables. The remaining tables exist -# but they are empty. Check that all the values are correct: just execute -myDB - -# Now let's merge myDB with the data from refDB. refDB should already have been -# loaded from .utilities.R ; you can also explore the original script with which -# refDB was created, for your reference: it is create_refDB.R The database -# refDB is the default argument for dbMerge(), so you don't need to -# specify it. By assigning the result of dbMerge() back to myDB we overwrite the -# previous version. - -myDB <- dbMerge(myDB) +# Now: create the database +source("./scripts/ABC-createRefDB.R") str(myDB) -# check the protein table -View(myDB$protein[ , c("ID", "name", "RefSeqID")]) + +# === 2.4.1 Examples of navigating the database + + +# You can look at the contents of the tables in the usual way we access +# elements from lists and dataframes. Here are some examples: +myDB$protein +myDB$protein$RefSeqID +myDB$protein[,"name"] +myDB$taxonomy +myDB$taxonomy$species +biCode(myDB$taxonomy$species) + +# Comparing two tables: +# Are all of the taxonomyIDs in the protein table present in the +# taxonomy table? We ought to check, because the way we imported the +# data from JSON objects, we could have omitted or forgotten some. But we can +# check this with one simple expression. Unravel it and study its components. + +all(myDB$protein$taxonomyID %in% myDB$taxonomy$ID) + +# If this is not TRUE, you MUST fix the problem before continuing. + +# Cross-referencing information: +# What is the species name of the protein whose name is "MBP1_COPCI"? + +sel <- myDB$protein$name == "MBP1_COPCI" +x <- myDB$protein$taxonomyID[sel] + +sel <- myDB$taxonomy$ID == x +myDB$taxonomy$species[sel] + + +# == 2.5 Updating the database ============================================= + + +# Basic tasks for databases include retrieving data, selecting data, updating +# and deleting data. 
Here we will take a simple, pedestrian approach:
+#
+# In case we need to modify any of the data, we modify it in the JSON file,
+# save that, and recreate the database. The myDB database will only be
+# used for analysis.
+#
+
+
+# = 3 Add your own data ===================================================
+
+
+# You have chosen an organism as "YFO", and your final task will be to find the
+# protein in YFO that is most similar to yeast Mbp1 and enter its information
+# into the database.
+
+
+# == 3.1 Find a protein ====================================================
+
+
+# The BLAST algorithm will be properly introduced in a later learning unit -
+# for now just use it in the following way:
+#
+# - Navigate to https://blast.ncbi.nlm.nih.gov/Blast.cgi and click on
+# Protein BLAST.
+# - Enter NP_010227 into the "Query Sequence" field.
+# - Choose "Reference proteins (refseq_protein)" as the "Database".
+# - Paste the YFO species name into the "Organism" field.
+#
+# - Click "BLAST".
+
+# You will probably get more than one result. If you get dozens of results or
+# more, or if you get no results, something went wrong. Reconsider whether the
+# problem was with your input, try something different, or ask for help.
+
+# Otherwise, look for the top-hit in the "Alignments" section. In some cases
+# there will be more than one hit with nearly similar E-values. If this is the
+# case for YFO, choose the one with the higher degree of similarity (more
+# identities) with the N-terminus of the query - i.e. the Query sequence of
+# the first ~ 100 amino acids.
+
+# - Follow the link to the protein data page, linked from "Sequence ID".
+# - From there, in a separate tab, open the link to the taxonomy database page
+# for YFO which is linked from the "ORGANISM" record. 
+
+
+# == 3.2 Put the information into JSON files ===============================
+
+
+# - Next make a copy of the file "./data/MBP1_SACCE.json" in your project
+# directory and give it a new name that corresponds to YFO - e.g. if
+# YFO is called "Cryptococcus neoformans", your file should be called
+# "MBP1_CRYNE.json"; in that case "MBP1_CRYNE" would also be the
+# "name" of your protein. Open the file in the RStudio editor and replace
+# all of the MBP1_SACCE data with the corresponding data of your protein.
+#
+# - Do a similar thing for the YFO taxonomy entry. Copy
+# "./data/refTaxonomy.json" and make a new file named "YFOtaxonomy.json".
+# Create a valid JSON file with only one single entry - that of YFO.
+#
+# - Validate your two files online at https://jsonlint.com/
+
+
+# == 3.3 Create an R script to create the database =========================
+
+
+# Next: to create the database.
+# - Make a new R script, call it "makeProteinDB.R"
+# - enter the following expression as the first command:
+# source("./scripts/ABC-createRefDB.R")
+# - then add the two commands that add your protein and taxonomy data,
+# they should look like:
+# myDB <- dbAddProtein( myDB, fromJSON("MBP1_.json"))
+# myDB <- dbAddTaxonomy( myDB, fromJSON("YFOtaxonomy.json"))
+#
+# - save the file and source() it:
+# source("makeProteinDB.R")
+
+# This command needs to be executed whenever you recreate
+# the database. In particular, whenever you have added or modified data
+# in any of the JSON files. Later you will add more information ...
+
+
+# === 3.3.1 Check and validate
+
 # Is your protein named according to the pattern "MBP1_"? It should be.
-# And does the taxonomy table contain the binomial name? It should be the same
+# And does the taxonomy table contain the systematic name? It should be the same
 # that you get when you type YFO into the console.
 
-# Let's compute sequence lengths on the fly (with the function nchar() ) and
-# add them to our view. 
Then we'll open this with the table viewer function
-# View()
+# Let's compute sequence lengths on the fly (with the function nchar() ), and
+# open this with the table viewer function View()
 
 View(cbind(myDB$protein[ , c("ID", "name", "RefSeqID")],
 length = nchar(myDB$protein$sequence)))
 
-# Where does your protein's length fall relative to the reference proteins?
-# About the same? Much shorter? Much longer?
+# Does your protein appear in the last row of this table? Where does your
+# protein's length fall relative to the reference proteins? About the same? Much
+# shorter? Much longer? If it is less than 500 amino acids long, I would suspect
+# an error. Contact me for advice.
 
-# Is that the right sequence?
-sel <- myDB$protein$ID == "my_pro_1"
-myDB$protein$sequence[sel]
+# Is that the right sequence? Is it the same as the one on the NCBI protein
+# database page?
+myDB$protein$sequence[nrow(myDB$protein)]
 
 # If not, don't continue! Fix the problem first.
 
 # Let me repeat: If this does not give you the right sequence of the YFO
@@ -522,162 +572,27 @@ myDB$taxonomy[sel, ]
 # If not, or if the result was "<0 rows> ... " then DO NOT CONTINUE.
 # Fix the problem first.
 
-# Does this give you the right protein ID for MBP1_?
-sel <- dbConfirmUnique(myDB$protein$name == paste("MBP1_", biCode(YFO), sep = ""))
-myDB$protein$ID[sel]
+# Does this give you the right RefSeq ID for MBP1_?
+sel <- myDB$protein$name == paste0("MBP1_", biCode(YFO))
+myDB$protein$RefSeqID[sel]
 
 # If not, or if the result was "<0 rows> ... " then DO NOT CONTINUE.
 # Fix the problem first.
 
-#
-#
-# === Saving and loading data ==================================================
-#
-# Once you have confirmed that myDB has been successfully created and updated
-# and is in a good state, you should make a backup copy. There are many ways to
-# save data to a file on disk and read it back in. One of the most convenient is
-# the function pair save() and load().
-#
-# save() saves an R object to a file. 
Its signature is -# save(, file = ) - -# The object, or objects that are saved like this are identically recreated when -# load() is executed. The object name is not changed - you don't assign the -# result of load(). You use load() for its "side-effect": re-creating the saved -# object, not for using the return-value. Therefore the signature for load() is -# simply - -# load() - -# All R objects in are created by load(), if they don't yet exist. -# If they already exist, the will be overwritten. The only effect you see is -# that the object appears in the Environment pane of RStudio (if it wasn't there -# already), or you may notice that its content has changed if it did exist. - -# == TASK: == -# Save myDB so you can recreate it easily when you next open RStudio. - -save(myDB, file = "myDB.01.RData") # Note that I give this a version number! - -# Let's confirm: -rm(myDB) -nrow(myDB$protein) # must give an error -load("myDB.01.RData") -nrow(myDB$protein) # must be 11. If not, don't continue etc. - -===New Database === - - Here is some sample code to work with the new database, enter new protein data for YFO, save it and load it again when needed. - - - - # You don't need to load the reference database refDB. If - # everything is set up correctly, it gets loaded at startup. - # (Just so you know: you can turn off that behaviour if you - # ever should want to...) - - - # First you need to load the newest version of dbUtilities.R - - updateDButilities("7bb32ab3d0861ad81bdcb9294f0f6a737b820bf9") - -# If you get an error: -# Error: could not find function "updateDButilities" -# ... then it seems you didn't do the previous update. - -# Try getting the update with the new key but the previous function: -# updateDbUtilities() -# -# If that function is not found either, confirm that your ~/.Rprofile -# actually loads dbUtilites.R from your project directory. 
- -# As a desperate last resort, you could uncomment -# the following piece of code and run the update -# without verification... -# -# URL <- "http://steipe.biochemistry.utoronto.ca/abc/images/f/f9/DbUtilities.R" -# download.file(URL, paste(PROJECTDIR, "dbUtilities.R", sep="")), method="auto") -# source(paste(PROJECTDIR, "dbUtilities.R", sep="")) -# -# But be cautious: there is no verification. You yourself need -# to satisfy yourself that this "file from the internet" is what -# it should be, before source()'ing it... - - -# After the file has been source()'d, refDB exists. -ls(refDB) - - -# check the contents of refDB: -refDB$protein$name -refDB$taxonomy - - -# list refSeqIDs for saccharomyces cerevisiae genes. -refDB$protein[refDB$protein$taxID == 559292, "refSeqID"] - - -# To add some genes from YFO, I proceed as follows. -# Obviously, you need to adapt this to your YFO -# and the sequences in YFO that you have found -# with your PSI-BLAST search. - -# Let's assume my YFO is the fly agaric (amanita muscaria) -# and its APSES domain proteins have the following IDs -# (these are not refSeq btw. and thus unlikely -# to be found in UniProt) ... -# KIL68212 -# KIL69256 -# KIL65817 -# - - -# First, I create a copy of the database with a name that -# I will recognize to be associated with my YFO. -amamuDB <- refDB - - -# Then I fetch my protein data ... -tmp1 <- fetchProteinData("KIL68212") -tmp2 <- fetchProteinData("KIL69256") -tmp3 <- fetchProteinData("KIL65817") - - -# ... and if I am satisfied that it contains what I -# want, I add it to the database. -amamuDB <- addToDB(amamuDB, tmp1) -amamuDB <- addToDB(amamuDB, tmp2) -amamuDB <- addToDB(amamuDB, tmp3) - - -# Then I make a local backup copy. Note the filename and -# version number :-) -save(amamuDB, file="amamuDB.01.RData") - - -# Now I can explore my new database ... -amamuDB$protein[amamuDB$protein$taxID == 946122, "refSeqID"] - - -# ... 
but if anything goes wrong, for example -# if I make a mistake in checking which -# rows contain taxID 946122 ... -amamuDB$protein$taxID = 946122 - -# Ooops ... what did I just do wrong? -# ... wnat happened instead? - -amamuDB$protein$taxID - - -# ... I can simply recover from my backup copy: -load("amamuDB.01.RData") -amamuDB$protein$taxID +# == 3.4 Task: submit for credit (part 2/2) ================================ +# - On your submission page, note the E-value of your protein and link +# to its NCBI protein database page. +# - Copy and paste the contents of your two JSON files on your submission +# page on the Student Wiki +# - Execute the two commands below and show the result on your submission page +biCode(myDB$taxonomy$species) %in% biCode(YFO) +myDB$protein$taxonomyID %in% myDB$taxonomy$ID[(myDB$taxonomy$species == YFO)] +# That is all. # [END]