From 6a565c2ab60dc8d9bee18c2bf6d0d611bc7192f3 Mon Sep 17 00:00:00 2001 From: hyginn Date: Mon, 25 Sep 2017 01:30:29 -0400 Subject: [PATCH] New unit --- BIN-Storing_data.R | 1119 ++++++++++++++++++++------------------------ 1 file changed, 517 insertions(+), 602 deletions(-) diff --git a/BIN-Storing_data.R b/BIN-Storing_data.R index b93bb67..fc1f0f1 100644 --- a/BIN-Storing_data.R +++ b/BIN-Storing_data.R @@ -3,11 +3,12 @@ # Purpose: A Bioinformatics Course: # R code accompanying the BIN-Storing_data unit # -# Version: 0.1 +# Version: 1.0 # -# Date: 2017 08 25 +# Date: 2017 09 23 # Author: Boris Steipe (boris.steipe@utoronto.ca) # +# V 1.0 First live version, complete rebuilt. Now using JSON data sources. # V 0.1 First code copied from BCH441_A03_makeYFOlist.R # # TODO: @@ -16,500 +17,549 @@ # == HOW TO WORK WITH LEARNING UNIT FILES ====================================== # # DO NOT SIMPLY source() THESE FILES! - +# # If there are portions you don't understand, use R's help system, Google for an # answer, or ask your instructor. Don't continue if you don't understand what's # going on. That's not how it works ... 
# # ============================================================================== + +#TOC> ========================================================================== +#TOC> +#TOC> Section Title Line +#TOC> ------------------------------------------------------------ +#TOC> 1 A Relational Datamodel in R: review 55 +#TOC> 1.1 Building a sample database structure 95 +#TOC> 1.1.1 completing the database 206 +#TOC> 1.2 Querying the database 241 +#TOC> 1.3 Task: submit for credit (part 1/2) 270 +#TOC> 2 Implementing the protein datamodel 282 +#TOC> 2.1 JSON formatted source data 308 +#TOC> 2.2 "Sanitizing" sequence data 343 +#TOC> 2.3 Create a protein table for our data model 363 +#TOC> 2.3.1 Initialize the database 365 +#TOC> 2.3.2 Add data 377 +#TOC> 2.4 Complete the database 397 +#TOC> 2.4.1 Examples of navigating the database 424 +#TOC> 2.5 Updating the database 456 +#TOC> 3 Add your own data 468 +#TOC> 3.1 Find a protein 476 +#TOC> 3.2 Put the information into JSON files 505 +#TOC> 3.3 Create an R script to create the database 522 +#TOC> 3.3.1 Check and validate 542 +#TOC> 3.4 Task: submit for credit (part 2/2) 583 +#TOC> +#TOC> ========================================================================== + -# ============================================================================== -# PART TWO: Implementing the (protein part of) the datamodel -# ============================================================================== +# = 1 A Relational Datamodel in R: review ================================= -# Every entity in our datmodel can be implemented by a dataframe. To keep things -# organized, we create a list, that contains the enity tables. Here is a -# definition for a dataframe that holds the protein data for yeast Mbp1 as -# described in the data model schema: +# A disclaimer at first: we are not building an industry-strength database at +# all here - but we are employing principles of such a database to keep common +# types of lab-data well organized. 
Don't think of this as emulating or even +# replacing a "real" database, but think of this as improving the ad hoc +# approaches we normally employ to store data in the lab. That does not mean +# such ad hoc approaches are necessarily bad - the best solution always depends +# on your objectives, the details of your tasks, and the context in which you +# are working. + +# The principle we follow in implementing a relational data model is to build a +# list of dataframes . This list is our "database": +# - Each _entity_ of the datamodel is a dataframe. In an SQL database, these +# would also be called "tables". In a spreadsheet this would be a "sheet". +# - Each instance of an entity, i.e. one stored _item_, is a row of the data +# frame. In an SQL database this would be a record. In a spreadsheet this is +# a row. +# - Each _attribute_ of an entity is is a column of the dataframe. In an SQL +# database this is a column, in a spreadsheet too. +# - This doesn't necessarily solve the question of how we will store and curate +# our source data - we will defer that to later. At first we talk only about +# data representation internal to our R session, where we need it for +# processing and analysis. + +# Lets review syntax for creating and accessing such a structure, a list of data +# frames. You'll have to be absolutely confident with this, or you'll get lost +# in all the later learning units. We'll start from a compact example, a tiny +# database of philosophers to keep things brief. That database will have three +# tables: person, works and book. Person stores biographical data, book stores +# books, and works is a join table associating persons with their work. You +# should already be familiar with "join tables" and why we need them. 
This is +# the structure: # -# === Creating two tables === -db <- list() # we'll call our database "db" and start with an empty list +# person: id, name, born, died, school +# book: id, title, published +# works: id, person$id, book$id -# Then add a dataframe with one row to the list -db$protein <- data.frame(ID = as.integer(1), - name = "Mbp1", - RefSeqID = "NP_010227", - UniProtID = "P39678", - taxonomy.ID = as.integer(4932), - sequence = "...", # just a placeholder - stringsAsFactors = FALSE) +# Perhaps draw out this schema to make things more clear. -str(db) +# == 1.1 Building a sample database structure ============================== -# Next, we create the taxonomy table and add it into db -db$taxonomy <- data.frame(ID = as.integer(4932), - species = "Saccharomyces cerevisiae", - stringsAsFactors = FALSE) +# Let's build this structure. -# Let's add a second protein: We create a one-row dataframe with the data, then we rbind() it to the existing data frame: -myRow <- data.frame(ID = as.integer(2), - name = "Res2", - RefSeqID = "NP_593032", - UniProtID = "P41412", - taxonomy.ID = as.integer(4896), - sequence = "...", # again, just a placeholder - stringsAsFactors = FALSE) -db$protein <- rbind(db$protein, myRow) +philDB <- list() # This is an empty list -myRow <- data.frame(ID = as.integer(4896), - species = "Schizosaccharomyces pombe", - stringsAsFactors = FALSE) -db$taxonomy <- rbind(db$taxonomy, myRow) +# This is a data frame that we initialize with two philosophers +x <- data.frame(id = c(1,2), + name = c("Laozi", "Martin Heidegger"), + born = c(NA, "1889"), + died = c("531 BCE", "1976"), + school = c("Daoism", "Phenomenology"), + stringsAsFactors = FALSE) +str(x) -str(db) +# Lets add the dataframe to the philDB list and call it "person" there. +philDB[["person"]] <- x +str(philDB) -# you can look at the contents of the tables in the usual way we would access -# elements from lists and dataframes. 
Here are some examples: -db$protein -db$protein$RefSeqID -db$protein[,"name"] -db$taxonomy -db$taxonomy$species -biCode(db$taxonomy$species) +# and let's remove x so we don't mix up things later. +rm(x) -# Here is an example to look up information in one table, -# based on a condition in another table: -# what is the species name for the protein -# whose name is "Mbp1"? +# We can address elements with the usual subsetting operators. I will use +# the $ operator for tables and columns, the [] operator for elements in +# columns. For example ... -# First, get the taxonomy.ID for the Mbp1 protein. This is -# the key we need for the taxonomy table. We find it in a cell in the -# table: db$protein[, ] -# is that row for which the value in -# the "name" column is Mbp1: +philDB$person$name[1] # Laozi -db$protein$name == "Mbp1" # TRUE FALSE - -# The is called "taxonomy.ID". Simply -# insert these two expressions in the square -# brackets. - -db$protein[db$protein$name == "Mbp1", "taxonomy.ID"] - -# Assign the taxonomy.ID value ... -x <- db$protein[db$protein$name == "Mbp1", "taxonomy.ID"] - -# ... and fetch the species_name value from db$taxonomy - -db$taxonomy[db$taxonomy$ID == x, "species"] +# task: Write an expression that returns all "school" entries from the +# person table. +# Let's now add another person. There are several ways to do this, the +# conceptually cleanest is to create a one-row dataframe with the data, and +# rbind() it to the existing dataframe. Doing this, we must take care that +# the data frame column names are identical. What happens if they are not? +# Let's find out: +(x <- data.frame(a=1:4, b=11:14)) +(y <- data.frame(a=6, c=17)) +rbind(x, y) -# === Updating the database ==================================================== +(y <- data.frame(a=6, b=17)) +rbind(x, y) -# Modifying a field +# All clear? That's good - this behaviour provides us with a sanity check on the +# operation. 
-# Here is code that modifies the sequence field in the protein table of the -# database: +(x <- data.frame(id = 2, + name = "Zhuangzi", + born = "369 BCE", + died = "286 BCE", + school = "Daoism")) + +# Add this to the "person" table in our database with rbind() ... +philDB$person <- rbind(philDB$person, x) + +# ... and examine the result: +str(philDB) + +# Now one thing you should note is that we had forgotten to declare +# stringsAsFactors = FALSE when we created x - but this did not damage +# the database. This is because the existing columns had type chr and the +# implicit coercion, e.g. ... +as.character(x$name) +# happened to do the right thing. Don't rely on that. The Right Way is to +# turn factors off, even when you are making just a single row. + +# But we made a serious error in our data! Did you spot it? # -mySeq <- " -1 msnqiysary sgvdvyefih stgsimkrkk ddwvnathil kaanfakakr trilekevlk -61 ethekvqggf gkyqgtwvpl niakqlaekf svydqlkplf dftqtdgsas pppapkhhha -121 skvdrkkair sastsaimet krnnkkaeen qfqsskilgn ptaaprkrgr pvgstrgsrr -181 klgvnlqrsq sdmgfprpai pnssisttql psirstmgpq sptlgileee rhdsrqqqpq -241 qnnsaqfkei dledglssdv epsqqlqqvf nqntgfvpqq qssliqtqqt esmatsvsss -301 pslptspgdf adsnpfeerf pgggtspiis miprypvtsr pqtsdindkv nkylsklvdy -361 fisnemksnk slpqvllhpp phsapyidap idpelhtafh wacsmgnlpi aealyeagts -421 irstnsqgqt plmrsslfhn sytrrtfpri fqllhetvfd idsqsqtvih hivkrksttp -481 savyyldvvl skikdfspqy rielllntqd kngdtalhia skngdvvffn tlvkmgaltt -541 isnkegltan eimnqqyeqm miqngtnqhv nssntdlnih vntnnietkn dvnsmvimsp -601 vspsdyityp sqiatnisrn ipnvvnsmkq masiyndlhe qhdneikslq ktlksisktk -661 iqvslktlev lkesskdeng eaqtnddfei lsrlqeqntk klrkrliryk rlikqkleyr -721 qtvllnklie detqattnnt vekdnntler lelaqeltml qlqrknklss lvkkfednak -781 ihkyrriire gtemnieevd ssldvilqtl iannnknkga eqiitisnan sha -// -" - - -str(db$protein) # before -db$protein$sequence[db$protein$name == "Mbp1"] <- dbSanitizeSequence(mySeq) -str(db$protein) # after - -# Analyze the expression ! 
Note how we specify an element of a vector (column) -# in a data frame in a list using a logical expression. And note how we assign -# the output (return value) of a function. As far as database coding goes this -# is pretty minimal - there is no error checking done at all. In particular: can -# we really guarantee that the name "Mbp1" is unique in the protein table? No! -# We never required it to be unique. This is a check we need to perform so -# frequently that we will encapsulate it in a function: - -dbConfirmUnique - -# try this: -dbConfirmUnique(c("TRUE", "FALSE")) -dbConfirmUnique(c(TRUE, FALSE)) -dbConfirmUnique(c(TRUE, TRUE)) -dbConfirmUnique(c(FALSE, FALSE)) - - - +# If not, look at ... +philDB$person$id +# ... does that look oK? # +# Absolutely not! id is the Primary Key in the table, and it has to be +# unique. How can we guarantee it to be unique? Certainly not when we +# enter it by hand. We need a function that generates a unique key. Here's +# a simple version, without any error-checking. It assumes that a column +# named "id" exists in the table, and that it holds the Primary Keys: +autoincrement <- function(table) { + return(max(table$id) + 1) +} + +#Try it: +autoincrement(philDB$person) + +# Once that is clear, let's remove the Zhuangzi entry and recreate it correctly. +# Many ways to remove, here we use a logical expression to select matching record(s), +# apply the results to subset the data frame, and overwrite the existing table +# with the new one. + +sel <- !(philDB$person$name == "Zhuangzi") # select ... +philDB$person <- philDB$person[sel, ] # ... and replace + +str(philDB) + +# Now let's add Zhuangzi with correct data. Note how we use the autoincrement +# function for the id + +x <- data.frame(id = autoincrement(philDB$person), + name = "Zhuangzi", + born = "369 BCE", + died = "286 BCE", + school = "Daoism", + stringsAsFactors = FALSE) +philDB$person <- rbind(philDB$person, x) +str(philDB) + +# So far so good. Be honest with yourself. 
If you didn't follow any of this, +# go back, re-read, play with it, and ask for help. This is essential. + + +# === 1.1.1 completing the database + + +# Next I'll add one more person, and create the other two tables: + +x <- data.frame(id = autoincrement(philDB$person), + name = "Kongzi", + born = "551 BCE", + died = "479 BCE", + school = "Confucianism", + stringsAsFactors = FALSE) +philDB$person <- rbind(philDB$person, x) + + +philDB[["books"]] <- data.frame(id = 1:5, + title = c("Zhuangzi", + "Analects", + "Being and Time", + "Daodejing", + "On the Way to Language"), + published = c("300 BCE", + "220 BCE", + "1927", + "530 BCE", + "1959"), + stringsAsFactors = FALSE) + +philDB[["works"]] <- data.frame(id = 1:5, + personID = c(3, 4, 2, 1, 2), + bookID = c(1, 2, 3, 4, 5), + stringsAsFactors = FALSE) + +str(philDB) + + +# == 1.2 Querying the database ============================================= + + +# To retrieve data, we need to subset tables, possibly based on conditions we +# find in other tables. Sometimes we can simply get the information, e.g. +# all names ... +philDB$person$name +# ... or all book titles ... +philDB$books$title + +# ... but sometimes we need to cross-reference information via join tables. Here +# is an example where we list authors and their works, sorted alphabetically by +# author: +(sel <- order(philDB$person$name)) # check out ?order and describe to + # someone you know what it does, so that + # you are sure you understand it. +(pID <- philDB$person$id[sel]) +sel <- numeric() # initialize the vector +for (ID in pID) { + sel <- which(philDB$works$personID == ID) # get all rows for which + # the condition is TRUE + cat(sprintf("%s: ", philDB$person$name[ID])) # output the person + cat(sprintf("\"%s\" ", philDB$books$title[sel])) # output the book + cat("\n") +} + +# Examine the intermediate results and trace the logic until this is clear. 
+ + +# == 1.3 Task: submit for credit (part 1/2) ================================ + + +# Write and submit code that adds another philosopher to the datamodel: +# Immanuel Kant, (1724 - 1804), Enlightenment Philosophy. +# Works: Critique of Pure Reason (1781), Critique of Judgement (1790) +# Write and submit code that lists the books in alphabetical order, +# followed by the author and the year of publishing. Format your output like: +# "Analects" - Kongzi (220 BCE) +# Show the result. + + +# = 2 Implementing the protein datamodel ================================== + + +# Working with the code above has probably illustrated a few concerns about +# curating data and storing it for analysis. In particular the join tables +# seem problematic - figuring out the correct IDs, it's easy to make +# mistakes. +# - Data needs to be captured in a human-readable form so it can be verified +# and validated; +# - Some aspects of the database should _never_ be done by hand because they +# errors are easy to make and hard to see. That essentially includes +# every operation that has to do with abstract, primary keys; +# - Elementary operations we need to support are: adding data, selecting +# data, modifying data and deleting data. + +# We will therefore construct our database in the following way: +# - For each table, we will keep the primary information in JSON files. There +# it is easy to read, edit if needed, and modify it. +# - We will use simple scripts to read the JSON data and assemble it in +# our database for further analysis. +# - I have constructed initial files for yeast Mbp1 and nine other reference +# species. +# - I have written a small number of utility functions to read those files +# and assemble them into a database. + + +# == 2.1 JSON formatted source data ======================================== + + +# Have a look at the structure of the yeast Mbp1 protein data: +file.edit("./data/MBP1_SACCE.json") + +# - The whole thing is an array: [ ... ]. 
This is not necessary for a single +# object, but we will have more objects in other files. And it's perfectly +# legal to have an array with a single element. +# - The data is formatted as "key" : "value" pairs inside an object { ... }. +# This keeps the association between data items and their semantics +# explicit. +# - All keys are strings and they are unique in the object. +# - Values are mostly single strings and integers ... +# - ... except for "sequence". That one is an array of strings. Why? This is to +# make it easier to format and maintain the data. JSON does not allow line +# breaks within strings, but the strings we copy/paste from Genbank or other +# sources might have line breaks, sequence numbers etc. So we need to +# sanitize the sequence at some point. But since we need to do that +# anyway, it is easier to see the whole sequence if we store it in chunks. + +# Let's load the "jsonlite" package and have a look at how it reads this data. + +if (! require("jsonlite", quietly = TRUE)) { + install.packages("jsonlite") + library(jsonlite) +} + +x <- fromJSON("./data/MBP1_SACCE.json") +str(x) + +x$name +unlist(x$sequence) + + +# == 2.2 "Sanitizing" sequence data ======================================== + + +# Examine the dbSanitizeSequence() function: + +dbSanitizeSequence + +# Try: + +dbSanitizeSequence(c("GAA", "ttc")) +dbSanitizeSequence("MsnQ00%0 I@#>YSary S + G1 V2DV3Y>") +x <- " 1 msnqiysary sgvdvyefih stgsimkrkk ddwvnathil kaanfakakr trilekevlk + 61 ethekvqggf gkyqgtwvpl niakqlaekf svydqlkplf dftqtdgsas pppapkhhha + 121 skvdrkkair sastsaimet krnnkkaeen qfqsskilgn ptaaprkrgr pvgstrgsrr +..." # copy/paste from Genbank + +dbSanitizeSequence(x) + + +# == 2.3 Create a protein table for our data model ========================= + +# === 2.3.1 Initialize the database + + +# The function dbInit contains all the code to return a list of empty +# data frames for our data model. 
+ +dbInit + +myDB <- dbInit() +str(myDB) + + +# === 2.3.2 Add data + + +# fromJSON() returns a dataframe that we can readily process to add data +# to our table. Have a look at the function to add protein entries: + +dbAddProtein + +myDB <- dbAddProtein(myDB, fromJSON("./data/MBP1_SACCE.json")) +str(myDB) + +# Lets check that the 833 amino acids of the yeast MBP1 sequence have +# safely arrived. Note the genral idiom we use here to retrieve the data: +# we define a boolean vector that satisfies a condition, then we subset +# a column with that vector. + +sel <- myDB$protein$name == "MBP1_SACCE" +nchar(myDB$protein$sequence[sel]) + + +# == 2.4 Complete the database ============================================= + + +# Completing the database with Mbp1 data and data for 9 other "reference" +# species is more of the same. I have assembled the code in a script +# "./scripts/ABC-createRefDB.R" - open it, check it out, and then source it. +# It's really very simple, just reading some prepared files of data I have +# formatted with JSON, and assembling the data in our data model. # -# Here is the update to the sequence field of Res2 but using our -# confirmUnique() function +# The code is also very simple and in particular there is no checking for errors +# or inconsistencies. Have a look: +# Totally straightforward ... 
+dbAddTaxonomy +dbAddFeature -mySeq <- " -1 maprssavhv avysgvevye cfikgvsvmr rrrdswlnat qilkvadfdk pqrtrvlerq -61 vqigahekvq ggygkyqgtw vpfqrgvdla tkykvdgims pilsldideg kaiapkkkqt -121 kqkkpsvrgr rgrkpsslss stlhsvnekq pnssisptie ssmnkvnlpg aeeqvsatpl -181 paspnallsp ndntikpvee lgmleapldk yeeslldffl hpeegripsf lyspppdfqv -241 nsvidddght slhwacsmgh iemiklllra nadigvcnrl sqtplmrsvi ftnnydcqtf -301 gqvlellqst iyavdtngqs ifhhivqsts tpskvaaaky yldcilekli siqpfenvvr -361 lvnlqdsngd tslliaarng amdcvnslls ynanpsipnr qrrtaseyll eadkkphsll -421 qsnsnashsa fsfsgispai ispscsshaf vkaipsissk fsqlaeeyes qlrekeedli -481 ranrlkqdtl neisrtyqel tflqknnpty sqsmenlire aqetyqqlsk rlliwlearq -541 ifdlerslkp htslsisfps dflkkedgls lnndfkkpac nnvtnsdeye qlinkltslq -601 asrkkdtlyi rklyeelgid dtvnsyrrli amscginped lsleildave ealtrek -" +# Just slightly more complex, since we need to match the protein or feature +# name in the JSON file with its internal ID, and, when doing that confirm +# that it CAN be matched and that the match is UNIQUE +dbAddAnnotation -db$protein$sequence[dbConfirmUnique(db$protein$name == "Res2")] <- dbSanitizeSequence(mySeq) -str(db$protein) - -# These expressions can get rather long, it's easier to read if we write: - -select <- dbConfirmUnique(db$protein$name == "Res2") -value <- dbSanitizeSequence(mySeq) -db$protein$sequence[select] <- value - -# ... and that's the code paradigm that we will adopt to update -# database fields (for now). - -# Adding a record - -# Adding a record is easy in principle, simply defining the values we get -# from the NCBI or EBI databases ... except for the ID field. That is a field -# we need to define internally, and once again, we'll use a small function -# to get this right. - -dbAutoincrement - -# After reading the code, predict the results -dbAutoincrement(1) # Hm. 
-dbAutoincrement(1L) -dbAutoincrement(1:4) -dbAutoincrement(c(1, 2, 3, 4)) -dbAutoincrement(c(1L, 2L, 3L, 4L)) -dbAutoincrement(c(1, "4", 9)) -dbAutoincrement(TRUE) - -# Therefore, here is sample code to add one entry to the protein table. -# -mySeq <- " -1 msgdktifka tysgvpvyec iinnvavmrr rsddwlnatq ilkvvgldkp qrtrvlerei -61 qkgihekvqg gygkyqgtwi pldvaielae ryniqgllqp itsyvpsaad spppapkhti -121 stsnrskkii padpgalgrs rratsietes evigaapnnv segsmspsps dissssrtps -181 plpadrahpl hanhalagyn grdannhary adiildyfvt enttvpslli npppdfnpdm -241 sidddehtal hwacamgrir vvklllsaga difrvnsnqq talmratmfs nnydlrkfpe -301 lfellhrsil nidrndrtvf hhvvdlalsr gkphaaryym etminrlady gdqladilnf -361 qddegetplt maararskrl vrlllehgad pkirnkegkn aedyiieder frsspsrtgp -421 agielgadgl pvlptsslht seagqrtagr avtlmsnllh sladsydsei ntaekkltqa -481 hgllkqiqte iedsakvaea lhheaqgvde erkrvdslql alkhainkra rddlerrwse -541 gkqaikrarl qaglepgals tsnatnapat gdqkskddak sliealpagt nvktaiaelr -601 kqlsqvqank telvdkfvar areqgtgrtm aayrrliaag cggiapdevd avvgvlcell -661 qeshtgarag aggerddrar dvammlkgag aaalaanaga p -" - -myRow <- data.frame(ID = dbAutoincrement(db$protein$ID), - name = "UMAG_1122", - RefSeqID = "XP_011392621", - UniProtID = "A0A0D1DP35", - taxonomy.ID = as.integer(5270), - sequence = dbSanitizeSequence(mySeq), - stringsAsFactors = FALSE) -db$protein <- rbind(db$protein, myRow) - -myRow <- data.frame(ID = as.integer(5270), - species = "Ustilago maydis", - stringsAsFactors = FALSE) -db$taxonomy <- rbind(db$taxonomy, myRow) - -# Here is a bit of code template for the same ... it won't execute of course but -# you can copy it into your own code script file to modify when you need to add -# your own protein. 
- -# ===== begin code template ===================================== -mySeq <- " -RAW SEQUENCE FROM NCBI RECORD -" - -myRow <- data.frame(ID = dbAutoincrement(db$protein$ID), - name = "NAME", - RefSeqID = "REFSEQ", - UniProtID = "UNIPROT", - taxonomy.ID = as.integer(TAX_ID), - sequence = dbSanitizeSequence(mySeq), - stringsAsFactors = FALSE) -db$protein <- rbind(db$protein, myRow) - -myRow <- data.frame(ID = as.integer(TAX_ID), - species = "BINOMIAL NAME", - stringsAsFactors = FALSE) -db$taxonomy <- rbind(db$taxonomy, myRow) - -# ===== end code template =========================================== - - - - -# deleting a record - -# This is simple code without error checking and of course we can make mistakes. -# Often we can just overwrite the offending field with correct data. But -# sometimes it will be easier (and more robust) to delete the erroneous entry -# and add the correct one. For example, if your code is in a script, and you -# realize the entry had an error, I would not "patch" the error in the script -# but delete the entry on the console command line, correct the script and -# execute the correct block. That way we fix the mistake at its source. -# -# Removing a row from a datframe is trivial: just overwrite the dataframe with -# a selection statement in which the unique selection of the offending row is -# inverted: - -# create an erroneous entry -myRow <- data.frame(ID = dbAutoincrement(db$protein$ID), - name = "nonesuch", - RefSeqID = "NP_000000", - UniProtID = "A123456", - taxonomy.ID = as.integer(999999), - sequence = dbSanitizeSequence("utter nonsense"), - stringsAsFactors = FALSE) -db$protein <- rbind(db$protein, myRow) - -# Make a logical vector that identifies it -select <- dbConfirmUnique(db$protein$name == "nonesuch") -select # the selection -!select # its logical inversion - -str(db) # before -db$protein <- db$protein[ ! 
select, ] # overwrite the table with a copy -# without the selected record -str(db) # after - -# Note: if you delete records "by hand" you need to be careful that you do -# not remove keys that are used as foreign keys in another table - if there -# are such dependecies, you need to update the other table(s) too. "Real" -# database systems include such dependencies in the creation instructions -# of the table schema: "on delete cascade ..." - -# -# ============================================================================== - - - -# ============================================================================== -# PART ONE: Database maintenance -# ============================================================================== - -# === Merging databases ======================================================== - -# From time to time, we will want to update the systems database - either -# because the schema has changed, e.g. we may include additional tables, or -# because I have added reference data: sequences, annotations and the like. The -# goal here is to make this as simple as possible, keeping all information you -# may have entered intact, and avoiding to duplicate entries. This is not quite -# as easy as one would hope. First of all, since we need to merge records from -# different databases - your own, and the reference - we can't assume that the -# primary keys are unique. You may have introduced the same proteinID that I -# used later for the reference database. Thus we need to update keys. But if we -# update primary keys, we need to make sure that every record that references a -# key as a foreign key is updated as well. And ensuring database integrity for -# "cascading updates" takes a _lot_ of care to guarantee correctness. You don't -# want to read through that code. I don't want to write it. And we should not -# end up pretending to build a real and proper database engine here! -# -# We'll do something simple instead. 
The sanest way is to create -# primary keys with "namespaces", ie. the keys will reflect the source of the -# information. This way, even though the databases are merged, the information -# sources can be identified. Still, it can be accessed through a common -# mechanism. A database merge with separate namespaces can simply proceed as -# follows: - -# -# for each table in the database -# merge the table from both source databases -# throw out any row with a duplicate ID -# -# In this way, if we merge your working database with the reference database -# all reference database records will be updated, and all your working records -# will remain. - -# And while we are adding semnatics to keys, we should also identify in which -# table they are being used. You may remember that I told you: keys should not -# try to express semantics? Here we have a slightly different situation: since -# we are not actually constructing an underlying database engine, we will often -# encounter keys that we use "manually". And in that case it is very helpful to -# make the key's domain explicit. - -# The code I wrote to implement this was loaded when you typed init(). - -# Let's have a brief look at the new dbAutoincrement() function. You'll see the -# code as usual when you execute the function name without its parentheses: - -dbAutoincrement - -# The default namespace is "my" - that's the namespace you use. I will use -# "ref" for all entries of the reference database. Thus you normally won't -# need to enter the namespace identifier - only I do that. - -refDB$protein$ID -dbAutoincrement(refDB$protein$ID) -dbAutoincrement(refDB$protein$ID, ns = "ref") - -# Once the namespaces are thus kept well apart, we won't overwrite each other's -# work. 
Merging two table becomes a breeze: - -tableUnion - -# Here is an example -this <- data.frame(ID = "ref_1", type = "one crow", stringsAsFactors = FALSE) -this <- rbind(this, data.frame(ID = dbAutoincrement(this$ID, ns="ref"), - type = "for", - stringsAsFactors = FALSE)) -this <- rbind(this, data.frame(ID = dbAutoincrement(this$ID, ns="ref"), - type = "sorrow", - stringsAsFactors = FALSE)) -that <- data.frame(ID = "my_1", type = "two crows", stringsAsFactors = FALSE) -that <- rbind(that, this[2:3, ]) # repeat some rows -that <- rbind(that, data.frame(ID = dbAutoincrement(that$ID), - type = "for ", - stringsAsFactors = FALSE)) -that <- rbind(that, data.frame(ID = dbAutoincrement(that$ID), - type = "joy ...", - stringsAsFactors = FALSE)) - -that -this - -rhyme <- tableUnion(this, that) -rhyme$type - -# Finally, we need to do this for all tables in the datamodel. Well, almost all: -# we introduce a field $version, through which we can ensure that the datamodels -# have the same schema. If they would have different schemas the merging would -# break. But we don't merge this single field. (Incidentally: a schema version -# is not "data", rather we call this "metadata": information _about_ the data.) -# So we exclude the "version" from the names of elements to merge. -# -# But how to iterate over all tables in a list by name? You might be tempted to -# try something like -# -# n <- names(db) -# myDB$n[1], myDB$n[2] etc. -# ... but NO! That's not how the $ operator works. -# -# The R community keeps particularly poignant comments from the R-help mailing -# list in a package called "fortunes", and fortune(312) reads: -# -# " The problem here is that the $ notation is a magical shortcut and like any -# other magic if used incorrectly is likely to do the programmatic equivalent -# of turning yourself into a toad. 
-# -# -- Greg Snow (in response to a user that wanted -# to access a column whose name is stored in y -# via x$y rather than x[[y]]) -# R-help (February 2012) - -# So in order to avoid turning into toads, we create a vector "tables", iterate -# over "table" elements from "tables" and use them as ref[[table]] ... etc. - -# === Putting data into the new schema ========================================= - -# Entering the YFO data into the new schema takes almost exactly -# the same syntax as the code you wrote for the last assignment. -# - -# === TASK: === -# Initialize myDB with the YFO homologue of yeast Mbp1 - -# First: initialize "myDB", this is the name of the R object that -# will contain data you collect. - -myDB <- dbInit() # This creates an empty database with the latest schema - -# Next: Add your protein and the YFO to the database. Copy the code-template -# below to your myCode.R file, edit it to replace the placeholder -# items with your data, and execute the code. Then continue below -# the code template. - - -# ===== begin code template: add a protein and an organism to the database ===== - -# == edit placeholder items! -myBinomial <- "" # Name, NOT biCode() -myTaxonomyId <- as.integer() -myProteinName <- "" # Name your protein "MBP1_" (where -# is a placeholder for the biCode() of YFO) -myProteinRefSeqID <- "" -myProteinUniProtID <- "" -mySeq <- " - -" - -# == create the protein entry -proteinRow <- data.frame(ID = dbAutoincrement(myDB$protein$ID), - name = myProteinName, - RefSeqID = myProteinRefSeqID, - UniProtID = myProteinUniProtID, - taxonomy.ID = myTaxonomyId, - sequence = dbSanitizeSequence(mySeq), - stringsAsFactors = FALSE) -myDB$protein <- rbind(myDB$protein, proteinRow) - -# == create the taxonomy entry -taxonomyRow <- data.frame(ID = myTaxonomyId, - species = myBinomial, - stringsAsFactors = FALSE) -myDB$taxonomy <- rbind(myDB$taxonomy, taxonomyRow) -# ===== end code template =========================================== - -# ... continue here. 
- -# myDB now contains one record each in two tables. The remaining tables exist -# but they are empty. Check that all the values are correct: just execute -myDB - -# Now let's merge myDB with the data from refDB. refDB should already have been -# loaded from .utilities.R ; you can also explore the original script with which -# refDB was created, for your reference: it is create_refDB.R The database -# refDB is the default argument for dbMerge(), so you don't need to -# specify it. By assigning the result of dbMerge() back to myDB we overwrite the -# previous version. - -myDB <- dbMerge(myDB) +# Now: create the database +source("./scripts/ABC-createRefDB.R") str(myDB) -# check the protein table -View(myDB$protein[ , c("ID", "name", "RefSeqID")]) + +# === 2.4.1 Examples of navigating the database + + +# You can look at the contents of the tables in the usual way we access +# elements from lists and dataframes. Here are some examples: +myDB$protein +myDB$protein$RefSeqID +myDB$protein[,"name"] +myDB$taxonomy +myDB$taxonomy$species +biCode(myDB$taxonomy$species) + +# Comparing two tables: +# Are all of the taxonomyIDs in the protein table present in the +# taxonomy table? We ought to check, because the way we imported the +# data from JSON objects, we could have omitted or forgotten some. But we can +# check this with one simple expression. Unravel it and study its components. + +all(myDB$protein$taxonomyID %in% myDB$taxonomy$ID) + +# If this is not TRUE, you MUST fix the problem before continuing. + +# Cross-referencing information: +# What is the species name of the protein whose name is "MBP1_COPCI"? + +sel <- myDB$protein$name == "MBP1_COPCI" +x <- myDB$protein$taxonomyID[sel] + +sel <- myDB$taxonomy$ID == x +myDB$taxonomy$species[sel] + + +# == 2.5 Updating the database ============================================= + + +# Basic tasks for databases include retrieving data, selecting data, updating +# and deleting data. 
Here we will take a simple, pedestrian approach:
+#
+# In case we need to modify any of the data, we modify it in the JSON file,
+# save that, and recreate the database. The myDB database will only be
+# used for analysis.
+#
+
+
+# = 3 Add your own data ===================================================
+
+
+# You have chosen an organism as "YFO", and your final task will be to find the
+# protein in YFO that is most similar to yeast Mbp1 and enter its information
+# into the database.
+
+
+# == 3.1 Find a protein ====================================================
+
+
+# The BLAST algorithm will be properly introduced in a later learning unit -
+# for now just use it in the following way:
+#
+# - Navigate to https://blast.ncbi.nlm.nih.gov/Blast.cgi and click on
+# Protein BLAST.
+# - Enter NP_010227 into the "Query Sequence" field.
+# - Choose "Reference proteins (refseq_protein)" as the "Database".
+# - Paste the YFO species name into the "Organism" field.
+#
+# - Click "BLAST".
+
+# You will probably get more than one result. If you get dozens of results or
+# more, or if you get no results, something went wrong. Reconsider whether the
+# problem was with your input, try something different, or ask for help.
+
+# Otherwise, look for the top-hit in the "Alignments" section. In some cases
+# there will be more than one hit with nearly similar E-values. If this is the
+# case for YFO, choose the one with the higher degree of similarity (more
+# identities) with the N-terminus of the query - i.e. the Query sequence of
+# the first ~ 100 amino acids.
+
+# - Follow the link to the protein data page, linked from "Sequence ID".
+# - From there, in a separate tab, open the link to the taxonomy database page
+# for YFO which is linked from the "ORGANISM" record. 
+
+
+# == 3.2 Put the information into JSON files ===============================
+
+
+# - Next make a copy of the file "./data/MBP1_SACCE.json" in your project
+# directory and give it a new name that corresponds to YFO - e.g. if
+# YFO is called "Cryptococcus neoformans", your file should be called
+# "MBP1_CRYNE.json"; in that case "MBP1_CRYNE" would also be the
+# "name" of your protein. Open the file in the RStudio editor and replace
+# all of the MBP1_SACCE data with the corresponding data of your protein.
+#
+# - Do a similar thing for the YFO taxonomy entry. Copy
+# "./data/refTaxonomy.json" and make a new file named "YFOtaxonomy.json".
+# Create a valid JSON file with only one single entry - that of YFO.
+#
+# - Validate your two files online at https://jsonlint.com/
+
+
+# == 3.3 Create an R script to create the database =========================
+
+
+# Next: to create the database.
+# - Make a new R script, call it "makeProteinDB.R"
+# - enter the following expression as the first command:
+# source("./scripts/ABC-createRefDB.R")
+# - then add the two commands that add your protein and taxonomy data,
+# they should look like:
+# myDB <- dbAddProtein( myDB, fromJSON("MBP1_.json"))
+# myDB <- dbAddTaxonomy( myDB, fromJSON("YFOtaxonomy.json"))
+#
+# - save the file and source() it:
+# source("makeProteinDB.R")
+
+# This command needs to be executed whenever you recreate
+# the database. In particular, whenever you have added or modified data
+# in any of the JSON files. Later you will add more information ...
+
+
+# === 3.3.1 Check and validate
+
 # Is your protein named according to the pattern "MBP1_"? It should be.
-# And does the taxonomy table contain the binomial name? It should be the same
+# And does the taxonomy table contain the systematic name? It should be the same
 # that you get when you type YFO into the console.
 
-# Let's compute sequence lengths on the fly (with the function nchar() ) and
-# add them to our view. 
Then we'll open this with the table viewer function
-# View()
+# Let's compute sequence lengths on the fly (with the function nchar() ), and
+# open this with the table viewer function View()
 
 View(cbind(myDB$protein[ , c("ID", "name", "RefSeqID")],
 length = nchar(myDB$protein$sequence)))
 
-# Where does your protein's length fall relative to the reference proteins?
-# About the same? Much shorter? Much longer?
+# Does your protein appear in the last row of this table? Where does your
+# protein's length fall relative to the reference proteins? About the same? Much
+# shorter? Much longer? If it is less than 500 amino acids long, I would suspect
+# an error. Contact me for advice.
 
-# Is that the right sequence?
-sel <- myDB$protein$ID == "my_pro_1"
-myDB$protein$sequence[sel]
+# Is that the right sequence? Is it the same as the one on the NCBI protein
+# database page?
+myDB$protein$sequence[nrow(myDB$protein)]
 
 # If not, don't continue! Fix the problem first.
 
 # Let me repeat: If this does not give you the right sequence of the YFO
@@ -522,162 +572,27 @@ myDB$taxonomy[sel, ]
 # If not, or if the result was "<0 rows> ... " then DO NOT CONTINUE.
 # Fix the problem first.
 
-# Does this give you the right protein ID for MBP1_?
-sel <- dbConfirmUnique(myDB$protein$name == paste("MBP1_", biCode(YFO), sep = ""))
-myDB$protein$ID[sel]
+# Does this give you the right RefSeq ID for MBP1_?
+sel <- myDB$protein$name == paste0("MBP1_", biCode(YFO))
+myDB$protein$RefSeqID[sel]
 
 # If not, or if the result was "<0 rows> ... " then DO NOT CONTINUE.
 # Fix the problem first.
 
-#
-#
-# === Saving and loading data ==================================================
-#
-# Once you have confirmed that myDB has been successfully created and updated
-# and is in a good state, you should make a backup copy. There are many ways to
-# save data to a file on disk and read it back in. One of the most convenient is
-# the function pair save() and load().
-#
-# save() saves an R object to a file. 
Its signature is -# save(, file = ) - -# The object, or objects that are saved like this are identically recreated when -# load() is executed. The object name is not changed - you don't assign the -# result of load(). You use load() for its "side-effect": re-creating the saved -# object, not for using the return-value. Therefore the signature for load() is -# simply - -# load() - -# All R objects in are created by load(), if they don't yet exist. -# If they already exist, the will be overwritten. The only effect you see is -# that the object appears in the Environment pane of RStudio (if it wasn't there -# already), or you may notice that its content has changed if it did exist. - -# == TASK: == -# Save myDB so you can recreate it easily when you next open RStudio. - -save(myDB, file = "myDB.01.RData") # Note that I give this a version number! - -# Let's confirm: -rm(myDB) -nrow(myDB$protein) # must give an error -load("myDB.01.RData") -nrow(myDB$protein) # must be 11. If not, don't continue etc. - -===New Database === - - Here is some sample code to work with the new database, enter new protein data for YFO, save it and load it again when needed. - - - - # You don't need to load the reference database refDB. If - # everything is set up correctly, it gets loaded at startup. - # (Just so you know: you can turn off that behaviour if you - # ever should want to...) - - - # First you need to load the newest version of dbUtilities.R - - updateDButilities("7bb32ab3d0861ad81bdcb9294f0f6a737b820bf9") - -# If you get an error: -# Error: could not find function "updateDButilities" -# ... then it seems you didn't do the previous update. - -# Try getting the update with the new key but the previous function: -# updateDbUtilities() -# -# If that function is not found either, confirm that your ~/.Rprofile -# actually loads dbUtilites.R from your project directory. 
- -# As a desperate last resort, you could uncomment -# the following piece of code and run the update -# without verification... -# -# URL <- "http://steipe.biochemistry.utoronto.ca/abc/images/f/f9/DbUtilities.R" -# download.file(URL, paste(PROJECTDIR, "dbUtilities.R", sep="")), method="auto") -# source(paste(PROJECTDIR, "dbUtilities.R", sep="")) -# -# But be cautious: there is no verification. You yourself need -# to satisfy yourself that this "file from the internet" is what -# it should be, before source()'ing it... - - -# After the file has been source()'d, refDB exists. -ls(refDB) - - -# check the contents of refDB: -refDB$protein$name -refDB$taxonomy - - -# list refSeqIDs for saccharomyces cerevisiae genes. -refDB$protein[refDB$protein$taxID == 559292, "refSeqID"] - - -# To add some genes from YFO, I proceed as follows. -# Obviously, you need to adapt this to your YFO -# and the sequences in YFO that you have found -# with your PSI-BLAST search. - -# Let's assume my YFO is the fly agaric (amanita muscaria) -# and its APSES domain proteins have the following IDs -# (these are not refSeq btw. and thus unlikely -# to be found in UniProt) ... -# KIL68212 -# KIL69256 -# KIL65817 -# - - -# First, I create a copy of the database with a name that -# I will recognize to be associated with my YFO. -amamuDB <- refDB - - -# Then I fetch my protein data ... -tmp1 <- fetchProteinData("KIL68212") -tmp2 <- fetchProteinData("KIL69256") -tmp3 <- fetchProteinData("KIL65817") - - -# ... and if I am satisfied that it contains what I -# want, I add it to the database. -amamuDB <- addToDB(amamuDB, tmp1) -amamuDB <- addToDB(amamuDB, tmp2) -amamuDB <- addToDB(amamuDB, tmp3) - - -# Then I make a local backup copy. Note the filename and -# version number :-) -save(amamuDB, file="amamuDB.01.RData") - - -# Now I can explore my new database ... -amamuDB$protein[amamuDB$protein$taxID == 946122, "refSeqID"] - - -# ... 
but if anything goes wrong, for example -# if I make a mistake in checking which -# rows contain taxID 946122 ... -amamuDB$protein$taxID = 946122 - -# Ooops ... what did I just do wrong? -# ... wnat happened instead? - -amamuDB$protein$taxID - - -# ... I can simply recover from my backup copy: -load("amamuDB.01.RData") -amamuDB$protein$taxID +# == 3.4 Task: submit for credit (part 2/2) ================================ +# - On your submission page, note the E-value of your protein and link +# to its NCBI protein database page. +# - Copy and paste the contents of your two JSON files on your submission +# page on the Student Wiki +# - Execute the two commands below and show the result on your submission page +biCode(myDB$taxonomy$species) %in% biCode(YFO) +myDB$protein$taxonomyID %in% myDB$taxonomy$ID[(myDB$taxonomy$species == YFO)] +# That is all. # [END]