
# BIN-Storing_data.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-Storing_data unit
#
# Version: 0.1
#
# Date: 2017 08 25
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# V 0.1 First code copied from BCH441_A03_makeYFOlist.R
#
# TODO:
#
#
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
#
# DO NOT SIMPLY source() THESE FILES!
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
# ==============================================================================
# PART TWO: Implementing (the protein part of) the datamodel
# ==============================================================================
# Every entity in our datamodel can be implemented by a dataframe. To keep things
# organized, we create a list that contains the entity tables. Here is a
# definition for a dataframe that holds the protein data for yeast Mbp1 as
# described in the data model schema:
#
# === Creating two tables ===
db <- list() # we'll call our database "db" and start with an empty list
# Then add a dataframe with one row to the list
db$protein <- data.frame(ID = as.integer(1),
                         name = "Mbp1",
                         RefSeqID = "NP_010227",
                         UniProtID = "P39678",
                         taxonomy.ID = as.integer(4932),
                         sequence = "...",  # just a placeholder
                         stringsAsFactors = FALSE)
str(db)
# Next, we create the taxonomy table and add it into db
db$taxonomy <- data.frame(ID = as.integer(4932),
                          species = "Saccharomyces cerevisiae",
                          stringsAsFactors = FALSE)
# Let's add a second protein: we create a one-row dataframe with the data, then
# we rbind() it to the existing data frame:
myRow <- data.frame(ID = as.integer(2),
                    name = "Res2",
                    RefSeqID = "NP_593032",
                    UniProtID = "P41412",
                    taxonomy.ID = as.integer(4896),
                    sequence = "...",  # again, just a placeholder
                    stringsAsFactors = FALSE)
db$protein <- rbind(db$protein, myRow)
myRow <- data.frame(ID = as.integer(4896),
                    species = "Schizosaccharomyces pombe",
                    stringsAsFactors = FALSE)
db$taxonomy <- rbind(db$taxonomy, myRow)
str(db)
# You can look at the contents of the tables in the same way you would access
# elements of lists and dataframes. Here are some examples:
db$protein
db$protein$RefSeqID
db$protein[,"name"]
db$taxonomy
db$taxonomy$species
biCode(db$taxonomy$species)
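# (In case biCode() is unfamiliar: it turns a binomial species name into a
# short five-letter label. The sketch below is only meant to illustrate the
# idea - assuming the label is the first three letters of the genus plus the
# first two letters of the species epithet, in uppercase; the actual course
# function handles edge cases that this toy version ignores.)
myBiCode <- function(s) {
  genus   <- toupper(substr(s, 1, 3))                        # e.g. "SAC"
  epithet <- toupper(substr(sub("^\\S+\\s+", "", s), 1, 2))  # e.g. "CE"
  return(paste0(genus, epithet))
}
myBiCode(db$taxonomy$species)   # compare with the biCode() output above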
# Here is an example to look up information in one table,
# based on a condition in another table:
# what is the species name for the protein
# whose name is "Mbp1"?
# First, get the taxonomy.ID for the Mbp1 protein. This is
# the key we need for the taxonomy table. We find it in a cell in the
# table: db$protein[<row>, <column>]
# <row> is that row for which the value in
# the "name" column is Mbp1:
db$protein$name == "Mbp1" # TRUE FALSE
# The <column> is called "taxonomy.ID". Simply
# insert these two expressions in the square
# brackets.
db$protein[db$protein$name == "Mbp1", "taxonomy.ID"]
# Assign the taxonomy.ID value ...
x <- db$protein[db$protein$name == "Mbp1", "taxonomy.ID"]
# ... and fetch the species_name value from db$taxonomy
db$taxonomy[db$taxonomy$ID == x, "species"]
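# Since you will do this kind of two-step lookup (protein -> taxonomy.ID ->
# species) again and again, it can be convenient to wrap it in a small helper.
# The function below is only an illustration I am adding here, not part of the
# course's database code:
speciesOfProtein <- function(DB, proteinName) {
  # (this assumes the protein name is unique - more on that below)
  taxID <- DB$protein[DB$protein$name == proteinName, "taxonomy.ID"]
  return(DB$taxonomy[DB$taxonomy$ID == taxID, "species"])
}
speciesOfProtein(db, "Mbp1")   # "Saccharomyces cerevisiae"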
# === Updating the database ====================================================
# Modifying a field
# Here is code that modifies the sequence field in the protein table of the
# database:
#
mySeq <- "
1 msnqiysary sgvdvyefih stgsimkrkk ddwvnathil kaanfakakr trilekevlk
61 ethekvqggf gkyqgtwvpl niakqlaekf svydqlkplf dftqtdgsas pppapkhhha
121 skvdrkkair sastsaimet krnnkkaeen qfqsskilgn ptaaprkrgr pvgstrgsrr
181 klgvnlqrsq sdmgfprpai pnssisttql psirstmgpq sptlgileee rhdsrqqqpq
241 qnnsaqfkei dledglssdv epsqqlqqvf nqntgfvpqq qssliqtqqt esmatsvsss
301 pslptspgdf adsnpfeerf pgggtspiis miprypvtsr pqtsdindkv nkylsklvdy
361 fisnemksnk slpqvllhpp phsapyidap idpelhtafh wacsmgnlpi aealyeagts
421 irstnsqgqt plmrsslfhn sytrrtfpri fqllhetvfd idsqsqtvih hivkrksttp
481 savyyldvvl skikdfspqy rielllntqd kngdtalhia skngdvvffn tlvkmgaltt
541 isnkegltan eimnqqyeqm miqngtnqhv nssntdlnih vntnnietkn dvnsmvimsp
601 vspsdyityp sqiatnisrn ipnvvnsmkq masiyndlhe qhdneikslq ktlksisktk
661 iqvslktlev lkesskdeng eaqtnddfei lsrlqeqntk klrkrliryk rlikqkleyr
721 qtvllnklie detqattnnt vekdnntler lelaqeltml qlqrknklss lvkkfednak
781 ihkyrriire gtemnieevd ssldvilqtl iannnknkga eqiitisnan sha
//
"
str(db$protein) # before
db$protein$sequence[db$protein$name == "Mbp1"] <- dbSanitizeSequence(mySeq)
str(db$protein) # after
# Analyze the expression! Note how we specify an element of a vector (column)
# in a data frame in a list using a logical expression. And note how we assign
# the output (return value) of a function. As far as database coding goes this
# is pretty minimal - there is no error checking done at all. In particular: can
# we really guarantee that the name "Mbp1" is unique in the protein table? No!
# We never required it to be unique. This is a check we need to perform so
# frequently that we will encapsulate it in a function:
dbConfirmUnique
# try this:
dbConfirmUnique(c("TRUE", "FALSE"))
dbConfirmUnique(c(TRUE, FALSE))
dbConfirmUnique(c(TRUE, TRUE))
dbConfirmUnique(c(FALSE, FALSE))
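# To get a feeling for what such a check might look like, here is a minimal
# sketch, written from the behaviour you just observed. It is NOT the function
# the course loads - just the idea: confirm that a logical vector selects
# exactly one element, and pass it through unchanged so that it can be used
# directly as an index.
myConfirmUnique <- function(x) {
  if ( ! is.logical(x)) { stop("PANIC: input is not a logical vector.") }
  if (sum(x) != 1)      { stop("PANIC: selection does not identify exactly one element.") }
  return(x)
}
myConfirmUnique(db$protein$name == "Mbp1")   # passes the selection through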
#
#
# Here is the update to the sequence field of Res2, but this time using our
# dbConfirmUnique() function:
mySeq <- "
1 maprssavhv avysgvevye cfikgvsvmr rrrdswlnat qilkvadfdk pqrtrvlerq
61 vqigahekvq ggygkyqgtw vpfqrgvdla tkykvdgims pilsldideg kaiapkkkqt
121 kqkkpsvrgr rgrkpsslss stlhsvnekq pnssisptie ssmnkvnlpg aeeqvsatpl
181 paspnallsp ndntikpvee lgmleapldk yeeslldffl hpeegripsf lyspppdfqv
241 nsvidddght slhwacsmgh iemiklllra nadigvcnrl sqtplmrsvi ftnnydcqtf
301 gqvlellqst iyavdtngqs ifhhivqsts tpskvaaaky yldcilekli siqpfenvvr
361 lvnlqdsngd tslliaarng amdcvnslls ynanpsipnr qrrtaseyll eadkkphsll
421 qsnsnashsa fsfsgispai ispscsshaf vkaipsissk fsqlaeeyes qlrekeedli
481 ranrlkqdtl neisrtyqel tflqknnpty sqsmenlire aqetyqqlsk rlliwlearq
541 ifdlerslkp htslsisfps dflkkedgls lnndfkkpac nnvtnsdeye qlinkltslq
601 asrkkdtlyi rklyeelgid dtvnsyrrli amscginped lsleildave ealtrek
"
db$protein$sequence[dbConfirmUnique(db$protein$name == "Res2")] <- dbSanitizeSequence(mySeq)
str(db$protein)
# These expressions can get rather long; it's easier to read if we write:
select <- dbConfirmUnique(db$protein$name == "Res2")
value <- dbSanitizeSequence(mySeq)
db$protein$sequence[select] <- value
# ... and that's the code paradigm that we will adopt to update
# database fields (for now).
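# In case you are wondering what "sanitizing" a sequence involves: here is a
# minimal sketch of the kind of clean-up dbSanitizeSequence() presumably
# performs on the copy-pasted text above. This is an illustration only - the
# actual implementation may handle more cases (FASTA headers, ambiguity
# codes, ...).
mySanitizeSequence <- function(s) {
  s <- paste(s, collapse = "")   # in case the input comes as several strings
  s <- gsub("[^a-zA-Z]", "", s)  # drop digits, blanks, line breaks, "//", etc.
  return(toupper(s))             # return one clean, uppercase string
}
nchar(mySanitizeSequence(mySeq)) == nchar(value)   # presumably TRUE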
# Adding a record
# Adding a record is easy in principle: we simply define the values we get
# from the NCBI or EBI databases ... except for the ID field. That is a field
# we need to define internally, and once again, we'll use a small function
# to get this right.
dbAutoincrement
# After reading the code, predict the results
dbAutoincrement(1) # Hm.
dbAutoincrement(1L)
dbAutoincrement(1:4)
dbAutoincrement(c(1, 2, 3, 4))
dbAutoincrement(c(1L, 2L, 3L, 4L))
dbAutoincrement(c(1, "4", 9))
dbAutoincrement(TRUE)
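# To check your predictions against the general idea, here is a stripped-down
# version written only for this illustration (the function the course provides
# does more careful checking): take a vector of integer IDs and return the
# next unused integer.
myAutoincrement <- function(x) {
  if ( ! is.integer(x)) { stop("PANIC: input is not a vector of integers.") }
  return(max(x) + 1L)
}
myAutoincrement(db$protein$ID)   # the next free ID for our protein table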
# Therefore, here is sample code to add one entry to the protein table.
#
mySeq <- "
1 msgdktifka tysgvpvyec iinnvavmrr rsddwlnatq ilkvvgldkp qrtrvlerei
61 qkgihekvqg gygkyqgtwi pldvaielae ryniqgllqp itsyvpsaad spppapkhti
121 stsnrskkii padpgalgrs rratsietes evigaapnnv segsmspsps dissssrtps
181 plpadrahpl hanhalagyn grdannhary adiildyfvt enttvpslli npppdfnpdm
241 sidddehtal hwacamgrir vvklllsaga difrvnsnqq talmratmfs nnydlrkfpe
301 lfellhrsil nidrndrtvf hhvvdlalsr gkphaaryym etminrlady gdqladilnf
361 qddegetplt maararskrl vrlllehgad pkirnkegkn aedyiieder frsspsrtgp
421 agielgadgl pvlptsslht seagqrtagr avtlmsnllh sladsydsei ntaekkltqa
481 hgllkqiqte iedsakvaea lhheaqgvde erkrvdslql alkhainkra rddlerrwse
541 gkqaikrarl qaglepgals tsnatnapat gdqkskddak sliealpagt nvktaiaelr
601 kqlsqvqank telvdkfvar areqgtgrtm aayrrliaag cggiapdevd avvgvlcell
661 qeshtgarag aggerddrar dvammlkgag aaalaanaga p
"
myRow <- data.frame(ID = dbAutoincrement(db$protein$ID),
                    name = "UMAG_1122",
                    RefSeqID = "XP_011392621",
                    UniProtID = "A0A0D1DP35",
                    taxonomy.ID = as.integer(5270),
                    sequence = dbSanitizeSequence(mySeq),
                    stringsAsFactors = FALSE)
db$protein <- rbind(db$protein, myRow)
myRow <- data.frame(ID = as.integer(5270),
                    species = "Ustilago maydis",
                    stringsAsFactors = FALSE)
db$taxonomy <- rbind(db$taxonomy, myRow)
# Here is a code template for the same steps ... it won't execute as is, of
# course, but you can copy it into your own code script file and modify it when
# you need to add your own protein.
# ===== begin code template =====================================
mySeq <- "
RAW SEQUENCE FROM NCBI RECORD
"
myRow <- data.frame(ID = dbAutoincrement(db$protein$ID),
                    name = "NAME",
                    RefSeqID = "REFSEQ",
                    UniProtID = "UNIPROT",
                    taxonomy.ID = as.integer(TAX_ID),
                    sequence = dbSanitizeSequence(mySeq),
                    stringsAsFactors = FALSE)
db$protein <- rbind(db$protein, myRow)
myRow <- data.frame(ID = as.integer(TAX_ID),
                    species = "BINOMIAL NAME",
                    stringsAsFactors = FALSE)
db$taxonomy <- rbind(db$taxonomy, myRow)
# ===== end code template ===========================================
# Deleting a record
# This is simple code without error checking, and of course we can make mistakes.
# Often we can just overwrite the offending field with correct data. But
# sometimes it will be easier (and more robust) to delete the erroneous entry
# and add the correct one. For example, if your code is in a script, and you
# realize the entry had an error, I would not "patch" the error in the script
# but delete the entry on the console command line, correct the script and
# execute the correct block. That way we fix the mistake at its source.
#
# Removing a row from a dataframe is trivial: just overwrite the dataframe with
# a selection statement in which the unique selection of the offending row is
# inverted:
# create an erroneous entry
myRow <- data.frame(ID = dbAutoincrement(db$protein$ID),
                    name = "nonesuch",
                    RefSeqID = "NP_000000",
                    UniProtID = "A123456",
                    taxonomy.ID = as.integer(999999),
                    sequence = dbSanitizeSequence("utter nonsense"),
                    stringsAsFactors = FALSE)
db$protein <- rbind(db$protein, myRow)
# Make a logical vector that identifies it
select <- dbConfirmUnique(db$protein$name == "nonesuch")
select # the selection
!select # its logical inversion
str(db) # before
db$protein <- db$protein[ ! select, ] # overwrite the table with a copy
# without the selected record
str(db) # after
# Note: if you delete records "by hand" you need to be careful that you do
# not remove keys that are used as foreign keys in another table - if there
# are such dependencies, you need to update the other table(s) too. "Real"
# database systems include such dependencies in the creation instructions
# of the table schema: "on delete cascade ..."
#
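# Our toy database has no such automatic cascades, but a quick consistency
# check after manual edits is cheap. This is not part of the course code, just
# a one-liner you could run: every taxonomy.ID used in the protein table should
# also exist as an ID in the taxonomy table.
sum( ! (db$protein$taxonomy.ID %in% db$taxonomy$ID))   # 0 means: no orphaned references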
# ==============================================================================
# ==============================================================================
# PART ONE: Database maintenance
# ==============================================================================
# === Merging databases ========================================================
# From time to time, we will want to update the systems database - either
# because the schema has changed, e.g. we may include additional tables, or
# because I have added reference data: sequences, annotations and the like. The
# goal here is to make this as simple as possible, keeping all information you
# may have entered intact, while avoiding duplicate entries. This is not quite
# as easy as one would hope. First of all, since we need to merge records from
# different databases - your own, and the reference - we can't assume that the
# primary keys are unique. You may have introduced the same proteinID that I
# used later for the reference database. Thus we need to update keys. But if we
# update primary keys, we need to make sure that every record that references a
# key as a foreign key is updated as well. And ensuring database integrity for
# "cascading updates" takes a _lot_ of care to guarantee correctness. You don't
# want to read through that code. I don't want to write it. And we should not
# end up pretending to build a real and proper database engine here!
#
# We'll do something simple instead. The sanest way is to create
# primary keys with "namespaces", i.e. the keys will reflect the source of the
# information. This way, even though the databases are merged, the information
# sources can be identified. Still, it can be accessed through a common
# mechanism. A database merge with separate namespaces can simply proceed as
# follows:
#
# for each table in the database
# merge the table from both source databases
# throw out any row with a duplicate ID
#
# In this way, if we merge your working database with the reference database
# all reference database records will be updated, and all your working records
# will remain.
# And while we are adding semantics to keys, we should also identify in which
# table they are being used. You may remember that I told you keys should not
# try to express semantics? Here we have a slightly different situation: since
# we are not actually constructing an underlying database engine, we will often
# encounter keys that we use "manually". And in that case it is very helpful to
# make the key's domain explicit.
# The code I wrote to implement this was loaded when you typed init().
# Let's have a brief look at the new dbAutoincrement() function. You'll see the
# code as usual when you execute the function name without its parentheses:
dbAutoincrement
# The default namespace is "my" - that's the namespace you use. I will use
# "ref" for all entries of the reference database. Thus you normally won't
# need to enter the namespace identifier - only I do that.
refDB$protein$ID
dbAutoincrement(refDB$protein$ID)
dbAutoincrement(refDB$protein$ID, ns = "ref")
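# (If you are curious how such namespaced keys could be constructed in
# principle, here is a toy version I am adding for illustration only. It
# assumes keys of the simple form "<namespace>_<number>", as in the little
# example below; the actual keys in refDB - which you just printed above - may
# carry additional information, and the real dbAutoincrement() does more
# careful checking.)
nextNamespacedID <- function(IDs, ns = "my") {   # hypothetical helper
  nums <- as.integer(sub(".*_", "", IDs))        # numeric part after the last "_"
  return(sprintf("%s_%d", ns, max(nums) + 1L))
}
nextNamespacedID(c("ref_1", "ref_2", "ref_3"))              # "my_4"
nextNamespacedID(c("ref_1", "ref_2", "ref_3"), ns = "ref")  # "ref_4"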
# Once the namespaces are thus kept well apart, we won't overwrite each other's
# work. Merging two tables becomes a breeze:
tableUnion
# Here is an example
this <- data.frame(ID = "ref_1", type = "one crow", stringsAsFactors = FALSE)
this <- rbind(this, data.frame(ID = dbAutoincrement(this$ID, ns="ref"),
type = "for",
stringsAsFactors = FALSE))
this <- rbind(this, data.frame(ID = dbAutoincrement(this$ID, ns="ref"),
type = "sorrow",
stringsAsFactors = FALSE))
that <- data.frame(ID = "my_1", type = "two crows", stringsAsFactors = FALSE)
that <- rbind(that, this[2:3, ]) # repeat some rows
that <- rbind(that, data.frame(ID = dbAutoincrement(that$ID),
type = "for ",
stringsAsFactors = FALSE))
that <- rbind(that, data.frame(ID = dbAutoincrement(that$ID),
type = "joy ...",
stringsAsFactors = FALSE))
that
this
rhyme <- tableUnion(this, that)
rhyme$type
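# To convince yourself that there is no deep magic in this, here is a minimal
# sketch of the idea (it is NOT the course's actual tableUnion(), which does
# additional validation): stack the rows of both tables, then drop every row
# whose ID has already been seen.
myTableUnion <- function(a, b) {
  merged <- rbind(a, b)                       # all rows from both tables ...
  return(merged[ ! duplicated(merged$ID), ])  # ... minus rows whose ID we already have
}
myTableUnion(this, that)$type   # compare with rhyme$type above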
# Finally, we need to do this for all tables in the datamodel. Well, almost all:
# we introduce a field $version, through which we can ensure that the datamodels
# have the same schema. If they had different schemas, the merge would break.
# But we don't merge this single field. (Incidentally: a schema version
# is not "data", rather we call this "metadata": information _about_ the data.)
# So we exclude the "version" from the names of elements to merge.
#
# But how to iterate over all tables in a list by name? You might be tempted to
# try something like
#
# n <- names(myDB)
# myDB$n[1], myDB$n[2] etc.
# ... but NO! That's not how the $ operator works.
#
# The R community keeps particularly poignant comments from the R-help mailing
# list in a package called "fortunes", and fortune(312) reads:
#
# " The problem here is that the $ notation is a magical shortcut and like any
# other magic if used incorrectly is likely to do the programmatic equivalent
# of turning yourself into a toad.
#
# -- Greg Snow (in response to a user that wanted
# to access a column whose name is stored in y
# via x$y rather than x[[y]])
# R-help (February 2012)
# So in order to avoid turning into toads, we create a vector "tables", iterate
# over "table" elements from "tables" and use them as ref[[table]] ... etc.
# === Putting data into the new schema =========================================
# Entering the YFO data into the new schema takes almost exactly
# the same syntax as the code you wrote for the last assignment.
#
# === TASK: ===
# Initialize myDB with the YFO homologue of yeast Mbp1
# First: initialize "myDB", this is the name of the R object that
# will contain data you collect.
myDB <- dbInit() # This creates an empty database with the latest schema
# Next: Add your protein and the YFO to the database. Copy the code-template
# below to your myCode.R file, edit it to replace the placeholder
# items with your data, and execute the code. Then continue below
# the code template.
# ===== begin code template: add a protein and an organism to the database =====
# == edit placeholder items!
myBinomial <- "<BINOMIAL NAME>" # Name, NOT biCode()
myTaxonomyId <- as.integer(<TAX_ID>)
myProteinName <- "<PROTEIN NAME>" # Name your protein "MBP1_<YFO>" (where <YFO>
# is a placeholder for the biCode() of YFO)
myProteinRefSeqID <- "<REFSEQID>"
myProteinUniProtID <- "<UNIPROTID>"
mySeq <- "
<SEQUENCE>
"
# == create the protein entry
proteinRow <- data.frame(ID = dbAutoincrement(myDB$protein$ID),
                         name = myProteinName,
                         RefSeqID = myProteinRefSeqID,
                         UniProtID = myProteinUniProtID,
                         taxonomy.ID = myTaxonomyId,
                         sequence = dbSanitizeSequence(mySeq),
                         stringsAsFactors = FALSE)
myDB$protein <- rbind(myDB$protein, proteinRow)
# == create the taxonomy entry
taxonomyRow <- data.frame(ID = myTaxonomyId,
                          species = myBinomial,
                          stringsAsFactors = FALSE)
myDB$taxonomy <- rbind(myDB$taxonomy, taxonomyRow)
# ===== end code template ===========================================
# ... continue here.
# myDB now contains one record each in two tables. The remaining tables exist
# but they are empty. Check that all the values are correct: just execute
myDB
# Now let's merge myDB with the data from refDB. refDB should already have been
# loaded from .utilities.R; you can also explore the original script with which
# refDB was created, for your reference: it is create_refDB.R. The database
# refDB is the default argument for dbMerge(), so you don't need to
# specify it. By assigning the result of dbMerge() back to myDB we overwrite the
# previous version.
myDB <- dbMerge(myDB)
str(myDB)
# check the protein table
View(myDB$protein[ , c("ID", "name", "RefSeqID")])
# Is your protein named according to the pattern "MBP1_<YFO>"? It should be.
# And does the taxonomy table contain the binomial name? It should be the same
# that you get when you type YFO into the console.
# Let's compute sequence lengths on the fly (with the function nchar() ) and
# add them to our view. Then we'll open this with the table viewer function
# View()
View(cbind(myDB$protein[ , c("ID", "name", "RefSeqID")],
           length = nchar(myDB$protein$sequence)))
# Where does your protein's length fall relative to the reference proteins?
# About the same? Much shorter? Much longer?
# Is that the right sequence?
sel <- myDB$protein$ID == "my_pro_1"
myDB$protein$sequence[sel]
# If not, don't continue! Fix the problem first.
# Let me repeat: If this does not give you the right sequence of the YFO
# Mbp1 homologue, DO NOT CONTINUE. Fix the problem.
# Is that the right taxonomy ID and binomial name for YFO?
sel <- myDB$taxonomy$species == YFO
myDB$taxonomy[sel, ]
# If not, or if the result was "<0 rows> ... " then DO NOT CONTINUE.
# Fix the problem first.
# Does this give you the right protein ID for MBP1_<YFO>?
sel <- dbConfirmUnique(myDB$protein$name == paste("MBP1_", biCode(YFO), sep = ""))
myDB$protein$ID[sel]
# If not, or if the result was "<0 rows> ... " then DO NOT CONTINUE.
# Fix the problem first.
#
#
# === Saving and loading data ==================================================
#
# Once you have confirmed that myDB has been successfully created and updated
# and is in a good state, you should make a backup copy. There are many ways to
# save data to a file on disk and read it back in. One of the most convenient is
# the function pair save() and load().
#
# save() saves an R object to a file. Its signature is
# save(<object-name/s>, file = <file-name>)
# The object, or objects, that are saved like this are identically recreated when
# load() is executed. The object name is not changed - you don't assign the
# result of load(). You use load() for its "side-effect": re-creating the saved
# object, not for using the return-value. Therefore the signature for load() is
# simply
# load(<file-name>)
# All R objects in <file-name> are created by load(), if they don't yet exist.
# If they already exist, they will be overwritten. The only effect you see is
# that the object appears in the Environment pane of RStudio (if it wasn't there
# already), or you may notice that its content has changed if it did exist.
# == TASK: ==
# Save myDB so you can recreate it easily when you next open RStudio.
save(myDB, file = "myDB.01.RData") # Note that I give this a version number!
# Let's confirm:
rm(myDB)
nrow(myDB$protein) # must give an error
load("myDB.01.RData")
nrow(myDB$protein) # must be 11. If not, don't continue etc.
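# Incidentally, this is also a good place to see why you do not assign the
# result of load(): its return value is merely a character vector with the
# names of the objects it restored, not the objects themselves.
x <- load("myDB.01.RData")   # this works, but ...
x                            # ... x only contains the name: "myDB"
rm(x)                        # ... so we don't need to keep it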
# === New Database =============================================================
# Here is some sample code to work with the new database: enter new protein
# data for YFO, save it, and load it again when needed.
# You don't need to load the reference database refDB. If
# everything is set up correctly, it gets loaded at startup.
# (Just so you know: you can turn off that behaviour if you
# ever should want to...)
# First you need to load the newest version of dbUtilities.R
updateDButilities("7bb32ab3d0861ad81bdcb9294f0f6a737b820bf9")
# If you get an error:
# Error: could not find function "updateDButilities"
# ... then it seems you didn't do the previous update.
# Try getting the update with the new key but the previous function:
# updateDbUtilities()
#
# If that function is not found either, confirm that your ~/.Rprofile
# actually loads dbUtilities.R from your project directory.
# As a desperate last resort, you could uncomment
# the following piece of code and run the update
# without verification...
#
# URL <- "http://steipe.biochemistry.utoronto.ca/abc/images/f/f9/DbUtilities.R"
# download.file(URL, paste(PROJECTDIR, "dbUtilities.R", sep=""), method="auto")
# source(paste(PROJECTDIR, "dbUtilities.R", sep=""))
#
# But be cautious: there is no verification. You yourself need
# to satisfy yourself that this "file from the internet" is what
# it should be, before source()'ing it...
# After the file has been source()'d, refDB exists.
ls(refDB)
# check the contents of refDB:
refDB$protein$name
refDB$taxonomy
# List refSeqIDs for Saccharomyces cerevisiae genes.
refDB$protein[refDB$protein$taxID == 559292, "refSeqID"]
# To add some genes from YFO, I proceed as follows.
# Obviously, you need to adapt this to your YFO
# and the sequences in YFO that you have found
# with your PSI-BLAST search.
# Let's assume my YFO is the fly agaric (Amanita muscaria)
# and its APSES domain proteins have the following IDs
# (these are not RefSeq IDs, by the way, and thus unlikely
# to be found in UniProt) ...
# KIL68212
# KIL69256
# KIL65817
#
# First, I create a copy of the database with a name that
# I will recognize to be associated with my YFO.
amamuDB <- refDB
# Then I fetch my protein data ...
tmp1 <- fetchProteinData("KIL68212")
tmp2 <- fetchProteinData("KIL69256")
tmp3 <- fetchProteinData("KIL65817")
# ... and if I am satisfied that it contains what I
# want, I add it to the database.
amamuDB <- addToDB(amamuDB, tmp1)
amamuDB <- addToDB(amamuDB, tmp2)
amamuDB <- addToDB(amamuDB, tmp3)
# Then I make a local backup copy. Note the filename and
# version number :-)
save(amamuDB, file="amamuDB.01.RData")
# Now I can explore my new database ...
amamuDB$protein[amamuDB$protein$taxID == 946122, "refSeqID"]
# ... but if anything goes wrong, for example
# if I make a mistake in checking which
# rows contain taxID 946122 ...
amamuDB$protein$taxID = 946122
# Oops ... what did I just do wrong?
# ... what happened instead?
amamuDB$protein$taxID
# ... I can simply recover from my backup copy:
load("amamuDB.01.RData")
amamuDB$protein$taxID
# [END]