Add code and utility functions for database export of protein annotations to JSON - towards sharing annotations on the Student Wiki Public page

2020-10-13 16:08:35 +10:00 · 2020-10-13 16:08:35 +10:00 · 4d071cf8d5
commit 4d071cf8d5
parent 3bee83495f
2 changed files with 344 additions and 82 deletions
--- a/BIN-FUNC-Domain_annotation.R
+++ b/BIN-FUNC-Domain_annotation.R
@ -3,12 +3,14 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-FUNC-Domain_annotation unit.
 #
-# Version:  1.2
+# Version:  1.3
 #
 # Date:     2017-11  -  2020-10
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
 #           1.3    Add code for database export to JSON and instructions
 #                  for uploading annotations to the Public Student Wiki page
 #           1.2    Consistently: data in ./myScripts/ ;
 #                    begin SHARING DATA section
 #           1.1    2020 Updates
@ -28,16 +30,18 @@
 #TOC> ==========================================================================
-#TOC>
+#TOC> 
 #TOC>   Section  Title                                                 Line
 #TOC> ---------------------------------------------------------------------
-#TOC>   1        Update your database script                             42
+#TOC>   1        Update your database script                             48
-#TOC>   1.1        Preparing an annotation file ...                      49
+#TOC>   1.1        Preparing an annotation file ...                      55
-#TOC>   1.1.1          BEFORE  "BIN-ALI-Optimal_sequence_alignment"      52
+#TOC>   1.1.1          BEFORE  "BIN-ALI-Optimal_sequence_alignment"      58
-#TOC>   1.1.2          AFTER "BIN-ALI-Optimal_sequence_alignment"        97
+#TOC>   1.1.2          AFTER "BIN-ALI-Optimal_sequence_alignment"       106
-#TOC>   1.2        Execute and Validate                                 124
+#TOC>   1.2        Execute and Validate                                 133
-#TOC>   2        Plot Annotations                                       149
+#TOC>   2        Plot Annotations                                       158
-#TOC>
+#TOC>   3        SHARING DATA                                           283
 #TOC>   3.1        Post MBP1_MYSPE as JSON data                         298
 #TOC> 
 #TOC> ==========================================================================
@ -99,7 +103,7 @@
 # Then SKIP the next section.
 #
 #
-# ===   1.1.2  AFTER "BIN-ALI-Optimal_sequence_alignment"
+# ===   1.1.2  AFTER "BIN-ALI-Optimal_sequence_alignment"  
 #
 #   IF YOU HAVE ALREADY COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT:
 #
@ -276,16 +280,47 @@ par(oPar)  # reset the plot parameters
 #    It would be better to align the motif borders, at least approximately (not
 #    all proteins have all motifs). How would you go about doing that?
-# = 1 SHARING DATA ======
+# =    3  SHARING DATA  ========================================================
 # It's particularly interesting to compare such annotations across many
-# homologous proteins. I have created a file on the Student Wiki that you can
+# homologous proteins. I have created a page on the Student Wiki () that you can
 # edit, and then download the data from the entire class directly to your
 # RStudio project.
 #
 # I have provided a function that extracts all information that refers to a
 # single protein from the database, and prints it out as well-formatted JSON,
 # suitable to be pasted into our shareable Wiki-page. There is a fair amount of
 # bookkeeping involved, but the code is not otherwise very enlightening so I
 # will spare you the details - it's in "./scripts/ABC-dbUtilities.R" if you
 # would want to have a look.
 # ==   3.1  Post MBP1_MYSPE as JSON data  ======================================
 # Task:
 # =====
-#   TBC ...
+# 1: Run the following code:
 cat("{{Vspace}}",
    "<!-- ==== BEGIN  PROTEIN ==== -->",
    "<pre>",
    dbProt2JSON(sprintf("MBP1_%s", biCode(MYSPE))),
    "</pre>",
    "<!-- ===== END PROTEIN ====== -->",
    "", sep = "\n"
 )
 # 2: Copy the entire output,
 # 3: Navigate to
 #      http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public
 #    ... edit the page, and paste your output at the top.
 # 4: Save your edits.
 # Next, once we have collected a number of protein annotations, we can access
 # the page and import the data into our database.
 #
 # Code to come soon ...
 # [END]
--- a/scripts/ABC-dbUtilities.R
+++ b/scripts/ABC-dbUtilities.R
@ -1,56 +1,87 @@
 # tocID <- "scripts/ABC-dbUtilities.R"
 #
-# database utilities for ABC learning units
+# Purpose: Database utilities for ABC learning units.
 #
 # Version 2.1
 #
 # Date:     2017-11  -  2020-10
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
 #           2.1    Add JSON export functions
 #           2.0    Test all JSON import and prevent addition of duplicates. This
 #                  is necessary for import of data from the public page
 #           1.1    2020 Updates
 #           1.0    Live version 2017
 #
 # Notes:
 #   There are no functions to modify or delete entries. To do either,
 #   recreate the database with correct data in the creation script. This is the
 #   preferred way that ensures the entire database can be reproduced by
 #   source()'ing its generating script.
 #
 #   Inserting data goes only through the very most minimal validation steps. For
 #   production applications, more validation would need to be added, as well
 #   as an overall validation of database integrity
 #
 # ToDo:
 #
 # ==============================================================================
 #TOC> ==========================================================================
 #TOC> 
-#TOC>   Section  Title                             Line
+#TOC>   Section  Title                                   Line
-#TOC> -------------------------------------------------
+#TOC> -------------------------------------------------------
-#TOC>   1        PACKAGES                            32
+#TOC>   1        INITIALISATIONS AND PARAMETERS            60
-#TOC>   2        FUNCTIONS                           50
+#TOC>   2        PACKAGES                                  65
-#TOC>   2.01       dbSanitizeSequence()              53
+#TOC>   3        FUNCTIONS                                 81
-#TOC>   2.02       dbConfirmUnique()                 88
+#TOC>   3.01       dbSanitizeSequence()                    84
-#TOC>   2.03       dbInit()                         106
+#TOC>   3.02       dbConfirmUnique()                      119
-#TOC>   2.04       dbAutoincrement()                147
+#TOC>   3.03       dbInit()                               137
-#TOC>   2.05       dbAddProtein()                   160
+#TOC>   3.04       dbAutoincrement()                      177
-#TOC>   2.06       dbAddFeature()                   180
+#TOC>   3.05       dbAddProtein()                         190
-#TOC>   2.07       dbAddTaxonomy()                  199
+#TOC>   3.06       dbAddFeature()                         222
-#TOC>   2.08       dbAddAnnotation()                215
+#TOC>   3.07       dbAddTaxonomy()                        253
-#TOC>   2.09       dbFetchUniProtSeq()              243
+#TOC>   3.08       dbAddAnnotation()                      288
-#TOC>   2.10       dbFetchPrositeFeatures()         289
+#TOC>   3.09       dbFetchUniProtSeq()                    335
-#TOC>   2.11       node2text()                      339
+#TOC>   3.10       dbFetchPrositeFeatures()               381
-#TOC>   2.12       dbFetchNCBItaxData()             351
+#TOC>   3.11       node2text()                            431
-#TOC>   2.13       UniProtIDmap()                   390
+#TOC>   3.12       dbFetchNCBItaxData()                   443
-#TOC>   3        TESTS                              429
+#TOC>   3.13       UniProtIDmap()                         482
 #TOC>   3.14       dbProt2JSON()                          521
 #TOC>   3.15       dbSeq2JSON()                           606
 #TOC>   3.16       dbRow2JSON()                           636
 #TOC>   4        TESTS                                    656
 #TOC> 
 #TOC> ==========================================================================
-# =    1  PACKAGES  ============================================================
+# =    1  INITIALISATIONS AND PARAMETERS  ======================================
 doTESTS <- FALSE  # run tests if TRUE
 # =    2  PACKAGES  ============================================================
 if (! requireNamespace("jsonlite", quietly = TRUE)) {
  install.packages("jsonlite")
 }
 if (! requireNamespace("httr", quietly = TRUE)) {
  install.packages("httr")
 }
 if (! requireNamespace("xml2", quietly = TRUE)) {
  install.packages("xml2")
 }
-# =    2  FUNCTIONS  ===========================================================
+# =    3  FUNCTIONS  ===========================================================
-# ==   2.01  dbSanitizeSequence()  =============================================
+# ==   3.01  dbSanitizeSequence()  =============================================
 dbSanitizeSequence <- function(s, unambiguous = TRUE) {
  # Remove FASTA header lines, if any,
  # flatten any structure that s has,
@ -85,7 +116,7 @@ dbSanitizeSequence <- function(s, unambiguous = TRUE) {
 }
-# ==   2.02  dbConfirmUnique()  ================================================
+# ==   3.02  dbConfirmUnique()  ================================================
 dbConfirmUnique <- function(x) {
  # x is a vector of logicals.
  # returns x if x has exactly one TRUE element.
@ -103,10 +134,10 @@ dbConfirmUnique <- function(x) {
 }
-# ==   2.03  dbInit()  =========================================================
+# ==   3.03  dbInit()  =========================================================
 dbInit <- function() {
  # Return an empty instance of the protein database
-  # Open the link and study the schema:
+  # The schema is here:
  # https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0
  db <- list()
@ -125,7 +156,6 @@ dbInit <- function() {
    ID = numeric(),
    species = character())
  db$annotation <- data.frame(
    ID = numeric(),
    proteinID = numeric(),
@ -144,7 +174,7 @@ dbInit <- function() {
 }
-# ==   2.04  dbAutoincrement()  ================================================
+# ==   3.04  dbAutoincrement()  ================================================
 dbAutoincrement <- function(tb) {
  # Return a unique integer that can be used as a primary key
  # Value:
@ -157,90 +187,152 @@ dbAutoincrement <- function(tb) {
 }
-# ==   2.05  dbAddProtein()  ===================================================
+# ==   3.05  dbAddProtein()  ===================================================
 dbAddProtein <- function(db, jsonDF) {
-  # Add one or more protein entries to the database db.
+  # Add one or more protein entries to the database db if a protein with the
  # same name does not yet exist. This enforces that protein names are unique.
  # Parameters:
  #     db   list   a database created with dbInit()
  #     jsonDF  data frame  protein data imported into a data frame with
  #                           fromJSON()
  for (i in seq_len(nrow(jsonDF))) {
-    x <- data.frame(ID          = dbAutoincrement(db$protein),
+    if (jsonDF$name[i] %in% db$protein$name) {
-                    name        = jsonDF$name[i],
+      cat(sprintf("Note: Protein No. %d in the input is \"%s\", but %s.\n",
-                    RefSeqID    = jsonDF$RefSeqID[i],
+                  i, jsonDF$name[i],
-                    UniProtID   = jsonDF$UniProtID[i],
+                  "a protein with this name already exists in the database. ",
-                    taxonomyID  = jsonDF$taxonomyID[i],
+                  "Skipping this input."))
-                    sequence    = dbSanitizeSequence(jsonDF$sequence[i]))
+      isValid <- FALSE
-    db$protein <- rbind(db$protein, x)
+    }
    if (isValid) {
      x <- data.frame(ID          = dbAutoincrement(db$protein),
                      name        = jsonDF$name[i],
                      RefSeqID    = jsonDF$RefSeqID[i],
                      UniProtID   = jsonDF$UniProtID[i],
                      taxonomyID  = jsonDF$taxonomyID[i],
                      sequence    = dbSanitizeSequence(jsonDF$sequence[i]))
      db$protein <- rbind(db$protein, x)
    }
  }
  return(db)
 }
-# ==   2.06  dbAddFeature()  ===================================================
+# ==   3.06  dbAddFeature()  ===================================================
 dbAddFeature <- function(db, jsonDF) {
-  # Add one or more feature entries to the database db.
+  # Add one or more feature entries to the database db. Skip if a feature with
  # the same name already exists.
  # Parameters:
  #     db   list   a database created with dbInit()
  #     jsonDF  data frame  feature data imported into a data frame with
  #                           fromJSON()
  for (i in seq_len(nrow(jsonDF))) {
-    x <- data.frame(ID          = dbAutoincrement(db$feature),
+    isValid <- TRUE
-                    name        = jsonDF$name[i],
+    if (jsonDF$name[i] %in% db$feature$name) {
-                    description = jsonDF$description[i],
+      cat(sprintf("Note: Feature No. %d in the input is \"%s\", but %s.\n",
-                    sourceDB    = jsonDF$sourceDB[i],
+                  i, jsonDF$name[i],
-                    accession   = jsonDF$accession[i])
+                  "a feature with this name already exists in the database. ",
-    db$feature <- rbind(db$feature, x)
+                  "Skipping this input."))
      isValid <- FALSE
    }
    if (isVALID) {
      x <- data.frame(ID          = dbAutoincrement(db$feature),
                      name        = jsonDF$name[i],
                      description = jsonDF$description[i],
                      sourceDB    = jsonDF$sourceDB[i],
                      accession   = jsonDF$accession[i])
      db$feature <- rbind(db$feature, x)
    }
  }
  return(db)
 }
-# ==   2.07  dbAddTaxonomy()  ==================================================
+# ==   3.07  dbAddTaxonomy()  ==================================================
 dbAddTaxonomy <- function(db, jsonDF) {
-  # Add one or more taxonomy entries to the database db.
+  # Add one or more taxonomy entries to the database db. Skip if species name
  # or taxonomy ID already exist in the database.
  # Parameters:
  #     db      list         A database created with dbInit()
  #     jsonDF  data frame   Taxonomy data imported into a data frame with
  #                            fromJSON()
  for (i in seq_len(nrow(jsonDF))) {
-    x <- data.frame(
+    isValid <- TRUE
-      ID =  jsonDF$ID[i],
+
-      species = jsonDF$species[i])
+    if (jsonDF$species[i] %in% db$taxonomy$species) {
-    db$taxonomy <- rbind(db$taxonomy, x)
+      cat(sprintf("Note: Species No. %d in the input is \"%s\", but %s%s\n",
                  i, jsonDF$name[i],
                  "a species with this name already exists in the database. ",
                  "Skipping this input."))
      isValid <- FALSE
    } else if (jsonDF$ID[i] %in% db$taxonomy$ID) {
      cat(sprintf("Note: Taxonomy ID No. %d in the input is \"%d\", but %s%s\n",
                  i, jsonDF$ID[i],
                  "this taxonomy ID already exists in the database. ",
                  "Skipping this input."))
      isValid <- FALSE
    }
    if (isValid) {
      x <- data.frame(
        ID =  as.integer(jsonDF$ID[i]),
        species = jsonDF$species[i])
      db$taxonomy <- rbind(db$taxonomy, x)
    }
  }
  return(db)
 }
-# ==   2.08  dbAddAnnotation()  ================================================
+
 # ==   3.08  dbAddAnnotation()  ================================================
 dbAddAnnotation <- function(db, jsonDF) {
-  # Add one or more annotation entries to the database db.
+  # Add one or more annotation entries to the database db. Skip the entry if
  # it already exists in the database.
  # Parameters:
  #     db   list   a database created with dbInit()
  #     jsonDF  data frame  annotation data imported into a data frame with
  #                           fromJSON()
  for (i in seq_len(nrow(jsonDF))) {
    isValid <- TRUE
    sel <- jsonDF$pName[i] == db$protein$name
-    sel <- dbConfirmUnique(sel)
+    sel <- dbConfirmUnique(sel)    # Confirm that this protein ID exists
    pID <- db$protein$ID[sel]
    sel <- jsonDF$fName[i] == db$feature$name
-    sel <- dbConfirmUnique(sel)
+    sel <- dbConfirmUnique(sel)    # Confirm that this feature ID exists
    fID <- db$feature$ID[sel]
-    x <- data.frame(ID        = dbAutoincrement(db$annotation),
+    sel <- db$annotation$proteinID == pID &
-                    proteinID = pID,
+           db$annotation$featureID == fID &
-                    featureID = fID,
+           db$annotation$start == as.integer(jsonDF$start[idx]) &
-                    start     = as.integer(jsonDF$start[i]),
+           db$annotation$end   == as.integer(jsonDF$end[idx])
-                    end       = as.integer(jsonDF$end[i]))
+
-    db$annotation <- rbind(db$annotation, x)
+    if (any(sel)) {
      cat(sprintf("Note: annotation No. %d in the input has %s%s%\n",
                  i,
                  "the same protein name, feature name, start, and end ",
                  "as one that already exists in the database. ",
                  "Skipping this input."))
      isValid <- FALSE
    }
    if (isValid) {
      x <- data.frame(ID        = dbAutoincrement(db$annotation),
                      proteinID = pID,
                      featureID = fID,
                      start     = as.integer(jsonDF$start[i]),
                      end       = as.integer(jsonDF$end[i]))
      db$annotation <- rbind(db$annotation, x)
    }
  }
  return(db)
 }
-# ==   2.09  dbFetchUniProtSeq()  ==============================================
+# ==   3.09  dbFetchUniProtSeq()  ==============================================
 dbFetchUniProtSeq <- function(IDs) {
  # Fetch a protein sequence from UniProt.
  # Parameters:
@ -286,7 +378,7 @@ if (FALSE) {
-# ==   2.10  dbFetchPrositeFeatures()  =========================================
+# ==   3.10  dbFetchPrositeFeatures()  =========================================
 dbFetchPrositeFeatures <- function(ID) {
  # Fetch feature annotations from ScanProsite.
  # Parameters:
@ -336,7 +428,7 @@ if (FALSE) {
 }
-# ==   2.11  node2text()  ======================================================
+# ==   3.11  node2text()  ======================================================
 node2text <- function(doc, tag) {
  # an extractor function for the contents of elements
  # between given tags in an XML response.
@ -348,7 +440,7 @@ node2text <- function(doc, tag) {
 }
-# ==   2.12  dbFetchNCBItaxData()  =============================================
+# ==   3.12  dbFetchNCBItaxData()  =============================================
 dbFetchNCBItaxData <- function(ID) {
  # Fetch feature taxID and Organism from the NCBI.
  # Parameters:
@ -387,7 +479,7 @@ dbFetchNCBItaxData <- function(ID) {
-# ==   2.13  UniProtIDmap()  ===================================================
+# ==   3.13  UniProtIDmap()  ===================================================
 UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
  # Use UniProt ID mapping service to map one or more IDs
  # Parameters:
@ -426,9 +518,144 @@ UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
 }
-# =    3  TESTS  ===============================================================
+# ==   3.14  dbProt2JSON()  ====================================================
 dbProt2JSON <- function(thisProt) {
  # Extract all protein related data from myDB and return in JSON format.
-if (FALSE) {
+  thisData <- list()
  # add a protein table
  sel <- which(myDB$protein$name == thisProt)
  thisData$protein <- myDB$protein[sel, ]
  # add a taxonomy table
  sel <- which(myDB$taxonomy$ID == thisData$protein$taxonomyID)
  thisData$taxonomy <- myDB$taxonomy[sel, ]
  # add the entries for this protein from the  annotation table
  sel <- which(myDB$annotation$proteinID == thisData$protein$ID)
  thisData$annotation <- myDB$annotation[sel, ]
  # our .json convention uses pName and fName as keys, not the db-internal IDs
  # add empty columns for pName and fName
  l <- nrow(thisData$annotation)
  thisData$annotation$pName <- character(l)
  thisData$annotation$fName <- character(l)
  # get the appropriate protein and feature names
  for (i in seq_len(l)) {
    pID <- thisData$annotation$proteinID[i]
    sel <- which(myDB$protein$ID == pID)
    thisData$annotation$pName[i] <- myDB$protein$name[sel]    # store pName
    fID <- thisData$annotation$featureID[i]
    sel <- which(myDB$feature$ID == fID)
    thisData$annotation$fName[i] <- myDB$feature$name[sel]    # store fName
  }
  # add the corresponding feature table
  sel <- which(myDB$feature$ID %in% thisData$annotation$featureID)
  thisData$feature <- myDB$feature[sel, ]
  # remove columns that are not going into JSON output
  thisData$protein$ID           <- NULL
  thisData$annotation$ID        <- NULL
  thisData$annotation$proteinID <- NULL
  thisData$annotation$featureID <- NULL
  thisData$feature$ID           <- NULL
  # create JSON-formatted output
  # ( jsonlite::prettify() is too  wordy for a compact Wikipage )
  out <- character()
  out <- c(out, '{')
  out <- c(out, '  "protein": {')
  sel <- colnames(thisData$protein) != "sequence"
  out <- c(out, sprintf("    %s,", dbRow2JSON(thisData$protein[1, sel],
                                              coll = ",\n    ")))
  out <- c(out, dbSeq2JSON(thisData$protein$sequence[1]))
  out <- c(out, '  },')
  out <- c(out, '  "taxonomy": {')
  out <- c(out, sprintf("    %s", dbRow2JSON(thisData$taxonomy)))
  out <- c(out, '  },')
  out <- c(out, '  "annotation": [')
  for (i in seq_len(nrow(thisData$annotation))) {
    out <- c(out, sprintf("    {%s},", dbRow2JSON(thisData$annotation[i, ])))
  }
  out[length(out)] <- gsub(",$", "", out[length(out)]) # remove last ","
  out <- c(out, '  ],')
  out <- c(out, '  "feature": [')
  sel <- colnames(thisData$feature) != "description"
  for (i in seq_len(nrow(thisData$feature))) {
    out <- c(out, sprintf("    {%s,",
                          dbRow2JSON(thisData$feature[i, sel])))
    out <- c(out, sprintf("     %s},",
                          dbRow2JSON(thisData$feature[i, "description",
                                                      drop = FALSE])))
  }
  out[length(out)] <- gsub(",$", "", out[length(out)]) # remove last ","
  out <- c(out, '  ]')
  out <- c(out, '}')
  return(paste0(out, collapse = "\n"))
 }
 # ==   3.15  dbSeq2JSON()  =====================================================
 dbSeq2JSON <- function(s, nIndents = 4, width = 70) {
  # Turn a sequence into a JSON key-value pair, with the value being a JSON
  # array of elements not exceeding a width of "width", and an indent of
  # "indents" spaces.
  ind <- paste0(rep(" ", nIndents), collapse = "")
  out <- character()
  out <- c(out, sprintf("%s\"sequence\" : [", ind))
  for (i in seq_along(s)) {
    l <- nchar(s[i])
    if (l <= width) {
      out <- c(out, s[i])
    } else {
      starts <- seq(1, l, by = width)
      ends <- seq(width, l, by = width)
      if (length(ends) < length(starts)) { ends <- c(ends, l) }
      out <- c(out, sprintf("%s  \"%s\",", ind, substring(s[i], starts, ends)))
    }
  }
  out[length(out)] <- gsub(",$", "", out[length(out)]) # remove last ","
  out <- c(out, sprintf("%s]", ind))
  return(paste0(out, collapse = "\n"))
 }
 cat(dbSeq2JSON(myDB$protein$sequence[1]))
 # ==   3.16  dbRow2JSON()  =====================================================
 dbRow2JSON <- function(df, coll = ", ") {
  # Turn a single dataframe row into JSON key value pairs, where the keys are the
  # column names. Respects character / numeric mode.
  out <- character()
  for (i in 1:ncol(df)) {
    if (class(df[1, i]) == "integer") {
      val <- sprintf("%d", df[1, i])
    } else if (class(df[1, i]) == "numeric") {
      val <- sprintf("%f", df[1, i])
    } else {
      val <- sprintf("\"%s\"", as.character(df[1, i]))
    }
    out <- c(out, sprintf("\"%s\": %s", colnames(df)[i], val))
  }
  return(paste0(out, collapse = coll))
 }
 # =    4  TESTS  ===============================================================
 if (doTESTS) {
  if (! requireNamespace("testthat", quietly = TRUE)) {
    install.packages("testthat")
  }