Add code and utility functions for database export of protein annotations to JSON - towards sharing annotations on the Student Wiki Public page

2020-10-13 16:08:35 +10:00
parent 3bee83495f
commit 4d071cf8d5
2 changed files with 344 additions and 82 deletions
--- a/BIN-FUNC-Domain_annotation.R
+++ b/BIN-FUNC-Domain_annotation.R
@@ -3,12 +3,14 @@
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-FUNC-Domain_annotation unit.
 #
-# Version:  1.2
+# Version:  1.3
 #
 # Date:     2017-11  -  2020-10
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
+#           1.3    Add code for database export to JSON and instructions
+#                  for uploading annotations to the Public Student Wiki page
 #           1.2    Consistently: data in ./myScripts/ ;
 #                    begin SHARING DATA section
 #           1.1    2020 Updates
@@ -31,12 +33,14 @@
 #TOC> 
 #TOC>   Section  Title                                                 Line
 #TOC> ---------------------------------------------------------------------
-#TOC>   1        Update your database script                             42
-#TOC>   1.1        Preparing an annotation file ...                      49
-#TOC>   1.1.1          BEFORE  "BIN-ALI-Optimal_sequence_alignment"      52
-#TOC>   1.1.2          AFTER "BIN-ALI-Optimal_sequence_alignment"        97
-#TOC>   1.2        Execute and Validate                                 124
-#TOC>   2        Plot Annotations                                       149
+#TOC>   1        Update your database script                             48
+#TOC>   1.1        Preparing an annotation file ...                      55
+#TOC>   1.1.1          BEFORE  "BIN-ALI-Optimal_sequence_alignment"      58
+#TOC>   1.1.2          AFTER "BIN-ALI-Optimal_sequence_alignment"       106
+#TOC>   1.2        Execute and Validate                                 133
+#TOC>   2        Plot Annotations                                       158
+#TOC>   3        SHARING DATA                                           283
+#TOC>   3.1        Post MBP1_MYSPE as JSON data                         298
 #TOC> 
 #TOC> ==========================================================================

@@ -276,16 +280,47 @@ par(oPar)  # reset the plot parameters
 #    It would be better to align the motif borders, at least approximately (not
 #    all proteins have all motifs). How would you go about doing that?

-# = 1 SHARING DATA ======
+# =    3  SHARING DATA  ========================================================

 # It's particularly interesting to compare such annotations across many
-# homologous proteins. I have created a file on the Student Wiki that you can
+# homologous proteins. I have created a page on the Student Wiki () that you can
 # edit, and then download the data from the entire class directly to your
 # RStudio project.
 #
+
+# I have provided a function that extracts all information that refers to a
+# single protein from the database, and prints it out as well-formatted JSON,
+# suitable to be pasted into our shareable Wiki-page. There is a fair amount of
+# bookkeeping involved, but the code is not otherwise very enlightening so I
+# will spare you the details - it's in "./scripts/ABC-dbUtilities.R" if you
+# would want to have a look.
+
+# ==   3.1  Post MBP1_MYSPE as JSON data  ======================================
+
 # Task:
 # =====
-#   TBC ...
+# 1: Run the following code:
+
+cat("{{Vspace}}",
+    "<!-- ==== BEGIN  PROTEIN ==== -->",
+    "<pre>",
+    dbProt2JSON(sprintf("MBP1_%s", biCode(MYSPE))),
+    "</pre>",
+    "<!-- ===== END PROTEIN ====== -->",
+    "", sep = "\n"
+)
+
+# 2: Copy the entire output,
+# 3: Navigate to
+#      http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public
+#    ... edit the page, and paste your output at the top.
+# 4: Save your edits.
+
+# Next, once we have collected a number of protein annotations, we can access
+# the page and import the data into our database.
+#
+# Code to come soon ...
+


 # [END]
--- a/scripts/ABC-dbUtilities.R
+++ b/scripts/ABC-dbUtilities.R
@@ -1,56 +1,87 @@
 # tocID <- "scripts/ABC-dbUtilities.R"
 #
-# database utilities for ABC learning units
+# Purpose: Database utilities for ABC learning units.
+#
+# Version 2.1
+#
+# Date:     2017-11  -  2020-10
+# Author:   Boris Steipe (boris.steipe@utoronto.ca)
+#
+# Versions:
+#           2.1    Add JSON export functions
+#           2.0    Test all JSON import and prevent addition of duplicates. This
+#                  is necessary for import of data from the public page
+#           1.1    2020 Updates
+#           1.0    Live version 2017
+#
+# Notes:
+#   There are no functions to modify or delete entries. To do either,
+#   recreate the database with correct data in the creation script. This is the
+#   preferred way that ensures the entire database can be reproduced by
+#   source()'ing its generating script.
+#
+#   Inserting data goes only through the very most minimal validation steps. For
+#   production applications, more validation would need to be added, as well
+#   as an overall validation of database integrity
+#
+# ToDo:
 #
 # ==============================================================================


 #TOC> ==========================================================================
 #TOC> 
-#TOC>   Section  Title                             Line
-#TOC> -------------------------------------------------
-#TOC>   1        PACKAGES                            32
-#TOC>   2        FUNCTIONS                           50
-#TOC>   2.01       dbSanitizeSequence()              53
-#TOC>   2.02       dbConfirmUnique()                 88
-#TOC>   2.03       dbInit()                         106
-#TOC>   2.04       dbAutoincrement()                147
-#TOC>   2.05       dbAddProtein()                   160
-#TOC>   2.06       dbAddFeature()                   180
-#TOC>   2.07       dbAddTaxonomy()                  199
-#TOC>   2.08       dbAddAnnotation()                215
-#TOC>   2.09       dbFetchUniProtSeq()              243
-#TOC>   2.10       dbFetchPrositeFeatures()         289
-#TOC>   2.11       node2text()                      339
-#TOC>   2.12       dbFetchNCBItaxData()             351
-#TOC>   2.13       UniProtIDmap()                   390
-#TOC>   3        TESTS                              429
+#TOC>   Section  Title                                   Line
+#TOC> -------------------------------------------------------
+#TOC>   1        INITIALISATIONS AND PARAMETERS            60
+#TOC>   2        PACKAGES                                  65
+#TOC>   3        FUNCTIONS                                 81
+#TOC>   3.01       dbSanitizeSequence()                    84
+#TOC>   3.02       dbConfirmUnique()                      119
+#TOC>   3.03       dbInit()                               137
+#TOC>   3.04       dbAutoincrement()                      177
+#TOC>   3.05       dbAddProtein()                         190
+#TOC>   3.06       dbAddFeature()                         222
+#TOC>   3.07       dbAddTaxonomy()                        253
+#TOC>   3.08       dbAddAnnotation()                      288
+#TOC>   3.09       dbFetchUniProtSeq()                    335
+#TOC>   3.10       dbFetchPrositeFeatures()               381
+#TOC>   3.11       node2text()                            431
+#TOC>   3.12       dbFetchNCBItaxData()                   443
+#TOC>   3.13       UniProtIDmap()                         482
+#TOC>   3.14       dbProt2JSON()                          521
+#TOC>   3.15       dbSeq2JSON()                           606
+#TOC>   3.16       dbRow2JSON()                           636
+#TOC>   4        TESTS                                    656
 #TOC> 
 #TOC> ==========================================================================


-# =    1  PACKAGES  ============================================================
+# =    1  INITIALISATIONS AND PARAMETERS  ======================================
+
+doTESTS <- FALSE  # run tests if TRUE
+
+
+# =    2  PACKAGES  ============================================================


 if (! requireNamespace("jsonlite", quietly = TRUE)) {
  install.packages("jsonlite")
 }

-
 if (! requireNamespace("httr", quietly = TRUE)) {
  install.packages("httr")
 }

-
 if (! requireNamespace("xml2", quietly = TRUE)) {
  install.packages("xml2")
 }


-# =    2  FUNCTIONS  ===========================================================
+# =    3  FUNCTIONS  ===========================================================


-# ==   2.01  dbSanitizeSequence()  =============================================
+# ==   3.01  dbSanitizeSequence()  =============================================
 dbSanitizeSequence <- function(s, unambiguous = TRUE) {
  # Remove FASTA header lines, if any,
  # flatten any structure that s has,
@@ -85,7 +116,7 @@ dbSanitizeSequence <- function(s, unambiguous = TRUE) {
 }


-# ==   2.02  dbConfirmUnique()  ================================================
+# ==   3.02  dbConfirmUnique()  ================================================
 dbConfirmUnique <- function(x) {
  # x is a vector of logicals.
  # returns x if x has exactly one TRUE element.
@@ -103,10 +134,10 @@ dbConfirmUnique <- function(x) {
 }


-# ==   2.03  dbInit()  =========================================================
+# ==   3.03  dbInit()  =========================================================
 dbInit <- function() {
  # Return an empty instance of the protein database
-  # Open the link and study the schema:
+  # The schema is here:
  # https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0

  db <- list()
@@ -125,7 +156,6 @@ dbInit <- function() {
    ID = numeric(),
    species = character())

-
  db$annotation <- data.frame(
    ID = numeric(),
    proteinID = numeric(),
@@ -144,7 +174,7 @@ dbInit <- function() {
 }


-# ==   2.04  dbAutoincrement()  ================================================
+# ==   3.04  dbAutoincrement()  ================================================
 dbAutoincrement <- function(tb) {
  # Return a unique integer that can be used as a primary key
  # Value:
@@ -157,90 +187,152 @@ dbAutoincrement <- function(tb) {
 }


-# ==   2.05  dbAddProtein()  ===================================================
+# ==   3.05  dbAddProtein()  ===================================================
 dbAddProtein <- function(db, jsonDF) {
-  # Add one or more protein entries to the database db.
+  # Add one or more protein entries to the database db if a protein with the
+  # same name does not yet exist. This enforces that protein names are unique.
  # Parameters:
  #     db   list   a database created with dbInit()
  #     jsonDF  data frame  protein data imported into a data frame with
  #                           fromJSON()
+
  for (i in seq_len(nrow(jsonDF))) {
-    x <- data.frame(ID          = dbAutoincrement(db$protein),
-                    name        = jsonDF$name[i],
-                    RefSeqID    = jsonDF$RefSeqID[i],
-                    UniProtID   = jsonDF$UniProtID[i],
-                    taxonomyID  = jsonDF$taxonomyID[i],
-                    sequence    = dbSanitizeSequence(jsonDF$sequence[i]))
-    db$protein <- rbind(db$protein, x)
+    if (jsonDF$name[i] %in% db$protein$name) {
+      cat(sprintf("Note: Protein No. %d in the input is \"%s\", but %s.\n",
+                  i, jsonDF$name[i],
+                  "a protein with this name already exists in the database. ",
+                  "Skipping this input."))
+      isValid <- FALSE
+    }
+
+    if (isValid) {
+      x <- data.frame(ID          = dbAutoincrement(db$protein),
+                      name        = jsonDF$name[i],
+                      RefSeqID    = jsonDF$RefSeqID[i],
+                      UniProtID   = jsonDF$UniProtID[i],
+                      taxonomyID  = jsonDF$taxonomyID[i],
+                      sequence    = dbSanitizeSequence(jsonDF$sequence[i]))
+      db$protein <- rbind(db$protein, x)
+    }
  }
  return(db)
 }


-# ==   2.06  dbAddFeature()  ===================================================
+# ==   3.06  dbAddFeature()  ===================================================
 dbAddFeature <- function(db, jsonDF) {
-  # Add one or more feature entries to the database db.
+  # Add one or more feature entries to the database db. Skip if a feature with
+  # the same name already exists.
  # Parameters:
  #     db   list   a database created with dbInit()
  #     jsonDF  data frame  feature data imported into a data frame with
  #                           fromJSON()
  for (i in seq_len(nrow(jsonDF))) {
-    x <- data.frame(ID          = dbAutoincrement(db$feature),
-                    name        = jsonDF$name[i],
-                    description = jsonDF$description[i],
-                    sourceDB    = jsonDF$sourceDB[i],
-                    accession   = jsonDF$accession[i])
-    db$feature <- rbind(db$feature, x)
+    isValid <- TRUE
+    if (jsonDF$name[i] %in% db$feature$name) {
+      cat(sprintf("Note: Feature No. %d in the input is \"%s\", but %s.\n",
+                  i, jsonDF$name[i],
+                  "a feature with this name already exists in the database. ",
+                  "Skipping this input."))
+      isValid <- FALSE
+    }
+
+    if (isVALID) {
+      x <- data.frame(ID          = dbAutoincrement(db$feature),
+                      name        = jsonDF$name[i],
+                      description = jsonDF$description[i],
+                      sourceDB    = jsonDF$sourceDB[i],
+                      accession   = jsonDF$accession[i])
+      db$feature <- rbind(db$feature, x)
+    }
  }
  return(db)
 }


-# ==   2.07  dbAddTaxonomy()  ==================================================
+# ==   3.07  dbAddTaxonomy()  ==================================================
 dbAddTaxonomy <- function(db, jsonDF) {
-  # Add one or more taxonomy entries to the database db.
+  # Add one or more taxonomy entries to the database db. Skip if species name
+  # or taxonomy ID already exist in the database.
  # Parameters:
  #     db      list         A database created with dbInit()
  #     jsonDF  data frame   Taxonomy data imported into a data frame with
  #                            fromJSON()
  for (i in seq_len(nrow(jsonDF))) {
-    x <- data.frame(
-      ID =  jsonDF$ID[i],
-      species = jsonDF$species[i])
-    db$taxonomy <- rbind(db$taxonomy, x)
+    isValid <- TRUE
+
+    if (jsonDF$species[i] %in% db$taxonomy$species) {
+      cat(sprintf("Note: Species No. %d in the input is \"%s\", but %s%s\n",
+                  i, jsonDF$name[i],
+                  "a species with this name already exists in the database. ",
+                  "Skipping this input."))
+      isValid <- FALSE
+    } else if (jsonDF$ID[i] %in% db$taxonomy$ID) {
+      cat(sprintf("Note: Taxonomy ID No. %d in the input is \"%d\", but %s%s\n",
+                  i, jsonDF$ID[i],
+                  "this taxonomy ID already exists in the database. ",
+                  "Skipping this input."))
+      isValid <- FALSE
+    }
+    if (isValid) {
+      x <- data.frame(
+        ID =  as.integer(jsonDF$ID[i]),
+        species = jsonDF$species[i])
+      db$taxonomy <- rbind(db$taxonomy, x)
+    }
  }
  return(db)
 }

-# ==   2.08  dbAddAnnotation()  ================================================
+
+# ==   3.08  dbAddAnnotation()  ================================================
 dbAddAnnotation <- function(db, jsonDF) {
-  # Add one or more annotation entries to the database db.
+  # Add one or more annotation entries to the database db. Skip the entry if
+  # it already exists in the database.
  # Parameters:
  #     db   list   a database created with dbInit()
  #     jsonDF  data frame  annotation data imported into a data frame with
  #                           fromJSON()
  for (i in seq_len(nrow(jsonDF))) {
+    isValid <- TRUE

    sel <- jsonDF$pName[i] == db$protein$name
-    sel <- dbConfirmUnique(sel)
+    sel <- dbConfirmUnique(sel)    # Confirm that this protein ID exists
    pID <- db$protein$ID[sel]

    sel <- jsonDF$fName[i] == db$feature$name
-    sel <- dbConfirmUnique(sel)
+    sel <- dbConfirmUnique(sel)    # Confirm that this feature ID exists
    fID <- db$feature$ID[sel]

-    x <- data.frame(ID        = dbAutoincrement(db$annotation),
-                    proteinID = pID,
-                    featureID = fID,
-                    start     = as.integer(jsonDF$start[i]),
-                    end       = as.integer(jsonDF$end[i]))
-    db$annotation <- rbind(db$annotation, x)
+    sel <- db$annotation$proteinID == pID &
+           db$annotation$featureID == fID &
+           db$annotation$start == as.integer(jsonDF$start[idx]) &
+           db$annotation$end   == as.integer(jsonDF$end[idx])
+
+    if (any(sel)) {
+      cat(sprintf("Note: annotation No. %d in the input has %s%s%\n",
+                  i,
+                  "the same protein name, feature name, start, and end ",
+                  "as one that already exists in the database. ",
+                  "Skipping this input."))
+
+      isValid <- FALSE
+    }
+
+    if (isValid) {
+      x <- data.frame(ID        = dbAutoincrement(db$annotation),
+                      proteinID = pID,
+                      featureID = fID,
+                      start     = as.integer(jsonDF$start[i]),
+                      end       = as.integer(jsonDF$end[i]))
+      db$annotation <- rbind(db$annotation, x)
+    }
  }
  return(db)
 }


-# ==   2.09  dbFetchUniProtSeq()  ==============================================
+# ==   3.09  dbFetchUniProtSeq()  ==============================================
 dbFetchUniProtSeq <- function(IDs) {
  # Fetch a protein sequence from UniProt.
  # Parameters:
@@ -286,7 +378,7 @@ if (FALSE) {



-# ==   2.10  dbFetchPrositeFeatures()  =========================================
+# ==   3.10  dbFetchPrositeFeatures()  =========================================
 dbFetchPrositeFeatures <- function(ID) {
  # Fetch feature annotations from ScanProsite.
  # Parameters:
@@ -336,7 +428,7 @@ if (FALSE) {

 }

-# ==   2.11  node2text()  ======================================================
+# ==   3.11  node2text()  ======================================================
 node2text <- function(doc, tag) {
  # an extractor function for the contents of elements
  # between given tags in an XML response.
@@ -348,7 +440,7 @@ node2text <- function(doc, tag) {
 }


-# ==   2.12  dbFetchNCBItaxData()  =============================================
+# ==   3.12  dbFetchNCBItaxData()  =============================================
 dbFetchNCBItaxData <- function(ID) {
  # Fetch feature taxID and Organism from the NCBI.
  # Parameters:
@@ -387,7 +479,7 @@ dbFetchNCBItaxData <- function(ID) {



-# ==   2.13  UniProtIDmap()  ===================================================
+# ==   3.13  UniProtIDmap()  ===================================================
 UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
  # Use UniProt ID mapping service to map one or more IDs
  # Parameters:
@@ -426,9 +518,144 @@ UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
 }


-# =    3  TESTS  ===============================================================
+# ==   3.14  dbProt2JSON()  ====================================================
+dbProt2JSON <- function(thisProt) {
+  # Extract all protein related data from myDB and return in JSON format.

-if (FALSE) {
+  thisData <- list()
+
+  # add a protein table
+  sel <- which(myDB$protein$name == thisProt)
+  thisData$protein <- myDB$protein[sel, ]
+
+  # add a taxonomy table
+  sel <- which(myDB$taxonomy$ID == thisData$protein$taxonomyID)
+  thisData$taxonomy <- myDB$taxonomy[sel, ]
+
+  # add the entries for this protein from the  annotation table
+  sel <- which(myDB$annotation$proteinID == thisData$protein$ID)
+  thisData$annotation <- myDB$annotation[sel, ]
+  # our .json convention uses pName and fName as keys, not the db-internal IDs
+  # add empty columns for pName and fName
+  l <- nrow(thisData$annotation)
+  thisData$annotation$pName <- character(l)
+  thisData$annotation$fName <- character(l)
+  # get the appropriate protein and feature names
+  for (i in seq_len(l)) {
+    pID <- thisData$annotation$proteinID[i]
+    sel <- which(myDB$protein$ID == pID)
+    thisData$annotation$pName[i] <- myDB$protein$name[sel]    # store pName
+    fID <- thisData$annotation$featureID[i]
+    sel <- which(myDB$feature$ID == fID)
+    thisData$annotation$fName[i] <- myDB$feature$name[sel]    # store fName
+  }
+
+  # add the corresponding feature table
+  sel <- which(myDB$feature$ID %in% thisData$annotation$featureID)
+  thisData$feature <- myDB$feature[sel, ]
+
+  # remove columns that are not going into JSON output
+  thisData$protein$ID           <- NULL
+  thisData$annotation$ID        <- NULL
+  thisData$annotation$proteinID <- NULL
+  thisData$annotation$featureID <- NULL
+  thisData$feature$ID           <- NULL
+
+  # create JSON-formatted output
+  # ( jsonlite::prettify() is too  wordy for a compact Wikipage )
+
+  out <- character()
+  out <- c(out, '{')
+
+  out <- c(out, '  "protein": {')
+  sel <- colnames(thisData$protein) != "sequence"
+  out <- c(out, sprintf("    %s,", dbRow2JSON(thisData$protein[1, sel],
+                                              coll = ",\n    ")))
+  out <- c(out, dbSeq2JSON(thisData$protein$sequence[1]))
+  out <- c(out, '  },')
+
+  out <- c(out, '  "taxonomy": {')
+  out <- c(out, sprintf("    %s", dbRow2JSON(thisData$taxonomy)))
+  out <- c(out, '  },')
+
+  out <- c(out, '  "annotation": [')
+  for (i in seq_len(nrow(thisData$annotation))) {
+    out <- c(out, sprintf("    {%s},", dbRow2JSON(thisData$annotation[i, ])))
+  }
+  out[length(out)] <- gsub(",$", "", out[length(out)]) # remove last ","
+  out <- c(out, '  ],')
+
+  out <- c(out, '  "feature": [')
+  sel <- colnames(thisData$feature) != "description"
+  for (i in seq_len(nrow(thisData$feature))) {
+    out <- c(out, sprintf("    {%s,",
+                          dbRow2JSON(thisData$feature[i, sel])))
+    out <- c(out, sprintf("     %s},",
+                          dbRow2JSON(thisData$feature[i, "description",
+                                                      drop = FALSE])))
+  }
+  out[length(out)] <- gsub(",$", "", out[length(out)]) # remove last ","
+  out <- c(out, '  ]')
+
+  out <- c(out, '}')
+
+  return(paste0(out, collapse = "\n"))
+}
+
+
+# ==   3.15  dbSeq2JSON()  =====================================================
+
+dbSeq2JSON <- function(s, nIndents = 4, width = 70) {
+  # Turn a sequence into a JSON key-value pair, with the value being a JSON
+  # array of elements not exceeding a width of "width", and an indent of
+  # "indents" spaces.
+  ind <- paste0(rep(" ", nIndents), collapse = "")
+
+  out <- character()
+  out <- c(out, sprintf("%s\"sequence\" : [", ind))
+
+  for (i in seq_along(s)) {
+    l <- nchar(s[i])
+    if (l <= width) {
+      out <- c(out, s[i])
+    } else {
+      starts <- seq(1, l, by = width)
+      ends <- seq(width, l, by = width)
+      if (length(ends) < length(starts)) { ends <- c(ends, l) }
+      out <- c(out, sprintf("%s  \"%s\",", ind, substring(s[i], starts, ends)))
+    }
+  }
+  out[length(out)] <- gsub(",$", "", out[length(out)]) # remove last ","
+
+  out <- c(out, sprintf("%s]", ind))
+  return(paste0(out, collapse = "\n"))
+}
+cat(dbSeq2JSON(myDB$protein$sequence[1]))
+
+
+# ==   3.16  dbRow2JSON()  =====================================================
+
+dbRow2JSON <- function(df, coll = ", ") {
+  # Turn a single dataframe row into JSON key value pairs, where the keys are the
+  # column names. Respects character / numeric mode.
+  out <- character()
+  for (i in 1:ncol(df)) {
+    if (class(df[1, i]) == "integer") {
+      val <- sprintf("%d", df[1, i])
+    } else if (class(df[1, i]) == "numeric") {
+      val <- sprintf("%f", df[1, i])
+    } else {
+      val <- sprintf("\"%s\"", as.character(df[1, i]))
+    }
+    out <- c(out, sprintf("\"%s\": %s", colnames(df)[i], val))
+  }
+  return(paste0(out, collapse = coll))
+}
+
+
+# =    4  TESTS  ===============================================================
+
+if (doTESTS) {
  if (! requireNamespace("testthat", quietly = TRUE)) {
    install.packages("testthat")
  }