Add code and utility functions for database export of protein annotations to JSON - towards sharing annotations on the Student Wiki Public page
This commit is contained in:
		@@ -3,12 +3,14 @@
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-FUNC-Domain_annotation unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.2
 | 
			
		||||
# Version:  1.3
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-11  -  2020-10
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.3    Add code for database export to JSON and instructions
 | 
			
		||||
#                  for uploading annotations to the Public Student Wiki page
 | 
			
		||||
#           1.2    Consistently: data in ./myScripts/ ;
 | 
			
		||||
#                    begin SHARING DATA section
 | 
			
		||||
#           1.1    2020 Updates
 | 
			
		||||
@@ -28,16 +30,18 @@
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                                 Line
 | 
			
		||||
#TOC> ---------------------------------------------------------------------
 | 
			
		||||
#TOC>   1        Update your database script                             42
 | 
			
		||||
#TOC>   1.1        Preparing an annotation file ...                      49
 | 
			
		||||
#TOC>   1.1.1          BEFORE  "BIN-ALI-Optimal_sequence_alignment"      52
 | 
			
		||||
#TOC>   1.1.2          AFTER "BIN-ALI-Optimal_sequence_alignment"        97
 | 
			
		||||
#TOC>   1.2        Execute and Validate                                 124
 | 
			
		||||
#TOC>   2        Plot Annotations                                       149
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC>   1        Update your database script                             48
 | 
			
		||||
#TOC>   1.1        Preparing an annotation file ...                      55
 | 
			
		||||
#TOC>   1.1.1          BEFORE  "BIN-ALI-Optimal_sequence_alignment"      58
 | 
			
		||||
#TOC>   1.1.2          AFTER "BIN-ALI-Optimal_sequence_alignment"       106
 | 
			
		||||
#TOC>   1.2        Execute and Validate                                 133
 | 
			
		||||
#TOC>   2        Plot Annotations                                       158
 | 
			
		||||
#TOC>   3        SHARING DATA                                           283
 | 
			
		||||
#TOC>   3.1        Post MBP1_MYSPE as JSON data                         298
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -99,7 +103,7 @@
 | 
			
		||||
# Then SKIP the next section.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# ===   1.1.2  AFTER "BIN-ALI-Optimal_sequence_alignment"
 | 
			
		||||
# ===   1.1.2  AFTER "BIN-ALI-Optimal_sequence_alignment"  
 | 
			
		||||
#
 | 
			
		||||
#   IF YOU HAVE ALREADY COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT:
 | 
			
		||||
#
 | 
			
		||||
@@ -276,16 +280,47 @@ par(oPar)  # reset the plot parameters
 | 
			
		||||
#    It would be better to align the motif borders, at least approximately (not
 | 
			
		||||
#    all proteins have all motifs). How would you go about doing that?
 | 
			
		||||
 | 
			
		||||
# = 1 SHARING DATA ======
 | 
			
		||||
# =    3  SHARING DATA  ========================================================
 | 
			
		||||
 | 
			
		||||
# It's particularly interesting to compare such annotations across many
 | 
			
		||||
# homologous proteins. I have created a file on the Student Wiki that you can
 | 
			
		||||
# homologous proteins. I have created a page on the Student Wiki () that you can
 | 
			
		||||
# edit, and then download the data from the entire class directly to your
 | 
			
		||||
# RStudio project.
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
# I have provided a function that extracts all information that refers to a
 | 
			
		||||
# single protein from the database, and prints it out as well-formatted JSON,
 | 
			
		||||
# suitable to be pasted into our shareable Wiki-page. There is a fair amount of
 | 
			
		||||
# bookkeeping involved, but the code is not otherwise very enlightening so I
 | 
			
		||||
# will spare you the details - it's in "./scripts/ABC-dbUtilities.R" if you
 | 
			
		||||
# would want to have a look.
 | 
			
		||||
 | 
			
		||||
# ==   3.1  Post MBP1_MYSPE as JSON data  ======================================
 | 
			
		||||
 | 
			
		||||
# Task:
 | 
			
		||||
# =====
 | 
			
		||||
#   TBC ...
 | 
			
		||||
# 1: Run the following code:
 | 
			
		||||
 | 
			
		||||
cat("{{Vspace}}",
 | 
			
		||||
    "<!-- ==== BEGIN  PROTEIN ==== -->",
 | 
			
		||||
    "<pre>",
 | 
			
		||||
    dbProt2JSON(sprintf("MBP1_%s", biCode(MYSPE))),
 | 
			
		||||
    "</pre>",
 | 
			
		||||
    "<!-- ===== END PROTEIN ====== -->",
 | 
			
		||||
    "", sep = "\n"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
# 2: Copy the entire output,
 | 
			
		||||
# 3: Navigate to
 | 
			
		||||
#      http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public
 | 
			
		||||
#    ... edit the page, and paste your output at the top.
 | 
			
		||||
# 4: Save your edits.
 | 
			
		||||
 | 
			
		||||
# Next, once we have collected a number of protein annotations, we can access
 | 
			
		||||
# the page and import the data into our database.
 | 
			
		||||
#
 | 
			
		||||
# Code to come soon ...
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,56 +1,87 @@
 | 
			
		||||
# tocID <- "scripts/ABC-dbUtilities.R"
 | 
			
		||||
#
 | 
			
		||||
# database utilities for ABC learning units
 | 
			
		||||
# Purpose: Database utilities for ABC learning units.
 | 
			
		||||
#
 | 
			
		||||
# Version 2.1
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-11  -  2020-10
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           2.1    Add JSON export functions
 | 
			
		||||
#           2.0    Test all JSON import and prevent addition of duplicates. This
 | 
			
		||||
#                  is necessary for import of data from the public page
 | 
			
		||||
#           1.1    2020 Updates
 | 
			
		||||
#           1.0    Live version 2017
 | 
			
		||||
#
 | 
			
		||||
# Notes:
 | 
			
		||||
#   There are no functions to modify or delete entries. To do either,
 | 
			
		||||
#   recreate the database with correct data in the creation script. This is the
 | 
			
		||||
#   preferred way that ensures the entire database can be reproduced by
 | 
			
		||||
#   source()'ing its generating script.
 | 
			
		||||
#
 | 
			
		||||
#   Inserting data goes only through the very most minimal validation steps. For
 | 
			
		||||
#   production applications, more validation would need to be added, as well
 | 
			
		||||
#   as an overall validation of database integrity
 | 
			
		||||
#
 | 
			
		||||
# ToDo:
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                             Line
 | 
			
		||||
#TOC> -------------------------------------------------
 | 
			
		||||
#TOC>   1        PACKAGES                            32
 | 
			
		||||
#TOC>   2        FUNCTIONS                           50
 | 
			
		||||
#TOC>   2.01       dbSanitizeSequence()              53
 | 
			
		||||
#TOC>   2.02       dbConfirmUnique()                 88
 | 
			
		||||
#TOC>   2.03       dbInit()                         106
 | 
			
		||||
#TOC>   2.04       dbAutoincrement()                147
 | 
			
		||||
#TOC>   2.05       dbAddProtein()                   160
 | 
			
		||||
#TOC>   2.06       dbAddFeature()                   180
 | 
			
		||||
#TOC>   2.07       dbAddTaxonomy()                  199
 | 
			
		||||
#TOC>   2.08       dbAddAnnotation()                215
 | 
			
		||||
#TOC>   2.09       dbFetchUniProtSeq()              243
 | 
			
		||||
#TOC>   2.10       dbFetchPrositeFeatures()         289
 | 
			
		||||
#TOC>   2.11       node2text()                      339
 | 
			
		||||
#TOC>   2.12       dbFetchNCBItaxData()             351
 | 
			
		||||
#TOC>   2.13       UniProtIDmap()                   390
 | 
			
		||||
#TOC>   3        TESTS                              429
 | 
			
		||||
#TOC>   Section  Title                                   Line
 | 
			
		||||
#TOC> -------------------------------------------------------
 | 
			
		||||
#TOC>   1        INITIALISATIONS AND PARAMETERS            60
 | 
			
		||||
#TOC>   2        PACKAGES                                  65
 | 
			
		||||
#TOC>   3        FUNCTIONS                                 81
 | 
			
		||||
#TOC>   3.01       dbSanitizeSequence()                    84
 | 
			
		||||
#TOC>   3.02       dbConfirmUnique()                      119
 | 
			
		||||
#TOC>   3.03       dbInit()                               137
 | 
			
		||||
#TOC>   3.04       dbAutoincrement()                      177
 | 
			
		||||
#TOC>   3.05       dbAddProtein()                         190
 | 
			
		||||
#TOC>   3.06       dbAddFeature()                         222
 | 
			
		||||
#TOC>   3.07       dbAddTaxonomy()                        253
 | 
			
		||||
#TOC>   3.08       dbAddAnnotation()                      288
 | 
			
		||||
#TOC>   3.09       dbFetchUniProtSeq()                    335
 | 
			
		||||
#TOC>   3.10       dbFetchPrositeFeatures()               381
 | 
			
		||||
#TOC>   3.11       node2text()                            431
 | 
			
		||||
#TOC>   3.12       dbFetchNCBItaxData()                   443
 | 
			
		||||
#TOC>   3.13       UniProtIDmap()                         482
 | 
			
		||||
#TOC>   3.14       dbProt2JSON()                          521
 | 
			
		||||
#TOC>   3.15       dbSeq2JSON()                           606
 | 
			
		||||
#TOC>   3.16       dbRow2JSON()                           636
 | 
			
		||||
#TOC>   4        TESTS                                    656
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  PACKAGES  ============================================================
 | 
			
		||||
# =    1  INITIALISATIONS AND PARAMETERS  ======================================
 | 
			
		||||
 | 
			
		||||
doTESTS <- FALSE  # run tests if TRUE
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  PACKAGES  ============================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("jsonlite", quietly = TRUE)) {
 | 
			
		||||
  install.packages("jsonlite")
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("httr", quietly = TRUE)) {
 | 
			
		||||
  install.packages("httr")
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("xml2", quietly = TRUE)) {
 | 
			
		||||
  install.packages("xml2")
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  FUNCTIONS  ===========================================================
 | 
			
		||||
# =    3  FUNCTIONS  ===========================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   2.01  dbSanitizeSequence()  =============================================
 | 
			
		||||
# ==   3.01  dbSanitizeSequence()  =============================================
 | 
			
		||||
dbSanitizeSequence <- function(s, unambiguous = TRUE) {
 | 
			
		||||
  # Remove FASTA header lines, if any,
 | 
			
		||||
  # flatten any structure that s has,
 | 
			
		||||
@@ -85,7 +116,7 @@ dbSanitizeSequence <- function(s, unambiguous = TRUE) {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   2.02  dbConfirmUnique()  ================================================
 | 
			
		||||
# ==   3.02  dbConfirmUnique()  ================================================
 | 
			
		||||
dbConfirmUnique <- function(x) {
 | 
			
		||||
  # x is a vector of logicals.
 | 
			
		||||
  # returns x if x has exactly one TRUE element.
 | 
			
		||||
@@ -103,10 +134,10 @@ dbConfirmUnique <- function(x) {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   2.03  dbInit()  =========================================================
 | 
			
		||||
# ==   3.03  dbInit()  =========================================================
 | 
			
		||||
dbInit <- function() {
 | 
			
		||||
  # Return an empty instance of the protein database
 | 
			
		||||
  # Open the link and study the schema:
 | 
			
		||||
  # The schema is here:
 | 
			
		||||
  # https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0
 | 
			
		||||
 | 
			
		||||
  db <- list()
 | 
			
		||||
@@ -125,7 +156,6 @@ dbInit <- function() {
 | 
			
		||||
    ID = numeric(),
 | 
			
		||||
    species = character())
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  db$annotation <- data.frame(
 | 
			
		||||
    ID = numeric(),
 | 
			
		||||
    proteinID = numeric(),
 | 
			
		||||
@@ -144,7 +174,7 @@ dbInit <- function() {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   2.04  dbAutoincrement()  ================================================
 | 
			
		||||
# ==   3.04  dbAutoincrement()  ================================================
 | 
			
		||||
dbAutoincrement <- function(tb) {
 | 
			
		||||
  # Return a unique integer that can be used as a primary key
 | 
			
		||||
  # Value:
 | 
			
		||||
@@ -157,90 +187,152 @@ dbAutoincrement <- function(tb) {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   2.05  dbAddProtein()  ===================================================
 | 
			
		||||
# ==   3.05  dbAddProtein()  ===================================================
 | 
			
		||||
dbAddProtein <- function(db, jsonDF) {
 | 
			
		||||
  # Add one or more protein entries to the database db.
 | 
			
		||||
  # Add one or more protein entries to the database db if a protein with the
 | 
			
		||||
  # same name does not yet exist. This enforces that protein names are unique.
 | 
			
		||||
  # Parameters:
 | 
			
		||||
  #     db   list   a database created with dbInit()
 | 
			
		||||
  #     jsonDF  data frame  protein data imported into a data frame with
 | 
			
		||||
  #                           fromJSON()
 | 
			
		||||
 | 
			
		||||
  for (i in seq_len(nrow(jsonDF))) {
 | 
			
		||||
    x <- data.frame(ID          = dbAutoincrement(db$protein),
 | 
			
		||||
                    name        = jsonDF$name[i],
 | 
			
		||||
                    RefSeqID    = jsonDF$RefSeqID[i],
 | 
			
		||||
                    UniProtID   = jsonDF$UniProtID[i],
 | 
			
		||||
                    taxonomyID  = jsonDF$taxonomyID[i],
 | 
			
		||||
                    sequence    = dbSanitizeSequence(jsonDF$sequence[i]))
 | 
			
		||||
    db$protein <- rbind(db$protein, x)
 | 
			
		||||
    if (jsonDF$name[i] %in% db$protein$name) {
 | 
			
		||||
      cat(sprintf("Note: Protein No. %d in the input is \"%s\", but %s.\n",
 | 
			
		||||
                  i, jsonDF$name[i],
 | 
			
		||||
                  "a protein with this name already exists in the database. ",
 | 
			
		||||
                  "Skipping this input."))
 | 
			
		||||
      isValid <- FALSE
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (isValid) {
 | 
			
		||||
      x <- data.frame(ID          = dbAutoincrement(db$protein),
 | 
			
		||||
                      name        = jsonDF$name[i],
 | 
			
		||||
                      RefSeqID    = jsonDF$RefSeqID[i],
 | 
			
		||||
                      UniProtID   = jsonDF$UniProtID[i],
 | 
			
		||||
                      taxonomyID  = jsonDF$taxonomyID[i],
 | 
			
		||||
                      sequence    = dbSanitizeSequence(jsonDF$sequence[i]))
 | 
			
		||||
      db$protein <- rbind(db$protein, x)
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  return(db)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   2.06  dbAddFeature()  ===================================================
 | 
			
		||||
# ==   3.06  dbAddFeature()  ===================================================
 | 
			
		||||
dbAddFeature <- function(db, jsonDF) {
 | 
			
		||||
  # Add one or more feature entries to the database db.
 | 
			
		||||
  # Add one or more feature entries to the database db. Skip if a feature with
 | 
			
		||||
  # the same name already exists.
 | 
			
		||||
  # Parameters:
 | 
			
		||||
  #     db   list   a database created with dbInit()
 | 
			
		||||
  #     jsonDF  data frame  feature data imported into a data frame with
 | 
			
		||||
  #                           fromJSON()
 | 
			
		||||
  for (i in seq_len(nrow(jsonDF))) {
 | 
			
		||||
    x <- data.frame(ID          = dbAutoincrement(db$feature),
 | 
			
		||||
                    name        = jsonDF$name[i],
 | 
			
		||||
                    description = jsonDF$description[i],
 | 
			
		||||
                    sourceDB    = jsonDF$sourceDB[i],
 | 
			
		||||
                    accession   = jsonDF$accession[i])
 | 
			
		||||
    db$feature <- rbind(db$feature, x)
 | 
			
		||||
    isValid <- TRUE
 | 
			
		||||
    if (jsonDF$name[i] %in% db$feature$name) {
 | 
			
		||||
      cat(sprintf("Note: Feature No. %d in the input is \"%s\", but %s.\n",
 | 
			
		||||
                  i, jsonDF$name[i],
 | 
			
		||||
                  "a feature with this name already exists in the database. ",
 | 
			
		||||
                  "Skipping this input."))
 | 
			
		||||
      isValid <- FALSE
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (isVALID) {
 | 
			
		||||
      x <- data.frame(ID          = dbAutoincrement(db$feature),
 | 
			
		||||
                      name        = jsonDF$name[i],
 | 
			
		||||
                      description = jsonDF$description[i],
 | 
			
		||||
                      sourceDB    = jsonDF$sourceDB[i],
 | 
			
		||||
                      accession   = jsonDF$accession[i])
 | 
			
		||||
      db$feature <- rbind(db$feature, x)
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  return(db)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   2.07  dbAddTaxonomy()  ==================================================
 | 
			
		||||
# ==   3.07  dbAddTaxonomy()  ==================================================
 | 
			
		||||
dbAddTaxonomy <- function(db, jsonDF) {
 | 
			
		||||
  # Add one or more taxonomy entries to the database db.
 | 
			
		||||
  # Add one or more taxonomy entries to the database db. Skip if species name
 | 
			
		||||
  # or taxonomy ID already exist in the database.
 | 
			
		||||
  # Parameters:
 | 
			
		||||
  #     db      list         A database created with dbInit()
 | 
			
		||||
  #     jsonDF  data frame   Taxonomy data imported into a data frame with
 | 
			
		||||
  #                            fromJSON()
 | 
			
		||||
  for (i in seq_len(nrow(jsonDF))) {
 | 
			
		||||
    x <- data.frame(
 | 
			
		||||
      ID =  jsonDF$ID[i],
 | 
			
		||||
      species = jsonDF$species[i])
 | 
			
		||||
    db$taxonomy <- rbind(db$taxonomy, x)
 | 
			
		||||
    isValid <- TRUE
 | 
			
		||||
 | 
			
		||||
    if (jsonDF$species[i] %in% db$taxonomy$species) {
 | 
			
		||||
      cat(sprintf("Note: Species No. %d in the input is \"%s\", but %s%s\n",
 | 
			
		||||
                  i, jsonDF$name[i],
 | 
			
		||||
                  "a species with this name already exists in the database. ",
 | 
			
		||||
                  "Skipping this input."))
 | 
			
		||||
      isValid <- FALSE
 | 
			
		||||
    } else if (jsonDF$ID[i] %in% db$taxonomy$ID) {
 | 
			
		||||
      cat(sprintf("Note: Taxonomy ID No. %d in the input is \"%d\", but %s%s\n",
 | 
			
		||||
                  i, jsonDF$ID[i],
 | 
			
		||||
                  "this taxonomy ID already exists in the database. ",
 | 
			
		||||
                  "Skipping this input."))
 | 
			
		||||
      isValid <- FALSE
 | 
			
		||||
    }
 | 
			
		||||
    if (isValid) {
 | 
			
		||||
      x <- data.frame(
 | 
			
		||||
        ID =  as.integer(jsonDF$ID[i]),
 | 
			
		||||
        species = jsonDF$species[i])
 | 
			
		||||
      db$taxonomy <- rbind(db$taxonomy, x)
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  return(db)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# ==   2.08  dbAddAnnotation()  ================================================
 | 
			
		||||
 | 
			
		||||
# ==   3.08  dbAddAnnotation()  ================================================
 | 
			
		||||
dbAddAnnotation <- function(db, jsonDF) {
 | 
			
		||||
  # Add one or more annotation entries to the database db.
 | 
			
		||||
  # Add one or more annotation entries to the database db. Skip the entry if
 | 
			
		||||
  # it already exists in the database.
 | 
			
		||||
  # Parameters:
 | 
			
		||||
  #     db   list   a database created with dbInit()
 | 
			
		||||
  #     jsonDF  data frame  annotation data imported into a data frame with
 | 
			
		||||
  #                           fromJSON()
 | 
			
		||||
  for (i in seq_len(nrow(jsonDF))) {
 | 
			
		||||
    isValid <- TRUE
 | 
			
		||||
 | 
			
		||||
    sel <- jsonDF$pName[i] == db$protein$name
 | 
			
		||||
    sel <- dbConfirmUnique(sel)
 | 
			
		||||
    sel <- dbConfirmUnique(sel)    # Confirm that this protein ID exists
 | 
			
		||||
    pID <- db$protein$ID[sel]
 | 
			
		||||
 | 
			
		||||
    sel <- jsonDF$fName[i] == db$feature$name
 | 
			
		||||
    sel <- dbConfirmUnique(sel)
 | 
			
		||||
    sel <- dbConfirmUnique(sel)    # Confirm that this feature ID exists
 | 
			
		||||
    fID <- db$feature$ID[sel]
 | 
			
		||||
 | 
			
		||||
    x <- data.frame(ID        = dbAutoincrement(db$annotation),
 | 
			
		||||
                    proteinID = pID,
 | 
			
		||||
                    featureID = fID,
 | 
			
		||||
                    start     = as.integer(jsonDF$start[i]),
 | 
			
		||||
                    end       = as.integer(jsonDF$end[i]))
 | 
			
		||||
    db$annotation <- rbind(db$annotation, x)
 | 
			
		||||
    sel <- db$annotation$proteinID == pID &
 | 
			
		||||
           db$annotation$featureID == fID &
 | 
			
		||||
           db$annotation$start == as.integer(jsonDF$start[idx]) &
 | 
			
		||||
           db$annotation$end   == as.integer(jsonDF$end[idx])
 | 
			
		||||
 | 
			
		||||
    if (any(sel)) {
 | 
			
		||||
      cat(sprintf("Note: annotation No. %d in the input has %s%s%\n",
 | 
			
		||||
                  i,
 | 
			
		||||
                  "the same protein name, feature name, start, and end ",
 | 
			
		||||
                  "as one that already exists in the database. ",
 | 
			
		||||
                  "Skipping this input."))
 | 
			
		||||
 | 
			
		||||
      isValid <- FALSE
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (isValid) {
 | 
			
		||||
      x <- data.frame(ID        = dbAutoincrement(db$annotation),
 | 
			
		||||
                      proteinID = pID,
 | 
			
		||||
                      featureID = fID,
 | 
			
		||||
                      start     = as.integer(jsonDF$start[i]),
 | 
			
		||||
                      end       = as.integer(jsonDF$end[i]))
 | 
			
		||||
      db$annotation <- rbind(db$annotation, x)
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  return(db)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   2.09  dbFetchUniProtSeq()  ==============================================
 | 
			
		||||
# ==   3.09  dbFetchUniProtSeq()  ==============================================
 | 
			
		||||
dbFetchUniProtSeq <- function(IDs) {
 | 
			
		||||
  # Fetch a protein sequence from UniProt.
 | 
			
		||||
  # Parameters:
 | 
			
		||||
@@ -286,7 +378,7 @@ if (FALSE) {
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   2.10  dbFetchPrositeFeatures()  =========================================
 | 
			
		||||
# ==   3.10  dbFetchPrositeFeatures()  =========================================
 | 
			
		||||
dbFetchPrositeFeatures <- function(ID) {
 | 
			
		||||
  # Fetch feature annotations from ScanProsite.
 | 
			
		||||
  # Parameters:
 | 
			
		||||
@@ -336,7 +428,7 @@ if (FALSE) {
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# ==   2.11  node2text()  ======================================================
 | 
			
		||||
# ==   3.11  node2text()  ======================================================
 | 
			
		||||
node2text <- function(doc, tag) {
 | 
			
		||||
  # an extractor function for the contents of elements
 | 
			
		||||
  # between given tags in an XML response.
 | 
			
		||||
@@ -348,7 +440,7 @@ node2text <- function(doc, tag) {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   2.12  dbFetchNCBItaxData()  =============================================
 | 
			
		||||
# ==   3.12  dbFetchNCBItaxData()  =============================================
 | 
			
		||||
dbFetchNCBItaxData <- function(ID) {
 | 
			
		||||
  # Fetch feature taxID and Organism from the NCBI.
 | 
			
		||||
  # Parameters:
 | 
			
		||||
@@ -387,7 +479,7 @@ dbFetchNCBItaxData <- function(ID) {
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   2.13  UniProtIDmap()  ===================================================
 | 
			
		||||
# ==   3.13  UniProtIDmap()  ===================================================
 | 
			
		||||
UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
 | 
			
		||||
  # Use UniProt ID mapping service to map one or more IDs
 | 
			
		||||
  # Parameters:
 | 
			
		||||
@@ -426,9 +518,144 @@ UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  TESTS  ===============================================================
 | 
			
		||||
# ==   3.14  dbProt2JSON()  ====================================================
 | 
			
		||||
dbProt2JSON <- function(thisProt) {
 | 
			
		||||
  # Extract all protein related data from myDB and return in JSON format.
 | 
			
		||||
 | 
			
		||||
if (FALSE) {
 | 
			
		||||
  thisData <- list()
 | 
			
		||||
 | 
			
		||||
  # add a protein table
 | 
			
		||||
  sel <- which(myDB$protein$name == thisProt)
 | 
			
		||||
  thisData$protein <- myDB$protein[sel, ]
 | 
			
		||||
 | 
			
		||||
  # add a taxonomy table
 | 
			
		||||
  sel <- which(myDB$taxonomy$ID == thisData$protein$taxonomyID)
 | 
			
		||||
  thisData$taxonomy <- myDB$taxonomy[sel, ]
 | 
			
		||||
 | 
			
		||||
  # add the entries for this protein from the  annotation table
 | 
			
		||||
  sel <- which(myDB$annotation$proteinID == thisData$protein$ID)
 | 
			
		||||
  thisData$annotation <- myDB$annotation[sel, ]
 | 
			
		||||
  # our .json convention uses pName and fName as keys, not the db-internal IDs
 | 
			
		||||
  # add empty columns for pName and fName
 | 
			
		||||
  l <- nrow(thisData$annotation)
 | 
			
		||||
  thisData$annotation$pName <- character(l)
 | 
			
		||||
  thisData$annotation$fName <- character(l)
 | 
			
		||||
  # get the appropriate protein and feature names
 | 
			
		||||
  for (i in seq_len(l)) {
 | 
			
		||||
    pID <- thisData$annotation$proteinID[i]
 | 
			
		||||
    sel <- which(myDB$protein$ID == pID)
 | 
			
		||||
    thisData$annotation$pName[i] <- myDB$protein$name[sel]    # store pName
 | 
			
		||||
    fID <- thisData$annotation$featureID[i]
 | 
			
		||||
    sel <- which(myDB$feature$ID == fID)
 | 
			
		||||
    thisData$annotation$fName[i] <- myDB$feature$name[sel]    # store fName
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  # add the corresponding feature table
 | 
			
		||||
  sel <- which(myDB$feature$ID %in% thisData$annotation$featureID)
 | 
			
		||||
  thisData$feature <- myDB$feature[sel, ]
 | 
			
		||||
 | 
			
		||||
  # remove columns that are not going into JSON output
 | 
			
		||||
  thisData$protein$ID           <- NULL
 | 
			
		||||
  thisData$annotation$ID        <- NULL
 | 
			
		||||
  thisData$annotation$proteinID <- NULL
 | 
			
		||||
  thisData$annotation$featureID <- NULL
 | 
			
		||||
  thisData$feature$ID           <- NULL
 | 
			
		||||
 | 
			
		||||
  # create JSON-formatted output
 | 
			
		||||
  # ( jsonlite::prettify() is too  wordy for a compact Wikipage )
 | 
			
		||||
 | 
			
		||||
  out <- character()
 | 
			
		||||
  out <- c(out, '{')
 | 
			
		||||
 | 
			
		||||
  out <- c(out, '  "protein": {')
 | 
			
		||||
  sel <- colnames(thisData$protein) != "sequence"
 | 
			
		||||
  out <- c(out, sprintf("    %s,", dbRow2JSON(thisData$protein[1, sel],
 | 
			
		||||
                                              coll = ",\n    ")))
 | 
			
		||||
  out <- c(out, dbSeq2JSON(thisData$protein$sequence[1]))
 | 
			
		||||
  out <- c(out, '  },')
 | 
			
		||||
 | 
			
		||||
  out <- c(out, '  "taxonomy": {')
 | 
			
		||||
  out <- c(out, sprintf("    %s", dbRow2JSON(thisData$taxonomy)))
 | 
			
		||||
  out <- c(out, '  },')
 | 
			
		||||
 | 
			
		||||
  out <- c(out, '  "annotation": [')
 | 
			
		||||
  for (i in seq_len(nrow(thisData$annotation))) {
 | 
			
		||||
    out <- c(out, sprintf("    {%s},", dbRow2JSON(thisData$annotation[i, ])))
 | 
			
		||||
  }
 | 
			
		||||
  out[length(out)] <- gsub(",$", "", out[length(out)]) # remove last ","
 | 
			
		||||
  out <- c(out, '  ],')
 | 
			
		||||
 | 
			
		||||
  out <- c(out, '  "feature": [')
 | 
			
		||||
  sel <- colnames(thisData$feature) != "description"
 | 
			
		||||
  for (i in seq_len(nrow(thisData$feature))) {
 | 
			
		||||
    out <- c(out, sprintf("    {%s,",
 | 
			
		||||
                          dbRow2JSON(thisData$feature[i, sel])))
 | 
			
		||||
    out <- c(out, sprintf("     %s},",
 | 
			
		||||
                          dbRow2JSON(thisData$feature[i, "description",
 | 
			
		||||
                                                      drop = FALSE])))
 | 
			
		||||
  }
 | 
			
		||||
  out[length(out)] <- gsub(",$", "", out[length(out)]) # remove last ","
 | 
			
		||||
  out <- c(out, '  ]')
 | 
			
		||||
 | 
			
		||||
  out <- c(out, '}')
 | 
			
		||||
 | 
			
		||||
  return(paste0(out, collapse = "\n"))
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   3.15  dbSeq2JSON()  =====================================================
 | 
			
		||||
 | 
			
		||||
dbSeq2JSON <- function(s, nIndents = 4, width = 70) {
 | 
			
		||||
  # Turn a sequence into a JSON key-value pair, with the value being a JSON
 | 
			
		||||
  # array of elements not exceeding a width of "width", and an indent of
 | 
			
		||||
  # "indents" spaces.
 | 
			
		||||
  ind <- paste0(rep(" ", nIndents), collapse = "")
 | 
			
		||||
 | 
			
		||||
  out <- character()
 | 
			
		||||
  out <- c(out, sprintf("%s\"sequence\" : [", ind))
 | 
			
		||||
 | 
			
		||||
  for (i in seq_along(s)) {
 | 
			
		||||
    l <- nchar(s[i])
 | 
			
		||||
    if (l <= width) {
 | 
			
		||||
      out <- c(out, s[i])
 | 
			
		||||
    } else {
 | 
			
		||||
      starts <- seq(1, l, by = width)
 | 
			
		||||
      ends <- seq(width, l, by = width)
 | 
			
		||||
      if (length(ends) < length(starts)) { ends <- c(ends, l) }
 | 
			
		||||
      out <- c(out, sprintf("%s  \"%s\",", ind, substring(s[i], starts, ends)))
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  out[length(out)] <- gsub(",$", "", out[length(out)]) # remove last ","
 | 
			
		||||
 | 
			
		||||
  out <- c(out, sprintf("%s]", ind))
 | 
			
		||||
  return(paste0(out, collapse = "\n"))
 | 
			
		||||
}
 | 
			
		||||
cat(dbSeq2JSON(myDB$protein$sequence[1]))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   3.16  dbRow2JSON()  =====================================================
 | 
			
		||||
 | 
			
		||||
dbRow2JSON <- function(df, coll = ", ") {
 | 
			
		||||
  # Turn a single dataframe row into JSON key value pairs, where the keys are the
 | 
			
		||||
  # column names. Respects character / numeric mode.
 | 
			
		||||
  out <- character()
 | 
			
		||||
  for (i in 1:ncol(df)) {
 | 
			
		||||
    if (class(df[1, i]) == "integer") {
 | 
			
		||||
      val <- sprintf("%d", df[1, i])
 | 
			
		||||
    } else if (class(df[1, i]) == "numeric") {
 | 
			
		||||
      val <- sprintf("%f", df[1, i])
 | 
			
		||||
    } else {
 | 
			
		||||
      val <- sprintf("\"%s\"", as.character(df[1, i]))
 | 
			
		||||
    }
 | 
			
		||||
    out <- c(out, sprintf("\"%s\": %s", colnames(df)[i], val))
 | 
			
		||||
  }
 | 
			
		||||
  return(paste0(out, collapse = coll))
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  TESTS  ===============================================================
 | 
			
		||||
 | 
			
		||||
if (doTESTS) {
 | 
			
		||||
  if (! requireNamespace("testthat", quietly = TRUE)) {
 | 
			
		||||
    install.packages("testthat")
 | 
			
		||||
  }
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user