From 4d071cf8d57fc6ece7b30eeaa9d1a9ae792da79f Mon Sep 17 00:00:00 2001 From: hyginn Date: Tue, 13 Oct 2020 16:08:35 +1000 Subject: [PATCH] Add code and utility functions for database export of protein annotations to JSON - towards sharing annotations on the Student Wiki Public page --- BIN-FUNC-Domain_annotation.R | 61 ++++-- scripts/ABC-dbUtilities.R | 365 ++++++++++++++++++++++++++++------- 2 files changed, 344 insertions(+), 82 deletions(-) diff --git a/BIN-FUNC-Domain_annotation.R b/BIN-FUNC-Domain_annotation.R index a6daa7b..6c1a4d9 100644 --- a/BIN-FUNC-Domain_annotation.R +++ b/BIN-FUNC-Domain_annotation.R @@ -3,12 +3,14 @@ # Purpose: A Bioinformatics Course: # R code accompanying the BIN-FUNC-Domain_annotation unit. # -# Version: 1.2 +# Version: 1.3 # # Date: 2017-11 - 2020-10 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: +# 1.3 Add code for database export to JSON and instructions +# for uploading annotations to the Public Student Wiki page # 1.2 Consistently: data in ./myScripts/ ; # begin SHARING DATA section # 1.1 2020 Updates @@ -28,16 +30,18 @@ #TOC> ========================================================================== -#TOC> +#TOC> #TOC> Section Title Line #TOC> --------------------------------------------------------------------- -#TOC> 1 Update your database script 42 -#TOC> 1.1 Preparing an annotation file ... 49 -#TOC> 1.1.1 BEFORE "BIN-ALI-Optimal_sequence_alignment" 52 -#TOC> 1.1.2 AFTER "BIN-ALI-Optimal_sequence_alignment" 97 -#TOC> 1.2 Execute and Validate 124 -#TOC> 2 Plot Annotations 149 -#TOC> +#TOC> 1 Update your database script 48 +#TOC> 1.1 Preparing an annotation file ... 55 +#TOC> 1.1.1 BEFORE "BIN-ALI-Optimal_sequence_alignment" 58 +#TOC> 1.1.2 AFTER "BIN-ALI-Optimal_sequence_alignment" 106 +#TOC> 1.2 Execute and Validate 133 +#TOC> 2 Plot Annotations 158 +#TOC> 3 SHARING DATA 283 +#TOC> 3.1 Post MBP1_MYSPE as JSON data 298 +#TOC> #TOC> ========================================================================== @@ -99,7 +103,7 @@ # Then SKIP the next section. # # -# === 1.1.2 AFTER "BIN-ALI-Optimal_sequence_alignment" +# === 1.1.2 AFTER "BIN-ALI-Optimal_sequence_alignment" # # IF YOU HAVE ALREADY COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT: # @@ -276,16 +280,47 @@ par(oPar) # reset the plot parameters # It would be better to align the motif borders, at least approximately (not # all proteins have all motifs). How would you go about doing that? -# = 1 SHARING DATA ====== +# = 3 SHARING DATA ======================================================== # It's particularly interesting to compare such annotations across many -# homologous proteins. I have created a file on the Student Wiki that you can +# homologous proteins. I have created a page on the Student Wiki () that you can # edit, and then download the data from the entire class directly to your # RStudio project. # + +# I have provided a function that extracts all information that refers to a +# single protein from the database, and prints it out as well-formatted JSON, +# suitable to be pasted into our shareable Wiki-page. There is a fair amount of +# bookkeeping involved, but the code is not otherwise very enlightening so I +# will spare you the details - it's in "./scripts/ABC-dbUtilities.R" if you +# would want to have a look. + +# == 3.1 Post MBP1_MYSPE as JSON data ====================================== + # Task: # ===== -# TBC ... +# 1: Run the following code: + +cat("{{Vspace}}", + "", + "
",
+    dbProt2JSON(sprintf("MBP1_%s", biCode(MYSPE))),
+    "
", + "", + "", sep = "\n" +) + +# 2: Copy the entire output, +# 3: Navigate to +# http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public +# ... edit the page, and paste your output at the top. +# 4: Save your edits. + +# Next, once we have collected a number of protein annotations, we can access +# the page and import the data into our database. +# +# Code to come soon ... + # [END] diff --git a/scripts/ABC-dbUtilities.R b/scripts/ABC-dbUtilities.R index be4e800..c5b8a9b 100644 --- a/scripts/ABC-dbUtilities.R +++ b/scripts/ABC-dbUtilities.R @@ -1,56 +1,87 @@ # tocID <- "scripts/ABC-dbUtilities.R" # -# database utilities for ABC learning units +# Purpose: Database utilities for ABC learning units. +# +# Version 2.1 +# +# Date: 2017-11 - 2020-10 +# Author: Boris Steipe (boris.steipe@utoronto.ca) +# +# Versions: +# 2.1 Add JSON export functions +# 2.0 Test all JSON import and prevent addition of duplicates. This +# is necessary for import of data from the public page +# 1.1 2020 Updates +# 1.0 Live version 2017 +# +# Notes: +# There are no functions to modify or delete entries. To do either, +# recreate the database with correct data in the creation script. This is the +# preferred way that ensures the entire database can be reproduced by +# source()'ing its generating script. +# +# Inserting data goes only through the very most minimal validation steps. For +# production applications, more validation would need to be added, as well +# as an overall validation of database integrity +# +# ToDo: # # ============================================================================== #TOC> ========================================================================== #TOC> -#TOC> Section Title Line -#TOC> ------------------------------------------------- -#TOC> 1 PACKAGES 32 -#TOC> 2 FUNCTIONS 50 -#TOC> 2.01 dbSanitizeSequence() 53 -#TOC> 2.02 dbConfirmUnique() 88 -#TOC> 2.03 dbInit() 106 -#TOC> 2.04 dbAutoincrement() 147 -#TOC> 2.05 dbAddProtein() 160 -#TOC> 2.06 dbAddFeature() 180 -#TOC> 2.07 dbAddTaxonomy() 199 -#TOC> 2.08 dbAddAnnotation() 215 -#TOC> 2.09 dbFetchUniProtSeq() 243 -#TOC> 2.10 dbFetchPrositeFeatures() 289 -#TOC> 2.11 node2text() 339 -#TOC> 2.12 dbFetchNCBItaxData() 351 -#TOC> 2.13 UniProtIDmap() 390 -#TOC> 3 TESTS 429 +#TOC> Section Title Line +#TOC> ------------------------------------------------------- +#TOC> 1 INITIALISATIONS AND PARAMETERS 60 +#TOC> 2 PACKAGES 65 +#TOC> 3 FUNCTIONS 81 +#TOC> 3.01 dbSanitizeSequence() 84 +#TOC> 3.02 dbConfirmUnique() 119 +#TOC> 3.03 dbInit() 137 +#TOC> 3.04 dbAutoincrement() 177 +#TOC> 3.05 dbAddProtein() 190 +#TOC> 3.06 dbAddFeature() 222 +#TOC> 3.07 dbAddTaxonomy() 253 +#TOC> 3.08 dbAddAnnotation() 288 +#TOC> 3.09 dbFetchUniProtSeq() 335 +#TOC> 3.10 dbFetchPrositeFeatures() 381 +#TOC> 3.11 node2text() 431 +#TOC> 3.12 dbFetchNCBItaxData() 443 +#TOC> 3.13 UniProtIDmap() 482 +#TOC> 3.14 dbProt2JSON() 521 +#TOC> 3.15 dbSeq2JSON() 606 +#TOC> 3.16 dbRow2JSON() 636 +#TOC> 4 TESTS 656 #TOC> #TOC> ========================================================================== -# = 1 PACKAGES ============================================================ +# = 1 INITIALISATIONS AND PARAMETERS ====================================== + +doTESTS <- FALSE # run tests if TRUE + + +# = 2 PACKAGES ============================================================ if (! requireNamespace("jsonlite", quietly = TRUE)) { install.packages("jsonlite") } - if (! requireNamespace("httr", quietly = TRUE)) { install.packages("httr") } - if (! requireNamespace("xml2", quietly = TRUE)) { install.packages("xml2") } -# = 2 FUNCTIONS =========================================================== +# = 3 FUNCTIONS =========================================================== -# == 2.01 dbSanitizeSequence() ============================================= +# == 3.01 dbSanitizeSequence() ============================================= dbSanitizeSequence <- function(s, unambiguous = TRUE) { # Remove FASTA header lines, if any, # flatten any structure that s has, @@ -85,7 +116,7 @@ dbSanitizeSequence <- function(s, unambiguous = TRUE) { } -# == 2.02 dbConfirmUnique() ================================================ +# == 3.02 dbConfirmUnique() ================================================ dbConfirmUnique <- function(x) { # x is a vector of logicals. # returns x if x has exactly one TRUE element. @@ -103,10 +134,10 @@ dbConfirmUnique <- function(x) { } -# == 2.03 dbInit() ========================================================= +# == 3.03 dbInit() ========================================================= dbInit <- function() { # Return an empty instance of the protein database - # Open the link and study the schema: + # The schema is here: # https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0 db <- list() @@ -125,7 +156,6 @@ dbInit <- function() { ID = numeric(), species = character()) - db$annotation <- data.frame( ID = numeric(), proteinID = numeric(), @@ -144,7 +174,7 @@ dbInit <- function() { } -# == 2.04 dbAutoincrement() ================================================ +# == 3.04 dbAutoincrement() ================================================ dbAutoincrement <- function(tb) { # Return a unique integer that can be used as a primary key # Value: @@ -157,90 +187,152 @@ dbAutoincrement <- function(tb) { } -# == 2.05 dbAddProtein() =================================================== +# == 3.05 dbAddProtein() =================================================== dbAddProtein <- function(db, jsonDF) { - # Add one or more protein entries to the database db. + # Add one or more protein entries to the database db if a protein with the + # same name does not yet exist. This enforces that protein names are unique. # Parameters: # db list a database created with dbInit() # jsonDF data frame protein data imported into a data frame with # fromJSON() + for (i in seq_len(nrow(jsonDF))) { - x <- data.frame(ID = dbAutoincrement(db$protein), - name = jsonDF$name[i], - RefSeqID = jsonDF$RefSeqID[i], - UniProtID = jsonDF$UniProtID[i], - taxonomyID = jsonDF$taxonomyID[i], - sequence = dbSanitizeSequence(jsonDF$sequence[i])) - db$protein <- rbind(db$protein, x) + if (jsonDF$name[i] %in% db$protein$name) { + cat(sprintf("Note: Protein No. %d in the input is \"%s\", but %s.\n", + i, jsonDF$name[i], + "a protein with this name already exists in the database. ", + "Skipping this input.")) + isValid <- FALSE + } + + if (isValid) { + x <- data.frame(ID = dbAutoincrement(db$protein), + name = jsonDF$name[i], + RefSeqID = jsonDF$RefSeqID[i], + UniProtID = jsonDF$UniProtID[i], + taxonomyID = jsonDF$taxonomyID[i], + sequence = dbSanitizeSequence(jsonDF$sequence[i])) + db$protein <- rbind(db$protein, x) + } } return(db) } -# == 2.06 dbAddFeature() =================================================== +# == 3.06 dbAddFeature() =================================================== dbAddFeature <- function(db, jsonDF) { - # Add one or more feature entries to the database db. + # Add one or more feature entries to the database db. Skip if a feature with + # the same name already exists. # Parameters: # db list a database created with dbInit() # jsonDF data frame feature data imported into a data frame with # fromJSON() for (i in seq_len(nrow(jsonDF))) { - x <- data.frame(ID = dbAutoincrement(db$feature), - name = jsonDF$name[i], - description = jsonDF$description[i], - sourceDB = jsonDF$sourceDB[i], - accession = jsonDF$accession[i]) - db$feature <- rbind(db$feature, x) + isValid <- TRUE + if (jsonDF$name[i] %in% db$feature$name) { + cat(sprintf("Note: Feature No. %d in the input is \"%s\", but %s.\n", + i, jsonDF$name[i], + "a feature with this name already exists in the database. ", + "Skipping this input.")) + isValid <- FALSE + } + + if (isVALID) { + x <- data.frame(ID = dbAutoincrement(db$feature), + name = jsonDF$name[i], + description = jsonDF$description[i], + sourceDB = jsonDF$sourceDB[i], + accession = jsonDF$accession[i]) + db$feature <- rbind(db$feature, x) + } } return(db) } -# == 2.07 dbAddTaxonomy() ================================================== +# == 3.07 dbAddTaxonomy() ================================================== dbAddTaxonomy <- function(db, jsonDF) { - # Add one or more taxonomy entries to the database db. + # Add one or more taxonomy entries to the database db. Skip if species name + # or taxonomy ID already exist in the database. # Parameters: # db list A database created with dbInit() # jsonDF data frame Taxonomy data imported into a data frame with # fromJSON() for (i in seq_len(nrow(jsonDF))) { - x <- data.frame( - ID = jsonDF$ID[i], - species = jsonDF$species[i]) - db$taxonomy <- rbind(db$taxonomy, x) + isValid <- TRUE + + if (jsonDF$species[i] %in% db$taxonomy$species) { + cat(sprintf("Note: Species No. %d in the input is \"%s\", but %s%s\n", + i, jsonDF$name[i], + "a species with this name already exists in the database. ", + "Skipping this input.")) + isValid <- FALSE + } else if (jsonDF$ID[i] %in% db$taxonomy$ID) { + cat(sprintf("Note: Taxonomy ID No. %d in the input is \"%d\", but %s%s\n", + i, jsonDF$ID[i], + "this taxonomy ID already exists in the database. ", + "Skipping this input.")) + isValid <- FALSE + } + if (isValid) { + x <- data.frame( + ID = as.integer(jsonDF$ID[i]), + species = jsonDF$species[i]) + db$taxonomy <- rbind(db$taxonomy, x) + } } return(db) } -# == 2.08 dbAddAnnotation() ================================================ + +# == 3.08 dbAddAnnotation() ================================================ dbAddAnnotation <- function(db, jsonDF) { - # Add one or more annotation entries to the database db. + # Add one or more annotation entries to the database db. Skip the entry if + # it already exists in the database. # Parameters: # db list a database created with dbInit() # jsonDF data frame annotation data imported into a data frame with # fromJSON() for (i in seq_len(nrow(jsonDF))) { + isValid <- TRUE sel <- jsonDF$pName[i] == db$protein$name - sel <- dbConfirmUnique(sel) + sel <- dbConfirmUnique(sel) # Confirm that this protein ID exists pID <- db$protein$ID[sel] sel <- jsonDF$fName[i] == db$feature$name - sel <- dbConfirmUnique(sel) + sel <- dbConfirmUnique(sel) # Confirm that this feature ID exists fID <- db$feature$ID[sel] - x <- data.frame(ID = dbAutoincrement(db$annotation), - proteinID = pID, - featureID = fID, - start = as.integer(jsonDF$start[i]), - end = as.integer(jsonDF$end[i])) - db$annotation <- rbind(db$annotation, x) + sel <- db$annotation$proteinID == pID & + db$annotation$featureID == fID & + db$annotation$start == as.integer(jsonDF$start[idx]) & + db$annotation$end == as.integer(jsonDF$end[idx]) + + if (any(sel)) { + cat(sprintf("Note: annotation No. %d in the input has %s%s%\n", + i, + "the same protein name, feature name, start, and end ", + "as one that already exists in the database. ", + "Skipping this input.")) + + isValid <- FALSE + } + + if (isValid) { + x <- data.frame(ID = dbAutoincrement(db$annotation), + proteinID = pID, + featureID = fID, + start = as.integer(jsonDF$start[i]), + end = as.integer(jsonDF$end[i])) + db$annotation <- rbind(db$annotation, x) + } } return(db) } -# == 2.09 dbFetchUniProtSeq() ============================================== +# == 3.09 dbFetchUniProtSeq() ============================================== dbFetchUniProtSeq <- function(IDs) { # Fetch a protein sequence from UniProt. # Parameters: @@ -286,7 +378,7 @@ if (FALSE) { -# == 2.10 dbFetchPrositeFeatures() ========================================= +# == 3.10 dbFetchPrositeFeatures() ========================================= dbFetchPrositeFeatures <- function(ID) { # Fetch feature annotations from ScanProsite. # Parameters: @@ -336,7 +428,7 @@ if (FALSE) { } -# == 2.11 node2text() ====================================================== +# == 3.11 node2text() ====================================================== node2text <- function(doc, tag) { # an extractor function for the contents of elements # between given tags in an XML response. @@ -348,7 +440,7 @@ node2text <- function(doc, tag) { } -# == 2.12 dbFetchNCBItaxData() ============================================= +# == 3.12 dbFetchNCBItaxData() ============================================= dbFetchNCBItaxData <- function(ID) { # Fetch feature taxID and Organism from the NCBI. # Parameters: @@ -387,7 +479,7 @@ dbFetchNCBItaxData <- function(ID) { -# == 2.13 UniProtIDmap() =================================================== +# == 3.13 UniProtIDmap() =================================================== UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") { # Use UniProt ID mapping service to map one or more IDs # Parameters: @@ -426,9 +518,144 @@ UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") { } -# = 3 TESTS =============================================================== +# == 3.14 dbProt2JSON() ==================================================== +dbProt2JSON <- function(thisProt) { + # Extract all protein related data from myDB and return in JSON format. -if (FALSE) { + thisData <- list() + + # add a protein table + sel <- which(myDB$protein$name == thisProt) + thisData$protein <- myDB$protein[sel, ] + + # add a taxonomy table + sel <- which(myDB$taxonomy$ID == thisData$protein$taxonomyID) + thisData$taxonomy <- myDB$taxonomy[sel, ] + + # add the entries for this protein from the annotation table + sel <- which(myDB$annotation$proteinID == thisData$protein$ID) + thisData$annotation <- myDB$annotation[sel, ] + # our .json convention uses pName and fName as keys, not the db-internal IDs + # add empty columns for pName and fName + l <- nrow(thisData$annotation) + thisData$annotation$pName <- character(l) + thisData$annotation$fName <- character(l) + # get the appropriate protein and feature names + for (i in seq_len(l)) { + pID <- thisData$annotation$proteinID[i] + sel <- which(myDB$protein$ID == pID) + thisData$annotation$pName[i] <- myDB$protein$name[sel] # store pName + fID <- thisData$annotation$featureID[i] + sel <- which(myDB$feature$ID == fID) + thisData$annotation$fName[i] <- myDB$feature$name[sel] # store fName + } + + # add the corresponding feature table + sel <- which(myDB$feature$ID %in% thisData$annotation$featureID) + thisData$feature <- myDB$feature[sel, ] + + # remove columns that are not going into JSON output + thisData$protein$ID <- NULL + thisData$annotation$ID <- NULL + thisData$annotation$proteinID <- NULL + thisData$annotation$featureID <- NULL + thisData$feature$ID <- NULL + + # create JSON-formatted output + # ( jsonlite::prettify() is too wordy for a compact Wikipage ) + + out <- character() + out <- c(out, '{') + + out <- c(out, ' "protein": {') + sel <- colnames(thisData$protein) != "sequence" + out <- c(out, sprintf(" %s,", dbRow2JSON(thisData$protein[1, sel], + coll = ",\n "))) + out <- c(out, dbSeq2JSON(thisData$protein$sequence[1])) + out <- c(out, ' },') + + out <- c(out, ' "taxonomy": {') + out <- c(out, sprintf(" %s", dbRow2JSON(thisData$taxonomy))) + out <- c(out, ' },') + + out <- c(out, ' "annotation": [') + for (i in seq_len(nrow(thisData$annotation))) { + out <- c(out, sprintf(" {%s},", dbRow2JSON(thisData$annotation[i, ]))) + } + out[length(out)] <- gsub(",$", "", out[length(out)]) # remove last "," + out <- c(out, ' ],') + + out <- c(out, ' "feature": [') + sel <- colnames(thisData$feature) != "description" + for (i in seq_len(nrow(thisData$feature))) { + out <- c(out, sprintf(" {%s,", + dbRow2JSON(thisData$feature[i, sel]))) + out <- c(out, sprintf(" %s},", + dbRow2JSON(thisData$feature[i, "description", + drop = FALSE]))) + } + out[length(out)] <- gsub(",$", "", out[length(out)]) # remove last "," + out <- c(out, ' ]') + + out <- c(out, '}') + + return(paste0(out, collapse = "\n")) +} + + +# == 3.15 dbSeq2JSON() ===================================================== + +dbSeq2JSON <- function(s, nIndents = 4, width = 70) { + # Turn a sequence into a JSON key-value pair, with the value being a JSON + # array of elements not exceeding a width of "width", and an indent of + # "indents" spaces. + ind <- paste0(rep(" ", nIndents), collapse = "") + + out <- character() + out <- c(out, sprintf("%s\"sequence\" : [", ind)) + + for (i in seq_along(s)) { + l <- nchar(s[i]) + if (l <= width) { + out <- c(out, s[i]) + } else { + starts <- seq(1, l, by = width) + ends <- seq(width, l, by = width) + if (length(ends) < length(starts)) { ends <- c(ends, l) } + out <- c(out, sprintf("%s \"%s\",", ind, substring(s[i], starts, ends))) + } + } + out[length(out)] <- gsub(",$", "", out[length(out)]) # remove last "," + + out <- c(out, sprintf("%s]", ind)) + return(paste0(out, collapse = "\n")) +} +cat(dbSeq2JSON(myDB$protein$sequence[1])) + + +# == 3.16 dbRow2JSON() ===================================================== + +dbRow2JSON <- function(df, coll = ", ") { + # Turn a single dataframe row into JSON key value pairs, where the keys are the + # column names. Respects character / numeric mode. + out <- character() + for (i in 1:ncol(df)) { + if (class(df[1, i]) == "integer") { + val <- sprintf("%d", df[1, i]) + } else if (class(df[1, i]) == "numeric") { + val <- sprintf("%f", df[1, i]) + } else { + val <- sprintf("\"%s\"", as.character(df[1, i])) + } + out <- c(out, sprintf("\"%s\": %s", colnames(df)[i], val)) + } + return(paste0(out, collapse = coll)) +} + + +# = 4 TESTS =============================================================== + +if (doTESTS) { if (! requireNamespace("testthat", quietly = TRUE)) { install.packages("testthat") }