Add code and utility functions for database export of protein annotations to JSON - towards sharing annotations on the Student Wiki Public page

This commit is contained in:
hyginn 2020-10-13 16:08:35 +10:00
parent 3bee83495f
commit 4d071cf8d5
2 changed files with 344 additions and 82 deletions

View File

@ -3,12 +3,14 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-FUNC-Domain_annotation unit.
#
# Version: 1.2
# Version: 1.3
#
# Date: 2017-11 - 2020-10
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.3 Add code for database export to JSON and instructions
# for uploading annotations to the Public Student Wiki page
# 1.2 Consistently: data in ./myScripts/ ;
# begin SHARING DATA section
# 1.1 2020 Updates
@ -31,12 +33,14 @@
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------------------------------
#TOC> 1 Update your database script 42
#TOC> 1.1 Preparing an annotation file ... 49
#TOC> 1.1.1 BEFORE "BIN-ALI-Optimal_sequence_alignment" 52
#TOC> 1.1.2 AFTER "BIN-ALI-Optimal_sequence_alignment" 97
#TOC> 1.2 Execute and Validate 124
#TOC> 2 Plot Annotations 149
#TOC> 1 Update your database script 48
#TOC> 1.1 Preparing an annotation file ... 55
#TOC> 1.1.1 BEFORE "BIN-ALI-Optimal_sequence_alignment" 58
#TOC> 1.1.2 AFTER "BIN-ALI-Optimal_sequence_alignment" 106
#TOC> 1.2 Execute and Validate 133
#TOC> 2 Plot Annotations 158
#TOC> 3 SHARING DATA 283
#TOC> 3.1 Post MBP1_MYSPE as JSON data 298
#TOC>
#TOC> ==========================================================================
@ -276,16 +280,47 @@ par(oPar) # reset the plot parameters
# It would be better to align the motif borders, at least approximately (not
# all proteins have all motifs). How would you go about doing that?
# = 1 SHARING DATA ======
# = 3 SHARING DATA ========================================================
# It's particularly interesting to compare such annotations across many
# homologous proteins. I have created a file on the Student Wiki that you can
# homologous proteins. I have created a page on the Student Wiki () that you can
# edit, and then download the data from the entire class directly to your
# RStudio project.
#
# I have provided a function that extracts all information that refers to a
# single protein from the database, and prints it out as well-formatted JSON,
# suitable to be pasted into our shareable Wiki-page. There is a fair amount of
# bookkeeping involved, but the code is not otherwise very enlightening so I
# will spare you the details - it's in "./scripts/ABC-dbUtilities.R" if you
# would want to have a look.
# == 3.1 Post MBP1_MYSPE as JSON data ======================================
# Task:
# =====
# TBC ...
# 1: Run the following code:
cat("{{Vspace}}",
"<!-- ==== BEGIN PROTEIN ==== -->",
"<pre>",
dbProt2JSON(sprintf("MBP1_%s", biCode(MYSPE))),
"</pre>",
"<!-- ===== END PROTEIN ====== -->",
"", sep = "\n"
)
# 2: Copy the entire output,
# 3: Navigate to
# http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public
# ... edit the page, and paste your output at the top.
# 4: Save your edits.
# Next, once we have collected a number of protein annotations, we can access
# the page and import the data into our database.
#
# Code to come soon ...
# [END]

View File

@ -1,6 +1,30 @@
# tocID <- "scripts/ABC-dbUtilities.R"
#
# database utilities for ABC learning units
# Purpose: Database utilities for ABC learning units.
#
# Version 2.1
#
# Date: 2017-11 - 2020-10
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 2.1 Add JSON export functions
# 2.0 Test all JSON import and prevent addition of duplicates. This
# is necessary for import of data from the public page
# 1.1 2020 Updates
# 1.0 Live version 2017
#
# Notes:
# There are no functions to modify or delete entries. To do either,
# recreate the database with correct data in the creation script. This is the
# preferred way that ensures the entire database can be reproduced by
# source()'ing its generating script.
#
# Inserting data goes only through the very most minimal validation steps. For
# production applications, more validation would need to be added, as well
# as an overall validation of database integrity
#
# ToDo:
#
# ==============================================================================
@ -8,49 +32,56 @@
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -------------------------------------------------
#TOC> 1 PACKAGES 32
#TOC> 2 FUNCTIONS 50
#TOC> 2.01 dbSanitizeSequence() 53
#TOC> 2.02 dbConfirmUnique() 88
#TOC> 2.03 dbInit() 106
#TOC> 2.04 dbAutoincrement() 147
#TOC> 2.05 dbAddProtein() 160
#TOC> 2.06 dbAddFeature() 180
#TOC> 2.07 dbAddTaxonomy() 199
#TOC> 2.08 dbAddAnnotation() 215
#TOC> 2.09 dbFetchUniProtSeq() 243
#TOC> 2.10 dbFetchPrositeFeatures() 289
#TOC> 2.11 node2text() 339
#TOC> 2.12 dbFetchNCBItaxData() 351
#TOC> 2.13 UniProtIDmap() 390
#TOC> 3 TESTS 429
#TOC> -------------------------------------------------------
#TOC> 1 INITIALISATIONS AND PARAMETERS 60
#TOC> 2 PACKAGES 65
#TOC> 3 FUNCTIONS 81
#TOC> 3.01 dbSanitizeSequence() 84
#TOC> 3.02 dbConfirmUnique() 119
#TOC> 3.03 dbInit() 137
#TOC> 3.04 dbAutoincrement() 177
#TOC> 3.05 dbAddProtein() 190
#TOC> 3.06 dbAddFeature() 222
#TOC> 3.07 dbAddTaxonomy() 253
#TOC> 3.08 dbAddAnnotation() 288
#TOC> 3.09 dbFetchUniProtSeq() 335
#TOC> 3.10 dbFetchPrositeFeatures() 381
#TOC> 3.11 node2text() 431
#TOC> 3.12 dbFetchNCBItaxData() 443
#TOC> 3.13 UniProtIDmap() 482
#TOC> 3.14 dbProt2JSON() 521
#TOC> 3.15 dbSeq2JSON() 606
#TOC> 3.16 dbRow2JSON() 636
#TOC> 4 TESTS 656
#TOC>
#TOC> ==========================================================================
# = 1 PACKAGES ============================================================
# = 1 INITIALISATIONS AND PARAMETERS ======================================
doTESTS <- FALSE # run tests if TRUE
# = 2 PACKAGES ============================================================
if (! requireNamespace("jsonlite", quietly = TRUE)) {
install.packages("jsonlite")
}
if (! requireNamespace("httr", quietly = TRUE)) {
install.packages("httr")
}
if (! requireNamespace("xml2", quietly = TRUE)) {
install.packages("xml2")
}
# = 2 FUNCTIONS ===========================================================
# = 3 FUNCTIONS ===========================================================
# == 2.01 dbSanitizeSequence() =============================================
# == 3.01 dbSanitizeSequence() =============================================
dbSanitizeSequence <- function(s, unambiguous = TRUE) {
# Remove FASTA header lines, if any,
# flatten any structure that s has,
@ -85,7 +116,7 @@ dbSanitizeSequence <- function(s, unambiguous = TRUE) {
}
# == 2.02 dbConfirmUnique() ================================================
# == 3.02 dbConfirmUnique() ================================================
dbConfirmUnique <- function(x) {
# x is a vector of logicals.
# returns x if x has exactly one TRUE element.
@ -103,10 +134,10 @@ dbConfirmUnique <- function(x) {
}
# == 2.03 dbInit() =========================================================
# == 3.03 dbInit() =========================================================
dbInit <- function() {
# Return an empty instance of the protein database
# Open the link and study the schema:
# The schema is here:
# https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0
db <- list()
@ -125,7 +156,6 @@ dbInit <- function() {
ID = numeric(),
species = character())
db$annotation <- data.frame(
ID = numeric(),
proteinID = numeric(),
@ -144,7 +174,7 @@ dbInit <- function() {
}
# == 2.04 dbAutoincrement() ================================================
# == 3.04 dbAutoincrement() ================================================
dbAutoincrement <- function(tb) {
# Return a unique integer that can be used as a primary key
# Value:
@ -157,14 +187,25 @@ dbAutoincrement <- function(tb) {
}
# == 2.05 dbAddProtein() ===================================================
# == 3.05 dbAddProtein() ===================================================
dbAddProtein <- function(db, jsonDF) {
# Add one or more protein entries to the database db.
# Add one or more protein entries to the database db if a protein with the
# same name does not yet exist. This enforces that protein names are unique.
# Parameters:
# db list a database created with dbInit()
# jsonDF data frame protein data imported into a data frame with
# fromJSON()
for (i in seq_len(nrow(jsonDF))) {
if (jsonDF$name[i] %in% db$protein$name) {
cat(sprintf("Note: Protein No. %d in the input is \"%s\", but %s.\n",
i, jsonDF$name[i],
"a protein with this name already exists in the database. ",
"Skipping this input."))
isValid <- FALSE
}
if (isValid) {
x <- data.frame(ID = dbAutoincrement(db$protein),
name = jsonDF$name[i],
RefSeqID = jsonDF$RefSeqID[i],
@ -173,18 +214,30 @@ dbAddProtein <- function(db, jsonDF) {
sequence = dbSanitizeSequence(jsonDF$sequence[i]))
db$protein <- rbind(db$protein, x)
}
}
return(db)
}
# == 2.06 dbAddFeature() ===================================================
# == 3.06 dbAddFeature() ===================================================
dbAddFeature <- function(db, jsonDF) {
# Add one or more feature entries to the database db.
# Add one or more feature entries to the database db. Skip if a feature with
# the same name already exists.
# Parameters:
# db list a database created with dbInit()
# jsonDF data frame feature data imported into a data frame with
# fromJSON()
for (i in seq_len(nrow(jsonDF))) {
isValid <- TRUE
if (jsonDF$name[i] %in% db$feature$name) {
cat(sprintf("Note: Feature No. %d in the input is \"%s\", but %s.\n",
i, jsonDF$name[i],
"a feature with this name already exists in the database. ",
"Skipping this input."))
isValid <- FALSE
}
if (isVALID) {
x <- data.frame(ID = dbAutoincrement(db$feature),
name = jsonDF$name[i],
description = jsonDF$description[i],
@ -192,43 +245,81 @@ dbAddFeature <- function(db, jsonDF) {
accession = jsonDF$accession[i])
db$feature <- rbind(db$feature, x)
}
}
return(db)
}
# == 2.07 dbAddTaxonomy() ==================================================
# == 3.07 dbAddTaxonomy() ==================================================
dbAddTaxonomy <- function(db, jsonDF) {
# Add one or more taxonomy entries to the database db.
# Add one or more taxonomy entries to the database db. Skip if species name
# or taxonomy ID already exist in the database.
# Parameters:
# db list A database created with dbInit()
# jsonDF data frame Taxonomy data imported into a data frame with
# fromJSON()
for (i in seq_len(nrow(jsonDF))) {
isValid <- TRUE
if (jsonDF$species[i] %in% db$taxonomy$species) {
cat(sprintf("Note: Species No. %d in the input is \"%s\", but %s%s\n",
i, jsonDF$name[i],
"a species with this name already exists in the database. ",
"Skipping this input."))
isValid <- FALSE
} else if (jsonDF$ID[i] %in% db$taxonomy$ID) {
cat(sprintf("Note: Taxonomy ID No. %d in the input is \"%d\", but %s%s\n",
i, jsonDF$ID[i],
"this taxonomy ID already exists in the database. ",
"Skipping this input."))
isValid <- FALSE
}
if (isValid) {
x <- data.frame(
ID = jsonDF$ID[i],
ID = as.integer(jsonDF$ID[i]),
species = jsonDF$species[i])
db$taxonomy <- rbind(db$taxonomy, x)
}
}
return(db)
}
# == 2.08 dbAddAnnotation() ================================================
# == 3.08 dbAddAnnotation() ================================================
dbAddAnnotation <- function(db, jsonDF) {
# Add one or more annotation entries to the database db.
# Add one or more annotation entries to the database db. Skip the entry if
# it already exists in the database.
# Parameters:
# db list a database created with dbInit()
# jsonDF data frame annotation data imported into a data frame with
# fromJSON()
for (i in seq_len(nrow(jsonDF))) {
isValid <- TRUE
sel <- jsonDF$pName[i] == db$protein$name
sel <- dbConfirmUnique(sel)
sel <- dbConfirmUnique(sel) # Confirm that this protein ID exists
pID <- db$protein$ID[sel]
sel <- jsonDF$fName[i] == db$feature$name
sel <- dbConfirmUnique(sel)
sel <- dbConfirmUnique(sel) # Confirm that this feature ID exists
fID <- db$feature$ID[sel]
sel <- db$annotation$proteinID == pID &
db$annotation$featureID == fID &
db$annotation$start == as.integer(jsonDF$start[idx]) &
db$annotation$end == as.integer(jsonDF$end[idx])
if (any(sel)) {
cat(sprintf("Note: annotation No. %d in the input has %s%s%\n",
i,
"the same protein name, feature name, start, and end ",
"as one that already exists in the database. ",
"Skipping this input."))
isValid <- FALSE
}
if (isValid) {
x <- data.frame(ID = dbAutoincrement(db$annotation),
proteinID = pID,
featureID = fID,
@ -236,11 +327,12 @@ dbAddAnnotation <- function(db, jsonDF) {
end = as.integer(jsonDF$end[i]))
db$annotation <- rbind(db$annotation, x)
}
}
return(db)
}
# == 2.09 dbFetchUniProtSeq() ==============================================
# == 3.09 dbFetchUniProtSeq() ==============================================
dbFetchUniProtSeq <- function(IDs) {
# Fetch a protein sequence from UniProt.
# Parameters:
@ -286,7 +378,7 @@ if (FALSE) {
# == 2.10 dbFetchPrositeFeatures() =========================================
# == 3.10 dbFetchPrositeFeatures() =========================================
dbFetchPrositeFeatures <- function(ID) {
# Fetch feature annotations from ScanProsite.
# Parameters:
@ -336,7 +428,7 @@ if (FALSE) {
}
# == 2.11 node2text() ======================================================
# == 3.11 node2text() ======================================================
node2text <- function(doc, tag) {
# an extractor function for the contents of elements
# between given tags in an XML response.
@ -348,7 +440,7 @@ node2text <- function(doc, tag) {
}
# == 2.12 dbFetchNCBItaxData() =============================================
# == 3.12 dbFetchNCBItaxData() =============================================
dbFetchNCBItaxData <- function(ID) {
# Fetch feature taxID and Organism from the NCBI.
# Parameters:
@ -387,7 +479,7 @@ dbFetchNCBItaxData <- function(ID) {
# == 2.13 UniProtIDmap() ===================================================
# == 3.13 UniProtIDmap() ===================================================
UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
# Use UniProt ID mapping service to map one or more IDs
# Parameters:
@ -426,9 +518,144 @@ UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
}
# = 3 TESTS ===============================================================
# == 3.14 dbProt2JSON() ====================================================
dbProt2JSON <- function(thisProt) {
# Extract all protein related data from myDB and return in JSON format.
if (FALSE) {
thisData <- list()
# add a protein table
sel <- which(myDB$protein$name == thisProt)
thisData$protein <- myDB$protein[sel, ]
# add a taxonomy table
sel <- which(myDB$taxonomy$ID == thisData$protein$taxonomyID)
thisData$taxonomy <- myDB$taxonomy[sel, ]
# add the entries for this protein from the annotation table
sel <- which(myDB$annotation$proteinID == thisData$protein$ID)
thisData$annotation <- myDB$annotation[sel, ]
# our .json convention uses pName and fName as keys, not the db-internal IDs
# add empty columns for pName and fName
l <- nrow(thisData$annotation)
thisData$annotation$pName <- character(l)
thisData$annotation$fName <- character(l)
# get the appropriate protein and feature names
for (i in seq_len(l)) {
pID <- thisData$annotation$proteinID[i]
sel <- which(myDB$protein$ID == pID)
thisData$annotation$pName[i] <- myDB$protein$name[sel] # store pName
fID <- thisData$annotation$featureID[i]
sel <- which(myDB$feature$ID == fID)
thisData$annotation$fName[i] <- myDB$feature$name[sel] # store fName
}
# add the corresponding feature table
sel <- which(myDB$feature$ID %in% thisData$annotation$featureID)
thisData$feature <- myDB$feature[sel, ]
# remove columns that are not going into JSON output
thisData$protein$ID <- NULL
thisData$annotation$ID <- NULL
thisData$annotation$proteinID <- NULL
thisData$annotation$featureID <- NULL
thisData$feature$ID <- NULL
# create JSON-formatted output
# ( jsonlite::prettify() is too wordy for a compact Wikipage )
out <- character()
out <- c(out, '{')
out <- c(out, ' "protein": {')
sel <- colnames(thisData$protein) != "sequence"
out <- c(out, sprintf(" %s,", dbRow2JSON(thisData$protein[1, sel],
coll = ",\n ")))
out <- c(out, dbSeq2JSON(thisData$protein$sequence[1]))
out <- c(out, ' },')
out <- c(out, ' "taxonomy": {')
out <- c(out, sprintf(" %s", dbRow2JSON(thisData$taxonomy)))
out <- c(out, ' },')
out <- c(out, ' "annotation": [')
for (i in seq_len(nrow(thisData$annotation))) {
out <- c(out, sprintf(" {%s},", dbRow2JSON(thisData$annotation[i, ])))
}
out[length(out)] <- gsub(",$", "", out[length(out)]) # remove last ","
out <- c(out, ' ],')
out <- c(out, ' "feature": [')
sel <- colnames(thisData$feature) != "description"
for (i in seq_len(nrow(thisData$feature))) {
out <- c(out, sprintf(" {%s,",
dbRow2JSON(thisData$feature[i, sel])))
out <- c(out, sprintf(" %s},",
dbRow2JSON(thisData$feature[i, "description",
drop = FALSE])))
}
out[length(out)] <- gsub(",$", "", out[length(out)]) # remove last ","
out <- c(out, ' ]')
out <- c(out, '}')
return(paste0(out, collapse = "\n"))
}
# == 3.15 dbSeq2JSON() =====================================================
dbSeq2JSON <- function(s, nIndents = 4, width = 70) {
# Turn a sequence into a JSON key-value pair, with the value being a JSON
# array of elements not exceeding a width of "width", and an indent of
# "indents" spaces.
ind <- paste0(rep(" ", nIndents), collapse = "")
out <- character()
out <- c(out, sprintf("%s\"sequence\" : [", ind))
for (i in seq_along(s)) {
l <- nchar(s[i])
if (l <= width) {
out <- c(out, s[i])
} else {
starts <- seq(1, l, by = width)
ends <- seq(width, l, by = width)
if (length(ends) < length(starts)) { ends <- c(ends, l) }
out <- c(out, sprintf("%s \"%s\",", ind, substring(s[i], starts, ends)))
}
}
out[length(out)] <- gsub(",$", "", out[length(out)]) # remove last ","
out <- c(out, sprintf("%s]", ind))
return(paste0(out, collapse = "\n"))
}
cat(dbSeq2JSON(myDB$protein$sequence[1]))
# == 3.16 dbRow2JSON() =====================================================
dbRow2JSON <- function(df, coll = ", ") {
# Turn a single dataframe row into JSON key value pairs, where the keys are the
# column names. Respects character / numeric mode.
out <- character()
for (i in 1:ncol(df)) {
if (class(df[1, i]) == "integer") {
val <- sprintf("%d", df[1, i])
} else if (class(df[1, i]) == "numeric") {
val <- sprintf("%f", df[1, i])
} else {
val <- sprintf("\"%s\"", as.character(df[1, i]))
}
out <- c(out, sprintf("\"%s\": %s", colnames(df)[i], val))
}
return(paste0(out, collapse = coll))
}
# = 4 TESTS ===============================================================
if (doTESTS) {
if (! requireNamespace("testthat", quietly = TRUE)) {
install.packages("testthat")
}