670 lines
22 KiB
R
670 lines
22 KiB
R
# tocID <- "scripts/ABC-dbUtilities.R"
|
|
#
|
|
# Purpose: Database utilities for ABC learning units.
|
|
#
|
|
# Version 2.2
|
|
#
|
|
# Date: 2017-11 - 2020-10
|
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
|
#
|
|
# Versions:
|
|
# 2.2 Bugfixes
|
|
# 2.1 Add JSON export functions
|
|
# 2.0 Test all JSON import and prevent addition of duplicates. This
|
|
# is necessary for import of data from the public page
|
|
# 1.1 2020 Updates
|
|
# 1.0 Live version 2017
|
|
#
|
|
# Notes:
|
|
# There are no functions to modify or delete entries. To do either,
|
|
# recreate the database with correct data in the creation script. This is the
|
|
# preferred way that ensures the entire database can be reproduced by
|
|
# source()'ing its generating script.
|
|
#
|
|
# Inserting data goes only through the very most minimal validation steps. For
|
|
# production applications, more validation would need to be added, as well
|
|
# as an overall validation of database integrity
|
|
#
|
|
# ToDo:
|
|
#
|
|
# ==============================================================================
|
|
|
|
|
|
#TOC> ==========================================================================
|
|
#TOC>
|
|
#TOC> Section Title Line
|
|
#TOC> -------------------------------------------------------
|
|
#TOC> 1 INITIALISATIONS AND PARAMETERS 61
|
|
#TOC> 2 PACKAGES 66
|
|
#TOC> 3 FUNCTIONS 82
|
|
#TOC> 3.01 dbSanitizeSequence() 85
|
|
#TOC> 3.02 dbConfirmUnique() 120
|
|
#TOC> 3.03 dbInit() 138
|
|
#TOC> 3.04 dbAutoincrement() 178
|
|
#TOC> 3.05 dbAddProtein() 191
|
|
#TOC> 3.06 dbAddFeature() 224
|
|
#TOC> 3.07 dbAddTaxonomy() 255
|
|
#TOC> 3.08 dbAddAnnotation() 290
|
|
#TOC> 3.09 dbFetchUniProtSeq() 337
|
|
#TOC> 3.10 dbFetchPrositeFeatures() 383
|
|
#TOC> 3.11 node2text() 433
|
|
#TOC> 3.12 dbFetchNCBItaxData() 445
|
|
#TOC> 3.13 UniProtIDmap() 484
|
|
#TOC> 3.14 dbProt2JSON() 523
|
|
#TOC> 3.15 dbSeq2JSON() 608
|
|
#TOC> 3.16 dbRow2JSON() 637
|
|
#TOC> 4 TESTS 657
|
|
#TOC>
|
|
#TOC> ==========================================================================
|
|
|
|
|
|
# = 1 INITIALISATIONS AND PARAMETERS ======================================
|
|
|
|
doTESTS <- FALSE # run tests if TRUE
|
|
|
|
|
|
# = 2 PACKAGES ============================================================
|
|
|
|
|
|
if (! requireNamespace("jsonlite", quietly = TRUE)) {
|
|
install.packages("jsonlite")
|
|
}
|
|
|
|
if (! requireNamespace("httr", quietly = TRUE)) {
|
|
install.packages("httr")
|
|
}
|
|
|
|
if (! requireNamespace("xml2", quietly = TRUE)) {
|
|
install.packages("xml2")
|
|
}
|
|
|
|
|
|
# = 3 FUNCTIONS ===========================================================
|
|
|
|
|
|
# == 3.01 dbSanitizeSequence() =============================================
|
|
dbSanitizeSequence <- function(s, unambiguous = TRUE) {
|
|
# Remove FASTA header lines, if any,
|
|
# flatten any structure that s has,
|
|
# remove all non-letters except "-" (gap) and "*" (stop),
|
|
# convert to uppercase.
|
|
#
|
|
# Parameters:
|
|
# s chr A DNA or protein sequence plus other characters
|
|
# unambiguous bool if TRUE, stop() if any letter remaining after
|
|
# processing matches an ambiguity code. This is likely
|
|
# due to inadvertently including meta-data, such as
|
|
# a FASTA header, with the sequence.
|
|
# Note: since U is an ambiguity code for amino acid sequences, you need
|
|
# to set unambiguous = FALSE to process RNA sequences with Uracil.
|
|
# Value: chr a valid, uppercase, amino acid sequence
|
|
#
|
|
|
|
s <- as.character(unlist(s)) # convert complex object to plain chr vector
|
|
s <- unlist(strsplit(s, "\n")) # split up at linebreaks, if any
|
|
s <- s[! grepl("^>", s)] # drop all lines beginning">" (FASTA header)
|
|
s <- paste(s, collapse="") # combine into single string
|
|
s <- toupper(gsub("[^a-zA-Z*-]", "", s))
|
|
if (unambiguous) {
|
|
amb <- "([bjouxzBJOUXZ])" # parentheses capture the match
|
|
ambChar <- unlist(regmatches(s, regexec(amb, s)))[1]
|
|
if (! is.na(ambChar)) {
|
|
stop(paste("Input contains ambiguous codes(s): \"",
|
|
ambChar, "\".", sep=""))
|
|
}
|
|
}
|
|
return(s)
|
|
}
|
|
|
|
|
|
# == 3.02 dbConfirmUnique() ================================================
|
|
dbConfirmUnique <- function(x) {
|
|
# x is a vector of logicals.
|
|
# returns x if x has exactly one TRUE element.
|
|
# stop() otherwise.
|
|
|
|
if (any(!is.logical(x))) {
|
|
stop("PANIC: Input is not a boolean vector.")
|
|
} else if (sum(x) == 0) {
|
|
stop("PANIC: No match found.")
|
|
} else if (sum(x) > 1) {
|
|
stop("PANIC: More than one match found.")
|
|
} else {
|
|
return(x)
|
|
}
|
|
}
|
|
|
|
|
|
# == 3.03 dbInit() =========================================================
|
|
dbInit <- function() {
|
|
# Return an empty instance of the protein database
|
|
# The schema is here:
|
|
# https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0
|
|
|
|
db <- list()
|
|
|
|
db$version <- "1.0"
|
|
|
|
db$protein <- data.frame(
|
|
ID = numeric(),
|
|
name = character(),
|
|
RefSeqID = character(),
|
|
UniProtID = character(),
|
|
taxonomyID = numeric(),
|
|
sequence = character())
|
|
|
|
db$taxonomy <- data.frame(
|
|
ID = numeric(),
|
|
species = character())
|
|
|
|
db$annotation <- data.frame(
|
|
ID = numeric(),
|
|
proteinID = numeric(),
|
|
featureID = numeric(),
|
|
start = numeric(),
|
|
end = numeric())
|
|
|
|
db$feature <- data.frame(
|
|
ID = numeric(),
|
|
name = character(),
|
|
description = character(),
|
|
sourceDB = character(),
|
|
accession = character())
|
|
|
|
return(db)
|
|
}
|
|
|
|
|
|
# == 3.04 dbAutoincrement() ================================================
|
|
dbAutoincrement <- function(tb) {
|
|
# Return a unique integer that can be used as a primary key
|
|
# Value:
|
|
# num a number one-larger than the largest current value in table$ID
|
|
if (length(tb$ID) == 0) {
|
|
return(1)
|
|
} else {
|
|
return(max(tb$ID) + 1)
|
|
}
|
|
}
|
|
|
|
|
|
# == 3.05 dbAddProtein() ===================================================
|
|
dbAddProtein <- function(db, jsonDF) {
|
|
# Add one or more protein entries to the database db if a protein with the
|
|
# same name does not yet exist. This enforces that protein names are unique.
|
|
# Parameters:
|
|
# db list a database created with dbInit()
|
|
# jsonDF data frame protein data imported into a data frame with
|
|
# fromJSON()
|
|
|
|
for (i in seq_len(nrow(jsonDF))) {
|
|
isValid <- TRUE
|
|
if (jsonDF$name[i] %in% db$protein$name) {
|
|
cat(sprintf("Note: Protein No. %d in the input is \"%s\", but %s.\n",
|
|
i, jsonDF$name[i],
|
|
"a protein with this name already exists in the database. ",
|
|
"Skipping this input."))
|
|
isValid <- FALSE
|
|
}
|
|
|
|
if (isValid) {
|
|
x <- data.frame(ID = dbAutoincrement(db$protein),
|
|
name = jsonDF$name[i],
|
|
RefSeqID = jsonDF$RefSeqID[i],
|
|
UniProtID = jsonDF$UniProtID[i],
|
|
taxonomyID = jsonDF$taxonomyID[i],
|
|
sequence = dbSanitizeSequence(jsonDF$sequence[i]))
|
|
db$protein <- rbind(db$protein, x)
|
|
}
|
|
}
|
|
return(db)
|
|
}
|
|
|
|
|
|
# == 3.06 dbAddFeature() ===================================================
|
|
dbAddFeature <- function(db, jsonDF) {
|
|
# Add one or more feature entries to the database db. Skip if a feature with
|
|
# the same name already exists.
|
|
# Parameters:
|
|
# db list a database created with dbInit()
|
|
# jsonDF data frame feature data imported into a data frame with
|
|
# fromJSON()
|
|
for (i in seq_len(nrow(jsonDF))) {
|
|
isValid <- TRUE
|
|
if (jsonDF$name[i] %in% db$feature$name) {
|
|
cat(sprintf("Note: Feature No. %d in the input is \"%s\", but %s.\n",
|
|
i, jsonDF$name[i],
|
|
"a feature with this name already exists in the database. ",
|
|
"Skipping this input."))
|
|
isValid <- FALSE
|
|
}
|
|
|
|
if (isValid) {
|
|
x <- data.frame(ID = dbAutoincrement(db$feature),
|
|
name = jsonDF$name[i],
|
|
description = jsonDF$description[i],
|
|
sourceDB = jsonDF$sourceDB[i],
|
|
accession = jsonDF$accession[i])
|
|
db$feature <- rbind(db$feature, x)
|
|
}
|
|
}
|
|
return(db)
|
|
}
|
|
|
|
|
|
# == 3.07 dbAddTaxonomy() ==================================================
|
|
dbAddTaxonomy <- function(db, jsonDF) {
|
|
# Add one or more taxonomy entries to the database db. Skip if species name
|
|
# or taxonomy ID already exist in the database.
|
|
# Parameters:
|
|
# db list A database created with dbInit()
|
|
# jsonDF data frame Taxonomy data imported into a data frame with
|
|
# fromJSON()
|
|
for (i in seq_len(nrow(jsonDF))) {
|
|
isValid <- TRUE
|
|
|
|
if (jsonDF$species[i] %in% db$taxonomy$species) {
|
|
cat(sprintf("Note: Species No. %d in the input is \"%s\", but %s%s\n",
|
|
i, jsonDF$name[i],
|
|
"a species with this name already exists in the database. ",
|
|
"Skipping this input."))
|
|
isValid <- FALSE
|
|
} else if (jsonDF$ID[i] %in% db$taxonomy$ID) {
|
|
cat(sprintf("Note: Taxonomy ID No. %d in the input is \"%d\", but %s%s\n",
|
|
i, jsonDF$ID[i],
|
|
"this taxonomy ID already exists in the database. ",
|
|
"Skipping this input."))
|
|
isValid <- FALSE
|
|
}
|
|
if (isValid) {
|
|
x <- data.frame(
|
|
ID = as.integer(jsonDF$ID[i]),
|
|
species = jsonDF$species[i])
|
|
db$taxonomy <- rbind(db$taxonomy, x)
|
|
}
|
|
}
|
|
return(db)
|
|
}
|
|
|
|
|
|
# == 3.08 dbAddAnnotation() ================================================
|
|
dbAddAnnotation <- function(db, jsonDF) {
|
|
# Add one or more annotation entries to the database db. Skip the entry if
|
|
# it already exists in the database.
|
|
# Parameters:
|
|
# db list a database created with dbInit()
|
|
# jsonDF data frame annotation data imported into a data frame with
|
|
# fromJSON()
|
|
for (i in seq_len(nrow(jsonDF))) {
|
|
isValid <- TRUE
|
|
|
|
sel <- jsonDF$pName[i] == db$protein$name
|
|
sel <- dbConfirmUnique(sel) # Confirm that this protein ID exists
|
|
pID <- db$protein$ID[sel]
|
|
|
|
sel <- jsonDF$fName[i] == db$feature$name
|
|
sel <- dbConfirmUnique(sel) # Confirm that this feature ID exists
|
|
fID <- db$feature$ID[sel]
|
|
|
|
sel <- db$annotation$proteinID == pID &
|
|
db$annotation$featureID == fID &
|
|
db$annotation$start == as.integer(jsonDF$start[i]) &
|
|
db$annotation$end == as.integer(jsonDF$end[i])
|
|
|
|
if (any(sel)) {
|
|
cat(sprintf("Note: annotation No. %d in the input has %s%s\n",
|
|
i,
|
|
"the same protein name, feature name, start, and end ",
|
|
"as one that already exists in the database. ",
|
|
"Skipping this input."))
|
|
|
|
isValid <- FALSE
|
|
}
|
|
|
|
if (isValid) {
|
|
x <- data.frame(ID = dbAutoincrement(db$annotation),
|
|
proteinID = pID,
|
|
featureID = fID,
|
|
start = as.integer(jsonDF$start[i]),
|
|
end = as.integer(jsonDF$end[i]))
|
|
db$annotation <- rbind(db$annotation, x)
|
|
}
|
|
}
|
|
return(db)
|
|
}
|
|
|
|
|
|
# == 3.09 dbFetchUniProtSeq() ==============================================
|
|
dbFetchUniProtSeq <- function(IDs) {
|
|
# Fetch a protein sequence from UniProt.
|
|
# Parameters:
|
|
# IDs char a vector of UniProt IDs (accession number)
|
|
# Value:
|
|
# char a vector of the same length as ID. It contains
|
|
# sequences where the retrieval was successful, NA where
|
|
# it was not successful. The elements are named with
|
|
# the ID, the header lines are set as attribute "header"
|
|
|
|
|
|
BASE <- "http://www.uniprot.org/uniprot/"
|
|
|
|
sq <- character()
|
|
hd <- character()
|
|
for (i in seq_along(IDs)) {
|
|
URL <- sprintf("%s%s.fasta", BASE, IDs[i])
|
|
response <- httr::GET(URL)
|
|
if (httr::status_code(response) == 200) {
|
|
s <- as.character(response)
|
|
s <- unlist(strsplit(s, "\n"))
|
|
x <- dbSanitizeSequence(s)
|
|
} else {
|
|
s <- ""
|
|
x <- NA
|
|
}
|
|
hd[i] <- s[1]
|
|
sq[i] <- x
|
|
}
|
|
names(sq) <- IDs
|
|
attr(sq, "headers") <- hd
|
|
|
|
return(sq)
|
|
}
|
|
|
|
if (FALSE) {
|
|
inp <- c("P79073", "P0000000", "A0A1W2TKZ7")
|
|
s <- dbFetchUniProtSeq(inp)
|
|
s[1:3]
|
|
str(s)
|
|
attr(s, "headers")[1]
|
|
}
|
|
|
|
|
|
|
|
# == 3.10 dbFetchPrositeFeatures() =========================================
|
|
dbFetchPrositeFeatures <- function(ID) {
|
|
# Fetch feature annotations from ScanProsite.
|
|
# Parameters:
|
|
# ID char a UniProt ID (accession number)
|
|
# Value:
|
|
# data frame uID char UniProt ID
|
|
# start num start of motif
|
|
# end num end of motif
|
|
# psID char PROSITE motif ID
|
|
# psName char PROSITE motif name
|
|
# psSeq char sequence annotated to the feature
|
|
# If the operation is not successful, a 0-length data frame is returned.
|
|
|
|
URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
|
|
|
|
response <- httr::POST(URL,
|
|
body = list(meta = "opt1",
|
|
meta1_protein = "opt1",
|
|
seq = ID,
|
|
skip = "on",
|
|
output = "tabular"))
|
|
|
|
myFeatures <- data.frame()
|
|
if (httr::status_code(response) == 200) {
|
|
|
|
lines <- unlist(strsplit(httr::content(response, "text"), "\\n"))
|
|
|
|
patt <- sprintf("\\|%s\\|", ID)
|
|
lines <- lines[grep(patt, lines)]
|
|
|
|
for (line in lines) {
|
|
tokens <- unlist(strsplit(line, "\\t|\\|"))
|
|
myFeatures <- rbind(myFeatures,
|
|
data.frame(uID = tokens[2],
|
|
start = as.numeric(tokens[4]),
|
|
end = as.numeric(tokens[5]),
|
|
psID = tokens[6],
|
|
psName = tokens[7],
|
|
psSeq = tokens[11]))
|
|
}
|
|
}
|
|
return(myFeatures)
|
|
}
|
|
|
|
if (FALSE) {
|
|
dbFetchPrositeFeatures("P33520") # RES1_SCHPO
|
|
|
|
}
|
|
|
|
# == 3.11 node2text() ======================================================
|
|
node2text <- function(doc, tag) {
|
|
# an extractor function for the contents of elements
|
|
# between given tags in an XML response.
|
|
# Contents of all matching elements is returned in
|
|
# a vector of strings.
|
|
path <- paste0("//", tag)
|
|
nodes <- xml2::xml_find_all(doc, path)
|
|
return(xml2::xml_text(nodes))
|
|
}
|
|
|
|
|
|
# == 3.12 dbFetchNCBItaxData() =============================================
|
|
dbFetchNCBItaxData <- function(ID) {
|
|
# Fetch feature taxID and Organism from the NCBI.
|
|
# Parameters:
|
|
# ID char a RefSeq ID (accession number)
|
|
# Value:
|
|
# data frame taxID num NCBI taxID
|
|
# organism char organism for this taxID
|
|
# If the operation is not successful, a 0-length data frame is returned.
|
|
|
|
eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
|
|
URL <- paste(eUtilsBase,
|
|
"esearch.fcgi?",
|
|
"db=protein",
|
|
"&term=", ID,
|
|
sep="")
|
|
myXML <- xml2::read_xml(URL)
|
|
GID <- node2text(myXML, "Id")
|
|
|
|
URL <- paste0(eUtilsBase,
|
|
"esummary.fcgi?",
|
|
"db=protein",
|
|
"&id=",
|
|
GID,
|
|
"&version=2.0")
|
|
myXML <- xml2::read_xml(URL)
|
|
|
|
x <- as.integer(node2text(myXML, "TaxId"))
|
|
y <- node2text(myXML, "Organism")
|
|
|
|
tID <- data.frame()
|
|
if (length(x) > 0 && length(y) > 0) {
|
|
tID <- data.frame(taxID = x, organism = y)
|
|
}
|
|
return(tID)
|
|
}
|
|
|
|
|
|
|
|
# == 3.13 UniProtIDmap() ===================================================
|
|
UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
|
|
# Use UniProt ID mapping service to map one or more IDs
|
|
# Parameters:
|
|
# s char A string of white-space separated IDs
|
|
# mapFrom char the database in which the IDs in s are valid.
|
|
# Default is RefSeq protein
|
|
# mapTo char the database in which the target IDs are valid.
|
|
# Default is UniProtKB
|
|
# Value
|
|
# A data frame of mapped IDs, with column names From and To, or an
|
|
# empty data frame if the mapping was unsuccessful. No rows are returned
|
|
# for IDs that are not mapped.
|
|
|
|
# Initialize curl
|
|
httr::set_config(httr::config(http_version = 0))
|
|
|
|
URL <- "https://www.uniprot.org/uploadlists/"
|
|
response <- httr::POST(URL,
|
|
body = list(from = mapFrom,
|
|
to = mapTo,
|
|
format = "tab",
|
|
query = s))
|
|
|
|
if (httr::status_code(response) == 200) { # 200: oK
|
|
myMap <- read.delim(file = textConnection(httr::content(response)),
|
|
sep = "\t")
|
|
colnames(myMap) <- c("From", "To")
|
|
} else {
|
|
myMap <- data.frame()
|
|
warning(paste("No uniProt ID mapping returned:",
|
|
"server sent status",
|
|
httr::status_code(response)))
|
|
}
|
|
|
|
return(myMap)
|
|
}
|
|
|
|
|
|
# == 3.14 dbProt2JSON() ====================================================
|
|
dbProt2JSON <- function(thisProt) {
|
|
# Extract all protein related data from myDB and return in JSON format.
|
|
|
|
thisData <- list()
|
|
|
|
# add a protein table
|
|
sel <- which(myDB$protein$name == thisProt)
|
|
thisData$protein <- myDB$protein[sel, ]
|
|
|
|
# add a taxonomy table
|
|
sel <- which(myDB$taxonomy$ID == thisData$protein$taxonomyID)
|
|
thisData$taxonomy <- myDB$taxonomy[sel, ]
|
|
|
|
# add the entries for this protein from the annotation table
|
|
sel <- which(myDB$annotation$proteinID == thisData$protein$ID)
|
|
thisData$annotation <- myDB$annotation[sel, ]
|
|
# our .json convention uses pName and fName as keys, not the db-internal IDs
|
|
# add empty columns for pName and fName
|
|
l <- nrow(thisData$annotation)
|
|
thisData$annotation$pName <- character(l)
|
|
thisData$annotation$fName <- character(l)
|
|
# get the appropriate protein and feature names
|
|
for (i in seq_len(l)) {
|
|
pID <- thisData$annotation$proteinID[i]
|
|
sel <- which(myDB$protein$ID == pID)
|
|
thisData$annotation$pName[i] <- myDB$protein$name[sel] # store pName
|
|
fID <- thisData$annotation$featureID[i]
|
|
sel <- which(myDB$feature$ID == fID)
|
|
thisData$annotation$fName[i] <- myDB$feature$name[sel] # store fName
|
|
}
|
|
|
|
# add the corresponding feature table
|
|
sel <- which(myDB$feature$ID %in% thisData$annotation$featureID)
|
|
thisData$feature <- myDB$feature[sel, ]
|
|
|
|
# remove columns that are not going into JSON output
|
|
thisData$protein$ID <- NULL
|
|
thisData$annotation$ID <- NULL
|
|
thisData$annotation$proteinID <- NULL
|
|
thisData$annotation$featureID <- NULL
|
|
thisData$feature$ID <- NULL
|
|
|
|
# create JSON-formatted output
|
|
# ( jsonlite::prettify() is too wordy for a compact Wikipage )
|
|
|
|
out <- character()
|
|
out <- c(out, '{')
|
|
|
|
out <- c(out, ' "protein": {')
|
|
sel <- colnames(thisData$protein) != "sequence"
|
|
out <- c(out, sprintf(" %s,", dbRow2JSON(thisData$protein[1, sel],
|
|
coll = ",\n ")))
|
|
out <- c(out, dbSeq2JSON(thisData$protein$sequence[1]))
|
|
out <- c(out, ' },')
|
|
|
|
out <- c(out, ' "taxonomy": {')
|
|
out <- c(out, sprintf(" %s", dbRow2JSON(thisData$taxonomy)))
|
|
out <- c(out, ' },')
|
|
|
|
out <- c(out, ' "annotation": [')
|
|
for (i in seq_len(nrow(thisData$annotation))) {
|
|
out <- c(out, sprintf(" {%s},", dbRow2JSON(thisData$annotation[i, ])))
|
|
}
|
|
out[length(out)] <- gsub(",$", "", out[length(out)]) # remove last ","
|
|
out <- c(out, ' ],')
|
|
|
|
out <- c(out, ' "feature": [')
|
|
sel <- colnames(thisData$feature) != "description"
|
|
for (i in seq_len(nrow(thisData$feature))) {
|
|
out <- c(out, sprintf(" {%s,",
|
|
dbRow2JSON(thisData$feature[i, sel])))
|
|
out <- c(out, sprintf(" %s},",
|
|
dbRow2JSON(thisData$feature[i, "description",
|
|
drop = FALSE])))
|
|
}
|
|
out[length(out)] <- gsub(",$", "", out[length(out)]) # remove last ","
|
|
out <- c(out, ' ]')
|
|
|
|
out <- c(out, '}')
|
|
|
|
return(paste0(out, collapse = "\n"))
|
|
}
|
|
|
|
|
|
# == 3.15 dbSeq2JSON() =====================================================
|
|
|
|
dbSeq2JSON <- function(s, nIndents = 4, width = 70) {
|
|
# Turn a sequence into a JSON key-value pair, with the value being a JSON
|
|
# array of elements not exceeding a width of "width", and an indent of
|
|
# "indents" spaces.
|
|
ind <- paste0(rep(" ", nIndents), collapse = "")
|
|
|
|
out <- character()
|
|
out <- c(out, sprintf("%s\"sequence\" : [", ind))
|
|
|
|
for (i in seq_along(s)) {
|
|
l <- nchar(s[i])
|
|
if (l <= width) {
|
|
out <- c(out, s[i])
|
|
} else {
|
|
starts <- seq(1, l, by = width)
|
|
ends <- seq(width, l, by = width)
|
|
if (length(ends) < length(starts)) { ends <- c(ends, l) }
|
|
out <- c(out, sprintf("%s \"%s\",", ind, substring(s[i], starts, ends)))
|
|
}
|
|
}
|
|
out[length(out)] <- gsub(",$", "", out[length(out)]) # remove last ","
|
|
|
|
out <- c(out, sprintf("%s]", ind))
|
|
return(paste0(out, collapse = "\n"))
|
|
}
|
|
|
|
|
|
# == 3.16 dbRow2JSON() =====================================================
|
|
|
|
dbRow2JSON <- function(df, coll = ", ") {
|
|
# Turn a single dataframe row into JSON key value pairs, where the keys are the
|
|
# column names. Respects character / numeric mode.
|
|
out <- character()
|
|
for (i in 1:ncol(df)) {
|
|
if (class(df[1, i]) == "integer") {
|
|
val <- sprintf("%d", df[1, i])
|
|
} else if (class(df[1, i]) == "numeric") {
|
|
val <- sprintf("%f", df[1, i])
|
|
} else {
|
|
val <- sprintf("\"%s\"", as.character(df[1, i]))
|
|
}
|
|
out <- c(out, sprintf("\"%s\": %s", colnames(df)[i], val))
|
|
}
|
|
return(paste0(out, collapse = coll))
|
|
}
|
|
|
|
|
|
# = 4 TESTS ===============================================================
|
|
|
|
if (doTESTS) {
|
|
if (! requireNamespace("testthat", quietly = TRUE)) {
|
|
install.packages("testthat")
|
|
}
|
|
|
|
# ToDo: test everything here
|
|
|
|
}
|
|
|
|
|
|
# [END]
|