# ABC-dbUtilities.R # database utilities for ABC learning units # # ============================================================================== # # ====== PACKAGES ============================================================== if (! require("jsonlite", quietly = TRUE)) { install.packages("jsonlite") library(jsonlite) } # ====== FUNCTIONS ============================================================= dbSanitizeSequence <- function(s, unambiguous = TRUE) { # Remove FASTA header lines, if any, # flatten any structure that s has, # remove all non-letters, # convert to uppercase. # # Parameters: # s chr A DNA or protein sequence plus other characters # unambiguous bool if TRUE, stop() if any letter remaining after # processing matches an ambiguity code. This is likely # due to inadvertently including meta-data, such as # a FASTA header, with the sequence. # Note: since U is an ambiguity code for amino acid sequences, you need # to set unambiguous = FALSE to process RNA sequences with Uracil. # Value: chr a valid, uppercase, amino acid sequence # s <- as.character(unlist(s)) # convert complex object to plain chr vector s <- unlist(strsplit(s, "\n")) # split up at linebreaks, if any s <- s[! grepl("^>", s)] # drop all lines beginning">" (FASTA header) s <- paste(s, collapse="") # combine into single string s <- toupper(gsub("[^a-zA-Z]", "", s)) if (unambiguous) { amb <- "([bjouxzBJOUXZ])" # parentheses capture the match ambChar <- unlist(regmatches(s, regexec(amb, s)))[1] if (! is.na(ambChar)) { stop(paste("Input contains ambiguous codes(s): \"", ambChar, "\".", sep="")) } } return(s) } dbConfirmUnique <- function(x) { # x is a vector of logicals. # returns x if x has exactly one TRUE element. # stop() otherwise. if (any(!is.logical(x))) { stop("PANIC: Input is not a boolean vector.") } else if (sum(x) == 0) { stop("PANIC: No match found.") } else if (sum(x) > 1) { stop("PANIC: More than one match found.") } else { return(x) } } dbInit <- function() { # Return an empty instance of the protein database db <- list() db$protein <- data.frame( ID = numeric(), name = character(), RefSeqID = character(), UniProtID = character(), taxonomyID = numeric(), sequence = character(), stringsAsFactors = FALSE) db$taxonomy <- data.frame( ID = numeric(), species = character(), stringsAsFactors = FALSE) db$annotation <- data.frame( ID = numeric(), proteinID = numeric(), featureID = numeric(), start = numeric(), end = numeric(), stringsAsFactors = FALSE) db$feature <- data.frame( ID = numeric(), name = character(), description = character(), sourceDB = character(), accession = character(), stringsAsFactors = FALSE) return(db) } dbAutoincrement <- function(tb) { # Return a unique integer that can be used as a primary key # Value: # num a number one-larger than the largest current value in table$ID if (length(tb$ID) == 0) { return(1) } else { return(max(tb$ID) + 1) } } dbAddProtein <- function(db, jsonDF) { # Add one or more protein entries to the database db. # Parameters: # db list a database created with dbInit() # jsonDF data frame protein data imported into a data frame with # fromJSON() for (i in seq_len(nrow(jsonDF))) { x <- data.frame(ID = dbAutoincrement(db$protein), name = jsonDF$name[i], RefSeqID = jsonDF$RefSeqID[i], UniProtID = jsonDF$UniProtID[i], taxonomyID = jsonDF$taxonomyID[i], sequence = dbSanitizeSequence(jsonDF$sequence[i]), stringsAsFactors = FALSE) db$protein <- rbind(db$protein, x) } return(db) } dbAddFeature <- function(db, jsonDF) { # Add one or more feature entries to the database db. # Parameters: # db list a database created with dbInit() # jsonDF data frame feature data imported into a data frame with # fromJSON() for (i in seq_len(nrow(jsonDF))) { x <- data.frame(ID = dbAutoincrement(db$feature), name = jsonDF$name[i], description = jsonDF$description[i], sourceDB = jsonDF$sourceDB[i], accession = jsonDF$accession[i], stringsAsFactors = FALSE) db$feature <- rbind(db$feature, x) } return(db) } dbAddTaxonomy <- function(db, jsonDF) { # Add one or more taxonomy entries to the database db. # Parameters: # db list A database created with dbInit() # jsonDF data frame Taxonomy data imported into a data frame with # fromJSON() for (i in seq_len(nrow(jsonDF))) { x <- data.frame( ID = jsonDF$ID[i], species = jsonDF$species[i], stringsAsFactors = FALSE) db$taxonomy <- rbind(db$taxonomy, x) } return(db) } dbAddAnnotation <- function(db, jsonDF) { # Add one or more annotation entries to the database db. # Parameters: # db list a database created with dbInit() # jsonDF data frame annotation data imported into a data frame with # fromJSON() for (i in seq_len(nrow(jsonDF))) { sel <- jsonDF$pName[i] == db$protein$name sel <- dbConfirmUnique(sel) pID <- db$protein$ID[sel] sel <- jsonDF$fName[i] == db$feature$name sel <- dbConfirmUnique(sel) fID <- db$feature$ID[sel] x <- data.frame(ID = dbAutoincrement(db$annotation), proteinID = pID, featureID = fID, start = as.integer(jsonDF$start[i]), end = as.integer(jsonDF$end[i]), stringsAsFactors = FALSE) db$annotation <- rbind(db$annotation, x) } return(db) } # [END]