bch441-work-abc-units/scripts/ABC-dbUtilities.R

208 lines
6.0 KiB
R

# ABC-dbUtilities.R
# database utilities for ABC learning units
#
# ==============================================================================
#
# ====== PACKAGES ==============================================================
if (! require("jsonlite", quietly = TRUE)) {
install.packages("jsonlite")
library(jsonlite)
}
# ====== FUNCTIONS =============================================================
dbSanitizeSequence <- function(s, unambiguous = TRUE) {
# Remove FASTA header lines, if any,
# flatten any structure that s has,
# remove all non-letters,
# convert to uppercase.
#
# Parameters:
# s chr A DNA or protein sequence plus other characters
# unambiguous bool if TRUE, stop() if any letter remaining after
# processing matches an ambiguity code. This is likely
# due to inadvertently including meta-data, such as
# a FASTA header, with the sequence.
# Note: since U is an ambiguity code for amino acid sequences, you need
# to set unambiguous = FALSE to process RNA sequences with Uracil.
# Value: chr a valid, uppercase, amino acid sequence
#
s <- as.character(unlist(s)) # convert complex object to plain chr vector
s <- unlist(strsplit(s, "\n")) # split up at linebreaks, if any
s <- s[! grepl("^>", s)] # drop all lines beginning">" (FASTA header)
s <- paste(s, collapse="") # combine into single string
s <- toupper(gsub("[^a-zA-Z]", "", s))
if (unambiguous) {
amb <- "([bjouxzBJOUXZ])" # parentheses capture the match
ambChar <- unlist(regmatches(s, regexec(amb, s)))[1]
if (! is.na(ambChar)) {
stop(paste("Input contains ambiguous codes(s): \"",
ambChar, "\".", sep=""))
}
}
return(s)
}
dbConfirmUnique <- function(x) {
# x is a vector of logicals.
# returns x if x has exactly one TRUE element.
# stop() otherwise.
if (any(!is.logical(x))) {
stop("PANIC: Input is not a boolean vector.")
} else if (sum(x) == 0) {
stop("PANIC: No match found.")
} else if (sum(x) > 1) {
stop("PANIC: More than one match found.")
} else {
return(x)
}
}
dbInit <- function() {
# Return an empty instance of the protein database
db <- list()
db$protein <- data.frame(
ID = numeric(),
name = character(),
RefSeqID = character(),
UniProtID = character(),
taxonomyID = numeric(),
sequence = character(),
stringsAsFactors = FALSE)
db$taxonomy <- data.frame(
ID = numeric(),
species = character(),
stringsAsFactors = FALSE)
db$annotation <- data.frame(
ID = numeric(),
proteinID = numeric(),
featureID = numeric(),
start = numeric(),
end = numeric(),
stringsAsFactors = FALSE)
db$feature <- data.frame(
ID = numeric(),
name = character(),
description = character(),
sourceDB = character(),
accession = character(),
stringsAsFactors = FALSE)
return(db)
}
dbAutoincrement <- function(tb) {
# Return a unique integer that can be used as a primary key
# Value:
# num a number one-larger than the largest current value in table$ID
if (length(tb$ID) == 0) {
return(1)
} else {
return(max(tb$ID) + 1)
}
}
dbAddProtein <- function(db, jsonDF) {
# Add one or more protein entries to the database db.
# Parameters:
# db list a database created with dbInit()
# jsonDF data frame protein data imported into a data frame with
# fromJSON()
for (i in seq_len(nrow(jsonDF))) {
x <- data.frame(ID = dbAutoincrement(db$protein),
name = jsonDF$name[i],
RefSeqID = jsonDF$RefSeqID[i],
UniProtID = jsonDF$UniProtID[i],
taxonomyID = jsonDF$taxonomyID[i],
sequence = dbSanitizeSequence(jsonDF$sequence[i]),
stringsAsFactors = FALSE)
db$protein <- rbind(db$protein, x)
}
return(db)
}
dbAddFeature <- function(db, jsonDF) {
# Add one or more feature entries to the database db.
# Parameters:
# db list a database created with dbInit()
# jsonDF data frame feature data imported into a data frame with
# fromJSON()
for (i in seq_len(nrow(jsonDF))) {
x <- data.frame(ID = dbAutoincrement(db$feature),
name = jsonDF$name[i],
description = jsonDF$description[i],
sourceDB = jsonDF$sourceDB[i],
accession = jsonDF$accession[i],
stringsAsFactors = FALSE)
db$feature <- rbind(db$feature, x)
}
return(db)
}
dbAddTaxonomy <- function(db, jsonDF) {
# Add one or more taxonomy entries to the database db.
# Parameters:
# db list A database created with dbInit()
# jsonDF data frame Taxonomy data imported into a data frame with
# fromJSON()
for (i in seq_len(nrow(jsonDF))) {
x <- data.frame(
ID = jsonDF$ID[i],
species = jsonDF$species[i],
stringsAsFactors = FALSE)
db$taxonomy <- rbind(db$taxonomy, x)
}
return(db)
}
dbAddAnnotation <- function(db, jsonDF) {
# Add one or more annotation entries to the database db.
# Parameters:
# db list a database created with dbInit()
# jsonDF data frame annotation data imported into a data frame with
# fromJSON()
for (i in seq_len(nrow(jsonDF))) {
sel <- jsonDF$pName[i] == db$protein$name
sel <- dbConfirmUnique(sel)
pID <- db$protein$ID[sel]
sel <- jsonDF$fName[i] == db$feature$name
sel <- dbConfirmUnique(sel)
fID <- db$feature$ID[sel]
x <- data.frame(ID = dbAutoincrement(db$annotation),
proteinID = pID,
featureID = fID,
start = as.integer(jsonDF$start[i]),
end = as.integer(jsonDF$end[i]),
stringsAsFactors = FALSE)
db$annotation <- rbind(db$annotation, x)
}
return(db)
}
# [END]