new scripts
This commit is contained in:
parent
6a565c2ab6
commit
95accd9608
26
scripts/ABC-createRefDB.R
Normal file
26
scripts/ABC-createRefDB.R
Normal file
@ -0,0 +1,26 @@
|
||||
# ABC-createRefDB.R
|
||||
#
|
||||
# Create a reference protein database for Mbp1-like proteins
|
||||
#
|
||||
# Boris Steipe for ABC learning units
|
||||
#
|
||||
# For the species, see:
|
||||
# http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
|
||||
#
|
||||
# For the data model, see
|
||||
# https://docs.google.com/drawings/d/1uupNvz18_FYFwyyVPebTM0CUxcJCPDQuxuIJGpjWQWg
|
||||
# For the schema, see dbInit() in ./scripts/ABC-dbUtilities.R
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
myDB <- dbInit()
|
||||
|
||||
myDB <- dbAddProtein( myDB, fromJSON("./data/MBP1_SACCE.json"))
|
||||
myDB <- dbAddProtein( myDB, fromJSON("./data/refProteins.json"))
|
||||
myDB <- dbAddTaxonomy( myDB, fromJSON("./data/refTaxonomy.json"))
|
||||
myDB <- dbAddFeature( myDB, fromJSON("./data/refFeatures.json"))
|
||||
myDB <- dbAddAnnotation( myDB, fromJSON("./data/refAnnotations.json"))
|
||||
|
||||
|
||||
# [END]
|
201
scripts/ABC-dbUtilities.R
Normal file
201
scripts/ABC-dbUtilities.R
Normal file
@ -0,0 +1,201 @@
|
||||
# ABC-dbUtilities.R
|
||||
|
||||
# database utilities for ABC learning units
|
||||
#
|
||||
# ==============================================================================
|
||||
#
|
||||
|
||||
|
||||
# ====== PACKAGES ==============================================================
|
||||
|
||||
|
||||
if (! require("jsonlite", quietly = TRUE)) {
|
||||
install.packages("jsonlite")
|
||||
library(jsonlite)
|
||||
}
|
||||
|
||||
|
||||
# ====== FUNCTIONS =============================================================
|
||||
|
||||
|
||||
dbSanitizeSequence <- function(s, unambiguous = TRUE) {
|
||||
# Flatten any structure that s has, remove all non-letters, convert to
|
||||
# uppercase.
|
||||
#
|
||||
# Parameters:
|
||||
# s chr A DNA or protein sequence plus other characters
|
||||
# unambiguous bool if TRUE, stop() if any letter remaining after
|
||||
# processing matches an ambiguity code. This is likely
|
||||
# due to inadvertently including meta-data, such as
|
||||
# a FASTA header, with the sequence.
|
||||
# Note: since U is an ambiguity code for amino acid sequences, you need
|
||||
# to set unambiguous = FALSE to process RNA sequences with Uracil.
|
||||
# Value: chr a valid, uppercase, amino acid sequence
|
||||
#
|
||||
s <- paste(unlist(s), collapse="")
|
||||
s <- toupper(gsub("[^a-zA-Z]", "", s))
|
||||
if (unambiguous) {
|
||||
amb <- "([bjouxzBJOUXZ])" # parentheses capture the match
|
||||
ambChar <- unlist(regmatches(s, regexec(amb, s)))[1]
|
||||
if (! is.na(ambChar)) {
|
||||
stop(paste("Input contains ambiguous codes(s): \"",
|
||||
ambChar, "\".", sep=""))
|
||||
}
|
||||
}
|
||||
return(s)
|
||||
}
|
||||
|
||||
|
||||
dbConfirmUnique <- function(x) {
|
||||
# x is a vector of logicals.
|
||||
# returns x if x has exactly one TRUE element.
|
||||
# stop() otherwise.
|
||||
|
||||
if (any(!is.logical(x))) {
|
||||
stop("PANIC: Input is not a boolean vector.")
|
||||
} else if (sum(x) == 0) {
|
||||
stop("PANIC: No match found.")
|
||||
} else if (sum(x) > 1) {
|
||||
stop("PANIC: More than one match found.")
|
||||
} else {
|
||||
return(x)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
dbInit <- function() {
|
||||
# Return an empty instance of the protein database
|
||||
|
||||
db <- list()
|
||||
|
||||
db$protein <- data.frame(
|
||||
ID = numeric(),
|
||||
name = character(),
|
||||
RefSeqID = character(),
|
||||
UniProtID = character(),
|
||||
taxonomyID = numeric(),
|
||||
sequence = character(),
|
||||
stringsAsFactors = FALSE)
|
||||
|
||||
db$taxonomy <- data.frame(
|
||||
ID = numeric(),
|
||||
species = character(),
|
||||
stringsAsFactors = FALSE)
|
||||
|
||||
|
||||
db$annotation <- data.frame(
|
||||
ID = numeric(),
|
||||
proteinID = numeric(),
|
||||
featureID = numeric(),
|
||||
start = numeric(),
|
||||
end = numeric(),
|
||||
stringsAsFactors = FALSE)
|
||||
|
||||
db$feature <- data.frame(
|
||||
ID = numeric(),
|
||||
name = character(),
|
||||
description = character(),
|
||||
sourceDB = character(),
|
||||
accession = character(),
|
||||
stringsAsFactors = FALSE)
|
||||
|
||||
return(db)
|
||||
}
|
||||
|
||||
|
||||
dbAutoincrement <- function(tb) {
|
||||
# Return a unique integer that can be used as a primary key
|
||||
# Value:
|
||||
# num a number one-larger than the largest current value in table$ID
|
||||
if (length(tb$ID) == 0) {
|
||||
return(1)
|
||||
} else {
|
||||
return(max(tb$ID) + 1)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
dbAddProtein <- function(db, jsonDF) {
|
||||
# Add one or more protein entries to the database db.
|
||||
# Parameters:
|
||||
# db list a database created with dbInit()
|
||||
# jsonDF data frame protein data imported into a data frame with
|
||||
# fromJSON()
|
||||
for (i in seq_len(nrow(jsonDF))) {
|
||||
x <- data.frame(ID = dbAutoincrement(db$protein),
|
||||
name = jsonDF$name[i],
|
||||
RefSeqID = jsonDF$RefSeqID[i],
|
||||
UniProtID = jsonDF$UniProtID[i],
|
||||
taxonomyID = jsonDF$taxonomyID[i],
|
||||
sequence = dbSanitizeSequence(jsonDF$sequence[i]),
|
||||
stringsAsFactors = FALSE)
|
||||
db$protein <- rbind(db$protein, x)
|
||||
}
|
||||
return(db)
|
||||
}
|
||||
|
||||
|
||||
dbAddFeature <- function(db, jsonDF) {
|
||||
# Add one or more feature entries to the database db.
|
||||
# Parameters:
|
||||
# db list a database created with dbInit()
|
||||
# jsonDF data frame feature data imported into a data frame with
|
||||
# fromJSON()
|
||||
for (i in seq_len(nrow(jsonDF))) {
|
||||
x <- data.frame(ID = dbAutoincrement(db$feature),
|
||||
name = jsonDF$name[i],
|
||||
description = jsonDF$description[i],
|
||||
sourceDB = jsonDF$sourceDB[i],
|
||||
accession = jsonDF$accession[i],
|
||||
stringsAsFactors = FALSE)
|
||||
db$feature <- rbind(db$feature, x)
|
||||
}
|
||||
return(db)
|
||||
}
|
||||
|
||||
|
||||
dbAddTaxonomy <- function(db, jsonDF) {
|
||||
# Add one or more taxonomy entries to the database db.
|
||||
# Parameters:
|
||||
# db list A database created with dbInit()
|
||||
# jsonDF data frame Taxonomy data imported into a data frame with
|
||||
# fromJSON()
|
||||
for (i in seq_len(nrow(jsonDF))) {
|
||||
x <- data.frame(
|
||||
ID = jsonDF$ID[i],
|
||||
species = jsonDF$species[i],
|
||||
stringsAsFactors = FALSE)
|
||||
db$taxonomy <- rbind(db$taxonomy, x)
|
||||
}
|
||||
return(db)
|
||||
}
|
||||
|
||||
dbAddAnnotation <- function(db, jsonDF) {
|
||||
# Add one or more annotation entries to the database db.
|
||||
# Parameters:
|
||||
# db list a database created with dbInit()
|
||||
# jsonDF data frame annotation data imported into a data frame with
|
||||
# fromJSON()
|
||||
for (i in seq_len(nrow(jsonDF))) {
|
||||
|
||||
sel <- jsonDF$pName[i] == db$protein$name
|
||||
sel <- dbConfirmUnique(sel)
|
||||
pID <- db$protein$ID[sel]
|
||||
|
||||
sel <- jsonDF$fName[i] == db$feature$name
|
||||
sel <- dbConfirmUnique(sel)
|
||||
fID <- db$feature$ID[sel]
|
||||
|
||||
x <- data.frame(ID = dbAutoincrement(db$annotation),
|
||||
proteinID = pID,
|
||||
featureID = fID,
|
||||
start = as.integer(jsonDF$start[i]),
|
||||
end = as.integer(jsonDF$end[i]),
|
||||
stringsAsFactors = FALSE)
|
||||
db$annotation <- rbind(db$annotation, x)
|
||||
}
|
||||
return(db)
|
||||
}
|
||||
|
||||
|
||||
# [END]
|
Loading…
Reference in New Issue
Block a user