334 lines
9.5 KiB
R
334 lines
9.5 KiB
R
# ABC-dbUtilities.R
|
|
|
|
# database utilities for ABC learning units
|
|
#
|
|
# ==============================================================================
|
|
#
|
|
|
|
|
|
# ====== PACKAGES ==============================================================
|
|
|
|
|
|
if (! require(jsonlite, quietly = TRUE)) {
|
|
install.packages("jsonlite")
|
|
library(jsonlite)
|
|
}
|
|
|
|
|
|
if (!require(httr, quietly = TRUE)) {
|
|
install.packages("httr")
|
|
library(httr)
|
|
}
|
|
|
|
|
|
if (!require(xml2, quietly = TRUE)) {
|
|
install.packages("xml2")
|
|
library(xml2)
|
|
}
|
|
|
|
|
|
# ====== FUNCTIONS =============================================================
|
|
|
|
|
|
dbSanitizeSequence <- function(s, unambiguous = TRUE) {
|
|
# Remove FASTA header lines, if any,
|
|
# flatten any structure that s has,
|
|
# remove all non-letters,
|
|
# convert to uppercase.
|
|
#
|
|
# Parameters:
|
|
# s chr A DNA or protein sequence plus other characters
|
|
# unambiguous bool if TRUE, stop() if any letter remaining after
|
|
# processing matches an ambiguity code. This is likely
|
|
# due to inadvertently including meta-data, such as
|
|
# a FASTA header, with the sequence.
|
|
# Note: since U is an ambiguity code for amino acid sequences, you need
|
|
# to set unambiguous = FALSE to process RNA sequences with Uracil.
|
|
# Value: chr a valid, uppercase, amino acid sequence
|
|
#
|
|
|
|
s <- as.character(unlist(s)) # convert complex object to plain chr vector
|
|
s <- unlist(strsplit(s, "\n")) # split up at linebreaks, if any
|
|
s <- s[! grepl("^>", s)] # drop all lines beginning">" (FASTA header)
|
|
s <- paste(s, collapse="") # combine into single string
|
|
s <- toupper(gsub("[^a-zA-Z]", "", s))
|
|
if (unambiguous) {
|
|
amb <- "([bjouxzBJOUXZ])" # parentheses capture the match
|
|
ambChar <- unlist(regmatches(s, regexec(amb, s)))[1]
|
|
if (! is.na(ambChar)) {
|
|
stop(paste("Input contains ambiguous codes(s): \"",
|
|
ambChar, "\".", sep=""))
|
|
}
|
|
}
|
|
return(s)
|
|
}
|
|
|
|
|
|
dbConfirmUnique <- function(x) {
|
|
# x is a vector of logicals.
|
|
# returns x if x has exactly one TRUE element.
|
|
# stop() otherwise.
|
|
|
|
if (any(!is.logical(x))) {
|
|
stop("PANIC: Input is not a boolean vector.")
|
|
} else if (sum(x) == 0) {
|
|
stop("PANIC: No match found.")
|
|
} else if (sum(x) > 1) {
|
|
stop("PANIC: More than one match found.")
|
|
} else {
|
|
return(x)
|
|
}
|
|
}
|
|
|
|
|
|
dbInit <- function() {
|
|
# Return an empty instance of the protein database
|
|
|
|
db <- list()
|
|
|
|
db$protein <- data.frame(
|
|
ID = numeric(),
|
|
name = character(),
|
|
RefSeqID = character(),
|
|
UniProtID = character(),
|
|
taxonomyID = numeric(),
|
|
sequence = character(),
|
|
stringsAsFactors = FALSE)
|
|
|
|
db$taxonomy <- data.frame(
|
|
ID = numeric(),
|
|
species = character(),
|
|
stringsAsFactors = FALSE)
|
|
|
|
|
|
db$annotation <- data.frame(
|
|
ID = numeric(),
|
|
proteinID = numeric(),
|
|
featureID = numeric(),
|
|
start = numeric(),
|
|
end = numeric(),
|
|
stringsAsFactors = FALSE)
|
|
|
|
db$feature <- data.frame(
|
|
ID = numeric(),
|
|
name = character(),
|
|
description = character(),
|
|
sourceDB = character(),
|
|
accession = character(),
|
|
stringsAsFactors = FALSE)
|
|
|
|
return(db)
|
|
}
|
|
|
|
|
|
dbAutoincrement <- function(tb) {
|
|
# Return a unique integer that can be used as a primary key
|
|
# Value:
|
|
# num a number one-larger than the largest current value in table$ID
|
|
if (length(tb$ID) == 0) {
|
|
return(1)
|
|
} else {
|
|
return(max(tb$ID) + 1)
|
|
}
|
|
}
|
|
|
|
|
|
dbAddProtein <- function(db, jsonDF) {
|
|
# Add one or more protein entries to the database db.
|
|
# Parameters:
|
|
# db list a database created with dbInit()
|
|
# jsonDF data frame protein data imported into a data frame with
|
|
# fromJSON()
|
|
for (i in seq_len(nrow(jsonDF))) {
|
|
x <- data.frame(ID = dbAutoincrement(db$protein),
|
|
name = jsonDF$name[i],
|
|
RefSeqID = jsonDF$RefSeqID[i],
|
|
UniProtID = jsonDF$UniProtID[i],
|
|
taxonomyID = jsonDF$taxonomyID[i],
|
|
sequence = dbSanitizeSequence(jsonDF$sequence[i]),
|
|
stringsAsFactors = FALSE)
|
|
db$protein <- rbind(db$protein, x)
|
|
}
|
|
return(db)
|
|
}
|
|
|
|
|
|
dbAddFeature <- function(db, jsonDF) {
|
|
# Add one or more feature entries to the database db.
|
|
# Parameters:
|
|
# db list a database created with dbInit()
|
|
# jsonDF data frame feature data imported into a data frame with
|
|
# fromJSON()
|
|
for (i in seq_len(nrow(jsonDF))) {
|
|
x <- data.frame(ID = dbAutoincrement(db$feature),
|
|
name = jsonDF$name[i],
|
|
description = jsonDF$description[i],
|
|
sourceDB = jsonDF$sourceDB[i],
|
|
accession = jsonDF$accession[i],
|
|
stringsAsFactors = FALSE)
|
|
db$feature <- rbind(db$feature, x)
|
|
}
|
|
return(db)
|
|
}
|
|
|
|
|
|
dbAddTaxonomy <- function(db, jsonDF) {
|
|
# Add one or more taxonomy entries to the database db.
|
|
# Parameters:
|
|
# db list A database created with dbInit()
|
|
# jsonDF data frame Taxonomy data imported into a data frame with
|
|
# fromJSON()
|
|
for (i in seq_len(nrow(jsonDF))) {
|
|
x <- data.frame(
|
|
ID = jsonDF$ID[i],
|
|
species = jsonDF$species[i],
|
|
stringsAsFactors = FALSE)
|
|
db$taxonomy <- rbind(db$taxonomy, x)
|
|
}
|
|
return(db)
|
|
}
|
|
|
|
dbAddAnnotation <- function(db, jsonDF) {
|
|
# Add one or more annotation entries to the database db.
|
|
# Parameters:
|
|
# db list a database created with dbInit()
|
|
# jsonDF data frame annotation data imported into a data frame with
|
|
# fromJSON()
|
|
for (i in seq_len(nrow(jsonDF))) {
|
|
|
|
sel <- jsonDF$pName[i] == db$protein$name
|
|
sel <- dbConfirmUnique(sel)
|
|
pID <- db$protein$ID[sel]
|
|
|
|
sel <- jsonDF$fName[i] == db$feature$name
|
|
sel <- dbConfirmUnique(sel)
|
|
fID <- db$feature$ID[sel]
|
|
|
|
x <- data.frame(ID = dbAutoincrement(db$annotation),
|
|
proteinID = pID,
|
|
featureID = fID,
|
|
start = as.integer(jsonDF$start[i]),
|
|
end = as.integer(jsonDF$end[i]),
|
|
stringsAsFactors = FALSE)
|
|
db$annotation <- rbind(db$annotation, x)
|
|
}
|
|
return(db)
|
|
}
|
|
|
|
|
|
dbFetchUniProtSeq <- function(ID) {
|
|
# Fetch a protein sequence from UniProt.
|
|
# Parameters:
|
|
# ID char a UniProt ID (accession number)
|
|
# Value:
|
|
# char the sequence
|
|
# If the operation is not successful, a 0-length string is returned
|
|
|
|
URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", ID)
|
|
|
|
response <- GET(URL)
|
|
|
|
mySeq <- character()
|
|
if (status_code(response) == 200) {
|
|
x <- as.character(response)
|
|
x <- strsplit(x, "\n")
|
|
mySeq <- dbSanitizeSequence(x)
|
|
}
|
|
|
|
return(mySeq)
|
|
}
|
|
|
|
|
|
dbFetchPrositeFeatures <- function(ID) {
|
|
# Fetch feature annotations from ScanProsite.
|
|
# Parameters:
|
|
# ID char a UniProt ID (accession number)
|
|
# Value:
|
|
# data frame uID char UniProt ID
|
|
# start num start of motif
|
|
# end num end of motif
|
|
# psID char PROSITE motif ID
|
|
# psName char PROSITE motif name
|
|
# If the operation is not successful, a 0-length data frame is returned.
|
|
|
|
URL <- "http://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
|
|
|
|
response <- POST(URL,
|
|
body = list(meta = "opt1",
|
|
meta1_protein = "opt1",
|
|
seq = ID,
|
|
skip = "on",
|
|
output = "tabular"))
|
|
|
|
myFeatures <- data.frame()
|
|
if (status_code(response) == 200) {
|
|
|
|
lines <- unlist(strsplit(content(response, "text"), "\\n"))
|
|
|
|
patt <- sprintf("\\|%s\\|", UniProtID)
|
|
lines <- lines[grep(patt, lines)]
|
|
|
|
for (line in lines) {
|
|
tokens <- unlist(strsplit(line, "\\t|\\|"))
|
|
myFeatures <- rbind(myFeatures,
|
|
data.frame(uID = tokens[2],
|
|
start = as.numeric(tokens[4]),
|
|
end = as.numeric(tokens[5]),
|
|
psID = tokens[6],
|
|
psName = tokens[7],
|
|
stringsAsFactors = FALSE))
|
|
}
|
|
}
|
|
return(myFeatures)
|
|
}
|
|
|
|
|
|
node2text <- function(doc, tag) {
|
|
# an extractor function for the contents of elements
|
|
# between given tags in an XML response.
|
|
# Contents of all matching elements is returned in
|
|
# a vector of strings.
|
|
path <- paste0("//", tag)
|
|
nodes <- xml_find_all(doc, path)
|
|
return(xml_text(nodes))
|
|
}
|
|
|
|
|
|
dbFetchNCBItaxData <- function(ID) {
|
|
# Fetch feature taxID and Organism from the NCBI.
|
|
# Parameters:
|
|
# ID char a RefSeq ID (accession number)
|
|
# Value:
|
|
# data frame taxID num NCBI taxID
|
|
# organism char organism for this taxID
|
|
# If the operation is not successful, a 0-length data frame is returned.
|
|
|
|
eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
|
|
URL <- paste(eUtilsBase,
|
|
"esearch.fcgi?",
|
|
"db=protein",
|
|
"&term=", ID,
|
|
sep="")
|
|
myXML <- read_xml(URL)
|
|
GID <- node2text(myXML, "Id")
|
|
|
|
URL <- paste0(eUtilsBase,
|
|
"esummary.fcgi?",
|
|
"db=protein",
|
|
"&id=",
|
|
GID,
|
|
"&version=2.0")
|
|
myXML <- read_xml(URL)
|
|
|
|
x <- as.integer(node2text(myXML, "TaxId"))
|
|
y <- node2text(myXML, "Organism")
|
|
|
|
tID <- data.frame()
|
|
if (length(x) > 0 && length(y) > 0) {
|
|
tID <- data.frame(taxID = x, organism = y)
|
|
}
|
|
return(tID)
|
|
}
|
|
|
|
# [END]
|