bch441-work-abc-units/scripts/ABC-dbUtilities.R

442 lines
14 KiB
R

# tocID <- "scripts/ABC-dbUtilities.R"
#
# database utilities for ABC learning units
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -------------------------------------------------
#TOC> 1 PACKAGES 32
#TOC> 2 FUNCTIONS 50
#TOC> 2.01 dbSanitizeSequence() 53
#TOC> 2.02 dbConfirmUnique() 88
#TOC> 2.03 dbInit() 106
#TOC> 2.04 dbAutoincrement() 147
#TOC> 2.05 dbAddProtein() 160
#TOC> 2.06 dbAddFeature() 180
#TOC> 2.07 dbAddTaxonomy() 199
#TOC> 2.08 dbAddAnnotation() 215
#TOC> 2.09 dbFetchUniProtSeq() 243
#TOC> 2.10 dbFetchPrositeFeatures() 289
#TOC> 2.11 node2text() 339
#TOC> 2.12 dbFetchNCBItaxData() 351
#TOC> 2.13 UniProtIDmap() 390
#TOC> 3 TESTS 429
#TOC>
#TOC> ==========================================================================
# = 1 PACKAGES ============================================================
if (! requireNamespace("jsonlite", quietly = TRUE)) {
install.packages("jsonlite")
}
if (! requireNamespace("httr", quietly = TRUE)) {
install.packages("httr")
}
if (! requireNamespace("xml2", quietly = TRUE)) {
install.packages("xml2")
}
# = 2 FUNCTIONS ===========================================================
# == 2.01 dbSanitizeSequence() =============================================
dbSanitizeSequence <- function(s, unambiguous = TRUE) {
# Remove FASTA header lines, if any,
# flatten any structure that s has,
# remove all non-letters except "-" (gap) and "*" (stop),
# convert to uppercase.
#
# Parameters:
# s chr A DNA or protein sequence plus other characters
# unambiguous bool if TRUE, stop() if any letter remaining after
# processing matches an ambiguity code. This is likely
# due to inadvertently including meta-data, such as
# a FASTA header, with the sequence.
# Note: since U is an ambiguity code for amino acid sequences, you need
# to set unambiguous = FALSE to process RNA sequences with Uracil.
# Value: chr a valid, uppercase, amino acid sequence
#
s <- as.character(unlist(s)) # convert complex object to plain chr vector
s <- unlist(strsplit(s, "\n")) # split up at linebreaks, if any
s <- s[! grepl("^>", s)] # drop all lines beginning">" (FASTA header)
s <- paste(s, collapse="") # combine into single string
s <- toupper(gsub("[^a-zA-Z*-]", "", s))
if (unambiguous) {
amb <- "([bjouxzBJOUXZ])" # parentheses capture the match
ambChar <- unlist(regmatches(s, regexec(amb, s)))[1]
if (! is.na(ambChar)) {
stop(paste("Input contains ambiguous codes(s): \"",
ambChar, "\".", sep=""))
}
}
return(s)
}
# == 2.02 dbConfirmUnique() ================================================
dbConfirmUnique <- function(x) {
# x is a vector of logicals.
# returns x if x has exactly one TRUE element.
# stop() otherwise.
if (any(!is.logical(x))) {
stop("PANIC: Input is not a boolean vector.")
} else if (sum(x) == 0) {
stop("PANIC: No match found.")
} else if (sum(x) > 1) {
stop("PANIC: More than one match found.")
} else {
return(x)
}
}
# == 2.03 dbInit() =========================================================
dbInit <- function() {
# Return an empty instance of the protein database
# Open the link and study the schema:
# https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0
db <- list()
db$version <- "1.0"
db$protein <- data.frame(
ID = numeric(),
name = character(),
RefSeqID = character(),
UniProtID = character(),
taxonomyID = numeric(),
sequence = character())
db$taxonomy <- data.frame(
ID = numeric(),
species = character())
db$annotation <- data.frame(
ID = numeric(),
proteinID = numeric(),
featureID = numeric(),
start = numeric(),
end = numeric())
db$feature <- data.frame(
ID = numeric(),
name = character(),
description = character(),
sourceDB = character(),
accession = character())
return(db)
}
# == 2.04 dbAutoincrement() ================================================
dbAutoincrement <- function(tb) {
# Return a unique integer that can be used as a primary key
# Value:
# num a number one-larger than the largest current value in table$ID
if (length(tb$ID) == 0) {
return(1)
} else {
return(max(tb$ID) + 1)
}
}
# == 2.05 dbAddProtein() ===================================================
dbAddProtein <- function(db, jsonDF) {
# Add one or more protein entries to the database db.
# Parameters:
# db list a database created with dbInit()
# jsonDF data frame protein data imported into a data frame with
# fromJSON()
for (i in seq_len(nrow(jsonDF))) {
x <- data.frame(ID = dbAutoincrement(db$protein),
name = jsonDF$name[i],
RefSeqID = jsonDF$RefSeqID[i],
UniProtID = jsonDF$UniProtID[i],
taxonomyID = jsonDF$taxonomyID[i],
sequence = dbSanitizeSequence(jsonDF$sequence[i]))
db$protein <- rbind(db$protein, x)
}
return(db)
}
# == 2.06 dbAddFeature() ===================================================
dbAddFeature <- function(db, jsonDF) {
# Add one or more feature entries to the database db.
# Parameters:
# db list a database created with dbInit()
# jsonDF data frame feature data imported into a data frame with
# fromJSON()
for (i in seq_len(nrow(jsonDF))) {
x <- data.frame(ID = dbAutoincrement(db$feature),
name = jsonDF$name[i],
description = jsonDF$description[i],
sourceDB = jsonDF$sourceDB[i],
accession = jsonDF$accession[i])
db$feature <- rbind(db$feature, x)
}
return(db)
}
# == 2.07 dbAddTaxonomy() ==================================================
dbAddTaxonomy <- function(db, jsonDF) {
# Add one or more taxonomy entries to the database db.
# Parameters:
# db list A database created with dbInit()
# jsonDF data frame Taxonomy data imported into a data frame with
# fromJSON()
for (i in seq_len(nrow(jsonDF))) {
x <- data.frame(
ID = jsonDF$ID[i],
species = jsonDF$species[i])
db$taxonomy <- rbind(db$taxonomy, x)
}
return(db)
}
# == 2.08 dbAddAnnotation() ================================================
dbAddAnnotation <- function(db, jsonDF) {
# Add one or more annotation entries to the database db.
# Parameters:
# db list a database created with dbInit()
# jsonDF data frame annotation data imported into a data frame with
# fromJSON()
for (i in seq_len(nrow(jsonDF))) {
sel <- jsonDF$pName[i] == db$protein$name
sel <- dbConfirmUnique(sel)
pID <- db$protein$ID[sel]
sel <- jsonDF$fName[i] == db$feature$name
sel <- dbConfirmUnique(sel)
fID <- db$feature$ID[sel]
x <- data.frame(ID = dbAutoincrement(db$annotation),
proteinID = pID,
featureID = fID,
start = as.integer(jsonDF$start[i]),
end = as.integer(jsonDF$end[i]))
db$annotation <- rbind(db$annotation, x)
}
return(db)
}
# == 2.09 dbFetchUniProtSeq() ==============================================
dbFetchUniProtSeq <- function(IDs) {
# Fetch a protein sequence from UniProt.
# Parameters:
# IDs char a vector of UniProt IDs (accession number)
# Value:
# char a vector of the same length as ID. It contains
# sequences where the retrieval was successful, NA where
# it was not successful. The elements are named with
# the ID, the header lines are set as attribute "header"
BASE <- "http://www.uniprot.org/uniprot/"
sq <- character()
hd <- character()
for (i in seq_along(IDs)) {
URL <- sprintf("%s%s.fasta", BASE, IDs[i])
response <- httr::GET(URL)
if (httr::status_code(response) == 200) {
s <- as.character(response)
s <- unlist(strsplit(s, "\n"))
x <- dbSanitizeSequence(s)
} else {
s <- ""
x <- NA
}
hd[i] <- s[1]
sq[i] <- x
}
names(sq) <- IDs
attr(sq, "headers") <- hd
return(sq)
}
if (FALSE) {
inp <- c("P79073", "P0000000", "A0A1W2TKZ7")
s <- dbFetchUniProtSeq(inp)
s[1:3]
str(s)
attr(s, "headers")[1]
}
# == 2.10 dbFetchPrositeFeatures() =========================================
dbFetchPrositeFeatures <- function(ID) {
# Fetch feature annotations from ScanProsite.
# Parameters:
# ID char a UniProt ID (accession number)
# Value:
# data frame uID char UniProt ID
# start num start of motif
# end num end of motif
# psID char PROSITE motif ID
# psName char PROSITE motif name
# psSeq char sequence annotated to the feature
# If the operation is not successful, a 0-length data frame is returned.
URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
response <- httr::POST(URL,
body = list(meta = "opt1",
meta1_protein = "opt1",
seq = ID,
skip = "on",
output = "tabular"))
myFeatures <- data.frame()
if (httr::status_code(response) == 200) {
lines <- unlist(strsplit(httr::content(response, "text"), "\\n"))
patt <- sprintf("\\|%s\\|", ID)
lines <- lines[grep(patt, lines)]
for (line in lines) {
tokens <- unlist(strsplit(line, "\\t|\\|"))
myFeatures <- rbind(myFeatures,
data.frame(uID = tokens[2],
start = as.numeric(tokens[4]),
end = as.numeric(tokens[5]),
psID = tokens[6],
psName = tokens[7],
psSeq = tokens[11]))
}
}
return(myFeatures)
}
if (FALSE) {
dbFetchPrositeFeatures("P33520") # RES1_SCHPO
}
# == 2.11 node2text() ======================================================
node2text <- function(doc, tag) {
# an extractor function for the contents of elements
# between given tags in an XML response.
# Contents of all matching elements is returned in
# a vector of strings.
path <- paste0("//", tag)
nodes <- xml2::xml_find_all(doc, path)
return(xml2::xml_text(nodes))
}
# == 2.12 dbFetchNCBItaxData() =============================================
dbFetchNCBItaxData <- function(ID) {
# Fetch feature taxID and Organism from the NCBI.
# Parameters:
# ID char a RefSeq ID (accession number)
# Value:
# data frame taxID num NCBI taxID
# organism char organism for this taxID
# If the operation is not successful, a 0-length data frame is returned.
eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
URL <- paste(eUtilsBase,
"esearch.fcgi?",
"db=protein",
"&term=", ID,
sep="")
myXML <- xml2::read_xml(URL)
GID <- node2text(myXML, "Id")
URL <- paste0(eUtilsBase,
"esummary.fcgi?",
"db=protein",
"&id=",
GID,
"&version=2.0")
myXML <- xml2::read_xml(URL)
x <- as.integer(node2text(myXML, "TaxId"))
y <- node2text(myXML, "Organism")
tID <- data.frame()
if (length(x) > 0 && length(y) > 0) {
tID <- data.frame(taxID = x, organism = y)
}
return(tID)
}
# == 2.13 UniProtIDmap() ===================================================
UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
# Use UniProt ID mapping service to map one or more IDs
# Parameters:
# s char A string of white-space separated IDs
# mapFrom char the database in which the IDs in s are valid.
# Default is RefSeq protein
# mapTo char the database in which the target IDs are valid.
# Default is UniProtKB
# Value
# A data frame of mapped IDs, with column names From and To, or an
# empty data frame if the mapping was unsuccessful. No rows are returned
# for IDs that are not mapped.
# Initialize curl
httr::set_config(httr::config(http_version = 0))
URL <- "https://www.uniprot.org/uploadlists/"
response <- httr::POST(URL,
body = list(from = mapFrom,
to = mapTo,
format = "tab",
query = s))
if (httr::status_code(response) == 200) { # 200: oK
myMap <- read.delim(file = textConnection(httr::content(response)),
sep = "\t")
colnames(myMap) <- c("From", "To")
} else {
myMap <- data.frame()
warning(paste("No uniProt ID mapping returned:",
"server sent status",
httr::status_code(response)))
}
return(myMap)
}
# = 3 TESTS ===============================================================
if (FALSE) {
if (! requireNamespace("testthat", quietly = TRUE)) {
install.packages("testthat")
}
# ToDo: test everything here
}
# [END]