bch441-work-abc-units/scripts/ABC-dbUtilities.R

# ABC-dbUtilities.R

# database utilities for ABC learning units
#
# ==============================================================================
#


# ====== PACKAGES ==============================================================


if (! require(jsonlite, quietly = TRUE)) {
  install.packages("jsonlite")
  library(jsonlite)
}


if (! require(httr, quietly = TRUE)) {
  install.packages("httr")
  library(httr)
}


if (! require(xml2, quietly = TRUE)) {
  install.packages("xml2")
  library(xml2)
}


# ====== FUNCTIONS =============================================================


dbSanitizeSequence <- function(s, unambiguous = TRUE) {
  # Remove FASTA header lines, if any,
  # flatten any structure that s has,
  # remove all non-letters except "-" (gap) and "*" (stop),
  # convert to uppercase.
  #
  # Parameters:
  #   s  chr  A DNA or protein sequence plus other characters
  #   unambiguous  bool  if TRUE, stop() if any letter remaining after
  #                      processing matches an ambiguity code. This is likely
  #                      due to inadvertently including meta-data, such as
  #                      a FASTA header, with the sequence.
  # Note: since U is an ambiguity code for amino acid sequences, you need
  #         to set unambiguous = FALSE to process RNA sequences with Uracil.
  # Value: chr   a valid, uppercase, amino acid sequence
  #

  s <- as.character(unlist(s))    # convert complex object to plain chr vector
  s <- unlist(strsplit(s, "\n"))  # split up at linebreaks, if any
  s <- s[! grepl("^>", s)]        # drop all lines beginning">" (FASTA header)
  s <- paste(s, collapse="")      # combine into single string
  s <- toupper(gsub("[^a-zA-Z*-]", "", s))
  if (unambiguous) {
    amb <- "([bjouxzBJOUXZ])"  # parentheses capture the match
    ambChar <- unlist(regmatches(s, regexec(amb, s)))[1]
    if (! is.na(ambChar)) {
      stop(paste("Input contains ambiguous codes(s): \"",
                 ambChar, "\".", sep=""))
    }
  }
  return(s)
}


dbConfirmUnique <- function(x) {
  # x is a vector of logicals.
  # returns x if x has exactly one TRUE element.
  # stop() otherwise.

  if (any(!is.logical(x))) {
    stop("PANIC: Input is not a boolean vector.")
  } else if (sum(x) == 0) {
    stop("PANIC: No match found.")
  } else  if (sum(x) > 1) {
    stop("PANIC: More than one match found.")
  } else {
    return(x)
  }
}


dbInit <- function() {
  # Return an empty instance of the protein database

  db <- list()

  db$protein <- data.frame(
    ID = numeric(),
    name = character(),
    RefSeqID = character(),
    UniProtID = character(),
    taxonomyID = numeric(),
    sequence = character(),
    stringsAsFactors = FALSE)

  db$taxonomy <- data.frame(
    ID = numeric(),
    species = character(),
    stringsAsFactors = FALSE)


  db$annotation <- data.frame(
    ID = numeric(),
    proteinID = numeric(),
    featureID = numeric(),
    start = numeric(),
    end = numeric(),
    stringsAsFactors = FALSE)

  db$feature <- data.frame(
    ID = numeric(),
    name = character(),
    description = character(),
    sourceDB = character(),
    accession = character(),
    stringsAsFactors = FALSE)

  return(db)
}


dbAutoincrement <- function(tb) {
  # Return a unique integer that can be used as a primary key
  # Value:
  #   num  a number one-larger than the largest current value in table$ID
  if (length(tb$ID) == 0) {
    return(1)
  } else {
    return(max(tb$ID) + 1)
  }
}


dbAddProtein <- function(db, jsonDF) {
  # Add one or more protein entries to the database db.
  # Parameters:
  #     db   list   a database created with dbInit()
  #     jsonDF  data frame  protein data imported into a data frame with
  #                           fromJSON()
  for (i in seq_len(nrow(jsonDF))) {
    x <- data.frame(ID          = dbAutoincrement(db$protein),
                    name        = jsonDF$name[i],
                    RefSeqID    = jsonDF$RefSeqID[i],
                    UniProtID   = jsonDF$UniProtID[i],
                    taxonomyID  = jsonDF$taxonomyID[i],
                    sequence    = dbSanitizeSequence(jsonDF$sequence[i]),
                    stringsAsFactors = FALSE)
    db$protein <- rbind(db$protein, x)
  }
  return(db)
}


dbAddFeature <- function(db, jsonDF) {
  # Add one or more feature entries to the database db.
  # Parameters:
  #     db   list   a database created with dbInit()
  #     jsonDF  data frame  feature data imported into a data frame with
  #                           fromJSON()
  for (i in seq_len(nrow(jsonDF))) {
    x <- data.frame(ID          = dbAutoincrement(db$feature),
                    name        = jsonDF$name[i],
                    description = jsonDF$description[i],
                    sourceDB    = jsonDF$sourceDB[i],
                    accession   = jsonDF$accession[i],
                    stringsAsFactors = FALSE)
    db$feature <- rbind(db$feature, x)
  }
  return(db)
}


dbAddTaxonomy <- function(db, jsonDF) {
  # Add one or more taxonomy entries to the database db.
  # Parameters:
  #     db      list         A database created with dbInit()
  #     jsonDF  data frame   Taxonomy data imported into a data frame with
  #                            fromJSON()
  for (i in seq_len(nrow(jsonDF))) {
    x <- data.frame(
      ID =  jsonDF$ID[i],
      species = jsonDF$species[i],
      stringsAsFactors = FALSE)
    db$taxonomy <- rbind(db$taxonomy, x)
  }
  return(db)
}

dbAddAnnotation <- function(db, jsonDF) {
  # Add one or more annotation entries to the database db.
  # Parameters:
  #     db   list   a database created with dbInit()
  #     jsonDF  data frame  annotation data imported into a data frame with
  #                           fromJSON()
  for (i in seq_len(nrow(jsonDF))) {

    sel <- jsonDF$pName[i] == db$protein$name
    sel <- dbConfirmUnique(sel)
    pID <- db$protein$ID[sel]

    sel <- jsonDF$fName[i] == db$feature$name
    sel <- dbConfirmUnique(sel)
    fID <- db$feature$ID[sel]

    x <- data.frame(ID        = dbAutoincrement(db$annotation),
                    proteinID = pID,
                    featureID = fID,
                    start     = as.integer(jsonDF$start[i]),
                    end       = as.integer(jsonDF$end[i]),
                    stringsAsFactors = FALSE)
    db$annotation <- rbind(db$annotation, x)
  }
  return(db)
}


dbFetchUniProtSeq <- function(ID) {
  # Fetch a protein sequence from UniProt.
  # Parameters:
  #     ID   char   a UniProt ID (accession number)
  # Value:
  #     char        the sequence
  # If the operation is not successful, a 0-length string is returned

  URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", ID)

  response <- GET(URL)

  mySeq <- character()
  if (status_code(response) == 200) {
    x <- as.character(response)
    x <- strsplit(x, "\n")
    mySeq <- dbSanitizeSequence(x)
  }

  return(mySeq)
}


dbFetchPrositeFeatures <- function(ID) {
  # Fetch feature annotations from ScanProsite.
  # Parameters:
  #     ID   char   a UniProt ID (accession number)
  # Value:
  #     data frame     uID    char  UniProt ID
  #                    start  num   start of motif
  #                    end    num   end of motif
  #                    psID   char  PROSITE motif ID
  #                    psName char  PROSITE motif name
  # If the operation is not successful, a 0-length data frame is returned.

  URL <- "http://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"

  response <- POST(URL,
                   body = list(meta = "opt1",
                               meta1_protein = "opt1",
                               seq = ID,
                               skip = "on",
                               output = "tabular"))

  myFeatures <- data.frame()
  if (status_code(response) == 200) {

    lines <- unlist(strsplit(content(response, "text"), "\\n"))

    patt <- sprintf("\\|%s\\|", UniProtID)
    lines <- lines[grep(patt, lines)]

    for (line in lines) {
      tokens <- unlist(strsplit(line, "\\t|\\|"))
      myFeatures <- rbind(myFeatures,
                          data.frame(uID   =  tokens[2],
                                     start =  as.numeric(tokens[4]),
                                     end   =  as.numeric(tokens[5]),
                                     psID  =  tokens[6],
                                     psName = tokens[7],
                                     stringsAsFactors = FALSE))
    }
  }
  return(myFeatures)
}


node2text <- function(doc, tag) {
  # an extractor function for the contents of elements
  # between given tags in an XML response.
  # Contents of all matching elements is returned in
  # a vector of strings.
  path <- paste0("//", tag)
  nodes <- xml_find_all(doc, path)
  return(xml_text(nodes))
}


dbFetchNCBItaxData <- function(ID) {
  # Fetch feature taxID and Organism from the NCBI.
  # Parameters:
  #     ID   char   a RefSeq ID (accession number)
  # Value:
  #     data frame     taxID      num    NCBI taxID
  #                    organism   char   organism for this taxID
  # If the operation is not successful, a 0-length data frame is returned.

  eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
  URL <- paste(eUtilsBase,
               "esearch.fcgi?",
               "db=protein",
               "&term=", ID,
               sep="")
  myXML <- read_xml(URL)
  GID <- node2text(myXML, "Id")

  URL <- paste0(eUtilsBase,
                "esummary.fcgi?",
                "db=protein",
                "&id=",
                GID,
                "&version=2.0")
  myXML <- read_xml(URL)

  x <- as.integer(node2text(myXML, "TaxId"))
  y <- node2text(myXML, "Organism")

  tID <- data.frame()
  if (length(x) > 0 && length(y) > 0) {
    tID <- data.frame(taxID = x, organism = y)
  }
  return(tID)
}


UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
  # Use UniProt ID mapping service to map one or more IDs
  # Parameters:
  #    s        char  A string of white-space separated IDs
  #    mapFrom  char  the database in which the IDs in s are valid.
  #                     Default is RefSeq protein
  #    mapTo    char  the database in which the target IDs are valid.
  #                     Default is UniProtKB
  # Value
  #    A data frame of mapped IDs, with column names From and To, or an
  #    empty data frame if the mapping was unsuccessful. No rows are returned
  #    for IDs that are not mapped.

  URL <- "http://www.uniprot.org/mapping/"
  response <- POST(URL,
                   body = list(from = mapFrom,
                               to = mapTo,
                               format = "tab",
                               query = s))

  if (status_code(response) == 200) { # 200: oK
    myMap <- read.delim(file = textConnection(content(response)),
                        sep = "\t",
                        stringsAsFactors = FALSE)
  } else {
    myMap <- data.frame()
    warning(paste("No uniProt ID mapping returned:",
                  "server sent status",
                  status_code(response)))
  }

  return(myMap)
}


# [END]
new scripts 2017-09-25 05:31:40 +00:00			`# ABC-dbUtilities.R`

			`# database utilities for ABC learning units`
			`#`
			`# ==============================================================================`
			`#`


			`# ====== PACKAGES ==============================================================`


add httr and xml2 package, add scripted data download functions 2017-10-06 12:51:12 +00:00			`if (! require(jsonlite, quietly = TRUE)) {`
new scripts 2017-09-25 05:31:40 +00:00			`install.packages("jsonlite")`
			`library(jsonlite)`
			`}`


coding style: 2017-10-29 03:08:31 +00:00			`if (! require(httr, quietly = TRUE)) {`
add httr and xml2 package, add scripted data download functions 2017-10-06 12:51:12 +00:00			`install.packages("httr")`
			`library(httr)`
			`}`


coding style: 2017-10-29 03:08:31 +00:00			`if (! require(xml2, quietly = TRUE)) {`
add httr and xml2 package, add scripted data download functions 2017-10-06 12:51:12 +00:00			`install.packages("xml2")`
			`library(xml2)`
			`}`


new scripts 2017-09-25 05:31:40 +00:00			`# ====== FUNCTIONS =============================================================`


			`dbSanitizeSequence <- function(s, unambiguous = TRUE) {`
update function to remove FASTA headers (line-break terminated substrings that begin with a ">") 2017-09-29 16:25:00 +00:00			`# Remove FASTA header lines, if any,`
			`# flatten any structure that s has,`
Update: FASTA files may contain gap and stop characters 2017-10-14 22:26:16 +00:00			`# remove all non-letters except "-" (gap) and "*" (stop),`
update function to remove FASTA headers (line-break terminated substrings that begin with a ">") 2017-09-29 16:25:00 +00:00			`# convert to uppercase.`
new scripts 2017-09-25 05:31:40 +00:00			`#`
			`# Parameters:`
			`# s chr A DNA or protein sequence plus other characters`
			`# unambiguous bool if TRUE, stop() if any letter remaining after`
			`# processing matches an ambiguity code. This is likely`
			`# due to inadvertently including meta-data, such as`
			`# a FASTA header, with the sequence.`
			`# Note: since U is an ambiguity code for amino acid sequences, you need`
			`# to set unambiguous = FALSE to process RNA sequences with Uracil.`
			`# Value: chr a valid, uppercase, amino acid sequence`
			`#`
update function to remove FASTA headers (line-break terminated substrings that begin with a ">") 2017-09-29 16:25:00 +00:00
			`s <- as.character(unlist(s)) # convert complex object to plain chr vector`
			`s <- unlist(strsplit(s, "\n")) # split up at linebreaks, if any`
			`s <- s[! grepl("^>", s)] # drop all lines beginning">" (FASTA header)`
			`s <- paste(s, collapse="") # combine into single string`
Update: FASTA files may contain gap and stop characters 2017-10-14 22:26:16 +00:00			`s <- toupper(gsub("[^a-zA-Z*-]", "", s))`
new scripts 2017-09-25 05:31:40 +00:00			`if (unambiguous) {`
			`amb <- "([bjouxzBJOUXZ])" # parentheses capture the match`
			`ambChar <- unlist(regmatches(s, regexec(amb, s)))[1]`
			`if (! is.na(ambChar)) {`
			`stop(paste("Input contains ambiguous codes(s): \"",`
			`ambChar, "\".", sep=""))`
			`}`
			`}`
			`return(s)`
			`}`


			`dbConfirmUnique <- function(x) {`
			`# x is a vector of logicals.`
			`# returns x if x has exactly one TRUE element.`
			`# stop() otherwise.`

			`if (any(!is.logical(x))) {`
			`stop("PANIC: Input is not a boolean vector.")`
			`} else if (sum(x) == 0) {`
			`stop("PANIC: No match found.")`
			`} else if (sum(x) > 1) {`
			`stop("PANIC: More than one match found.")`
			`} else {`
			`return(x)`
			`}`
			`}`


			`dbInit <- function() {`
			`# Return an empty instance of the protein database`

			`db <- list()`

			`db$protein <- data.frame(`
			`ID = numeric(),`
			`name = character(),`
			`RefSeqID = character(),`
			`UniProtID = character(),`
			`taxonomyID = numeric(),`
			`sequence = character(),`
			`stringsAsFactors = FALSE)`

			`db$taxonomy <- data.frame(`
			`ID = numeric(),`
			`species = character(),`
			`stringsAsFactors = FALSE)`


			`db$annotation <- data.frame(`
			`ID = numeric(),`
			`proteinID = numeric(),`
			`featureID = numeric(),`
			`start = numeric(),`
			`end = numeric(),`
			`stringsAsFactors = FALSE)`

			`db$feature <- data.frame(`
			`ID = numeric(),`
			`name = character(),`
			`description = character(),`
			`sourceDB = character(),`
			`accession = character(),`
			`stringsAsFactors = FALSE)`

			`return(db)`
			`}`


			`dbAutoincrement <- function(tb) {`
			`# Return a unique integer that can be used as a primary key`
			`# Value:`
			`# num a number one-larger than the largest current value in table$ID`
			`if (length(tb$ID) == 0) {`
			`return(1)`
			`} else {`
			`return(max(tb$ID) + 1)`
			`}`
			`}`


			`dbAddProtein <- function(db, jsonDF) {`
			`# Add one or more protein entries to the database db.`
			`# Parameters:`
			`# db list a database created with dbInit()`
			`# jsonDF data frame protein data imported into a data frame with`
			`# fromJSON()`
			`for (i in seq_len(nrow(jsonDF))) {`
			`x <- data.frame(ID = dbAutoincrement(db$protein),`
			`name = jsonDF$name[i],`
			`RefSeqID = jsonDF$RefSeqID[i],`
			`UniProtID = jsonDF$UniProtID[i],`
			`taxonomyID = jsonDF$taxonomyID[i],`
			`sequence = dbSanitizeSequence(jsonDF$sequence[i]),`
			`stringsAsFactors = FALSE)`
			`db$protein <- rbind(db$protein, x)`
			`}`
			`return(db)`
			`}`


			`dbAddFeature <- function(db, jsonDF) {`
			`# Add one or more feature entries to the database db.`
			`# Parameters:`
			`# db list a database created with dbInit()`
			`# jsonDF data frame feature data imported into a data frame with`
			`# fromJSON()`
			`for (i in seq_len(nrow(jsonDF))) {`
			`x <- data.frame(ID = dbAutoincrement(db$feature),`
			`name = jsonDF$name[i],`
			`description = jsonDF$description[i],`
			`sourceDB = jsonDF$sourceDB[i],`
			`accession = jsonDF$accession[i],`
			`stringsAsFactors = FALSE)`
			`db$feature <- rbind(db$feature, x)`
			`}`
			`return(db)`
			`}`


			`dbAddTaxonomy <- function(db, jsonDF) {`
			`# Add one or more taxonomy entries to the database db.`
			`# Parameters:`
			`# db list A database created with dbInit()`
			`# jsonDF data frame Taxonomy data imported into a data frame with`
			`# fromJSON()`
			`for (i in seq_len(nrow(jsonDF))) {`
			`x <- data.frame(`
			`ID = jsonDF$ID[i],`
			`species = jsonDF$species[i],`
			`stringsAsFactors = FALSE)`
			`db$taxonomy <- rbind(db$taxonomy, x)`
			`}`
			`return(db)`
			`}`

			`dbAddAnnotation <- function(db, jsonDF) {`
			`# Add one or more annotation entries to the database db.`
			`# Parameters:`
			`# db list a database created with dbInit()`
			`# jsonDF data frame annotation data imported into a data frame with`
			`# fromJSON()`
			`for (i in seq_len(nrow(jsonDF))) {`

			`sel <- jsonDF$pName[i] == db$protein$name`
			`sel <- dbConfirmUnique(sel)`
			`pID <- db$protein$ID[sel]`

			`sel <- jsonDF$fName[i] == db$feature$name`
			`sel <- dbConfirmUnique(sel)`
			`fID <- db$feature$ID[sel]`

			`x <- data.frame(ID = dbAutoincrement(db$annotation),`
			`proteinID = pID,`
			`featureID = fID,`
			`start = as.integer(jsonDF$start[i]),`
			`end = as.integer(jsonDF$end[i]),`
			`stringsAsFactors = FALSE)`
			`db$annotation <- rbind(db$annotation, x)`
			`}`
			`return(db)`
			`}`


add httr and xml2 package, add scripted data download functions 2017-10-06 12:51:12 +00:00			`dbFetchUniProtSeq <- function(ID) {`
			`# Fetch a protein sequence from UniProt.`
			`# Parameters:`
			`# ID char a UniProt ID (accession number)`
			`# Value:`
			`# char the sequence`
			`# If the operation is not successful, a 0-length string is returned`

			`URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", ID)`

			`response <- GET(URL)`

			`mySeq <- character()`
			`if (status_code(response) == 200) {`
			`x <- as.character(response)`
			`x <- strsplit(x, "\n")`
			`mySeq <- dbSanitizeSequence(x)`
			`}`

			`return(mySeq)`
			`}`


			`dbFetchPrositeFeatures <- function(ID) {`
			`# Fetch feature annotations from ScanProsite.`
			`# Parameters:`
			`# ID char a UniProt ID (accession number)`
			`# Value:`
			`# data frame uID char UniProt ID`
			`# start num start of motif`
			`# end num end of motif`
			`# psID char PROSITE motif ID`
			`# psName char PROSITE motif name`
			`# If the operation is not successful, a 0-length data frame is returned.`

			`URL <- "http://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"`

			`response <- POST(URL,`
			`body = list(meta = "opt1",`
			`meta1_protein = "opt1",`
			`seq = ID,`
			`skip = "on",`
			`output = "tabular"))`

			`myFeatures <- data.frame()`
			`if (status_code(response) == 200) {`

			`lines <- unlist(strsplit(content(response, "text"), "\\n"))`

			`patt <- sprintf("\\\|%s\\\|", UniProtID)`
			`lines <- lines[grep(patt, lines)]`

			`for (line in lines) {`
			`tokens <- unlist(strsplit(line, "\\t\|\\\|"))`
			`myFeatures <- rbind(myFeatures,`
			`data.frame(uID = tokens[2],`
			`start = as.numeric(tokens[4]),`
			`end = as.numeric(tokens[5]),`
			`psID = tokens[6],`
			`psName = tokens[7],`
			`stringsAsFactors = FALSE))`
			`}`
			`}`
			`return(myFeatures)`
			`}`


			`node2text <- function(doc, tag) {`
			`# an extractor function for the contents of elements`
			`# between given tags in an XML response.`
			`# Contents of all matching elements is returned in`
			`# a vector of strings.`
			`path <- paste0("//", tag)`
			`nodes <- xml_find_all(doc, path)`
			`return(xml_text(nodes))`
			`}`


			`dbFetchNCBItaxData <- function(ID) {`
			`# Fetch feature taxID and Organism from the NCBI.`
			`# Parameters:`
			`# ID char a RefSeq ID (accession number)`
			`# Value:`
			`# data frame taxID num NCBI taxID`
			`# organism char organism for this taxID`
			`# If the operation is not successful, a 0-length data frame is returned.`

			`eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"`
			`URL <- paste(eUtilsBase,`
			`"esearch.fcgi?",`
			`"db=protein",`
			`"&term=", ID,`
			`sep="")`
			`myXML <- read_xml(URL)`
			`GID <- node2text(myXML, "Id")`

			`URL <- paste0(eUtilsBase,`
			`"esummary.fcgi?",`
			`"db=protein",`
			`"&id=",`
			`GID,`
			`"&version=2.0")`
			`myXML <- read_xml(URL)`

			`x <- as.integer(node2text(myXML, "TaxId"))`
			`y <- node2text(myXML, "Organism")`

			`tID <- data.frame()`
			`if (length(x) > 0 && length(y) > 0) {`
			`tID <- data.frame(taxID = x, organism = y)`
			`}`
			`return(tID)`
			`}`

Add UniProtIDmap() function 2017-10-08 19:59:56 +00:00

			`UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {`
			`# Use UniProt ID mapping service to map one or more IDs`
			`# Parameters:`
			`# s char A string of white-space separated IDs`
			`# mapFrom char the database in which the IDs in s are valid.`
			`# Default is RefSeq protein`
			`# mapTo char the database in which the target IDs are valid.`
			`# Default is UniProtKB`
			`# Value`
			`# A data frame of mapped IDs, with column names From and To, or an`
			`# empty data frame if the mapping was unsuccessful. No rows are returned`
			`# for IDs that are not mapped.`

			`URL <- "http://www.uniprot.org/mapping/"`
			`response <- POST(URL,`
			`body = list(from = mapFrom,`
			`to = mapTo,`
			`format = "tab",`
			`query = s))`

			`if (status_code(response) == 200) { # 200: oK`
			`myMap <- read.delim(file = textConnection(content(response)),`
			`sep = "\t",`
			`stringsAsFactors = FALSE)`
			`} else {`
			`myMap <- data.frame()`
			`warning(paste("No uniProt ID mapping returned:",`
			`"server sent status",`
			`status_code(response)))`
			`}`

			`return(myMap)`
			`}`



new scripts 2017-09-25 05:31:40 +00:00			`# [END]`