Add code and utility functions for database export of protein annotations to JSON - towards sharing annotations on the Student Wiki Public page
This commit is contained in:
parent
3bee83495f
commit
4d071cf8d5
@ -3,12 +3,14 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-FUNC-Domain_annotation unit.
|
# R code accompanying the BIN-FUNC-Domain_annotation unit.
|
||||||
#
|
#
|
||||||
# Version: 1.2
|
# Version: 1.3
|
||||||
#
|
#
|
||||||
# Date: 2017-11 - 2020-10
|
# Date: 2017-11 - 2020-10
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.3 Add code for database export to JSON and instructions
|
||||||
|
# for uploading annotations to the Public Student Wiki page
|
||||||
# 1.2 Consistently: data in ./myScripts/ ;
|
# 1.2 Consistently: data in ./myScripts/ ;
|
||||||
# begin SHARING DATA section
|
# begin SHARING DATA section
|
||||||
# 1.1 2020 Updates
|
# 1.1 2020 Updates
|
||||||
@ -31,12 +33,14 @@
|
|||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ---------------------------------------------------------------------
|
#TOC> ---------------------------------------------------------------------
|
||||||
#TOC> 1 Update your database script 42
|
#TOC> 1 Update your database script 48
|
||||||
#TOC> 1.1 Preparing an annotation file ... 49
|
#TOC> 1.1 Preparing an annotation file ... 55
|
||||||
#TOC> 1.1.1 BEFORE "BIN-ALI-Optimal_sequence_alignment" 52
|
#TOC> 1.1.1 BEFORE "BIN-ALI-Optimal_sequence_alignment" 58
|
||||||
#TOC> 1.1.2 AFTER "BIN-ALI-Optimal_sequence_alignment" 97
|
#TOC> 1.1.2 AFTER "BIN-ALI-Optimal_sequence_alignment" 106
|
||||||
#TOC> 1.2 Execute and Validate 124
|
#TOC> 1.2 Execute and Validate 133
|
||||||
#TOC> 2 Plot Annotations 149
|
#TOC> 2 Plot Annotations 158
|
||||||
|
#TOC> 3 SHARING DATA 283
|
||||||
|
#TOC> 3.1 Post MBP1_MYSPE as JSON data 298
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -276,16 +280,47 @@ par(oPar) # reset the plot parameters
|
|||||||
# It would be better to align the motif borders, at least approximately (not
|
# It would be better to align the motif borders, at least approximately (not
|
||||||
# all proteins have all motifs). How would you go about doing that?
|
# all proteins have all motifs). How would you go about doing that?
|
||||||
|
|
||||||
# = 1 SHARING DATA ======
|
# = 3 SHARING DATA ========================================================
|
||||||
|
|
||||||
# It's particularly interesting to compare such annotations across many
|
# It's particularly interesting to compare such annotations across many
|
||||||
# homologous proteins. I have created a file on the Student Wiki that you can
|
# homologous proteins. I have created a page on the Student Wiki () that you can
|
||||||
# edit, and then download the data from the entire class directly to your
|
# edit, and then download the data from the entire class directly to your
|
||||||
# RStudio project.
|
# RStudio project.
|
||||||
#
|
#
|
||||||
|
|
||||||
|
# I have provided a function that extracts all information that refers to a
|
||||||
|
# single protein from the database, and prints it out as well-formatted JSON,
|
||||||
|
# suitable to be pasted into our shareable Wiki-page. There is a fair amount of
|
||||||
|
# bookkeeping involved, but the code is not otherwise very enlightening so I
|
||||||
|
# will spare you the details - it's in "./scripts/ABC-dbUtilities.R" if you
|
||||||
|
# would want to have a look.
|
||||||
|
|
||||||
|
# == 3.1 Post MBP1_MYSPE as JSON data ======================================
|
||||||
|
|
||||||
# Task:
|
# Task:
|
||||||
# =====
|
# =====
|
||||||
# TBC ...
|
# 1: Run the following code:
|
||||||
|
|
||||||
|
cat("{{Vspace}}",
|
||||||
|
"<!-- ==== BEGIN PROTEIN ==== -->",
|
||||||
|
"<pre>",
|
||||||
|
dbProt2JSON(sprintf("MBP1_%s", biCode(MYSPE))),
|
||||||
|
"</pre>",
|
||||||
|
"<!-- ===== END PROTEIN ====== -->",
|
||||||
|
"", sep = "\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
# 2: Copy the entire output,
|
||||||
|
# 3: Navigate to
|
||||||
|
# http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public
|
||||||
|
# ... edit the page, and paste your output at the top.
|
||||||
|
# 4: Save your edits.
|
||||||
|
|
||||||
|
# Next, once we have collected a number of protein annotations, we can access
|
||||||
|
# the page and import the data into our database.
|
||||||
|
#
|
||||||
|
# Code to come soon ...
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# [END]
|
# [END]
|
||||||
|
@ -1,6 +1,30 @@
|
|||||||
# tocID <- "scripts/ABC-dbUtilities.R"
|
# tocID <- "scripts/ABC-dbUtilities.R"
|
||||||
#
|
#
|
||||||
# database utilities for ABC learning units
|
# Purpose: Database utilities for ABC learning units.
|
||||||
|
#
|
||||||
|
# Version 2.1
|
||||||
|
#
|
||||||
|
# Date: 2017-11 - 2020-10
|
||||||
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
|
#
|
||||||
|
# Versions:
|
||||||
|
# 2.1 Add JSON export functions
|
||||||
|
# 2.0 Test all JSON import and prevent addition of duplicates. This
|
||||||
|
# is necessary for import of data from the public page
|
||||||
|
# 1.1 2020 Updates
|
||||||
|
# 1.0 Live version 2017
|
||||||
|
#
|
||||||
|
# Notes:
|
||||||
|
# There are no functions to modify or delete entries. To do either,
|
||||||
|
# recreate the database with correct data in the creation script. This is the
|
||||||
|
# preferred way that ensures the entire database can be reproduced by
|
||||||
|
# source()'ing its generating script.
|
||||||
|
#
|
||||||
|
# Inserting data goes only through the very most minimal validation steps. For
|
||||||
|
# production applications, more validation would need to be added, as well
|
||||||
|
# as an overall validation of database integrity
|
||||||
|
#
|
||||||
|
# ToDo:
|
||||||
#
|
#
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
|
|
||||||
@ -8,49 +32,56 @@
|
|||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> -------------------------------------------------
|
#TOC> -------------------------------------------------------
|
||||||
#TOC> 1 PACKAGES 32
|
#TOC> 1 INITIALISATIONS AND PARAMETERS 60
|
||||||
#TOC> 2 FUNCTIONS 50
|
#TOC> 2 PACKAGES 65
|
||||||
#TOC> 2.01 dbSanitizeSequence() 53
|
#TOC> 3 FUNCTIONS 81
|
||||||
#TOC> 2.02 dbConfirmUnique() 88
|
#TOC> 3.01 dbSanitizeSequence() 84
|
||||||
#TOC> 2.03 dbInit() 106
|
#TOC> 3.02 dbConfirmUnique() 119
|
||||||
#TOC> 2.04 dbAutoincrement() 147
|
#TOC> 3.03 dbInit() 137
|
||||||
#TOC> 2.05 dbAddProtein() 160
|
#TOC> 3.04 dbAutoincrement() 177
|
||||||
#TOC> 2.06 dbAddFeature() 180
|
#TOC> 3.05 dbAddProtein() 190
|
||||||
#TOC> 2.07 dbAddTaxonomy() 199
|
#TOC> 3.06 dbAddFeature() 222
|
||||||
#TOC> 2.08 dbAddAnnotation() 215
|
#TOC> 3.07 dbAddTaxonomy() 253
|
||||||
#TOC> 2.09 dbFetchUniProtSeq() 243
|
#TOC> 3.08 dbAddAnnotation() 288
|
||||||
#TOC> 2.10 dbFetchPrositeFeatures() 289
|
#TOC> 3.09 dbFetchUniProtSeq() 335
|
||||||
#TOC> 2.11 node2text() 339
|
#TOC> 3.10 dbFetchPrositeFeatures() 381
|
||||||
#TOC> 2.12 dbFetchNCBItaxData() 351
|
#TOC> 3.11 node2text() 431
|
||||||
#TOC> 2.13 UniProtIDmap() 390
|
#TOC> 3.12 dbFetchNCBItaxData() 443
|
||||||
#TOC> 3 TESTS 429
|
#TOC> 3.13 UniProtIDmap() 482
|
||||||
|
#TOC> 3.14 dbProt2JSON() 521
|
||||||
|
#TOC> 3.15 dbSeq2JSON() 606
|
||||||
|
#TOC> 3.16 dbRow2JSON() 636
|
||||||
|
#TOC> 4 TESTS 656
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
# = 1 PACKAGES ============================================================
|
# = 1 INITIALISATIONS AND PARAMETERS ======================================
|
||||||
|
|
||||||
|
doTESTS <- FALSE # run tests if TRUE
|
||||||
|
|
||||||
|
|
||||||
|
# = 2 PACKAGES ============================================================
|
||||||
|
|
||||||
|
|
||||||
if (! requireNamespace("jsonlite", quietly = TRUE)) {
|
if (! requireNamespace("jsonlite", quietly = TRUE)) {
|
||||||
install.packages("jsonlite")
|
install.packages("jsonlite")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (! requireNamespace("httr", quietly = TRUE)) {
|
if (! requireNamespace("httr", quietly = TRUE)) {
|
||||||
install.packages("httr")
|
install.packages("httr")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (! requireNamespace("xml2", quietly = TRUE)) {
|
if (! requireNamespace("xml2", quietly = TRUE)) {
|
||||||
install.packages("xml2")
|
install.packages("xml2")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# = 2 FUNCTIONS ===========================================================
|
# = 3 FUNCTIONS ===========================================================
|
||||||
|
|
||||||
|
|
||||||
# == 2.01 dbSanitizeSequence() =============================================
|
# == 3.01 dbSanitizeSequence() =============================================
|
||||||
dbSanitizeSequence <- function(s, unambiguous = TRUE) {
|
dbSanitizeSequence <- function(s, unambiguous = TRUE) {
|
||||||
# Remove FASTA header lines, if any,
|
# Remove FASTA header lines, if any,
|
||||||
# flatten any structure that s has,
|
# flatten any structure that s has,
|
||||||
@ -85,7 +116,7 @@ dbSanitizeSequence <- function(s, unambiguous = TRUE) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# == 2.02 dbConfirmUnique() ================================================
|
# == 3.02 dbConfirmUnique() ================================================
|
||||||
dbConfirmUnique <- function(x) {
|
dbConfirmUnique <- function(x) {
|
||||||
# x is a vector of logicals.
|
# x is a vector of logicals.
|
||||||
# returns x if x has exactly one TRUE element.
|
# returns x if x has exactly one TRUE element.
|
||||||
@ -103,10 +134,10 @@ dbConfirmUnique <- function(x) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# == 2.03 dbInit() =========================================================
|
# == 3.03 dbInit() =========================================================
|
||||||
dbInit <- function() {
|
dbInit <- function() {
|
||||||
# Return an empty instance of the protein database
|
# Return an empty instance of the protein database
|
||||||
# Open the link and study the schema:
|
# The schema is here:
|
||||||
# https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0
|
# https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0
|
||||||
|
|
||||||
db <- list()
|
db <- list()
|
||||||
@ -125,7 +156,6 @@ dbInit <- function() {
|
|||||||
ID = numeric(),
|
ID = numeric(),
|
||||||
species = character())
|
species = character())
|
||||||
|
|
||||||
|
|
||||||
db$annotation <- data.frame(
|
db$annotation <- data.frame(
|
||||||
ID = numeric(),
|
ID = numeric(),
|
||||||
proteinID = numeric(),
|
proteinID = numeric(),
|
||||||
@ -144,7 +174,7 @@ dbInit <- function() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# == 2.04 dbAutoincrement() ================================================
|
# == 3.04 dbAutoincrement() ================================================
|
||||||
dbAutoincrement <- function(tb) {
|
dbAutoincrement <- function(tb) {
|
||||||
# Return a unique integer that can be used as a primary key
|
# Return a unique integer that can be used as a primary key
|
||||||
# Value:
|
# Value:
|
||||||
@ -157,14 +187,25 @@ dbAutoincrement <- function(tb) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# == 2.05 dbAddProtein() ===================================================
|
# == 3.05 dbAddProtein() ===================================================
|
||||||
dbAddProtein <- function(db, jsonDF) {
|
dbAddProtein <- function(db, jsonDF) {
|
||||||
# Add one or more protein entries to the database db.
|
# Add one or more protein entries to the database db if a protein with the
|
||||||
|
# same name does not yet exist. This enforces that protein names are unique.
|
||||||
# Parameters:
|
# Parameters:
|
||||||
# db list a database created with dbInit()
|
# db list a database created with dbInit()
|
||||||
# jsonDF data frame protein data imported into a data frame with
|
# jsonDF data frame protein data imported into a data frame with
|
||||||
# fromJSON()
|
# fromJSON()
|
||||||
|
|
||||||
for (i in seq_len(nrow(jsonDF))) {
|
for (i in seq_len(nrow(jsonDF))) {
|
||||||
|
if (jsonDF$name[i] %in% db$protein$name) {
|
||||||
|
cat(sprintf("Note: Protein No. %d in the input is \"%s\", but %s.\n",
|
||||||
|
i, jsonDF$name[i],
|
||||||
|
"a protein with this name already exists in the database. ",
|
||||||
|
"Skipping this input."))
|
||||||
|
isValid <- FALSE
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isValid) {
|
||||||
x <- data.frame(ID = dbAutoincrement(db$protein),
|
x <- data.frame(ID = dbAutoincrement(db$protein),
|
||||||
name = jsonDF$name[i],
|
name = jsonDF$name[i],
|
||||||
RefSeqID = jsonDF$RefSeqID[i],
|
RefSeqID = jsonDF$RefSeqID[i],
|
||||||
@ -173,18 +214,30 @@ dbAddProtein <- function(db, jsonDF) {
|
|||||||
sequence = dbSanitizeSequence(jsonDF$sequence[i]))
|
sequence = dbSanitizeSequence(jsonDF$sequence[i]))
|
||||||
db$protein <- rbind(db$protein, x)
|
db$protein <- rbind(db$protein, x)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
return(db)
|
return(db)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# == 2.06 dbAddFeature() ===================================================
|
# == 3.06 dbAddFeature() ===================================================
|
||||||
dbAddFeature <- function(db, jsonDF) {
|
dbAddFeature <- function(db, jsonDF) {
|
||||||
# Add one or more feature entries to the database db.
|
# Add one or more feature entries to the database db. Skip if a feature with
|
||||||
|
# the same name already exists.
|
||||||
# Parameters:
|
# Parameters:
|
||||||
# db list a database created with dbInit()
|
# db list a database created with dbInit()
|
||||||
# jsonDF data frame feature data imported into a data frame with
|
# jsonDF data frame feature data imported into a data frame with
|
||||||
# fromJSON()
|
# fromJSON()
|
||||||
for (i in seq_len(nrow(jsonDF))) {
|
for (i in seq_len(nrow(jsonDF))) {
|
||||||
|
isValid <- TRUE
|
||||||
|
if (jsonDF$name[i] %in% db$feature$name) {
|
||||||
|
cat(sprintf("Note: Feature No. %d in the input is \"%s\", but %s.\n",
|
||||||
|
i, jsonDF$name[i],
|
||||||
|
"a feature with this name already exists in the database. ",
|
||||||
|
"Skipping this input."))
|
||||||
|
isValid <- FALSE
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isVALID) {
|
||||||
x <- data.frame(ID = dbAutoincrement(db$feature),
|
x <- data.frame(ID = dbAutoincrement(db$feature),
|
||||||
name = jsonDF$name[i],
|
name = jsonDF$name[i],
|
||||||
description = jsonDF$description[i],
|
description = jsonDF$description[i],
|
||||||
@ -192,43 +245,81 @@ dbAddFeature <- function(db, jsonDF) {
|
|||||||
accession = jsonDF$accession[i])
|
accession = jsonDF$accession[i])
|
||||||
db$feature <- rbind(db$feature, x)
|
db$feature <- rbind(db$feature, x)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
return(db)
|
return(db)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# == 2.07 dbAddTaxonomy() ==================================================
|
# == 3.07 dbAddTaxonomy() ==================================================
|
||||||
dbAddTaxonomy <- function(db, jsonDF) {
|
dbAddTaxonomy <- function(db, jsonDF) {
|
||||||
# Add one or more taxonomy entries to the database db.
|
# Add one or more taxonomy entries to the database db. Skip if species name
|
||||||
|
# or taxonomy ID already exist in the database.
|
||||||
# Parameters:
|
# Parameters:
|
||||||
# db list A database created with dbInit()
|
# db list A database created with dbInit()
|
||||||
# jsonDF data frame Taxonomy data imported into a data frame with
|
# jsonDF data frame Taxonomy data imported into a data frame with
|
||||||
# fromJSON()
|
# fromJSON()
|
||||||
for (i in seq_len(nrow(jsonDF))) {
|
for (i in seq_len(nrow(jsonDF))) {
|
||||||
|
isValid <- TRUE
|
||||||
|
|
||||||
|
if (jsonDF$species[i] %in% db$taxonomy$species) {
|
||||||
|
cat(sprintf("Note: Species No. %d in the input is \"%s\", but %s%s\n",
|
||||||
|
i, jsonDF$name[i],
|
||||||
|
"a species with this name already exists in the database. ",
|
||||||
|
"Skipping this input."))
|
||||||
|
isValid <- FALSE
|
||||||
|
} else if (jsonDF$ID[i] %in% db$taxonomy$ID) {
|
||||||
|
cat(sprintf("Note: Taxonomy ID No. %d in the input is \"%d\", but %s%s\n",
|
||||||
|
i, jsonDF$ID[i],
|
||||||
|
"this taxonomy ID already exists in the database. ",
|
||||||
|
"Skipping this input."))
|
||||||
|
isValid <- FALSE
|
||||||
|
}
|
||||||
|
if (isValid) {
|
||||||
x <- data.frame(
|
x <- data.frame(
|
||||||
ID = jsonDF$ID[i],
|
ID = as.integer(jsonDF$ID[i]),
|
||||||
species = jsonDF$species[i])
|
species = jsonDF$species[i])
|
||||||
db$taxonomy <- rbind(db$taxonomy, x)
|
db$taxonomy <- rbind(db$taxonomy, x)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
return(db)
|
return(db)
|
||||||
}
|
}
|
||||||
|
|
||||||
# == 2.08 dbAddAnnotation() ================================================
|
|
||||||
|
# == 3.08 dbAddAnnotation() ================================================
|
||||||
dbAddAnnotation <- function(db, jsonDF) {
|
dbAddAnnotation <- function(db, jsonDF) {
|
||||||
# Add one or more annotation entries to the database db.
|
# Add one or more annotation entries to the database db. Skip the entry if
|
||||||
|
# it already exists in the database.
|
||||||
# Parameters:
|
# Parameters:
|
||||||
# db list a database created with dbInit()
|
# db list a database created with dbInit()
|
||||||
# jsonDF data frame annotation data imported into a data frame with
|
# jsonDF data frame annotation data imported into a data frame with
|
||||||
# fromJSON()
|
# fromJSON()
|
||||||
for (i in seq_len(nrow(jsonDF))) {
|
for (i in seq_len(nrow(jsonDF))) {
|
||||||
|
isValid <- TRUE
|
||||||
|
|
||||||
sel <- jsonDF$pName[i] == db$protein$name
|
sel <- jsonDF$pName[i] == db$protein$name
|
||||||
sel <- dbConfirmUnique(sel)
|
sel <- dbConfirmUnique(sel) # Confirm that this protein ID exists
|
||||||
pID <- db$protein$ID[sel]
|
pID <- db$protein$ID[sel]
|
||||||
|
|
||||||
sel <- jsonDF$fName[i] == db$feature$name
|
sel <- jsonDF$fName[i] == db$feature$name
|
||||||
sel <- dbConfirmUnique(sel)
|
sel <- dbConfirmUnique(sel) # Confirm that this feature ID exists
|
||||||
fID <- db$feature$ID[sel]
|
fID <- db$feature$ID[sel]
|
||||||
|
|
||||||
|
sel <- db$annotation$proteinID == pID &
|
||||||
|
db$annotation$featureID == fID &
|
||||||
|
db$annotation$start == as.integer(jsonDF$start[idx]) &
|
||||||
|
db$annotation$end == as.integer(jsonDF$end[idx])
|
||||||
|
|
||||||
|
if (any(sel)) {
|
||||||
|
cat(sprintf("Note: annotation No. %d in the input has %s%s%\n",
|
||||||
|
i,
|
||||||
|
"the same protein name, feature name, start, and end ",
|
||||||
|
"as one that already exists in the database. ",
|
||||||
|
"Skipping this input."))
|
||||||
|
|
||||||
|
isValid <- FALSE
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isValid) {
|
||||||
x <- data.frame(ID = dbAutoincrement(db$annotation),
|
x <- data.frame(ID = dbAutoincrement(db$annotation),
|
||||||
proteinID = pID,
|
proteinID = pID,
|
||||||
featureID = fID,
|
featureID = fID,
|
||||||
@ -236,11 +327,12 @@ dbAddAnnotation <- function(db, jsonDF) {
|
|||||||
end = as.integer(jsonDF$end[i]))
|
end = as.integer(jsonDF$end[i]))
|
||||||
db$annotation <- rbind(db$annotation, x)
|
db$annotation <- rbind(db$annotation, x)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
return(db)
|
return(db)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# == 2.09 dbFetchUniProtSeq() ==============================================
|
# == 3.09 dbFetchUniProtSeq() ==============================================
|
||||||
dbFetchUniProtSeq <- function(IDs) {
|
dbFetchUniProtSeq <- function(IDs) {
|
||||||
# Fetch a protein sequence from UniProt.
|
# Fetch a protein sequence from UniProt.
|
||||||
# Parameters:
|
# Parameters:
|
||||||
@ -286,7 +378,7 @@ if (FALSE) {
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
# == 2.10 dbFetchPrositeFeatures() =========================================
|
# == 3.10 dbFetchPrositeFeatures() =========================================
|
||||||
dbFetchPrositeFeatures <- function(ID) {
|
dbFetchPrositeFeatures <- function(ID) {
|
||||||
# Fetch feature annotations from ScanProsite.
|
# Fetch feature annotations from ScanProsite.
|
||||||
# Parameters:
|
# Parameters:
|
||||||
@ -336,7 +428,7 @@ if (FALSE) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# == 2.11 node2text() ======================================================
|
# == 3.11 node2text() ======================================================
|
||||||
node2text <- function(doc, tag) {
|
node2text <- function(doc, tag) {
|
||||||
# an extractor function for the contents of elements
|
# an extractor function for the contents of elements
|
||||||
# between given tags in an XML response.
|
# between given tags in an XML response.
|
||||||
@ -348,7 +440,7 @@ node2text <- function(doc, tag) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# == 2.12 dbFetchNCBItaxData() =============================================
|
# == 3.12 dbFetchNCBItaxData() =============================================
|
||||||
dbFetchNCBItaxData <- function(ID) {
|
dbFetchNCBItaxData <- function(ID) {
|
||||||
# Fetch feature taxID and Organism from the NCBI.
|
# Fetch feature taxID and Organism from the NCBI.
|
||||||
# Parameters:
|
# Parameters:
|
||||||
@ -387,7 +479,7 @@ dbFetchNCBItaxData <- function(ID) {
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
# == 2.13 UniProtIDmap() ===================================================
|
# == 3.13 UniProtIDmap() ===================================================
|
||||||
UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
|
UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
|
||||||
# Use UniProt ID mapping service to map one or more IDs
|
# Use UniProt ID mapping service to map one or more IDs
|
||||||
# Parameters:
|
# Parameters:
|
||||||
@ -426,9 +518,144 @@ UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# = 3 TESTS ===============================================================
|
# == 3.14 dbProt2JSON() ====================================================
|
||||||
|
dbProt2JSON <- function(thisProt) {
|
||||||
|
# Extract all protein related data from myDB and return in JSON format.
|
||||||
|
|
||||||
if (FALSE) {
|
thisData <- list()
|
||||||
|
|
||||||
|
# add a protein table
|
||||||
|
sel <- which(myDB$protein$name == thisProt)
|
||||||
|
thisData$protein <- myDB$protein[sel, ]
|
||||||
|
|
||||||
|
# add a taxonomy table
|
||||||
|
sel <- which(myDB$taxonomy$ID == thisData$protein$taxonomyID)
|
||||||
|
thisData$taxonomy <- myDB$taxonomy[sel, ]
|
||||||
|
|
||||||
|
# add the entries for this protein from the annotation table
|
||||||
|
sel <- which(myDB$annotation$proteinID == thisData$protein$ID)
|
||||||
|
thisData$annotation <- myDB$annotation[sel, ]
|
||||||
|
# our .json convention uses pName and fName as keys, not the db-internal IDs
|
||||||
|
# add empty columns for pName and fName
|
||||||
|
l <- nrow(thisData$annotation)
|
||||||
|
thisData$annotation$pName <- character(l)
|
||||||
|
thisData$annotation$fName <- character(l)
|
||||||
|
# get the appropriate protein and feature names
|
||||||
|
for (i in seq_len(l)) {
|
||||||
|
pID <- thisData$annotation$proteinID[i]
|
||||||
|
sel <- which(myDB$protein$ID == pID)
|
||||||
|
thisData$annotation$pName[i] <- myDB$protein$name[sel] # store pName
|
||||||
|
fID <- thisData$annotation$featureID[i]
|
||||||
|
sel <- which(myDB$feature$ID == fID)
|
||||||
|
thisData$annotation$fName[i] <- myDB$feature$name[sel] # store fName
|
||||||
|
}
|
||||||
|
|
||||||
|
# add the corresponding feature table
|
||||||
|
sel <- which(myDB$feature$ID %in% thisData$annotation$featureID)
|
||||||
|
thisData$feature <- myDB$feature[sel, ]
|
||||||
|
|
||||||
|
# remove columns that are not going into JSON output
|
||||||
|
thisData$protein$ID <- NULL
|
||||||
|
thisData$annotation$ID <- NULL
|
||||||
|
thisData$annotation$proteinID <- NULL
|
||||||
|
thisData$annotation$featureID <- NULL
|
||||||
|
thisData$feature$ID <- NULL
|
||||||
|
|
||||||
|
# create JSON-formatted output
|
||||||
|
# ( jsonlite::prettify() is too wordy for a compact Wikipage )
|
||||||
|
|
||||||
|
out <- character()
|
||||||
|
out <- c(out, '{')
|
||||||
|
|
||||||
|
out <- c(out, ' "protein": {')
|
||||||
|
sel <- colnames(thisData$protein) != "sequence"
|
||||||
|
out <- c(out, sprintf(" %s,", dbRow2JSON(thisData$protein[1, sel],
|
||||||
|
coll = ",\n ")))
|
||||||
|
out <- c(out, dbSeq2JSON(thisData$protein$sequence[1]))
|
||||||
|
out <- c(out, ' },')
|
||||||
|
|
||||||
|
out <- c(out, ' "taxonomy": {')
|
||||||
|
out <- c(out, sprintf(" %s", dbRow2JSON(thisData$taxonomy)))
|
||||||
|
out <- c(out, ' },')
|
||||||
|
|
||||||
|
out <- c(out, ' "annotation": [')
|
||||||
|
for (i in seq_len(nrow(thisData$annotation))) {
|
||||||
|
out <- c(out, sprintf(" {%s},", dbRow2JSON(thisData$annotation[i, ])))
|
||||||
|
}
|
||||||
|
out[length(out)] <- gsub(",$", "", out[length(out)]) # remove last ","
|
||||||
|
out <- c(out, ' ],')
|
||||||
|
|
||||||
|
out <- c(out, ' "feature": [')
|
||||||
|
sel <- colnames(thisData$feature) != "description"
|
||||||
|
for (i in seq_len(nrow(thisData$feature))) {
|
||||||
|
out <- c(out, sprintf(" {%s,",
|
||||||
|
dbRow2JSON(thisData$feature[i, sel])))
|
||||||
|
out <- c(out, sprintf(" %s},",
|
||||||
|
dbRow2JSON(thisData$feature[i, "description",
|
||||||
|
drop = FALSE])))
|
||||||
|
}
|
||||||
|
out[length(out)] <- gsub(",$", "", out[length(out)]) # remove last ","
|
||||||
|
out <- c(out, ' ]')
|
||||||
|
|
||||||
|
out <- c(out, '}')
|
||||||
|
|
||||||
|
return(paste0(out, collapse = "\n"))
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# == 3.15 dbSeq2JSON() =====================================================
|
||||||
|
|
||||||
|
dbSeq2JSON <- function(s, nIndents = 4, width = 70) {
|
||||||
|
# Turn a sequence into a JSON key-value pair, with the value being a JSON
|
||||||
|
# array of elements not exceeding a width of "width", and an indent of
|
||||||
|
# "indents" spaces.
|
||||||
|
ind <- paste0(rep(" ", nIndents), collapse = "")
|
||||||
|
|
||||||
|
out <- character()
|
||||||
|
out <- c(out, sprintf("%s\"sequence\" : [", ind))
|
||||||
|
|
||||||
|
for (i in seq_along(s)) {
|
||||||
|
l <- nchar(s[i])
|
||||||
|
if (l <= width) {
|
||||||
|
out <- c(out, s[i])
|
||||||
|
} else {
|
||||||
|
starts <- seq(1, l, by = width)
|
||||||
|
ends <- seq(width, l, by = width)
|
||||||
|
if (length(ends) < length(starts)) { ends <- c(ends, l) }
|
||||||
|
out <- c(out, sprintf("%s \"%s\",", ind, substring(s[i], starts, ends)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
out[length(out)] <- gsub(",$", "", out[length(out)]) # remove last ","
|
||||||
|
|
||||||
|
out <- c(out, sprintf("%s]", ind))
|
||||||
|
return(paste0(out, collapse = "\n"))
|
||||||
|
}
|
||||||
|
cat(dbSeq2JSON(myDB$protein$sequence[1]))
|
||||||
|
|
||||||
|
|
||||||
|
# == 3.16 dbRow2JSON() =====================================================
|
||||||
|
|
||||||
|
dbRow2JSON <- function(df, coll = ", ") {
|
||||||
|
# Turn a single dataframe row into JSON key value pairs, where the keys are the
|
||||||
|
# column names. Respects character / numeric mode.
|
||||||
|
out <- character()
|
||||||
|
for (i in 1:ncol(df)) {
|
||||||
|
if (class(df[1, i]) == "integer") {
|
||||||
|
val <- sprintf("%d", df[1, i])
|
||||||
|
} else if (class(df[1, i]) == "numeric") {
|
||||||
|
val <- sprintf("%f", df[1, i])
|
||||||
|
} else {
|
||||||
|
val <- sprintf("\"%s\"", as.character(df[1, i]))
|
||||||
|
}
|
||||||
|
out <- c(out, sprintf("\"%s\": %s", colnames(df)[i], val))
|
||||||
|
}
|
||||||
|
return(paste0(out, collapse = coll))
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# = 4 TESTS ===============================================================
|
||||||
|
|
||||||
|
if (doTESTS) {
|
||||||
if (! requireNamespace("testthat", quietly = TRUE)) {
|
if (! requireNamespace("testthat", quietly = TRUE)) {
|
||||||
install.packages("testthat")
|
install.packages("testthat")
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user