Line termination change and old code.
This commit is contained in:
parent
b1e00f52f7
commit
affe00f6fb
258
.Rprofile
258
.Rprofile
@ -1,129 +1,129 @@
|
||||
# .Rprofile
|
||||
#
|
||||
# This script is automatically executed on startup
|
||||
# ==============================================================================
|
||||
|
||||
init <- function() {
|
||||
|
||||
# Create a local copy of myScript.R if not done yet.
|
||||
if (! file.exists("myScript.R") && file.exists(".tmp.R")) {
|
||||
file.copy(".tmp.R", "myScript.R")
|
||||
cat("A new file \"myScript.R\" was created. You can use it for\n")
|
||||
cat("notes and code experiments.\n\n")
|
||||
}
|
||||
|
||||
cat("\n\n")
|
||||
cat("Please open the file \".myProfile.R\" (click on the file-name in the\n")
|
||||
cat("\"files\" pane), edit it and save it.\n")
|
||||
cat("Then click the checkbox, and use the More -> Move... dialogue\n")
|
||||
cat("to move it into the \"myScripts\" folder.\n\n")
|
||||
|
||||
file.edit("ABC-units.R")
|
||||
return(invisible(NULL))
|
||||
}
|
||||
|
||||
if (! file.exists("./myScripts/.myProfile.R")) {
|
||||
cat("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")
|
||||
cat(" =================")
|
||||
cat("\n\n")
|
||||
cat(" WELCOME !\n")
|
||||
cat("\n")
|
||||
cat(" Type 'init()' to begin\n\n")
|
||||
cat("\n")
|
||||
cat(" =================")
|
||||
cat("\n\n")
|
||||
|
||||
} else { # local profile exists ... validate state:
|
||||
cat("\n\nLoading local functions ...")
|
||||
|
||||
source(".utilities.R") # local profile appears sane, source utilities
|
||||
source("./myScripts/.myProfile.R")
|
||||
|
||||
if (! exists("myEMail")) { # ... has eMail been defined?
|
||||
cat("ERROR !\n")
|
||||
cat("=======\n")
|
||||
cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
|
||||
cat("the variable \"myEMail\" was not loaded.\n")
|
||||
cat("Please contact your instructor to continue.\n\n")
|
||||
}
|
||||
if (! exists("myStudentNumber")) { # ... has the Student Number been defined?
|
||||
cat("ERROR !\n")
|
||||
cat("=======\n")
|
||||
cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
|
||||
cat("the variable \"myStudentNumber\" was not loaded.\n")
|
||||
cat("Please contact your instructor to continue.\n\n")
|
||||
}
|
||||
if (! grepl("^(100.{7})|(99.{7})$", as.character(myStudentNumber))) {
|
||||
cat("ERROR !\n") # is the Student Number valid?
|
||||
cat("=======\n")
|
||||
cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
|
||||
cat("your Student Number could not be validated.\n")
|
||||
cat("Please examine the file \"./myScripts/.myProfile.R\"\n")
|
||||
cat(" and fix the problem or contact your instructor to continue.\n\n")
|
||||
}
|
||||
|
||||
|
||||
if (! exists("MYSPE")) { # if MYSPE has not yet been defined, define it now
|
||||
# ... and write it into the profile.
|
||||
prf <- readLines("./myScripts/.myProfile.R")
|
||||
iEmail <- grep("^\\s*myStudentNumber\\s*<-", prf)
|
||||
out <- prf[1:iEmail]
|
||||
out <- c(out, sprintf("MYSPE <- \"%s\" ",
|
||||
getMYSPE(myStudentNumber)))
|
||||
out <- c(out, prf[(iEmail+1):length(prf)])
|
||||
writeLines(out, "./myScripts/.myProfile.R")
|
||||
|
||||
cat("\n")
|
||||
cat(sprintf("MYSPE (%s) was added to \"./myScripts/.myProfile.R\"\n\n",
|
||||
getMYSPE(myStudentNumber)))
|
||||
MYSPE <- getMYSPE(myStudentNumber) # ... define it for immediate use
|
||||
rm(prf, iEmail, out) # cleanup
|
||||
}
|
||||
cat("... done.\n\n")
|
||||
}
|
||||
|
||||
if (default.stringsAsFactors()) {
|
||||
cat("WARNING.\n")
|
||||
cat("========\n")
|
||||
cat("Your default \"stringsAsFactors\" parameter is set to \"TRUE\".\n")
|
||||
cat("This will break some of the code.\n")
|
||||
cat("Please contact your instructor to troubleshoot and fix this issue.\n")
|
||||
cat("\n")
|
||||
}
|
||||
|
||||
errText <- list()
|
||||
errText[["noProfileFile"]] <- '
|
||||
Your PROFILE FILE does not exist. This problem must be fixed to continue.
|
||||
|
||||
The code expects the file "./myScripts/.myProfile.R" to exist and to
|
||||
contain your correct eMail address and student number. Detailed
|
||||
instructions were given when you first ran the init() command.
|
||||
|
||||
Try running init() again and follow the instructions. Reload youR RStudio
|
||||
session and start over with this file.
|
||||
|
||||
If this does not fix the problem, ask for help.
|
||||
'
|
||||
|
||||
errText[["noStudentNumber"]] <- '
|
||||
Your STUDENT NUMBER has not been defined. This problem must be fixed to continue.
|
||||
|
||||
The code expects the file "./myScripts/.myProfile.R" to exist and to
|
||||
contain your correct eMail address and student number. This file gets
|
||||
sourced when you start a new R-session, but since you see this error
|
||||
message there was a problem.
|
||||
|
||||
Perhaps you need to restart your R-session. Try closing the RStudio
|
||||
project and reopening it from the File > Recent Projects menu.
|
||||
|
||||
Perhaps there was a syntax error in your file. Then not all the
|
||||
instructions in the file are executed. Check the file: is your
|
||||
email perhpas not defined? Or did you type it without qwuoataion
|
||||
marks?
|
||||
|
||||
Try fixing problems, and then restart R as described above.
|
||||
|
||||
If none of this fixes the problem, ask for help.
|
||||
'
|
||||
|
||||
# [END]
|
||||
# .Rprofile
|
||||
#
|
||||
# This script is automatically executed on startup
|
||||
# ==============================================================================
|
||||
|
||||
init <- function() {
|
||||
|
||||
# Create a local copy of myScript.R if not done yet.
|
||||
if (! file.exists("myScript.R") && file.exists(".tmp.R")) {
|
||||
file.copy(".tmp.R", "myScript.R")
|
||||
cat("A new file \"myScript.R\" was created. You can use it for\n")
|
||||
cat("notes and code experiments.\n\n")
|
||||
}
|
||||
|
||||
cat("\n\n")
|
||||
cat("Please open the file \".myProfile.R\" (click on the file-name in the\n")
|
||||
cat("\"files\" pane), edit it and save it.\n")
|
||||
cat("Then click the checkbox, and use the More -> Move... dialogue\n")
|
||||
cat("to move it into the \"myScripts\" folder.\n\n")
|
||||
|
||||
file.edit("ABC-units.R")
|
||||
return(invisible(NULL))
|
||||
}
|
||||
|
||||
if (! file.exists("./myScripts/.myProfile.R")) {
|
||||
cat("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")
|
||||
cat(" =================")
|
||||
cat("\n\n")
|
||||
cat(" WELCOME !\n")
|
||||
cat("\n")
|
||||
cat(" Type 'init()' to begin\n\n")
|
||||
cat("\n")
|
||||
cat(" =================")
|
||||
cat("\n\n")
|
||||
|
||||
} else { # local profile exists ... validate state:
|
||||
cat("\n\nLoading local functions ...")
|
||||
|
||||
source(".utilities.R") # local profile appears sane, source utilities
|
||||
source("./myScripts/.myProfile.R")
|
||||
|
||||
if (! exists("myEMail")) { # ... has eMail been defined?
|
||||
cat("ERROR !\n")
|
||||
cat("=======\n")
|
||||
cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
|
||||
cat("the variable \"myEMail\" was not loaded.\n")
|
||||
cat("Please contact your instructor to continue.\n\n")
|
||||
}
|
||||
if (! exists("myStudentNumber")) { # ... has the Student Number been defined?
|
||||
cat("ERROR !\n")
|
||||
cat("=======\n")
|
||||
cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
|
||||
cat("the variable \"myStudentNumber\" was not loaded.\n")
|
||||
cat("Please contact your instructor to continue.\n\n")
|
||||
}
|
||||
if (! grepl("^(100.{7})|(99.{7})$", as.character(myStudentNumber))) {
|
||||
cat("ERROR !\n") # is the Student Number valid?
|
||||
cat("=======\n")
|
||||
cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
|
||||
cat("your Student Number could not be validated.\n")
|
||||
cat("Please examine the file \"./myScripts/.myProfile.R\"\n")
|
||||
cat(" and fix the problem or contact your instructor to continue.\n\n")
|
||||
}
|
||||
|
||||
|
||||
if (! exists("MYSPE")) { # if MYSPE has not yet been defined, define it now
|
||||
# ... and write it into the profile.
|
||||
prf <- readLines("./myScripts/.myProfile.R")
|
||||
iEmail <- grep("^\\s*myStudentNumber\\s*<-", prf)
|
||||
out <- prf[1:iEmail]
|
||||
out <- c(out, sprintf("MYSPE <- \"%s\" ",
|
||||
getMYSPE(myStudentNumber)))
|
||||
out <- c(out, prf[(iEmail+1):length(prf)])
|
||||
writeLines(out, "./myScripts/.myProfile.R")
|
||||
|
||||
cat("\n")
|
||||
cat(sprintf("MYSPE (%s) was added to \"./myScripts/.myProfile.R\"\n\n",
|
||||
getMYSPE(myStudentNumber)))
|
||||
MYSPE <- getMYSPE(myStudentNumber) # ... define it for immediate use
|
||||
rm(prf, iEmail, out) # cleanup
|
||||
}
|
||||
cat("... done.\n\n")
|
||||
}
|
||||
|
||||
if (default.stringsAsFactors()) {
|
||||
cat("WARNING.\n")
|
||||
cat("========\n")
|
||||
cat("Your default \"stringsAsFactors\" parameter is set to \"TRUE\".\n")
|
||||
cat("This will break some of the code.\n")
|
||||
cat("Please contact your instructor to troubleshoot and fix this issue.\n")
|
||||
cat("\n")
|
||||
}
|
||||
|
||||
errText <- list()
|
||||
errText[["noProfileFile"]] <- '
|
||||
Your PROFILE FILE does not exist. This problem must be fixed to continue.
|
||||
|
||||
The code expects the file "./myScripts/.myProfile.R" to exist and to
|
||||
contain your correct eMail address and student number. Detailed
|
||||
instructions were given when you first ran the init() command.
|
||||
|
||||
Try running init() again and follow the instructions. Reload youR RStudio
|
||||
session and start over with this file.
|
||||
|
||||
If this does not fix the problem, ask for help.
|
||||
'
|
||||
|
||||
errText[["noStudentNumber"]] <- '
|
||||
Your STUDENT NUMBER has not been defined. This problem must be fixed to continue.
|
||||
|
||||
The code expects the file "./myScripts/.myProfile.R" to exist and to
|
||||
contain your correct eMail address and student number. This file gets
|
||||
sourced when you start a new R-session, but since you see this error
|
||||
message there was a problem.
|
||||
|
||||
Perhaps you need to restart your R-session. Try closing the RStudio
|
||||
project and reopening it from the File > Recent Projects menu.
|
||||
|
||||
Perhaps there was a syntax error in your file. Then not all the
|
||||
instructions in the file are executed. Check the file: is your
|
||||
email perhpas not defined? Or did you type it without qwuoataion
|
||||
marks?
|
||||
|
||||
Try fixing problems, and then restart R as described above.
|
||||
|
||||
If none of this fixes the problem, ask for help.
|
||||
'
|
||||
|
||||
# [END]
|
||||
|
88
.gitignore
vendored
88
.gitignore
vendored
@ -1,44 +1,44 @@
|
||||
# Miscellaneous
|
||||
.Ds_store
|
||||
instructor/
|
||||
dev/
|
||||
# myScripts/ # We don't want to ignore this so we can save our work to our own fork.
|
||||
|
||||
# History files
|
||||
.Rhistory
|
||||
.Rapp.history
|
||||
|
||||
# Session Data files
|
||||
# .RData
|
||||
|
||||
# Files produced in assignments
|
||||
data/APSESphyloSet.mfa
|
||||
data/APSEStreeRproml.rds
|
||||
|
||||
# Example code in package build process
|
||||
*-Ex.R
|
||||
|
||||
# Output files from R CMD build
|
||||
/*.tar.gz
|
||||
|
||||
# Output files from R CMD check
|
||||
/*.Rcheck/
|
||||
|
||||
# RStudio files
|
||||
.Rproj.user/
|
||||
|
||||
# produced vignettes
|
||||
vignettes/*.html
|
||||
vignettes/*.pdf
|
||||
|
||||
# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
|
||||
.httr-oauth
|
||||
|
||||
# knitr and R markdown default cache directories
|
||||
/*_cache/
|
||||
/cache/
|
||||
|
||||
# Temporary files created by R markdown
|
||||
*.utf8.md
|
||||
*.knit.md
|
||||
.Rproj.user
|
||||
# Miscellaneous
|
||||
.Ds_store
|
||||
instructor/
|
||||
dev/
|
||||
# myScripts/ # We don't want to ignore this so we can save our work to our own fork.
|
||||
|
||||
# History files
|
||||
.Rhistory
|
||||
.Rapp.history
|
||||
|
||||
# Session Data files
|
||||
# .RData
|
||||
|
||||
# Files produced in assignments
|
||||
data/APSESphyloSet.mfa
|
||||
data/APSEStreeRproml.rds
|
||||
|
||||
# Example code in package build process
|
||||
*-Ex.R
|
||||
|
||||
# Output files from R CMD build
|
||||
/*.tar.gz
|
||||
|
||||
# Output files from R CMD check
|
||||
/*.Rcheck/
|
||||
|
||||
# RStudio files
|
||||
.Rproj.user/
|
||||
|
||||
# produced vignettes
|
||||
vignettes/*.html
|
||||
vignettes/*.pdf
|
||||
|
||||
# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
|
||||
.httr-oauth
|
||||
|
||||
# knitr and R markdown default cache directories
|
||||
/*_cache/
|
||||
/cache/
|
||||
|
||||
# Temporary files created by R markdown
|
||||
*.utf8.md
|
||||
*.knit.md
|
||||
.Rproj.user
|
||||
|
76
.tmp.R
76
.tmp.R
@ -1,38 +1,38 @@
|
||||
# myScript.R
|
||||
#
|
||||
# --- As you work with this file, you can delete the instructions below --------
|
||||
# Write your notes and code experiments into this document. Save it
|
||||
# from time to time - however I recommend that you do not _commit_
|
||||
# your saved version.
|
||||
#
|
||||
# As long as you do not _commit_ this script to version control,
|
||||
# you can _pull_ updated versions of the entire project from GitHub
|
||||
# by using the RStudio version control interface. However, once
|
||||
# you _commit_ any file in your local version, RStudio will require
|
||||
# you to resolve conflicts before you can _pull_ updates.
|
||||
# --- As you work with this file, you can delete the instructions above --------
|
||||
#
|
||||
## Purpose: <...>
|
||||
#
|
||||
# Version: <...>
|
||||
#
|
||||
# Date: <...>
|
||||
# Author: <Name> (<namee@mail.utoronto.ca>)
|
||||
#
|
||||
# Versions:
|
||||
#
|
||||
# <number> <Features>
|
||||
#
|
||||
# TODO:
|
||||
# <...>
|
||||
#
|
||||
# ====================================================================
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
|
||||
# myScript.R
|
||||
#
|
||||
# --- As you work with this file, you can delete the instructions below --------
|
||||
# Write your notes and code experiments into this document. Save it
|
||||
# from time to time - however I recommend that you do not _commit_
|
||||
# your saved version.
|
||||
#
|
||||
# As long as you do not _commit_ this script to version control,
|
||||
# you can _pull_ updated versions of the entire project from GitHub
|
||||
# by using the RStudio version control interface. However, once
|
||||
# you _commit_ any file in your local version, RStudio will require
|
||||
# you to resolve conflicts before you can _pull_ updates.
|
||||
# --- As you work with this file, you can delete the instructions above --------
|
||||
#
|
||||
## Purpose: <...>
|
||||
#
|
||||
# Version: <...>
|
||||
#
|
||||
# Date: <...>
|
||||
# Author: <Name> (<namee@mail.utoronto.ca>)
|
||||
#
|
||||
# Versions:
|
||||
#
|
||||
# <number> <Features>
|
||||
#
|
||||
# TODO:
|
||||
# <...>
|
||||
#
|
||||
# ====================================================================
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
|
||||
|
1308
.utilities.R
1308
.utilities.R
File diff suppressed because it is too large
Load Diff
@ -1,257 +1,257 @@
|
||||
# 2021-10-12_In-Class_exploration.R
|
||||
#
|
||||
# ===== T H E E V E N B E T T E R A M I N O A C I D =====
|
||||
#
|
||||
# Code and comments for BCH441 in-class exploration, Tuesday, 2021-10-12
|
||||
# Explorers: Jocelyn Nurtanto, Yuzi Li, and Jerry Gu
|
||||
# Scribe: boris.steipe@utoronto.ca
|
||||
#
|
||||
# ==============================================================================
|
||||
#
|
||||
# In our last session we explored some properties of amino acids and noted that
|
||||
# we can arrange them in a scatter-plot according to some properties. But can
|
||||
# we also arrange them according to generic properties, i.e. taking all
|
||||
# published property scales into account? We will try to use all tables from
|
||||
# the seqinr package.
|
||||
|
||||
# First we load the package - this makes all datasets immediately available and
|
||||
# we don't have to load them one by one.
|
||||
|
||||
library(seqinr)
|
||||
|
||||
# Determine what datasets are available
|
||||
#
|
||||
# Using "find in topic" ... "amino acid"
|
||||
data(aacost)
|
||||
data(aaindex)
|
||||
data(pK)
|
||||
|
||||
# We note that datasets may be sorted in different ways: for example
|
||||
# alphabetically by one letter code (A, C, D, E, ...) or three-letter code (Ala,
|
||||
# Arg, Asn, Asp, ...) - this means we need to ensure and validate that amino
|
||||
# acids are sorted in the same way.
|
||||
|
||||
# Build a datastructure ...
|
||||
# rows: amino acids
|
||||
# columns: properties
|
||||
|
||||
# Are all lists in aaindex organized in the same way?
|
||||
|
||||
refNames <- names(aaindex[[1]]$I) # Take the rownames of the first list item
|
||||
# index as a reference list
|
||||
|
||||
# Loop over each list in aaindex
|
||||
for (i in 1:length(aaindex)) {
|
||||
# get the I-vector
|
||||
x <- aaindex[[i]]$I
|
||||
# get the names
|
||||
x <- names(x)
|
||||
# compare with the names of our reference list
|
||||
# the == and != operators are vectorized. Applying them to two vectors
|
||||
# gives TRUE or FALSE for each pair of elements. any() or all() can be
|
||||
# applied to logical vectors to analyse them and return a single result.
|
||||
# if (...) conditions evaluate only a single value and will throw a warning if
|
||||
# there is more than one (as of R 4.2.0 this is an error, not a warning).
|
||||
|
||||
if (any(x != refNames)) {
|
||||
# There was at least one not-equal pair - so: complain
|
||||
print(sprintf("Problem in list %d: names don't match", i))
|
||||
}
|
||||
}
|
||||
|
||||
# If we get here without identifying problems, it means all pairs of
|
||||
# rownames match throughout the aaindex list.
|
||||
|
||||
|
||||
# Next: what is the correct syntax to add one vector (the "I" vector of
|
||||
# one of the list elements) to our dataframe?
|
||||
aaData <- as.data.frame(aaindex[[1]]$I) # Make a dataframe from the first index
|
||||
aaData[,2] <- aaindex[[2]]$I # ... add the second index
|
||||
|
||||
str(aaData) # Confirm: we now have a two-column dataframe
|
||||
|
||||
# Next: add the rest ...
|
||||
for (i in 3:length(aaindex)) {
|
||||
# get the I-vector and write it into our dataframe
|
||||
aaData[,i] <- aaindex[[i]]$I
|
||||
}
|
||||
|
||||
# Sanity check
|
||||
plot(aaData[,37], aaData[,544]) # plot two arbitrary indices against each other
|
||||
|
||||
# Looks good.
|
||||
|
||||
# We finished building our data structure ... but let's add the aacost table
|
||||
# aacost is ordered differently:
|
||||
rownames(aaData)
|
||||
aacost[ , 1]
|
||||
|
||||
# using order(), applied to aacost - ordering the column with column-name
|
||||
# "aaa"
|
||||
sel <- order(aacost[ , "aaa"]) # alphabetic ordering of three-letter codes
|
||||
aacost[sel, "aaa"] # applying the order vector sorts the column
|
||||
|
||||
# Is this the same order as refNames?
|
||||
refNames == aacost[sel, "aaa"] # Yes!
|
||||
|
||||
# add the data from column "tot" (i.e. total metabolic cost) after the
|
||||
# last column of aaData
|
||||
aaData[ , length(aaindex) + 1] <- aacost[sel, "tot"]
|
||||
|
||||
# Done.
|
||||
str(aaData) # A dataframe with 20 rows and 545 columns
|
||||
|
||||
# To answer the question "Which amino acids are similar to each other?" we
|
||||
# need to reduce this 545-dimensional dataset to fewer dimensions, otherwise
|
||||
# we will succumb to the "Curse of Dimensionality":
|
||||
#
|
||||
# "in high dimensional data, however, all objects appear
|
||||
# to be sparse and dissimilar in many ways..."
|
||||
# https://en.wikipedia.org/wiki/Curse_of_dimensionality
|
||||
#
|
||||
# A classic way to do this is Principal Component Analysis (PCA) ...
|
||||
# (Principal components analysis)
|
||||
#
|
||||
# PCA expects objects in columns, properties in rows. Therefore we need to
|
||||
# transpose our dataset:
|
||||
|
||||
aaPCA <- prcomp(t(aaData))
|
||||
|
||||
# This creates an error, because some of our indices contain NA values!
|
||||
# Which indices are these?
|
||||
|
||||
# We create a vector "sel" for which we check whether any element in each
|
||||
# column is NA, and write FALSE if we encounter an NA, TRUE otherwise. We can
|
||||
# then use this vector to subset our dataframe.
|
||||
|
||||
sel <- logical()
|
||||
|
||||
for (i in 1:ncol(aaData)) { # for each index
|
||||
if (any(is.na(aaData[,i]))) { # if there is any NA value ...
|
||||
sel <- c(sel, FALSE) # add a FALSE element to the vector
|
||||
} else { # else
|
||||
sel <- c(sel, TRUE) # add a TRUE element
|
||||
}
|
||||
}
|
||||
|
||||
# Done. sel now subsets only the NA-free columns
|
||||
545 - sum(sel) # 13 columns excluded
|
||||
|
||||
# Do the PCA ... use the prcomp() function
|
||||
aaPCA <- prcomp(t(aaData[ ,sel])) # PCA of the transposed, selected data set
|
||||
|
||||
str(aaPCA) # structure of the result
|
||||
|
||||
plot(aaPCA) # plot the contributions of the
|
||||
# components to the variance
|
||||
|
||||
plot(aaPCA$rotation[ , 1], # plot the first PC against the second PC
|
||||
aaPCA$rotation[ , 2], # in a scatterplot, in an empty frame
|
||||
type ="n") # just to set up the coordinate system
|
||||
|
||||
text(aaPCA$rotation[ , 1], # plot the names of the amino acids into
|
||||
aaPCA$rotation[ , 2], # their respective (PC1, PC2) positions
|
||||
labels = rownames(aaPCA$rotation))
|
||||
|
||||
# PCA results are sensitive to the absolute numeric value of the features that
|
||||
# we are comparing. The prcomp() function has an option scale. = TRUE that
|
||||
# scales each row of features so that the variance of the value is 1.0 This
|
||||
# ensures that each feature is given approximately equal weight
|
||||
|
||||
aaPCA <- prcomp(t(aaData[ ,sel]), scale. = TRUE)
|
||||
|
||||
plot(aaPCA)
|
||||
|
||||
plot(aaPCA$rotation[ , 1],
|
||||
aaPCA$rotation[ , 2],
|
||||
type ="n")
|
||||
text(aaPCA$rotation[ , 1],
|
||||
aaPCA$rotation[ , 2],
|
||||
labels = rownames(aaPCA$rotation))
|
||||
|
||||
|
||||
# Next we try to identify what the PCs correspond to. We see whether there are
|
||||
# specific features that are highly correlated with the PCs
|
||||
|
||||
# ==== Rotation 1 ===================
|
||||
#
|
||||
|
||||
(PC1 <- aaPCA$rotation[ , 1]) # Assign PC1
|
||||
|
||||
# The function cor() calculates Pearson coefficients of correlation
|
||||
cor(PC1, aaData[ , 37]) # e.g. correlate PC1 against index 37
|
||||
|
||||
|
||||
# Iterate over all columns and calculate correlations
|
||||
cors <- numeric()
|
||||
|
||||
for (i in 1:ncol(aaData)) {
|
||||
cors[i] <- cor(PC1, aaData[ , i])
|
||||
}
|
||||
|
||||
summary(cors)
|
||||
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
|
||||
# -0.54072 -0.13703 0.05654 0.03729 0.21349 0.59589 13
|
||||
#
|
||||
# The max correlation is ~0.6. That is not very high. Which index is it?
|
||||
|
||||
which(cors == max(cors, na.rm = TRUE))
|
||||
|
||||
aaindex[[504]] # Linker propensity ???
|
||||
|
||||
cor(PC1, aaindex[[504]]$I) # Did we get the right index?
|
||||
|
||||
# Plot this ...
|
||||
plot(aaPCA$rotation[ , 1],
|
||||
aaindex[[504]]$I,
|
||||
type ="n")
|
||||
text(aaPCA$rotation[ , 1],
|
||||
aaindex[[504]]$I,
|
||||
labels = rownames(aaPCA$rotation))
|
||||
|
||||
# This is essentially a random correlation but for Cysteine ...
|
||||
|
||||
|
||||
# ==== Rotation 2 ===================
|
||||
#
|
||||
# same process
|
||||
PC2 <- aaPCA$rotation[ , 2]
|
||||
|
||||
cors2 <- numeric()
|
||||
|
||||
for (i in 1:ncol(aaData)) {
|
||||
cors2[i] <- cor(PC2, aaData[ , i])
|
||||
}
|
||||
|
||||
summary(cors2)
|
||||
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
|
||||
# -0.95214 -0.56067 -0.12817 -0.05787 0.43046 0.94346 13
|
||||
|
||||
# Here we have quite strong correlations
|
||||
|
||||
which(cors2 == max(cors2, na.rm = TRUE))
|
||||
|
||||
aaindex[[148]]
|
||||
|
||||
# this index itself is correlated with many other indices
|
||||
|
||||
cor(PC2, aaindex[[148]]$I) # confirm that we have the right index
|
||||
|
||||
# Plot this too...
|
||||
plot(aaPCA$rotation[ , 2],
|
||||
aaindex[[148]]$I,
|
||||
type ="n")
|
||||
text(aaPCA$rotation[ , 2],
|
||||
aaindex[[148]]$I,
|
||||
labels = rownames(aaPCA$rotation))
|
||||
|
||||
# This correlates well with hydrophobicity measures. In this case the
|
||||
# PC is to a certain degree interpretable - but this is not always the case
|
||||
# with PCA (see the example of the first PC).
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
# 2021-10-12_In-Class_exploration.R
|
||||
#
|
||||
# ===== T H E E V E N B E T T E R A M I N O A C I D =====
|
||||
#
|
||||
# Code and comments for BCH441 in-class exploration, Tuesday, 2021-10-12
|
||||
# Explorers: Jocelyn Nurtanto, Yuzi Li, and Jerry Gu
|
||||
# Scribe: boris.steipe@utoronto.ca
|
||||
#
|
||||
# ==============================================================================
|
||||
#
|
||||
# In our last session we explored some properties of amino acids and noted that
|
||||
# we can arrange them in a scatter-plot according to some properties. But can
|
||||
# we also arrange them according to generic properties, i.e. taking all
|
||||
# published property scales into account? We will try to use all tables from
|
||||
# the seqinr package.
|
||||
|
||||
# First we load the package - this makes all datasets immediately available and
|
||||
# we don't have to load them one by one.
|
||||
|
||||
library(seqinr)
|
||||
|
||||
# Determine what datasets are available
|
||||
#
|
||||
# Using "find in topic" ... "amino acid"
|
||||
data(aacost)
|
||||
data(aaindex)
|
||||
data(pK)
|
||||
|
||||
# We note that datasets may be sorted in different ways: for example
|
||||
# alphabetically by one letter code (A, C, D, E, ...) or three-letter code (Ala,
|
||||
# Arg, Asn, Asp, ...) - this means we need to ensure and validate that amino
|
||||
# acids are sorted in the same way.
|
||||
|
||||
# Build a datastructure ...
|
||||
# rows: amino acids
|
||||
# columns: properties
|
||||
|
||||
# Are all lists in aaindex organized in the same way?
|
||||
|
||||
refNames <- names(aaindex[[1]]$I) # Take the rownames of the first list item
|
||||
# index as a reference list
|
||||
|
||||
# Loop over each list in aaindex
|
||||
for (i in 1:length(aaindex)) {
|
||||
# get the I-vector
|
||||
x <- aaindex[[i]]$I
|
||||
# get the names
|
||||
x <- names(x)
|
||||
# compare with the names of our reference list
|
||||
# the == and != operators are vectorized. Applying them to two vectors
|
||||
# gives TRUE or FALSE for each pair of elements. any() or all() can be
|
||||
# applied to logical vectors to analyse them and return a single result.
|
||||
# if (...) conditions evaluate only a single value and will throw a warning if
|
||||
# there is more than one (as of R 4.2.0 this is an error, not a warning).
|
||||
|
||||
if (any(x != refNames)) {
|
||||
# There was at least one not-equal pair - so: complain
|
||||
print(sprintf("Problem in list %d: names don't match", i))
|
||||
}
|
||||
}
|
||||
|
||||
# If we get here without identifying problems, it means all pairs of
|
||||
# rownames match throughout the aaindex list.
|
||||
|
||||
|
||||
# Next: what is the correct syntax to add one vector (the "I" vector of
|
||||
# one of the list elements) to our dataframe?
|
||||
aaData <- as.data.frame(aaindex[[1]]$I) # Make a dataframe from the first index
|
||||
aaData[,2] <- aaindex[[2]]$I # ... add the second index
|
||||
|
||||
str(aaData) # Confirm: we now have a two-column dataframe
|
||||
|
||||
# Next: add the rest ...
|
||||
for (i in 3:length(aaindex)) {
|
||||
# get the I-vector and write it into our dataframe
|
||||
aaData[,i] <- aaindex[[i]]$I
|
||||
}
|
||||
|
||||
# Sanity check
|
||||
plot(aaData[,37], aaData[,544]) # plot two arbitrary indices against each other
|
||||
|
||||
# Looks good.
|
||||
|
||||
# We finished building our data structure ... but let's add the aacost table
|
||||
# aacost is ordered differently:
|
||||
rownames(aaData)
|
||||
aacost[ , 1]
|
||||
|
||||
# using order(), applied to aacost - ordering the column with column-name
|
||||
# "aaa"
|
||||
sel <- order(aacost[ , "aaa"]) # alphabetic ordering of three-letter codes
|
||||
aacost[sel, "aaa"] # applying the order vector sorts the column
|
||||
|
||||
# Is this the same order as refNames?
|
||||
refNames == aacost[sel, "aaa"] # Yes!
|
||||
|
||||
# add the data from column "tot" (i.e. total metabolic cost) after the
|
||||
# last column of aaData
|
||||
aaData[ , length(aaindex) + 1] <- aacost[sel, "tot"]
|
||||
|
||||
# Done.
|
||||
str(aaData) # A dataframe with 20 rows and 545 columns
|
||||
|
||||
# To answer the question "Which amino acids are similar to each other?" we
|
||||
# need to reduce this 545-dimensional dataset to fewer dimensions, otherwise
|
||||
# we will succumb to the "Curse of Dimensionality":
|
||||
#
|
||||
# "in high dimensional data, however, all objects appear
|
||||
# to be sparse and dissimilar in many ways..."
|
||||
# https://en.wikipedia.org/wiki/Curse_of_dimensionality
|
||||
#
|
||||
# A classic way to do this is Principal Component Analysis (PCA) ...
|
||||
# (Principal components analysis)
|
||||
#
|
||||
# PCA expects objects in columns, properties in rows. Therefore we need to
|
||||
# transpose our dataset:
|
||||
|
||||
aaPCA <- prcomp(t(aaData))
|
||||
|
||||
# This creates an error, because some of our indices contain NA values!
|
||||
# Which indices are these?
|
||||
|
||||
# We create a vector "sel" for which we check whether any element in each
|
||||
# column is NA, and write FALSE if we encounter an NA, TRUE otherwise. We can
|
||||
# then use this vector to subset our dataframe.
|
||||
|
||||
sel <- logical()
|
||||
|
||||
for (i in 1:ncol(aaData)) { # for each index
|
||||
if (any(is.na(aaData[,i]))) { # if there is any NA value ...
|
||||
sel <- c(sel, FALSE) # add a FALSE element to the vector
|
||||
} else { # else
|
||||
sel <- c(sel, TRUE) # add a TRUE element
|
||||
}
|
||||
}
|
||||
|
||||
# Done. sel now subsets only the NA-free columns
|
||||
545 - sum(sel) # 13 columns excluded
|
||||
|
||||
# Do the PCA ... use the prcomp() function
|
||||
aaPCA <- prcomp(t(aaData[ ,sel])) # PCA of the transposed, selected data set
|
||||
|
||||
str(aaPCA) # structure of the result
|
||||
|
||||
plot(aaPCA) # plot the contributions of the
|
||||
# components to the variance
|
||||
|
||||
plot(aaPCA$rotation[ , 1], # plot the first PC against the second PC
|
||||
aaPCA$rotation[ , 2], # in a scatterplot, in an empty frame
|
||||
type ="n") # just to set up the coordinate system
|
||||
|
||||
text(aaPCA$rotation[ , 1], # plot the names of the amino acids into
|
||||
aaPCA$rotation[ , 2], # their respective (PC1, PC2) positions
|
||||
labels = rownames(aaPCA$rotation))
|
||||
|
||||
# PCA results are sensitive to the absolute numeric value of the features that
|
||||
# we are comparing. The prcomp() function has an option scale. = TRUE that
|
||||
# scales each row of features so that the variance of the value is 1.0 This
|
||||
# ensures that each feature is given approximately equal weight
|
||||
|
||||
aaPCA <- prcomp(t(aaData[ ,sel]), scale. = TRUE)
|
||||
|
||||
plot(aaPCA)
|
||||
|
||||
plot(aaPCA$rotation[ , 1],
|
||||
aaPCA$rotation[ , 2],
|
||||
type ="n")
|
||||
text(aaPCA$rotation[ , 1],
|
||||
aaPCA$rotation[ , 2],
|
||||
labels = rownames(aaPCA$rotation))
|
||||
|
||||
|
||||
# Next we try to identify what the PCs correspond to. We see whether there are
|
||||
# specific features that are highly correlated with the PCs
|
||||
|
||||
# ==== Rotation 1 ===================
|
||||
#
|
||||
|
||||
(PC1 <- aaPCA$rotation[ , 1]) # Assign PC1
|
||||
|
||||
# The function cor() calculates Pearson coefficients of correlation
|
||||
cor(PC1, aaData[ , 37]) # e.g. correlate PC1 against index 37
|
||||
|
||||
|
||||
# Iterate over all columns and calculate correlations
|
||||
cors <- numeric()
|
||||
|
||||
for (i in 1:ncol(aaData)) {
|
||||
cors[i] <- cor(PC1, aaData[ , i])
|
||||
}
|
||||
|
||||
summary(cors)
|
||||
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
|
||||
# -0.54072 -0.13703 0.05654 0.03729 0.21349 0.59589 13
|
||||
#
|
||||
# The max correlation is ~0.6. That is not very high. Which index is it?
|
||||
|
||||
which(cors == max(cors, na.rm = TRUE))
|
||||
|
||||
aaindex[[504]] # Linker propensity ???
|
||||
|
||||
cor(PC1, aaindex[[504]]$I) # Did we get the right index?
|
||||
|
||||
# Plot this ...
|
||||
plot(aaPCA$rotation[ , 1],
|
||||
aaindex[[504]]$I,
|
||||
type ="n")
|
||||
text(aaPCA$rotation[ , 1],
|
||||
aaindex[[504]]$I,
|
||||
labels = rownames(aaPCA$rotation))
|
||||
|
||||
# This is essentially a random correlation but for Cysteine ...
|
||||
|
||||
|
||||
# ==== Rotation 2 ===================
|
||||
#
|
||||
# same process
|
||||
PC2 <- aaPCA$rotation[ , 2]
|
||||
|
||||
# Correlate PC2 against every column of aaData. The result vector is
# preallocated (one slot per column) instead of being grown inside the loop,
# and seq_len() is used so that a zero-column aaData yields an empty loop
# rather than iterating over c(1, 0).
cors2 <- numeric(ncol(aaData))

for (i in seq_len(ncol(aaData))) {
  cors2[i] <- cor(PC2, aaData[ , i])   # Pearson correlation with column i
}
|
||||
|
||||
summary(cors2)
|
||||
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
|
||||
# -0.95214 -0.56067 -0.12817 -0.05787 0.43046 0.94346 13
|
||||
|
||||
# Here we have quite strong correlations
|
||||
|
||||
which(cors2 == max(cors2, na.rm = TRUE))
|
||||
|
||||
aaindex[[148]]
|
||||
|
||||
# this index itself is correlated with many other indices
|
||||
|
||||
cor(PC2, aaindex[[148]]$I) # confirm that we have the right index
|
||||
|
||||
# Plot this too...
|
||||
plot(aaPCA$rotation[ , 2],
|
||||
aaindex[[148]]$I,
|
||||
type ="n")
|
||||
text(aaPCA$rotation[ , 2],
|
||||
aaindex[[148]]$I,
|
||||
labels = rownames(aaPCA$rotation))
|
||||
|
||||
# This correlates well with hydrophobicity measures. In this case the
|
||||
# PC is to a certain degree interpretable - but this is not always the case
|
||||
# with PCA (see the example of the first PC).
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
|
@ -1,161 +1,161 @@
|
||||
# tocID <- "ABC-Install_all_packages.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# Installing all packages in this course
|
||||
#
|
||||
# Version: 1.0
|
||||
#
|
||||
# Date: 2021 10
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.0 New code
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ----------------------------------------------
|
||||
#TOC> 1 Packages 33
|
||||
#TOC> 2 CRAN packages 98
|
||||
#TOC> 3 Bioconductor packages 127
|
||||
#TOC> 4 Other package sources 142
|
||||
#TOC> 5 Updating packages 148
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Packages ============================================================
|
||||
|
||||
# Much of R's functionality is contributed in packages: bundles of R scripts
|
||||
# or code in other languages, pre-configured objects, and datasets. Making this
|
||||
# functionality available is often done by issuing a library(<package-name>)
|
||||
# command, however this is not the preferred way, since it may override other
|
||||
# R functions and it makes it harder to understand where the source code of
|
||||
# a particular function is located. In this course we call the function name
|
||||
# prefixed with the package name and two colons:
|
||||
# <package-name>::<function-name>()
|
||||
# This is the preferred way, since it is explicit.
|
||||
#
|
||||
# Regardless of which idiom one uses to call the actual function, the package
|
||||
# needs to be "installed" first, i.e. the code must have been downloaded
|
||||
# from CRAN, or using the BiocManager::install() function.
|
||||
#
|
||||
# This script contains download commands for all packages that are used in the
|
||||
# course. You can execute the script line by line (or even source the entire
|
||||
# script) to make sure all packages can be installed on your computer. Just
|
||||
# one reminder: if you are ever asked to install from source, the correct
|
||||
# answer is usually "no" - except if you really know what you are doing and why.
|
||||
#
|
||||
# Once packages are installed you can get additional information about
|
||||
# the contents of a package with the commands:
|
||||
# library(help=<package-name>) # basic information
|
||||
# browseVignettes("<package-name>") # available vignettes
|
||||
# data(package = "<package-name>") # available datasets
|
||||
#
|
||||
# ... and you can load data sets with:
|
||||
# data(<data-set-name>, package = "<package-name>")
|
||||
#
|
||||
# All packages here are installed only when they have not been installed
|
||||
# before, using the following idiom:
|
||||
#
|
||||
# if (! requireNamespace("<package-name>", quietly=TRUE)) {
|
||||
# install.packages("<package-name>")
|
||||
# }
|
||||
#
|
||||
# ... or its BiocManager::install() equivalent:
|
||||
#
|
||||
# if (! requireNamespace("<bioconductor-package-name>", quietly=TRUE)) {
|
||||
# BiocManager::install("<bioconductor-package-name>")
|
||||
# }
|
||||
#
|
||||
# If you want to _force_ a re-installation of the package, simply issue
|
||||
# the install.packages("<package-name>") command on its own. For compactness
|
||||
# we wrap the idiom into a function, which can also switch between CRAN
|
||||
# and BIOconductor sources:
|
||||
|
||||
installIfNeeded <- function(package, s = "CRAN") {
  # Purpose: install a package only if its namespace cannot be loaded.
  # Parameters:
  #   package: chr  name of the package
  #   s:       chr  package source, either "CRAN" or "BIO" (Bioconductor)
  # Value: called for its side effect; stops with an error if s is neither
  #        "CRAN" nor "BIO".

  # TRUE if the namespace of pkg can not be loaded
  isMissing <- function(pkg) {
    ! requireNamespace(pkg, quietly=TRUE)
  }

  if (s == "BIO") {
    # Bioconductor packages are installed via BiocManager, which is itself
    # fetched with install.packages() if it is not yet available.
    if (isMissing("BiocManager")) {
      install.packages("BiocManager")
    }
    if (isMissing(package)) {
      BiocManager::install(package)
    }
  } else if (s == "CRAN") {
    if (isMissing(package)) {
      install.packages(package)
    }
  } else {
    stop(sprintf("Unknown source \"%s\".", s))
  }
}
|
||||
|
||||
|
||||
# = 2 CRAN packages =======================================================
|
||||
|
||||
installIfNeeded("ape")
|
||||
installIfNeeded("BiocManager")
|
||||
installIfNeeded("bio3d")
|
||||
installIfNeeded("evd")
|
||||
installIfNeeded("ggseqlogo")
|
||||
installIfNeeded("ggtern")
|
||||
installIfNeeded("hexbin")
|
||||
installIfNeeded("httr")
|
||||
installIfNeeded("igraph")
|
||||
installIfNeeded("jsonlite")
|
||||
installIfNeeded("magrittr")
|
||||
installIfNeeded("MASS")
|
||||
installIfNeeded("microbenchmark")
|
||||
installIfNeeded("phangorn")
|
||||
installIfNeeded("plotly")
|
||||
installIfNeeded("plotrix")
|
||||
installIfNeeded("profvis")
|
||||
installIfNeeded("robustbase")
|
||||
installIfNeeded("RColorBrewer")
|
||||
installIfNeeded("Rphylip")
|
||||
installIfNeeded("rvest")
|
||||
installIfNeeded("seqinr")
|
||||
installIfNeeded("stringi")
|
||||
installIfNeeded("taxize")
|
||||
installIfNeeded("testthat")
|
||||
installIfNeeded("xml2")
|
||||
|
||||
# = 3 Bioconductor packages ===============================================
|
||||
|
||||
installIfNeeded("Biobase", s = "BIO")
|
||||
installIfNeeded("biomaRt", s = "BIO")
|
||||
installIfNeeded("Biostrings", s = "BIO")
|
||||
installIfNeeded("DECIPHER", s = "BIO")
|
||||
installIfNeeded("GEOquery", s = "BIO")
|
||||
installIfNeeded("GOSim", s = "BIO")
|
||||
installIfNeeded("limma", s = "BIO")
|
||||
installIfNeeded("msa", s = "BIO")
|
||||
installIfNeeded("org.Sc.sgd.db", s = "BIO")
|
||||
installIfNeeded("prada", s = "BIO")
|
||||
installIfNeeded("topGO", s = "BIO")
|
||||
|
||||
|
||||
# = 4 Other package sources ===============================================
|
||||
|
||||
# Using sources other than CRAN or Bioconductor to download general-purpose
|
||||
# programs that run on your computer is not generally recommended.
|
||||
|
||||
|
||||
# = 5 Updating packages ===================================================
|
||||
|
||||
# From time to time, update CRAN packages with the following command ...
|
||||
|
||||
update.packages()
|
||||
|
||||
# ... and also update Bioconductor packages as follows:
|
||||
|
||||
BiocManager::install()
|
||||
|
||||
# [END]
|
||||
# tocID <- "ABC-Install_all_packages.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# Installing all packages in this course
|
||||
#
|
||||
# Version: 1.0
|
||||
#
|
||||
# Date: 2021 10
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.0 New code
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ----------------------------------------------
|
||||
#TOC> 1 Packages 33
|
||||
#TOC> 2 CRAN packages 98
|
||||
#TOC> 3 Bioconductor packages 127
|
||||
#TOC> 4 Other package sources 142
|
||||
#TOC> 5 Updating packages 148
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Packages ============================================================
|
||||
|
||||
# Much of R's functionality is contributed in packages: bundles of R scripts
|
||||
# or code in other languages, pre-configured objects, and datasets. Making this
|
||||
# functionality available is often done by issuing a library(<package-name>)
|
||||
# command, however this is not the preferred way, since it may override other
|
||||
# R functions and it makes it harder to understand where the source code of
|
||||
# a particular function is located. In this course we call the function name
|
||||
# prefixed with the package name and two colons:
|
||||
# <package-name>::<function-name>()
|
||||
# This is the preferred way, since it is explicit.
|
||||
#
|
||||
# Regardless of which idiom one uses to call the actual function, the package
|
||||
# needs to be "installed" first, i.e. the code must have been downloaded
|
||||
# from CRAN, or using the BiocManager::install() function.
|
||||
#
|
||||
# This script contains download commands for all packages that are used in the
|
||||
# course. You can execute the script line by line (or even source the entire
|
||||
# script) to make sure all packages can be installed on your computer. Just
|
||||
# one reminder: if you are ever asked to install from source, the correct
|
||||
# answer is usually "no" - except if you really know what you are doing and why.
|
||||
#
|
||||
# Once packages are installed you can get additional information about
|
||||
# the contents of a package with the commands:
|
||||
# library(help=<package-name>) # basic information
|
||||
# browseVignettes("<package-name>") # available vignettes
|
||||
# data(package = "<package-name>") # available datasets
|
||||
#
|
||||
# ... and you can load data sets with:
|
||||
# data(<data-set-name>, package = "<package-name>")
|
||||
#
|
||||
# All packages here are installed only when they have not been installed
|
||||
# before, using the following idiom:
|
||||
#
|
||||
# if (! requireNamespace("<package-name>", quietly=TRUE)) {
|
||||
# install.packages("<package-name>")
|
||||
# }
|
||||
#
|
||||
# ... or its BiocManager::install() equivalent:
|
||||
#
|
||||
# if (! requireNamespace("<bioconductor-package-name>", quietly=TRUE)) {
|
||||
# BiocManager::install("<bioconductor-package-name>")
|
||||
# }
|
||||
#
|
||||
# If you want to _force_ a re-installation of the package, simply issue
|
||||
# the install.packages("<package-name>") command on its own. For compactness
|
||||
# we wrap the idiom into a function, which can also switch between CRAN
|
||||
# and BIOconductor sources:
|
||||
|
||||
installIfNeeded <- function(package, s = "CRAN") {
  # Purpose: install a package only if its namespace cannot be loaded.
  # Parameters:
  #   package: chr  name of the package
  #   s:       chr  package source, either "CRAN" or "BIO" (Bioconductor)
  # Value: called for its side effect; stops with an error if s is neither
  #        "CRAN" nor "BIO".

  # TRUE if the namespace of pkg can not be loaded
  isMissing <- function(pkg) {
    ! requireNamespace(pkg, quietly=TRUE)
  }

  if (s == "BIO") {
    # Bioconductor packages are installed via BiocManager, which is itself
    # fetched with install.packages() if it is not yet available.
    if (isMissing("BiocManager")) {
      install.packages("BiocManager")
    }
    if (isMissing(package)) {
      BiocManager::install(package)
    }
  } else if (s == "CRAN") {
    if (isMissing(package)) {
      install.packages(package)
    }
  } else {
    stop(sprintf("Unknown source \"%s\".", s))
  }
}
|
||||
|
||||
|
||||
# = 2 CRAN packages =======================================================
|
||||
|
||||
installIfNeeded("ape")
|
||||
installIfNeeded("BiocManager")
|
||||
installIfNeeded("bio3d")
|
||||
installIfNeeded("evd")
|
||||
installIfNeeded("ggseqlogo")
|
||||
installIfNeeded("ggtern")
|
||||
installIfNeeded("hexbin")
|
||||
installIfNeeded("httr")
|
||||
installIfNeeded("igraph")
|
||||
installIfNeeded("jsonlite")
|
||||
installIfNeeded("magrittr")
|
||||
installIfNeeded("MASS")
|
||||
installIfNeeded("microbenchmark")
|
||||
installIfNeeded("phangorn")
|
||||
installIfNeeded("plotly")
|
||||
installIfNeeded("plotrix")
|
||||
installIfNeeded("profvis")
|
||||
installIfNeeded("robustbase")
|
||||
installIfNeeded("RColorBrewer")
|
||||
installIfNeeded("Rphylip")
|
||||
installIfNeeded("rvest")
|
||||
installIfNeeded("seqinr")
|
||||
installIfNeeded("stringi")
|
||||
installIfNeeded("taxize")
|
||||
installIfNeeded("testthat")
|
||||
installIfNeeded("xml2")
|
||||
|
||||
# = 3 Bioconductor packages ===============================================
|
||||
|
||||
installIfNeeded("Biobase", s = "BIO")
|
||||
installIfNeeded("biomaRt", s = "BIO")
|
||||
installIfNeeded("Biostrings", s = "BIO")
|
||||
installIfNeeded("DECIPHER", s = "BIO")
|
||||
installIfNeeded("GEOquery", s = "BIO")
|
||||
installIfNeeded("GOSim", s = "BIO")
|
||||
installIfNeeded("limma", s = "BIO")
|
||||
installIfNeeded("msa", s = "BIO")
|
||||
installIfNeeded("org.Sc.sgd.db", s = "BIO")
|
||||
installIfNeeded("prada", s = "BIO")
|
||||
installIfNeeded("topGO", s = "BIO")
|
||||
|
||||
|
||||
# = 4 Other package sources ===============================================
|
||||
|
||||
# Using sources other than CRAN or Bioconductor to download general-purpose
|
||||
# programs that run on your computer is not generally recommended.
|
||||
|
||||
|
||||
# = 5 Updating packages ===================================================
|
||||
|
||||
# From time to time, update CRAN packages with the following command ...
|
||||
|
||||
update.packages()
|
||||
|
||||
# ... and also update Bioconductor packages as follows:
|
||||
|
||||
BiocManager::install()
|
||||
|
||||
# [END]
|
||||
|
@ -1,100 +1,100 @@
|
||||
# addSACCE_APSESproteins.R
|
||||
# Adds the Saccharomyces cerevisiae APSES proteins to myDB
|
||||
#
|
||||
|
||||
myDB$protein <-
|
||||
rbind(myDB$protein,
|
||||
data.frame(
|
||||
ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
|
||||
name = "SWI4_SACCE",
|
||||
RefSeqID = "NP_011036",
|
||||
UniProtID = "P25302",
|
||||
taxonomy.ID = as.integer(4932),
|
||||
sequence = dbSanitizeSequence("
|
||||
1 mpfdvlisnq kdntnhqnit pisksvllap hsnhpvieia tysetdvyec yirgfetkiv
|
||||
61 mrrtkddwin itqvfkiaqf sktkrtkile kesndmqhek vqggygrfqg twipldsakf
|
||||
121 lvnkyeiidp vvnsiltfqf dpnnpppkrs knsilrktsp gtkitspssy nktprkknss
|
||||
181 sstsatttaa nkkgkknasi nqpnpsplqn lvfqtpqqfq vnssmnimnn ndnhttmnfn
|
||||
241 ndtrhnlinn isnnsnqsti iqqqksihen sfnnnysatq kplqffpipt nlqnknvaln
|
||||
301 npnnndsnsy shnidnvins snnnnngnnn nliivpdgpm qsqqqqqhhh eyltnnfnhs
|
||||
361 mmdsitngns kkrrkklnqs neqqfynqqe kiqrhfklmk qpllwqsfqn pndhhneycd
|
||||
421 sngsnnnnnt vasngssiev fssnendnsm nmssrsmtpf sagntssqnk lenkmtdqey
|
||||
481 kqtiltilss erssdvdqal latlypapkn fninfeiddq ghtplhwata maniplikml
|
||||
541 itlnanalqc nklgfncitk sifynncyke nafdeiisil kiclitpdvn grlpfhylie
|
||||
601 lsvnksknpm iiksymdsii lslgqqdynl lkiclnyqdn igntplhlsa lnlnfevynr
|
||||
661 lvylgastdi lnldnespas imnkfntpag gsnsrnnntk adrklarnlp qknyyqqqqq
|
||||
721 qqqpqnnvki pkiiktqhpd kedstadvni aktdsevnes qylhsnqpns tnmntimedl
|
||||
781 sninsfvtss vikdikstps kilenspily rrrsqsisde kekakdnenq vekkkdplns
|
||||
841 vktampsles pssllpiqms plgkyskpls qqinklntkv sslqrimgee iknldnevve
|
||||
901 tessisnnkk rlitiahqie dafdsvsnkt pinsisdlqs riketsskln sekqnfiqsl
|
||||
961 eksqalklat ivqdeeskvd mntnssshpe kqedeepipk stsetsspkn tkadakfsnt
|
||||
1021 vqesydvnet lrlateltil qfkrrmttlk iseakskins svkldkyrnl igitienids
|
||||
1081 klddiekdlr ana"),
|
||||
stringsAsFactors = FALSE))
|
||||
|
||||
myDB$protein <-
|
||||
rbind(myDB$protein,
|
||||
data.frame(
|
||||
ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
|
||||
name = "PHD1_SACCE",
|
||||
RefSeqID = "NP_012881",
|
||||
UniProtID = "P36093",
|
||||
taxonomy.ID = as.integer(4932),
|
||||
sequence = dbSanitizeSequence("
|
||||
1 myhvpemrlh yplvntqsna aitptrsydn tlpsfnelsh qstinlpfvq retpnayanv
|
||||
61 aqlatsptqa ksgyycryya vpfptypqqp qspyqqavlp yatipnsnfq pssfpvmavm
|
||||
121 ppevqfdgsf lntlhphtel ppiiqntndt svarpnnlks iaaasptvta ttrtpgvsst
|
||||
181 svlkprvitt mwedenticy qveangisvv rradnnming tkllnvtkmt rgrrdgilrs
|
||||
241 ekvrevvkig smhlkgvwip ferayilaqr eqildhlypl fvkdiesivd arkpsnkasl
|
||||
301 tpksspapik qepsdnkhei ateikpksid alsngastqg agelphlkin hidteaqtsr
|
||||
361 aknels"),
|
||||
stringsAsFactors = FALSE))
|
||||
|
||||
myDB$protein <-
|
||||
rbind(myDB$protein,
|
||||
data.frame(
|
||||
ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
|
||||
name = "SOK2_SACCE",
|
||||
RefSeqID = "NP_013729",
|
||||
UniProtID = "P53438",
|
||||
taxonomy.ID = as.integer(4932),
|
||||
sequence = dbSanitizeSequence("
|
||||
1 mpignpintn diksnrmrqe snmsavsnse stigqstqqq qqqqqylgqs vqplmpvsyq
|
||||
61 yvvpeqwpyp qyyqqpqsqs qqqlqsqpqm yqvqesfqss gsdsnasnpp stsvgvpsna
|
||||
121 tatalpngsa ittkksnnst nisnnvpyyy yfpqmqaqqs maysypqayy yypangdgtt
|
||||
181 ngatpsvtsn qvqnpnlekt ystfeqqqqh qqqqqlqaqt ypaqppkign afskfsksgp
|
||||
241 psdsssgsms pnsnrtsrns nsisslaqqp pmsnypqpst yqypgfhkts sipnshspip
|
||||
301 prslttptqg ptsqngplsy nlpqvgllpp qqqqqvsply dgnsitppvk pstdqetylt
|
||||
361 anrhgvsdqq ydsmaktmns fqtttirhpm pliattnatg sntsgtsasi irprvtttmw
|
||||
421 edektlcyqv eangisvvrr adndmvngtk llnvtkmtrg rrdgilkaek irhvvkigsm
|
||||
481 hlkgvwipfe ralaiaqrek iadylyplfi rdiqsvlkqn npsndsssss sstgiksisp
|
||||
541 rtyyqpinny qnpngpsnis aaqltyssmn lnnkiipnns ipavstiaag ekplkkctmp
|
||||
601 nsnqleghti tnlqtlsatm pmkqqlmgni asplsyprna tmnsastlgi tpadskpltp
|
||||
661 sptttntnqs sesnvgsiht gitlprvese sashskwske adsgntvpdn qtlkeprssq
|
||||
721 lpisaltstd tdkiktstsd eatqpnepse aepvkesess ksqvdgagdv sneeiaaddt
|
||||
781 kkqek"),
|
||||
stringsAsFactors = FALSE))
|
||||
|
||||
myDB$protein <-
|
||||
rbind(myDB$protein,
|
||||
data.frame(
|
||||
ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
|
||||
name = "XBP1_SACCE",
|
||||
RefSeqID = "NP_012165",
|
||||
UniProtID = "P40489",
|
||||
taxonomy.ID = as.integer(4932),
|
||||
sequence = dbSanitizeSequence("
|
||||
1 mkypafsins dtvhltdnpl ddyqrlylvs vldrdsppas fsaglnirkv nykssiaaqf
|
||||
61 thpnfiisar dagngeeaaa qnvlncfeyq fpnlqtiqsl vheqtllsql assatphsal
|
||||
121 hlhdknilmg kiilpsrsnk tpvsasptkq ekkalstasr enatssltkn qqfkltkmdh
|
||||
181 nlindklinp nncviwshds gyvfmtgiwr lyqdvmkgli nlprgdsvst sqqqffckae
|
||||
241 fekilsfcfy nhssftsees ssvllsssts sppkrrtstg stfldanass sstsstqann
|
||||
301 yidfhwnnik pelrdlicqs ykdflinelg pdqidlpnln panftkrirg gyikiqgtwl
|
||||
361 pmeisrllcl rfcfpiryfl vpifgpdfpk dceswylahq nvtfassttg agaataataa
|
||||
421 antstnftst avarprqkpr prprqrstsm shskaqklvi edalpsfdsf venlglssnd
|
||||
481 knfikknskr qksstytsqt sspigprdpt vqilsnlasf ynthghrysy pgniyipqqr
|
||||
541 yslpppnqls spqrqlnyty dhihpvpsqy qsprhynvps spiapapptf pqpygddhyh
|
||||
601 flkyasevyk qqnqrpahnt ntnmdtsfsp rannslnnfk fktnskq"),
|
||||
stringsAsFactors = FALSE))
|
||||
|
||||
# [END]
|
||||
# addSACCE_APSESproteins.R
|
||||
# Adds the Saccharomyces cerevisiae APSES proteins to myDB
|
||||
#
|
||||
|
||||
myDB$protein <-
|
||||
rbind(myDB$protein,
|
||||
data.frame(
|
||||
ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
|
||||
name = "SWI4_SACCE",
|
||||
RefSeqID = "NP_011036",
|
||||
UniProtID = "P25302",
|
||||
taxonomy.ID = as.integer(4932),
|
||||
sequence = dbSanitizeSequence("
|
||||
1 mpfdvlisnq kdntnhqnit pisksvllap hsnhpvieia tysetdvyec yirgfetkiv
|
||||
61 mrrtkddwin itqvfkiaqf sktkrtkile kesndmqhek vqggygrfqg twipldsakf
|
||||
121 lvnkyeiidp vvnsiltfqf dpnnpppkrs knsilrktsp gtkitspssy nktprkknss
|
||||
181 sstsatttaa nkkgkknasi nqpnpsplqn lvfqtpqqfq vnssmnimnn ndnhttmnfn
|
||||
241 ndtrhnlinn isnnsnqsti iqqqksihen sfnnnysatq kplqffpipt nlqnknvaln
|
||||
301 npnnndsnsy shnidnvins snnnnngnnn nliivpdgpm qsqqqqqhhh eyltnnfnhs
|
||||
361 mmdsitngns kkrrkklnqs neqqfynqqe kiqrhfklmk qpllwqsfqn pndhhneycd
|
||||
421 sngsnnnnnt vasngssiev fssnendnsm nmssrsmtpf sagntssqnk lenkmtdqey
|
||||
481 kqtiltilss erssdvdqal latlypapkn fninfeiddq ghtplhwata maniplikml
|
||||
541 itlnanalqc nklgfncitk sifynncyke nafdeiisil kiclitpdvn grlpfhylie
|
||||
601 lsvnksknpm iiksymdsii lslgqqdynl lkiclnyqdn igntplhlsa lnlnfevynr
|
||||
661 lvylgastdi lnldnespas imnkfntpag gsnsrnnntk adrklarnlp qknyyqqqqq
|
||||
721 qqqpqnnvki pkiiktqhpd kedstadvni aktdsevnes qylhsnqpns tnmntimedl
|
||||
781 sninsfvtss vikdikstps kilenspily rrrsqsisde kekakdnenq vekkkdplns
|
||||
841 vktampsles pssllpiqms plgkyskpls qqinklntkv sslqrimgee iknldnevve
|
||||
901 tessisnnkk rlitiahqie dafdsvsnkt pinsisdlqs riketsskln sekqnfiqsl
|
||||
961 eksqalklat ivqdeeskvd mntnssshpe kqedeepipk stsetsspkn tkadakfsnt
|
||||
1021 vqesydvnet lrlateltil qfkrrmttlk iseakskins svkldkyrnl igitienids
|
||||
1081 klddiekdlr ana"),
|
||||
stringsAsFactors = FALSE))
|
||||
|
||||
myDB$protein <-
|
||||
rbind(myDB$protein,
|
||||
data.frame(
|
||||
ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
|
||||
name = "PHD1_SACCE",
|
||||
RefSeqID = "NP_012881",
|
||||
UniProtID = "P36093",
|
||||
taxonomy.ID = as.integer(4932),
|
||||
sequence = dbSanitizeSequence("
|
||||
1 myhvpemrlh yplvntqsna aitptrsydn tlpsfnelsh qstinlpfvq retpnayanv
|
||||
61 aqlatsptqa ksgyycryya vpfptypqqp qspyqqavlp yatipnsnfq pssfpvmavm
|
||||
121 ppevqfdgsf lntlhphtel ppiiqntndt svarpnnlks iaaasptvta ttrtpgvsst
|
||||
181 svlkprvitt mwedenticy qveangisvv rradnnming tkllnvtkmt rgrrdgilrs
|
||||
241 ekvrevvkig smhlkgvwip ferayilaqr eqildhlypl fvkdiesivd arkpsnkasl
|
||||
301 tpksspapik qepsdnkhei ateikpksid alsngastqg agelphlkin hidteaqtsr
|
||||
361 aknels"),
|
||||
stringsAsFactors = FALSE))
|
||||
|
||||
myDB$protein <-
|
||||
rbind(myDB$protein,
|
||||
data.frame(
|
||||
ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
|
||||
name = "SOK2_SACCE",
|
||||
RefSeqID = "NP_013729",
|
||||
UniProtID = "P53438",
|
||||
taxonomy.ID = as.integer(4932),
|
||||
sequence = dbSanitizeSequence("
|
||||
1 mpignpintn diksnrmrqe snmsavsnse stigqstqqq qqqqqylgqs vqplmpvsyq
|
||||
61 yvvpeqwpyp qyyqqpqsqs qqqlqsqpqm yqvqesfqss gsdsnasnpp stsvgvpsna
|
||||
121 tatalpngsa ittkksnnst nisnnvpyyy yfpqmqaqqs maysypqayy yypangdgtt
|
||||
181 ngatpsvtsn qvqnpnlekt ystfeqqqqh qqqqqlqaqt ypaqppkign afskfsksgp
|
||||
241 psdsssgsms pnsnrtsrns nsisslaqqp pmsnypqpst yqypgfhkts sipnshspip
|
||||
301 prslttptqg ptsqngplsy nlpqvgllpp qqqqqvsply dgnsitppvk pstdqetylt
|
||||
361 anrhgvsdqq ydsmaktmns fqtttirhpm pliattnatg sntsgtsasi irprvtttmw
|
||||
421 edektlcyqv eangisvvrr adndmvngtk llnvtkmtrg rrdgilkaek irhvvkigsm
|
||||
481 hlkgvwipfe ralaiaqrek iadylyplfi rdiqsvlkqn npsndsssss sstgiksisp
|
||||
541 rtyyqpinny qnpngpsnis aaqltyssmn lnnkiipnns ipavstiaag ekplkkctmp
|
||||
601 nsnqleghti tnlqtlsatm pmkqqlmgni asplsyprna tmnsastlgi tpadskpltp
|
||||
661 sptttntnqs sesnvgsiht gitlprvese sashskwske adsgntvpdn qtlkeprssq
|
||||
721 lpisaltstd tdkiktstsd eatqpnepse aepvkesess ksqvdgagdv sneeiaaddt
|
||||
781 kkqek"),
|
||||
stringsAsFactors = FALSE))
|
||||
|
||||
myDB$protein <-
|
||||
rbind(myDB$protein,
|
||||
data.frame(
|
||||
ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
|
||||
name = "XBP1_SACCE",
|
||||
RefSeqID = "NP_012165",
|
||||
UniProtID = "P40489",
|
||||
taxonomy.ID = as.integer(4932),
|
||||
sequence = dbSanitizeSequence("
|
||||
1 mkypafsins dtvhltdnpl ddyqrlylvs vldrdsppas fsaglnirkv nykssiaaqf
|
||||
61 thpnfiisar dagngeeaaa qnvlncfeyq fpnlqtiqsl vheqtllsql assatphsal
|
||||
121 hlhdknilmg kiilpsrsnk tpvsasptkq ekkalstasr enatssltkn qqfkltkmdh
|
||||
181 nlindklinp nncviwshds gyvfmtgiwr lyqdvmkgli nlprgdsvst sqqqffckae
|
||||
241 fekilsfcfy nhssftsees ssvllsssts sppkrrtstg stfldanass sstsstqann
|
||||
301 yidfhwnnik pelrdlicqs ykdflinelg pdqidlpnln panftkrirg gyikiqgtwl
|
||||
361 pmeisrllcl rfcfpiryfl vpifgpdfpk dceswylahq nvtfassttg agaataataa
|
||||
421 antstnftst avarprqkpr prprqrstsm shskaqklvi edalpsfdsf venlglssnd
|
||||
481 knfikknskr qksstytsqt sspigprdpt vqilsnlasf ynthghrysy pgniyipqqr
|
||||
541 yslpppnqls spqrqlnyty dhihpvpsqy qsprhynvps spiapapptf pqpygddhyh
|
||||
601 flkyasevyk qqnqrpahnt ntnmdtsfsp rannslnnfk fktnskq"),
|
||||
stringsAsFactors = FALSE))
|
||||
|
||||
# [END]
|
||||
|
138
ABC-units.R
138
ABC-units.R
@ -1,69 +1,69 @@
|
||||
# ABC-units.R
|
||||
#
|
||||
# Purpose: A Bioinformatics Course: R code for learning units
|
||||
#
|
||||
# Version: 4.0
|
||||
#
|
||||
# Date: 2020 09 16
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# V 4.0 2020 version
|
||||
# V 3.0 2019 version
|
||||
# V 2.0 2018 version
|
||||
# V 1.0 2017 version
|
||||
# V 0.1 First code
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
|
||||
#
|
||||
# The R-scripts and datasets in this project will be continuously updated,
|
||||
# and updates will be posted on GitHub. To bring your version into the latest
|
||||
# state use the Git-pane (top left) and "pull" (blue downward arrow) from the
|
||||
# repository. However, this will overwrite locally edited versions of files.
|
||||
|
||||
# To edit code and experiment with it, for example to add your own comments and
|
||||
# examples, save your edited version into the "myScripts" folder. Otherwise you
|
||||
# may have problems with git when you update the project to a new version. It's
|
||||
# good practice to change the filename, for example by prepending your initials.
|
||||
# This helps distinguish the files you are working with e.g. in a list of
|
||||
# recent files. For example if your name is Honjo Tasuku, your edited
|
||||
# BIN-Sequence.R might be named HT-BIN-Sequence.R
|
||||
|
||||
# If you pull from github and get the following type of error ...
|
||||
# ---------------
|
||||
# error: Your local changes to the following files would be
|
||||
# overwritten by merge
|
||||
# ...
|
||||
# Please commit your changes or stash them before you can merge.
|
||||
# ---------------
|
||||
# ... then, you need to bring the offending file into its original state.
|
||||
# Open the Commit window, select the file, and click on the Revert button.
|
||||
#
|
||||
# When working with these scripts DO NOT SIMPLY source() THESE FILES!
|
||||
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
# Once you have typed and executed the function init(), you will find a file
|
||||
# called myScript.R in the project directory.
|
||||
#
|
||||
# Open it, you can place all of your code-experiments and notes into that
|
||||
# file. This will complement your "Course Journal". If you keep all contents in
|
||||
# this one file, you can find everything by using the <cmd>-F find function. To
|
||||
# cross-reference code in your journal, create section headings.
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
# The individual learning units' files can be opened by simply clicking on them
|
||||
# in the File pane.
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
# ABC-units.R
|
||||
#
|
||||
# Purpose: A Bioinformatics Course: R code for learning units
|
||||
#
|
||||
# Version: 4.0
|
||||
#
|
||||
# Date: 2020 09 16
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# V 4.0 2020 version
|
||||
# V 3.0 2019 version
|
||||
# V 2.0 2018 version
|
||||
# V 1.0 2017 version
|
||||
# V 0.1 First code
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
|
||||
#
|
||||
# The R-scripts and datasets in this project will be continuously updated,
|
||||
# and updates will be posted on GitHub. To bring your version into the latest
|
||||
# state use the Git-pane (top left) and "pull" (blue downward arrow) from the
|
||||
# repository. However, this will overwrite locally edited versions of files.
|
||||
|
||||
# To edit code and experiment with it, for example to add your own comments and
|
||||
# examples, save your edited version into the "myScripts" folder. Otherwise you
|
||||
# may have problems with git when you update the project to a new version. It's
|
||||
# good practice to change the filename, for example by prepending your initials.
|
||||
# This helps distinguish the files you are working with e.g. in a list of
|
||||
# recent files. For example if your name is Honjo Tasuku, your edited
|
||||
# BIN-Sequence.R might be named HT-BIN-Sequence.R
|
||||
|
||||
# If you pull from github and get the following type of error ...
|
||||
# ---------------
|
||||
# error: Your local changes to the following files would be
|
||||
# overwritten by merge
|
||||
# ...
|
||||
# Please commit your changes or stash them before you can merge.
|
||||
# ---------------
|
||||
# ... then, you need to bring the offending file into its original state.
|
||||
# Open the Commit window, select the file, and click on the Revert button.
|
||||
#
|
||||
# When working with these scripts DO NOT SIMPLY source() THESE FILES!
|
||||
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
# Once you have typed and executed the function init(), you will find a file
|
||||
# called myScript.R in the project directory.
|
||||
#
|
||||
# Open it, you can place all of your code-experiments and notes into that
|
||||
# file. This will complement your "Course Journal". If you keep all contents in
|
||||
# this one file, you can find everything by using the <cmd>-F find function. To
|
||||
# cross-reference code in your journal, create section headings.
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
# The individual learning units' files can be opened by simply clicking on them
|
||||
# in the File pane.
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
|
@ -1,16 +1,16 @@
|
||||
Version: 1.0
|
||||
|
||||
RestoreWorkspace: No
|
||||
SaveWorkspace: No
|
||||
AlwaysSaveHistory: No
|
||||
|
||||
EnableCodeIndexing: Yes
|
||||
UseSpacesForTab: Yes
|
||||
NumSpacesForTab: 2
|
||||
Encoding: UTF-8
|
||||
|
||||
RnwWeave: knitr
|
||||
LaTeX: XeLaTeX
|
||||
|
||||
AutoAppendNewline: Yes
|
||||
StripTrailingWhitespace: Yes
|
||||
Version: 1.0
|
||||
|
||||
RestoreWorkspace: No
|
||||
SaveWorkspace: No
|
||||
AlwaysSaveHistory: No
|
||||
|
||||
EnableCodeIndexing: Yes
|
||||
UseSpacesForTab: Yes
|
||||
NumSpacesForTab: 2
|
||||
Encoding: UTF-8
|
||||
|
||||
RnwWeave: knitr
|
||||
LaTeX: XeLaTeX
|
||||
|
||||
AutoAppendNewline: Yes
|
||||
StripTrailingWhitespace: Yes
|
||||
|
222
BIN-ALI-BLAST.R
222
BIN-ALI-BLAST.R
@ -1,111 +1,111 @@
|
||||
# tocID <- "BIN-ALI-BLAST.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-ALI-BLAST unit.
|
||||
#
|
||||
# ==============================================================================
|
||||
#
|
||||
# Version: 1.3
|
||||
#
|
||||
# Date: 2017-10 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.3 2020 Maintenance
|
||||
# 1.2 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 1.1 Fixed parsing logic.
|
||||
# 1.0 First live version 2017.
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------------------
|
||||
#TOC> 1 Defining the APSES domain 45
|
||||
#TOC> 2 Executing the BLAST search 75
|
||||
#TOC> 3 Analysing results 97
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Defining the APSES domain ===========================================
|
||||
|
||||
# Load your protein database
|
||||
source("makeProteinDB.R")
|
||||
|
||||
# Get the APSES domain sequence via your MBP1_MYSPE feature annotation. (You
|
||||
# have entered this data into your database in the
|
||||
# BIN-ALI-Optimal_sequence_alignment unit.)
|
||||
|
||||
( myOrth <- sprintf("MBP1_%s", biCode(MYSPE)) ) # If this is not the correct
|
||||
# name of the Mbp1 orthologue
|
||||
# of Mbp1 in your protein
|
||||
# database, DON'T continue. We
|
||||
# need to fix this problem.
|
||||
# Get in touch.
|
||||
|
||||
(proID <- myDB$protein$ID[myDB$protein$name == myOrth])
|
||||
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
|
||||
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
|
||||
myDB$annotation$featureID == ftrID])
|
||||
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
|
||||
(end <- myDB$annotation$end[myDB$annotation$ID == fanID])
|
||||
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
|
||||
start,
|
||||
end))
|
||||
|
||||
# The MYSPE "apses" sequence is the sequence that we will use for our reverse
|
||||
# BLAST search.
|
||||
|
||||
|
||||
# = 2 Executing the BLAST search ==========================================
|
||||
|
||||
# The ./scripts/BLAST.R code defines two functions to access the BLAST interface
|
||||
# through its Web API, and to parse results. Have a look at the script, then
|
||||
# source it:
|
||||
|
||||
source("./scripts/BLAST.R")
|
||||
|
||||
# Use BLAST() to find the best match to the MYSPE APSES domain in Saccharomyces
|
||||
# cerevisiae:
|
||||
|
||||
BLASTresults <- BLAST(apses, # MYSPE APSES domain sequence
|
||||
db = "refseq_protein", # database to search in
|
||||
nHits = 10, #
|
||||
E = 0.01, #
|
||||
limits = "txid559292[ORGN]") # S. cerevisiae S288c
|
||||
|
||||
|
||||
length(BLASTresults$hits) # There should be at least one hit there. Ask for
|
||||
# advice in case this step fails.
|
||||
|
||||
|
||||
# = 3 Analysing results ===================================================
|
||||
|
||||
(topHit <- BLASTresults$hits[[1]]) # Get the top hit
|
||||
|
||||
# What is the refseq ID of the top hit
|
||||
topHit$accession
|
||||
|
||||
# If this is "NP_010227.1" you have confirmed the RBM of the MYSPE apses
|
||||
# domain. If it is not, ask me for advice.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "BIN-ALI-BLAST.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-ALI-BLAST unit.
|
||||
#
|
||||
# ==============================================================================
|
||||
#
|
||||
# Version: 1.3
|
||||
#
|
||||
# Date: 2017-10 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.3 2020 Maintenance
|
||||
# 1.2 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 1.1 Fixed parsing logic.
|
||||
# 1.0 First live version 2017.
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------------------
|
||||
#TOC> 1 Defining the APSES domain 45
|
||||
#TOC> 2 Executing the BLAST search 75
|
||||
#TOC> 3 Analysing results 97
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Defining the APSES domain ===========================================
|
||||
|
||||
# Load your protein database
|
||||
source("makeProteinDB.R")
|
||||
|
||||
# Get the APSES domain sequence via your MBP1_MYSPE feature annotation. (You
|
||||
# have entered this data into your database in the
|
||||
# BIN-ALI-Optimal_sequence_alignment unit.)
|
||||
|
||||
( myOrth <- sprintf("MBP1_%s", biCode(MYSPE)) ) # If this is not the correct
|
||||
# name of the Mbp1 orthologue
|
||||
# of Mbp1 in your protein
|
||||
# database, DON'T continue. We
|
||||
# need to fix this problem.
|
||||
# Get in touch.
|
||||
|
||||
(proID <- myDB$protein$ID[myDB$protein$name == myOrth])
|
||||
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
|
||||
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
|
||||
myDB$annotation$featureID == ftrID])
|
||||
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
|
||||
(end <- myDB$annotation$end[myDB$annotation$ID == fanID])
|
||||
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
|
||||
start,
|
||||
end))
|
||||
|
||||
# The MYSPE "apses" sequence is the sequence that we will use for our reverse
|
||||
# BLAST search.
|
||||
|
||||
|
||||
# = 2 Executing the BLAST search ==========================================
|
||||
|
||||
# The ./scripts/BLAST.R code defines two functions to access the BLAST interface
|
||||
# through its Web API, and to parse results. Have a look at the script, then
|
||||
# source it:
|
||||
|
||||
source("./scripts/BLAST.R")
|
||||
|
||||
# Use BLAST() to find the best match to the MYSPE APSES domain in Saccharomyces
|
||||
# cerevisiae:
|
||||
|
||||
BLASTresults <- BLAST(apses, # MYSPE APSES domain sequence
|
||||
db = "refseq_protein", # database to search in
|
||||
nHits = 10, #
|
||||
E = 0.01, #
|
||||
limits = "txid559292[ORGN]") # S. cerevisiae S288c
|
||||
|
||||
|
||||
length(BLASTresults$hits) # There should be at least one hit there. Ask for
|
||||
# advice in case this step fails.
|
||||
|
||||
|
||||
# = 3 Analysing results ===================================================
|
||||
|
||||
(topHit <- BLASTresults$hits[[1]]) # Get the top hit
|
||||
|
||||
# What is the refseq ID of the top hit
|
||||
topHit$accession
|
||||
|
||||
# If this is "NP_010227.1" you have confirmed the RBM of the MYSPE apses
|
||||
# domain. If it is not, ask me for advice.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
|
@ -1,195 +1,195 @@
|
||||
# tocID <- "BIN-ALI-Dotplot.R"
|
||||
#
|
||||
#
|
||||
# ==============================================================================
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-ALI-Dotplot unit.
|
||||
#
|
||||
# Version: 0.2
|
||||
#
|
||||
# Date: 2019 01 07
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 0.2 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> --------------------------------------
|
||||
#TOC> 1 ___Section___ 42
|
||||
#TOC> 2 Tasks 190
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 ___Section___ =======================================================
|
||||
|
||||
if (!requireNamespace("BiocManager", quietly=TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
if (!requireNamespace("Biostrings", quietly=TRUE)) {
|
||||
BiocManager::install("Biostrings")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = Biostrings) # basic information
|
||||
# browseVignettes("Biostrings") # available vignettes
|
||||
# data(package = "Biostrings") # available datasets
|
||||
|
||||
if (!requireNamespace("seqinr", quietly=TRUE)) {
|
||||
install.packages("seqinr")
|
||||
}
|
||||
|
||||
|
||||
# Let's load BLOSUM62
|
||||
data(BLOSUM62, package = "Biostrings")
|
||||
|
||||
# Now let's craft code for a dotplot. That's surprisingly simple. We build a
|
||||
# matrix that has as many rows as one sequence, as many columns as another. Then
|
||||
# we go through every cell of the matrix and enter the pairscore we encounter
|
||||
# for the amino acid pair whose position corresponds to the row and column
|
||||
# index. Finally we visualize the matrix in a plot.
|
||||
#
|
||||
|
||||
# First we fetch our sequences and split them into single characters.
|
||||
sel <- myDB$protein$name == "MBP1_SACCE"
|
||||
MBP1_SACCE <- seqinr::s2c(myDB$protein$sequence[sel])
|
||||
|
||||
sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
|
||||
MBP1_MYSPE <- seqinr::s2c(myDB$protein$sequence[sel])
|
||||
|
||||
# Check that we have two character vectors of the expected length.
|
||||
str(MBP1_SACCE)
|
||||
str(MBP1_MYSPE)
|
||||
|
||||
# How do we get the pairscore values? Consider: a single pair of amino acids can
|
||||
# be obtained from sequence SACCE and MYSPE eg. from position 13 and 21 ...
|
||||
MBP1_SACCE[13]
|
||||
MBP1_MYSPE[21]
|
||||
|
||||
# ... using these as subsetting expressions, we can pull the pairscore
|
||||
# from the MDM
|
||||
BLOSUM62[MBP1_SACCE[13], MBP1_MYSPE[21]]
|
||||
|
||||
# First we build an empty matrix that will hold all pairscores ...
|
||||
dotMat <- matrix(numeric(length(MBP1_SACCE) * length(MBP1_MYSPE)),
|
||||
nrow = length(MBP1_SACCE), ncol = length(MBP1_MYSPE))
|
||||
|
||||
# ... then we loop over the sequences and store the scores in the matrix.
|
||||
#
|
||||
for (i in 1:length(MBP1_SACCE)) {
|
||||
for (j in 1:length(MBP1_MYSPE)) {
|
||||
dotMat[i, j] <- BLOSUM62[MBP1_SACCE[i], MBP1_MYSPE[j]]
|
||||
}
|
||||
}
|
||||
|
||||
# Even though this is a large matrix, this does not take much time ...
|
||||
# Let's have a look at a small block of the values:
|
||||
|
||||
dotMat[1:10, 1:10]
|
||||
|
||||
# Rows in this matrix correspond to an amino acid from MBP1_SACCE, columns in
|
||||
# the matrix correspond to an amino acid from MBP1_MYSPE.
|
||||
|
||||
# To plot this, we use the image() function. Here, with default parameters.
|
||||
|
||||
image(dotMat)
|
||||
|
||||
# Be patient, this takes a few moments to render: more than 500,000 values.
|
||||
# Nice.
|
||||
# What do you expect?
|
||||
# What would similar sequences look like?
|
||||
# What do you see?
|
||||
|
||||
# You might notice a thin line of yellow along the diagonal, moving approximately
|
||||
# from bottom left to top right, fading in and out of existence. This is the
|
||||
# signature of extended sequence similarity.
|
||||
|
||||
# Let's magnify this a bit by looking at only the first 200 amino acids ...
|
||||
image(dotMat[1:200, 1:200])
|
||||
|
||||
# ... and, according to our normal writing convention, we would like the
|
||||
# diagonal to run from top-left to bottom-right since we write from left to
|
||||
# right and from top to bottom...
|
||||
image(dotMat[1:200, 1:200], ylim = 1.0:0.0)
|
||||
|
||||
# ... and we would like the range of the x- and y- axis to correspond to the
|
||||
# sequence position ...
|
||||
image(x = 1:200, y = 1:200, dotMat[1:200, 1:200], ylim=c(200,1))
|
||||
|
||||
# ... and labels! Axis labels would be nice ...
|
||||
image(x = 1:200, y = 1:200, dotMat[1:200, 1:200], ylim=c(200,1),
|
||||
xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE" )
|
||||
|
||||
# ... and why don't we have axis-numbers on all four sides? Go, make that right
|
||||
# too ...
|
||||
len <- 200
|
||||
image(x = 1:len, y = 1:len, dotMat[1:len, 1:len], ylim=c(len,1),
|
||||
xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE", axes = FALSE)
|
||||
box()
|
||||
axis(1, at = c(1, seq(10, len, by=10)))
|
||||
axis(2, at = c(1, seq(10, len, by=10)))
|
||||
axis(3, at = c(1, seq(10, len, by=10)))
|
||||
axis(4, at = c(1, seq(10, len, by=10)))
|
||||
|
||||
# ... you get the idea, we can infinitely customize our plot. However a good way
|
||||
# to do this is to develop a particular view for, say, a report or publication
|
||||
# in a script and then put it into a function. I have put a function into the
|
||||
# utilities file and called it dotPlot2(). Why not dotPlot() ... that's because
|
||||
# there already is a dotplot function in the seqinr package:
|
||||
|
||||
seqinr::dotPlot(MBP1_SACCE, MBP1_MYSPE) # seqinr
|
||||
dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE") # Our's
|
||||
|
||||
# Which one do you prefer? You can probably see the block patterns that arise
|
||||
# from segments of repetitive, low complexity sequence. But you probably have to
|
||||
# look very closely to discern the faint diagonals that correspond to similar
|
||||
# sequence.
|
||||
|
||||
|
||||
# Let's see if we can enhance the contrast between distributed noise and the
|
||||
# actual alignment of conserved residues. We can filter the dot matrix with a
|
||||
# pattern that enhances diagonally repeated values. Every value in the matrix
|
||||
# will be replaced by a weighted average of its neighborhood. Here is a
|
||||
# diagonal-filter:
|
||||
|
||||
myFilter <- matrix(numeric(25), nrow = 5)
|
||||
myFilter[1, ] <- c( 1, 0, 0, 0, 0)
|
||||
myFilter[2, ] <- c( 0, 1, 0, 0, 0)
|
||||
myFilter[3, ] <- c( 0, 0, 1, 0, 0)
|
||||
myFilter[4, ] <- c( 0, 0, 0, 1, 0)
|
||||
myFilter[5, ] <- c( 0, 0, 0, 0, 1)
|
||||
|
||||
# I have added the option to read such filters (or others that you could define on your own) as a parameter of the function.
|
||||
|
||||
dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE", f = myFilter)
|
||||
|
||||
# I think the result shows quite nicely how the two sequences are globally
|
||||
# related and where the regions of sequence similarity are. Play with this a bit
|
||||
# ... Can you come up with a better filter? If so, eMail us.
|
||||
|
||||
|
||||
|
||||
|
||||
# = 2 Tasks ===============================================================
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "BIN-ALI-Dotplot.R"
|
||||
#
|
||||
#
|
||||
# ==============================================================================
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-ALI-Dotplot unit.
|
||||
#
|
||||
# Version: 0.2
|
||||
#
|
||||
# Date: 2019 01 07
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 0.2 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> --------------------------------------
|
||||
#TOC> 1 ___Section___ 42
|
||||
#TOC> 2 Tasks 190
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 ___Section___ =======================================================
|
||||
|
||||
if (!requireNamespace("BiocManager", quietly=TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
if (!requireNamespace("Biostrings", quietly=TRUE)) {
|
||||
BiocManager::install("Biostrings")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = Biostrings) # basic information
|
||||
# browseVignettes("Biostrings") # available vignettes
|
||||
# data(package = "Biostrings") # available datasets
|
||||
|
||||
if (!requireNamespace("seqinr", quietly=TRUE)) {
|
||||
install.packages("seqinr")
|
||||
}
|
||||
|
||||
|
||||
# Let's load BLOSUM62
|
||||
data(BLOSUM62, package = "Biostrings")
|
||||
|
||||
# Now let's craft code for a dotplot. That's surprisingly simple. We build a
|
||||
# matrix that has as many rows as one sequence, as many columns as another. Then
|
||||
# we go through every cell of the matrix and enter the pairscore we encounter
|
||||
# for the amino acid pair whose position corresponds to the row and column
|
||||
# index. Finally we visualize the matrix in a plot.
|
||||
#
|
||||
|
||||
# First we fetch our sequences and split them into single characters.
|
||||
sel <- myDB$protein$name == "MBP1_SACCE"
|
||||
MBP1_SACCE <- seqinr::s2c(myDB$protein$sequence[sel])
|
||||
|
||||
sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
|
||||
MBP1_MYSPE <- seqinr::s2c(myDB$protein$sequence[sel])
|
||||
|
||||
# Check that we have two character vectors of the expected length.
|
||||
str(MBP1_SACCE)
|
||||
str(MBP1_MYSPE)
|
||||
|
||||
# How do we get the pairscore values? Consider: a single pair of amino acids can
|
||||
# be obtained from sequence SACCE and MYSPE eg. from position 13 and 21 ...
|
||||
MBP1_SACCE[13]
|
||||
MBP1_MYSPE[21]
|
||||
|
||||
# ... using these as subsetting expressions, we can pull the pairscore
|
||||
# from the MDM
|
||||
BLOSUM62[MBP1_SACCE[13], MBP1_MYSPE[21]]
|
||||
|
||||
# First we build an empty matrix that will hold all pairscores ...
|
||||
dotMat <- matrix(numeric(length(MBP1_SACCE) * length(MBP1_MYSPE)),
|
||||
nrow = length(MBP1_SACCE), ncol = length(MBP1_MYSPE))
|
||||
|
||||
# ... then we loop over the sequences and store the scores in the matrix.
|
||||
#
|
||||
for (i in 1:length(MBP1_SACCE)) {
|
||||
for (j in 1:length(MBP1_MYSPE)) {
|
||||
dotMat[i, j] <- BLOSUM62[MBP1_SACCE[i], MBP1_MYSPE[j]]
|
||||
}
|
||||
}
|
||||
|
||||
# Even though this is a large matrix, this does not take much time ...
|
||||
# Let's have a look at a small block of the values:
|
||||
|
||||
dotMat[1:10, 1:10]
|
||||
|
||||
# Rows in this matrix correspond to an amino acid from MBP1_SACCE, columns in
|
||||
# the matrix correspond to an amino acid from MBP1_MYSPE.
|
||||
|
||||
# To plot this, we use the image() function. Here, with default parameters.
|
||||
|
||||
image(dotMat)
|
||||
|
||||
# Be patient, this takes a few moments to render: more than 500,000 values.
|
||||
# Nice.
|
||||
# What do you expect?
|
||||
# What would similar sequences look like?
|
||||
# What do you see?
|
||||
|
||||
# You might notice a thin line of yellow along the diagonal, moving approximately
|
||||
# from bottom left to top right, fading in and out of existence. This is the
|
||||
# signature of extended sequence similarity.
|
||||
|
||||
# Let's magnify this a bit by looking at only the first 200 amino acids ...
|
||||
image(dotMat[1:200, 1:200])
|
||||
|
||||
# ... and, according to our normal writing convention, we would like the
|
||||
# diagonal to run from top-left to bottom-right since we write from left to
|
||||
# right and from top to bottom...
|
||||
image(dotMat[1:200, 1:200], ylim = 1.0:0.0)
|
||||
|
||||
# ... and we would like the range of the x- and y- axis to correspond to the
|
||||
# sequence position ...
|
||||
image(x = 1:200, y = 1:200, dotMat[1:200, 1:200], ylim=c(200,1))
|
||||
|
||||
# ... and labels! Axis labels would be nice ...
|
||||
image(x = 1:200, y = 1:200, dotMat[1:200, 1:200], ylim=c(200,1),
|
||||
xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE" )
|
||||
|
||||
# ... and why don't we have axis-numbers on all four sides? Go, make that right
|
||||
# too ...
|
||||
len <- 200
|
||||
image(x = 1:len, y = 1:len, dotMat[1:len, 1:len], ylim=c(len,1),
|
||||
xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE", axes = FALSE)
|
||||
box()
|
||||
axis(1, at = c(1, seq(10, len, by=10)))
|
||||
axis(2, at = c(1, seq(10, len, by=10)))
|
||||
axis(3, at = c(1, seq(10, len, by=10)))
|
||||
axis(4, at = c(1, seq(10, len, by=10)))
|
||||
|
||||
# ... you get the idea, we can infinitely customize our plot. However a good way
|
||||
# to do this is to develop a particular view for, say, a report or publication
|
||||
# in a script and then put it into a function. I have put a function into the
|
||||
# utilities file and called it dotPlot2(). Why not dotPlot() ... that's because
|
||||
# there already is a dotplot function in the seqinr package:
|
||||
|
||||
seqinr::dotPlot(MBP1_SACCE, MBP1_MYSPE) # seqinr
|
||||
dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE") # Our's
|
||||
|
||||
# Which one do you prefer? You can probably see the block patterns that arise
|
||||
# from segments of repetitive, low complexity sequence. But you probably have to
|
||||
# look very closely to discern the faint diagonals that correspond to similar
|
||||
# sequence.
|
||||
|
||||
|
||||
# Let's see if we can enhance the contrast between distributed noise and the
|
||||
# actual alignment of conserved residues. We can filter the dot matrix with a
|
||||
# pattern that enhances diagonally repeated values. Every value in the matrix
|
||||
# will be replaced by a weighted average of its neighborhood. Here is a
|
||||
# diagonal-filter:
|
||||
|
||||
myFilter <- matrix(numeric(25), nrow = 5)
|
||||
myFilter[1, ] <- c( 1, 0, 0, 0, 0)
|
||||
myFilter[2, ] <- c( 0, 1, 0, 0, 0)
|
||||
myFilter[3, ] <- c( 0, 0, 1, 0, 0)
|
||||
myFilter[4, ] <- c( 0, 0, 0, 1, 0)
|
||||
myFilter[5, ] <- c( 0, 0, 0, 0, 1)
|
||||
|
||||
# I have added the option to read such filters (or others that you could define on your own) as a parameter of the function.
|
||||
|
||||
dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE", f = myFilter)
|
||||
|
||||
# I think the result shows quite nicely how the two sequences are globally
|
||||
# related and where the regions of sequence similarity are. Play with this a bit
|
||||
# ... Can you come up with a better filter? If so, eMail us.
|
||||
|
||||
|
||||
|
||||
|
||||
# = 2 Tasks ===============================================================
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
|
1256
BIN-ALI-MSA.R
1256
BIN-ALI-MSA.R
File diff suppressed because it is too large
Load Diff
@ -1,365 +1,365 @@
|
||||
# tocID <- "BIN-ALI-Optimal_sequence_alignment.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-ALI-Optimal_sequence_alignment unit.
|
||||
#
|
||||
# ==============================================================================
|
||||
# Version: 1.7.1
|
||||
#
|
||||
# Date: 2017-09 - 2020-10
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.7.1 add jsonlite:: to fromJSON() in code sample and ./myScripts/
|
||||
# 1.7 2020 updates
|
||||
# 1.6 Maintenance
|
||||
# 1.5 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 1.4 Pull s2c() from seqinr package, rather than loading the
|
||||
# entire library.
|
||||
# 1.3 Updated confirmation task with correct logic
|
||||
# 1.2 Added missing load of seqinr package
|
||||
# 1.1 Update annotation file logic - it could already have been
|
||||
# prepared in the BIN-FUNC-Annotation unit.
|
||||
# 1.0.1 bugfix
|
||||
# 1.0 First 2017 live version.
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> --------------------------------------------------------------------------
|
||||
#TOC> 1 Prepare 58
|
||||
#TOC> 2 Biostrings Pairwise Alignment 75
|
||||
#TOC> 2.1 Optimal global alignment 93
|
||||
#TOC> 2.2 Optimal local alignment 156
|
||||
#TOC> 3 APSES Domain annotation by alignment 180
|
||||
#TOC> 4 Update your database script 261
|
||||
#TOC> 4.1 Preparing an annotation file ... 267
|
||||
#TOC> 4.1.1 If you HAVE NOT done the BIN-FUNC-Annotation unit 269
|
||||
#TOC> 4.1.2 If you HAVE done the BIN-FUNC-Annotation unit 314
|
||||
#TOC> 4.2 Execute and Validate 338
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Prepare =============================================================
|
||||
|
||||
if (! requireNamespace("seqinr", quietly=TRUE)) {
|
||||
install.packages("seqinr")
|
||||
}
|
||||
# You can get package information with the following commands:
|
||||
# library(help = seqinr) # basic information
|
||||
# browseVignettes("seqinr") # available vignettes
|
||||
# data(package = "seqinr") # available datasets
|
||||
|
||||
|
||||
# You need to recreate the protein database that you have constructed in the
|
||||
# BIN-Storing_data unit.
|
||||
|
||||
source("./myScripts/makeProteinDB.R")
|
||||
|
||||
|
||||
# = 2 Biostrings Pairwise Alignment =======================================
|
||||
|
||||
|
||||
if (!requireNamespace("BiocManager", quietly=TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
if (!requireNamespace("Biostrings", quietly=TRUE)) {
|
||||
BiocManager::install("Biostrings")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = Biostrings) # basic information
|
||||
# browseVignettes("Biostrings") # available vignettes
|
||||
# data(package = "Biostrings") # available datasets
|
||||
|
||||
|
||||
# Biostrings stores sequences in "XString" objects. Once we have converted our
|
||||
# target sequences to AAString objects, the alignment itself is straightforward.
|
||||
|
||||
# == 2.1 Optimal global alignment ==========================================
|
||||
|
||||
# The pairwiseAlignment() function was written to behave
|
||||
# exactly like the functions you encountered on the EMBOSS server.
|
||||
|
||||
# First: make AAString objects ...
|
||||
sel <- myDB$protein$name == "MBP1_SACCE"
|
||||
aaMBP1_SACCE <- Biostrings::AAString(myDB$protein$sequence[sel])
|
||||
|
||||
sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
|
||||
aaMBP1_MYSPE <- Biostrings::AAString(myDB$protein$sequence[sel])
|
||||
|
||||
# Biostrings is only used via Biostrings::<function>() in this script and is
# never attached, so the help topic is package-qualified as well.
?Biostrings::pairwiseAlignment
|
||||
# ... and align.
|
||||
# Global optimal alignment with end-gap penalties is default.
|
||||
ali1 <- Biostrings::pairwiseAlignment(
|
||||
aaMBP1_SACCE,
|
||||
aaMBP1_MYSPE,
|
||||
substitutionMatrix = "BLOSUM62",
|
||||
gapOpening = 10,
|
||||
gapExtension = 0.5)
|
||||
|
||||
str(ali1) # ... it's complicated
|
||||
|
||||
# This is a Biostrings alignment object. But we can use Biostrings functions to
|
||||
# tame it:
|
||||
ali1
|
||||
Biostrings::writePairwiseAlignments(ali1) # That should look familiar
|
||||
|
||||
# And we can make the internal structure work for us (@ is for classes as
|
||||
# $ is for lists ...)
|
||||
str(ali1@pattern)
|
||||
ali1@pattern
|
||||
ali1@pattern@range
|
||||
ali1@pattern@indel
|
||||
ali1@pattern@mismatch
|
||||
|
||||
# or work with "normal" R functions
|
||||
# the alignment length
|
||||
nchar(as.character(ali1@pattern))
|
||||
|
||||
# the number of identities
|
||||
sum(seqinr::s2c(as.character(ali1@pattern)) ==
|
||||
seqinr::s2c(as.character(ali1@subject)))
|
||||
|
||||
# ... e.g. to calculate the percentage of identities
|
||||
100 *
|
||||
sum(seqinr::s2c(as.character(ali1@pattern)) ==
|
||||
seqinr::s2c(as.character(ali1@subject))) /
|
||||
nchar(as.character(ali1@pattern))
|
||||
# ... which should be the same as reported in the writePairwiseAlignments()
|
||||
# output. Awkward to type? Then it calls for a function:
|
||||
#
|
||||
percentID <- function(al) {
  # Percent identity of a Biostrings pairwise alignment object "al":
  # the number of alignment columns in which the aligned pattern and the
  # aligned subject carry the same character, divided by the length of the
  # aligned pattern string, times 100.
  alignedPattern <- as.character(al@pattern)
  alignedSubject <- as.character(al@subject)

  # split both aligned strings into single characters and count the matches
  nIdentical <- sum(seqinr::s2c(alignedPattern) == seqinr::s2c(alignedSubject))

  return(100 * nIdentical / nchar(alignedPattern))
}
|
||||
|
||||
percentID(ali1)
|
||||
|
||||
# == 2.2 Optimal local alignment ===========================================
|
||||
|
||||
# Compare with local optimal alignment (like EMBOSS Water)
|
||||
ali2 <- Biostrings::pairwiseAlignment(
|
||||
aaMBP1_SACCE,
|
||||
aaMBP1_MYSPE,
|
||||
type = "local",
|
||||
substitutionMatrix = "BLOSUM62",
|
||||
gapOpening = 50,
|
||||
gapExtension = 10)
|
||||
|
||||
Biostrings::writePairwiseAlignments(ali2)
|
||||
# This has probably only aligned the N-terminal DNA binding domain - but that
|
||||
# one has quite high sequence identity:
|
||||
percentID(ali2)
|
||||
|
||||
# == TASK: ==
|
||||
|
||||
# Compare the two alignments. I have weighted the local alignment heavily
|
||||
# towards an ungapped alignment by setting very high gap penalties. Try changing
|
||||
# the gap penalties and see what happens: how does the number of indels change,
|
||||
# how does the length of indels change...
|
||||
|
||||
|
||||
# = 3 APSES Domain annotation by alignment ================================
|
||||
|
||||
# In this section we define the MYSPE APSES sequence by performing a global,
|
||||
# optimal sequence alignment of the yeast APSES domain with the full length
|
||||
# protein sequence of the protein that was the most similar to the yeast APSES
|
||||
# domain.
|
||||
#
|
||||
|
||||
# I have annotated the yeast APSES domain as a feature in the
|
||||
# database. To view the annotation, we can retrieve it via the proteinID and
|
||||
# featureID. Here is the yeast protein ID:
|
||||
(proID <- myDB$protein$ID[myDB$protein$name == "MBP1_SACCE"])
|
||||
|
||||
|
||||
# ... and if you look at the feature table, you can identify the feature ID
|
||||
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
|
||||
|
||||
# ... and with the two annotations we can get the corresponding ID from the
|
||||
# annotation table
|
||||
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
|
||||
myDB$annotation$featureID == ftrID])
|
||||
|
||||
# Show the complete annotation record that joins this protein with this
# feature. The rows must be selected on the proteinID and featureID columns
# (as in the fanID expression above); comparing the annotation's own ID
# column against proID and ftrID does not select the intended record.
myDB$annotation[myDB$annotation$proteinID == proID &
                myDB$annotation$featureID == ftrID, ]
|
||||
|
||||
# The annotation record contains the start and end coordinates which we can use
|
||||
# to define the APSES domain sequence with a substr() expression.
|
||||
|
||||
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
|
||||
(end <- myDB$annotation$end[myDB$annotation$ID == fanID])
|
||||
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
|
||||
start,
|
||||
end))
|
||||
|
||||
# Lots of code. But don't get lost. Let's recapitulate what we have done: we
|
||||
# have selected from the sequence column of the protein table the sequence whose
|
||||
# name is "MBP1_SACCE", and selected from the annotation table the start
|
||||
# and end coordinates of the annotation that joins an "APSES fold" feature with
|
||||
# the sequence, and used the start and end coordinates to extract a substring.
|
||||
|
||||
# Let's convert this to an AAstring and assign it:
|
||||
aaMB1_SACCE_APSES <- Biostrings::AAString(apses)
|
||||
|
||||
# Now let's align these two sequences of very different length without end-gap
|
||||
# penalties using the "overlap" type. "overlap" turns the
|
||||
# end-gap penalties off and that is crucially important since
|
||||
# the sequences have very different length.
|
||||
|
||||
aliApses <- Biostrings::pairwiseAlignment(
|
||||
aaMB1_SACCE_APSES,
|
||||
aaMBP1_MYSPE,
|
||||
type = "overlap",
|
||||
substitutionMatrix = "BLOSUM62",
|
||||
gapOpening = 10,
|
||||
gapExtension = 0.5)
|
||||
|
||||
# Inspect the result. The aligned sequences should be clearly
|
||||
# homologous, and have (almost) no indels. The entire "pattern"
|
||||
# sequence from QIYSAR ... to ... KPLFDF should be matched
|
||||
# with the "subject". Is this correct?
|
||||
Biostrings::writePairwiseAlignments(aliApses)
|
||||
|
||||
# If this is correct, you can extract the matched sequence from
|
||||
# the alignment object. The syntax is a bit different from what
|
||||
# you have seen before: this is an "S4 object", not a list. No
|
||||
# worries: as.character() returns a normal string.
|
||||
as.character(aliApses@subject)
|
||||
|
||||
# Now, what are the aligned start and end coordinates? You can read them from
|
||||
# the output of writePairwiseAlignments(), or you can get them from the range of
|
||||
# the match.
|
||||
|
||||
str(aliApses@subject@range)
|
||||
|
||||
# start is:
|
||||
aliApses@subject@range@start
|
||||
|
||||
# ... and end is:
|
||||
aliApses@subject@range@start + aliApses@subject@range@width - 1
|
||||
|
||||
|
||||
# = 4 Update your database script =========================================
|
||||
|
||||
|
||||
# Since we have this feature defined now, we can create a feature annotation
|
||||
# right away and store it in myDB.
|
||||
|
||||
# == 4.1 Preparing an annotation file ... ==================================
|
||||
#
|
||||
# === 4.1.1 If you HAVE NOT done the BIN-FUNC-Annotation unit
|
||||
#
|
||||
#
|
||||
# You DON'T already have a file called "<MYSPE>-Annotations.json" in the
|
||||
# ./myScripts/ directory:
|
||||
#
|
||||
# - Make a copy of the file "./data/refAnnotations.json" and put it in your
|
||||
# myScripts/ directory.
|
||||
#
|
||||
# - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.
|
||||
# if MYSPE is called "Cryptococcus neoformans", your file should be called
|
||||
# "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is
|
||||
# "MBP1_CRYNE").
|
||||
#
|
||||
# - Open the file in the RStudio editor and delete all blocks for
|
||||
# the Mbp1 protein annotations except the first one.
|
||||
#
|
||||
# - From that block, delete all lines except for the line that says:
|
||||
#
|
||||
# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"},
|
||||
#
|
||||
# - Then delete the comma at the end of the line (your file will just have
|
||||
# this one annotation).
|
||||
#
|
||||
# - Edit that annotation: change MBP1_SACCE to MBP1_<MYSPE> and change the
|
||||
# "start" and "end" features to the coordinates you just discovered for the
|
||||
# APSES domain in your sequence.
|
||||
#
|
||||
# - Save the file in your myScripts/ directory
|
||||
#
|
||||
# - Validate your file online at https://jsonlint.com/
|
||||
#
|
||||
# - Update your "./myScripts/makeProteinDB.R" script to load your new
|
||||
# annotation when you recreate the database. Open the script in the
|
||||
# RStudio editor, and add the following command at the end:
|
||||
#
|
||||
# myDB <- dbAddAnnotation(myDB,
|
||||
# jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json"))
|
||||
# ^^^^^^^
|
||||
# edit this!
|
||||
# - save and close the file.
|
||||
#
|
||||
# Then SKIP the next section.
|
||||
#
|
||||
#
|
||||
# === 4.1.2 If you HAVE done the BIN-FUNC-Annotation unit
|
||||
#
|
||||
#
|
||||
# You DO already have a file called "<MYSPE>-Annotations.json" in the
|
||||
# ./myScripts/ directory:
|
||||
#
|
||||
# - Open the file in the RStudio editor.
|
||||
#
|
||||
# - Below the last feature lines (but before the closing "]") add the
|
||||
# following feature line (without the "#")
|
||||
#
|
||||
# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"}
|
||||
#
|
||||
# - Edit that annotation: change MBP1_SACCE to MBP1_<MYSPE> and change the
|
||||
# "start" and "end" features to the coordinates you just discovered for the
|
||||
# APSES domain in your sequence.
|
||||
#
|
||||
# - Add a comma after the preceding feature line.
|
||||
#
|
||||
# - Save your file.
|
||||
#
|
||||
# - Validate your file online at https://jsonlint.com/
|
||||
#
|
||||
#
|
||||
# == 4.2 Execute and Validate ==============================================
|
||||
#
|
||||
# - source() your database creation script:
|
||||
#
|
||||
# source("./myScripts/makeProteinDB.R")
|
||||
#
|
||||
# This should run without errors or warnings. If it doesn't work and you
|
||||
# can't figure out quickly what's happening, ask on the mailing list for
|
||||
# help.
|
||||
#
|
||||
# - Confirm
|
||||
# The following commands should retrieve the correct start and end
|
||||
# coordinates and sequence of the MBP1_MYSPE APSES domain:
|
||||
|
||||
sel <- which(myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = ""))
|
||||
|
||||
(proID <- myDB$protein$ID[sel])
|
||||
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
|
||||
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
|
||||
myDB$annotation$featureID == ftrID])
|
||||
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
|
||||
(end <- myDB$annotation$end[myDB$annotation$ID == fanID])
|
||||
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
|
||||
start,
|
||||
end))
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "BIN-ALI-Optimal_sequence_alignment.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-ALI-Optimal_sequence_alignment unit.
|
||||
#
|
||||
# ==============================================================================
|
||||
# Version: 1.7.1
|
||||
#
|
||||
# Date: 2017-09 - 2020-10
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.7.1 add jsonlite:: to fromJSON() in code sample and ./myScripts/
|
||||
# 1.7 2020 updates
|
||||
# 1.6 Maintenance
|
||||
# 1.5 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 1.4 Pull s2c() from seqinr package, rather than loading the
|
||||
# entire library.
|
||||
# 1.3 Updated confirmation task with correct logic
|
||||
# 1.2 Added missing load of seqinr package
|
||||
# 1.1 Update annotation file logic - it could already have been
|
||||
# prepared in the BIN-FUNC-Annotation unit.
|
||||
# 1.0.1 bugfix
|
||||
# 1.0 First 2017 live version.
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> --------------------------------------------------------------------------
|
||||
#TOC> 1 Prepare 58
|
||||
#TOC> 2 Biostrings Pairwise Alignment 75
|
||||
#TOC> 2.1 Optimal global alignment 93
|
||||
#TOC> 2.2 Optimal local alignment 156
|
||||
#TOC> 3 APSES Domain annotation by alignment 180
|
||||
#TOC> 4 Update your database script 261
|
||||
#TOC> 4.1 Preparing an annotation file ... 267
|
||||
#TOC> 4.1.1 If you HAVE NOT done the BIN-FUNC-Annotation unit 269
|
||||
#TOC> 4.1.2 If you HAVE done the BIN-FUNC-Annotation unit 314
|
||||
#TOC> 4.2 Execute and Validate 338
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Prepare =============================================================
|
||||
|
||||
if (! requireNamespace("seqinr", quietly=TRUE)) {
|
||||
install.packages("seqinr")
|
||||
}
|
||||
# You can get package information with the following commands:
|
||||
# library(help = seqinr) # basic information
|
||||
# browseVignettes("seqinr") # available vignettes
|
||||
# data(package = "seqinr") # available datasets
|
||||
|
||||
|
||||
# You need to recreate the protein database that you have constructed in the
|
||||
# BIN-Storing_data unit.
|
||||
|
||||
source("./myScripts/makeProteinDB.R")
|
||||
|
||||
|
||||
# = 2 Biostrings Pairwise Alignment =======================================
|
||||
|
||||
|
||||
if (!requireNamespace("BiocManager", quietly=TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
if (!requireNamespace("Biostrings", quietly=TRUE)) {
|
||||
BiocManager::install("Biostrings")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = Biostrings) # basic information
|
||||
# browseVignettes("Biostrings") # available vignettes
|
||||
# data(package = "Biostrings") # available datasets
|
||||
|
||||
|
||||
# Biostrings stores sequences in "XString" objects. Once we have converted our
|
||||
# target sequences to AAString objects, the alignment itself is straightforward.
|
||||
|
||||
# == 2.1 Optimal global alignment ==========================================
|
||||
|
||||
# The pairwiseAlignment() function was written to behave
|
||||
# exactly like the functions you encountered on the EMBOSS server.
|
||||
|
||||
# First: make AAString objects ...
|
||||
sel <- myDB$protein$name == "MBP1_SACCE"
|
||||
aaMBP1_SACCE <- Biostrings::AAString(myDB$protein$sequence[sel])
|
||||
|
||||
sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
|
||||
aaMBP1_MYSPE <- Biostrings::AAString(myDB$protein$sequence[sel])
|
||||
|
||||
?pairwiseAlignment
|
||||
# ... and align.
|
||||
# Global optimal alignment with end-gap penalties is default.
|
||||
ali1 <- Biostrings::pairwiseAlignment(
|
||||
aaMBP1_SACCE,
|
||||
aaMBP1_MYSPE,
|
||||
substitutionMatrix = "BLOSUM62",
|
||||
gapOpening = 10,
|
||||
gapExtension = 0.5)
|
||||
|
||||
str(ali1) # ... it's complicated
|
||||
|
||||
# This is a Biostrings alignment object. But we can use Biostrings functions to
|
||||
# tame it:
|
||||
ali1
|
||||
Biostrings::writePairwiseAlignments(ali1) # That should look familiar
|
||||
|
||||
# And we can make the internal structure work for us (@ is for classes as
|
||||
# $ is for lists ...)
|
||||
str(ali1@pattern)
|
||||
ali1@pattern
|
||||
ali1@pattern@range
|
||||
ali1@pattern@indel
|
||||
ali1@pattern@mismatch
|
||||
|
||||
# or work with "normal" R functions
|
||||
# the alignment length
|
||||
nchar(as.character(ali1@pattern))
|
||||
|
||||
# the number of identities
|
||||
sum(seqinr::s2c(as.character(ali1@pattern)) ==
|
||||
seqinr::s2c(as.character(ali1@subject)))
|
||||
|
||||
# ... e.g. to calculate the percentage of identities
|
||||
100 *
|
||||
sum(seqinr::s2c(as.character(ali1@pattern)) ==
|
||||
seqinr::s2c(as.character(ali1@subject))) /
|
||||
nchar(as.character(ali1@pattern))
|
||||
# ... which should be the same as reported in the writePairwiseAlignments()
|
||||
# output. Awkward to type? Then it calls for a function:
|
||||
#
|
||||
percentID <- function(al) {
  # Computes the percent identity of a Biostrings pairwise alignment object.
  # Parameters:
  #    al   alignment object with @pattern and @subject slots
  # Value:
  #    num  100 * (number of positions at which the aligned pattern and
  #         subject characters are equal) / (length of the aligned pattern)
  alignedPattern <- as.character(al@pattern)
  alignedSubject <- as.character(al@subject)
  nIdentical <- sum(seqinr::s2c(alignedPattern) ==
                    seqinr::s2c(alignedSubject))
  return(100 * nIdentical / nchar(alignedPattern))
}
|
||||
|
||||
percentID(ali1)
|
||||
|
||||
# == 2.2 Optimal local alignment ===========================================
|
||||
|
||||
# Compare with local optimal alignment (like EMBOSS Water)
|
||||
ali2 <- Biostrings::pairwiseAlignment(
|
||||
aaMBP1_SACCE,
|
||||
aaMBP1_MYSPE,
|
||||
type = "local",
|
||||
substitutionMatrix = "BLOSUM62",
|
||||
gapOpening = 50,
|
||||
gapExtension = 10)
|
||||
|
||||
Biostrings::writePairwiseAlignments(ali2)
|
||||
# This has probably only aligned the N-terminal DNA binding domain - but that
|
||||
# one has quite high sequence identity:
|
||||
percentID(ali2)
|
||||
|
||||
# == TASK: ==
|
||||
|
||||
# Compare the two alignments. I have weighted the local alignment heavily
|
||||
# towards an ungapped alignment by setting very high gap penalties. Try changing
|
||||
# the gap penalties and see what happens: how does the number of indels change,
|
||||
# how does the length of indels change...
|
||||
|
||||
|
||||
# = 3 APSES Domain annotation by alignment ================================
|
||||
|
||||
# In this section we define the MYSPE APSES sequence by performing a global,
|
||||
# optimal sequence alignment of the yeast APSES domain with the full length
|
||||
# protein sequence of the protein that was the most similar to the yeast APSES
|
||||
# domain.
|
||||
#
|
||||
|
||||
# I have annotated the yeast APSES domain as a feature in the
|
||||
# database. To view the annotation, we can retrieve it via the proteinID and
|
||||
# featureID. Here is the yeast protein ID:
|
||||
(proID <- myDB$protein$ID[myDB$protein$name == "MBP1_SACCE"])
|
||||
|
||||
|
||||
# ... and if you look at the feature table, you can identify the feature ID
|
||||
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
|
||||
|
||||
# ... and with the two annotations we can get the corresponding ID from the
|
||||
# annotation table
|
||||
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
|
||||
myDB$annotation$featureID == ftrID])
|
||||
|
||||
# Show the complete annotation record that joins this protein with this
# feature. The row is selected via the proteinID and featureID columns (the
# same condition that was used to retrieve fanID), not via the record's own ID.
myDB$annotation[myDB$annotation$proteinID == proID &
                myDB$annotation$featureID == ftrID, ]
|
||||
|
||||
# The annotation record contains the start and end coordinates which we can use
|
||||
# to define the APSES domain sequence with a substr() expression.
|
||||
|
||||
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
|
||||
(end <- myDB$annotation$end[myDB$annotation$ID == fanID])
|
||||
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
|
||||
start,
|
||||
end))
|
||||
|
||||
# Lots of code. But don't get lost. Let's recapitulate what we have done: we
|
||||
# have selected from the sequence column of the protein table the sequence whose
|
||||
# name is "MBP1_SACCE", and selected from the annotation table the start
|
||||
# and end coordinates of the annotation that joins an "APSES fold" feature with
|
||||
# the sequence, and used the start and end coordinates to extract a substring.
|
||||
|
||||
# Let's convert this to an AAString and assign it:
|
||||
aaMB1_SACCE_APSES <- Biostrings::AAString(apses)
|
||||
|
||||
# Now let's align these two sequences of very different length without end-gap
|
||||
# penalties using the "overlap" type. "overlap" turns the
|
||||
# end-gap penalties off and that is crucially important since
|
||||
# the sequences have very different length.
|
||||
|
||||
aliApses <- Biostrings::pairwiseAlignment(
|
||||
aaMB1_SACCE_APSES,
|
||||
aaMBP1_MYSPE,
|
||||
type = "overlap",
|
||||
substitutionMatrix = "BLOSUM62",
|
||||
gapOpening = 10,
|
||||
gapExtension = 0.5)
|
||||
|
||||
# Inspect the result. The aligned sequences should be clearly
|
||||
# homologous, and have (almost) no indels. The entire "pattern"
|
||||
# sequence from QIYSAR ... to ... KPLFDF should be matched
|
||||
# with the "subject". Is this correct?
|
||||
Biostrings::writePairwiseAlignments(aliApses)
|
||||
|
||||
# If this is correct, you can extract the matched sequence from
|
||||
# the alignment object. The syntax is a bit different from what
|
||||
# you have seen before: this is an "S4 object", not a list. No
|
||||
# worries: as.character() returns a normal string.
|
||||
as.character(aliApses@subject)
|
||||
|
||||
# Now, what are the aligned start and end coordinates? You can read them from
|
||||
# the output of writePairwiseAlignments(), or you can get them from the range of
|
||||
# the match.
|
||||
|
||||
str(aliApses@subject@range)
|
||||
|
||||
# start is:
|
||||
aliApses@subject@range@start
|
||||
|
||||
# ... and end is:
|
||||
aliApses@subject@range@start + aliApses@subject@range@width - 1
|
||||
|
||||
|
||||
# = 4 Update your database script =========================================
|
||||
|
||||
|
||||
# Since we have this feature defined now, we can create a feature annotation
|
||||
# right away and store it in myDB.
|
||||
|
||||
# == 4.1 Preparing an annotation file ... ==================================
|
||||
#
|
||||
# === 4.1.1 If you HAVE NOT done the BIN-FUNC-Annotation unit
|
||||
#
|
||||
#
|
||||
# You DON'T already have a file called "<MYSPE>-Annotations.json" in the
|
||||
# ./myScripts/ directory:
|
||||
#
|
||||
# - Make a copy of the file "./data/refAnnotations.json" and put it in your
|
||||
# myScripts/ directory.
|
||||
#
|
||||
# - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.
|
||||
# if MYSPE is called "Cryptococcus neoformans", your file should be called
|
||||
# "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is
|
||||
# "MBP1_CRYNE").
|
||||
#
|
||||
# - Open the file in the RStudio editor and delete all blocks for
|
||||
# the Mbp1 protein annotations except the first one.
|
||||
#
|
||||
# - From that block, delete all lines except for the line that says:
|
||||
#
|
||||
# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"},
|
||||
#
|
||||
# - Then delete the comma at the end of the line (your file will just have
|
||||
# this one annotation).
|
||||
#
|
||||
# - Edit that annotation: change MBP1_SACCE to MBP1_<MYSPE> and change the
|
||||
# "start" and "end" features to the coordinates you just discovered for the
|
||||
# APSES domain in your sequence.
|
||||
#
|
||||
# - Save the file in your myScripts/ directory
|
||||
#
|
||||
# - Validate your file online at https://jsonlint.com/
|
||||
#
|
||||
# - Update your "./myScripts/makeProteinDB.R" script to load your new
|
||||
# annotation when you recreate the database. Open the script in the
|
||||
# RStudio editor, and add the following command at the end:
|
||||
#
|
||||
# myDB <- dbAddAnnotation(myDB,
|
||||
# jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json"))
|
||||
# ^^^^^^^
|
||||
# edit this!
|
||||
# - save and close the file.
|
||||
#
|
||||
# Then SKIP the next section.
|
||||
#
|
||||
#
|
||||
# === 4.1.2 If you HAVE done the BIN-FUNC-Annotation unit
|
||||
#
|
||||
#
|
||||
# You DO already have a file called "<MYSPE>-Annotations.json" in the
|
||||
# ./myScripts/ directory:
|
||||
#
|
||||
# - Open the file in the RStudio editor.
|
||||
#
|
||||
# - Below the last feature lines (but before the closing "]") add the
|
||||
# following feature line (without the "#")
|
||||
#
|
||||
# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"}
|
||||
#
|
||||
# - Edit that annotation: change MBP1_SACCE to MBP1_<MYSPE> and change the
|
||||
# "start" and "end" features to the coordinates you just discovered for the
|
||||
# APSES domain in your sequence.
|
||||
#
|
||||
# - Add a comma after the preceding feature line.
|
||||
#
|
||||
# - Save your file.
|
||||
#
|
||||
# - Validate your file online at https://jsonlint.com/
|
||||
#
|
||||
#
|
||||
# == 4.2 Execute and Validate ==============================================
|
||||
#
|
||||
# - source() your database creation script:
|
||||
#
|
||||
# source("./myScripts/makeProteinDB.R")
|
||||
#
|
||||
# This should run without errors or warnings. If it doesn't work and you
|
||||
# can't figure out quickly what's happening, ask on the mailing list for
|
||||
# help.
|
||||
#
|
||||
# - Confirm
|
||||
# The following commands should retrieve the correct start and end
|
||||
# coordinates and sequence of the MBP1_MYSPE APSES domain:
|
||||
|
||||
sel <- which(myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = ""))
|
||||
|
||||
(proID <- myDB$protein$ID[sel])
|
||||
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
|
||||
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
|
||||
myDB$annotation$featureID == ftrID])
|
||||
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
|
||||
(end <- myDB$annotation$end[myDB$annotation$ID == fanID])
|
||||
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
|
||||
start,
|
||||
end))
|
||||
|
||||
|
||||
# [END]
|
||||
|
@ -1,313 +1,313 @@
|
||||
# tocID <- "BIN-ALI-Similarity.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-ALI-Similarity unit.
|
||||
#
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017-10 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 2020 Updates
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 1.0 Refactored for 2017; add aaindex, ternary plot.
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
# Update ggtern:: ternary plot to use aacol dots under text
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ----------------------------------------------
|
||||
#TOC> 1 Amino Acid Properties 43
|
||||
#TOC> 2 Mutation Data matrix 189
|
||||
#TOC> 3 Background score 230
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Amino Acid Properties ===============================================
|
||||
|
||||
# A large collection of amino acid property tables is available via the seqinr
|
||||
# package:
|
||||
|
||||
if (! requireNamespace("seqinr", quietly=TRUE)) {
|
||||
install.packages("seqinr")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = seqinr) # basic information
|
||||
# browseVignettes("seqinr") # available vignettes
|
||||
# data(package = "seqinr") # available datasets
|
||||
|
||||
# A true Labor of Love has gone into the compilation of the seqinr "aaindex"
|
||||
# data:
|
||||
|
||||
?aaindex
|
||||
data(aaindex, package = "seqinr") # load the aaindex list from the package
|
||||
|
||||
length(aaindex)
|
||||
|
||||
# Here are all the index descriptions
|
||||
# Print every index number with its description ("D" element).
# seq_along() is safe even if the list is empty (1:length(x) would
# iterate over c(1, 0) in that case).
for (i in seq_along(aaindex)) {
  cat(paste(i, ": ", aaindex[[i]]$D, "\n", sep=""))
}
|
||||
|
||||
# It's a bit cumbersome to search through the descriptions ... here is a
|
||||
# function to make this easier:
|
||||
|
||||
searchAAindex <- function(patt) {
  # Searches the descriptions ("D" elements) of the global aaindex list
  # for the regular expression "patt" and prints the index number and
  # description of every hit, one hit per line.
  # Parameters:
  #    patt   chr   regular expression to match against the descriptions
  # Value:
  #    NULL (invisible). Called for its side effect of printing the hits.

  # vapply() (rather than sapply()) guarantees a logical vector even if
  # aaindex is empty, so which() always receives valid input.
  isHit <- vapply(aaindex,
                  function(x) { any(grepl(patt, x$D)) },
                  logical(1))
  hits <- which(isHit)
  for (i in seq_along(hits)) {
    cat(sprintf("%3d\t%s\n", hits[i], aaindex[[ hits[i] ]]$D))
  }
}
|
||||
|
||||
|
||||
searchAAindex("free energy") # Search for "free energy"
|
||||
searchAAindex("(size)|(volume)") # Search for "size" or "volume":
|
||||
|
||||
|
||||
|
||||
|
||||
# Let's examine ...
|
||||
# ... a hydrophobicity index
|
||||
(Y <- aaindex[[528]][c("D", "I")])
|
||||
|
||||
# ... a volume index
|
||||
(V <- aaindex[[150]][c("D", "I")])
|
||||
|
||||
# ... and one of our own: side-chain pK values as reported by
|
||||
# Pace et al. (2009) JBC 284:13285-13289, with non-ionizable pKs set
|
||||
# to 7.4 (physiological pH)
|
||||
K <- list(I = c( 7.4, # Ala
|
||||
12.3, # Arg
|
||||
7.4, # Asn
|
||||
3.9, # Asp
|
||||
8.6, # Cys
|
||||
7.4, # Gln
|
||||
4.3, # Glu
|
||||
7.4, # Gly
|
||||
6.5, # His
|
||||
7.4, # Ile
|
||||
7.4, # Leu
|
||||
10.4, # Lys
|
||||
7.4, # Met
|
||||
7.4, # Phe
|
||||
7.4, # Pro
|
||||
7.4, # Ser
|
||||
7.4, # Thr
|
||||
7.4, # Trp
|
||||
9.8, # Tyr
|
||||
7.4)) # Val
|
||||
names(K$I) <- c("Ala","Arg","Asn","Asp","Cys","Gln","Glu","Gly","His","Ile",
|
||||
"Leu","Lys","Met","Phe","Pro","Ser","Thr","Trp","Tyr","Val")
|
||||
|
||||
|
||||
# Given these biophysical indices, how similar are the amino acids? We have
# three dimensions of measures here. Scatterplots can only display two
# dimensions ...
|
||||
|
||||
# pull the names from Y$I, convert them to single letter code, and reorder the
|
||||
# AACOLS palette accordingly ...
|
||||
aac <- AACOLS[toupper(seqinr::a(names(Y$I)))]
|
||||
|
||||
plot(Y$I, V$I,
|
||||
xlab = "hydrophobicity", ylab = "volume",
|
||||
pch = 21,
|
||||
cex = 6,
|
||||
col = aac,
|
||||
bg = aac)
|
||||
text(Y$I, V$I, names(Y$I), cex = 0.8)
|
||||
|
||||
plot(Y$I, K$I,
|
||||
xlab = "hydrophobicity", ylab = "pK",
|
||||
pch = 21,
|
||||
cex = 6,
|
||||
col = aac,
|
||||
bg = aac)
|
||||
text(Y$I, K$I, names(Y$I), cex = 0.8)
|
||||
|
||||
# ... but how do we plot 3D data? Plotting into a 3D cube is possible, but such
|
||||
# plots are in general unintuitive and hard to interpret. One alternative is a
|
||||
# so-called "ternary plot":
|
||||
|
||||
if (! requireNamespace("ggtern", quietly=TRUE)) {
|
||||
install.packages("ggtern")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = ggtern) # basic information
|
||||
# browseVignettes("ggtern") # available vignettes
|
||||
# data(package = "ggtern") # available datasets
|
||||
|
||||
|
||||
|
||||
# collect into data frame, normalize to (0.05, 0.95)
|
||||
myDat <- data.frame("phi" = 0.9*(((Y$I-min(Y$I))/(max(Y$I)-min(Y$I))))+0.05,
|
||||
"vol" = 0.9*(((V$I-min(V$I))/(max(V$I)-min(V$I))))+0.05,
|
||||
"pK" = 0.9*(((K$I-min(K$I))/(max(K$I)-min(K$I))))+0.05,
|
||||
stringsAsFactors = FALSE)
|
||||
rownames(myDat) <- names(Y$I)
|
||||
|
||||
ggtern::ggtern(data = myDat,
|
||||
ggplot2::aes(x = vol,
|
||||
y = phi,
|
||||
z = pK,
|
||||
label = rownames(myDat))) + ggplot2::geom_text()
|
||||
|
||||
# This results in a mapping of amino acids relative to each other that is
|
||||
# similar to the Venn diagram you have seen in the notes.
|
||||
|
||||
# ... or we could use principal components analysis, to pull out the
|
||||
# best projection of the three feature dimensions into two. (Done here without delving
|
||||
# into the theory ...)
|
||||
prc <- prcomp(myDat)
|
||||
# Plot the first two principal components. The points are coloured with the
# amino acid palette "aac" that was derived from AACOLS for the scatterplots.
plot(prc$x[,1], prc$x[,2], xlab="", ylab="", xaxt="n", yaxt="n",
     pch=19, cex=6, col=aac, cex.main=0.7,
     main="Principal Component Analysis of Amino Acid Features")
|
||||
text(prc$x[,1], prc$x[,2], names(Y$I), cex = 0.8, col="#00000088")
|
||||
|
||||
# This matches the intuition rather well in that "similar" amino acids are close
|
||||
# on the plot. But we can't interpret the distances in terms of just one of the
|
||||
# parameters. Whatever - nature has a different way to define similarity:
|
||||
# mutations to similar amino acids are less likely to break the protein.
|
||||
|
||||
|
||||
# = 2 Mutation Data matrix ================================================
|
||||
|
||||
# A mutation data matrix encodes all amino acid pairscores in a matrix.
|
||||
|
||||
# The Biostrings package contains the most common mutation data matrices.
|
||||
|
||||
if (! requireNamespace("BiocManager", quietly=TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
if (! requireNamespace("Biostrings", quietly=TRUE)) {
|
||||
BiocManager::install("Biostrings")
|
||||
}
|
||||
# Package information:
|
||||
# library(help=Biostrings) # basic information
|
||||
# browseVignettes("Biostrings") # available vignettes
|
||||
# data(package = "Biostrings") # available datasets
|
||||
|
||||
# Let's attach the BLOSUM62 mutation data matrix from the package
|
||||
data(BLOSUM62, package = "Biostrings")
|
||||
|
||||
# ... and see what it contains. (You've seen this matrix before.)
|
||||
BLOSUM62
|
||||
|
||||
# We can simply access values via the row/column names.
|
||||
# Identical amino acids have high scores ...
|
||||
BLOSUM62["H", "H"] # Score for a pair of two histidines
|
||||
BLOSUM62["S", "S"] # Score for a pair of two serines
|
||||
|
||||
# Similar amino acids have low positive scores ...
|
||||
BLOSUM62["L", "I"] # Score for a leucine / isoleucine pair
|
||||
BLOSUM62["F", "Y"] # etc.
|
||||
|
||||
# Dissimilar amino acids have negative scores ...
|
||||
BLOSUM62["L", "K"] # Score for a leucine / lysine pair
|
||||
BLOSUM62["Q", "P"] # etc.
|
||||
|
||||
|
||||
BLOSUM62["R", "W"] # the matrix is symmetric!
|
||||
BLOSUM62["W", "R"]
|
||||
|
||||
|
||||
# = 3 Background score ====================================================
|
||||
|
||||
# The mutation data matrix is designed to give high scores to homologous
|
||||
# sequences, low scores to non-homologous sequences. What score on average
|
||||
# should we expect for a random sequence?
|
||||
|
||||
# If we sample amino acid pairs at random, we will get a score that is the
|
||||
# average of the individual pairscores in the matrix. Omitting the ambiguity
|
||||
# codes and the gap character:
|
||||
|
||||
sum(BLOSUM62[1:20, 1:20])/400
|
||||
|
||||
# But that score could be higher for real sequences, for which the amino acid
|
||||
# distribution is not random. For example membrane proteins have a large number
|
||||
# of hydrophobic residues - an alignment of unrelated proteins might produce
|
||||
# positive scores. And there are other proteins with biased amino acid
|
||||
# compositions, in particular proteins that interact with multiple other
|
||||
# proteins. Let's test how this impacts the background score by comparing a
|
||||
# sequence with shuffled sequences. These have the same composition, but are
|
||||
# obviously not homologous. The data directory contains the FASTA file for the
|
||||
# PDB ID 3FG7 - a villin headpiece structure with a large amount of
|
||||
# low-complexity amino acid sequence ...
|
||||
|
||||
aa3FG7 <- Biostrings::readAAStringSet("./data/3FG7.fa")[[1]]
|
||||
|
||||
# ... and the FASTA file for the E. coli OmpG outer membrane porin (PDB: 2F1C)
|
||||
# with an exceptionally high percentage of hydrophobic residues.
|
||||
|
||||
aa2F1C <- Biostrings::readAAStringSet("./data/2F1C.fa")[[1]]
|
||||
|
||||
# Here is a function that takes two sequences and
|
||||
# returns their average pairscore.
|
||||
|
||||
averagePairScore <- function(a, b, MDM = BLOSUM62) {
  # Returns average pairscore of two sequences.
  # Parameters:
  #    a, b   chr   amino acid sequence strings; both must have the same
  #                 number of characters
  #    MDM          mutation data matrix whose row and column names are the
  #                 one-letter amino acid codes. Default is BLOSUM62
  # Value:   num    average pairscore.
  a <- unlist(strsplit(a, ""))
  b <- unlist(strsplit(b, ""))
  # Residues are scored position by position, thus unequal lengths can
  # not be scored meaningfully: fail loudly instead of ignoring residues.
  if (length(a) != length(b)) {
    stop("Sequences \"a\" and \"b\" must have the same length.")
  }
  # A two-column character matrix indexes all (row, column) pairs of the
  # mutation data matrix in one step.
  return(sum(MDM[cbind(a, b)]) / length(a))
}
|
||||
|
||||
orig3FG7 <- toString(aa3FG7)
|
||||
orig2F1C <- toString(aa2F1C)
|
||||
N <- 1000
|
||||
scores3FG7 <- numeric(N)
|
||||
scores2F1C <- numeric(N)
|
||||
for (i in 1:N) {
|
||||
scores3FG7[i] <- averagePairScore(orig3FG7, toString(sample(aa3FG7)))
|
||||
scores2F1C[i] <- averagePairScore(orig2F1C, toString(sample(aa2F1C)))
|
||||
}
|
||||
|
||||
# Plot the distributions
|
||||
hist(scores3FG7,
|
||||
col="#5599EE33",
|
||||
breaks = seq(-1.5, 0, by=0.1),
|
||||
main = "Pairscores for randomly shuffled sequences",
|
||||
xlab = "Average pairscore from BLOSUM 62")
|
||||
hist(scores2F1C,
|
||||
col="#55EE9933",
|
||||
breaks = seq(-1.5, 0, by=0.1),
|
||||
add = TRUE)
|
||||
abline(v = sum(BLOSUM62[1:20, 1:20])/400, col = "firebrick", lwd = 2)
|
||||
legend('topright',
|
||||
c("3FG7 (villin)", "2F1C (OmpG)"),
|
||||
fill = c("#5599EE33", "#55EE9933"), bty = 'n',
|
||||
inset = 0.1)
|
||||
|
||||
# This is an important result: even though we have shuffled significantly biased
|
||||
# sequences, and the average scores trend above the average of the mutation data
|
||||
# matrix, the average scores still remain comfortably below zero. This means
|
||||
# that we can't (in general) improve a high-scoring alignment by simply
|
||||
# extending it with randomly matched residues. We will only improve the score if
|
||||
# the similarity of newly added residues is larger than what we expect to get by
|
||||
# random chance!
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "BIN-ALI-Similarity.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-ALI-Similarity unit.
|
||||
#
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017-10 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 2020 Updates
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 1.0 Refactored for 2017; add aaindex, ternary plot.
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
# Update ggtern:: ternary plot to use aacol dots under text
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ----------------------------------------------
|
||||
#TOC> 1 Amino Acid Properties 43
|
||||
#TOC> 2 Mutation Data matrix 189
|
||||
#TOC> 3 Background score 230
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Amino Acid Properties ===============================================
|
||||
|
||||
# A large collection of amino acid property tables is available via the seqinr
|
||||
# package:
|
||||
|
||||
if (! requireNamespace("seqinr", quietly=TRUE)) {
|
||||
install.packages("seqinr")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = seqinr) # basic information
|
||||
# browseVignettes("seqinr") # available vignettes
|
||||
# data(package = "seqinr") # available datasets
|
||||
|
||||
# A true Labor of Love has gone into the compilation of the seqinr "aaindex"
|
||||
# data:
|
||||
|
||||
?aaindex
|
||||
data(aaindex, package = "seqinr") # load the aaindex list from the package
|
||||
|
||||
length(aaindex)
|
||||
|
||||
# Here are all the index descriptions
|
||||
# Print every index description, prefixed by its position in the list.
# seq_along() yields an empty sequence for an empty list, whereas
# 1:length(x) would iterate over c(1, 0).
for (i in seq_along(aaindex)) {
  cat(paste0(i, ": ", aaindex[[i]]$D, "\n"))
}
|
||||
|
||||
# It's a bit cumbersome to search through the descriptions ... here is a
|
||||
# function to make this easier:
|
||||
|
||||
searchAAindex <- function(patt) {
  # Searches the aaindex descriptions for regular expression "patt"
  # and prints index number and description.
  # Parameters:
  #    patt  chr  regular expression matched against the $D element of
  #               each entry of the list "aaindex"
  # Value:   NULL (invisible). Matching entries are printed, one per line,
  #          as "<index>\t<description>".
  # Note:    "aaindex" is not a parameter: it is looked up in the calling
  #          workspace and must have been loaded before this is called.

  # vapply() always returns a logical vector, even for an empty list;
  # sapply() would return list() in that case and which() would fail.
  isHit <- vapply(aaindex,
                  function(x) length(grep(patt, x$D)) > 0,
                  logical(1))
  hits <- which(isHit)
  for (i in seq_along(hits)) {
    cat(sprintf("%3d\t%s\n", hits[i], aaindex[[ hits[i] ]]$D))
  }
  return(invisible(NULL))
}
|
||||
|
||||
|
||||
searchAAindex("free energy") # Search for "free energy"
|
||||
searchAAindex("(size)|(volume)") # Search for "size" or "volume":
|
||||
|
||||
|
||||
|
||||
|
||||
# Let's examine ...
|
||||
# ... a hydrophobicity index
|
||||
(Y <- aaindex[[528]][c("D", "I")])
|
||||
|
||||
# ... a volume index
|
||||
(V <- aaindex[[150]][c("D", "I")])
|
||||
|
||||
# ... and one of our own: side-chain pK values as reported by
|
||||
# Pace et al. (2009) JBC 284:13285-13289, with non-ionizable pKs set
|
||||
# to 7.4 (physiological pH)
|
||||
K <- list(I = c( 7.4, # Ala
|
||||
12.3, # Arg
|
||||
7.4, # Asn
|
||||
3.9, # Asp
|
||||
8.6, # Cys
|
||||
7.4, # Gln
|
||||
4.3, # Glu
|
||||
7.4, # Gly
|
||||
6.5, # His
|
||||
7.4, # Ile
|
||||
7.4, # Leu
|
||||
10.4, # Lys
|
||||
7.4, # Met
|
||||
7.4, # Phe
|
||||
7.4, # Pro
|
||||
7.4, # Ser
|
||||
7.4, # Thr
|
||||
7.4, # Trp
|
||||
9.8, # Tyr
|
||||
7.4)) # Val
|
||||
names(K$I) <- c("Ala","Arg","Asn","Asp","Cys","Gln","Glu","Gly","His","Ile",
|
||||
"Leu","Lys","Met","Phe","Pro","Ser","Thr","Trp","Tyr","Val")
|
||||
|
||||
|
||||
# Given these biophysical indices, how similar are the amino acids? We have three dimensions of measurements here. Scatterplots can only display two dimensions ...
|
||||
|
||||
# pull the names from Y$I, convert them to single letter code, and reorder the
|
||||
# AACOLS palette accordingly ...
|
||||
aac <- AACOLS[toupper(seqinr::a(names(Y$I)))]
|
||||
|
||||
plot(Y$I, V$I,
|
||||
xlab = "hydrophobicity", ylab = "volume",
|
||||
pch = 21,
|
||||
cex = 6,
|
||||
col = aac,
|
||||
bg = aac)
|
||||
text(Y$I, V$I, names(Y$I), cex = 0.8)
|
||||
|
||||
plot(Y$I, K$I,
|
||||
xlab = "hydrophobicity", ylab = "pK",
|
||||
pch = 21,
|
||||
cex = 6,
|
||||
col = aac,
|
||||
bg = aac)
|
||||
text(Y$I, K$I, names(Y$I), cex = 0.8)
|
||||
|
||||
# ... but how do we plot 3D data? Plotting into a 3D cube is possible, but such
|
||||
# plots are in general unintuitive and hard to interpret. One alternative is a
|
||||
# so-called "ternary plot":
|
||||
|
||||
if (! requireNamespace("ggtern", quietly=TRUE)) {
|
||||
install.packages("ggtern")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = ggtern) # basic information
|
||||
# browseVignettes("ggtern") # available vignettes
|
||||
# data(package = "ggtern") # available datasets
|
||||
|
||||
|
||||
|
||||
# collect into data frame, normalize to (0.05, 0.95)
|
||||
myDat <- data.frame("phi" = 0.9*(((Y$I-min(Y$I))/(max(Y$I)-min(Y$I))))+0.05,
|
||||
"vol" = 0.9*(((V$I-min(V$I))/(max(V$I)-min(V$I))))+0.05,
|
||||
"pK" = 0.9*(((K$I-min(K$I))/(max(K$I)-min(K$I))))+0.05,
|
||||
stringsAsFactors = FALSE)
|
||||
rownames(myDat) <- names(Y$I)
|
||||
|
||||
ggtern::ggtern(data = myDat,
|
||||
ggplot2::aes(x = vol,
|
||||
y = phi,
|
||||
z = pK,
|
||||
label = rownames(myDat))) + ggplot2::geom_text()
|
||||
|
||||
# This results in a mapping of amino acids relative to each other that is
|
||||
# similar to the Venn diagram you have seen in the notes.
|
||||
|
||||
# ... or we could use principal components analysis, to pull out the
|
||||
# best projection of the three feature dimensions into two. (Done here without delving
|
||||
# into the theory ...)
|
||||
# Principal components of the three normalized features in myDat.
prc <- prcomp(myDat)

# Plot the first two components. The points are coloured with "aac", the
# AACOLS palette that was reordered to match names(Y$I) above - the same
# order as the rows of myDat.
plot(prc$x[,1], prc$x[,2], xlab="", ylab="", xaxt="n", yaxt="n",
     pch=19, cex=6, col=aac, cex.main=0.7,
     main="Principal Component Analysis of Amino Acid Features")
# Overlay the amino acid names in semi-transparent black.
text(prc$x[,1], prc$x[,2], names(Y$I), cex = 0.8, col="#00000088")
|
||||
|
||||
# This matches the intuition rather well in that "similar" amino acids are close
|
||||
# on the plot. But we can't interpret the distances in terms of just one of the
|
||||
# parameters. Whatever - nature has a different way to define similarity:
|
||||
# mutations to similar amino acids are less likely to break the protein.
|
||||
|
||||
|
||||
# = 2 Mutation Data matrix ================================================
|
||||
|
||||
# A mutation data matrix encodes all amino acid pairscores in a matrix.
|
||||
|
||||
# The Biostrings package contains the most common mutation data matrices.
|
||||
|
||||
if (! requireNamespace("BiocManager", quietly=TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
if (! requireNamespace("Biostrings", quietly=TRUE)) {
|
||||
BiocManager::install("Biostrings")
|
||||
}
|
||||
# Package information:
|
||||
# library(help=Biostrings) # basic information
|
||||
# browseVignettes("Biostrings") # available vignettes
|
||||
# data(package = "Biostrings") # available datasets
|
||||
|
||||
# Let's attach the BLOSUM62 mutation data matrix from the package
|
||||
data(BLOSUM62, package = "Biostrings")
|
||||
|
||||
# ... and see what it contains. (You've seen this matrix before.)
|
||||
BLOSUM62
|
||||
|
||||
# We can simply access values via the row/column names.
|
||||
# Identical amino acids have high scores ...
|
||||
BLOSUM62["H", "H"] # Score for a pair of two histidines
|
||||
BLOSUM62["S", "S"] # Score for a pair of two serines
|
||||
|
||||
# Similar amino acids have low positive scores ...
|
||||
BLOSUM62["L", "I"] # Score for a leucine / isoleucine pair
|
||||
BLOSUM62["F", "Y"] # etc.
|
||||
|
||||
# Dissimilar amino acids have negative scores ...
|
||||
BLOSUM62["L", "K"] # Score for a leucine / lysine pair
|
||||
BLOSUM62["Q", "P"] # etc.
|
||||
|
||||
|
||||
BLOSUM62["R", "W"] # the matrix is symmetric!
|
||||
BLOSUM62["W", "R"]
|
||||
|
||||
|
||||
# = 3 Background score ====================================================
|
||||
|
||||
# The mutation data matrix is designed to give high scores to homologous
|
||||
# sequences, low scores to non-homologous sequences. What score on average
|
||||
# should we expect for a random sequence?
|
||||
|
||||
# If we sample amino acid pairs at random, we will get a score that is the
|
||||
# average of the individual pairscores in the matrix. Omitting the ambiguity
|
||||
# codes and the gap character:
|
||||
|
||||
sum(BLOSUM62[1:20, 1:20])/400
|
||||
|
||||
# But that score could be higher for real sequences, for which the amino acid
|
||||
# distribution is not random. For example membrane proteins have a large number
|
||||
# of hydrophobic residues - an alignment of unrelated proteins might produce
|
||||
# positive scores. And there are other proteins with biased amino acid
|
||||
# compositions, in particular proteins that interact with multiple other
|
||||
# proteins. Let's test how this impacts the background score by comparing a
|
||||
# sequence with shuffled sequences. These have the same composition, but are
|
||||
# obviously not homologous. The data directory contains the FASTA file for the
|
||||
# PDB ID 3FG7 - a villin headpiece structure with a large amount of
|
||||
# low-complexity amino acid sequence ...
|
||||
|
||||
aa3FG7 <- Biostrings::readAAStringSet("./data/3FG7.fa")[[1]]
|
||||
|
||||
# ... and the FASTA file for the E. coli OmpG outer membrane porin (PDB: 2F1C)
|
||||
# with an exceptionally high percentage of hydrophobic residues.
|
||||
|
||||
aa2F1C <- Biostrings::readAAStringSet("./data/2F1C.fa")[[1]]
|
||||
|
||||
# Here is a function that takes two sequences and
|
||||
# returns their average pairscore.
|
||||
|
||||
averagePairScore <- function(a, b, MDM = BLOSUM62) {
  # Returns average pairscore of two sequences.
  # Parameters:
  #    a, b  chr  amino acid sequence string; both must have the same
  #               number of characters
  #    MDM   mutation data matrix with residue letters as row and column
  #          names. Default is BLOSUM62
  # Value:   num  average pairscore: the sum of MDM[a[i], b[i]] over all
  #               positions i, divided by the sequence length. NaN for
  #               empty sequences.
  a <- unlist(strsplit(a, ""))
  b <- unlist(strsplit(b, ""))

  # Positions are scored pairwise, so a length mismatch is an input error.
  # Stop here rather than silently dropping residues or failing later with
  # a subscript error.
  if (length(a) != length(b)) {
    stop("Sequences a and b must have the same length.", call. = FALSE)
  }
  if (length(a) == 0) {
    return(NaN)   # 0 / 0: no pairs to average
  }

  # A two-column character matrix indexes MDM by (row name, column name),
  # fetching all pairscores in one vectorized lookup.
  return(sum(MDM[cbind(a, b)]) / length(a))
}
|
||||
|
||||
orig3FG7 <- toString(aa3FG7)
|
||||
orig2F1C <- toString(aa2F1C)
|
||||
N <- 1000
|
||||
scores3FG7 <- numeric(N)
|
||||
scores2F1C <- numeric(N)
|
||||
for (i in 1:N) {
|
||||
scores3FG7[i] <- averagePairScore(orig3FG7, toString(sample(aa3FG7)))
|
||||
scores2F1C[i] <- averagePairScore(orig2F1C, toString(sample(aa2F1C)))
|
||||
}
|
||||
|
||||
# Plot the distributions
|
||||
hist(scores3FG7,
|
||||
col="#5599EE33",
|
||||
breaks = seq(-1.5, 0, by=0.1),
|
||||
main = "Pairscores for randomly shuffled sequences",
|
||||
xlab = "Average pairscore from BLOSUM 62")
|
||||
hist(scores2F1C,
|
||||
col="#55EE9933",
|
||||
breaks = seq(-1.5, 0, by=0.1),
|
||||
add = TRUE)
|
||||
abline(v = sum(BLOSUM62[1:20, 1:20])/400, col = "firebrick", lwd = 2)
|
||||
legend('topright',
|
||||
c("3FG7 (villin)", "2F1C (OmpG)"),
|
||||
fill = c("#5599EE33", "#55EE9933"), bty = 'n',
|
||||
inset = 0.1)
|
||||
|
||||
# This is an important result: even though we have shuffled significantly biased
|
||||
# sequences, and the average scores trend above the average of the mutation data
|
||||
# matrix, the average scores still remain comfortably below zero. This means
|
||||
# that we can't (in general) improve a high-scoring alignment by simply
|
||||
# extending it with randomly matched residues. We will only improve the score if
|
||||
# the similarity of newly added residues is larger than what we expect to get by
|
||||
# random chance!
|
||||
|
||||
|
||||
# [END]
|
||||
|
@ -1,216 +1,216 @@
|
||||
# tocID <- "BIN-Data_integration.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-Data_integration unit.
|
||||
#
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2018-10 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 2020 Maintenance and updates
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 1.0.1 Bugfix: UniProt ID Mapping service API change
|
||||
# 1.0 First live version
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
# Develop a fungi-specific BioMart example.
|
||||
# (cf.
|
||||
# https://cran.r-project.org/web/packages/biomartr/vignettes/Functional_Annotation.html )
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -------------------------------------------------
|
||||
#TOC> 1 Identifier mapping 42
|
||||
#TOC> 2 Cross-referencing tables 165
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Identifier mapping ==================================================
|
||||
|
||||
# UniProt provides a well-designed ID mapping tool that can be accessed
|
||||
# online at http://www.uniprot.org/mapping/
|
||||
#
|
||||
# Here we will use the UniProt Web API for this tool to map identifiers. The
|
||||
# UniProt ID mapping service supports a "RESTful API": responses can be obtained
|
||||
# simply via a Web browser's request. Such requests are commonly sent via the
|
||||
# GET or POST verbs that a Webserver responds to, when a client asks for data.
|
||||
# GET requests are visible in the URL of the request; POST requests are not
|
||||
# directly visible, they are commonly used to send the contents of forms, or
|
||||
# when transmitting larger, complex data items. The UniProt ID mapping service
|
||||
# can accept long lists of IDs, thus using the POST mechanism makes sense. GET()
|
||||
# and POST() functions are part of the httr package.
|
||||
|
||||
# To begin, we load httr, which supports sending and receiving data via the
|
||||
# http protocol, just like a Web browser.
|
||||
if (! requireNamespace("httr", quietly=TRUE)) {
|
||||
install.packages("httr")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = httr) # basic information
|
||||
# browseVignettes("httr") # available vignettes
|
||||
# data(package = "httr") # available datasets
|
||||
|
||||
|
||||
# We will walk through the process with the refSeqID
|
||||
# of yeast Mbp1 and Swi4, and we will also enter a dummy ID to check what
|
||||
# happens if the ID can't be mapped:
|
||||
myQueryIDs <- "NP_010227 NP_00000 NP_011036"
|
||||
|
||||
|
||||
# The UniProt ID mapping service API is very straightforward to use: just define
|
||||
# the URL of the server and send a list of items labelled as "query" in the body
|
||||
# of the request. GET() and POST() are functions from httr.
|
||||
|
||||
# Note. A recent bug in the interaction between the server expectations and the
|
||||
# curl client libraries requires the following initialization
|
||||
httr::set_config(httr::config(http_version = 0))
|
||||
# cf. https://stackoverflow.com/questions/44610845/stream-error-in-the-http-2-framing-layer-bigrquery-commands-error-in-r-studio-b
|
||||
|
||||
|
||||
# Send the query to the same endpoint that myIDmap() below uses, so that the
# step-by-step walkthrough and the wrapped function address one service.
# NOTE(review): confirm against the current UniProt API documentation that
# this endpoint is still served.
URL <- "https://www.uniprot.org/uploadlists/"
response <- httr::POST(URL,
                       body = list(from = "P_REFSEQ_AC", # Refseq Protein
                                   to = "ACC",           # UniProt ID
                                   format = "tab",
                                   query = myQueryIDs))
|
||||
|
||||
cat(httr::content(response))
|
||||
|
||||
# We need to check the status code - if it is not 200, an error occurred and we
|
||||
# can't process the result:
|
||||
httr::status_code(response)
|
||||
|
||||
# If the query is successful, tabbed text is returned. We can assign that to a
|
||||
# data frame. Note that we use textConnection() to read data directly from a char object, which can go in the spot where read.delim() expects a file-name argument.
|
||||
|
||||
myMappedIDs <- read.delim(file = textConnection(httr::content(response)),
|
||||
sep = "\t",
|
||||
stringsAsFactors = FALSE)
|
||||
myMappedIDs
|
||||
|
||||
# If this works as expected, you should see:
|
||||
# From To
|
||||
# 1 NP_010227 P39678
|
||||
# 2 NP_011036 P25302
|
||||
#
|
||||
# ... and note that there are only two entries, because nothing was returned
|
||||
# for the dummy "RefSeq ID" NP_00000
|
||||
|
||||
# If the query can't be fulfilled because of a problem with the server, a
|
||||
# WebPage is returned. But the server status is also returned and we can check
|
||||
# the status code. I have lately gotten many "503" status codes: Server Not
|
||||
# Available...
|
||||
|
||||
# We wrap this into a function:
|
||||
|
||||
myIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
  # Use UniProt ID mapping service to map one or more IDs
  # Parameters:
  #    s  char  A string of separated IDs
  #    mapFrom  char  the database in which the IDs in s are valid. Default
  #                     is RefSeq protein
  #    mapTo    char  the database in which the target IDs are valid. Default
  #                     is UniProtKB
  # Value
  #    a data frame of mapped IDs, with column names From and To, or an
  #    empty data frame if the mapping was unsuccessful. No rows are returned
  #    for IDs that are not mapped.
  # Side effects
  #    changes the session-wide httr configuration (http_version = 0);
  #    issues a warning if the server status is not 200.

  # Initialize curl
  httr::set_config(httr::config(http_version = 0))

  # NOTE(review): confirm against the current UniProt API documentation that
  # this endpoint is still served.
  URL <- "https://www.uniprot.org/uploadlists/"
  response <- httr::POST(URL,
                         body = list(from = mapFrom,
                                     to = mapTo,
                                     format = "tab",
                                     query = s))

  if (httr::status_code(response) == 200) { # 200: oK
    # Ask for the body as text explicitly, instead of relying on httr to
    # choose a parser from the declared content type.
    txt <- httr::content(response, as = "text", encoding = "UTF-8")
    if (length(txt) != 1 || is.na(txt) || ! nzchar(trimws(txt))) {
      # Nothing to parse: read.delim() would stop on empty input.
      myMap <- data.frame()
    } else {
      # read.delim(text = ...) opens AND closes its own text connection. A
      # textConnection() passed as "file" is already open, so it would be
      # left open after the call.
      myMap <- read.delim(text = txt,
                          sep = "\t",
                          stringsAsFactors = FALSE)
      colnames(myMap) <- c("From", "To")
    }
  } else {
    myMap <- data.frame()
    warning(paste("No uniProt ID mapping returned:",
                  "server sent status",
                  httr::status_code(response)))
  }

  return(myMap)
}
|
||||
|
||||
# Try it out ...
|
||||
myIDmap("NP_010227 NP_011036 NP_012881 NP_013729 NP_012165")
|
||||
|
||||
# A function UniProtIDmap() is in the ABC-dbUtilities.R script and it is loaded
|
||||
# into your workspace on startup.
|
||||
|
||||
|
||||
# = 2 Cross-referencing tables ============================================
|
||||
|
||||
# Sometimes we get the IDs we need to map in a large table, e.g. from a list of
|
||||
# genes in a model organism database such as SGD, or from the Human Gene
|
||||
# Nomenclature commission. How do we map one set of identifiers to another one?
|
||||
|
||||
# The function to use is match().
|
||||
# Here is a tiny set of identifiers taken from a much larger table to
|
||||
# illustrate the principle:
|
||||
#
|
||||
|
||||
myIDs <- data.frame(uID = c("P38903", "P31383", "P47177", "P47096", "Q07747",
|
||||
"Q08641", "P47129", "P52910", "P00330", "P81450"),
|
||||
name = c("2A5D", "2AAA", "2NDP", "3HAO", "AAD4",
|
||||
"AB140", "ACF4", "ACS2", "ADH1", "ATP18"),
|
||||
refID = c("NP_014657", "NP_009386",
|
||||
"NP_012683", "NP_012559",
|
||||
"NP_010038", "NP_014882",
|
||||
"NP_012616", "NP_013254",
|
||||
"NP_014555", "NP_013629"))
|
||||
|
||||
myIDs
|
||||
|
||||
# Say we want to map "NP_010038", "NP_999999" (a dummy), and "NP_013629", in that order to
|
||||
# their gene names.
|
||||
myQuery <- c("NP_010038", "NP_999999", "NP_013629")
|
||||
|
||||
# %in% will only tell us if these IDs are present in the table:
|
||||
myQuery %in% myIDs$refID
|
||||
|
||||
# ... but not where they are located. But match() does what we need here:
|
||||
match(myQuery, myIDs$refID)
|
||||
|
||||
# ... and we can use the result to subset the column that we want to map to:
|
||||
myIDs$name[match(myQuery, myIDs$refID)]
|
||||
|
||||
# Note that the output preserves the NA - i.e. the length of the mapped
|
||||
# values is exactly the same as the length of the query.
|
||||
|
||||
# task: map the three genes to their UniProt Identifier.
|
||||
|
||||
|
||||
#
|
||||
# Note: if you want to do very many queries in very large tables, use the
|
||||
# fmatch() function in the "fastmatch" package for a considerable
|
||||
# speedup.
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "BIN-Data_integration.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-Data_integration unit.
|
||||
#
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2018-10 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 2020 Maintenance and updates
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 1.0.1 Bugfix: UniProt ID Mapping service API change
|
||||
# 1.0 First live version
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
# Develop a fungi-specific BioMart example.
|
||||
# (cf.
|
||||
# https://cran.r-project.org/web/packages/biomartr/vignettes/Functional_Annotation.html )
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -------------------------------------------------
|
||||
#TOC> 1 Identifier mapping 42
|
||||
#TOC> 2 Cross-referencing tables 165
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Identifier mapping ==================================================
|
||||
|
||||
# UniProt provides a well-designed ID mapping tool that can be accessed
|
||||
# online at http://www.uniprot.org/mapping/
|
||||
#
|
||||
# Here we will use the UniProt Web API for this tool to map identifiers. The
|
||||
# UniProt ID mapping service supports a "RESTful API": responses can be obtained
|
||||
# simply via a Web browser's request. Such requests are commonly sent via the
|
||||
# GET or POST verbs that a Webserver responds to, when a client asks for data.
|
||||
# GET requests are visible in the URL of the request; POST requests are not
|
||||
# directly visible, they are commonly used to send the contents of forms, or
|
||||
# when transmitting larger, complex data items. The UniProt ID mapping service
|
||||
# can accept long lists of IDs, thus using the POST mechanism makes sense. GET()
|
||||
# and POST() functions are part of the httr package.
|
||||
|
||||
# To begin, we load httr, which supports sending and receiving data via the
|
||||
# http protocol, just like a Web browser.
|
||||
if (! requireNamespace("httr", quietly=TRUE)) {
|
||||
install.packages("httr")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = httr) # basic information
|
||||
# browseVignettes("httr") # available vignettes
|
||||
# data(package = "httr") # available datasets
|
||||
|
||||
|
||||
# We will walk through the process with the refSeqID
|
||||
# of yeast Mbp1 and Swi4, and we will also enter a dummy ID to check what
|
||||
# happens if the ID can't be mapped:
|
||||
myQueryIDs <- "NP_010227 NP_00000 NP_011036"
|
||||
|
||||
|
||||
# The UniProt ID mapping service API is very straightforward to use: just define
|
||||
# the URL of the server and send a list of items labelled as "query" in the body
|
||||
# of the request. GET() and POST() are functions from httr.
|
||||
|
||||
# Note. A recent bug in the interaction between the server expectations and the
|
||||
# curl client libraries requires the following initialization
|
||||
httr::set_config(httr::config(http_version = 0))
|
||||
# cf. https://stackoverflow.com/questions/44610845/stream-error-in-the-http-2-framing-layer-bigrquery-commands-error-in-r-studio-b
|
||||
|
||||
|
||||
# Send the query to the same endpoint that myIDmap() below uses, so that the
# step-by-step walkthrough and the wrapped function address one service.
# NOTE(review): confirm against the current UniProt API documentation that
# this endpoint is still served.
URL <- "https://www.uniprot.org/uploadlists/"
response <- httr::POST(URL,
                       body = list(from = "P_REFSEQ_AC", # Refseq Protein
                                   to = "ACC",           # UniProt ID
                                   format = "tab",
                                   query = myQueryIDs))
|
||||
|
||||
cat(httr::content(response))
|
||||
|
||||
# We need to check the status code - if it is not 200, an error occurred and we
|
||||
# can't process the result:
|
||||
httr::status_code(response)
|
||||
|
||||
# If the query is successful, tabbed text is returned. We can assign that to a
|
||||
# data frame. Note that we use textConnection() to read data directly from a char object, which can go in the spot where read.delim() expects a file-name argument.
|
||||
|
||||
myMappedIDs <- read.delim(file = textConnection(httr::content(response)),
|
||||
sep = "\t",
|
||||
stringsAsFactors = FALSE)
|
||||
myMappedIDs
|
||||
|
||||
# If this works as expected, you should see:
|
||||
# From To
|
||||
# 1 NP_010227 P39678
|
||||
# 2 NP_011036 P25302
|
||||
#
|
||||
# ... and note that there are only two entries, because nothing was returned
|
||||
# for the dummy "RefSeq ID" NP_00000
|
||||
|
||||
# If the query can't be fulfilled because of a problem with the server, a
|
||||
# WebPage is returned. But the server status is also returned and we can check
|
||||
# the status code. I have lately gotten many "503" status codes: Server Not
|
||||
# Available...
|
||||
|
||||
# We wrap this into a function:
|
||||
|
||||
myIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
  # Use UniProt ID mapping service to map one or more IDs
  # Parameters:
  #    s  char  A string of separated IDs
  #    mapFrom  char  the database in which the IDs in s are valid. Default
  #                     is RefSeq protein
  #    mapTo    char  the database in which the target IDs are valid. Default
  #                     is UniProtKB
  # Value
  #    a data frame of mapped IDs, with column names From and To, or an
  #    empty data frame if the mapping was unsuccessful. No rows are returned
  #    for IDs that are not mapped.
  # Side effects
  #    changes the session-wide httr configuration (http_version = 0);
  #    issues a warning if the server status is not 200.

  # Initialize curl
  httr::set_config(httr::config(http_version = 0))

  # NOTE(review): confirm against the current UniProt API documentation that
  # this endpoint is still served.
  URL <- "https://www.uniprot.org/uploadlists/"
  response <- httr::POST(URL,
                         body = list(from = mapFrom,
                                     to = mapTo,
                                     format = "tab",
                                     query = s))

  if (httr::status_code(response) == 200) { # 200: oK
    # Ask for the body as text explicitly, instead of relying on httr to
    # choose a parser from the declared content type.
    txt <- httr::content(response, as = "text", encoding = "UTF-8")
    if (length(txt) != 1 || is.na(txt) || ! nzchar(trimws(txt))) {
      # Nothing to parse: read.delim() would stop on empty input.
      myMap <- data.frame()
    } else {
      # read.delim(text = ...) opens AND closes its own text connection. A
      # textConnection() passed as "file" is already open, so it would be
      # left open after the call.
      myMap <- read.delim(text = txt,
                          sep = "\t",
                          stringsAsFactors = FALSE)
      colnames(myMap) <- c("From", "To")
    }
  } else {
    myMap <- data.frame()
    warning(paste("No uniProt ID mapping returned:",
                  "server sent status",
                  httr::status_code(response)))
  }

  return(myMap)
}
|
||||
|
||||
# Try it out ...
|
||||
myIDmap("NP_010227 NP_011036 NP_012881 NP_013729 NP_012165")
|
||||
|
||||
# A function UniProtIDmap() is in the ABC-dbUtilities.R script and it is loaded
|
||||
# into your workspace on startup.
|
||||
|
||||
|
||||
# = 2 Cross-referencing tables ============================================
|
||||
|
||||
# Sometimes we get the IDs we need to map in a large table, e.g. from a list of
|
||||
# genes in a model organism database such as SGD, or from the Human Gene
|
||||
# Nomenclature commission. How do we map one set of identifiers to another one?
|
||||
|
||||
# The function to use is match().
|
||||
# Here is a tiny set of identifiers taken from a much larger table to
|
||||
# illustrate the principle:
|
||||
#
|
||||
|
||||
myIDs <- data.frame(uID = c("P38903", "P31383", "P47177", "P47096", "Q07747",
|
||||
"Q08641", "P47129", "P52910", "P00330", "P81450"),
|
||||
name = c("2A5D", "2AAA", "2NDP", "3HAO", "AAD4",
|
||||
"AB140", "ACF4", "ACS2", "ADH1", "ATP18"),
|
||||
refID = c("NP_014657", "NP_009386",
|
||||
"NP_012683", "NP_012559",
|
||||
"NP_010038", "NP_014882",
|
||||
"NP_012616", "NP_013254",
|
||||
"NP_014555", "NP_013629"))
|
||||
|
||||
myIDs
|
||||
|
||||
# Say we want to map "NP_010038", "NP_999999" (a dummy), and "NP_013629", in that order to
|
||||
# their gene names.
|
||||
myQuery <- c("NP_010038", "NP_999999", "NP_013629")
|
||||
|
||||
# %in% will only tell us if these IDs are present in the table:
|
||||
myQuery %in% myIDs$refID
|
||||
|
||||
# ... but not where they are located. But match() does what we need here:
|
||||
match(myQuery, myIDs$refID)
|
||||
|
||||
# ... and we can use the result to subset the column that we want to map to:
|
||||
myIDs$name[match(myQuery, myIDs$refID)]
|
||||
|
||||
# Note that the output preserves the NA - i.e. the length of the mapped
|
||||
# values is exactly the same as the length of the query.
|
||||
|
||||
# task: map the three genes to their UniProt Identifier.
|
||||
|
||||
|
||||
#
|
||||
# Note: if you want to do very many queries in very large tables, use the
|
||||
# fmatch() function in the "fastmatch" package for a considerable
|
||||
# speedup.
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
|
@ -1,435 +1,435 @@
|
||||
# tocID <- "BIN-FUNC-Domain_annotation.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-FUNC-Domain_annotation unit.
|
||||
#
|
||||
# ==============================================================================
|
||||
# Version: 1.4
|
||||
#
|
||||
# Date: 2017-11 - 2020-10
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.4 Add code for shared data import from the Wiki
|
||||
# 1.3 Add code for database export to JSON and instructions
|
||||
# for uploading annotations to the Public Student Wiki page
|
||||
# 1.2 Consistently: data in ./myScripts/ ;
|
||||
# begin SHARING DATA section
|
||||
# 1.1 2020 Updates
|
||||
# 1.0 Live version 2017
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
# TODO:
|
||||
# Put the domain plot into a function
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------------------------------------
|
||||
#TOC> 1 Update your database script 51
|
||||
#TOC> 1.1 Preparing an annotation file ... 58
|
||||
#TOC> 1.1.1 BEFORE "BIN-ALI-Optimal_sequence_alignment" 61
|
||||
#TOC> 1.1.2 AFTER "BIN-ALI-Optimal_sequence_alignment" 109
|
||||
#TOC> 1.2 Execute and Validate 136
|
||||
#TOC> 2 Plot Annotations 161
|
||||
#TOC> 3 SHARING DATA 287
|
||||
#TOC> 3.1 Post MBP1_MYSPE as JSON data 303
|
||||
#TOC> 3.2 Import shared MBP1_MYSPE from the Wiki 326
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Update your database script =========================================
|
||||
|
||||
|
||||
# Since you have recorded domain features at the SMART database, we can store
|
||||
# the feature annotations in myDB ...
|
||||
|
||||
|
||||
# == 1.1 Preparing an annotation file ... ==================================
|
||||
|
||||
|
||||
# === 1.1.1 BEFORE "BIN-ALI-Optimal_sequence_alignment"
|
||||
#
|
||||
# IF YOU HAVE NOT YET COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT:
|
||||
#
|
||||
# You DON'T already have a file called "<MYSPE>-Annotations.json" in the
|
||||
# ./myScripts/ directory:
|
||||
#
|
||||
# - Make a copy of the file "./data/refAnnotations.json" and put it in your
|
||||
# myScripts/ directory.
|
||||
#
|
||||
# - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.
|
||||
# if MYSPE is called "Cryptococcus neoformans", your file should be called
|
||||
# "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is
|
||||
# "MBP1_CRYNE").
|
||||
#
|
||||
# - Open the file in the RStudio editor and delete all blocks for
|
||||
# the Mbp1 protein annotations except the first one.
|
||||
#
|
||||
# - From that block, delete all lines that have annotations you did not
|
||||
# find in SMART for MBP1_MYSPE.
|
||||
#
|
||||
# - Make enough copies of the "Ankyrin fold" and "low complexity" region
|
||||
# lines to have a line for each feature you found.
|
||||
#
|
||||
# - Then delete the comma at the end of the last line.
|
||||
#
|
||||
# - Edit the annotations: change MBP1_SACCE to MBP1_<MYSPE> everywhere
|
||||
# and change the "start" and "end" features to the coordinates you
|
||||
# recorded in the SMART database.
|
||||
#
|
||||
# - Save your file in the ./myScripts/ folder.
|
||||
#
|
||||
# - Validate your file online at https://jsonlint.com/
|
||||
#
|
||||
# - Update your "./myScripts/makeProteinDB.R" script to load your new
|
||||
# annotation when you recreate the database. Open the script in the
|
||||
# RStudio editor, and add the following command at the end:
|
||||
#
|
||||
# myDB <- dbAddAnnotation(myDB,
|
||||
# jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json"))
|
||||
# ^^^^^^^
|
||||
# edit this!
|
||||
#
|
||||
# - save and close the file.
|
||||
#
|
||||
# Then SKIP the next section.
|
||||
#
|
||||
#
|
||||
# === 1.1.2 AFTER "BIN-ALI-Optimal_sequence_alignment"
|
||||
#
|
||||
# IF YOU HAVE ALREADY COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT:
|
||||
#
|
||||
# You SHOULD have a file called "<MYSPE>-Annotations.json" in the
|
||||
# ./myScripts/ directory:
|
||||
#
|
||||
# - Open the file in the RStudio editor.
|
||||
#
|
||||
# - Make as many copies of the "APSES fold" line as you have found
|
||||
# features in SMART.
|
||||
#
|
||||
# - Add a comma after every line except for the last one
|
||||
#
|
||||
# - Edit the annotations but include only features that are in the
|
||||
# myDB$feature table. Check which features are in the database by executing
|
||||
#
|
||||
# myDB$feature$name
|
||||
#
|
||||
# - Update the "start" and "end" coordinates for each feature to the
|
||||
# values you found.
|
||||
#
|
||||
# - Save your file.
|
||||
#
|
||||
# - Validate your file online at https://jsonlint.com/
|
||||
#
|
||||
#
|
||||
# == 1.2 Execute and Validate ==============================================
|
||||
#
|
||||
# - source() your database creation script:
|
||||
#
|
||||
# source("./myScripts/makeProteinDB.R")
|
||||
#
|
||||
# This should run without errors or warnings. If it doesn't work and you
|
||||
# can't figure out quickly what's happening, ask for help on the
|
||||
# Discussion Board.
|
||||
#
|
||||
# - Confirm
|
||||
# The following commands should retrieve all of the features that have been
|
||||
# annotated for MBP1_MYSPE
|
||||
|
||||
# Select the row of this species' Mbp1 orthologue in the protein table.
sel <- myDB$protein$name == paste0("MBP1_", biCode(MYSPE))

# Walk from the protein ID to the IDs of its annotations, and from there to
# the IDs of the annotated features. Rows are selected by matching ID values
# (%in%, match()), not by using the IDs themselves as subscripts: subscripting
# a column with ID values only works if the IDs happen to be row numbers, and
# yields NA or the wrong rows otherwise.
(proID <- myDB$protein$ID[sel])
(fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID %in% proID])
(ftrIDs <- unique(myDB$annotation$featureID[myDB$annotation$ID %in% fanIDs]))

# This should list ALL of your annotated features (once). If not, consider
# what could have gone wrong and ask on the list if you have difficulties
# fixing it.
myDB$feature$name[match(ftrIDs, myDB$feature$ID)]
|
||||
|
||||
|
||||
# = 2 Plot Annotations ====================================================
|
||||
|
||||
# In this section we will plot domain annotations as colored rectangles on a
|
||||
# sequence, as an example of using the R plotting system for generic, data
|
||||
# driven images.
|
||||
|
||||
# We need a small utility function that draws the annotation boxes on a
|
||||
# representation of sequence. It should accept the start and end coordinates,
|
||||
# the y value where it should be plotted and the color of the box, and plot a
|
||||
# rectangle using R's rect() function.
|
||||
|
||||
drawBox <- function(xStart, xEnd, y, myCol, DELTA = 0.2) {
  # Purpose:
  #   Add a filled rectangle with a black outline to the current plot.
  # Parameters:
  #   xStart, xEnd:  left and right x-coordinates of the rectangle
  #   y:             vertical centre of the rectangle
  #   myCol:         fill colour
  #   DELTA:         half-height; the rectangle spans y - DELTA to y + DELTA
  # Value:
  #   The return value of rect(); the function is called for its side effect
  #   of drawing on the active graphics device.
  yBottom <- y - DELTA
  yTop    <- y + DELTA
  rect(xleft = xStart, ybottom = yBottom, xright = xEnd, ytop = yTop,
       col = myCol, border = "black")
}
|
||||
|
||||
# test this:
|
||||
plot(c(-1.5, 1.5), c(0, 0), type = "l")
|
||||
drawBox(-1, 1, 0.0, "peachpuff")
|
||||
|
||||
# Next, we define a function to plot annotations for one protein: the name of
|
||||
# the protein, a horizontal grey line for its length, and all of its features.
|
||||
|
||||
plotProtein <- function(DB, name, y) {
  # Purpose:
  #   Add one protein to an existing plot: its name as a label, a horizontal
  #   grey line from 0 to the length of its sequence, and one semi-transparent
  #   coloured box for each of its annotated features.
  # Parameters:
  #   DB:    protein database. The tables DB$protein (ID, name, sequence),
  #          DB$feature (ID) and DB$annotation (proteinID, featureID, start,
  #          end) are read.
  #   name:  the name of the protein in the database. It must match exactly
  #          one row of DB$protein, otherwise the function stops.
  #   y:     height where to draw the plot
  # Value:
  #   NULL, invisibly. Called for its side effect of drawing on the active
  #   graphics device with text(), lines() and drawBox().

  # Find the row-index of the protein in the protein table of DB. Fail early
  # with a clear message, rather than with an obscure length-mismatch error
  # from lines() further down.
  iProtein <- which(DB$protein$name == name)
  if (length(iProtein) != 1) {
    stop(sprintf("Expected exactly one protein named \"%s\" in DB, found %d.",
                 name, length(iProtein)),
         call. = FALSE)
  }

  # Define colors: we create a vector of color values, one for each feature,
  # and we name its elements with the feature IDs. Then we can easily get the
  # color value for a feature from its ID.
  #   A: make a vector of color values. The syntax may appear unusual -
  #      colorRampPalette() returns a function, and we simply append
  #      the parameter (number-of-features) without assigning the function
  #      to its own variable name.
  ftrCol <- colorRampPalette(c("#f2003c", "#F0A200", "#f0ea00",
                               "#62C923", "#0A9A9B", "#1958C3",
                               "#8000D3", "#D0007F"),
                             space = "Lab",
                             interpolate = "linear")(nrow(DB$feature))
  #   B: Features may overlap, so we make the colors transparent by setting
  #      their "alpha channel" to 1/3 (hex: 55)
  ftrCol <- paste0(ftrCol, "55")
  #   C: we assign names
  names(ftrCol) <- DB$feature$ID
  # E.g. color for the third feature: ftrCol[ as.character(DB$feature$ID[3]) ]

  # write the name of the protein
  text(-30, y, adj = 1, labels = name, cex = 0.75)

  # draw a line from 0 to nchar(sequence-of-the-protein)
  lines(c(0, nchar(DB$protein$sequence[iProtein])), c(y, y),
        lwd = 3, col = "#999999")

  # get the rows of feature annotations for the protein
  iFtr <- which(DB$annotation$proteinID == DB$protein$ID[iProtein])

  # Draw a colored box for each feature. The ID is converted to character so
  # that the color is always looked up by NAME: a numeric or factor ID used
  # directly as a subscript would be interpreted as a position instead.
  for (i in iFtr) {
    drawBox(DB$annotation$start[i],
            DB$annotation$end[i],
            y,
            ftrCol[as.character(DB$annotation$featureID[i])])
  }

  invisible(NULL)
}
|
||||
|
||||
# Plot each annotated protein:
|
||||
# Get the rows of all unique annotated Mbp1 proteins in myDB
|
||||
|
||||
iRows <- grep("^MBP1_", myDB$protein$name)

# Define the size of the plot-frame to accommodate all proteins: one row per
# protein, and an x-range 10% wider than the longest sequence.
yMax <- 1.1 * length(iRows)
xMax <- 1.1 * max(nchar(myDB$protein$sequence[iRows]))

# Plot an empty frame. par() returns the previous settings; we keep them in
# oPar so they can be restored once the plot is done. The margins are
# decreased to leave more room for the plot itself.
oPar <- par(mar = c(4.2, 0.1, 3, 0.1))
plot(x = 1, y = 1,
     type = "n",
     axes = FALSE,
     bty = "n",
     xlim = c(-200, xMax + 100),
     ylim = c(0, yMax),
     main = "Mbp1 orthologue domain annotations",
     xlab = "sequence position",
     ylab = "",
     cex.axis = 0.8)
axis(side = 1, at = seq(from = 0, to = xMax, by = 100))

# Legend colours: the same palette that plotProtein() uses, one colour per
# feature, with the alpha channel set to hex 55.
myCol <- paste0(colorRampPalette(c("#f2003c", "#F0A200",
                                   "#f0ea00", "#62C923",
                                   "#0A9A9B", "#1958C3",
                                   "#8000D3", "#D0007F"),
                                 space = "Lab",
                                 interpolate = "linear")(nrow(myDB$feature)),
                "55")
legend(x = xMax - 150, y = 7,
       legend = myDB$feature$name,
       fill = myCol,
       cex = 0.7,
       bty = "n")

# Finally, iterate over all proteins and call plotProtein(); protein number i
# is drawn at height i.
for (i in seq_along(iRows)) {
  plotProtein(DB = myDB, name = myDB$protein$name[iRows[i]], y = i)
}
par(oPar)  # reset the plot parameters
|
||||
|
||||
|
||||
# The plot shows what is variable and what is constant about the annotations in
|
||||
# a group of related proteins. Your MBP1_MYSPE annotations should appear at the
|
||||
# top.
|
||||
|
||||
# Task:
|
||||
# Put a copy of the plot into your journal and interpret it with respect
|
||||
# to MBP1_MYSPE, i.e. note what you learn about MBP1_MYSPE from the plot.
|
||||
|
||||
# Task:
|
||||
# It would be better to align the motif borders, at least approximately (not
|
||||
# all proteins have all motifs). How would you go about doing that?
|
||||
|
||||
# = 3 SHARING DATA ========================================================
|
||||
|
||||
# It's particularly interesting to compare such annotations across many
|
||||
# homologous proteins. I have created a page on the Student Wiki that you can
|
||||
# edit, and then download the data from the entire class directly to your
|
||||
# RStudio project.
|
||||
#
|
||||
|
||||
# I have provided a function that extracts all information that refers to a
|
||||
# single protein from the database, and prints it out as well-formatted JSON,
|
||||
# suitable to be pasted into our shareable Wiki-page. There is a fair amount of
|
||||
# bookkeeping involved, but the code is not otherwise very enlightening so I
|
||||
# will spare you the details - it's in "./scripts/ABC-dbUtilities.R" if you
|
||||
# would want to have a look.
|
||||
|
||||
|
||||
# == 3.1 Post MBP1_MYSPE as JSON data ======================================
|
||||
|
||||
# Task:
|
||||
# =====
|
||||
# 1: Run the following code:
|
||||
|
||||
cat("{{Vspace}}",
|
||||
"<!-- ==== BEGIN PROTEIN ==== -->",
|
||||
"<pre class=\"protein-data\">",
|
||||
dbProt2JSON(sprintf("MBP1_%s", biCode(MYSPE))),
|
||||
"</pre>",
|
||||
"<!-- ===== END PROTEIN ====== -->",
|
||||
"", sep = "\n"
|
||||
)
|
||||
|
||||
# 2: Copy the entire output from the console.
|
||||
# 3: Navigate to
|
||||
# http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public
|
||||
# ... edit the page, and paste your output at the top.
|
||||
# 4: Save your edits.
|
||||
|
||||
|
||||
|
||||
# == 3.2 Import shared MBP1_MYSPE from the Wiki ============================
|
||||
|
||||
# Once we have collected a number of protein annotations, we can access the
|
||||
# Wiki-page and import the data into our database. The Wiki page is an html
|
||||
# document with lots of MediaWiki specific stuff - but the contents we are
|
||||
# interested in is enclosed in <pre class="protein-data"> ... </pre> tags. These
|
||||
# work like normal HTML <pre> tags, but we have defined a special class for them
|
||||
# to make it easy to parse out the contents we want. The rvest:: package in
|
||||
# combination with xml2:: provides us with all the tools we need for such
|
||||
# "Webscraping" of data....
|
||||
|
||||
if (! requireNamespace("rvest", quietly=TRUE)) {
|
||||
install.packages("rvest")
|
||||
}
|
||||
|
||||
if (! requireNamespace("xml2", quietly=TRUE)) {
|
||||
install.packages("xml2")
|
||||
}
|
||||
|
||||
# Here's the process:
|
||||
# The URL is an "open" page on the student Wiki. Users that are not logged in
|
||||
# can view the contents, but you can only edit if you are logged in.
|
||||
myURL <- "http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public"
|
||||
|
||||
# First thing is to retrieve the HTML from the url...
|
||||
x <- xml2::read_html(myURL)
|
||||
|
||||
# This retrieves the page source, but that still needs to be parsed into its
|
||||
# logical elements. HTML is closely related to XML and such documents are structured as
|
||||
# trees, that have "nodes" which are demarcated with "tags". rvest::html_nodes()
|
||||
# parses out the document structure and then uses a so-called "xpath" expression
|
||||
# to select nodes we are interested in. Now, xpath is one of those specialized
|
||||
# languages of which there are a few more to learn than one would care for. You
|
||||
# MUST know how to format sprintf() expressions, and you SHOULD be competent
|
||||
# with regular expressions. But if you want to be really competent in your work,
|
||||
# basic HTML and CSS is required ... and enough knowledge about xpath to be able
|
||||
# to search on Stackoverflow for what you need for parsing data out of Web
|
||||
# documents...
|
||||
|
||||
# The expression we use below is:
|
||||
# - get any node anywhere in the tree ("//*") ...
|
||||
# - that has a particular attribute("[@ ... ]").
|
||||
# - The attribute we want is that the class of the node is "protein-data";
|
||||
# that is the class we have defined for our <pre> tags.
|
||||
# As a result of this selection, we get a list of pointers to the document tree.
|
||||
y <- rvest::html_nodes(x, xpath ='//*[@class="protein-data"]')
|
||||
|
||||
# Next we fetch the actual payload - the text - from the tree:
|
||||
# rvest::html_text() gets the text from the list of pointers. The result is a
|
||||
# normal character vector, with one string per selected node.
|
||||
z <- rvest::html_text(y)
|
||||
|
||||
# Finally we can iterate over the list, and add all proteins we don't already
|
||||
# have to our database. There may well be items that are rejected because they
|
||||
# are already present in the database - for example, unless somebody has
|
||||
# annotated new features, all of the features are already there. Don't worry -
|
||||
# that is intended; we don't want duplicate entries.
|
||||
|
||||
for (thisJSON in z) {
  # The Wiki page is edited by hand, so a block may not be valid JSON. Such a
  # block must not abort the whole import: report it and move on.
  thisData <- tryCatch(
    jsonlite::fromJSON(thisJSON),
    error = function(e) {
      warning("Skipping a protein-data block that is not valid JSON: ",
              conditionMessage(e), call. = FALSE)
      NULL
    })
  if (! is.list(thisData)) {
    next
  }

  # Only import proteins we don't have yet. any() reduces the test to a single
  # TRUE or FALSE, which if() requires: a block without a protein name gives
  # FALSE (skipped), and a block with several names is imported if at least
  # one of them is new.
  if (any(! thisData$protein$name %in% myDB$protein$name)) {
    myDB <- dbAddProtein(myDB, thisData$protein)
    myDB <- dbAddTaxonomy(myDB, thisData$taxonomy)
    myDB <- dbAddFeature(myDB, thisData$feature)
    myDB <- dbAddAnnotation(myDB, thisData$annotation)
  }
}
|
||||
|
||||
# Finally, we can repeat our domain plot with the results - which now include the shared proteins:
|
||||
|
||||
iRows <- grep("^MBP1_", myDB$protein$name)
yMax <- 1.1 * length(iRows)
xMax <- 1.1 * max(nchar(myDB$protein$sequence[iRows]))  # longest sequence

# Plot an empty frame; the previous graphics parameters are kept in oPar so
# they can be restored afterwards.
oPar <- par(mar = c(4.2, 0.1, 3, 0.1))
plot(x = 1, y = 1,
     type = "n",
     axes = FALSE,
     bty = "n",
     xlim = c(-200, xMax + 100),
     ylim = c(0, yMax),
     main = "Mbp1 orthologue domain annotations",
     xlab = "sequence position",
     ylab = "",
     cex.axis = 0.8)
axis(side = 1, at = seq(from = 0, to = xMax, by = 100))

# Legend colours: one per feature, with the alpha channel set to hex 55.
myCol <- paste0(colorRampPalette(c("#f2003c", "#F0A200",
                                   "#f0ea00", "#62C923",
                                   "#0A9A9B", "#1958C3",
                                   "#8000D3", "#D0007F"),
                                 space = "Lab",
                                 interpolate = "linear")(nrow(myDB$feature)),
                "55")
legend(x = xMax - 150, y = 7,
       legend = myDB$feature$name,
       fill = myCol,
       cex = 0.7,
       bty = "n")

# Draw every protein; protein number i is drawn at height i.
for (i in seq_along(iRows)) {
  plotProtein(DB = myDB, name = myDB$protein$name[iRows[i]], y = i)
}
par(oPar)  # reset the plot parameters
|
||||
|
||||
# ... the more proteins we can compare, the more we learn about the
|
||||
# architectural principles of this family's domains.
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "BIN-FUNC-Domain_annotation.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-FUNC-Domain_annotation unit.
|
||||
#
|
||||
# ==============================================================================
|
||||
# Version: 1.4
|
||||
#
|
||||
# Date: 2017-11 - 2020-10
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.4 Add code for shared data import from the Wiki
|
||||
# 1.3 Add code for database export to JSON and instructions
|
||||
# for uploading annotations to the Public Student Wiki page
|
||||
# 1.2 Consistently: data in ./myScripts/ ;
|
||||
# begin SHARING DATA section
|
||||
# 1.1 2020 Updates
|
||||
# 1.0 Live version 2017
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
# TODO:
|
||||
# Put the domain plot into a function
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------------------------------------
|
||||
#TOC> 1 Update your database script 51
|
||||
#TOC> 1.1 Preparing an annotation file ... 58
|
||||
#TOC> 1.1.1 BEFORE "BIN-ALI-Optimal_sequence_alignment" 61
|
||||
#TOC> 1.1.2 AFTER "BIN-ALI-Optimal_sequence_alignment" 109
|
||||
#TOC> 1.2 Execute and Validate 136
|
||||
#TOC> 2 Plot Annotations 161
|
||||
#TOC> 3 SHARING DATA 287
|
||||
#TOC> 3.1 Post MBP1_MYSPE as JSON data 303
|
||||
#TOC> 3.2 Import shared MBP1_MYSPE from the Wiki 326
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Update your database script =========================================
|
||||
|
||||
|
||||
# Since you have recorded domain features at the SMART database, we can store
|
||||
# the feature annotations in myDB ...
|
||||
|
||||
|
||||
# == 1.1 Preparing an annotation file ... ==================================
|
||||
|
||||
|
||||
# === 1.1.1 BEFORE "BIN-ALI-Optimal_sequence_alignment"
|
||||
#
|
||||
# IF YOU HAVE NOT YET COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT:
|
||||
#
|
||||
# You DON'T already have a file called "<MYSPE>-Annotations.json" in the
|
||||
# ./myScripts/ directory:
|
||||
#
|
||||
# - Make a copy of the file "./data/refAnnotations.json" and put it in your
|
||||
# myScripts/ directory.
|
||||
#
|
||||
# - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.
|
||||
# if MYSPE is called "Cryptococcus neoformans", your file should be called
|
||||
# "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is
|
||||
# "MBP1_CRYNE").
|
||||
#
|
||||
# - Open the file in the RStudio editor and delete all blocks for
|
||||
# the Mbp1 protein annotations except the first one.
|
||||
#
|
||||
# - From that block, delete all lines that have annotations you did not
|
||||
# find in SMART for MBP1_MYSPE.
|
||||
#
|
||||
# - Make enough copies of the "Ankyrin fold" and "low complexity" region
|
||||
# lines to have a line for each feature you found.
|
||||
#
|
||||
# - Then delete the comma at the end of the last line.
|
||||
#
|
||||
# - Edit the annotations: change MBP1_SACCE to MBP1_<MYSPE> everywhere
|
||||
# and change the "start" and "end" features to the coordinates you
|
||||
# recorded in the SMART database.
|
||||
#
|
||||
# - Save your file in the ./myScripts/ folder.
|
||||
#
|
||||
# - Validate your file online at https://jsonlint.com/
|
||||
#
|
||||
# - Update your "./myScripts/makeProteinDB.R" script to load your new
|
||||
# annotation when you recreate the database. Open the script in the
|
||||
# RStudio editor, and add the following command at the end:
|
||||
#
|
||||
# myDB <- dbAddAnnotation(myDB,
|
||||
# jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json"))
|
||||
# ^^^^^^^
|
||||
# edit this!
|
||||
#
|
||||
# - save and close the file.
|
||||
#
|
||||
# Then SKIP the next section.
|
||||
#
|
||||
#
|
||||
# === 1.1.2 AFTER "BIN-ALI-Optimal_sequence_alignment"
|
||||
#
|
||||
# IF YOU HAVE ALREADY COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT:
|
||||
#
|
||||
# You SHOULD have a file called "<MYSPE>-Annotations.json" in the
|
||||
# ./myScripts/ directory:
|
||||
#
|
||||
# - Open the file in the RStudio editor.
|
||||
#
|
||||
# - Make as many copies of the "APSES fold" line as you have found
|
||||
# features in SMART.
|
||||
#
|
||||
# - Add a comma after every line except for the last one
|
||||
#
|
||||
# - Edit the annotations but include only features that are in the
|
||||
# myDB$feature table. Check which features are in the database by executing
|
||||
#
|
||||
# myDB$feature$name
|
||||
#
|
||||
# - Update the "start" and "end" coordinates for each feature to the
|
||||
# values you found.
|
||||
#
|
||||
# - Save your file.
|
||||
#
|
||||
# - Validate your file online at https://jsonlint.com/
|
||||
#
|
||||
#
|
||||
# == 1.2 Execute and Validate ==============================================
|
||||
#
|
||||
# - source() your database creation script:
|
||||
#
|
||||
# source("./myScripts/makeProteinDB.R")
|
||||
#
|
||||
# This should run without errors or warnings. If it doesn't work and you
|
||||
# can't figure out quickly what's happening, ask for help on the
|
||||
# Discussion Board.
|
||||
#
|
||||
# - Confirm
|
||||
# The following commands should retrieve all of the features that have been
|
||||
# annotated for MBP1_MYSPE
|
||||
|
||||
# Select the row of this species' Mbp1 orthologue in the protein table.
sel <- myDB$protein$name == paste0("MBP1_", biCode(MYSPE))

# Walk from the protein ID to the IDs of its annotations, and from there to
# the IDs of the annotated features. Rows are selected by matching ID values
# (%in%, match()), not by using the IDs themselves as subscripts: subscripting
# a column with ID values only works if the IDs happen to be row numbers, and
# yields NA or the wrong rows otherwise.
(proID <- myDB$protein$ID[sel])
(fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID %in% proID])
(ftrIDs <- unique(myDB$annotation$featureID[myDB$annotation$ID %in% fanIDs]))

# This should list ALL of your annotated features (once). If not, consider
# what could have gone wrong and ask on the list if you have difficulties
# fixing it.
myDB$feature$name[match(ftrIDs, myDB$feature$ID)]
|
||||
|
||||
|
||||
# = 2 Plot Annotations ====================================================
|
||||
|
||||
# In this section we will plot domain annotations as colored rectangles on a
|
||||
# sequence, as an example of using the R plotting system for generic, data
|
||||
# driven images.
|
||||
|
||||
# We need a small utility function that draws the annotation boxes on a
|
||||
# representation of sequence. It should accept the start and end coordinates,
|
||||
# the y value where it should be plotted and the color of the box, and plot a
|
||||
# rectangle using R's rect() function.
|
||||
|
||||
drawBox <- function(xStart, xEnd, y, myCol, DELTA = 0.2) {
  # Purpose:
  #   Add a filled rectangle with a black outline to the current plot.
  # Parameters:
  #   xStart, xEnd:  left and right x-coordinates of the rectangle
  #   y:             vertical centre of the rectangle
  #   myCol:         fill colour
  #   DELTA:         half-height; the rectangle spans y - DELTA to y + DELTA
  # Value:
  #   The return value of rect(); the function is called for its side effect
  #   of drawing on the active graphics device.
  yBottom <- y - DELTA
  yTop    <- y + DELTA
  rect(xleft = xStart, ybottom = yBottom, xright = xEnd, ytop = yTop,
       col = myCol, border = "black")
}
|
||||
|
||||
# test this:
|
||||
plot(c(-1.5, 1.5), c(0, 0), type = "l")
|
||||
drawBox(-1, 1, 0.0, "peachpuff")
|
||||
|
||||
# Next, we define a function to plot annotations for one protein: the name of
|
||||
# the protein, a horizontal grey line for its length, and all of its features.
|
||||
|
||||
plotProtein <- function(DB, name, y) {
  # Purpose:
  #   Add one protein to an existing plot: its name as a label, a horizontal
  #   grey line from 0 to the length of its sequence, and one semi-transparent
  #   coloured box for each of its annotated features.
  # Parameters:
  #   DB:    protein database. The tables DB$protein (ID, name, sequence),
  #          DB$feature (ID) and DB$annotation (proteinID, featureID, start,
  #          end) are read.
  #   name:  the name of the protein in the database. It must match exactly
  #          one row of DB$protein, otherwise the function stops.
  #   y:     height where to draw the plot
  # Value:
  #   NULL, invisibly. Called for its side effect of drawing on the active
  #   graphics device with text(), lines() and drawBox().

  # Find the row-index of the protein in the protein table of DB. Fail early
  # with a clear message, rather than with an obscure length-mismatch error
  # from lines() further down.
  iProtein <- which(DB$protein$name == name)
  if (length(iProtein) != 1) {
    stop(sprintf("Expected exactly one protein named \"%s\" in DB, found %d.",
                 name, length(iProtein)),
         call. = FALSE)
  }

  # Define colors: we create a vector of color values, one for each feature,
  # and we name its elements with the feature IDs. Then we can easily get the
  # color value for a feature from its ID.
  #   A: make a vector of color values. The syntax may appear unusual -
  #      colorRampPalette() returns a function, and we simply append
  #      the parameter (number-of-features) without assigning the function
  #      to its own variable name.
  ftrCol <- colorRampPalette(c("#f2003c", "#F0A200", "#f0ea00",
                               "#62C923", "#0A9A9B", "#1958C3",
                               "#8000D3", "#D0007F"),
                             space = "Lab",
                             interpolate = "linear")(nrow(DB$feature))
  #   B: Features may overlap, so we make the colors transparent by setting
  #      their "alpha channel" to 1/3 (hex: 55)
  ftrCol <- paste0(ftrCol, "55")
  #   C: we assign names
  names(ftrCol) <- DB$feature$ID
  # E.g. color for the third feature: ftrCol[ as.character(DB$feature$ID[3]) ]

  # write the name of the protein
  text(-30, y, adj = 1, labels = name, cex = 0.75)

  # draw a line from 0 to nchar(sequence-of-the-protein)
  lines(c(0, nchar(DB$protein$sequence[iProtein])), c(y, y),
        lwd = 3, col = "#999999")

  # get the rows of feature annotations for the protein
  iFtr <- which(DB$annotation$proteinID == DB$protein$ID[iProtein])

  # Draw a colored box for each feature. The ID is converted to character so
  # that the color is always looked up by NAME: a numeric or factor ID used
  # directly as a subscript would be interpreted as a position instead.
  for (i in iFtr) {
    drawBox(DB$annotation$start[i],
            DB$annotation$end[i],
            y,
            ftrCol[as.character(DB$annotation$featureID[i])])
  }

  invisible(NULL)
}
|
||||
|
||||
# Plot each annotated protein:
|
||||
# Get the rows of all unique annotated Mbp1 proteins in myDB
|
||||
|
||||
iRows <- grep("^MBP1_", myDB$protein$name)

# Define the size of the plot-frame to accommodate all proteins: one row per
# protein, and an x-range 10% wider than the longest sequence.
yMax <- 1.1 * length(iRows)
xMax <- 1.1 * max(nchar(myDB$protein$sequence[iRows]))

# Plot an empty frame. par() returns the previous settings; we keep them in
# oPar so they can be restored once the plot is done. The margins are
# decreased to leave more room for the plot itself.
oPar <- par(mar = c(4.2, 0.1, 3, 0.1))
plot(x = 1, y = 1,
     type = "n",
     axes = FALSE,
     bty = "n",
     xlim = c(-200, xMax + 100),
     ylim = c(0, yMax),
     main = "Mbp1 orthologue domain annotations",
     xlab = "sequence position",
     ylab = "",
     cex.axis = 0.8)
axis(side = 1, at = seq(from = 0, to = xMax, by = 100))

# Legend colours: the same palette that plotProtein() uses, one colour per
# feature, with the alpha channel set to hex 55.
myCol <- paste0(colorRampPalette(c("#f2003c", "#F0A200",
                                   "#f0ea00", "#62C923",
                                   "#0A9A9B", "#1958C3",
                                   "#8000D3", "#D0007F"),
                                 space = "Lab",
                                 interpolate = "linear")(nrow(myDB$feature)),
                "55")
legend(x = xMax - 150, y = 7,
       legend = myDB$feature$name,
       fill = myCol,
       cex = 0.7,
       bty = "n")

# Finally, iterate over all proteins and call plotProtein(); protein number i
# is drawn at height i.
for (i in seq_along(iRows)) {
  plotProtein(DB = myDB, name = myDB$protein$name[iRows[i]], y = i)
}
par(oPar)  # reset the plot parameters
|
||||
|
||||
|
||||
# The plot shows what is variable and what is constant about the annotations in
|
||||
# a group of related proteins. Your MBP1_MYSPE annotations should appear at the
|
||||
# top.
|
||||
|
||||
# Task:
|
||||
# Put a copy of the plot into your journal and interpret it with respect
|
||||
# to MBP1_MYSPE, i.e. note what you learn about MBP1_MYSPE from the plot.
|
||||
|
||||
# Task:
|
||||
# It would be better to align the motif borders, at least approximately (not
|
||||
# all proteins have all motifs). How would you go about doing that?
|
||||
|
||||
# = 3 SHARING DATA ========================================================
|
||||
|
||||
# It's particularly interesting to compare such annotations across many
|
||||
# homologous proteins. I have created a page on the Student Wiki that you can
|
||||
# edit, and then download the data from the entire class directly to your
|
||||
# RStudio project.
|
||||
#
|
||||
|
||||
# I have provided a function that extracts all information that refers to a
|
||||
# single protein from the database, and prints it out as well-formatted JSON,
|
||||
# suitable to be pasted into our shareable Wiki-page. There is a fair amount of
|
||||
# bookkeeping involved, but the code is not otherwise very enlightening so I
|
||||
# will spare you the details - it's in "./scripts/ABC-dbUtilities.R" if you
|
||||
# would want to have a look.
|
||||
|
||||
|
||||
# == 3.1 Post MBP1_MYSPE as JSON data ======================================
|
||||
|
||||
# Task:
|
||||
# =====
|
||||
# 1: Run the following code:
|
||||
|
||||
cat("{{Vspace}}",
|
||||
"<!-- ==== BEGIN PROTEIN ==== -->",
|
||||
"<pre class=\"protein-data\">",
|
||||
dbProt2JSON(sprintf("MBP1_%s", biCode(MYSPE))),
|
||||
"</pre>",
|
||||
"<!-- ===== END PROTEIN ====== -->",
|
||||
"", sep = "\n"
|
||||
)
|
||||
|
||||
# 2: Copy the entire output from the console.
|
||||
# 3: Navigate to
|
||||
# http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public
|
||||
# ... edit the page, and paste your output at the top.
|
||||
# 4: Save your edits.
|
||||
|
||||
|
||||
|
||||
# == 3.2 Import shared MBP1_MYSPE from the Wiki ============================
|
||||
|
||||
# Once we have collected a number of protein annotations, we can access the
|
||||
# Wiki-page and import the data into our database. The Wiki page is an html
|
||||
# document with lots of MediaWiki specific stuff - but the contents we are
|
||||
# interested in is enclosed in <pre class="protein-data"> ... </pre> tags. These
|
||||
# work like normal HTML <pre> tags, but we have defined a special class for them
|
||||
# to make it easy to parse out the contents we want. The rvest:: package in
|
||||
# combination with xml2:: provides us with all the tools we need for such
|
||||
# "Webscraping" of data....
|
||||
|
||||
if (! requireNamespace("rvest", quietly=TRUE)) {
|
||||
install.packages("rvest")
|
||||
}
|
||||
|
||||
if (! requireNamespace("xml2", quietly=TRUE)) {
|
||||
install.packages("xml2")
|
||||
}
|
||||
|
||||
# Here's the process:
|
||||
# The URL is an "open" page on the student Wiki. Users that are not logged in
|
||||
# can view the contents, but you can only edit if you are logged in.
|
||||
myURL <- "http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public"
|
||||
|
||||
# First thing is to retrieve the HTML from the url...
|
||||
x <- xml2::read_html(myURL)
|
||||
|
||||
# This retrieves the page source, but that still needs to be parsed into its
|
||||
# logical elements. HTML is closely related to XML and such documents are structured as
|
||||
# trees, that have "nodes" which are demarcated with "tags". rvest::html_nodes()
|
||||
# parses out the document structure and then uses a so-called "xpath" expression
|
||||
# to select nodes we are interested in. Now, xpath is one of those specialized
|
||||
# languages of which there are a few more to learn than one would care for. You
|
||||
# MUST know how to format sprintf() expressions, and you SHOULD be competent
|
||||
# with regular expressions. But if you want to be really competent in your work,
|
||||
# basic HTML and CSS is required ... and enough knowledge about xpath to be able
|
||||
# to search on Stackoverflow for what you need for parsing data out of Web
|
||||
# documents...
|
||||
|
||||
# The expression we use below is:
|
||||
# - get any node anywhere in the tree ("//*") ...
|
||||
# - that has a particular attribute("[@ ... ]").
|
||||
# - The attribute we want is that the class of the node is "protein-data";
|
||||
# that is the class we have defined for our <pre> tags.
|
||||
# As a result of this selection, we get a list of pointers to the document tree.
|
||||
y <- rvest::html_nodes(x, xpath ='//*[@class="protein-data"]')
|
||||
|
||||
# Next we fetch the actual payload - the text - from the tree:
|
||||
# rvest::html_text() gets the text from the list of pointers. The result is a
|
||||
# normal character vector, with one string per selected node.
|
||||
z <- rvest::html_text(y)
|
||||
|
||||
# Finally we can iterate over the list, and add all proteins we don't already
|
||||
# have to our database. There may well be items that are rejected because they
|
||||
# are already present in the database - for example, unless somebody has
|
||||
# annotated new features, all of the features are already there. Don't worry -
|
||||
# that is intended; we don't want duplicate entries.
|
||||
|
||||
for (thisJSON in z) {
  # The Wiki page is edited by hand, so a block may not be valid JSON. Such a
  # block must not abort the whole import: report it and move on.
  thisData <- tryCatch(
    jsonlite::fromJSON(thisJSON),
    error = function(e) {
      warning("Skipping a protein-data block that is not valid JSON: ",
              conditionMessage(e), call. = FALSE)
      NULL
    })
  if (! is.list(thisData)) {
    next
  }

  # Only import proteins we don't have yet. any() reduces the test to a single
  # TRUE or FALSE, which if() requires: a block without a protein name gives
  # FALSE (skipped), and a block with several names is imported if at least
  # one of them is new.
  if (any(! thisData$protein$name %in% myDB$protein$name)) {
    myDB <- dbAddProtein(myDB, thisData$protein)
    myDB <- dbAddTaxonomy(myDB, thisData$taxonomy)
    myDB <- dbAddFeature(myDB, thisData$feature)
    myDB <- dbAddAnnotation(myDB, thisData$annotation)
  }
}
|
||||
|
||||
# Finally, we can repeat our domain plot with the results - which now include the shared proteins:
|
||||
|
||||
iRows <- grep("^MBP1_", myDB$protein$name)
yMax <- 1.1 * length(iRows)
xMax <- 1.1 * max(nchar(myDB$protein$sequence[iRows]))  # longest sequence

# Plot an empty frame; the previous graphics parameters are kept in oPar so
# they can be restored afterwards.
oPar <- par(mar = c(4.2, 0.1, 3, 0.1))
plot(x = 1, y = 1,
     type = "n",
     axes = FALSE,
     bty = "n",
     xlim = c(-200, xMax + 100),
     ylim = c(0, yMax),
     main = "Mbp1 orthologue domain annotations",
     xlab = "sequence position",
     ylab = "",
     cex.axis = 0.8)
axis(side = 1, at = seq(from = 0, to = xMax, by = 100))

# Legend colours: one per feature, with the alpha channel set to hex 55.
myCol <- paste0(colorRampPalette(c("#f2003c", "#F0A200",
                                   "#f0ea00", "#62C923",
                                   "#0A9A9B", "#1958C3",
                                   "#8000D3", "#D0007F"),
                                 space = "Lab",
                                 interpolate = "linear")(nrow(myDB$feature)),
                "55")
legend(x = xMax - 150, y = 7,
       legend = myDB$feature$name,
       fill = myCol,
       cex = 0.7,
       bty = "n")

# Draw every protein; protein number i is drawn at height i.
for (i in seq_along(iRows)) {
  plotProtein(DB = myDB, name = myDB$protein$name[iRows[i]], y = i)
}
par(oPar)  # reset the plot parameters
|
||||
|
||||
# ... the more proteins we can compare, the more we learn about the
|
||||
# architectural principles of this family's domains.
|
||||
|
||||
|
||||
# [END]
|
||||
|
@ -1,169 +1,169 @@
|
||||
# tocID <- "BIN-FUNC-Semantic_similarity.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-FUNC_Semantic_similarity unit.
|
||||
#
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017-11 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 2020 Maintenance
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
#                      use BiocManager:: not biocLite()
|
||||
# 1.0 New code.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> --------------------------------------------------------------------
|
||||
#TOC> 1 Preparations: Packages, AnnotationDB, Setup 43
|
||||
#TOC> 2 Fetch GO Annotations 100
|
||||
#TOC> 3 Semantic Similarities 109
|
||||
#TOC> 4 GO Term Enrichment in Gene Sets 127
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Preparations: Packages, AnnotationDB, Setup =========================
|
||||
|
||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
|
||||
# GOSim is an R-package in the Bioconductor project.
|
||||
if (! requireNamespace("GOSim", quietly = TRUE)) {
|
||||
BiocManager::install("GOSim")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = GOSim) # basic information
|
||||
# browseVignettes("GOSim") # available vignettes
|
||||
# data(package = "GOSim") # available datasets
|
||||
|
||||
# GOSim makes extensive assumptions about loaded packages, and many base
|
||||
# methods are masked. We will thus use library(GOSim) to load it
|
||||
# in its entirety and with all packages it depends on. We will still use
|
||||
# the <package>::<function>() syntax in the code below, but this now serves
|
||||
# more of a didactic purpose, rather than actual syntax requirements.
|
||||
|
||||
library(GOSim)
|
||||
|
||||
# GOSim loads human annotations in org.Hs.eg.db by default. We load yeast
|
||||
# annotations instead...
|
||||
if (! requireNamespace("org.Sc.sgd.db", quietly = TRUE)) {
|
||||
BiocManager::install("org.Sc.sgd.db")
|
||||
}
|
||||
|
||||
# Bioconductor annotation packages won't work stably unless we actually load
|
||||
# them:
|
||||
library(org.Sc.sgd.db)
|
||||
|
||||
# org.Sc.sgd.db is a Bioconductor annotation database curated by SGD. Such
|
||||
# databases exist for all model organisms. It's a kind of a fancy data frame
|
||||
# from which we can get annotations by rows (genes) with the keys() function ...
|
||||
AnnotationDbi::keys(org.Sc.sgd.db)[1500:1510]
|
||||
|
||||
# ... and the types of available annotations with the columns() function
|
||||
AnnotationDbi::columns(org.Sc.sgd.db)
|
||||
|
||||
# Note that one of the columns is "GO" ... and we load that into the
|
||||
# datastructures used by GOSim:
|
||||
|
||||
# Choose GOterms to use
|
||||
GOSim::setEvidenceLevel(evidences = "all",
|
||||
organism = org.Sc.sgdORGANISM,
|
||||
gomap = org.Sc.sgdGO)
|
||||
|
||||
# Use Biological Process ontology
|
||||
GOSim::setOntology("BP", loadIC = FALSE)
|
||||
|
||||
# confirm that we loaded the correct ontology
|
||||
head(get("gomap", envir = GOSimEnv))
|
||||
|
||||
|
||||
|
||||
# = 2 Fetch GO Annotations ================================================
|
||||
|
||||
|
||||
# All keys being used here are yeast systematic names.
|
||||
|
||||
# Get one set of annotations
|
||||
GOSim::getGOInfo(c("YDL056W")) # Mbp1
|
||||
|
||||
|
||||
# = 3 Semantic Similarities ===============================================
|
||||
|
||||
|
||||
# Get semantic similarities between genes
|
||||
?getGeneSim
|
||||
|
||||
# There are _many_ different metrics of term similarity implemented
|
||||
# in this package.
|
||||
|
||||
# Mbp1 and...
|
||||
# Each call scores the similarity of the GO annotations of Mbp1 (YDL056W) and
# one other yeast gene, identified by its systematic name. The last gene is a
# metabolic enzyme that serves as an unrelated reference; Pgk1 has the
# systematic name YCR012W.
GOSim::getGeneSim("YDL056W", "YLR182W", similarity = "OA") # Swi6 - MCB complex
GOSim::getGeneSim("YDL056W", "YER111C", similarity = "OA") # Swi4 - collaborators
GOSim::getGeneSim("YDL056W", "YBR160W", similarity = "OA") # Cdc28 - mediator
GOSim::getGeneSim("YDL056W", "YGR108W", similarity = "OA") # Clb1 - antagonist
GOSim::getGeneSim("YDL056W", "YLR079W", similarity = "OA") # Sic1 - antagonist
GOSim::getGeneSim("YDL056W", "YCR012W", similarity = "OA") # Pgk1 - Gluconeogenesis
|
||||
|
||||
|
||||
# = 4 GO Term Enrichment in Gene Sets =====================================
|
||||
|
||||
|
||||
# Calculating GO term enrichment in gene sets is done with the Bioconductor
|
||||
# topGO package.
|
||||
if (! requireNamespace("topGO", quietly = TRUE)) {
|
||||
BiocManager::install("topGO")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = topGO) # basic information
|
||||
# browseVignettes("topGO") # available vignettes
|
||||
# data(package = "topGO") # available datasets
|
||||
|
||||
# Once again - assumptions are made by GOSim that require us to load the
|
||||
# topGO package wholesale:
|
||||
library(topGO)
|
||||
|
||||
# Let's define a gene set: GOterm enrichment for G1/S switch activators:
|
||||
mySet <- c("YFR028C", # Cdc14
|
||||
"YDL056W", # Mbp1
|
||||
"YLR182W", # Swi6
|
||||
"YER111C", # Swi4
|
||||
"YOR083W", # Whi5
|
||||
"YBR160W", # Cdc28
|
||||
"YMR199W", # Cln1
|
||||
"YPL256C", # Cln2
|
||||
"YAL040C") # Cln3
|
||||
|
||||
# The context against which we define enrichment: all keys of the yeast
# annotation database that start with "Y".
allGenes <- AnnotationDbi::keys(org.Sc.sgd.db)
allGenes <- allGenes[grepl("^Y", allGenes)]

# Test mySet for enriched GO terms relative to allGenes. The call is written
# with the <package>::<function>() idiom, like all other calls in this script.
myEnr <- GOSim::GOenrichment(mySet, allGenes)
|
||||
|
||||
sort(myEnr$p.values) # Any significantly enriched terms? All of these are ...
|
||||
|
||||
#Most significantly enriched is GO:0071931. What is this?
|
||||
annotate::getGOTerm("GO:0071931") # ... makes sense.
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "BIN-FUNC-Semantic_similarity.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-FUNC_Semantic_similarity unit.
|
||||
#
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017-11 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 2020 Maintenance
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
#                      use BiocManager:: not biocLite()
|
||||
# 1.0 New code.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> --------------------------------------------------------------------
|
||||
#TOC> 1 Preparations: Packages, AnnotationDB, Setup 43
|
||||
#TOC> 2 Fetch GO Annotations 100
|
||||
#TOC> 3 Semantic Similarities 109
|
||||
#TOC> 4 GO Term Enrichment in Gene Sets 127
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Preparations: Packages, AnnotationDB, Setup =========================
|
||||
|
||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
|
||||
# GOSim is an R-package in the Bioconductor project.
|
||||
if (! requireNamespace("GOSim", quietly = TRUE)) {
|
||||
BiocManager::install("GOSim")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = GOSim) # basic information
|
||||
# browseVignettes("GOSim") # available vignettes
|
||||
# data(package = "GOSim") # available datasets
|
||||
|
||||
# GOSim makes extensive assumptions about loaded packages, and many base
|
||||
# methods are masked. We will thus use library(GOSim) to load it
|
||||
# in its entirety and with all packages it depends on. We will still use
|
||||
# the <package>::<function>() syntax in the code below, but this now serves
|
||||
# more of a didactic purpose, rather than actual syntax requirements.
|
||||
|
||||
library(GOSim)
|
||||
|
||||
# GOSim loads human annotations in org.Hs.eg.db by default. We load yeast
|
||||
# annotations instead...
|
||||
if (! requireNamespace("org.Sc.sgd.db", quietly = TRUE)) {
|
||||
BiocManager::install("org.Sc.sgd.db")
|
||||
}
|
||||
|
||||
# Bioconductor annotation packages won't work stably unless we actually load
|
||||
# them:
|
||||
library(org.Sc.sgd.db)
|
||||
|
||||
# org.Sc.sgd.db is a Bioconductor annotation database curated by SGD. Such
|
||||
# databases exist for all model organisms. It's a kind of a fancy data frame
|
||||
# from which we can get annotations by rows (genes) with the keys() function ...
|
||||
AnnotationDbi::keys(org.Sc.sgd.db)[1500:1510]
|
||||
|
||||
# ... and the types of available annotations with the columns() function
|
||||
AnnotationDbi::columns(org.Sc.sgd.db)
|
||||
|
||||
# Note that one of the columns is "GO" ... and we load that into the
|
||||
# datastructures used by GOSim:
|
||||
|
||||
# Choose GOterms to use
|
||||
GOSim::setEvidenceLevel(evidences = "all",
|
||||
organism = org.Sc.sgdORGANISM,
|
||||
gomap = org.Sc.sgdGO)
|
||||
|
||||
# Use Biological Process ontology
|
||||
GOSim::setOntology("BP", loadIC = FALSE)
|
||||
|
||||
# confirm that we loaded the correct ontology
|
||||
head(get("gomap", envir = GOSimEnv))
|
||||
|
||||
|
||||
|
||||
# = 2 Fetch GO Annotations ================================================
|
||||
|
||||
|
||||
# All keys being used here are yeast systematic names.
|
||||
|
||||
# Get one set of annotations
|
||||
GOSim::getGOInfo(c("YDL056W")) # Mbp1
|
||||
|
||||
|
||||
# = 3 Semantic Similarities ===============================================
|
||||
|
||||
|
||||
# Get semantic similarities between genes
|
||||
?getGeneSim
|
||||
|
||||
# There are _many_ different metrics of term similarity implemented
|
||||
# in this package.
|
||||
|
||||
# Mbp1 and...
|
||||
# Each call scores the similarity of the GO annotations of Mbp1 (YDL056W) and
# one other yeast gene, identified by its systematic name. The last gene is a
# metabolic enzyme that serves as an unrelated reference; Pgk1 has the
# systematic name YCR012W.
GOSim::getGeneSim("YDL056W", "YLR182W", similarity = "OA") # Swi6 - MCB complex
GOSim::getGeneSim("YDL056W", "YER111C", similarity = "OA") # Swi4 - collaborators
GOSim::getGeneSim("YDL056W", "YBR160W", similarity = "OA") # Cdc28 - mediator
GOSim::getGeneSim("YDL056W", "YGR108W", similarity = "OA") # Clb1 - antagonist
GOSim::getGeneSim("YDL056W", "YLR079W", similarity = "OA") # Sic1 - antagonist
GOSim::getGeneSim("YDL056W", "YCR012W", similarity = "OA") # Pgk1 - Gluconeogenesis
|
||||
|
||||
|
||||
# = 4 GO Term Enrichment in Gene Sets =====================================
|
||||
|
||||
|
||||
# Calculating GO term enrichment in gene sets is done with the Bioconductor
|
||||
# topGO package.
|
||||
if (! requireNamespace("topGO", quietly = TRUE)) {
|
||||
BiocManager::install("topGO")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = topGO) # basic information
|
||||
# browseVignettes("topGO") # available vignettes
|
||||
# data(package = "topGO") # available datasets
|
||||
|
||||
# Once again - assumptions are made by GOSim that require us to load the
|
||||
# topGO package wholesale:
|
||||
library(topGO)
|
||||
|
||||
# Let's define a gene set: GOterm enrichment for G1/S switch activators:
|
||||
mySet <- c("YFR028C", # Cdc14
|
||||
"YDL056W", # Mbp1
|
||||
"YLR182W", # Swi6
|
||||
"YER111C", # Swi4
|
||||
"YOR083W", # Whi5
|
||||
"YBR160W", # Cdc28
|
||||
"YMR199W", # Cln1
|
||||
"YPL256C", # Cln2
|
||||
"YAL040C") # Cln3
|
||||
|
||||
# The context against which we define enrichment: all keys of the yeast
# annotation database that start with "Y".
allGenes <- AnnotationDbi::keys(org.Sc.sgd.db)
allGenes <- allGenes[grepl("^Y", allGenes)]

# Test mySet for enriched GO terms relative to allGenes. The call is written
# with the <package>::<function>() idiom, like all other calls in this script.
myEnr <- GOSim::GOenrichment(mySet, allGenes)
|
||||
|
||||
sort(myEnr$p.values) # Any significantly enriched terms? All of these are ...
|
||||
|
||||
#Most significantly enriched is GO:0071931. What is this?
|
||||
annotate::getGOTerm("GO:0071931") # ... makes sense.
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
|
702
BIN-MYSPE.R
702
BIN-MYSPE.R
@ -1,351 +1,351 @@
|
||||
# tocID <- "BIN-MYSPE.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-MYSPE unit
|
||||
#
|
||||
#
|
||||
# Version: 1.4
|
||||
#
|
||||
# Date: 2017-09 - 2021-10
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# V 1.4 Add troubleshooting hints via errText[[...]]
|
||||
# V 1.3 2021 update of MYSPE mechanics; fix a bug no one had complained about
|
||||
# V 1.2 Reorganized proportional plot section into a "further reading"
|
||||
# section, added nested-box, and sankey plot visualization of
|
||||
# proportions. Introduced plotly.
|
||||
# V 1.1 2020 Workflow changes
|
||||
# V 1.0.1 Move ABC-makeMYSPElist.R to ./scripts directory
|
||||
# V 1.0 Final code, after rewriting BLAST parser and updating MYSPElist
|
||||
# V 0.1 First code copied from BCH441_A03_makeMYSPElist.R
|
||||
#
|
||||
# TODO: Sample solution for sankey plot function.
|
||||
#
|
||||
#
|
||||
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
|
||||
#
|
||||
# DO NOT SIMPLY source() THESE FILES!
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -----------------------------------------------------------------
|
||||
#TOC> 1 PREPARATIONS 52
|
||||
#TOC> 2 SUITABLE MYSPE SPECIES 65
|
||||
#TOC> 3 ADOPT "MYSPE" 89
|
||||
#TOC> 4 FURTHER READING: PLOTTING PROPORTIONS 128
|
||||
#TOC> 4.1 Percentages 146
|
||||
#TOC> 4.2 Visualizing proportions: Pie chart 165
|
||||
#TOC> 4.3 Visualizing proportions: Nested squares 243
|
||||
#TOC> 4.4 Visualizing proportions: Sankey diagrams 280
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 PREPARATIONS ========================================================
|
||||
#
|
||||
|
||||
# Execute the two conditionals below:
|
||||
if (! file.exists("./myScripts/.myProfile.R")) {
|
||||
stop(errText[["noProfileFile"]]) # message defined in .Rprofile
|
||||
}
|
||||
|
||||
if (! exists("myStudentNumber")) {
|
||||
stop(errText[["noStudentNumber"]]) # message defined in .Rprofile
|
||||
}
|
||||
|
||||
|
||||
# = 2 SUITABLE MYSPE SPECIES ==============================================
|
||||
|
||||
|
||||
# In this unit we will select one species from a list of genome sequenced fungi
|
||||
# and write it into your personalized profile file. This species will be called
|
||||
# "MYSPE" (My Species) for other learning units and exercises.
|
||||
|
||||
# A detailed description of the process of compiling the list of genome
|
||||
# sequenced fungi with protein annotations and Mbp1 homologues is in the file
|
||||
# ./scripts/ABC-makeMYSPElist.R In brief, data for genome-sequenced fungi
|
||||
# was retrieved from https://fungi.ensembl.org; a search for homologues to
|
||||
# yeast Mbp1 was performed with BLAST at the NCBI, and the data was merged.
|
||||
# A representative organism at each genus-level was chosen from those hits
|
||||
# that actually have a homologue. Finally, a mapping table was constructed to
|
||||
# asymmetrically retrieve unique species: a student number will retrieve
|
||||
# a species, but (public) knowledge of the species cannot reconstruct the
|
||||
# student number.
|
||||
|
||||
# Task: Study ./scripts/ABC-makeMYSPElist.R, it implements a typical workflow
|
||||
# of selecting and combining data from various data resources. Studying
|
||||
# it will give you a better sense of how such workflows can be
|
||||
# implemented in practice.
|
||||
|
||||
|
||||
# = 3 ADOPT "MYSPE" =======================================================
|
||||
|
||||
# Execute:
|
||||
( MYSPE <- getMYSPE(myStudentNumber) )
|
||||
|
||||
# If this produced an error, this session has not been properly set up. You
|
||||
# may not yet have run init() and edited .myProfile.R , or that file is not
|
||||
# in your myScripts/ folder. Fix this, and execute:
|
||||
#
|
||||
# source(".Rprofile") .
|
||||
|
||||
# If this produced NA, your Student Number may not be correct, or you are not in
|
||||
# my class-list. Contact me. Otherwise, this should have printed a species name,
|
||||
# and the taxonomy ID of its genome-sequenced strain. This is your unique
|
||||
# species for this course. Note it in your journal ...
|
||||
|
||||
biCode(MYSPE) # and also note its "BiCode" ...
|
||||
( myTaxID <- names(MYSPE) ) # and its taxID
|
||||
|
||||
|
||||
# Task:
|
||||
# =====
|
||||
# Note down the species name and its five letter BiCode on your Student
|
||||
# Wiki user page. Use this species whenever this or future assignments refer
|
||||
# to MYSPE. Whenever you start a session, it will automatically be loaded
|
||||
# from myScripts/.myProfile.R and is available as MYSPE .
|
||||
|
||||
# Here is some more information about MYSPE, taken from the table of genome-
|
||||
# sequenced fungi that is in your ./data folder.
|
||||
# Load the table of genome-sequenced fungi and locate the row(s) whose
# taxonomy ID equals myTaxID.
fungiDat <- read.csv("data/Species.csv")
iMs <- which(fungiDat$Taxon.ID == myTaxID)

# The enclosing parentheses print each value as it is assigned.
( myOr <- fungiDat$Classification[iMs] )  # Taxonomic order
( myGn <- sub("\\s.*", "", MYSPE) )       # Taxonomic genus: the text of MYSPE
                                          # before its first whitespace
( mySt <- fungiDat$Name[iMs] )            # Taxonomic strain
|
||||
|
||||
# That's all.
|
||||
|
||||
|
||||
# = 4 FURTHER READING: PLOTTING PROPORTIONS ===============================
|
||||
|
||||
# The material below is an exploration of data-preparation and plotting
|
||||
# techniques; you can treat this as additional practice and further reading and
|
||||
# I expect that some of the code and plotting examples may be useful in a
|
||||
# different context.
|
||||
|
||||
# A frequent task is to visualize the proportion of elements with given
|
||||
# categories in a sample. For example, we might ask what the proportion of the
|
||||
# genome-sequenced fungi is that belong to the order of MYSPE. Let's collect the
|
||||
# numbers.
|
||||
|
||||
# Count the table rows that fall into each (nested) category. The taxon names
# are matched as literal substrings (fixed = TRUE), not as regular
# expressions: characters such as "." or "[" that can occur in names would
# otherwise be interpreted as regex syntax and distort the counts.
( nFungi   <- nrow(fungiDat) )                          # sequenced fungi
( nOrder   <- sum(grepl(myOr, fungiDat$Classification,
                        fixed = TRUE)) )                # same order as MYSPE
( nGenus   <- sum(grepl(myGn, fungiDat$Name,
                        fixed = TRUE)) )                # same genus as MYSPE
( nSpecies <- sum(grepl(MYSPE, fungiDat$Name,
                        fixed = TRUE)) )                # same species as MYSPE
|
||||
|
||||
|
||||
# == 4.1 Percentages =======================================================
|
||||
|
||||
# The zeroth-order approach to visualization is simply to print percentages:
|
||||
|
||||
# Percentage of all sequenced fungi that belong to the order of MYSPE.
cat(sprintf("\n%s comprise %5.2f%% of fungi.", myOr, (100 * nOrder) / nFungi))

# ... or, adding the actual numbers:
cat(sprintf("\n%s comprise %5.2f%% of fungi (%d of %d).",
            myOr, (100 * nOrder) / nFungi, nOrder, nFungi))
|
||||
|
||||
# But that's hard to visualize for most of us, and anyway, we don't know how
|
||||
# that relates to other orders.
|
||||
|
||||
# == 4.2 Visualizing proportions: Pie chart ================================
|
||||
|
||||
# Often, we will use a pie chart instead. Pie charts are rather informal types
|
||||
# of plots, not well suited for analysis. But easy to do:
|
||||
|
||||
# Define four colors to identify the four categories
|
||||
pCol <- c("#ed394e", "#ff9582", "#ffd5c4", "#f2f2f0")
|
||||
|
||||
oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))  # set margins to ~ 0 and remember
                                          # the previous setting

# The four categories are nested: species within genus within order within
# all fungi. Each wedge therefore shows only what a category adds to the
# category it contains, so that the wedges sum to nFungi.
pie(c(nSpecies,
      nGenus - nSpecies,   # rest of the genus
      nOrder - nGenus,     # rest of the order
      nFungi - nOrder),    # all other fungi
    labels = "",
    radius = 0.9,
    main = "MYSPE in genome-sequenced fungi",
    lty = 0,               # turn borders for wedges off
    col = pCol,
    clockwise = TRUE,
    init.angle = 90)

title(main = MYSPE, line = 0, cex.main = 0.7)  # add a title to the plot

legend(x = 0.95, y = 0.8,  # place the legend here
       legend = c("Species", "Genus", "Order", "Fungi"),
       y.intersp = 2,      # line spacing for labels
       cex = 0.8,          # character size for labels
       bty = "n",          # "no" box around the legend
       pt.cex = 2,         # size of colour boxes
       pch = 15,           # a filled square
       col = pCol)

par(oPar)                  # reset graphics state
|
||||
|
||||
# Unless MYSPE is one of the frequently sequenced species, there will only be a
|
||||
# very thin wedge visible. Pie charts are not well suited to visualize small
|
||||
# proportions.
|
||||
|
||||
# It is a little more useful if we have non-nested proportions - like the
|
||||
# number of species in the same order overall:
|
||||
|
||||
myTbl <- sort(table(fungiDat$Classification), decreasing = TRUE)
|
||||
head(myTbl)
|
||||
|
||||
# pie() does a reasonable job out of the box to interpret table() data:
|
||||
pie(myTbl)
|
||||
|
||||
# ... we can improve this quickly with a bit of tweaking:
|
||||
|
||||
N <- length(myTbl)
sel <- names(myTbl) == myOr  # TRUE for the MYSPE order, FALSE elsewhere

# Every wedge gets the pale colour pCol[4] and an empty label; only the wedge
# of the MYSPE order is drawn in pCol[1] and labelled with the order's name.
myCol <- replace(rep(pCol[4], N), sel, pCol[1])
myLbl <- replace(rep("", N), sel, myOr)

oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))  # set margins to ~ 0

pie(myTbl,
    labels = myLbl,
    col = myCol,
    border = "#DDDDDD",
    radius = 0.9,
    main = "MYSPE order",
    clockwise = TRUE,
    init.angle = 90)

par(oPar)  # reset graphics state
|
||||
|
||||
# But the overall problem remains.
|
||||
|
||||
|
||||
# == 4.3 Visualizing proportions: Nested squares ===========================
|
||||
|
||||
# A simple alternative is to draw such proportions as nested squares:
|
||||
|
||||
# Areas are proportional to counts: a category with n genomes is drawn as a
# square with side sqrt(n). x is the side of the square for all genomes.
x <- sqrt(nFungi)

# set margins to ~ 0 and type to square
oPar <- par(mar = rep(0.1, 4), pty = "s")

# empty, square plot
plot(c(0, x), c(0, x),
     xlim = c(0, x),
     ylim = c(0, x),
     type = "n",
     axes = FALSE,
     xlab = "",
     ylab = "")

# basic square for all genomes
rect(0, 0, x, x, col = pCol[4])

# grid: horizontal lines, then vertical lines, at every integer position.
# Each square on this grid is one genome.
u <- 0:floor(x)
N <- length(u)
segments(0, u, x, u, col = "#0000FF18")
segments(u, 0, u, x, col = "#0000FF18")

# colored squares, drawn from the largest (order) to the smallest (species)
# so that the smaller ones end up on top
rect(0, 0,
     sqrt(c(nOrder, nGenus, nSpecies)),
     sqrt(c(nOrder, nGenus, nSpecies)),
     col = pCol[3:1])

# labels
text(x / 2, x / 2, "Fungi")
text(x * 0.08, x * 0.11, myOr, pos = 4, cex = 0.9)
text(x * 0.08, x * 0.06, myGn, pos = 4, cex = 0.8)
text(x * 0.08, x * 0.02, MYSPE, pos = 4, cex = 0.7)

par(oPar)  # reset graphics state
|
||||
|
||||
|
||||
# == 4.4 Visualizing proportions: Sankey diagrams ==========================
|
||||
|
||||
# Sankey diagrams are an excellent way to visualize complicated nested
|
||||
# proportions and their changes (see here for example:
|
||||
# https://www.r-graph-gallery.com/sankey-diagram.html). Here is a very simple
|
||||
# example with the MYSPE proportions, as an illustration of the plotting
|
||||
# principle.
|
||||
|
||||
# Install plotly only if it is not yet available. quietly = TRUE suppresses
# the loading messages, as in the other install guards of this course.
if (! requireNamespace("plotly", quietly = TRUE)) {
  install.packages("plotly")
}
|
||||
# Package information:
|
||||
# library(help = plotly) # basic information
|
||||
# browseVignettes("plotly") # available vignettes
|
||||
# data(package = "plotly") # available datasets
|
||||
|
||||
# Here, we use the plotly package that wraps a very well developed javascript
|
||||
# library with many options for interactive plots. I am producing this plot
|
||||
# hard-coded for the sample organism "Sporothrix schenckii"; you would need
|
||||
# to change the code to adapt it to your own MYSPE - or even build a function
|
||||
# for this. Do try this if you have a bit of coding experience, sankey diagrams
|
||||
# are a good way to show hierarchical data relations - and if you get this
|
||||
# working for your own organism you can be proud that you have understood
|
||||
# how preparing the data works.
|
||||
|
||||
|
||||
# Node definitions for the sankey diagram. The position of a node in these
# vectors, counted from 0, is the node ID that myLinks refers to.
myNodes <- list(
  label = c("Fungi (1014)",              # node 0
            "Ophiostomatales (6)",       # node 1
            "Other...",                  # node 2
            "Sporothrix (4)",            # node 3
            "Other...",                  # node 4
            "Sporothrix schenckii (2)",  # node 5
            "Other..."),                 # node 6
  x = c(0.1, 0.4, 0.4, 0.7, 0.7, 1.0, 1.0),
  y = c(0.3, 0.1, 0.7, 0.2, 0.7, 0.3, 0.7),
  color = c("#f2f2f0", "#ffd5c4", "#CCCCCC", "#ff9582",
            "#CCCCCC", "#ed394e", "#CCCCCC"),
  pad = 15,
  thickness = 20,
  line = list(color = "black", width = 0.5)
)
|
||||
|
||||
# Link definitions: read the three vectors column-wise. The first column, for
# example, defines a link of weight 6 between node 0 and node 1.
myLinks <- list(
  source = c(0, 0, 1, 1, 3, 3),
  target = c(1, 2, 3, 4, 5, 6),
  value = c(6, 18, 4, 2, 2, 2)
)
|
||||
|
||||
# Setting up the actual plot ...
|
||||
# Build the sankey figure from myNodes and myLinks, then add a title and set
# the font size in the layout.
fig <- plotly::layout(
  plotly::plot_ly(type = "sankey",
                  arrangement = "snap",
                  orientation = "h",
                  node = myNodes,
                  link = myLinks),
  title = "Fungi Genomes - Classification",
  font = list(size = 10)
)

fig  # plot the diagram
|
||||
|
||||
# Note that the plot appears in the Viewer window, not the Plot window, and that
|
||||
# it is interactive: you can hover over nodes and links, and drag the nodes
|
||||
# around.
|
||||
|
||||
# [END]
|
||||
# tocID <- "BIN-MYSPE.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-MYSPE unit
|
||||
#
|
||||
#
|
||||
# Version: 1.4
|
||||
#
|
||||
# Date: 2017-09 - 2021-10
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# V 1.4 Add troubleshooting hints via errText[[...]]
|
||||
# V 1.3 2021 update of MYSPE mechanics; fix a bug no one had complained about
|
||||
# V 1.2 Reorganized proportional plot section into a "further reading"
|
||||
# section, added nested-box, and sankey plot visualization of
|
||||
# proportions. Introduced plotly.
|
||||
# V 1.1 2020 Workflow changes
|
||||
# V 1.0.1 Move ABC-makeMYSPElist.R to ./scripts directory
|
||||
# V 1.0 Final code, after rewriting BLAST parser and updating MYSPElist
|
||||
# V 0.1 First code copied from BCH441_A03_makeMYSPElist.R
|
||||
#
|
||||
# TODO: Sample solution for sankey plot function.
|
||||
#
|
||||
#
|
||||
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
|
||||
#
|
||||
# DO NOT SIMPLY source() THESE FILES!
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -----------------------------------------------------------------
|
||||
#TOC> 1 PREPARATIONS 52
|
||||
#TOC> 2 SUITABLE MYSPE SPECIES 65
|
||||
#TOC> 3 ADOPT "MYSPE" 89
|
||||
#TOC> 4 FURTHER READING: PLOTTING PROPORTIONS 128
|
||||
#TOC> 4.1 Percentages 146
|
||||
#TOC> 4.2 Visualizing proportions: Pie chart 165
|
||||
#TOC> 4.3 Visualizing proportions: Nested squares 243
|
||||
#TOC> 4.4 Visualizing proportions: Sankey diagrams 280
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 PREPARATIONS ========================================================
|
||||
#
|
||||
|
||||
# Execute the two conditionals below:
|
||||
if (! file.exists("./myScripts/.myProfile.R")) {
|
||||
stop(errText[["noProfileFile"]]) # message defined in .Rprofile
|
||||
}
|
||||
|
||||
if (! exists("myStudentNumber")) {
|
||||
stop(errText[["noStudentNumber"]]) # message defined in .Rprofile
|
||||
}
|
||||
|
||||
|
||||
# = 2 SUITABLE MYSPE SPECIES ==============================================
|
||||
|
||||
|
||||
# In this unit we will select one species from a list of genome sequenced fungi
|
||||
# and write it into your personalized profile file. This species will be called
|
||||
# "MYSPE" (My Species) for other learning units and exercises.
|
||||
|
||||
# A detailed description of the process of compiling the list of genome
|
||||
# sequenced fungi with protein annotations and Mbp1 homologues is in the file
|
||||
# ./scripts/ABC-makeMYSPElist.R In brief, data for genome-sequenced fungi
|
||||
# was retrieved from https://fungi.ensembl.org; a search for homologues to
|
||||
# yeast Mbp1 was performed with BLAST at the NCBI, and the data was merged.
|
||||
# A representative organism at each genus-level was chosen from those hits
|
||||
# that actually have a homologue. Finally, a mapping table was constructed to
|
||||
# asymmetrically retrieve unique species: a student number will retrieve
|
||||
# a species, but (public) knowledge of the species cannot reconstruct the
|
||||
# student number.
|
||||
|
||||
# Task: Study ./scripts/ABC-makeMYSPElist.R, it implements a typical workflow
|
||||
# of selecting and combining data from various data resources. Studying
|
||||
# it will give you a better sense of how such workflows can be
|
||||
# implemented in practice.
|
||||
|
||||
|
||||
# = 3 ADOPT "MYSPE" =======================================================
|
||||
|
||||
# Execute:
|
||||
( MYSPE <- getMYSPE(myStudentNumber) )
|
||||
|
||||
# If this produced an error, this session has not been properly set up. You
|
||||
# may not yet have run init() and edited .myProfile.R , or that file is not
|
||||
# in your myScripts/ folder. Fix this, and execute:
|
||||
#
|
||||
# source(".Rprofile") .
|
||||
|
||||
# If this produced NA, your Student Number may not be correct, or you are not in
|
||||
# my class-list. Contact me. Otherwise, this should have printed a species name,
|
||||
# and the taxonomy ID of its genome-sequenced strain. This is your unique
|
||||
# species for this course. Note it in your journal ...
|
||||
|
||||
biCode(MYSPE) # and also note its "BiCode" ...
|
||||
( myTaxID <- names(MYSPE) ) # and its taxID
|
||||
|
||||
|
||||
# Task:
|
||||
# =====
|
||||
# Note down the species name and its five letter BiCode on your Student
|
||||
# Wiki user page. Use this species whenever this or future assignments refer
|
||||
# to MYSPE. Whenever you start a session, it will automatically be loaded
|
||||
# from myScripts/.myProfile.R and is available as MYSPE .
|
||||
|
||||
# Here is some more information about MYSPE, taken from the table of genome-
|
||||
# sequenced fungi that is in your ./data folder.
|
||||
fungiDat <- read.csv("data/Species.csv")
|
||||
iMs <- which(fungiDat$Taxon.ID == myTaxID)
|
||||
|
||||
( myOr <- fungiDat$Classification[iMs] ) # Taxonomic order
|
||||
( myGn <- gsub("\\s.*", "", MYSPE)) # Taxonomic genus
|
||||
( mySt <- fungiDat$Name[iMs] ) # Taxonomic strain
|
||||
|
||||
# That's all.
|
||||
|
||||
|
||||
# = 4 FURTHER READING: PLOTTING PROPORTIONS ===============================
|
||||
|
||||
# The material below is an exploration of data-preparation and plotting
|
||||
# techniques; you can treat this as additional practice and further reading and
|
||||
# I expect that some of the code and plotting examples may be useful in a
|
||||
# different context.
|
||||
|
||||
# A frequent task is to visualize the proportion of elements with given
|
||||
# categories in a sample. For example, we might ask what the proportion of the
|
||||
# genome-sequenced fungi in the order of MYSPE is? Let's first collect the
|
||||
# numbers.
|
||||
|
||||
( nFungi <- nrow(fungiDat) ) # sequenced fungi
|
||||
( nOrder <- sum(grepl(myOr, fungiDat$Classification)) ) # same order as MYSPE
|
||||
( nGenus <- sum(grepl(myGn, fungiDat$Name)) ) # same genus as MYSPE
|
||||
( nSpecies <- sum(grepl(MYSPE, fungiDat$Name)) ) # same species as MYSPE
|
||||
|
||||
|
||||
# == 4.1 Percentages =======================================================
|
||||
|
||||
# The zeroth-order approach to visualization is simply to print percentages:
|
||||
|
||||
cat(sprintf("\n%s comprise %5.2f%% of fungi.",
|
||||
myOr,
|
||||
(nOrder * 100) / nFungi))
|
||||
|
||||
# ... or, adding the actual numbers:
|
||||
|
||||
cat(sprintf("\n%s comprise %5.2f%% of fungi (%d of %d).",
|
||||
myOr,
|
||||
(nOrder * 100) / nFungi,
|
||||
nOrder,
|
||||
nFungi))
|
||||
|
||||
# But that's hard to visualize for most of us, and anyway, we don't know how
|
||||
# that relates to other orders.
|
||||
|
||||
# == 4.2 Visualizing proportions: Pie chart ================================
|
||||
|
||||
# Often, we will use a pie chart instead. Pie charts are rather informal types
|
||||
# of plots, not well suited for analysis. But easy to do:
|
||||
|
||||
# Define four colors to identify the four categories
|
||||
pCol <- c("#ed394e", "#ff9582", "#ffd5c4", "#f2f2f0")
|
||||
|
||||
oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1)) # set margins to ~ 0
|
||||
# and remember the
|
||||
# previous setting
|
||||
|
||||
# Draw the four nested categories as wedges of one pie. The categories are
# contained in each other (species in genus in order in all fungi), so the size
# of each wedge is the size of its category minus the ONE category directly
# inside it. That way the four wedges add up to nFungi and none of them can
# become negative (pie() stops with an error on negative values).
pie(c(nSpecies,                        # MYSPE itself
      nGenus - nSpecies,               # rest of the genus
      nOrder - nGenus,                 # rest of the order
      nFungi - nOrder),                # all other fungi
    labels = "",
    radius = 0.9,
    main = "MYSPE in genome-sequenced fungi",
    lty = 0,                           # turn borders for wedges off
    col = pCol,
    clockwise = TRUE,
    init.angle = 90)
|
||||
|
||||
title(main=MYSPE, line=0, cex.main=0.7) # add a title to the plot
|
||||
|
||||
legend(x = 0.95, y = 0.8, # place the legend here
|
||||
legend = c("Species", "Genus", "Order", "Fungi"),
|
||||
y.intersp = 2, # line spacing for labels
|
||||
cex = 0.8, # character size for labels
|
||||
bty = "n", # "no" box around the legend
|
||||
pt.cex = 2, # size of colour boxes
|
||||
pch = 15, # a filled square
|
||||
col = pCol)
|
||||
|
||||
par(oPar) # reset graphics state
|
||||
|
||||
# Unless MYSPE is one of the frequently sequenced species, there will only be a
|
||||
# very thin wedge visible. Pie charts are not well suited to visualize small
|
||||
# proportions.
|
||||
|
||||
# It is a little more useful if we have non-nested proportions - like the
|
||||
# number of species in the same order overall:
|
||||
|
||||
myTbl <- sort(table(fungiDat$Classification), decreasing = TRUE)
|
||||
head(myTbl)
|
||||
|
||||
# pie() does a reasonable job out of the box to interpret table() data:
|
||||
pie(myTbl)
|
||||
|
||||
# ... we can improve this quickly with a bit of tweaking:
|
||||
|
||||
N <- length(myTbl)
|
||||
sel <- myOr == names(myTbl) # TRUE for the MYSPE order, FALSE elsewhere
|
||||
|
||||
myCol <- rep(pCol[4], N) # N elements of pCol[4]
|
||||
myCol[sel] <- pCol[1] # replace this one color
|
||||
|
||||
myLbl <- rep("", N) # N labels of ""
|
||||
myLbl[sel] <- myOr # replace this one label with the MYSPE order
|
||||
|
||||
|
||||
oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1)) # set margins to ~ 0
|
||||
|
||||
pie(myTbl,
|
||||
labels = myLbl,
|
||||
radius = 0.9,
|
||||
main = "MYSPE order",
|
||||
border = "#DDDDDD",
|
||||
col = myCol,
|
||||
clockwise = TRUE,
|
||||
init.angle = 90)
|
||||
|
||||
par(oPar) # reset graphics state
|
||||
|
||||
# But the overall problem remains.
|
||||
|
||||
|
||||
# == 4.3 Visualizing proportions: Nested squares ===========================
|
||||
|
||||
# A simple alternative is to draw such proportions as nested squares:
|
||||
|
||||
x <- sqrt(nFungi)
|
||||
|
||||
# set margins to ~ 0 and type to square
|
||||
oPar <- par(mar = c(0.1, 0.1, 0.1, 0.1), pty = "s")
|
||||
|
||||
# empty, square plot
|
||||
plot(c(0, x), c(0, x), xlim = c(0, x), ylim = c(0, x),
|
||||
type="n", axes=FALSE, xlab="", ylab="")
|
||||
|
||||
# basic square for all genomes
|
||||
rect(0, 0, x, x, col = pCol[4])
|
||||
|
||||
# grid
|
||||
u <- 0:floor(x)
|
||||
N <- length(u)
|
||||
segments(rep(0, N), u, rep(x, N), u, col = "#0000FF18")
|
||||
segments(u, rep(0, N), u, rep(x, N), col = "#0000FF18")
|
||||
# each square on this grid is one genome
|
||||
|
||||
# colored squares
|
||||
rect(0, 0, sqrt(nOrder), sqrt(nOrder), col = pCol[3])
|
||||
rect(0, 0, sqrt(nGenus), sqrt(nGenus), col = pCol[2])
|
||||
rect(0, 0, sqrt(nSpecies), sqrt(nSpecies), col = pCol[1])
|
||||
|
||||
# labels
|
||||
text(x/2, x/2, "Fungi")
|
||||
text(x * 0.08, x * 0.11, myOr, pos = 4, cex = 0.9)
|
||||
text(x * 0.08, x * 0.06, myGn, pos = 4, cex = 0.8)
|
||||
text(x * 0.08, x * 0.02, MYSPE, pos = 4, cex = 0.7)
|
||||
|
||||
par(oPar) # reset graphics state
|
||||
|
||||
|
||||
# == 4.4 Visualizing proportions: Sankey diagrams ==========================
|
||||
|
||||
# Sankey diagrams are an excellent way to visualize complicated nested
|
||||
# proportions and their changes (see here for example:
|
||||
# https://www.r-graph-gallery.com/sankey-diagram.html). Here is a very simple
|
||||
# example with the MYSPE proportions, as an illustration of the plotting
|
||||
# principle.
|
||||
|
||||
# Install plotly only if it is not available yet. quietly = TRUE suppresses the
# "Loading required namespace" message, like the other package checks do.
if (! requireNamespace("plotly", quietly = TRUE)) {
  install.packages("plotly")
}
|
||||
# Package information:
|
||||
# library(help = plotly) # basic information
|
||||
# browseVignettes("plotly") # available vignettes
|
||||
# data(package = "plotly") # available datasets
|
||||
|
||||
# Here, we use the plotly package that wraps a very well developed javascript
|
||||
# library with many options for interactive plots. I am producing this plot
|
||||
# hard-coded for the sample organism "Sporothrix schenckii"; you would need
|
||||
# to change the code to adapt it to your own MYSPE - or even build a function
|
||||
# for this. Do try this if you have a bit of coding experience, sankey diagrams
|
||||
# are a good way to show hierarchical data relations - and if you get this
|
||||
# working for your own organism you can be proud that you have understood
|
||||
# how preparing the data works.
|
||||
|
||||
|
||||
myNodes <- list(label = c("Fungi (1014)", # 0 <- node ID
|
||||
"Ophiostomatales (6)", # 1
|
||||
"Other...", # 2
|
||||
"Sporothrix (4)", # 3
|
||||
"Other...", # 4
|
||||
"Sporothrix schenckii (2)", # 5
|
||||
"Other..." # 6
|
||||
),
|
||||
x = c(0.1, 0.4, 0.4, 0.7, 0.7, 1.0, 1.0),
|
||||
y = c(0.3, 0.1, 0.7, 0.2, 0.7, 0.3, 0.7),
|
||||
color = c("#f2f2f0", #
|
||||
"#ffd5c4",
|
||||
"#CCCCCC",
|
||||
"#ff9582",
|
||||
"#CCCCCC",
|
||||
"#ed394e",
|
||||
"#CCCCCC"
|
||||
),
|
||||
pad = 15,
|
||||
thickness = 20,
|
||||
line = list(color = "black",
|
||||
width = 0.5))
|
||||
|
||||
myLinks <- list(source = c(0, 0, 1, 1, 3, 3), # i.e. there is a link of
|
||||
target = c(1, 2, 3, 4, 5, 6), # weight 6 between node 0
|
||||
value = c(6, 18, 4, 2, 2, 2)) # and node 1
|
||||
|
||||
# Setting up the actual plot ...
|
||||
fig <- plotly::plot_ly(type = "sankey",
|
||||
arrangement = "snap",
|
||||
orientation = "h",
|
||||
node = myNodes,
|
||||
link = myLinks)
|
||||
|
||||
# Adding and adjusting a few layout parameters
|
||||
fig <- plotly::layout(fig,
|
||||
title = "Fungi Genomes - Classification",
|
||||
font = list(size = 10))
|
||||
|
||||
fig # plot the diagram
|
||||
|
||||
# Note that the plot appears in the Viewer window, not the Plot window, and that
|
||||
# it is interactive: you can hover over nodes and links, and drag the nodes
|
||||
# around.
|
||||
|
||||
# [END]
|
||||
|
@ -1,234 +1,234 @@
|
||||
# tocID <- "BIN-PHYLO-Data_preparation.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-PHYLO-Data_preparation unit.
|
||||
#
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017-10 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 2020 Maintenance
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# use Biocmanager:: not biocLite()
|
||||
# 1.0 First 2017 version
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------------------------
|
||||
#TOC> 1 Preparations 45
|
||||
#TOC> 2 Fetching sequences 77
|
||||
#TOC> 3 Multiple Sequence Alignment 118
|
||||
#TOC> 4 Reviewing and Editing Alignments 137
|
||||
#TOC> 4.1 Masking workflow 153
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Preparations ========================================================
|
||||
|
||||
|
||||
# You need to reload your protein database, including changes that might have
|
||||
# been made to the reference files. If you have worked with the prerequiste
|
||||
# units, you should have a script named "makeProteinDB.R" that will create the
|
||||
# myDB object with a protein and feature database. Ask for advice if not.
|
||||
source("myScripts/makeProteinDB.R")
|
||||
|
||||
# Load packages we need
|
||||
|
||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
if (! requireNamespace("Biostrings", quietly = TRUE)) {
|
||||
BiocManager::install("Biostrings")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = Biostrings) # basic information
|
||||
# browseVignettes("Biostrings") # available vignettes
|
||||
# data(package = "Biostrings") # available datasets
|
||||
|
||||
|
||||
if (! requireNamespace("msa", quietly = TRUE)) {
|
||||
BiocManager::install("msa")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = msa) # basic information
|
||||
# browseVignettes("msa") # available vignettes
|
||||
# data(package = "msa") # available datasets
|
||||
|
||||
|
||||
# = 2 Fetching sequences ==================================================
|
||||
|
||||
|
||||
# myDB contains the ten Mbp1 orthologues from the reference species and the Mbp1
|
||||
# RBM for MYSPE. We will construct a phylogenetic tree from the proteins' APSES
|
||||
# domains. You have annotated their ranges as a feature. The following code
|
||||
# retrieves the sequences from myDB. You have seen similar code in other units.
|
||||
|
||||
sel <- grep("^MBP1_", myDB$protein$name)
|
||||
(proNames <- myDB$protein$name[sel])
|
||||
(proIDs <- myDB$protein$ID[sel])
|
||||
|
||||
(sel <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
|
||||
(fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID %in% proIDs & # %in% !
|
||||
myDB$annotation$featureID == sel]) # == !
|
||||
# Why?
|
||||
APSI <- character(length(fanIDs))
|
||||
|
||||
for (i in seq_along(fanIDs)) {
|
||||
sel <- myDB$annotation$ID == fanIDs[i] # get the feature row index
|
||||
proID <- myDB$annotation$proteinID[sel] # get its protein ID
|
||||
start <- myDB$annotation$start[sel] # get start ...
|
||||
end <- myDB$annotation$end[sel] # ... and end
|
||||
|
||||
sel <- myDB$protein$ID == proID # get the protein row index ...
|
||||
# ... and the sequence
|
||||
APSI[i] <- substring(myDB$protein$sequence[sel], start, end)
|
||||
names(APSI)[i] <- (myDB$protein$name[sel])
|
||||
}
|
||||
|
||||
head(APSI)
|
||||
|
||||
# Let's add the E.coli Kila-N domain sequence as an outgroup, for rooting our
|
||||
# phylogenetic tree (see the unit's Wiki page for details on the sequence).
|
||||
|
||||
APSI <- c(APSI,
|
||||
"IDGEIIHLRAKDGYINATSMCRTAGKLLSDYTRLKTTQEFFDELSRDMGIPISELIQSFKGGRPENQGTWVHPDIAINLAQ")
|
||||
names(APSI)[length(APSI)] <- "KILA_ESCCO"
|
||||
tail(APSI)
|
||||
|
||||
|
||||
# = 3 Multiple Sequence Alignment =========================================
|
||||
|
||||
# This vector of sequences with named elements fulfills the requirements to be
|
||||
# imported as a Biostrings object - an AAStringSet - which we need as input for
|
||||
# the MSA algorithms in Biostrings.
|
||||
#
|
||||
|
||||
APSESSet <- Biostrings::AAStringSet(APSI)
|
||||
APSESMsa <- msa::msaMuscle(APSESSet, order = "aligned")
|
||||
|
||||
# Nb. msaMuscle() sometimes fails - reproducibly, but I am not sure why. If
|
||||
# that happens in your case, just use msaClustalOmega() instead.
|
||||
|
||||
# inspect the alignment.
|
||||
writeALN(APSESMsa)
|
||||
|
||||
# What do you think? Is this a good alignment for phylogenetic inference?
|
||||
|
||||
|
||||
# = 4 Reviewing and Editing Alignments ====================================
|
||||
|
||||
|
||||
# Head back to the Wiki page for this unit and read up on the background
|
||||
# first.
|
||||
|
||||
# Let's mask out all columns that have observations for
|
||||
# less than 1/3 of the sequences in the dataset. This
|
||||
# means they have more than round(nrow(msaSet) * (2/3))
|
||||
# hyphens in a column.
|
||||
#
|
||||
# We take all sequences, split them into single
|
||||
# characters, and put them into a matrix. Then we
|
||||
# go through the matrix, column by column and decide
|
||||
# whether we want to include that column.
|
||||
|
||||
# == 4.1 Masking workflow ==================================================
|
||||
|
||||
# get the length of the alignment
|
||||
(lenAli <- APSESMsa@unmasked@ranges@width[1])
|
||||
|
||||
# initialize a matrix that can hold all characters
|
||||
# individually
|
||||
msaMatrix <- matrix(character(nrow(APSESMsa) * lenAli),
|
||||
ncol = lenAli)
|
||||
|
||||
# assign the correct rownames
|
||||
rownames(msaMatrix) <- APSESMsa@unmasked@ranges@NAMES
|
||||
for (i in 1:nrow(APSESMsa)) {
|
||||
msaMatrix[i, ] <- unlist(strsplit(as.character(APSESMsa@unmasked[i]), ""))
|
||||
}
|
||||
|
||||
# inspect the result
|
||||
msaMatrix[1:7, 30:40]
|
||||
|
||||
# Now let's make a logical vector with an element for each column that selects
|
||||
# which columns should be masked out.
|
||||
|
||||
# The number of hyphens in a column is easy to count. Consider:
|
||||
|
||||
msaMatrix[ , 20] # column 20
|
||||
msaMatrix[ , 20] == "-" # TRUE for all gap characters
|
||||
sum(msaMatrix[ , 20] == "-") # adds 1 for each TRUE
|
||||
|
||||
# Thus filling our logical vector is simple:
|
||||
|
||||
# initialize a mask
|
||||
colMask <- logical(ncol(msaMatrix))
|
||||
|
||||
# define the threshold for rejecting a column
|
||||
limit <- round(nrow(APSESMsa) * (2/3))
|
||||
|
||||
# iterate over all columns, and write TRUE if there are less-or-equal to "limit"
|
||||
# hyphens, FALSE if there are more - i.e. TRUE columns will be used for analysis
|
||||
# and FALSE columns will be rejected.
|
||||
for (i in 1:ncol(msaMatrix)) {
|
||||
count <- sum(msaMatrix[ , i] == "-")
|
||||
colMask[i] <- count <= limit # TRUE if less-or-equal to limit, FALSE if not
|
||||
}
|
||||
|
||||
# Inspect the mask
|
||||
colMask
|
||||
|
||||
# How many positions are being kept?
|
||||
sum(colMask)
|
||||
|
||||
cat(sprintf("We are masking %4.2f %% of alignment columns.\n",
|
||||
100 * (1 - (sum(colMask) / length(colMask)))))
|
||||
|
||||
|
||||
# Next, we use colMask to remove the masked columns from the matrix
|
||||
# in one step:
|
||||
maskedMatrix <- msaMatrix[ , colMask]
|
||||
|
||||
# check:
|
||||
ncol(maskedMatrix)
|
||||
|
||||
# ... then collapse each row of single characters back into a string ...
|
||||
APSESphyloSet <- character()
|
||||
for (i in 1:nrow(maskedMatrix)) {
|
||||
APSESphyloSet[i] <- paste(maskedMatrix[i, ], collapse="")
|
||||
}
|
||||
names(APSESphyloSet) <- rownames(maskedMatrix)
|
||||
|
||||
# inspect ...
|
||||
writeALN(APSESphyloSet)
|
||||
|
||||
# As you see, we have removed a three residue insertion from MBP1_NEUCR, and
|
||||
# several indels from the KILA_ESCCO outgroup sequence.
|
||||
|
||||
|
||||
# We save the aligned, masked domains to a file in the data/ directory,
|
||||
# in multi-FASTA format.
|
||||
writeMFA(APSESphyloSet, myCon = "data/APSESphyloSet.mfa")
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "BIN-PHYLO-Data_preparation.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-PHYLO-Data_preparation unit.
|
||||
#
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017-10 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 2020 Maintenance
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# use BiocManager:: not biocLite()
|
||||
# 1.0 First 2017 version
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------------------------
|
||||
#TOC> 1 Preparations 45
|
||||
#TOC> 2 Fetching sequences 77
|
||||
#TOC> 3 Multiple Sequence Alignment 118
|
||||
#TOC> 4 Reviewing and Editing Alignments 137
|
||||
#TOC> 4.1 Masking workflow 153
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Preparations ========================================================
|
||||
|
||||
|
||||
# You need to reload your protein database, including changes that might have
|
||||
# been made to the reference files. If you have worked with the prerequisite
|
||||
# units, you should have a script named "makeProteinDB.R" that will create the
|
||||
# myDB object with a protein and feature database. Ask for advice if not.
|
||||
source("myScripts/makeProteinDB.R")
|
||||
|
||||
# Load packages we need
|
||||
|
||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
if (! requireNamespace("Biostrings", quietly = TRUE)) {
|
||||
BiocManager::install("Biostrings")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = Biostrings) # basic information
|
||||
# browseVignettes("Biostrings") # available vignettes
|
||||
# data(package = "Biostrings") # available datasets
|
||||
|
||||
|
||||
if (! requireNamespace("msa", quietly = TRUE)) {
|
||||
BiocManager::install("msa")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = msa) # basic information
|
||||
# browseVignettes("msa") # available vignettes
|
||||
# data(package = "msa") # available datasets
|
||||
|
||||
|
||||
# = 2 Fetching sequences ==================================================
|
||||
|
||||
|
||||
# myDB contains the ten Mbp1 orthologues from the reference species and the Mbp1
|
||||
# RBM for MYSPE. We will construct a phylogenetic tree from the proteins' APSES
|
||||
# domains. You have annotated their ranges as a feature. The following code
|
||||
# retrieves the sequences from myDB. You have seen similar code in other units.
|
||||
|
||||
sel <- grep("^MBP1_", myDB$protein$name)
|
||||
(proNames <- myDB$protein$name[sel])
|
||||
(proIDs <- myDB$protein$ID[sel])
|
||||
|
||||
(sel <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
|
||||
(fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID %in% proIDs & # %in% !
|
||||
myDB$annotation$featureID == sel]) # == !
|
||||
# Why?
|
||||
APSI <- character(length(fanIDs))
|
||||
|
||||
for (i in seq_along(fanIDs)) {
|
||||
sel <- myDB$annotation$ID == fanIDs[i] # get the feature row index
|
||||
proID <- myDB$annotation$proteinID[sel] # get its protein ID
|
||||
start <- myDB$annotation$start[sel] # get start ...
|
||||
end <- myDB$annotation$end[sel] # ... and end
|
||||
|
||||
sel <- myDB$protein$ID == proID # get the protein row index ...
|
||||
# ... and the sequence
|
||||
APSI[i] <- substring(myDB$protein$sequence[sel], start, end)
|
||||
names(APSI)[i] <- (myDB$protein$name[sel])
|
||||
}
|
||||
|
||||
head(APSI)
|
||||
|
||||
# Let's add the E.coli Kila-N domain sequence as an outgroup, for rooting our
|
||||
# phylogenetic tree (see the unit's Wiki page for details on the sequence).
|
||||
|
||||
APSI <- c(APSI,
|
||||
"IDGEIIHLRAKDGYINATSMCRTAGKLLSDYTRLKTTQEFFDELSRDMGIPISELIQSFKGGRPENQGTWVHPDIAINLAQ")
|
||||
names(APSI)[length(APSI)] <- "KILA_ESCCO"
|
||||
tail(APSI)
|
||||
|
||||
|
||||
# = 3 Multiple Sequence Alignment =========================================
|
||||
|
||||
# This vector of sequences with named elements fulfills the requirements to be
|
||||
# imported as a Biostrings object - an AAStringSet - which we need as input for
|
||||
# the MSA algorithms in the msa package.
|
||||
#
|
||||
|
||||
APSESSet <- Biostrings::AAStringSet(APSI)
|
||||
APSESMsa <- msa::msaMuscle(APSESSet, order = "aligned")
|
||||
|
||||
# Nb. msaMuscle() sometimes fails - reproducibly, but I am not sure why. If
|
||||
# that happens in your case, just use msaClustalOmega() instead.
|
||||
|
||||
# inspect the alignment.
|
||||
writeALN(APSESMsa)
|
||||
|
||||
# What do you think? Is this a good alignment for phylogenetic inference?
|
||||
|
||||
|
||||
# = 4 Reviewing and Editing Alignments ====================================
|
||||
|
||||
|
||||
# Head back to the Wiki page for this unit and read up on the background
|
||||
# first.
|
||||
|
||||
# Let's mask out all columns that have observations for
|
||||
# less than 1/3 of the sequences in the dataset. This
|
||||
# means they have more than round(nrow(APSESMsa) * (2/3))
|
||||
# hyphens in a column.
|
||||
#
|
||||
# We take all sequences, split them into single
|
||||
# characters, and put them into a matrix. Then we
|
||||
# go through the matrix, column by column and decide
|
||||
# whether we want to include that column.
|
||||
|
||||
# == 4.1 Masking workflow ==================================================
|
||||
|
||||
# get the length of the alignment
|
||||
(lenAli <- APSESMsa@unmasked@ranges@width[1])
|
||||
|
||||
# initialize a matrix that can hold all characters
|
||||
# individually
|
||||
msaMatrix <- matrix(character(nrow(APSESMsa) * lenAli),
|
||||
ncol = lenAli)
|
||||
|
||||
# assign the correct rownames
|
||||
rownames(msaMatrix) <- APSESMsa@unmasked@ranges@NAMES
|
||||
for (i in 1:nrow(APSESMsa)) {
|
||||
msaMatrix[i, ] <- unlist(strsplit(as.character(APSESMsa@unmasked[i]), ""))
|
||||
}
|
||||
|
||||
# inspect the result
|
||||
msaMatrix[1:7, 30:40]
|
||||
|
||||
# Now let's make a logical vector with an element for each column that selects
|
||||
# which columns should be masked out.
|
||||
|
||||
# The number of hyphens in a column is easy to count. Consider:
|
||||
|
||||
msaMatrix[ , 20] # column 20
|
||||
msaMatrix[ , 20] == "-" # TRUE for all gap characters
|
||||
sum(msaMatrix[ , 20] == "-") # adds 1 for each TRUE
|
||||
|
||||
# Thus filling our logical vector is simple:
|
||||
|
||||
# initialize a mask
|
||||
colMask <- logical(ncol(msaMatrix))
|
||||
|
||||
# define the threshold for rejecting a column
|
||||
limit <- round(nrow(APSESMsa) * (2/3))
|
||||
|
||||
# Count the hyphens in all columns at once: comparing the matrix to "-" gives a
# logical matrix, and colSums() adds up the TRUEs of each column. We write TRUE
# if there are less-or-equal to "limit" hyphens, FALSE if there are more - i.e.
# TRUE columns will be used for analysis and FALSE columns will be rejected.
# This also works for a matrix with zero columns, where 1:ncol() would fail.
colMask <- colSums(msaMatrix == "-") <= limit
|
||||
|
||||
# Inspect the mask
|
||||
colMask
|
||||
|
||||
# How many positions are being kept?
|
||||
sum(colMask)
|
||||
|
||||
cat(sprintf("We are masking %4.2f %% of alignment columns.\n",
|
||||
100 * (1 - (sum(colMask) / length(colMask)))))
|
||||
|
||||
|
||||
# Next, we use colMask to remove the masked columns from the matrix
|
||||
# in one step:
|
||||
# drop = FALSE keeps the result a matrix (with its rownames) even if only a
# single column passes the mask; otherwise R would collapse it into a vector.
maskedMatrix <- msaMatrix[ , colMask, drop = FALSE]
|
||||
|
||||
# check:
|
||||
ncol(maskedMatrix)
|
||||
|
||||
# ... then collapse each row of single characters back into a string ...
|
||||
# Paste the single characters of each row together into one string per
# sequence. vapply() guarantees a character vector with one element per row,
# and seq_len() is safe even if the matrix has no rows.
APSESphyloSet <- vapply(seq_len(nrow(maskedMatrix)),
                        function(i) paste(maskedMatrix[i, ], collapse = ""),
                        character(1))

# carry the sequence names over from the matrix
names(APSESphyloSet) <- rownames(maskedMatrix)
|
||||
|
||||
# inspect ...
|
||||
writeALN(APSESphyloSet)
|
||||
|
||||
# As you see, we have removed a three residue insertion from MBP1_NEUCR, and
|
||||
# several indels from the KILA_ESCCO outgroup sequence.
|
||||
|
||||
|
||||
# We save the aligned, masked domains to a file in the data/ directory,
|
||||
# in multi-FASTA format.
|
||||
writeMFA(APSESphyloSet, myCon = "data/APSESphyloSet.mfa")
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
|
@ -1,406 +1,406 @@
|
||||
# tocID <- "BIN-PHYLO-Tree_analysis.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-PHYLO-Tree_analysis unit.
|
||||
#
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017-10 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 2020 updates. Deprecate iTol and use taxize:: instead.
|
||||
# Rewrite of tip re-ordering. Better handling of
|
||||
# messages. pBar() for randomization.
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# use BiocManager:: not biocLite()
|
||||
# 1.0.2 Typo in variable name, style changes
|
||||
# 1.0.1 Wrong section heading
|
||||
# 1.0 First 2017 version
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> --------------------------------------------------
|
||||
#TOC> 1 Preparation and Tree Plot 50
|
||||
#TOC> 2 SPECIES REFERENCE TREE 66
|
||||
#TOC> 3 Tree Analysis 117
|
||||
#TOC> 3.1 Rooting Trees 177
|
||||
#TOC> 3.2 Rotating Clades 222
|
||||
#TOC> 3.3 Computing tree distances 309
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Preparation and Tree Plot ===========================================
|
||||
|
||||
|
||||
if (! requireNamespace("ape", quietly = TRUE)) {
|
||||
install.packages("ape")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = ape) # basic information
|
||||
# browseVignettes("ape") # available vignettes
|
||||
# data(package = "ape") # available datasets
|
||||
|
||||
# We change the graphics parameters from time to time, let's define the
|
||||
# default so we can recreate a sane state:
|
||||
dev.off()
|
||||
PAR <- par(no.readonly = TRUE)  # keep only settable parameters, so that par(PAR) restores the state without "cannot be set" warnings
|
||||
|
||||
# = 2 SPECIES REFERENCE TREE ==============================================
|
||||
|
||||
# Before we do any kind of phylogenetic analysis of genes from several species,
|
||||
# we MUST have a reference tree of the taxonomic relationships in hand. This
|
||||
# context is absolutely required for the interpretation of our tree.
|
||||
|
||||
# We have the tax-ids in our database, and the NCBI has the species tree - we just need some way to extract the subtree that corresponds to our taxons of interest. Here's how to use the taxize:: package.
|
||||
|
||||
if (! requireNamespace("taxize", quietly = TRUE)) {
|
||||
install.packages("taxize")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = taxize) # basic information
|
||||
# browseVignettes("taxize") # available vignettes
|
||||
# data(package = "taxize") # available datasets
|
||||
|
||||
( mySOI <- c(myDB$taxonomy$ID, "83333") )
|
||||
myClass <- taxize::classification(mySOI, db = "ncbi")
|
||||
str(myClass)
|
||||
|
||||
myClass[[1]]
|
||||
|
||||
fungiTree <- taxize::class2tree(myClass, check = TRUE)
|
||||
plot(fungiTree)
|
||||
|
||||
# The tree produced by taxize:: contains full length species names,
|
||||
# but it would be more convenient if it had bicodes instead. Also, the actual
|
||||
# tree is only part of the list(), which will cause problems later:
|
||||
str(fungiTree)
|
||||
|
||||
# we therefore simplify
|
||||
fungiTree <- fungiTree$phylo
|
||||
str(fungiTree)
|
||||
|
||||
# The species names are in a vector $phylo$tip.label of this list.
|
||||
# We can use biCode() to shorten them.
|
||||
fungiTree$tip.label <- biCode(fungiTree$tip.label)
|
||||
|
||||
# Plot the tree
|
||||
nSP <- length(fungiTree$tip.label)
|
||||
plot(fungiTree, cex = 0.8, root.edge = TRUE, no.margin = TRUE)
|
||||
text(-1, nSP - 0.5, "Species Tree:\nFungi", pos = 4)
|
||||
ape::nodelabels(text = fungiTree$node.label,
|
||||
cex = 0.6,
|
||||
adj = 0.2,
|
||||
bg = "#D4F2DA")
|
||||
# Note that you can use the arrow buttons in the menu above the plot pane to
|
||||
# scroll back to plots you have created earlier - so you can reference back to
|
||||
# this species tree in your later analysis.
|
||||
|
||||
|
||||
# = 3 Tree Analysis =======================================================
|
||||
|
||||
|
||||
# 1.1 Visualizing your tree
|
||||
# The trees that are produced by Rphylip are stored as an object of class
|
||||
# "phylo". This is a class for phylogenetic trees that is widely used in the
|
||||
# community, practically all R phylogenetics packages have options to read and
|
||||
# manipulate such trees. Outside of R, a popular interchange format is the
|
||||
# Newick_format that you have seen above. It's easy to output your calculated
|
||||
# trees in Newick format and visualize them elsewhere.
|
||||
|
||||
# The "phylo" class object is one of R's "S3" objects and methods to plot and
|
||||
# print it have been defined with the Rphylip package, and in ape. You can
|
||||
# simply call plot(<your-tree>) and R knows what to do with <your-tree> and how
|
||||
# to plot it. The underlying function is plot.phylo(), and documentation for its
|
||||
# many options can be found by typing:
|
||||
|
||||
?plot.phylo
|
||||
|
||||
# We load the APSES sequence tree that you produced in the
|
||||
# BIN-PHYLO-Tree_building unit:
|
||||
apsTree <- readRDS(file = "data/APSEStreeRproml.rds")
|
||||
|
||||
plot(apsTree) # default type is "phylogram"
|
||||
plot(apsTree, type = "unrooted")
|
||||
plot(apsTree, type = "fan", no.margin = TRUE)
|
||||
|
||||
# rescale to show all of the labels:
|
||||
# record the current plot parameters by assigning them to a variable ...
|
||||
(tmp <- plot(apsTree, type="fan", no.margin = TRUE, plot=FALSE))
|
||||
# ... and adjust the plot limits for a new plot:
|
||||
plot(apsTree,
|
||||
type = "fan",
|
||||
x.lim = tmp$x.lim * 1.8,
|
||||
y.lim = tmp$y.lim * 1.8,
|
||||
cex = 0.8,
|
||||
no.margin = TRUE)
|
||||
|
||||
# Inspect the tree object
|
||||
str(apsTree)
|
||||
apsTree$tip.label
|
||||
apsTree$edge
|
||||
apsTree$edge.length
|
||||
|
||||
# show the node / edge and tip labels on a plot
|
||||
plot(apsTree)
|
||||
ape::nodelabels()
|
||||
ape::edgelabels()
|
||||
ape::tiplabels()
|
||||
|
||||
# show the number of nodes, edges and tips
|
||||
ape::Nnode(apsTree)
|
||||
ape::Nedge(apsTree)
|
||||
ape::Ntip(apsTree)
|
||||
|
||||
par(PAR) # reset graphics state
|
||||
|
||||
# Finally, write the tree to console in Newick format
|
||||
ape::write.tree(apsTree)
|
||||
|
||||
# == 3.1 Rooting Trees =====================================================
|
||||
|
||||
# In order to analyse the tree, it is helpful to root it first and reorder its
|
||||
# clades. Contrary to documentation, Rproml() returns an unrooted tree.
|
||||
|
||||
ape::is.rooted(apsTree)
|
||||
|
||||
# You can root the tree with the command root() from the "ape" package.
|
||||
|
||||
plot(apsTree)
|
||||
|
||||
# add labels for internal nodes and tips
|
||||
ape::nodelabels(cex = 0.5, frame = "circle")
|
||||
ape::tiplabels(cex = 0.5, frame = "rect")
|
||||
|
||||
# The outgroup of the tree (KILA ESCCO) is tip "11" in my sample tree, it may be a different
|
||||
# number in yours. Substitute the correct node number below for "outgroup".
|
||||
apsTree <- ape::root(apsTree, outgroup = 11, resolve.root = TRUE)
|
||||
plot(apsTree)
|
||||
ape::is.rooted(apsTree)
|
||||
|
||||
# This tree _looks_ unchanged, because when the root trifurcation was resolved,
|
||||
# an edge of length zero was added to connect the MRCA (Most Recent Common
|
||||
# Ancestor) of the ingroup.
|
||||
|
||||
# The edge lengths are stored in the phylo object:
|
||||
apsTree$edge.length
|
||||
|
||||
# ... and you can assign a small arbitrary value to the edge
|
||||
# to show how it connects to the tree without having an
|
||||
# overlap.
|
||||
apsTree$edge.length[1] <- 0.1
|
||||
plot(apsTree, cex = 0.7)
|
||||
ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866")
|
||||
|
||||
|
||||
# This procedure does however not assign an actual length to a root edge, and
|
||||
# therefore no root edge is visible on the plot. Why? , you might ask. I ask
|
||||
# myself that too. We'll just add a length by hand.
|
||||
|
||||
apsTree$root.edge <- mean(apsTree$edge.length) * 1.5
|
||||
plot(apsTree, cex = 0.7, root.edge = TRUE)
|
||||
ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866")
|
||||
|
||||
|
||||
# == 3.2 Rotating Clades ===================================================
|
||||
|
||||
# To interpret the tree, it is useful to rotate the clades so that they appear
|
||||
# in the order expected from the cladogram of species.
|
||||
|
||||
# We can either rotate around individual internal nodes ...
|
||||
layout(matrix(1:2, 1, 2))
|
||||
plot(apsTree, no.margin = TRUE, root.edge = TRUE)
|
||||
ape::nodelabels(node = 13, cex = 0.7, bg = "#ff8866")
|
||||
plot(ape::rotate(apsTree, node = 13), no.margin = TRUE, root.edge = TRUE)
|
||||
ape::nodelabels(node = 13, cex = 0.7, bg = "#88ff66")
|
||||
# Note that the species at the bottom of the clade descending from node
|
||||
# 13 is now plotted at the top.
|
||||
|
||||
par(PAR) # reset graphics state
|
||||
|
||||
# ... or we can rearrange the tree so it corresponds as well as possible to a
|
||||
# predefined tip ordering. Here we use the ordering that taxize:: has inferred
|
||||
# from the NCBI taxonomic classification.
|
||||
|
||||
nOrg <- length(apsTree$tip.label)
|
||||
|
||||
plot(fungiTree,
|
||||
no.margin = FALSE, root.edge = TRUE)
|
||||
ape::nodelabels(text = fungiTree$node.label,
|
||||
cex = 0.5,
|
||||
adj = 0.2,
|
||||
bg = "#D4F2DA")
|
||||
|
||||
# These are the fungi tree tips ...
|
||||
fungiTree$tip.label
|
||||
# ... and their order is determined by the edge-list that is stored in
|
||||
fungiTree$edge
|
||||
# which edges join the tips?
|
||||
ape::tiplabels(cex = 0.5, frame = "rect")
|
||||
# as you can see, the tips (range [1:nOrg] ) are in column 2 and they are
|
||||
# ordered from bottom to top.
|
||||
# And each tip number is the index of the species in the tip.label vector. So we can take column 2, subset it, and use it to get a list of species in the order of the tree ...
|
||||
|
||||
sel <- fungiTree$edge[ , 2 ] <= nOrg
|
||||
( oSp <- fungiTree$tip.label[fungiTree$edge[sel , 2 ]] )
|
||||
|
||||
# Now, here are the genes of the apsTree tips ...
|
||||
apsTree$tip.label
|
||||
|
||||
# ... and the "constraint" we need for reordering, according to the help page
|
||||
# of ape::rotateConstr(), is "a vector specifying the order of the tips as they
|
||||
# should appear (from bottom to top)". Thus we need to add the "MBP1_" prefix to our vector
|
||||
oSp <- gsub("^", "MBP1_", oSp)
|
||||
( oSp <- gsub("MBP1_ESCCO", "KILA_ESCCO", oSp) )  # the E. coli outgroup is KILA_, not MBP1_; the pattern must be spelled ESCCO to match
|
||||
|
||||
# Then we can plot the two trees to compare: the fungi- tree
|
||||
par(PAR) # reset graphics state
|
||||
layout(matrix(1:2, 1, 2))
|
||||
plot(fungiTree,
|
||||
no.margin = TRUE,
|
||||
root.edge = TRUE)
|
||||
ape::nodelabels(text = fungiTree$node.label,
|
||||
cex = 0.5,
|
||||
adj = 0.2,
|
||||
bg = "#D4F2DA")
|
||||
|
||||
# and the re-organized apsesTree ...
|
||||
plot(ape::rotateConstr(apsTree, constraint = oSp[]),
|
||||
no.margin = TRUE,
|
||||
root.edge = TRUE)
|
||||
|
||||
par(PAR) # reset graphics state
|
||||
|
||||
# As you can see, the reordering is not perfect, since the topologies are
|
||||
# different, mostly due to the unresolved nodes in the reference tree. One
|
||||
# could play with that ...
|
||||
|
||||
|
||||
# Task: Study the two trees and consider their similarities and differences.
|
||||
# What do you expect? What do you find? Note that this is not a "mixed"
|
||||
# gene tree yet, since it contains only a single gene for the species
|
||||
# we considered. All of the branch points in this tree are speciation
|
||||
# events. Thus the gene tree should have the same topology as the
|
||||
# species tree. Does it? Are the differences important? How many
|
||||
# branches would you need to remove and reinsert elsewhere to get the
|
||||
# same topology as the species tree?
|
||||
|
||||
# In order to quantify how different these two trees are, we need to compute
|
||||
# tree distances.
|
||||
|
||||
|
||||
# == 3.3 Computing tree distances ==========================================
|
||||
|
||||
|
||||
# Many superb phylogeny tools are contributed by the phangorn package.
|
||||
|
||||
if (! requireNamespace("phangorn", quietly = TRUE)) {
|
||||
install.packages("phangorn")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = phangorn) # basic information
|
||||
# browseVignettes("phangorn") # available vignettes
|
||||
# data(package = "phangorn") # available datasets
|
||||
|
||||
# To compare two trees, they must have the same tip labels. We delete "MBP1_" or
|
||||
# "KILA_" from the existing tip labels in a copy of our APSES domain tree.
|
||||
apsTree2 <- apsTree
|
||||
apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label)
|
||||
|
||||
|
||||
# phangorn provides several functions to compute tree-differences (and there
|
||||
# is a _whole_ lot of theory on how to compare trees). treedist() returns the
|
||||
# "symmetric difference"
|
||||
phangorn::treedist(fungiTree, apsTree2, check.labels = TRUE)
|
||||
|
||||
# Numbers. What do they mean? How much more similar is our apsTree to the
|
||||
# (presumably) ground truth of fungiTree than a random tree would be?
|
||||
# The ape package provides the function rtree()
|
||||
# to compute random trees.
|
||||
|
||||
ape::rtree(n = length(apsTree2$tip.label), # number of tips
|
||||
rooted = TRUE, # we rooted the tree above,
|
||||
# and fungiTree is rooted anyway
|
||||
tip.label = apsTree2$tip.label, # use the apsTree2 labels
|
||||
br = NULL) # don't generate branch lengths since
|
||||
# fungiTree has none, so we can't
|
||||
# compare them anyway.
|
||||
|
||||
# (Note the warning message about non-binary trees; we'll suppress that later
|
||||
# by wrapping the function call in suppressMessages(); we don't want to
|
||||
# print it 10,000 times :-)
|
||||
|
||||
|
||||
# Let's compute some random trees this way, calculate the distances to
|
||||
# fungiTree, and then compare the values we get for apsTree2. The random
|
||||
# trees are provided by ape::rtree().
|
||||
|
||||
N <- 10000 # takes about 15 seconds, and we'll use the pBar function,
|
||||
# defined in .utilities.R to keep track of where we are at:
|
||||
myTreeDistances <- matrix(numeric(N * 2), ncol = 2)
|
||||
colnames(myTreeDistances) <- c("symm", "path")
|
||||
|
||||
set.seed(112358)
|
||||
for (i in 1:N) {
|
||||
pBar(i, N)
|
||||
xTree <- ape::rtree(n = length(apsTree2$tip.label),
|
||||
rooted = TRUE,
|
||||
tip.label = apsTree2$tip.label,
|
||||
br = NULL)
|
||||
myTreeDistances[i, ] <- suppressMessages(phangorn::treedist(fungiTree, xTree))
|
||||
}
|
||||
set.seed(NULL) # reset the random number generator
|
||||
|
||||
table(myTreeDistances[, "symm"])
|
||||
|
||||
( symmObs <- phangorn::treedist(fungiTree, apsTree2)[1] )
|
||||
|
||||
# Random events less-or-equal to observation, divided by total number of
|
||||
# events gives us the empirical p-value.
|
||||
cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n",
|
||||
(sum(myTreeDistances[ , "symm"] <= symmObs) + 1) / (N + 1)))
|
||||
|
||||
par(PAR) # reset graphics state
|
||||
hist(myTreeDistances[, "path"],
|
||||
col = "aliceblue",
|
||||
main = "Distances of random Trees to fungiTree")
|
||||
(pathObs <- phangorn::treedist(fungiTree, apsTree2)[2])
|
||||
abline(v = pathObs, col = "chartreuse")
|
||||
|
||||
# Random events less-or-equal to observation, divided by total number of
|
||||
# events gives us the empirical p-value.
|
||||
cat(sprintf("\nEmpirical p-value for path diff. of observed tree is %1.4f\n",
|
||||
(sum(myTreeDistances[ , "path"] <= pathObs) + 1) / (N + 1)))  # path distances are compared to the observed path distance (pathObs), not symmObs
|
||||
|
||||
# Indeed, our apsTree is _very_ much more similar to the species tree than
|
||||
# we would expect by random chance.
|
||||
|
||||
# What do we gain from that analysis? Analyzing the tree we get from a single
|
||||
# gene of orthologous sequences is a positive control in our computational
|
||||
# experiment. If these genes are indeed orthologues, a correct tree-building
|
||||
# program ought to give us a tree that exactly matches the species tree.
|
||||
# Evaluating how far off we are from the known correct result gives us a way to
|
||||
# validate our workflow and our algorithm. If we can't get that right, we can't
|
||||
# expect to get "real" data right either. Employing such positive controls in
|
||||
# every computational experiment is essential for research. Not doing so is
|
||||
# Cargo Cult Bioinformatics.
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "BIN-PHYLO-Tree_analysis.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-PHYLO-Tree_analysis unit.
|
||||
#
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017-10 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 2020 updates. Deprecate iTol and use taxize:: instead.
|
||||
# Rewrite of tip re-ordering. Better handling of
|
||||
# messages. pBar() for randomization.
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# use Biocmanager:: not biocLite()
|
||||
# 1.0.2 Typo in variable name, style changes
|
||||
# 1.0.1 Wrong section heading
|
||||
# 1.0 First 2017 version
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> --------------------------------------------------
|
||||
#TOC> 1 Preparation and Tree Plot 50
|
||||
#TOC> 2 SPECIES REFERENCE TREE 66
|
||||
#TOC> 3 Tree Analysis 117
|
||||
#TOC> 3.1 Rooting Trees 177
|
||||
#TOC> 3.2 Rotating Clades 222
|
||||
#TOC> 3.3 Computing tree distances 309
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Preparation and Tree Plot ===========================================
|
||||
|
||||
|
||||
if (! requireNamespace("ape", quietly = TRUE)) {
|
||||
install.packages("ape")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = ape) # basic information
|
||||
# browseVignettes("ape") # available vignettes
|
||||
# data(package = "ape") # available datasets
|
||||
|
||||
# We change the graphics parameters from time to time, let's define the
|
||||
# default so we can recreate a sane state:
|
||||
dev.off()
|
||||
PAR <- par(no.readonly = TRUE)  # keep only settable parameters, so that par(PAR) restores the state without "cannot be set" warnings
|
||||
|
||||
# = 2 SPECIES REFERENCE TREE ==============================================
|
||||
|
||||
# Before we do any kind of phylogenetic analysis of genes from several species,
|
||||
# we MUST have a reference tree of the taxonomic relationships in hand. This
|
||||
# context is absolutely required for the interpretation of our tree.
|
||||
|
||||
# We have the tax-ids in our database, and the NCBI has the species tree - we just need some way to extract the subtree that corresponds to our taxons of interest. Here's how to use the taxize:: package.
|
||||
|
||||
if (! requireNamespace("taxize", quietly = TRUE)) {
|
||||
install.packages("taxize")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = taxize) # basic information
|
||||
# browseVignettes("taxize") # available vignettes
|
||||
# data(package = "taxize") # available datasets
|
||||
|
||||
( mySOI <- c(myDB$taxonomy$ID, "83333") )
|
||||
myClass <- taxize::classification(mySOI, db = "ncbi")
|
||||
str(myClass)
|
||||
|
||||
myClass[[1]]
|
||||
|
||||
fungiTree <- taxize::class2tree(myClass, check = TRUE)
|
||||
plot(fungiTree)
|
||||
|
||||
# The tree produced by taxize:: contains full length species names,
|
||||
# but it would be more convenient if it had bicodes instead. Also, the actual
|
||||
# tree is only part of the list(), which will cause problems later:
|
||||
str(fungiTree)
|
||||
|
||||
# we therefore simplify
|
||||
fungiTree <- fungiTree$phylo
|
||||
str(fungiTree)
|
||||
|
||||
# The species names are in a vector $phylo$tip.label of this list.
|
||||
# We can use biCode() to shorten them.
|
||||
fungiTree$tip.label <- biCode(fungiTree$tip.label)
|
||||
|
||||
# Plot the tree
|
||||
nSP <- length(fungiTree$tip.label)
|
||||
plot(fungiTree, cex = 0.8, root.edge = TRUE, no.margin = TRUE)
|
||||
text(-1, nSP - 0.5, "Species Tree:\nFungi", pos = 4)
|
||||
ape::nodelabels(text = fungiTree$node.label,
|
||||
cex = 0.6,
|
||||
adj = 0.2,
|
||||
bg = "#D4F2DA")
|
||||
# Note that you can use the arrow buttons in the menu above the plot pane to
|
||||
# scroll back to plots you have created earlier - so you can reference back to
|
||||
# this species tree in your later analysis.
|
||||
|
||||
|
||||
# = 3 Tree Analysis =======================================================
|
||||
|
||||
|
||||
# 1.1 Visualizing your tree
|
||||
# The trees that are produced by Rphylip are stored as an object of class
|
||||
# "phylo". This is a class for phylogenetic trees that is widely used in the
|
||||
# community, practically all R phylogenetics packages have options to read and
|
||||
# manipulate such trees. Outside of R, a popular interchange format is the
|
||||
# Newick_format that you have seen above. It's easy to output your calculated
|
||||
# trees in Newick format and visualize them elsewhere.
|
||||
|
||||
# The "phylo" class object is one of R's "S3" objects and methods to plot and
|
||||
# print it have been defined with the Rphylip package, and in ape. You can
|
||||
# simply call plot(<your-tree>) and R knows what to do with <your-tree> and how
|
||||
# to plot it. The underlying function is plot.phylo(), and documentation for its
|
||||
# many options can be found by typing:
|
||||
|
||||
?plot.phylo
|
||||
|
||||
# We load the APSES sequence tree that you produced in the
|
||||
# BIN-PHYLO-Tree_building unit:
|
||||
apsTree <- readRDS(file = "data/APSEStreeRproml.rds")
|
||||
|
||||
plot(apsTree) # default type is "phylogram"
|
||||
plot(apsTree, type = "unrooted")
|
||||
plot(apsTree, type = "fan", no.margin = TRUE)
|
||||
|
||||
# rescale to show all of the labels:
|
||||
# record the current plot parameters by assigning them to a variable ...
|
||||
(tmp <- plot(apsTree, type="fan", no.margin = TRUE, plot=FALSE))
|
||||
# ... and adjust the plot limits for a new plot:
|
||||
plot(apsTree,
|
||||
type = "fan",
|
||||
x.lim = tmp$x.lim * 1.8,
|
||||
y.lim = tmp$y.lim * 1.8,
|
||||
cex = 0.8,
|
||||
no.margin = TRUE)
|
||||
|
||||
# Inspect the tree object
|
||||
str(apsTree)
|
||||
apsTree$tip.label
|
||||
apsTree$edge
|
||||
apsTree$edge.length
|
||||
|
||||
# show the node / edge and tip labels on a plot
|
||||
plot(apsTree)
|
||||
ape::nodelabels()
|
||||
ape::edgelabels()
|
||||
ape::tiplabels()
|
||||
|
||||
# show the number of nodes, edges and tips
|
||||
ape::Nnode(apsTree)
|
||||
ape::Nedge(apsTree)
|
||||
ape::Ntip(apsTree)
|
||||
|
||||
par(PAR) # reset graphics state
|
||||
|
||||
# Finally, write the tree to console in Newick format
|
||||
ape::write.tree(apsTree)
|
||||
|
||||
# == 3.1 Rooting Trees =====================================================
|
||||
|
||||
# In order to analyse the tree, it is helpful to root it first and reorder its
|
||||
# clades. Contrary to documentation, Rproml() returns an unrooted tree.
|
||||
|
||||
ape::is.rooted(apsTree)
|
||||
|
||||
# You can root the tree with the command root() from the "ape" package.
|
||||
|
||||
plot(apsTree)
|
||||
|
||||
# add labels for internal nodes and tips
|
||||
ape::nodelabels(cex = 0.5, frame = "circle")
|
||||
ape::tiplabels(cex = 0.5, frame = "rect")
|
||||
|
||||
# The outgroup of the tree (KILA ESCCO) is tip "11" in my sample tree, it may be a different
|
||||
# number in yours. Substitute the correct node number below for "outgroup".
|
||||
apsTree <- ape::root(apsTree, outgroup = 11, resolve.root = TRUE)
|
||||
plot(apsTree)
|
||||
ape::is.rooted(apsTree)
|
||||
|
||||
# This tree _looks_ unchanged, because when the root trifurcation was resolved,
|
||||
# an edge of length zero was added to connect the MRCA (Most Recent Common
|
||||
# Ancestor) of the ingroup.
|
||||
|
||||
# The edge lengths are stored in the phylo object:
|
||||
apsTree$edge.length
|
||||
|
||||
# ... and you can assign a small arbitrary value to the edge
|
||||
# to show how it connects to the tree without having an
|
||||
# overlap.
|
||||
apsTree$edge.length[1] <- 0.1
|
||||
plot(apsTree, cex = 0.7)
|
||||
ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866")
|
||||
|
||||
|
||||
# This procedure does however not assign an actual length to a root edge, and
|
||||
# therefore no root edge is visible on the plot. Why? , you might ask. I ask
|
||||
# myself that too. We'll just add a length by hand.
|
||||
|
||||
apsTree$root.edge <- mean(apsTree$edge.length) * 1.5
|
||||
plot(apsTree, cex = 0.7, root.edge = TRUE)
|
||||
ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866")
|
||||
|
||||
|
||||
# == 3.2 Rotating Clades ===================================================
|
||||
|
||||
# To interpret the tree, it is useful to rotate the clades so that they appear
|
||||
# in the order expected from the cladogram of species.
|
||||
|
||||
# We can either rotate around individual internal nodes ...
|
||||
layout(matrix(1:2, 1, 2))
|
||||
plot(apsTree, no.margin = TRUE, root.edge = TRUE)
|
||||
ape::nodelabels(node = 13, cex = 0.7, bg = "#ff8866")
|
||||
plot(ape::rotate(apsTree, node = 13), no.margin = TRUE, root.edge = TRUE)
|
||||
ape::nodelabels(node = 13, cex = 0.7, bg = "#88ff66")
|
||||
# Note that the species at the bottom of the clade descending from node
|
||||
# 13 is now plotted at the top.
|
||||
|
||||
par(PAR) # reset graphics state
|
||||
|
||||
# ... or we can rearrange the tree so it corresponds as well as possible to a
|
||||
# predefined tip ordering. Here we use the ordering that taxize:: has inferred
|
||||
# from the NCBI taxonomic classification.
|
||||
|
||||
nOrg <- length(apsTree$tip.label)
|
||||
|
||||
plot(fungiTree,
|
||||
no.margin = FALSE, root.edge = TRUE)
|
||||
ape::nodelabels(text = fungiTree$node.label,
|
||||
cex = 0.5,
|
||||
adj = 0.2,
|
||||
bg = "#D4F2DA")
|
||||
|
||||
# These are the fungi tree tips ...
|
||||
fungiTree$tip.label
|
||||
# ... and their order is determined by the edge-list that is stored in
|
||||
fungiTree$edge
|
||||
# which edges join the tips?
|
||||
ape::tiplabels(cex = 0.5, frame = "rect")
|
||||
# as you can see, the tips (range [1:nOrg] ) are in column 2 and they are
|
||||
# ordered from bottom to top.
|
||||
# And each tip number is the index of the species in the tip.label vector. So we can take column 2, subset it, and use it to get a list of species in the order of the tree ...
|
||||
|
||||
sel <- fungiTree$edge[ , 2 ] <= nOrg
|
||||
( oSp <- fungiTree$tip.label[fungiTree$edge[sel , 2 ]] )
|
||||
|
||||
# Now, here are the genes of the apsTree tips ...
|
||||
apsTree$tip.label
|
||||
|
||||
# ... and the "constraint" we need for reordering, according to the help page
|
||||
# of ape::rotateConstr(), is "a vector specifying the order of the tips as they
|
||||
# should appear (from bottom to top)". Thus we need to add the "MBP1_" prefix to our vector
|
||||
oSp <- gsub("^", "MBP1_", oSp)
|
||||
( oSp <- gsub("MBP1_ESCCO", "KILA_ESCCO", oSp) )  # the E. coli outgroup is KILA_, not MBP1_; the pattern must be spelled ESCCO to match
|
||||
|
||||
# Then we can plot the two trees to compare: the fungi- tree
|
||||
par(PAR) # reset graphics state
|
||||
layout(matrix(1:2, 1, 2))
|
||||
plot(fungiTree,
|
||||
no.margin = TRUE,
|
||||
root.edge = TRUE)
|
||||
ape::nodelabels(text = fungiTree$node.label,
|
||||
cex = 0.5,
|
||||
adj = 0.2,
|
||||
bg = "#D4F2DA")
|
||||
|
||||
# and the re-organized apsesTree ...
|
||||
plot(ape::rotateConstr(apsTree, constraint = oSp[]),
|
||||
no.margin = TRUE,
|
||||
root.edge = TRUE)
|
||||
|
||||
par(PAR) # reset graphics state
|
||||
|
||||
# As you can see, the reordering is not perfect, since the topologies are
|
||||
# different, mostly due to the unresolved nodes in the reference tree. One
|
||||
# could play with that ...
|
||||
|
||||
|
||||
# Task: Study the two trees and consider their similarities and differences.
|
||||
# What do you expect? What do you find? Note that this is not a "mixed"
|
||||
# gene tree yet, since it contains only a single gene for the species
|
||||
# we considered. All of the branch points in this tree are speciation
|
||||
# events. Thus the gene tree should have the same topology as the
|
||||
# species tree. Does it? Are the differences important? How many
|
||||
# branches would you need to remove and reinsert elsewhere to get the
|
||||
# same topology as the species tree?
|
||||
|
||||
# In order to quantify how different these two trees are, we need to compute
|
||||
# tree distances.
|
||||
|
||||
|
||||
# == 3.3 Computing tree distances ==========================================
|
||||
|
||||
|
||||
# Many superb phylogeny tools are contributed by the phangorn package.
|
||||
|
||||
if (! requireNamespace("phangorn", quietly = TRUE)) {
|
||||
install.packages("phangorn")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = phangorn) # basic information
|
||||
# browseVignettes("phangorn") # available vignettes
|
||||
# data(package = "phangorn") # available datasets
|
||||
|
||||
# To compare two trees, they must have the same tip labels. We delete "MBP1_" or
|
||||
# "KILA_" from the existing tip labels in a copy of our APSES domain tree.
|
||||
apsTree2 <- apsTree
|
||||
apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label)
|
||||
|
||||
|
||||
# phangorn provides several functions to compute tree-differences (and there
|
||||
# is a _whole_ lot of theory on how to compare trees). treedist() returns the
|
||||
# "symmetric difference"
|
||||
phangorn::treedist(fungiTree, apsTree2, check.labels = TRUE)
|
||||
|
||||
# Numbers. What do they mean? How much more similar is our apsTree to the
|
||||
# (presumably) ground truth of fungiTree than a random tree would be?
|
||||
# The ape package provides the function rtree()
|
||||
# to compute random trees.
|
||||
|
||||
ape::rtree(n = length(apsTree2$tip.label), # number of tips
|
||||
rooted = TRUE, # we rooted the tree above,
|
||||
# and fungiTree is rooted anyway
|
||||
tip.label = apsTree2$tip.label, # use the apsTree2 labels
|
||||
br = NULL) # don't generate branch lengths since
|
||||
# fungiTree has none, so we can't
|
||||
# compare them anyway.
|
||||
|
||||
# (Note the warning message about non-binary trees; we'll suppress that later
|
||||
# by wrapping the function call in suppressMessages(); we don't want to
|
||||
# print it 10,000 times :-)
|
||||
|
||||
|
||||
# Let's compute some random trees this way, calculate the distances to
|
||||
# fungiTree, and then compare the values we get for apsTree2. The random
|
||||
# trees are provided by ape::rtree().
|
||||
|
||||
N <- 10000 # takes about 15 seconds, and we'll use the pBar function,
|
||||
# defined in .utilities.R to keep track of where we are at:
|
||||
myTreeDistances <- matrix(numeric(N * 2), ncol = 2)
|
||||
colnames(myTreeDistances) <- c("symm", "path")
|
||||
|
||||
set.seed(112358)
|
||||
for (i in 1:N) {
|
||||
pBar(i, N)
|
||||
xTree <- ape::rtree(n = length(apsTree2$tip.label),
|
||||
rooted = TRUE,
|
||||
tip.label = apsTree2$tip.label,
|
||||
br = NULL)
|
||||
myTreeDistances[i, ] <- suppressMessages(phangorn::treedist(fungiTree, xTree))
|
||||
}
|
||||
set.seed(NULL) # reset the random number generator
|
||||
|
||||
table(myTreeDistances[, "symm"])
|
||||
|
||||
( symmObs <- phangorn::treedist(fungiTree, apsTree2)[1] )
|
||||
|
||||
# Random events less-or-equal to observation, divided by total number of
|
||||
# events gives us the empirical p-value.
|
||||
cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n",
|
||||
(sum(myTreeDistances[ , "symm"] <= symmObs) + 1) / (N + 1)))
|
||||
|
||||
par(PAR) # reset graphics state
|
||||
hist(myTreeDistances[, "path"],
|
||||
col = "aliceblue",
|
||||
main = "Distances of random Trees to fungiTree")
|
||||
(pathObs <- phangorn::treedist(fungiTree, apsTree2)[2])
|
||||
abline(v = pathObs, col = "chartreuse")
|
||||
|
||||
# Random events less-or-equal to observation, divided by total number of
|
||||
# events gives us the empirical p-value.
|
||||
cat(sprintf("\nEmpirical p-value for path diff. of observed tree is %1.4f\n",
|
||||
(sum(myTreeDistances[ , "path"] <= pathObs) + 1) / (N + 1)))  # path distances are compared to the observed path distance (pathObs), not symmObs
|
||||
|
||||
# Indeed, our apsTree is _very_ much more similar to the species tree than
|
||||
# we would expect by random chance.
|
||||
|
||||
# What do we gain from that analysis? Analyzing the tree we get from a single
|
||||
# gene of orthologous sequences is a positive control in our computational
|
||||
# experiment. If these genes are indeed orthologues, a correct tree-building
|
||||
# program ought to give us a tree that exactly matches the species tree.
|
||||
# Evaluating how far off we are from the known correct result gives us a way to
|
||||
# validate our workflow and our algorithm. If we can't get that right, we can't
|
||||
# expect to get "real" data right either. Employing such positive controls in
|
||||
# every computational experiment is essential for research. Not doing so is
|
||||
# Cargo Cult Bioinformatics.
|
||||
|
||||
|
||||
# [END]
|
||||
|
@ -1,168 +1,168 @@
|
||||
# tocID <- "BIN-PHYLO-Tree_building.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-PHYLO-Tree_building unit.
|
||||
#
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017-10 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 deprecate save()/load() for saveRDS()/readRDS(); Mac:
|
||||
# instructions to authorize proml.app
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# 1.0 First 2017 version
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
# Add MrBayes
|
||||
# https://cran.r-project.org/web/packages/phangorn/vignettes/IntertwiningTreesAndNetworks.html
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -----------------------------------------------------------
|
||||
#TOC> 1 Calculating Trees 48
|
||||
#TOC> 1.1 PROMLPATH ... 68
|
||||
#TOC> 1.1.1 ... on the Mac 73
|
||||
#TOC> 1.1.2 ... on Windows 101
|
||||
#TOC> 1.1.3 ... on Linux 115
|
||||
#TOC> 1.1.4 Confirming PROMLPATH 120
|
||||
#TOC> 1.2 Building a maximum likelihood tree 134
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Calculating Trees ===================================================
|
||||
|
||||
|
||||
# Follow the instructions found at phylip's home on the Web to install. If you
|
||||
# are on a Windows computer, take note of the installation directory.
|
||||
|
||||
# After you have installed Phylip on your computer, install the R package that
|
||||
# provides an interface to the Phylip functions.
|
||||
|
||||
if (! requireNamespace("Rphylip", quietly = TRUE)) {
|
||||
install.packages("Rphylip")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = Rphylip) # basic information
|
||||
# browseVignettes("Rphylip") # available vignettes
|
||||
# data(package = "Rphylip") # available datasets
|
||||
|
||||
# This will install RPhylip, as well as its dependency, the package "ape".
|
||||
|
||||
|
||||
# == 1.1 PROMLPATH ... =====================================================
|
||||
# The next part may be tricky. You will need to figure out where
|
||||
# on your computer Phylip has been installed and define the path
|
||||
# to the proml program that calculates a maximum-likelihood tree.
|
||||
|
||||
# === 1.1.1 ... on the Mac
|
||||
# On the Mac, the standard installation places a phylip folder
|
||||
# in the /Applications directory. That folder contains all the
|
||||
# individual phylip programs as <name>.app files. These are not
|
||||
# the actual executables, but "app" files are actually directories
|
||||
# that contain the required resources for a program to run.
|
||||
|
||||
# The executable is in a subdirectory and you can point Rphylip
|
||||
# directly to that subdirectory to find the program it needs:
|
||||
# PROMLPATH <- "/Applications/phylip-3.695/exe/proml.app/Contents/MacOS"
|
||||
|
||||
# However, RPHYLIP will not be able to run PHYLIP applications immediately,
|
||||
# because they have not been "signed" by the PHYLIP developers. The process
|
||||
# will be terminated by your system, with a warning.
|
||||
|
||||
# - Navigate to the phylip folder in your /Applications directory
|
||||
# - Descend into the "exe" folder and find proml.app
|
||||
# - Ctrl-click proml.app and choose "Open". A dialogue will show that
|
||||
# says: "macOS cannot verify the developer of “proml.app”.
|
||||
# Are you sure you want to open it?"
|
||||
# - Click open to continue. You may need to allow access to the terminal
|
||||
# as well. When the proml terminal session opens, you can type
|
||||
# Ctrl-c to abort the program and close the window.
|
||||
#
|
||||
# This adds proml.app to the list of known-good programs and you will not
|
||||
# need to repeat this process.
|
||||
#
|
||||
|
||||
# === 1.1.2 ... on Windows
|
||||
# On Windows you need to know where the programs have been installed, and you
|
||||
# need to specify a path that is correct for the Windows OS. Find the folder
|
||||
# that is named "exe", and right-click to inspect its properties. The path
|
||||
# should be listed among them.
|
||||
|
||||
# If the path looks like "C:\Users\Meng\Programs\phylip-3.695\exe", then your
|
||||
# assignment has to be
|
||||
# PROMLPATH <- "C:/Users/Meng/Programs/phylip-3.695/exe"
|
||||
# (Note: "/", not "\")
|
||||
|
||||
# I have heard that your path must not contain spaces, and it is prudent to
|
||||
# avoid other special characters as well.
|
||||
|
||||
# === 1.1.3 ... on Linux
|
||||
# If you are running Linux I trust you know what to do. It's probably
|
||||
# something like
|
||||
# PROMLPATH <- "/usr/local/phylip-3.695/bin"
|
||||
|
||||
# === 1.1.4 Confirming PROMLPATH
|
||||
# Confirm that the settings are right.
|
||||
PROMLPATH # returns the path
|
||||
list.dirs(PROMLPATH) # returns the directories in that path
|
||||
list.files(PROMLPATH) # lists the files [1] "proml" "proml.command"
|
||||
|
||||
# If "proml" is NOT among the files that the last command returns, you
|
||||
# can't continue. Ask on the mailing list for advice.
|
||||
|
||||
# If everything is good, you can add the line that defines PROMLPATH to
|
||||
# myScripts/.myProfile.R - the path will then be automatically set when
|
||||
# you quit RStudio and return.
|
||||
|
||||
|
||||
# == 1.2 Building a maximum likelihood tree ================================
|
||||
# Now read the mfa file you have saved in the BIN-PHYLO-Data_preparation unit,
|
||||
# as a "proseq" object with the read.protein() function of the RPhylip package:
|
||||
|
||||
apsIn <- Rphylip::read.protein("data/APSESphyloSet.mfa")
|
||||
str(apsIn)
|
||||
|
||||
# ... and you are ready to build a tree.
|
||||
|
||||
# There are many fast options in PHYLIP - we will use the most _accurate_ one
|
||||
# that it has: proml, a maximum-likelihood tree building program for protein
|
||||
# data.
|
||||
|
||||
# Building maximum-likelihood trees can eat as much computer time
|
||||
# as you can throw at it. Calculating a tree of 48 APSES domains
|
||||
# with default parameters of Rproml() runs for more than half a day
|
||||
# on my computer. But we have only twelve sequences here, so the
|
||||
# process will take us about 5 to 15 minutes. Run this, and enjoy a good cup
|
||||
# of coffee while you are waiting.
|
||||
|
||||
apsTree <- Rphylip::Rproml(apsIn, path=PROMLPATH)
|
||||
|
||||
# A quick first look:
|
||||
|
||||
plot(apsTree)
|
||||
|
||||
# save your tree:
|
||||
saveRDS(apsTree, file = "data/APSEStreeRproml.rds")
|
||||
|
||||
# If this did not work, ask for advice.
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "BIN-PHYLO-Tree_building.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-PHYLO-Tree_building unit.
|
||||
#
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017-10 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 deprecate save()/load() for saveRDS()/readRDS(); Mac:
|
||||
# instructions to authorize proml.app
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# 1.0 First 2017 version
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
# Add MrBayes
|
||||
# https://cran.r-project.org/web/packages/phangorn/vignettes/IntertwiningTreesAndNetworks.html
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -----------------------------------------------------------
|
||||
#TOC> 1 Calculating Trees 48
|
||||
#TOC> 1.1 PROMLPATH ... 68
|
||||
#TOC> 1.1.1 ... on the Mac 73
|
||||
#TOC> 1.1.2 ... on Windows 101
|
||||
#TOC> 1.1.3 ... on Linux 115
|
||||
#TOC> 1.1.4 Confirming PROMLPATH 120
|
||||
#TOC> 1.2 Building a maximum likelihood tree 134
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Calculating Trees ===================================================
|
||||
|
||||
|
||||
# Follow the instructions found at phylip's home on the Web to install. If you
|
||||
# are on a Windows computer, take note of the installation directory.
|
||||
|
||||
# After you have installed Phylip on your computer, install the R package that
|
||||
# provides an interface to the Phylip functions.
|
||||
|
||||
if (! requireNamespace("Rphylip", quietly = TRUE)) {
|
||||
install.packages("Rphylip")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = Rphylip) # basic information
|
||||
# browseVignettes("Rphylip") # available vignettes
|
||||
# data(package = "Rphylip") # available datasets
|
||||
|
||||
# This will install RPhylip, as well as its dependency, the package "ape".
|
||||
|
||||
|
||||
# == 1.1 PROMLPATH ... =====================================================
|
||||
# The next part may be tricky. You will need to figure out where
|
||||
# on your computer Phylip has been installed and define the path
|
||||
# to the proml program that calculates a maximum-likelihood tree.
|
||||
|
||||
# === 1.1.1 ... on the Mac
|
||||
# On the Mac, the standard installation places a phylip folder
|
||||
# in the /Applications directory. That folder contains all the
|
||||
# individual phylip programs as <name>.app files. These are not
|
||||
# the actual executables, but "app" files are actually directories
|
||||
# that contain the required resources for a program to run.
|
||||
|
||||
# The executable is in a subdirectory and you can point Rphylip
|
||||
# directly to that subdirectory to find the program it needs:
|
||||
# PROMLPATH <- "/Applications/phylip-3.695/exe/proml.app/Contents/MacOS"
|
||||
|
||||
# However, RPHYLIP will not be able to run PHYLIP applications immediately,
|
||||
# because they have not been "signed" by the PHYLIP developers. The process
|
||||
# will be terminated by your system, with a warning.
|
||||
|
||||
# - Navigate to the phylip folder in your /Applications directory
|
||||
# - Descend into the "exe" folder and find proml.app
|
||||
# - Ctrl-click proml.app and choose "Open". A dialogue will show that
|
||||
# says: "macOS cannot verify the developer of “proml.app”.
|
||||
# Are you sure you want to open it?"
|
||||
# - Click open to continue. You may need to allow access to the terminal
|
||||
# as well. When the proml terminal session opens, you can type
|
||||
# Ctrl-c to abort the program and close the window.
|
||||
#
|
||||
# This adds proml.app to the list of known-good programs and you will not
|
||||
# need to repeat this process.
|
||||
#
|
||||
|
||||
# === 1.1.2 ... on Windows
|
||||
# On Windows you need to know where the programs have been installed, and you
|
||||
# need to specify a path that is correct for the Windows OS. Find the folder
|
||||
# that is named "exe", and right-click to inspect its properties. The path
|
||||
# should be listed among them.
|
||||
|
||||
# If the path looks like "C:\Users\Meng\Programs\phylip-3.695\exe", then your
|
||||
# assignment has to be
|
||||
# PROMLPATH <- "C:/Users/Meng/Programs/phylip-3.695/exe"
|
||||
# (Note: "/", not "\")
|
||||
|
||||
# I have heard that your path must not contain spaces, and it is prudent to
|
||||
# avoid other special characters as well.
|
||||
|
||||
# === 1.1.3 ... on Linux
|
||||
# If you are running Linux I trust you know what to do. It's probably
|
||||
# something like
|
||||
# PROMLPATH <- "/usr/local/phylip-3.695/bin"
|
||||
|
||||
# === 1.1.4 Confirming PROMLPATH
|
||||
# Confirm that the settings are right.
|
||||
PROMLPATH # returns the path
|
||||
list.dirs(PROMLPATH) # returns the directories in that path
|
||||
list.files(PROMLPATH) # lists the files [1] "proml" "proml.command"
|
||||
|
||||
# If "proml" is NOT among the files that the last command returns, you
|
||||
# can't continue. Ask on the mailing list for advice.
|
||||
|
||||
# If everything is good, you can add the line that defines PROMLPATH to
|
||||
# myScripts/.myProfile.R - the path will then be automatically set when
|
||||
# you quit RStudio and return.
|
||||
|
||||
|
||||
# == 1.2 Building a maximum likelihood tree ================================
|
||||
# Now read the mfa file you have saved in the BIN-PHYLO-Data_preparation unit,
|
||||
# as a "proseq" object with the read.protein() function of the RPhylip package:
|
||||
|
||||
apsIn <- Rphylip::read.protein("data/APSESphyloSet.mfa")
|
||||
str(apsIn)
|
||||
|
||||
# ... and you are ready to build a tree.
|
||||
|
||||
# There are many fast options in PHYLIP - we will use the most _accurate_ one
|
||||
# that it has: proml, a maximum-likelihood tree building program for protein
|
||||
# data.
|
||||
|
||||
# Building maximum-likelihood trees can eat as much computer time
|
||||
# as you can throw at it. Calculating a tree of 48 APSES domains
|
||||
# with default parameters of Rproml() runs for more than half a day
|
||||
# on my computer. But we have only twelve sequences here, so the
|
||||
# process will take us about 5 to 15 minutes. Run this, and enjoy a good cup
|
||||
# of coffee while you are waiting.
|
||||
|
||||
apsTree <- Rphylip::Rproml(apsIn, path=PROMLPATH)
|
||||
|
||||
# A quick first look:
|
||||
|
||||
plot(apsTree)
|
||||
|
||||
# save your tree:
|
||||
saveRDS(apsTree, file = "data/APSEStreeRproml.rds")
|
||||
|
||||
# If this did not work, ask for advice.
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
|
@ -1,323 +1,323 @@
|
||||
# tocID <- "BIN-PPI-Analysis.R"
|
||||
#
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-PPI-Analysis unit.
|
||||
#
|
||||
# Version: 1.4
|
||||
#
|
||||
# Date: 2017-08 - 2020-10
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.4 Update vector ID's for betweenness centrality.
|
||||
# 1.3 Bugfix: called the wrong function on ENSPsel in l. 220
|
||||
# 1.2 2020 Updates; Rewrite for new STRING v11;
|
||||
# Deprecate save()/load() for saveRDS()/readRDS()
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# use BiocManager:: not biocLite()
|
||||
# 1.0 First live version
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------------------------------
|
||||
#TOC> 1 Setup and data 50
|
||||
#TOC> 2 Functional Edges in the Human Proteome 86
|
||||
#TOC> 2.1 Cliques 129
|
||||
#TOC> 2.2 Communities 170
|
||||
#TOC> 2.3 Betweenness Centrality 184
|
||||
#TOC> 3 biomaRt 231
|
||||
#TOC> 4 Task for submission 302
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Setup and data ======================================================
|
||||
|
||||
|
||||
# Not surprisingly, the analysis of PPI networks needs iGraph:
|
||||
|
||||
if (! requireNamespace("igraph", quietly = TRUE)) {
|
||||
install.packages("igraph")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = igraph) # basic information
|
||||
# browseVignettes("igraph") # available vignettes
|
||||
# data(package = "igraph") # available datasets
|
||||
|
||||
# In order for you to explore some real, biological networks, I give you a
|
||||
# dataframe of functional relationships of human proteins that I have downloaded
|
||||
# from the STRING database. The full table has 8.5 million records, here is a
|
||||
# subset of records with combined confidence scores > 980
|
||||
|
||||
# The selected set of edges with a confidence of > 964 is a dataframe with about
|
||||
# 50,000 edges and 8,400 unique proteins. Incidentally, that's about the size of
|
||||
# a fungal proteome. You can load the saved dataframe here (To read more about
|
||||
# what the scores mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).
|
||||
|
||||
STRINGedges <- readRDS("./data/STRINGedges.rds")
|
||||
|
||||
head(STRINGedges)
|
||||
|
||||
# Note that STRING has prepended the tax-ID for Homo sapiens - 9606 - to the
|
||||
# Ensembl protein identifiers that start with ENSP. We'll remove them:
|
||||
|
||||
STRINGedges$a <- gsub("^9606\\.", "", STRINGedges$a)
|
||||
STRINGedges$b <- gsub("^9606\\.", "", STRINGedges$b)
|
||||
|
||||
head(STRINGedges)
|
||||
|
||||
|
||||
# = 2 Functional Edges in the Human Proteome ==============================
|
||||
|
||||
|
||||
# There are many possibilities to explore interesting aspects of biological
|
||||
# networks, we will keep with some very simple procedures here but you have
|
||||
# to be aware that this is barely scratching the surface of possibilities.
|
||||
# However, once the network exists in your computer, it is comparatively
|
||||
# easy to find information online about the many, many options to analyze.
|
||||
|
||||
|
||||
# Make a graph from this dataframe
|
||||
?igraph::graph_from_data_frame
|
||||
|
||||
gSTR <- igraph::graph_from_data_frame(STRINGedges, directed = FALSE)
|
||||
|
||||
# CAUTION you DON'T want to plot a graph with 8,000 nodes and 50,000 edges -
|
||||
# layout of such large graphs is possible, but requires specialized code. Google
|
||||
# for <layout large graphs> if you are curious. Also, consider what one can
|
||||
# really learn from plotting such a graph ...
|
||||
|
||||
# Of course simple computations on this graph are reasonably fast:
|
||||
|
||||
compSTR <- igraph::components(gSTR)
|
||||
summary(compSTR) # our graph is fully connected!
|
||||
|
||||
hist(log(igraph::degree(gSTR)), col="#FEE0AF")
|
||||
# this actually does look rather scale-free
|
||||
|
||||
(freqRank <- table(igraph::degree(gSTR)))
|
||||
plot(log10(as.numeric(names(freqRank)) + 1),
|
||||
log10(as.numeric(freqRank)), type = "b",
|
||||
pch = 21, bg = "#FEE0AF",
|
||||
xlab = "log(Rank)", ylab = "log(frequency)",
|
||||
main = "8,400 nodes from the human functional interaction network")
|
||||
|
||||
# This looks very scale-free indeed.
|
||||
|
||||
(regressionLine <- lm(log10(as.numeric(freqRank)) ~
|
||||
log10(as.numeric(names(freqRank)) + 1)))
|
||||
abline(regressionLine, col = "firebrick")
|
||||
|
||||
# Now explore some more:
|
||||
|
||||
# == 2.1 Cliques ===========================================================
|
||||
|
||||
# Let's find the largest cliques. Remember: a clique is a fully connected
|
||||
# subgraph, i.e. a subgraph in which every node is connected to every other.
|
||||
# Biological complexes often appear as cliques in interaction graphs.
|
||||
|
||||
igraph::clique_num(gSTR)
|
||||
# The largest clique has 81 members.
|
||||
|
||||
(C <- igraph::largest_cliques(gSTR)[[1]])
|
||||
|
||||
# Pick one of the proteins and find out what this fully connected cluster of 81
|
||||
# proteins is (you can simply Google for any of the IDs). Is this expected?
|
||||
|
||||
# Plot this ...
|
||||
R <- igraph::induced_subgraph(gSTR, C) # a graph from a selected set of vertices
|
||||
|
||||
# color the vertices along a color spectrum
|
||||
vCol <- rainbow(igraph::gorder(R)) # "order" of a graph == number of nodes
|
||||
|
||||
# color the edges to have the same color as the originating node
|
||||
# Give every edge the color of one of the two vertices it joins.
# igraph::ends() returns one row per edge of R (columns: the two endpoint
# vertex indices), so eCol holds exactly one color per edge instead of a
# vector that is grown in a loop to gorder(R)^2 entries.
edgeEnds <- igraph::ends(R, igraph::E(R), names = FALSE)
eCol <- vCol[edgeEnds[, 1]]
|
||||
|
||||
oPar <- par(mar= rep(0,4)) # Turn margins off
|
||||
plot(R,
|
||||
layout = igraph::layout_in_circle(R),
|
||||
vertex.size = 3,
|
||||
vertex.color = vCol,
|
||||
edge.color = eCol,
|
||||
edge.width = 0.1,
|
||||
vertex.label = NA)
|
||||
par(oPar)
|
||||
|
||||
# ... well: remember: a clique means every node is connected to every other
|
||||
# node: 81 * 80 / 2 = 3,240 edges in a simple graph. This is what a matrix model of PPI
|
||||
# networks looks like for large complexes.
|
||||
|
||||
|
||||
# == 2.2 Communities =======================================================
|
||||
|
||||
set.seed(112358) # set RNG seed for repeatable randomness
|
||||
gSTRclusters <- igraph::cluster_infomap(gSTR)
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
igraph::modularity(gSTRclusters) # ... measures how separated the different
|
||||
# membership types are from each other
|
||||
tMem <- table(igraph::membership(gSTRclusters))
|
||||
length(tMem) # About 700 communities identified
|
||||
hist(tMem, breaks = 50, col = "skyblue") # most clusters are small ...
|
||||
range(tMem) # ... but one has > 200 members
|
||||
|
||||
|
||||
# == 2.3 Betweenness Centrality ============================================
|
||||
|
||||
# Let's find the nodes with the ten highest betweenness centralities.
|
||||
#
|
||||
BC <- igraph::centr_betw(gSTR)
|
||||
|
||||
# remember: BC$res contains the results
|
||||
head(BC$res)
|
||||
|
||||
BC$res[1] # betweenness centrality of node 1 in the graph ...
|
||||
# ... which one is node 1?
|
||||
igraph::V(gSTR)[1]
|
||||
|
||||
# to get the ten-highest nodes, we simply label the elements of BC with their
|
||||
# index ...
|
||||
# Label each score with its vertex index. seq_along() also behaves correctly
# for an empty vector, where 1:length(x) would yield c(1, 0).
names(BC$res) <- as.character(seq_along(BC$res))
|
||||
|
||||
# ... and then we sort:
|
||||
sBC <- sort(BC$res, decreasing = TRUE)
|
||||
head(sBC)
|
||||
|
||||
# This ordered vector means: node 3 has the highest betweenness centrality,
|
||||
# node 721 has the second highest, etc.
|
||||
|
||||
(BCsel <- as.numeric(names(sBC)[1:10]))
|
||||
|
||||
# We can use the first ten labels to subset the nodes in gSTR and fetch the
|
||||
# IDs...
|
||||
(ENSPsel <- names(igraph::V(gSTR)[BCsel]))
|
||||
|
||||
# Task:
|
||||
# =====
|
||||
# IMPORTANT, IF YOU INTEND TO SUBMIT YOUR ANALYSIS FOR CREDIT
|
||||
# We are going to use these IDs to produce some output for a submitted task:
|
||||
# therefore I need you to execute the following line, note the "seal" that this
|
||||
# returns, and not change myENSPsel later:
|
||||
|
||||
myENSPsel <- selectENSP(ENSPsel)
|
||||
|
||||
# Next, to find what these proteins are...
|
||||
|
||||
# We could now Google for all of these IDs to learn more about them. But really,
|
||||
# googling for IDs one after the other, that would be lame. Let's instead use
|
||||
# the very, very useful biomaRt package to translate these Ensembl IDs into
|
||||
# gene symbols.
|
||||
|
||||
|
||||
# = 3 biomaRt =============================================================
|
||||
|
||||
|
||||
# IDs are just labels, but for _bio_informatics we need to learn more about the
|
||||
# biological function of the genes or proteins that we retrieve via graph data
|
||||
# mining. biomaRt is the tool of choice. It's a package distributed by the
|
||||
# bioconductor project. This here is not a biomaRt tutorial (that's for another
|
||||
# day), simply a few lines of sample code to get you started on the specific use
|
||||
# case of retrieving descriptions for ensembl protein IDs.
|
||||
|
||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
if (! requireNamespace("biomaRt", quietly = TRUE)) {
|
||||
BiocManager::install("biomaRt")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = biomaRt) # basic information
|
||||
# browseVignettes("biomaRt") # available vignettes
|
||||
# data(package = "biomaRt") # available datasets
|
||||
|
||||
# define which dataset to use ... this takes a while for download
|
||||
myMart <- biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl")
|
||||
|
||||
# what filters are defined?
|
||||
( filters <- biomaRt::listFilters(myMart) )
|
||||
|
||||
|
||||
# and what attributes can we retrieve?
|
||||
( attributes <- biomaRt::listAttributes(myMart) )
|
||||
|
||||
|
||||
# Soooo many options - let's look for the correct name of filters that are
|
||||
# useful for ENSP IDs ...
|
||||
filters[grep("ENSP", filters$description), ]
|
||||
|
||||
# ... and the correct attribute names for gene symbols and descriptions ...
|
||||
attributes[grep("symbol", attributes$description, ignore.case = TRUE), ]
|
||||
attributes[grep("description", attributes$description, ignore.case = TRUE), ]
|
||||
|
||||
|
||||
# ... so we can put this together: here is a syntax example:
|
||||
biomaRt::getBM(filters = "ensembl_peptide_id",
|
||||
attributes = c("hgnc_symbol",
|
||||
"wikigene_description",
|
||||
"interpro_description",
|
||||
"phenotype_description"),
|
||||
values = "ENSP00000000442",
|
||||
mart = myMart)
|
||||
|
||||
# A simple loop will now get us the information for our 10 most central genes
|
||||
# from the human subset of STRING.
|
||||
|
||||
CPdefs <- list() # Since we don't know how many matches one of our queries
|
||||
# will return, we'll put the result dataframes into a list.
|
||||
|
||||
for (ID in myENSPsel) {
|
||||
CPdefs[[ID]] <- biomaRt::getBM(filters = "ensembl_peptide_id",
|
||||
attributes = c("hgnc_symbol",
|
||||
"wikigene_description",
|
||||
"interpro_description",
|
||||
"phenotype_description"),
|
||||
values = ID,
|
||||
mart = myMart)
|
||||
}
|
||||
|
||||
|
||||
# So what are the proteins with the ten highest betweenness centralities?
|
||||
# ... are you surprised? (I am! Really.)
|
||||
|
||||
|
||||
# = 4 Task for submission =================================================
|
||||
|
||||
# Write a loop that will go through your personalized list of Ensembl IDs and
|
||||
# for each ID:
|
||||
# -- print the ID,
|
||||
# -- print the first row's HGNC symbol,
|
||||
# -- print the first row's wikigene description.
|
||||
# -- print the first row's phenotype.
|
||||
#
|
||||
# Write your thoughts about this group of genes.
|
||||
#
|
||||
# (Hint, you can structure your loop in the same way as the loop that
|
||||
# created CPdefs. )
|
||||
|
||||
# Submit the "seal" for your ENSP vector, the ENSP vector itself, the R code
|
||||
# for this loop and its output into your report if you are submitting
|
||||
# anything for credit for this unit. Please read the requirements carefully.
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "BIN-PPI-Analysis.R"
|
||||
#
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-PPI-Analysis unit.
|
||||
#
|
||||
# Version: 1.4
|
||||
#
|
||||
# Date: 2017-08 - 2020-10
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.4 Update vector ID's for betweenness centrality.
|
||||
# 1.3 Bugfix: called the wrong function on ENSPsel in l. 220
|
||||
# 1.2 2020 Updates; Rewrite for new STRING v11;
|
||||
# Deprecate save()/load() for saveRDS()/readRDS()
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# use BiocManager:: not biocLite()
|
||||
# 1.0 First live version
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------------------------------
|
||||
#TOC> 1 Setup and data 50
|
||||
#TOC> 2 Functional Edges in the Human Proteome 86
|
||||
#TOC> 2.1 Cliques 129
|
||||
#TOC> 2.2 Communities 170
|
||||
#TOC> 2.3 Betweenness Centrality 184
|
||||
#TOC> 3 biomaRt 231
|
||||
#TOC> 4 Task for submission 302
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Setup and data ======================================================
|
||||
|
||||
|
||||
# Not surprisingly, the analysis of PPI networks needs iGraph:
|
||||
|
||||
if (! requireNamespace("igraph", quietly = TRUE)) {
|
||||
install.packages("igraph")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = igraph) # basic information
|
||||
# browseVignettes("igraph") # available vignettes
|
||||
# data(package = "igraph") # available datasets
|
||||
|
||||
# In order for you to explore some real, biological networks, I give you a
|
||||
# dataframe of functional relationships of human proteins that I have downloaded
|
||||
# from the STRING database. The full table has 8.5 million records, here is a
|
||||
# subset of records with combined confidence scores > 980
|
||||
|
||||
# The selected set of edges with a confidence of > 964 is a dataframe with about
|
||||
# 50,000 edges and 8,400 unique proteins. Incidentally, that's about the size of
|
||||
# a fungal proteome. You can load the saved dataframe here (To read more about
|
||||
# what the scores mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).
|
||||
|
||||
STRINGedges <- readRDS("./data/STRINGedges.rds")
|
||||
|
||||
head(STRINGedges)
|
||||
|
||||
# Note that STRING has prepended the tax-ID for Homo sapiens - 9606 - to the
|
||||
# Ensembl protein identifiers that start with ENSP. We'll remove them:
|
||||
|
||||
STRINGedges$a <- gsub("^9606\\.", "", STRINGedges$a)
|
||||
STRINGedges$b <- gsub("^9606\\.", "", STRINGedges$b)
|
||||
|
||||
head(STRINGedges)
|
||||
|
||||
|
||||
# = 2 Functional Edges in the Human Proteome ==============================
|
||||
|
||||
|
||||
# There are many possibilities to explore interesting aspects of biological
|
||||
# networks, we will keep with some very simple procedures here but you have
|
||||
# to be aware that this is barely scratching the surface of possibilities.
|
||||
# However, once the network exists in your computer, it is comparatively
|
||||
# easy to find information online about the many, many options to analyze.
|
||||
|
||||
|
||||
# Make a graph from this dataframe
|
||||
?igraph::graph_from_data_frame
|
||||
|
||||
gSTR <- igraph::graph_from_data_frame(STRINGedges, directed = FALSE)
|
||||
|
||||
# CAUTION you DON'T want to plot a graph with 8,000 nodes and 50,000 edges -
|
||||
# layout of such large graphs is possible, but requires specialized code. Google
|
||||
# for <layout large graphs> if you are curious. Also, consider what one can
|
||||
# really learn from plotting such a graph ...
|
||||
|
||||
# Of course simple computations on this graph are reasonably fast:
|
||||
|
||||
compSTR <- igraph::components(gSTR)
|
||||
summary(compSTR) # our graph is fully connected!
|
||||
|
||||
hist(log(igraph::degree(gSTR)), col="#FEE0AF")
|
||||
# this actually does look rather scale-free
|
||||
|
||||
(freqRank <- table(igraph::degree(gSTR)))
|
||||
plot(log10(as.numeric(names(freqRank)) + 1),
|
||||
log10(as.numeric(freqRank)), type = "b",
|
||||
pch = 21, bg = "#FEE0AF",
|
||||
xlab = "log(Rank)", ylab = "log(frequency)",
|
||||
main = "8,400 nodes from the human functional interaction network")
|
||||
|
||||
# This looks very scale-free indeed.
|
||||
|
||||
(regressionLine <- lm(log10(as.numeric(freqRank)) ~
|
||||
log10(as.numeric(names(freqRank)) + 1)))
|
||||
abline(regressionLine, col = "firebrick")
|
||||
|
||||
# Now explore some more:
|
||||
|
||||
# == 2.1 Cliques ===========================================================
|
||||
|
||||
# Let's find the largest cliques. Remember: a clique is a fully connected
|
||||
# subgraph, i.e. a subgraph in which every node is connected to every other.
|
||||
# Biological complexes often appear as cliques in interaction graphs.
|
||||
|
||||
igraph::clique_num(gSTR)
|
||||
# The largest clique has 81 members.
|
||||
|
||||
(C <- igraph::largest_cliques(gSTR)[[1]])
|
||||
|
||||
# Pick one of the proteins and find out what this fully connected cluster of 81
|
||||
# proteins is (you can simply Google for any of the IDs). Is this expected?
|
||||
|
||||
# Plot this ...
|
||||
R <- igraph::induced_subgraph(gSTR, C) # a graph from a selected set of vertices
|
||||
|
||||
# color the vertices along a color spectrum
|
||||
vCol <- rainbow(igraph::gorder(R)) # "order" of a graph == number of nodes
|
||||
|
||||
# color the edges to have the same color as the originating node
|
||||
eCol <- character()
|
||||
for (i in seq_along(vCol)) {
|
||||
eCol <- c(eCol, rep(vCol[i], igraph::gorder(R)))
|
||||
}
|
||||
|
||||
oPar <- par(mar= rep(0,4)) # Turn margins off
|
||||
plot(R,
|
||||
layout = igraph::layout_in_circle(R),
|
||||
vertex.size = 3,
|
||||
vertex.color = vCol,
|
||||
edge.color = eCol,
|
||||
edge.width = 0.1,
|
||||
vertex.label = NA)
|
||||
par(oPar)
|
||||
|
||||
# ... well: remember: a clique means every node is connected to every other
|
||||
# node. We have 81 * 80 / 2 = 3,240 unique edges. This is what a matrix model of PPI
|
||||
# networks looks like for large complexes.
|
||||
|
||||
|
||||
# == 2.2 Communities =======================================================
|
||||
|
||||
set.seed(112358) # set RNG seed for repeatable randomness
|
||||
gSTRclusters <- igraph::cluster_infomap(gSTR)
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
igraph::modularity(gSTRclusters) # ... measures how separated the different
|
||||
# membership types are from each other
|
||||
tMem <- table(igraph::membership(gSTRclusters))
|
||||
length(tMem) # About 700 communities identified
|
||||
hist(tMem, breaks = 50, col = "skyblue") # most clusters are small ...
|
||||
range(tMem) # ... but one has > 200 members
|
||||
|
||||
|
||||
# == 2.3 Betweenness Centrality ============================================
|
||||
|
||||
# Let's find the nodes with the ten highest betweenness centralities.
|
||||
#
|
||||
BC <- igraph::centr_betw(gSTR)
|
||||
|
||||
# remember: BC$res contains the results
|
||||
head(BC$res)
|
||||
|
||||
BC$res[1] # betweenness centrality of node 1 in the graph ...
|
||||
# ... which one is node 1?
|
||||
igraph::V(gSTR)[1]
|
||||
|
||||
# to get the ten-highest nodes, we simply label the elements of BC with their
|
||||
# index ...
|
||||
# Label each centrality value with its node index; seq_along() stays correct
# even for a zero-length vector, where 1:length() would yield c(1, 0).
names(BC$res) <- as.character(seq_along(BC$res))
|
||||
|
||||
# ... and then we sort:
|
||||
sBC <- sort(BC$res, decreasing = TRUE)
|
||||
head(sBC)
|
||||
|
||||
# This ordered vector means: node 3 has the highest betweenness centrality,
|
||||
# node 721 has the second highest, etc.
|
||||
|
||||
(BCsel <- as.numeric(names(sBC)[1:10]))
|
||||
|
||||
# We can use the first ten labels to subset the nodes in gSTR and fetch the
|
||||
# IDs...
|
||||
(ENSPsel <- names(igraph::V(gSTR)[BCsel]))
|
||||
|
||||
# Task:
|
||||
# =====
|
||||
# IMPORTANT, IF YOU INTEND TO SUBMIT YOUR ANALYSIS FOR CREDIT
|
||||
# We are going to use these IDs to produce some output for a submitted task:
|
||||
# therefore I need you to execute the following line, note the "seal" that this
|
||||
# returns, and not change myENSPsel later:
|
||||
|
||||
myENSPsel <- selectENSP(ENSPsel)
|
||||
|
||||
# Next, to find what these proteins are...
|
||||
|
||||
# We could now Google for all of these IDs to learn more about them. But really,
|
||||
# googling for IDs one after the other, that would be lame. Let's instead use
|
||||
# the very, very useful biomaRt package to translate these Ensembl IDs into
|
||||
# gene symbols.
|
||||
|
||||
|
||||
# = 3 biomaRt =============================================================
|
||||
|
||||
|
||||
# IDs are just labels, but for _bio_informatics we need to learn more about the
|
||||
# biological function of the genes or proteins that we retrieve via graph data
|
||||
# mining. biomaRt is the tool of choice. It's a package distributed by the
|
||||
# bioconductor project. This here is not a biomaRt tutorial (that's for another
|
||||
# day), simply a few lines of sample code to get you started on the specific use
|
||||
# case of retrieving descriptions for ensembl protein IDs.
|
||||
|
||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
if (! requireNamespace("biomaRt", quietly = TRUE)) {
|
||||
BiocManager::install("biomaRt")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = biomaRt) # basic information
|
||||
# browseVignettes("biomaRt") # available vignettes
|
||||
# data(package = "biomaRt") # available datasets
|
||||
|
||||
# define which dataset to use ... this takes a while for download
|
||||
myMart <- biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl")
|
||||
|
||||
# what filters are defined?
|
||||
( filters <- biomaRt::listFilters(myMart) )
|
||||
|
||||
|
||||
# and what attributes can we retrieve?
|
||||
( attributes <- biomaRt::listAttributes(myMart) )
|
||||
|
||||
|
||||
# Soooo many options - let's look for the correct name of filters that are
|
||||
# useful for ENSP IDs ...
|
||||
filters[grep("ENSP", filters$description), ]
|
||||
|
||||
# ... and the correct attribute names for gene symbols and descriptions ...
|
||||
attributes[grep("symbol", attributes$description, ignore.case = TRUE), ]
|
||||
attributes[grep("description", attributes$description, ignore.case = TRUE), ]
|
||||
|
||||
|
||||
# ... so we can put this together: here is a syntax example:
|
||||
biomaRt::getBM(filters = "ensembl_peptide_id",
|
||||
attributes = c("hgnc_symbol",
|
||||
"wikigene_description",
|
||||
"interpro_description",
|
||||
"phenotype_description"),
|
||||
values = "ENSP00000000442",
|
||||
mart = myMart)
|
||||
|
||||
# A simple loop will now get us the information for our 10 most central genes
|
||||
# from the human subset of STRING.
|
||||
|
||||
CPdefs <- list() # Since we don't know how many matches one of our queries
|
||||
# will return, we'll put the result dataframes into a list.
|
||||
|
||||
for (ID in myENSPsel) {
|
||||
CPdefs[[ID]] <- biomaRt::getBM(filters = "ensembl_peptide_id",
|
||||
attributes = c("hgnc_symbol",
|
||||
"wikigene_description",
|
||||
"interpro_description",
|
||||
"phenotype_description"),
|
||||
values = ID,
|
||||
mart = myMart)
|
||||
}
|
||||
|
||||
|
||||
# So what are the proteins with the ten highest betweenness centralities?
|
||||
# ... are you surprised? (I am! Really.)
|
||||
|
||||
|
||||
# = 4 Task for submission =================================================
|
||||
|
||||
# Write a loop that will go through your personalized list of Ensembl IDs and
|
||||
# for each ID:
|
||||
# -- print the ID,
|
||||
# -- print the first row's HGNC symbol,
|
||||
# -- print the first row's wikigene description.
|
||||
# -- print the first row's phenotype.
|
||||
#
|
||||
# Write your thoughts about this group of genes.
|
||||
#
|
||||
# (Hint, you can structure your loop in the same way as the loop that
|
||||
# created CPdefs. )
|
||||
|
||||
# Submit the "seal" for your ENSP vector, the ENSP vector itself, the R code
|
||||
# for this loop and its output into your report if you are submitting
|
||||
# anything for credit for this unit. Please read the requirements carefully.
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
|
@ -1,252 +1,252 @@
|
||||
# tocID <- "BIN-SEQA-Composition.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-SEQA-Composition unit
|
||||
#
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017-11 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# 1.2 2020 Maintenance
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# use BiocManager:: not biocLite()
|
||||
# Versions:
|
||||
# 1.0 First live version 2017
|
||||
# 0.1 First code copied from BCH441_A03_makeYFOlist.R
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
|
||||
#
|
||||
# DO NOT SIMPLY source() THESE FILES!
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ----------------------------------------------------------
|
||||
#TOC> 1 Preparation 48
|
||||
#TOC> 2 Aggregate properties 69
|
||||
#TOC> 3 Sequence Composition Enrichment 113
|
||||
#TOC> 3.1 Barplot, and side-by-side barplot 136
|
||||
#TOC> 3.2 Plotting ratios 171
|
||||
#TOC> 3.3 Plotting log ratios 188
|
||||
#TOC> 3.4 Sort by frequency 204
|
||||
#TOC> 3.5 Color by amino acid type 221
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Preparation =========================================================
|
||||
|
||||
if (! requireNamespace("seqinr", quietly = TRUE)) {
|
||||
install.packages("seqinr")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = seqinr) # basic information
|
||||
# browseVignettes("seqinr") # available vignettes
|
||||
# data(package = "seqinr") # available datasets
|
||||
|
||||
# Load a reference sequence to work with:
|
||||
|
||||
# If you have done the BIN-Storing_data unit:
|
||||
source("makeProteinDB.R")
|
||||
sel <- which(myDB$protein$name == sprintf("MBP1_%s", biCode(MYSPE)))
|
||||
mySeq <- myDB$protein$sequence[sel]
|
||||
|
||||
# If not, use the yeast Mbp1 sequence:
|
||||
mySeq <- dbSanitizeSequence(fromJSON("./data/MBP1_SACCE.json")$sequence)
|
||||
|
||||
|
||||
# = 2 Aggregate properties ================================================
|
||||
|
||||
|
||||
# Let's try a simple function from seqinr: computing the pI of the sequence
|
||||
?seqinr::computePI
|
||||
|
||||
# This takes as input a vector of upper-case AA codes
|
||||
|
||||
# We can use the function strsplit() to split the string
|
||||
# into single characters
|
||||
|
||||
(s <- strsplit(mySeq, "")) # splitting on the empty string
|
||||
# splits into single characters
|
||||
s <- unlist(s) # strsplit() returns a list! Why?
|
||||
# (But we don't need a list now...)
|
||||
|
||||
# Alternatively, seqinr provides
|
||||
# the function s2c() to convert strings into
|
||||
# character vectors (and c2s to convert them back).
|
||||
|
||||
seqinr::s2c(mySeq)
|
||||
|
||||
|
||||
seqinr::computePI(seqinr::s2c(mySeq)) # isoelectric point
|
||||
seqinr::pmw(seqinr::s2c(mySeq)) # molecular weight
|
||||
seqinr::AAstat(seqinr::s2c(mySeq)) # This also plots the distribution of
|
||||
# values along the sequence
|
||||
|
||||
# A true Labor of Love has gone into the
|
||||
# compilation of the "aaindex" data:
|
||||
|
||||
?seqinr::aaindex
|
||||
data(aaindex, package = "seqinr") # "attach" the dataset - i.e. make it
|
||||
# accessible as an R object
|
||||
|
||||
length(aaindex) # no seqinr:: needed for the dataset since we just
|
||||
# "attached" it with data()
|
||||
|
||||
# Here are all the index descriptions
|
||||
for (i in 1:length(aaindex)) {
|
||||
cat(paste(i, ": ", aaindex[[i]]$D, "\n", sep=""))
|
||||
}
|
||||
|
||||
|
||||
# = 3 Sequence Composition Enrichment =====================================
|
||||
|
||||
|
||||
# Let's use one of the indices to calculate and plot amino-acid
|
||||
# composition enrichment:
|
||||
aaindex[[459]]$D
|
||||
|
||||
#
|
||||
# Let's construct an enrichment plot to compare average frequencies
|
||||
# with the amino acid counts in our sequence.
|
||||
|
||||
(refData <- aaindex[[459]]$I) # reference frequencies in %
|
||||
names(refData) <- seqinr::a(names(refData)) # change names to single-letter
|
||||
# code using seqinr's "a()" function
|
||||
sum(refData)
|
||||
refData # ... in %
|
||||
|
||||
|
||||
# tabulate the amino acid counts in mySeq
|
||||
(obsData <- table(seqinr::s2c(mySeq))) # counts
|
||||
(obsData <- 100 * (obsData / sum(obsData))) # frequencies
|
||||
|
||||
|
||||
# == 3.1 Barplot, and side-by-side barplot =================================
|
||||
|
||||
barplot(obsData, col = "#CCCCCC", cex.names = 0.7)
|
||||
abline(h = 100/20, col="#BB0000")
|
||||
|
||||
barplot(refData, col = "#BB0000", cex.names = 0.7)
|
||||
abline(h = 100/20, col="#555555")
|
||||
|
||||
# Ok: first problem - the values in obsData are in alphabetical order. But the
|
||||
# values in refData are in alphabetical order of amino acid name: alanine,
|
||||
# arginine, asparagine, aspartic acid ... A, R, N, D, E ... you will see this
|
||||
# order a lot - one of the old biochemistry tropes in the field. So we need to
|
||||
# re-order one of the vectors to match the other. That's easy though:
|
||||
refData
|
||||
(refData <- refData[names(obsData)])
|
||||
|
||||
barplot(refData, col = "#BB0000", cex.names = 0.7)
|
||||
abline(h = 100/20, col="#555555")
|
||||
|
||||
# To compare the values, we want to see them in a barplot, side-by-side ...
|
||||
barplot(rbind(obsData, refData),
|
||||
ylim = c(0, 12),
|
||||
beside = TRUE,
|
||||
col = c("#CCCCCC", "#BB0000"),
|
||||
cex.names = 0.7)
|
||||
abline(h = 100/20, col="#00000044")
|
||||
|
||||
# ... and add a legend
|
||||
legend (x = 1, y = 12,
|
||||
legend = c("mySeq", "Average composition"),
|
||||
fill = c("#CCCCCC", "#BB0000"),
|
||||
cex = 0.7,
|
||||
bty = "n")
|
||||
|
||||
|
||||
# == 3.2 Plotting ratios ===================================================
|
||||
|
||||
# To better compare the values, we'll calculate ratios between
|
||||
# obsData and refData
|
||||
|
||||
barplot(obsData / refData,
|
||||
col = "#CCCCCC",
|
||||
ylab = "Sequence / Average",
|
||||
ylim = c(0, 2.5),
|
||||
cex.names = 0.7)
|
||||
abline(h = 1, col="#BB0000")
|
||||
abline(h = c(1/2, 2), lty = 2, col="#BB000055")
|
||||
|
||||
# ... but ratios are not very good here, since the difference in height on the
|
||||
# plot now depends on the order we compare in: ratios of 1/2 and 2 (dotted
|
||||
# lines) are exactly the same fold-difference !
|
||||
|
||||
# == 3.3 Plotting log ratios ===============================================
|
||||
|
||||
# A better way to display this
|
||||
# is to plot log(ratios).
|
||||
|
||||
barplot(log(obsData / refData),
|
||||
col = "#CCCCCC",
|
||||
ylab = "log(Sequence / Average)",
|
||||
ylim = log(c(1/3, 3)),
|
||||
cex.names = 0.7)
|
||||
abline(h = log(1), col="#BB0000")
|
||||
abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055")
|
||||
|
||||
# Note how the two-fold difference lines are now the same distance from the
|
||||
# line of equal ratio.
|
||||
|
||||
# == 3.4 Sort by frequency =================================================
|
||||
|
||||
barplot(sort(log(obsData / refData), decreasing = TRUE),
|
||||
ylim = log(c(1/3, 3)),
|
||||
col = "#CCCCCC",
|
||||
ylab = "log(Sequence / Average)",
|
||||
cex.names = 0.7)
|
||||
abline(h = log(1), col="#BB0000")
|
||||
abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055")
|
||||
|
||||
yTxt <- log(0.9)
|
||||
arrows(4, yTxt, 0, yTxt, length = 0.07)
|
||||
text(5.5, yTxt, "Enriched", cex = 0.7)
|
||||
yTxt <- log(1.1)
|
||||
arrows(20, yTxt, 24, yTxt, length = 0.07)
|
||||
text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7)
|
||||
|
||||
# == 3.5 Color by amino acid type ==========================================
|
||||
|
||||
# Color the bars by amino acid type. Use AACOLS , defined in the .utilities.R
|
||||
# script, or define your own.
|
||||
|
||||
barplot(rep(1, 20), names.arg = names(AACOLS), col = AACOLS, cex.names = 0.5)
|
||||
|
||||
lR <- sort(log(obsData / refData), decreasing = TRUE)
|
||||
barplot(lR,
|
||||
ylim = log(c(1/3, 3)),
|
||||
col = AACOLS[names(lR)],
|
||||
ylab = "log(Sequence / Average)",
|
||||
cex.names = 0.7)
|
||||
abline(h = log(1), col="#00000055")
|
||||
abline(h = log(c(1/2, 2)), lty = 2, col="#00000033")
|
||||
|
||||
yTxt <- log(0.9)
|
||||
arrows(4, yTxt, 0, yTxt, length = 0.07)
|
||||
text(5.5, yTxt, "Enriched", cex = 0.7)
|
||||
yTxt <- log(1.1)
|
||||
arrows(20, yTxt, 24, yTxt, length = 0.07)
|
||||
text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7)
|
||||
|
||||
|
||||
# Task:
|
||||
# Interpret this plot. (Can you?) Which types of amino acids are enriched?
|
||||
# Depleted?
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "BIN-SEQA-Composition.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-SEQA-Composition unit
|
||||
#
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017-11 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# 1.2 2020 Maintenance
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# use BiocManager:: not biocLite()
|
||||
# Versions:
|
||||
# 1.0 First live version 2017
|
||||
# 0.1 First code copied from BCH441_A03_makeYFOlist.R
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
|
||||
#
|
||||
# DO NOT SIMPLY source() THESE FILES!
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ----------------------------------------------------------
|
||||
#TOC> 1 Preparation 48
|
||||
#TOC> 2 Aggregate properties 69
|
||||
#TOC> 3 Sequence Composition Enrichment 113
|
||||
#TOC> 3.1 Barplot, and side-by-side barplot 136
|
||||
#TOC> 3.2 Plotting ratios 171
|
||||
#TOC> 3.3 Plotting log ratios 188
|
||||
#TOC> 3.4 Sort by frequency 204
|
||||
#TOC> 3.5 Color by amino acid type 221
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Preparation =========================================================
|
||||
|
||||
if (! requireNamespace("seqinr", quietly = TRUE)) {
|
||||
install.packages("seqinr")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = seqinr) # basic information
|
||||
# browseVignettes("seqinr") # available vignettes
|
||||
# data(package = "seqinr") # available datasets
|
||||
|
||||
# Load a reference sequence to work with:
|
||||
|
||||
# If you have done the BIN-Storing_data unit:
|
||||
source("makeProteinDB.R")
|
||||
sel <- which(myDB$protein$name == sprintf("MBP1_%s", biCode(MYSPE)))
|
||||
mySeq <- myDB$protein$sequence[sel]
|
||||
|
||||
# If not, use the yeast Mbp1 sequence:
|
||||
mySeq <- dbSanitizeSequence(fromJSON("./data/MBP1_SACCE.json")$sequence)
|
||||
|
||||
|
||||
# = 2 Aggregate properties ================================================
|
||||
|
||||
|
||||
# Let's try a simple function from seqinr: computing the pI of the sequence
|
||||
?seqinr::computePI
|
||||
|
||||
# This takes as input a vector of upper-case AA codes
|
||||
|
||||
# We can use the function strsplit() to split the string
|
||||
# into single characters
|
||||
|
||||
(s <- strsplit(mySeq, "")) # splitting on the empty string
|
||||
# splits into single characters
|
||||
s <- unlist(s) # strsplit() returns a list! Why?
|
||||
# (But we don't need a list now...)
|
||||
|
||||
# Alternatively, seqinr provides
|
||||
# the function s2c() to convert strings into
|
||||
# character vectors (and c2s to convert them back).
|
||||
|
||||
seqinr::s2c(mySeq)
|
||||
|
||||
|
||||
seqinr::computePI(seqinr::s2c(mySeq)) # isoelectric point
|
||||
seqinr::pmw(seqinr::s2c(mySeq)) # molecular weight
|
||||
seqinr::AAstat(seqinr::s2c(mySeq)) # This also plots the distribution of
|
||||
# values along the sequence
|
||||
|
||||
# A true Labor of Love has gone into the
|
||||
# compilation of the "aaindex" data:
|
||||
|
||||
?seqinr::aaindex
|
||||
data(aaindex, package = "seqinr") # "attach" the dataset - i.e. make it
|
||||
# accessible as an R object
|
||||
|
||||
length(aaindex) # no seqinr:: needed for the dataset since we just
|
||||
# "attached" it with data()
|
||||
|
||||
# Here are all the index descriptions
|
||||
for (i in 1:length(aaindex)) {
|
||||
cat(paste(i, ": ", aaindex[[i]]$D, "\n", sep=""))
|
||||
}
|
||||
|
||||
|
||||
# = 3 Sequence Composition Enrichment =====================================
|
||||
|
||||
|
||||
# Let's use one of the indices to calculate and plot amino-acid
|
||||
# composition enrichment:
|
||||
aaindex[[459]]$D
|
||||
|
||||
#
|
||||
# Let's construct an enrichment plot to compare average frequencies
|
||||
# with the amino acid counts in our sequence.
|
||||
|
||||
(refData <- aaindex[[459]]$I) # reference frequencies in %
|
||||
names(refData) <- seqinr::a(names(refData)) # change names to single-letter
|
||||
# code using seqinr's "a()" function
|
||||
sum(refData)
|
||||
refData # ... in %
|
||||
|
||||
|
||||
# tabulate the amino acid counts in mySeq
|
||||
(obsData <- table(seqinr::s2c(mySeq))) # counts
|
||||
(obsData <- 100 * (obsData / sum(obsData))) # frequencies
|
||||
|
||||
|
||||
# == 3.1 Barplot, and side-by-side barplot =================================
|
||||
|
||||
barplot(obsData, col = "#CCCCCC", cex.names = 0.7)
|
||||
abline(h = 100/20, col="#BB0000")
|
||||
|
||||
barplot(refData, col = "#BB0000", cex.names = 0.7)
|
||||
abline(h = 100/20, col="#555555")
|
||||
|
||||
# Ok: first problem - the values in obsData are in alphabetical order. But the
|
||||
# values in refData are in alphabetical order of amino acid name: alanine,
|
||||
# arginine, asparagine, aspartic acid ... A, R, N, D, E ... you will see this
|
||||
# order a lot - one of the old biochemistry tropes in the field. So we need to
|
||||
# re-order one of the vectors to match the other. That's easy though:
|
||||
refData
|
||||
(refData <- refData[names(obsData)])
|
||||
|
||||
barplot(refData, col = "#BB0000", cex.names = 0.7)
|
||||
abline(h = 100/20, col="#555555")
|
||||
|
||||
# To compare the values, we want to see them in a barplot, side-by-side ...
|
||||
barplot(rbind(obsData, refData),
|
||||
ylim = c(0, 12),
|
||||
beside = TRUE,
|
||||
col = c("#CCCCCC", "#BB0000"),
|
||||
cex.names = 0.7)
|
||||
abline(h = 100/20, col="#00000044")
|
||||
|
||||
# ... and add a legend
|
||||
legend (x = 1, y = 12,
|
||||
legend = c("mySeq", "Average composition"),
|
||||
fill = c("#CCCCCC", "#BB0000"),
|
||||
cex = 0.7,
|
||||
bty = "n")
|
||||
|
||||
|
||||
# == 3.2 Plotting ratios ===================================================
|
||||
|
||||
# To better compare the values, we'll calculate ratios between
|
||||
# obsData and refData
|
||||
|
||||
barplot(obsData / refData,
|
||||
col = "#CCCCCC",
|
||||
ylab = "Sequence / Average",
|
||||
ylim = c(0, 2.5),
|
||||
cex.names = 0.7)
|
||||
abline(h = 1, col="#BB0000")
|
||||
abline(h = c(1/2, 2), lty = 2, col="#BB000055")
|
||||
|
||||
# ... but ratios are not very good here, since the difference in height on the
|
||||
# plot now depends on the order we compare in: ratios of 1/2 and 2 (dotted
|
||||
# lines) are exactly the same fold-difference !
|
||||
|
||||
# == 3.3 Plotting log ratios ===============================================
|
||||
|
||||
# A better way to display this
|
||||
# is to plot log(ratios).
|
||||
|
||||
barplot(log(obsData / refData),
|
||||
col = "#CCCCCC",
|
||||
ylab = "log(Sequence / Average)",
|
||||
ylim = log(c(1/3, 3)),
|
||||
cex.names = 0.7)
|
||||
abline(h = log(1), col="#BB0000")
|
||||
abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055")
|
||||
|
||||
# Note how the two-fold difference lines are now the same distance from the
|
||||
# line of equal ratio.
|
||||
|
||||
# == 3.4 Sort by frequency =================================================
|
||||
|
||||
barplot(sort(log(obsData / refData), decreasing = TRUE),
|
||||
ylim = log(c(1/3, 3)),
|
||||
col = "#CCCCCC",
|
||||
ylab = "log(Sequence / Average)",
|
||||
cex.names = 0.7)
|
||||
abline(h = log(1), col="#BB0000")
|
||||
abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055")
|
||||
|
||||
yTxt <- log(0.9)
|
||||
arrows(4, yTxt, 0, yTxt, length = 0.07)
|
||||
text(5.5, yTxt, "Enriched", cex = 0.7)
|
||||
yTxt <- log(1.1)
|
||||
arrows(20, yTxt, 24, yTxt, length = 0.07)
|
||||
text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7)
|
||||
|
||||
# == 3.5 Color by amino acid type ==========================================
|
||||
|
||||
# Color the bars by amino acid type. Use AACOLS , defined in the .utilities.R
|
||||
# script, or define your own.
|
||||
|
||||
barplot(rep(1, 20), names.arg = names(AACOLS), col = AACOLS, cex.names = 0.5)
|
||||
|
||||
lR <- sort(log(obsData / refData), decreasing = TRUE)
|
||||
barplot(lR,
|
||||
ylim = log(c(1/3, 3)),
|
||||
col = AACOLS[names(lR)],
|
||||
ylab = "log(Sequence / Average)",
|
||||
cex.names = 0.7)
|
||||
abline(h = log(1), col="#00000055")
|
||||
abline(h = log(c(1/2, 2)), lty = 2, col="#00000033")
|
||||
|
||||
yTxt <- log(0.9)
|
||||
arrows(4, yTxt, 0, yTxt, length = 0.07)
|
||||
text(5.5, yTxt, "Enriched", cex = 0.7)
|
||||
yTxt <- log(1.1)
|
||||
arrows(20, yTxt, 24, yTxt, length = 0.07)
|
||||
text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7)
|
||||
|
||||
|
||||
# Task:
|
||||
# Interpret this plot. (Can you?) Which types of amino acids are enriched?
|
||||
# Depleted?
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
|
788
BIN-Sequence.R
788
BIN-Sequence.R
@ -1,394 +1,394 @@
|
||||
# tocID <- "BIN-Sequence.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-Sequence unit.
|
||||
#
|
||||
# Version: 1.5
|
||||
#
|
||||
# Date: 2017-09 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.5 2020 Updates
|
||||
# 1.4 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# use BiocManager:: not biocLite()
|
||||
# 1.3 Update set.seed() usage
|
||||
# 1.2 Removed irrelevant task. How did that even get in there? smh
|
||||
# 1.1 Add chartr()
|
||||
# 1.0 First live version 2017.
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ----------------------------------------------------
|
||||
#TOC> 1 Prepare 63
|
||||
#TOC> 2 Storing Sequence 80
|
||||
#TOC> 3 String properties 109
|
||||
#TOC> 4 Substrings 116
|
||||
#TOC> 5 Creating strings: sprintf() 137
|
||||
#TOC> 6 Changing strings 172
|
||||
#TOC> 6.1.1 Changing case 174
|
||||
#TOC> 6.1.2 Reverse 179
|
||||
#TOC> 6.1.3 Change characters 183
|
||||
#TOC> 6.1.4 Substitute characters 211
|
||||
#TOC> 6.2 stringi and stringr 231
|
||||
#TOC> 6.3 dbSanitizeSequence() 241
|
||||
#TOC> 7 Permuting and sampling 253
|
||||
#TOC> 7.1 Permutations 260
|
||||
#TOC> 7.2 Sampling 306
|
||||
#TOC> 7.2.1 Equiprobable characters 308
|
||||
#TOC> 7.2.2 Defined probability vector 350
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Prepare =============================================================
|
||||
|
||||
# Much basic sequence handling is supported by the Bioconductor package
|
||||
# Biostrings.
|
||||
|
||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
if (! requireNamespace("Biostrings", quietly = TRUE)) {
|
||||
BiocManager::install("Biostrings")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = Biostrings) # basic information
|
||||
# browseVignettes("Biostrings") # available vignettes
|
||||
# data(package = "Biostrings") # available datasets
|
||||
|
||||
|
||||
# = 2 Storing Sequence ====================================================
|
||||
|
||||
|
||||
# Sequences can be represented and stored as vectors of single characters ...
|
||||
(v <- c("D", "I", "V", "M", "T", "Q"))
|
||||
|
||||
# ... as strings ...
|
||||
(s <- "DIVMTQ")
|
||||
|
||||
# ... or as more complex objects with rich metadata e.g. as a Biostrings
|
||||
# DNAString, RNAString, AAString, etc.
|
||||
(a <- Biostrings::AAString("DIVMTQ"))
|
||||
|
||||
# ... and all of these representations can be interconverted:
|
||||
|
||||
# string to vector ...
|
||||
unlist(strsplit(s, ""))
|
||||
|
||||
# vector to string ...
|
||||
# paste0() joins without a separator; collapse fuses the elements into one string
paste0(v, collapse = "")
|
||||
|
||||
# ... and AAString to plain string.
|
||||
as.character(a)
|
||||
|
||||
# Since operations with character vectors trivially follow all other vector
|
||||
# conventions and syntax, and we will look at Biostrings methods in more
|
||||
# detail in a later unit, we will focus on basic strings in the following.
|
||||
|
||||
|
||||
# = 3 String properties ===================================================
|
||||
|
||||
|
||||
length(s) # why ???
|
||||
nchar(s) # Aha!
|
||||
|
||||
|
||||
# = 4 Substrings ==========================================================
|
||||
|
||||
# Use the substr() function
|
||||
substr(s, 2, 4)
|
||||
|
||||
# or the similar substring()
|
||||
substring(s, 2, 4)
|
||||
|
||||
# Note: both functions are vectorized (i.e. they operate on vectors
|
||||
# of arguments, you don't need to loop over input)...
|
||||
myBiCodes <- c("HOMSA", "MUSMU", "FUGRU", "XENLA")
|
||||
substr( myBiCodes, 1, 3)
|
||||
substring(myBiCodes, 1, 3)
|
||||
|
||||
# ... however only substring() will also use vectors for start and stop
|
||||
s <- "gatattgtgatgacccagtaa" # a DNA sequence
|
||||
(vI <- seq(1, nchar(s), by = 3)) # an index vector
|
||||
substr( s, vI, vI+2) # ... returns only the first nucleotide triplet
|
||||
substring(s, vI, vI+2) # ... returns all triplets
|
||||
|
||||
|
||||
# = 5 Creating strings: sprintf() =========================================
|
||||
|
||||
|
||||
# Sprintf is a very smart, very powerful function and has cognates in all
|
||||
# other programming languages. It has a bit of a learning curve, but this is
|
||||
# totally worth it:
|
||||
# the function takes a format string, and a list of other arguments. It returns
|
||||
# a formatted string. Here are some examples - watch carefully for sprintf()
|
||||
# calls elsewhere in the code.
|
||||
|
||||
sprintf("Just a string.")
|
||||
sprintf("A string and the number %d.", 5)
|
||||
sprintf("More numbers: %d ate %d.", 7, 9) # Sorry
|
||||
sprintf("Pi is ~ %1.2f ...", pi)
|
||||
sprintf("or more accurately ~ %1.11f.", pi)
|
||||
x <- "bottles of beer"
|
||||
N <- 99
|
||||
sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.",
|
||||
N, x, N, x, "one down, and pass it around", N - 1, x)
|
||||
|
||||
# Note that in the last example, the value of the string was displayed with
|
||||
# R's usual print-formatting function and therefore the line-break "\n" did
|
||||
# not actually break the line. To have line breaks, tabs etc, you need to use
|
||||
# cat() to display the string:
|
||||
|
||||
for (i in N:(N-4)) {
|
||||
cat(sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.\n\n",
|
||||
i, x, i, x, "one down, and pass it around", i - 1, x))
|
||||
}
|
||||
|
||||
# sprintf() is vectorized: if one of its parameters is a vector, it
|
||||
# will generate one output string for each of the vector's elements:
|
||||
cat(sprintf("\n%s fish", c("one", "two", "red", "blue")))
|
||||
|
||||
|
||||
# = 6 Changing strings ====================================================
|
||||
|
||||
# === 6.1.1 Changing case
|
||||
tolower(s)
|
||||
toupper(tolower(s))
|
||||
|
||||
|
||||
# === 6.1.2 Reverse
|
||||
# (This used to work in Biostrings, apparently it doesn't work anymore. Why?)
|
||||
# Biostrings::str_rev(s)
|
||||
# The following works, of course, but awkward:
|
||||
s
|
||||
paste0(rev(unlist(strsplit(s, ""))), collapse = "")
|
||||
|
||||
# reverse complement
|
||||
COMP <- c("t", "g", "c", "a")
|
||||
names(COMP) <- c("a", "c", "g", "t") # mapping the complement via names
|
||||
s
|
||||
paste0(COMP[rev(unlist(strsplit(s, "")))], collapse = "")
|
||||
|
||||
|
||||
# === 6.1.3 Change characters
|
||||
# chartr(old, new, x) maps all characters in x that appear in "old" to the
|
||||
# corresponding character in "new." Kind of like the COMP vector above ...
|
||||
|
||||
chartr("aeio", "uuuu", "We hold these truths to be self-evident ...")
|
||||
|
||||
# One could implement toupper() and tolower() with this - remember that R has
|
||||
# character vectors of uppercase and lowercase letters as language constants.
|
||||
chartr(paste0(letters, collapse = ""),
|
||||
paste0(LETTERS, collapse = ""),
|
||||
"Twinkle, twinkle little star, how I wonder what you are.")
|
||||
|
||||
# One amusing way to use the function is for a reversible substitution
|
||||
# cypher.
|
||||
alBet <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .,;:?0123456789"
|
||||
set.seed(112358) # set RNG seed for repeatable randomness
|
||||
( myCypher <- paste0(sample(unlist(strsplit(alBet, ""))), collapse = "") )
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
# encode ...
|
||||
(x <- chartr(alBet, myCypher, "... seven for a secret, never to be told."))
|
||||
|
||||
# decode ...
|
||||
chartr(myCypher, alBet, x)
|
||||
# (Nb. substitution cyphers are easy to crack!)
|
||||
|
||||
|
||||
# === 6.1.4 Substitute characters
|
||||
# gsub can change lengths.
|
||||
# Example: implementing the binary Fibonacci sequence:
|
||||
# 0 -> 1; 1 -> 10 , in three nested gsub() statements
|
||||
( s <- 1 )
|
||||
( s <- gsub("2", "10", gsub("0", "1", gsub("1", "2", s))) )
|
||||
|
||||
# Iterate this line a few times ...
|
||||
#
|
||||
# cf. http://www.maths.surrey.ac.uk/hosted-sites/R.Knott/Fibonacci/fibrab.html
|
||||
# for the features of the sequence.
|
||||
|
||||
# I use gsub() often to delete unwanted characters ...
|
||||
# ... select something, and substitute the empty string for it.
|
||||
(s <- gsub("-", "", s))
|
||||
|
||||
# For example: clean up a sequence
|
||||
# copy/paste from UniProt
|
||||
(s <- " 10 20 30 40 50
|
||||
MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")
|
||||
|
||||
|
||||
# remove numbers
|
||||
(s <- gsub("[0-9]", "", s))
|
||||
|
||||
# remove "whitespace" (spaces, tabs, line breaks)...
|
||||
(s <- gsub("\\s", "", s))
|
||||
|
||||
# == 6.2 stringi and stringr ===============================================
|
||||
|
||||
# But there are also specialized functions eg. to remove leading/trailing
|
||||
# whitespace which may be important to sanitize user input etc. Have a look at
|
||||
# the function descriptions for the stringr and the stringi package. stringr is
|
||||
# part of the tidyverse, and for the most part a wrapper for stringi functions.
|
||||
# https://github.com/tidyverse/stringr
|
||||
|
||||
|
||||
|
||||
# == 6.3 dbSanitizeSequence() ==============================================
|
||||
|
||||
# In our learning units, we use a function dbSanitizeSequence() to clean up
|
||||
# sequences that may be copy/pasted from Web-sources
|
||||
|
||||
cat( s <- ">FASTA header will be removed
|
||||
10 20 30 40 50
|
||||
MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR " )
|
||||
|
||||
dbSanitizeSequence(s)
|
||||
|
||||
|
||||
# = 7 Permuting and sampling ==============================================
|
||||
|
||||
|
||||
# An important aspect of working with strings is generating random strings
|
||||
# with given statistical properties: reference items to evaluate significance.
|
||||
|
||||
|
||||
# == 7.1 Permutations ======================================================
|
||||
|
||||
|
||||
# One way to produce such reference items is to permute a string. A permuted
|
||||
# string has the same composition as the original, but all positional
|
||||
# information is lost. The sample() function can be used to permute:
|
||||
|
||||
# This is the sequence of the ompA secretion signal
|
||||
(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
|
||||
|
||||
(x <- sample(s, length(s))) # permuted
|
||||
|
||||
# Here's a small example how such permuted strings may be useful. As you look
|
||||
# at the ompA sequence, you suspect that the two lysines near the +-charged
|
||||
# N-terminus may not be accidental, but selected for a positively charged
|
||||
# N-terminus. What is the chance that such a sequence has two lysines close to
|
||||
# the N-terminus simply by chance? Or put differently: what is the average
|
||||
# distance of two lysines in such a sequence to the N-terminus. First, we
|
||||
# need an expression that measures the distance. A simple use of the which()
|
||||
# function will do just fine.
|
||||
|
||||
which(s == "K") # shows they are in position 2 and 3, so ...
|
||||
mean(which(s == "K")) # ... gives us the average, and ...
|
||||
mean(which(x == "K")) # ... gives us the average of the permuted sequence.
|
||||
|
||||
# So what does the distribution look like? Let's do 10,000 trials.
|
||||
|
||||
(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
|
||||
N <- 10000
|
||||
d <- numeric(N)
|
||||
|
||||
set.seed(112358) # set RNG seed for repeatable randomness
|
||||
for (i in 1:N) {
|
||||
d[i] <- mean(which(sample(s, length(s)) == "K"))
|
||||
}
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
hist(d, breaks = 20)
|
||||
abline(v = 2.5, lwd = 2, col = "firebrick")
|
||||
sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the
|
||||
# N-terminus or more. That's just below the significance
|
||||
# threshold of 5 %. It's a trend, but to be sure we are looking
|
||||
# at a biological effect we would need to see more
|
||||
# sequences.
|
||||
|
||||
|
||||
# == 7.2 Sampling ==========================================================
|
||||
|
||||
# === 7.2.1 Equiprobable characters
|
||||
|
||||
# Assume you need a large random-nucleotide string for some statistical model.
|
||||
# How to create such a string? sample() can easily create it:
|
||||
|
||||
nuc <- c("A", "C", "G", "T")
|
||||
N <- 100
|
||||
|
||||
set.seed(16818) # set RNG seed for repeatable randomness
|
||||
v <- sample(nuc, N, replace = TRUE)
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
(mySeq <- paste(v, collapse = ""))
|
||||
|
||||
# What's the GC content?
|
||||
table(v)
|
||||
sum(table(v)[c("G", "C")]) # 51 is close to expected
|
||||
|
||||
# What's the number of CpG motifs? Easy to check with the stringi
|
||||
# stri_match_all() function
|
||||
|
||||
if (! requireNamespace("stringi", quietly = TRUE)) {
|
||||
install.packages("stringi")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = stringi) # basic information
|
||||
# browseVignettes("stringi") # available vignettes
|
||||
# data(package = "stringi") # available datasets
|
||||
|
||||
|
||||
(x <- stringi::stri_match_all(mySeq, regex = "CG"))
|
||||
# Count the matches. stri_match_all() returns an NA row (not an empty result)
# when the pattern is absent, so length() alone would report 1 instead of 0.
sum(! is.na(unlist(x)))
|
||||
|
||||
# Now you could compare that number with yeast DNA sequences, and determine
|
||||
# whether there are more or less CpG motifs than expected by chance.
|
||||
# (cf. https://en.wikipedia.org/wiki/CpG_site)
|
||||
# But hold on: is that a fair comparison? sample() gives us all four nucleotides
|
||||
# with the same probability. But the yeast genomic DNA GC content is only
|
||||
# 38%. So you would expect fewer CpG motifs based on the statistical properties
|
||||
# of the smaller number of Cs and Gs - before biology even comes into play. How
|
||||
# do we account for that?
|
||||
|
||||
# === 7.2.2 Defined probability vector
|
||||
|
||||
# This is where we need to know how to create samples with specific probability
|
||||
# distributions. A crude hack would be to create a sampling source vector with
|
||||
# 19 C, 19 G, 31 A and 31 T
|
||||
# rep() with a times-vector repeats each element by its own count
rep(c("C", "G", "A", "T"), times = c(19, 19, 31, 31))
|
||||
# ... but that doesn't scale if the numeric accuracy needs to be higher.
|
||||
#
|
||||
# However sample() has an argument that takes care of that: you can explicitly
|
||||
# specify the probabilities with which each element of the sampling vector
|
||||
# should be chosen:
|
||||
|
||||
nuc <- c("A", "C", "G", "T")
|
||||
N <- 100
|
||||
myProb <- c(0.31, 0.19, 0.19, 0.31) # sampling probabilities
|
||||
|
||||
set.seed(16818) # set RNG seed for repeatable randomness
|
||||
v <- sample(nuc, N, prob = myProb, replace = TRUE)
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
(mySeq <- paste(v, collapse = ""))
|
||||
|
||||
# What's the GC content?
|
||||
table(v)
|
||||
sum(table(v)[c("G", "C")]) # Close to expected
|
||||
|
||||
# What's the number of CpG motifs?
|
||||
(x <- stringi::stri_match_all(mySeq, regex = "CG"))
|
||||
# ... not a single one in this case.
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "BIN-Sequence.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-Sequence unit.
|
||||
#
|
||||
# Version: 1.5
|
||||
#
|
||||
# Date: 2017-09 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.5 2020 Updates
|
||||
# 1.4 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# use BiocManager:: not biocLite()
|
||||
# 1.3 Update set.seed() usage
|
||||
# 1.2 Removed irrelevant task. How did that even get in there? smh
|
||||
# 1.1 Add chartr()
|
||||
# 1.0 First live version 2017.
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ----------------------------------------------------
|
||||
#TOC> 1 Prepare 63
|
||||
#TOC> 2 Storing Sequence 80
|
||||
#TOC> 3 String properties 109
|
||||
#TOC> 4 Substrings 116
|
||||
#TOC> 5 Creating strings: sprintf() 137
|
||||
#TOC> 6 Changing strings 172
|
||||
#TOC> 6.1.1 Changing case 174
|
||||
#TOC> 6.1.2 Reverse 179
|
||||
#TOC> 6.1.3 Change characters 183
|
||||
#TOC> 6.1.4 Substitute characters 211
|
||||
#TOC> 6.2 stringi and stringr 231
|
||||
#TOC> 6.3 dbSanitizeSequence() 241
|
||||
#TOC> 7 Permuting and sampling 253
|
||||
#TOC> 7.1 Permutations 260
|
||||
#TOC> 7.2 Sampling 306
|
||||
#TOC> 7.2.1 Equiprobable characters 308
|
||||
#TOC> 7.2.2 Defined probability vector 350
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Prepare =============================================================
|
||||
|
||||
# Much basic sequence handling is supported by the Bioconductor package
|
||||
# Biostrings.
|
||||
|
||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
if (! requireNamespace("Biostrings", quietly = TRUE)) {
|
||||
BiocManager::install("Biostrings")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = Biostrings) # basic information
|
||||
# browseVignettes("Biostrings") # available vignettes
|
||||
# data(package = "Biostrings") # available datasets
|
||||
|
||||
|
||||
# = 2 Storing Sequence ====================================================
|
||||
|
||||
|
||||
# Sequences can be represented and stored as vectors of single characters ...
|
||||
(v <- c("D", "I", "V", "M", "T", "Q"))
|
||||
|
||||
# ... as strings ...
|
||||
(s <- "DIVMTQ")
|
||||
|
||||
# ... or as more complex objects with rich metadata e.g. as a Biostrings
|
||||
# DNAString, RNAString, AAString, etc.
|
||||
(a <- Biostrings::AAString("DIVMTQ"))
|
||||
|
||||
# ... and all of these representations can be interconverted:
|
||||
|
||||
# string to vector ...
|
||||
unlist(strsplit(s, ""))
|
||||
|
||||
# vector to string ...
|
||||
# paste0() joins without a separator; collapse fuses the elements into one string
paste0(v, collapse = "")
|
||||
|
||||
# ... and AAString to plain string.
|
||||
as.character(a)
|
||||
|
||||
# Since operations with character vectors trivially follow all other vector
|
||||
# conventions and syntax, and we will look at Biostrings methods in more
|
||||
# detail in a later unit, we will focus on basic strings in the following.
|
||||
|
||||
|
||||
# = 3 String properties ===================================================
|
||||
|
||||
|
||||
length(s) # why ???
|
||||
nchar(s) # Aha!
|
||||
|
||||
|
||||
# = 4 Substrings ==========================================================
|
||||
|
||||
# Use the substr() function
|
||||
substr(s, 2, 4)
|
||||
|
||||
# or the similar substring()
|
||||
substring(s, 2, 4)
|
||||
|
||||
# Note: both functions are vectorized (i.e. they operate on vectors
|
||||
# of arguments, you don't need to loop over input)...
|
||||
myBiCodes <- c("HOMSA", "MUSMU", "FUGRU", "XENLA")
|
||||
substr( myBiCodes, 1, 3)
|
||||
substring(myBiCodes, 1, 3)
|
||||
|
||||
# ... however only substring() will also use vectors for start and stop
|
||||
s <- "gatattgtgatgacccagtaa" # a DNA sequence
|
||||
(vI <- seq(1, nchar(s), by = 3)) # an index vector
|
||||
substr( s, vI, vI+2) # ... returns only the first nucleotide triplet
|
||||
substring(s, vI, vI+2) # ... returns all triplets
|
||||
|
||||
|
||||
# = 5 Creating strings: sprintf() =========================================
|
||||
|
||||
|
||||
# Sprintf is a very smart, very powerful function and has cognates in all
|
||||
# other programming languages. It has a bit of a learning curve, but this is
|
||||
# totally worth it:
|
||||
# the function takes a format string, and a list of other arguments. It returns
|
||||
# a formatted string. Here are some examples - watch carefully for sprintf()
|
||||
# calls elsewhere in the code.
|
||||
|
||||
sprintf("Just a string.")
|
||||
sprintf("A string and the number %d.", 5)
|
||||
sprintf("More numbers: %d ate %d.", 7, 9) # Sorry
|
||||
sprintf("Pi is ~ %1.2f ...", pi)
|
||||
sprintf("or more accurately ~ %1.11f.", pi)
|
||||
x <- "bottles of beer"
|
||||
N <- 99
|
||||
sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.",
|
||||
N, x, N, x, "one down, and pass it around", N - 1, x)
|
||||
|
||||
# Note that in the last example, the value of the string was displayed with
|
||||
# R's usual print-formatting function and therefore the line-break "\n" did
|
||||
# not actually break the line. To have line breaks, tabs etc, you need to use
|
||||
# cat() to display the string:
|
||||
|
||||
for (i in N:(N-4)) {
|
||||
cat(sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.\n\n",
|
||||
i, x, i, x, "one down, and pass it around", i - 1, x))
|
||||
}
|
||||
|
||||
# sprintf() is vectorized: if one of its parameters is a vector, it
|
||||
# will generate one output string for each of the vector's elements:
|
||||
cat(sprintf("\n%s fish", c("one", "two", "red", "blue")))
|
||||
|
||||
|
||||
# = 6 Changing strings ====================================================
|
||||
|
||||
# === 6.1.1 Changing case
|
||||
tolower(s)
|
||||
toupper(tolower(s))
|
||||
|
||||
|
||||
# === 6.1.2 Reverse
|
||||
# (This used to work in Biostrings, apparently it doesn't work anymore. Why?)
|
||||
# Biostrings::str_rev(s)
|
||||
# The following works, of course, but awkward:
|
||||
s
|
||||
paste0(rev(unlist(strsplit(s, ""))), collapse = "")
|
||||
|
||||
# reverse complement
|
||||
COMP <- c("t", "g", "c", "a")
|
||||
names(COMP) <- c("a", "c", "g", "t") # mapping the complement via names
|
||||
s
|
||||
paste0(COMP[rev(unlist(strsplit(s, "")))], collapse = "")
|
||||
|
||||
|
||||
# === 6.1.3 Change characters
|
||||
# chartr(old, new, x) maps all characters in x that appear in "old" to the
|
||||
# corresponding character in "new." Kind of like the COMP vector above ...
|
||||
|
||||
chartr("aeio", "uuuu", "We hold these truths to be self-evident ...")
|
||||
|
||||
# One could implement toupper() and tolower() with this - remember that R has
|
||||
# character vectors of uppercase and lowercase letters as language constants.
|
||||
chartr(paste0(letters, collapse = ""),
|
||||
paste0(LETTERS, collapse = ""),
|
||||
"Twinkle, twinkle little star, how I wonder what you are.")
|
||||
|
||||
# One amusing way to use the function is for a reversible substitution
|
||||
# cypher.
|
||||
alBet <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .,;:?0123456789"
|
||||
set.seed(112358) # set RNG seed for repeatable randomness
|
||||
( myCypher <- paste0(sample(unlist(strsplit(alBet, ""))), collapse = "") )
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
# encode ...
|
||||
(x <- chartr(alBet, myCypher, "... seven for a secret, never to be told."))
|
||||
|
||||
# decode ...
|
||||
chartr(myCypher, alBet, x)
|
||||
# (Nb. substitution cyphers are easy to crack!)
|
||||
|
||||
|
||||
# === 6.1.4 Substitute characters
|
||||
# gsub can change lengths.
|
||||
# Example: implementing the binary Fibonacci sequence:
|
||||
# 0 -> 1; 1 -> 10 , in three nested gsub() statements
|
||||
( s <- 1 )
|
||||
( s <- gsub("2", "10", gsub("0", "1", gsub("1", "2", s))) )
|
||||
|
||||
# Iterate this line a few times ...
|
||||
#
|
||||
# cf. http://www.maths.surrey.ac.uk/hosted-sites/R.Knott/Fibonacci/fibrab.html
|
||||
# for the features of the sequence.
|
||||
|
||||
# I use gsub() often to delete unwanted characters ...
|
||||
# ... select something, and substitute the empty string for it.
|
||||
(s <- gsub("-", "", s))
|
||||
|
||||
# For example: clean up a sequence
|
||||
# copy/paste from UniProt
|
||||
(s <- " 10 20 30 40 50
|
||||
MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")
|
||||
|
||||
|
||||
# remove numbers
|
||||
(s <- gsub("[0-9]", "", s))
|
||||
|
||||
# remove "whitespace" (spaces, tabs, line breaks)...
|
||||
(s <- gsub("\\s", "", s))
|
||||
|
||||
# == 6.2 stringi and stringr ===============================================
|
||||
|
||||
# But there are also specialized functions eg. to remove leading/trailing
|
||||
# whitespace which may be important to sanitize user input etc. Have a look at
|
||||
# the function descriptions for the stringr and the stringi package. stringr is
|
||||
# part of the tidyverse, and for the most part a wrapper for stringi functions.
|
||||
# https://github.com/tidyverse/stringr
|
||||
|
||||
|
||||
|
||||
# == 6.3 dbSanitizeSequence() ==============================================
|
||||
|
||||
# In our learning units, we use a function dbSanitizeSequence() to clean up
|
||||
# sequences that may be copy/pasted from Web-sources
|
||||
|
||||
cat( s <- ">FASTA header will be removed
|
||||
10 20 30 40 50
|
||||
MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR " )
|
||||
|
||||
dbSanitizeSequence(s)
|
||||
|
||||
|
||||
# = 7 Permuting and sampling ==============================================
|
||||
|
||||
|
||||
# An important aspect of working with strings is generating random strings
|
||||
# with given statistical properties: reference items to evaluate significance.
|
||||
|
||||
|
||||
# == 7.1 Permutations ======================================================
|
||||
|
||||
|
||||
# One way to produce such reference items is to permute a string. A permuted
|
||||
# string has the same composition as the original, but all positional
|
||||
# information is lost. The sample() function can be used to permute:
|
||||
|
||||
# This is the sequence of the ompA secretion signal
|
||||
(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
|
||||
|
||||
(x <- sample(s, length(s))) # permuted
|
||||
|
||||
# Here's a small example how such permuted strings may be useful. As you look
|
||||
# at the ompA sequence, you suspect that the two lysines near the +-charged
|
||||
# N-terminus may not be accidental, but selected for a positively charged
|
||||
# N-terminus. What is the chance that such a sequence has two lysines close to
|
||||
# the N-terminus simply by chance? Or put differently: what is the average
|
||||
# distance of two lysines in such a sequence to the N-terminus. First, we
|
||||
# need an expression that measures the distance. A simple use of the which()
|
||||
# function will do just fine.
|
||||
|
||||
which(s == "K") # shows they are in position 2 and 3, so ...
|
||||
mean(which(s == "K")) # ... gives us the average, and ...
|
||||
mean(which(x == "K")) # ... gives us the average of the permuted sequence.
|
||||
|
||||
# So what does the distribution look like? Let's do 10,000 trials.
|
||||
|
||||
(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
|
||||
N <- 10000
|
||||
d <- numeric(N)
|
||||
|
||||
set.seed(112358) # set RNG seed for repeatable randomness
|
||||
for (i in 1:N) {
|
||||
d[i] <- mean(which(sample(s, length(s)) == "K"))
|
||||
}
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
hist(d, breaks = 20)
|
||||
abline(v = 2.5, lwd = 2, col = "firebrick")
|
||||
sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the
|
||||
# N-terminus or more. That's just below the significance
|
||||
# threshold of 5 %. It's a trend, but to be sure we are looking
|
||||
# at a biological effect we would need to see more
|
||||
# sequences.
|
||||
|
||||
|
||||
# == 7.2 Sampling ==========================================================
|
||||
|
||||
# === 7.2.1 Equiprobable characters
|
||||
|
||||
# Assume you need a large random-nucleotide string for some statistical model.
|
||||
# How to create such a string? sample() can easily create it:
|
||||
|
||||
nuc <- c("A", "C", "G", "T")
|
||||
N <- 100
|
||||
|
||||
set.seed(16818) # set RNG seed for repeatable randomness
|
||||
v <- sample(nuc, N, replace = TRUE)
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
(mySeq <- paste(v, collapse = ""))
|
||||
|
||||
# What's the GC content?
|
||||
table(v)
|
||||
sum(table(v)[c("G", "C")]) # 51 is close to expected
|
||||
|
||||
# What's the number of CpG motifs? Easy to check with the stringi
|
||||
# stri_match_all() function
|
||||
|
||||
if (! requireNamespace("stringi", quietly = TRUE)) {
|
||||
install.packages("stringi")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = stringi) # basic information
|
||||
# browseVignettes("stringi") # available vignettes
|
||||
# data(package = "stringi") # available datasets
|
||||
|
||||
|
||||
(x <- stringi::stri_match_all(mySeq, regex = "CG"))
|
||||
# Count the matches. stri_match_all() returns an NA row (not an empty result)
# when the pattern is absent, so length() alone would report 1 instead of 0.
sum(! is.na(unlist(x)))
|
||||
|
||||
# Now you could compare that number with yeast DNA sequences, and determine
|
||||
# whether there are more or less CpG motifs than expected by chance.
|
||||
# (cf. https://en.wikipedia.org/wiki/CpG_site)
|
||||
# But hold on: is that a fair comparison? sample() gives us all four nucleotides
|
||||
# with the same probability. But the yeast genomic DNA GC content is only
|
||||
# 38%. So you would expect fewer CpG motifs based on the statistical properties
|
||||
# of the smaller number of Cs and Gs - before biology even comes into play. How
|
||||
# do we account for that?
|
||||
|
||||
# === 7.2.2 Defined probability vector
|
||||
|
||||
# This is where we need to know how to create samples with specific probability
|
||||
# distributions. A crude hack would be to create a sampling source vector with
|
||||
# 19 C, 19 G, 31 A and 31 T
|
||||
# rep() with a times-vector repeats each element by its own count
rep(c("C", "G", "A", "T"), times = c(19, 19, 31, 31))
|
||||
# ... but that doesn't scale if the numeric accuracy needs to be higher.
|
||||
#
|
||||
# However sample() has an argument that takes care of that: you can explicitly
|
||||
# specify the probabilities with which each element of the sampling vector
|
||||
# should be chosen:
|
||||
|
||||
nuc <- c("A", "C", "G", "T")
|
||||
N <- 100
|
||||
myProb <- c(0.31, 0.19, 0.19, 0.31) # sampling probabilities
|
||||
|
||||
set.seed(16818) # set RNG seed for repeatable randomness
|
||||
v <- sample(nuc, N, prob = myProb, replace = TRUE)
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
(mySeq <- paste(v, collapse = ""))
|
||||
|
||||
# What's the GC content?
|
||||
table(v)
|
||||
sum(table(v)[c("G", "C")]) # Close to expected
|
||||
|
||||
# What's the number of CpG motifs?
|
||||
(x <- stringi::stri_match_all(mySeq, regex = "CG"))
|
||||
# ... not a single one in this case.
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
|
1368
BIN-Storing_data.R
1368
BIN-Storing_data.R
File diff suppressed because it is too large
Load Diff
@ -1,349 +1,349 @@
|
||||
# tocID <- "FND-Genetic_code.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the FND-Genetic_code unit.
|
||||
#
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017 10 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 2020 Maintenance
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# use BiocManager:: not biocLite()
|
||||
# 1.0.1 Comment on "incomplete final line" warning in FASTA
|
||||
# 1.0 First live version
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ----------------------------------------------------------------
|
||||
#TOC> 1 Storing the genetic code 45
|
||||
#TOC> 1.1 Genetic code in Biostrings 63
|
||||
#TOC> 2 Working with the genetic code 94
|
||||
#TOC> 2.1 Translate a sequence. 129
|
||||
#TOC> 3 An alternative representation: 3D array 212
|
||||
#TOC> 3.1 Print a Genetic code table 246
|
||||
#TOC> 4 Tasks 272
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Storing the genetic code ============================================
|
||||
|
||||
# The genetic code maps trinucleotide codons to amino acids. To store it, we
|
||||
# need some mechanism to associate the two representations. The most
|
||||
# convenient way to do that is a "named vector" which holds the amino acid
|
||||
# code and assigns the codons as names to its elements.
|
||||
|
||||
x <- c("M", "H", "H", "*", "*", "*")
|
||||
names(x) <- c("ATG", "CAC", "CAT", "TAA", "TAG", "TGA")
|
||||
x
|
||||
|
||||
# Then we can access the vector by the codon as name, and retrieve the
|
||||
# amino acid ...
|
||||
|
||||
x["ATG"]
|
||||
x["CAC"]
|
||||
x["TAA"]
|
||||
|
||||
# ... or the names of elements, to retrieve the codon(s)
|
||||
names(x)[x == "M"]
|
||||
names(x)[x == "H"]
|
||||
names(x)[x == "*"]
|
||||
|
||||
|
||||
# == 1.1 Genetic code in Biostrings ========================================
|
||||
|
||||
# Conveniently, the standard genetic code as well as its alternatives are
|
||||
# available in the Bioconductor "Biostrings" package:
|
||||
|
||||
|
||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
if (! requireNamespace("Biostrings", quietly = TRUE)) {
|
||||
BiocManager::install("Biostrings")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = Biostrings) # basic information
|
||||
# browseVignettes("Biostrings") # available vignettes
|
||||
# data(package = "Biostrings") # available datasets
|
||||
|
||||
|
||||
# The standard genetic code vector
|
||||
Biostrings::GENETIC_CODE
|
||||
|
||||
# The table of genetic codes. This information corresponds to this page
|
||||
# at the NCBI:
|
||||
# https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes
|
||||
Biostrings::GENETIC_CODE_TABLE
|
||||
|
||||
# Most of the alternative codes are mitochondrial codes. The id of the
|
||||
# Alternative Yeast Nuclear code is "12"
|
||||
Biostrings::getGeneticCode("12") # Alternative Yeast Nuclear
|
||||
|
||||
|
||||
# = 2 Working with the genetic code =======================================
|
||||
|
||||
# We'll use Biostrings::GENETIC_CODE a lot in this script, so we'll assign it
|
||||
# to a "local" variable, rather than retrieving it from the package all the
|
||||
# time.
|
||||
|
||||
GC <- Biostrings::GENETIC_CODE
|
||||
|
||||
# This is a named vector of characters ...
|
||||
|
||||
str(GC)
|
||||
|
||||
# ... which also stores the alternative initiation codons TTG and CTG in
|
||||
# an attribute of the vector. (Alternative initiation codons sometimes are
|
||||
# used instead of ATG to initiate translation; if translation is not initiated
|
||||
# at ATG, these are still translated with fMet.)
|
||||
|
||||
attr(GC, "alt_init_codons")
|
||||
|
||||
# But the key to use this vector is in the "names" which we use for subsetting
|
||||
# the list of amino acids in whatever way we need.
|
||||
names(GC)
|
||||
|
||||
# The translation of "TGG" ...
|
||||
GC["TGG"]
|
||||
|
||||
# All stop codons
|
||||
names(GC)[GC == "*"]
|
||||
|
||||
# All start codons
|
||||
names(GC)[GC == "M"] # ... or
|
||||
c(names(GC)[GC == "M"],
|
||||
attr(GC, "alt_init_codons"))
|
||||
|
||||
|
||||
# == 2.1 Translate a sequence. =============================================
|
||||
|
||||
|
||||
# I have provided a gene sequence in the data directory:
|
||||
# S288C_YDL056W_MBP1_coding.fsa is the yeast Mbp1 FASTA sequence.
|
||||
|
||||
# read it
|
||||
mbp1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
|
||||
|
||||
# You will notice that this generates a Warning message:
|
||||
# Warning message:
|
||||
# In readLines("./data/S288C_YDL056W_MBP1_coding.fsa") :
|
||||
# incomplete final line found on './data/S288C_YDL056W_MBP1_coding.fsa'
|
||||
|
||||
# The reason for this is that the last character of the file is the letter "A"
|
||||
# and not a "\n" line break. This file is exactly how it was sent from the
|
||||
# NCBI server; I think good, defensive programming practice would have been to
|
||||
# include some kind of an end-marker in the file, like a final "\n". This helps
|
||||
# us recognize an incomplete transmission. Let's parse the actual sequence from
|
||||
# the file, and then check for completeness.
|
||||
|
||||
|
||||
head(mbp1)
|
||||
|
||||
# drop the first line (header)
|
||||
mbp1 <- mbp1[-1]
|
||||
head(mbp1)
|
||||
|
||||
# concatenate it all to a single string
|
||||
mbp1 <- paste(mbp1, sep = "", collapse = "")
|
||||
|
||||
# how long is it?
|
||||
nchar(mbp1)
|
||||
|
||||
# how many codons?
|
||||
nchar(mbp1)/3
|
||||
|
||||
# That looks correct for the 833 aa sequence plus 1 stop codon. This gives us a
|
||||
# first verification that the file we read is complete, the nucleotides of a
|
||||
# complete ORF should be divisible by 3.
|
||||
|
||||
# Extract the codons. There are many ways to split a long string into chunks
|
||||
# of three characters. Here we use the Biostrings codons() function. codons()
|
||||
# requires an object of type DNAString - a special kind of string with
|
||||
# attributes that are useful for Biostrings. Thus we convert the sequence first
|
||||
# with DNAString(), then split it up, then convert it into a plain
|
||||
# character vector.
|
||||
mbp1Codons <- as.character(Biostrings::codons(Biostrings::DNAString(mbp1)))
|
||||
|
||||
head(mbp1Codons)
|
||||
|
||||
# now translate each codon
|
||||
|
||||
# Preallocate one element per codon rather than a hard-coded 834, so the
# result vector always has the length of the sequence that was actually read.
mbp1AA <- character(length(mbp1Codons))
for (i in seq_along(mbp1Codons)) {
  # look up the amino acid by using the codon as the element name
  mbp1AA[i] <- GC[mbp1Codons[i]]
}
|
||||
|
||||
head(mbp1Codons)
|
||||
head(mbp1AA)
|
||||
|
||||
tail(mbp1Codons)
|
||||
tail(mbp1AA) # Note the stop!
|
||||
|
||||
# The TAA "ochre" stop codon is our second verification that the nucleotide
|
||||
# sequence is complete: a stop codon can't appear internally in an ORF.
|
||||
|
||||
# We can work with the mbp1AA vector, for example to tabulate the
|
||||
# amino acid frequencies:
|
||||
table(mbp1AA)
|
||||
sort(table(mbp1AA), decreasing = TRUE)
|
||||
|
||||
# Or we can paste all elements together into a single string. But let's remove
|
||||
# the stop, it's not actually a part of the sequence. To remove the last element
|
||||
# of a vector, re-assign it with a vector minus the index of the last element:
|
||||
mbp1AA <- mbp1AA[-(length(mbp1AA))]
|
||||
tail(mbp1AA) # Note the stop is gone!
|
||||
|
||||
# paste it together, collapsing the elements using an empty string as the
|
||||
# separation-character (i.e.: nothing)
|
||||
(Mbp1 <- paste(mbp1AA, sep = "", collapse = ""))
|
||||
|
||||
|
||||
# = 3 An alternative representation: 3D array =============================
|
||||
|
||||
|
||||
# We don't use 3D arrays often - usually just 2D tables and data frames, so
|
||||
# here is a good opportunity to review the syntax of 3D arrays with a
|
||||
# genetic code cube:
|
||||
|
||||
# Initialize, using A G C T as the names of the elements in each dimension
|
||||
cCube <- array(data = character(64),
|
||||
dim = c(4, 4, 4),
|
||||
dimnames = list(c("A", "G", "C", "T"),
|
||||
c("A", "G", "C", "T"),
|
||||
c("A", "G", "C", "T")))
|
||||
|
||||
# fill it with amino acid codes using three nested loops
|
||||
for (i in 1:4) {
|
||||
for (j in 1:4) {
|
||||
for (k in 1:4) {
|
||||
myCodon <- paste(dimnames(cCube)[[1]][i],
|
||||
dimnames(cCube)[[2]][j],
|
||||
dimnames(cCube)[[3]][k],
|
||||
sep = "",
|
||||
collapse = "")
|
||||
cCube[i, j, k] <- GC[myCodon]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# confirm
|
||||
cCube["A", "T", "G"] # methionine
|
||||
cCube["T", "T", "T"] # phenylalanine
|
||||
cCube["T", "A", "G"] # stop (amber)
|
||||
|
||||
|
||||
|
||||
# == 3.1 Print a Genetic code table ========================================
|
||||
|
||||
|
||||
# The data structure of our cCube is well suited to print a table. In the
|
||||
# "standard" way to print the genetic code, we write codons with the same
|
||||
# second nucleotide in columns, and arrange rows in blocks of same
|
||||
# first nucleotide, varying the third nucleotide fastest. This maximizes the
|
||||
# similarity of adjacent amino acids in the table if we print the
|
||||
# nucleotides in the order T C A G. It's immediately obvious that the code
|
||||
# is not random: the universal genetic code is exceptionally error tolerant in
|
||||
# the sense that mutations (or single-nucleotide translation errors) are likely
|
||||
# to result in an amino acid with similar biophysical properties as the
|
||||
# original.
|
||||
|
||||
nuc <- c("T", "C", "A", "G")
|
||||
|
||||
# (calling variables f, s, t to indicate first, second, and third position ...)
|
||||
for (f in nuc) { # first varies in blocks
|
||||
for (t in nuc) { # third varies in columns
|
||||
for (s in nuc) { # second varies in rows
|
||||
cat(sprintf("%s%s%s: %s ", f, s, t, cCube[f, s, t]))
|
||||
}
|
||||
cat("\n")
|
||||
}
|
||||
cat("\n")
|
||||
}
|
||||
|
||||
|
||||
# = 4 Tasks ===============================================================
|
||||
|
||||
|
||||
# Task: What do you need to change to print the table with U instead
|
||||
# of T? Try it.
|
||||
|
||||
|
||||
# Task: Point mutations are more often transitions (purine -> purine;
|
||||
# pyrimidine -> pyrimidine) than transversions (purine -> pyrimidine;
|
||||
# pyrimidine -> purine), even though twice as many transversions
|
||||
# are possible in the code. This is most likely due to a deamination /
|
||||
# tautomerization process that favours C -> T changes. If the code
|
||||
# indeed minimizes the effect of mutations, you would expect that
|
||||
# codons that differ by a transition code for more similar amino acids
|
||||
# than codons that differ by a transversion. Is that true? List the set
|
||||
# of all amino acid pairs that are encoded by codons with a C -> T
|
||||
# transition. Then list the set of amino acid pairs with a C -> A
|
||||
# transversion. Which set of pairs is more similar?
|
||||
|
||||
|
||||
# Task: How many stop codons do the two mbp1-gene derived amino acid sequences
|
||||
# have if you translate them in the 2. or the 3. frame?
|
||||
|
||||
|
||||
# Task: How does the amino acid composition change if you translate the mbp1
|
||||
# gene with the Alternative Yeast Nuclear code that is used by the
|
||||
# "CTG clade" of fungi?
|
||||
# (cf. https://en.wikipedia.org/wiki/Alternative_yeast_nuclear_code )
|
||||
|
||||
# Solution:
|
||||
|
||||
# Fetch the code
|
||||
Biostrings::GENETIC_CODE_TABLE
|
||||
Biostrings::GENETIC_CODE_TABLE$name[Biostrings::GENETIC_CODE_TABLE$id=="12"]
|
||||
altYcode <- Biostrings::getGeneticCode("12")
|
||||
|
||||
# what's the difference?
|
||||
(delta <- which(Biostrings::GENETIC_CODE != altYcode))
|
||||
|
||||
Biostrings::GENETIC_CODE[delta]
|
||||
altYcode[delta]
|
||||
|
||||
# translate
|
||||
# Preallocate one element per codon rather than a hard-coded 834, so the
# result vector always has the length of the sequence that was actually read.
altYAA <- character(length(mbp1Codons))
for (i in seq_along(mbp1Codons)) {
  # same lookup as for the standard code, but in the alternative code vector
  altYAA[i] <- altYcode[mbp1Codons[i]]
}
|
||||
|
||||
table(mbp1AA)
|
||||
table(altYAA)
|
||||
|
||||
# Task: The genetic code has significant redundancy, i.e. there are up to six
|
||||
# codons that code for the same amino acid. Write code that lists how
|
||||
# many amino acids are present how often i.e. it should tell you that
|
||||
# two amino acids are encoded only with a single codon, three amino
|
||||
# acids have six codons, etc. Solution below, but don't peek. There
|
||||
# are many possible ways to do this.
|
||||
#
|
||||
#
|
||||
# Solution:
|
||||
( x <- table(table(Biostrings::GENETIC_CODE)) )
|
||||
|
||||
# confirm
|
||||
sum(x * as.numeric(names(x)))
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "FND-Genetic_code.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the FND-Genetic_code unit.
|
||||
#
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017 10 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 2020 Maintenance
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# use Biocmanager:: not biocLite()
|
||||
# 1.0.1 Comment on "incomplete final line" warning in FASTA
|
||||
# 1.0 First live version
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ----------------------------------------------------------------
|
||||
#TOC> 1 Storing the genetic code 45
|
||||
#TOC> 1.1 Genetic code in Biostrings 63
|
||||
#TOC> 2 Working with the genetic code 94
|
||||
#TOC> 2.1 Translate a sequence. 129
|
||||
#TOC> 3 An alternative representation: 3D array 212
|
||||
#TOC> 3.1 Print a Genetic code table 246
|
||||
#TOC> 4 Tasks 272
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Storing the genetic code ============================================
|
||||
|
||||
# The genetic code maps trinucleotide codons to amino acids. To store it, we
|
||||
# need some mechanism to associate the two representations. The most
|
||||
# convenient way to do that is a "named vector" which holds the amino acid
|
||||
# code and assigns the codons as names to its elements.
|
||||
|
||||
x <- c("M", "H", "H", "*", "*", "*")
|
||||
names(x) <- c("ATG", "CAC", "CAT", "TAA", "TAG", "TGA")
|
||||
x
|
||||
|
||||
# Then we can access the vector by the codon as name, and retrieve the
|
||||
# amino acid ...
|
||||
|
||||
x["ATG"]
|
||||
x["CAC"]
|
||||
x["TAA"]
|
||||
|
||||
# ... or the names of elements, to retrieve the codon(s)
|
||||
names(x)[x == "M"]
|
||||
names(x)[x == "H"]
|
||||
names(x)[x == "*"]
|
||||
|
||||
|
||||
# == 1.1 Genetic code in Biostrings ========================================
|
||||
|
||||
# Conveniently, the standard genetic code as well as its alternatives are
|
||||
# available in the Bioconductor "Biostrings" package:
|
||||
|
||||
|
||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
if (! requireNamespace("Biostrings", quietly = TRUE)) {
|
||||
BiocManager::install("Biostrings")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = Biostrings) # basic information
|
||||
# browseVignettes("Biostrings") # available vignettes
|
||||
# data(package = "Biostrings") # available datasets
|
||||
|
||||
|
||||
# The standard genetic code vector
|
||||
Biostrings::GENETIC_CODE
|
||||
|
||||
# The table of genetic codes. This information corresponds to this page
|
||||
# at the NCBI:
|
||||
# https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes
|
||||
Biostrings::GENETIC_CODE_TABLE
|
||||
|
||||
# Most of the alternative codes are mitochondrial codes. The id of the
|
||||
# Alternative Yeast Nuclear code is "12"
|
||||
Biostrings::getGeneticCode("12") # Alternative Yeast Nuclear
|
||||
|
||||
|
||||
# = 2 Working with the genetic code =======================================
|
||||
|
||||
# We'll use Biostrings::GENETIC_CODE a lot in this script, so we'll assign it
|
||||
# to a "local" variable, rather than retrieving it from the package all the
|
||||
# time.
|
||||
|
||||
GC <- Biostrings::GENETIC_CODE
|
||||
|
||||
# This is a named vector of characters ...
|
||||
|
||||
str(GC)
|
||||
|
||||
# ... which also stores the alternative initiation codons TTG and CTG in
|
||||
# an attribute of the vector. (Alternative initiation codons sometimes are
|
||||
# used instead of ATG to initiate translation; if translation is not initiated
|
||||
# at ATG, these are still translated with fMet.)
|
||||
|
||||
attr(GC, "alt_init_codons")
|
||||
|
||||
# But the key to use this vector is in the "names" which we use for subsetting
|
||||
# the list of amino acids in whatever way we need.
|
||||
names(GC)
|
||||
|
||||
# The translation of "TGG" ...
|
||||
GC["TGG"]
|
||||
|
||||
# All stop codons
|
||||
names(GC)[GC == "*"]
|
||||
|
||||
# All start codons
|
||||
names(GC)[GC == "M"] # ... or
|
||||
c(names(GC)[GC == "M"],
|
||||
attr(GC, "alt_init_codons"))
|
||||
|
||||
|
||||
# == 2.1 Translate a sequence. =============================================
|
||||
|
||||
|
||||
# I have provided a gene sequence in the data directory:
|
||||
# S288C_YDL056W_MBP1_coding.fsa is the yeast Mbp1 FASTA sequence.
|
||||
|
||||
# read it
|
||||
mbp1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
|
||||
|
||||
# You will notice that this generates a Warning message:
|
||||
# Warning message:
|
||||
# In readLines("./data/S288C_YDL056W_MBP1_coding.fsa") :
|
||||
# incomplete final line found on './data/S288C_YDL056W_MBP1_coding.fsa'
|
||||
|
||||
# The reason for this is that the last character of the file is the letter "A"
|
||||
# and not a "\n" line break. This file is exactly how it was sent from the
|
||||
# NCBI server; I think good, defensive programming practice would have been to
|
||||
# include some kind of an end-marker in the file, like a final "\n". This helps
|
||||
# us recognize an incomplete transmission. Let's parse the actual sequence from
|
||||
# the file, and then check for completeness.
|
||||
|
||||
|
||||
head(mbp1)
|
||||
|
||||
# drop the first line (header)
|
||||
mbp1 <- mbp1[-1]
|
||||
head(mbp1)
|
||||
|
||||
# concatenate it all to a single string
|
||||
mbp1 <- paste(mbp1, sep = "", collapse = "")
|
||||
|
||||
# how long is it?
|
||||
nchar(mbp1)
|
||||
|
||||
# how many codons?
|
||||
nchar(mbp1)/3
|
||||
|
||||
# That looks correct for the 833 aa sequence plus 1 stop codon. This gives us a
|
||||
# first verification that the file we read is complete, the nucleotides of a
|
||||
# complete ORF should be divisible by 3.
|
||||
|
||||
# Extract the codons. There are many ways to split a long string into chunks
|
||||
# of three characters. Here we use the Biostrings codons() function. codons()
|
||||
# requires an object of type DNAString - a special kind of string with
|
||||
# attributes that are useful for Biostrings. Thus we convert the sequence first
|
||||
# with DNAString(), then split it up, then convert it into a plain
|
||||
# character vector.
|
||||
mbp1Codons <- as.character(Biostrings::codons(Biostrings::DNAString(mbp1)))
|
||||
|
||||
head(mbp1Codons)
|
||||
|
||||
# now translate each codon
|
||||
|
||||
# Preallocate one element per codon rather than a hard-coded 834, so the
# result vector always has the length of the sequence that was actually read.
mbp1AA <- character(length(mbp1Codons))
for (i in seq_along(mbp1Codons)) {
  # look up the amino acid by using the codon as the element name
  mbp1AA[i] <- GC[mbp1Codons[i]]
}
|
||||
|
||||
head(mbp1Codons)
|
||||
head(mbp1AA)
|
||||
|
||||
tail(mbp1Codons)
|
||||
tail(mbp1AA) # Note the stop!
|
||||
|
||||
# The TAA "ochre" stop codon is our second verification that the nucleotide
|
||||
# sequence is complete: a stop codon can't appear internally in an ORF.
|
||||
|
||||
# We can work with the mbp1AA vector, for example to tabulate the
|
||||
# amino acid frequencies:
|
||||
table(mbp1AA)
|
||||
sort(table(mbp1AA), decreasing = TRUE)
|
||||
|
||||
# Or we can paste all elements together into a single string. But let's remove
|
||||
# the stop, it's not actually a part of the sequence. To remove the last element
|
||||
# of a vector, re-assign it with a vector minus the index of the last element:
|
||||
mbp1AA <- mbp1AA[-(length(mbp1AA))]
|
||||
tail(mbp1AA) # Note the stop is gone!
|
||||
|
||||
# paste it together, collapsing the elements using an empty string as the
|
||||
# separation-character (i.e.: nothing)
|
||||
(Mbp1 <- paste(mbp1AA, sep = "", collapse = ""))
|
||||
|
||||
|
||||
# = 3 An alternative representation: 3D array =============================
|
||||
|
||||
|
||||
# We don't use 3D arrays often - usually just 2D tables and data frames, so
|
||||
# here is a good opportunity to review the syntax of 3D arrays with a
|
||||
# genetic code cube:
|
||||
|
||||
# Initialize, using A G C T as the names of the elements in each dimension
|
||||
cCube <- array(data = character(64),
|
||||
dim = c(4, 4, 4),
|
||||
dimnames = list(c("A", "G", "C", "T"),
|
||||
c("A", "G", "C", "T"),
|
||||
c("A", "G", "C", "T")))
|
||||
|
||||
# fill it with amino acid codes using three nested loops
|
||||
for (i in 1:4) {
|
||||
for (j in 1:4) {
|
||||
for (k in 1:4) {
|
||||
myCodon <- paste(dimnames(cCube)[[1]][i],
|
||||
dimnames(cCube)[[2]][j],
|
||||
dimnames(cCube)[[3]][k],
|
||||
sep = "",
|
||||
collapse = "")
|
||||
cCube[i, j, k] <- GC[myCodon]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# confirm
|
||||
cCube["A", "T", "G"] # methionine
|
||||
cCube["T", "T", "T"] # phenylalanine
|
||||
cCube["T", "A", "G"] # stop (amber)
|
||||
|
||||
|
||||
|
||||
# == 3.1 Print a Genetic code table ========================================
|
||||
|
||||
|
||||
# The data structure of our cCube is well suited to print a table. In the
|
||||
# "standard" way to print the genetic code, we write codons with the same
|
||||
# second nucleotide in columns, and arrange rows in blocks of same
|
||||
# first nucleotide, varying the third nucleotide fastest. This maximizes the
|
||||
# similarity of adjacent amino acids in the table if we print the
|
||||
# nucleotides in the order T C A G. It's immediately obvious that the code
|
||||
# is not random: the universal genetic code is exceptionally error tolerant in
|
||||
# the sense that mutations (or single-nucleotide translation errors) are likely
|
||||
# to result in an amino acid with similar biophysical properties as the
|
||||
# original.
|
||||
|
||||
nuc <- c("T", "C", "A", "G")
|
||||
|
||||
# (calling variables f, s, t to indicate first, second, and third position ...)
|
||||
for (f in nuc) { # first varies in blocks
|
||||
for (t in nuc) { # third varies in columns
|
||||
for (s in nuc) { # second varies in rows
|
||||
cat(sprintf("%s%s%s: %s ", f, s, t, cCube[f, s, t]))
|
||||
}
|
||||
cat("\n")
|
||||
}
|
||||
cat("\n")
|
||||
}
|
||||
|
||||
|
||||
# = 4 Tasks ===============================================================
|
||||
|
||||
|
||||
# Task: What do you need to change to print the table with U instead
|
||||
# of T? Try it.
|
||||
|
||||
|
||||
# Task: Point mutations are more often transitions (purine -> purine;
|
||||
# pyrimidine -> pyrimidine) than transversions (purine -> pyrimidine;
|
||||
# pyrimidine -> purine), even though twice as many transversions
|
||||
# are possible in the code. This is most likely due to a deamination /
|
||||
# tautomerization process that favours C -> T changes. If the code
|
||||
# indeed minimizes the effect of mutations, you would expect that
|
||||
# codons that differ by a transition code for more similar amino acids
|
||||
# than codons that differ by a transversion. Is that true? List the set
|
||||
# of all amino acid pairs that are encoded by codons with a C -> T
|
||||
# transition. Then list the set of amino acid pairs with a C -> A
|
||||
# transversion. Which set of pairs is more similar?
|
||||
|
||||
|
||||
# Task: How many stop codons do the two mbp1-gene derived amino acid sequences
|
||||
# have if you translate them in the 2. or the 3. frame?
|
||||
|
||||
|
||||
# Task: How does the amino acid composition change if you translate the mbp1
|
||||
# gene with the Alternative Yeast Nuclear code that is used by the
|
||||
# "CTG clade" of fungi?
|
||||
# (cf. https://en.wikipedia.org/wiki/Alternative_yeast_nuclear_code )
|
||||
|
||||
# Solution:
|
||||
|
||||
# Fetch the code
|
||||
Biostrings::GENETIC_CODE_TABLE
|
||||
Biostrings::GENETIC_CODE_TABLE$name[Biostrings::GENETIC_CODE_TABLE$id=="12"]
|
||||
altYcode <- Biostrings::getGeneticCode("12")
|
||||
|
||||
# what's the difference?
|
||||
(delta <- which(Biostrings::GENETIC_CODE != altYcode))
|
||||
|
||||
Biostrings::GENETIC_CODE[delta]
|
||||
altYcode[delta]
|
||||
|
||||
# translate
|
||||
# Preallocate one element per codon rather than a hard-coded 834, so the
# result vector always has the length of the sequence that was actually read.
altYAA <- character(length(mbp1Codons))
for (i in seq_along(mbp1Codons)) {
  # same lookup as for the standard code, but in the alternative code vector
  altYAA[i] <- altYcode[mbp1Codons[i]]
}
|
||||
|
||||
table(mbp1AA)
|
||||
table(altYAA)
|
||||
|
||||
# Task: The genetic code has significant redundancy, i.e. there are up to six
|
||||
# codons that code for the same amino acid. Write code that lists how
|
||||
# many amino acids are present how often i.e. it should tell you that
|
||||
# two amino acids are encoded only with a single codon, three amino
|
||||
# acids have six codons, etc. Solution below, but don't peek. There
|
||||
# are many possible ways to do this.
|
||||
#
|
||||
#
|
||||
# Solution:
|
||||
( x <- table(table(Biostrings::GENETIC_CODE)) )
|
||||
|
||||
# confirm
|
||||
sum(x * as.numeric(names(x)))
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,224 +1,224 @@
|
||||
# tocID <- "FND-STA-Information_theory.R"
|
||||
#
|
||||
# ==============================================================================
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the FND-STA-Information_theory unit.
|
||||
#
|
||||
# Version: 0.2.1
|
||||
#
|
||||
# Date: 2017 - 2021
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 0.2.1 Maintenance
|
||||
# 0.2 Under development
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> --------------------------------------
|
||||
#TOC> 1 ___Section___ 39
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 ___Section___ =======================================================
|
||||
|
||||
# What level of information is "significant"
|
||||
|
||||
# Assume the background distribution is the database frequencies of
|
||||
# amino acids:
|
||||
|
||||
AAref <- numeric() # Uniprot frequencies October 2017, slightly adjusted to
|
||||
# sum to 1.0
|
||||
AAref["A"] <- 0.0904
|
||||
AAref["C"] <- 0.0123
|
||||
AAref["D"] <- 0.0545
|
||||
AAref["E"] <- 0.0617
|
||||
AAref["F"] <- 0.0394
|
||||
AAref["G"] <- 0.0724
|
||||
AAref["H"] <- 0.0221
|
||||
AAref["I"] <- 0.0573
|
||||
AAref["K"] <- 0.0504
|
||||
AAref["L"] <- 0.0986
|
||||
AAref["M"] <- 0.0240
|
||||
AAref["N"] <- 0.0392
|
||||
AAref["P"] <- 0.0486
|
||||
AAref["Q"] <- 0.0381
|
||||
AAref["R"] <- 0.0570
|
||||
AAref["S"] <- 0.0673
|
||||
AAref["T"] <- 0.0558
|
||||
AAref["V"] <- 0.0686
|
||||
AAref["W"] <- 0.0129
|
||||
AAref["Y"] <- 0.0294
|
||||
sum(AAref)
|
||||
|
||||
# Function to calculate Shannon entropy
|
||||
H <- function(pmf) {
  # Calculate Shannon entropy
  # Parameters:
  #   pmf  (numeric)  probability mass function: a vector of states and
  #                   associated probabilities. Each element of
  #                   pmf must be in (0, 1] and sum(pmf) must be 1
  #                   (within the default tolerance of all.equal()).
  # Value:
  #   Shannon entropy in bits.
  # Errors:
  #   stop()s if pmf contains NA, has an element outside (0, 1], or does
  #   not sum to 1.
  # Examples:
  #   H(c(A=0.25, C=0.25, G=0.25, T=0.25))  # 2 bits entropy in a random
  #                                         #     nucleotide sequence
  #   H(1)     # If all elements are the same, entropy is zero
  #

  # all.equal() returns either TRUE or a character description of the
  # difference - never FALSE. Its result must therefore be tested with
  # isTRUE(); isFALSE() would never flag a pmf that does not sum to 1.
  # NA is checked first so that it is reported with the same message instead
  # of failing inside if() with "missing value where TRUE/FALSE needed".
  if (anyNA(pmf) ||
      any(pmf <= 0 | pmf > 1) ||
      ! isTRUE(all.equal(1.0, sum(pmf)))) {
    stop("Input is not a discrete probability distribution.")
  }

  # log(x) / log(2) converts the natural logarithm to bits
  return(-sum(pmf * (log(pmf) / log(2))))
}
|
||||
|
||||
# Why use all.equal()? Exact comparisons with floating point numbers are
|
||||
# brittle. Consider for example:
|
||||
1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6 == 1
|
||||
print(1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6, digits = 22) # 0.9999999999999998889777
|
||||
# all.equal() tests for _near_ equality with tolerance of ~ 1.5e-8
|
||||
|
||||
|
||||
|
||||
# Entropy of the database frequencies (in bits):
|
||||
(Href <- H(AAref))
|
||||
|
||||
# for comparison: entropy if all amino acids are equiprobable
|
||||
H(rep(0.05, 20))
|
||||
|
||||
|
||||
# Set up a simulation to estimate the distribution of Information values
|
||||
# from random sequences drawn from AAref. This is the distribution for the
|
||||
# statistical null hypothesis:
|
||||
nObs <- 15 # number of observations (e.g aligned sequences)
|
||||
# nObs <- 80
|
||||
nTrials <- 10000 # number of trials
|
||||
IObs <- numeric(nTrials) # vector to store Information in each trial
|
||||
simCounts <- numeric(20) # vector to tabulate our information ...
|
||||
names(simCounts) <- names(AAref)# ... with the names of AAref
|
||||
|
||||
|
||||
for (i in 1:nTrials) { # simulate ...
|
||||
|
||||
# sample AAref letters, nObs times, with the probabilities of AAref:
|
||||
AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE)
|
||||
|
||||
x <- table(AAobs) # table simulated observations
|
||||
simCounts[1:20] <- rep(0, length(simCounts)) # initialize simCounts to 0
|
||||
simCounts[names(x)] <- x # overwrite with observed counts
|
||||
simCounts <- simCounts + 0.5 # add Jeffreys' pseudocounts
|
||||
Hobs <- H(simCounts/sum(simCounts)) # counts to frequency, calc. H
|
||||
IObs[i] <- Href - Hobs # store information
|
||||
}
|
||||
|
||||
# evaluate
|
||||
hist(IObs, col = "#C9F4E3", xlim = c(-0.2, 1.0), breaks = 25)
|
||||
abline(v = quantile(IObs, c(0.05, 0.95)), col = "#AA00CC")
|
||||
|
||||
# The purple lines are drawn at the 5% and 95% quantiles of the IObs distribution -
|
||||
# i.e. an actual observation that lies outside the purple lines is deemed
|
||||
# "significant"(1)(2). Of course, this is only true to the degree that the
|
||||
# database frequencies are a valid model for the null-hypothesis on the
|
||||
# sequence position we are considering here.
|
||||
|
||||
# (1) If we use 5% quantiles, this means a value is significantly larger
|
||||
# than expected, and we ignore cases when the value is < 0; if we
|
||||
# consider both smaller and larger values, we need to use 2.5% quantiles,
|
||||
# since 5% of all observations lie outside the 0.025 and 0.975
|
||||
# quantiles.
|
||||
#
|
||||
# (2) For an actual observation of counts, we calculate its observed
|
||||
# _empirical_p_Value_ as (nCounts + 1)/(nTotal + 1).
|
||||
|
||||
|
||||
# You can probably now appreciate that information is a bit of a shortcut for
|
||||
# biological sequences, and does not really take the different inherent
|
||||
# frequencies based on the character of the amino acids into account. For
|
||||
# example, L is the most frequent and C is the least frequent, but if we have an
|
||||
# alignment of 1000 sequences and we see that the frequencies for L and C are
|
||||
# swapped, that would be _very_ surprising - nevertheless, the information would
|
||||
# be 0. In order to take that into account, we should actually compute
|
||||
# Kullback-Leibler divergences.
|
||||
|
||||
|
||||
# Swap C and L frequencies
|
||||
p <- AAref
|
||||
q <- AAref
|
||||
q["L"] <- AAref["C"]
|
||||
q["C"] <- AAref["L"]
|
||||
H(p)
|
||||
H(q)
|
||||
|
||||
KLdiv <- function(p, q) {
  # Kullback-Leibler divergence of two discrete probability distributions.
  # Parameters:
  #   p, q   pmfs of two discrete probability distributions with the same
  #          outcomes, which are nowhere 0. Elements are paired by position.
  # Value:
  #   sum(p * log(p / q)), computed with the natural logarithm.
  # Errors:
  #   stop()s if the two vectors differ in length or if either contains a 0.

  sameLength <- length(p) == length(q)
  if (! sameLength) {
    stop("PANIC: input vector lengths differ!")
  }

  hasZero <- any(p == 0, q == 0)
  if (hasZero) {
    stop("PANIC: 0's found in input vectors!")
  }

  ratio <- p / q
  return(sum(p * log(ratio)))
}
|
||||
|
||||
KLdiv(p, p)
|
||||
KLdiv(p, q)
|
||||
|
||||
|
||||
nObs <- 15 # number of observations (e.g aligned sequences)
|
||||
# nObs <- 80
|
||||
nTrials <- 10000 # number of trials
|
||||
KLdivObs <- numeric(nTrials) # vector to store Information in each trial
|
||||
simCounts <- numeric(20) # vector to tabulate our information ...
|
||||
names(simCounts) <- names(AAref)# ... with the names of AAref
|
||||
|
||||
|
||||
for (i in 1:nTrials) { # simulate ...
|
||||
|
||||
# sample AAref letters, nObs times, with the probabilities of AAref:
|
||||
AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE)
|
||||
|
||||
x <- table(AAobs) # table simulated observations
|
||||
simCounts[1:20] <- rep(0, length(simCounts)) # initialize simCounts to 0
|
||||
simCounts[names(x)] <- x # overwrite with observed counts
|
||||
simCounts <- simCounts + 0.5 # add Jeffreys' pseudocounts
|
||||
simCounts <- simCounts/sum(simCounts) # counts to frequency
|
||||
KLdivObs[i] <- sum(simCounts * log( simCounts / AAref )) # store KLdiv
|
||||
}
|
||||
|
||||
# evaluate
|
||||
hist(KLdivObs, col = "#C9F4E3", breaks = 25)
|
||||
abline(v = quantile(KLdivObs, c(0.05, 0.95)), col = "#AA00CC")
|
||||
quantile(KLdivObs, 0.992)
|
||||
|
||||
# Running the simulation with KL does not give a fundamentally
|
||||
# different behaviour - since we are just randomly sampling. But KL would be
|
||||
# more sensitive in case there is biological selection, where the sampling is no
|
||||
# longer random. If I run the same simulation, with nObs <- 80 but calculating
|
||||
# KLdiv instead of information, I get a 5% quantile at 0.15 - but the C/L
|
||||
# frequency swap gives me a KL divergence of 0.18 - this is significant at p =
|
||||
# 0.008 - (remember, Information is 0 in this case). So that's actually quite a
|
||||
# nice addition to the toolbox.
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "FND-STA-Information_theory.R"
|
||||
#
|
||||
# ==============================================================================
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the FND-STA-Information_theory unit.
|
||||
#
|
||||
# Version: 0.2.1
|
||||
#
|
||||
# Date: 2017 - 2021
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 0.2.1 Maintenance
|
||||
# 0.2 Under development
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> --------------------------------------
|
||||
#TOC> 1 ___Section___ 39
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 ___Section___ =======================================================
|
||||
|
||||
# What level of information is "significant"
|
||||
|
||||
# Assume the background distribution is the database frequencies of
|
||||
# amino acids:
|
||||
|
||||
# Background amino acid frequencies: Uniprot, October 2017, slightly adjusted
# to sum to 1.0. One named element per amino acid one-letter code.
AAref <- c(A = 0.0904, C = 0.0123, D = 0.0545, E = 0.0617, F = 0.0394,
           G = 0.0724, H = 0.0221, I = 0.0573, K = 0.0504, L = 0.0986,
           M = 0.0240, N = 0.0392, P = 0.0486, Q = 0.0381, R = 0.0570,
           S = 0.0673, T = 0.0558, V = 0.0686, W = 0.0129, Y = 0.0294)
sum(AAref)  # confirm that the frequencies add up to 1
|
||||
|
||||
# Function to calculate Shannon entropy
|
||||
H <- function(pmf) {
  # Calculate Shannon entropy
  # Parameters:
  #   pmf  (numeric)  probability mass function: a vector of states and
  #                   associated probabilities. Each element of
  #                   pmf must be in (0, 1] and sum(pmf) must be 1
  #                   (within the tolerance of all.equal()).
  # Value:
  #   Shannon entropy in bits.
  # Errors:
  #   Stops if pmf is not numeric, contains NA, has an element outside
  #   (0, 1], or does not sum to 1.
  # Examples:
  #   H(c(A=0.25, C=0.25, G=0.25, T=0.25))  # 2 bits entropy in a random
  #                                         # nucleotide sequence
  #   H(1)     # If all elements are the same, entropy is zero
  #

  # Note: all.equal() returns either TRUE or a character vector that
  # describes the difference - it never returns FALSE. The sum must therefore
  # be tested with !isTRUE(...); isFALSE(...) would never trigger.
  if (!is.numeric(pmf) ||
      anyNA(pmf) ||
      any(pmf <= 0 | pmf > 1) ||
      !isTRUE(all.equal(1.0, sum(pmf)))) {
    stop("Input is not a discrete probability distribution.")
  }

  return(-sum(pmf * log2(pmf)))
}
|
||||
|
||||
# Why use all.equal()? Exact comparisons with floating point numbers are
|
||||
# brittle. Consider for example:
|
||||
1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6 == 1
|
||||
print(1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6, digits = 22) # 0.9999999999999998889777
|
||||
# all.equal() tests for _near_ equality with tolerance of ~ 1.5e-8
|
||||
|
||||
|
||||
|
||||
# Entropy of the database frequencies (in bits):
|
||||
(Href <- H(AAref))
|
||||
|
||||
# for comparison: entropy if all amino acids are equiprobable
|
||||
H(rep(0.05, 20))
|
||||
|
||||
|
||||
# Set up a simulation to estimate the distribution of Information values
|
||||
# from random sequences drawn from AAref. This is the distribution for the
|
||||
# statistical null hypothesis:
|
||||
nObs <- 15                       # number of observations (e.g. aligned sequences)
# nObs <- 80
nTrials <- 10000                 # number of trials
IObs <- numeric(nTrials)         # vector to store Information in each trial

# Count vector with one element per state of AAref. Its length is taken from
# AAref rather than hard-coded, so the code follows the reference alphabet.
simCounts <- numeric(length(AAref))
names(simCounts) <- names(AAref)

for (i in seq_len(nTrials)) {    # simulate ... (seq_len() is safe for 0 trials)

  # sample AAref letters, nObs times, with the probabilities of AAref:
  AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE)

  x <- table(AAobs)                    # table simulated observations
  simCounts[] <- 0                     # reset all counts, keep the names
  simCounts[names(x)] <- x             # overwrite with observed counts
  simCounts <- simCounts + 0.5         # add Jeffreys' pseudocounts
  Hobs <- H(simCounts / sum(simCounts))  # counts to frequency, calc. H
  IObs[i] <- Href - Hobs               # store information
}
|
||||
|
||||
# evaluate
|
||||
hist(IObs, col = "#C9F4E3", xlim = c(-0.2, 1.0), breaks = 25)
|
||||
abline(v = quantile(IObs, c(0.05, 0.95)), col = "#AA00CC")
|
||||
|
||||
# The purple lines are drawn at the 5% and 95% quantiles of the IObs distribution -
|
||||
# i.e. an actual observation that lies outside the purple lines is deemed
|
||||
# "significant"(1)(2). Of course, this is only true to the degree that the
|
||||
# database frequencies are a valid model for the null-hypothesis on the
|
||||
# sequence position we are considering here.
|
||||
|
||||
# (1) If we use 5% quantiles, this means a value is significantly larger
|
||||
# than expected, and we ignore cases when the value is < 0; if we
|
||||
# consider both smaller and larger values, we need to use 2.5% quantiles,
|
||||
# since 5% of all observations lie outside the 0.025 and 0.975
|
||||
# quantiles.
|
||||
#
|
||||
# (2) For an actual observation of counts, we calculate its observed
|
||||
# _empirical_p_Value_ as (nCounts + 1)/(nTotal + 1).
|
||||
|
||||
|
||||
# You can probably now appreciate that information is a bit of a shortcut for
|
||||
# biological sequences, and does not really take the different inherent
|
||||
# frequencies based on the character of the amino acids into account. For
|
||||
# example, L is the most frequent and C is the least frequent, but if we have an
|
||||
# alignment of 1000 sequences and we see that the frequencies for L and C are
|
||||
# swapped, that would be _very_ surprising - nevertheless, the information would
|
||||
# be 0. In order to take that into account, we should actually compute
|
||||
# Kullback-Leibler divergences.
|
||||
|
||||
|
||||
# Swap C and L frequencies
|
||||
p <- AAref
|
||||
q <- AAref
|
||||
q["L"] <- AAref["C"]
|
||||
q["C"] <- AAref["L"]
|
||||
H(p)
|
||||
H(q)
|
||||
|
||||
KLdiv <- function(p, q) {
  # Kullback-Leibler divergence of two discrete probability distributions.
  # Parameters:
  #   p, q  (numeric)  pmfs over the same outcomes, in the same order.
  #                    Neither may contain a 0.
  # Value:
  #   sum(p * log(p / q)), computed with the natural logarithm.
  # Errors:
  #   Stops if the lengths of p and q differ, or if any element is exactly 0.

  if (length(p) != length(q)) {
    stop("PANIC: input vector lengths differ!")
  }

  hasZero <- any(p == 0, q == 0)
  if (hasZero) {
    stop("PANIC: 0's found in input vectors!")
  }

  logRatio <- log(p / q)
  return(sum(p * logRatio))
}
|
||||
|
||||
KLdiv(p, p)
|
||||
KLdiv(p, q)
|
||||
|
||||
|
||||
nObs <- 15                       # number of observations (e.g. aligned sequences)
# nObs <- 80
nTrials <- 10000                 # number of trials
KLdivObs <- numeric(nTrials)     # vector to store the KL divergence of each trial

# Count vector with one element per state of AAref. Its length is taken from
# AAref rather than hard-coded, so the code follows the reference alphabet.
simCounts <- numeric(length(AAref))
names(simCounts) <- names(AAref)

for (i in seq_len(nTrials)) {    # simulate ... (seq_len() is safe for 0 trials)

  # sample AAref letters, nObs times, with the probabilities of AAref:
  AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE)

  x <- table(AAobs)                        # table simulated observations
  simCounts[] <- 0                         # reset all counts, keep the names
  simCounts[names(x)] <- x                 # overwrite with observed counts
  simCounts <- simCounts + 0.5             # add Jeffreys' pseudocounts
  simCounts <- simCounts / sum(simCounts)  # counts to frequency

  # Use the KLdiv() function defined above instead of repeating its formula.
  # The pseudocounts guarantee that no frequency is 0, so its checks pass.
  KLdivObs[i] <- KLdiv(simCounts, AAref)   # store KLdiv
}
|
||||
|
||||
# evaluate
|
||||
hist(KLdivObs, col = "#C9F4E3", breaks = 25)
|
||||
abline(v = quantile(KLdivObs, c(0.05, 0.95)), col = "#AA00CC")
|
||||
quantile(KLdivObs, 0.992)
|
||||
|
||||
# Running the simulation with KL does not give a fundamentally
|
||||
# different behaviour - since we are just randomly sampling. But KL would be
|
||||
# more sensitive in case there is biological selection, where the sampling is no
|
||||
# longer random. If I run the same simulation, with nObs <- 80 but calculating
|
||||
# KLdiv instead of information, I get a 5% quantile at 0.15 - but the C/L
|
||||
# frequency swap gives me a KL divergence of 0.18 - this is significant at p =
|
||||
# 0.008 - (remember, Information is 0 in this case). So that's actually quite a
|
||||
# nice addition to the toolbox.
|
||||
|
||||
|
||||
# [END]
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,351 +1,351 @@
|
||||
# tocID <- "FND-STA-Significance.R"
|
||||
#
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the FND-STA-Significance unit.
|
||||
#
|
||||
# Version: 1.3
|
||||
#
|
||||
# Date: 2017-09 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.3 2020 Maintenance. Add sample solution.
|
||||
# 1.2 Update set.seed() usage
|
||||
# 1.1 Corrected treatment of empirical p-value
|
||||
# 1.0 First contents
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ------------------------------------------------------------------
|
||||
#TOC> 1 Significance and p-value 49
|
||||
#TOC> 1.1 Significance levels 60
|
||||
#TOC> 1.2 probability and p-value 77
|
||||
#TOC> 1.2.1 p-value illustrated 109
|
||||
#TOC> 2 One- or two-sided 165
|
||||
#TOC> 3 Significance by integration 209
|
||||
#TOC> 4 Significance by simulation or permutation 215
|
||||
#TOC> 5 Final tasks 327
|
||||
#TOC> 6 Sample solutions 336
|
||||
#TOC> 6.1 338
|
||||
#TOC> 6.2 342
|
||||
#TOC> 6.3 346
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Significance and p-value ============================================
|
||||
|
||||
# The idea of the probability of an event has a precise mathematical
|
||||
# interpretation, but how is it useful to know the probability? Usually we are
|
||||
# interested in whether we should accept or reject a hypothesis based on the
|
||||
# observations we have. A rational way to do this is to say: if the probability
|
||||
# of observing the data is very small under the null-hypothesis, then we will
|
||||
# assume the observation is due to something other than the null-hypothesis. But
|
||||
# what do we mean by the "probability of our observation"? And what is "very
|
||||
# small"?
|
||||
|
||||
# == 1.1 Significance levels ===============================================
|
||||
|
||||
# A "very small" probability is purely a matter of convention - a cultural
|
||||
# convention. In the biomedical field we usually call probabilities of less than
|
||||
# 0.05 (5%) small enough to reject the null-hypothesis. Thus we call
|
||||
# observations with a probability of less than 0.05 "significant" and if we want
|
||||
# to highlight this in text or in a graph, we often mark them with an asterisk
|
||||
# (*). Also we often call observations with a probability of less than 0.01
|
||||
# "highly significant" and mark them with two asterisks (**). But there is no
|
||||
# special significance in these numbers, the cutoff point for significance could
|
||||
# also be 0.0498631, or 0.03, or 1/(pi^3). 0.05 is just the value that the
|
||||
# British statistician Ronald Fisher happened to propose for this purpose in
|
||||
# 1925. Incidentally, Fisher later recommended to use different cutoffs for
|
||||
# different purposes (cf.
|
||||
# https://en.wikipedia.org/wiki/Statistical_significance).
|
||||
|
||||
|
||||
# == 1.2 probability and p-value ===========================================
|
||||
|
||||
# But what do we even mean by the probability of an observation?
|
||||
# Assume I am drawing samples from a normal distribution with a mean of 0 and a
|
||||
# standard deviation of 1. The sample I get is ...
|
||||
|
||||
set.seed(sqrt(5))
|
||||
x <- rnorm(1)
|
||||
set.seed(NULL)
|
||||
|
||||
print(x, digits = 22)
|
||||
# [1] -0.8969145466249813791748
|
||||
|
||||
# So what's the probability of that number? Obviously, the probability of
|
||||
# getting exactly this number is very, very, very small. But also obviously,
|
||||
# this does not mean that observing this number is in any way significant - we
|
||||
# always observe some number. That's not what we mean in this case. There are
|
||||
# several implicit assumptions when we speak of the probability of an
|
||||
# observation:
|
||||
|
||||
# 1: the observation can be compared to a probability distribution;
|
||||
# 2: that distribution can be integrated between any specific value
|
||||
# and its upper and lower bounds (or +- infinity).
|
||||
|
||||
# Then what we really mean by the probability of an observation in the context
|
||||
# of that distribution is: the probability of observing that value, or a value
|
||||
# more extreme than the one we have. We call this the p-value. Note that we are
|
||||
# not talking about an individual number anymore, we are talking about the area
|
||||
# under the curve between our observation and the upper (or lower) bound of the
|
||||
# curve, as a fraction of the whole.
|
||||
|
||||
|
||||
# === 1.2.1 p-value illustrated
|
||||
|
||||
# Let's illustrate. First we draw a million random values from our
|
||||
# standard, normal distribution:
|
||||
|
||||
N <- 1e6 # one million
|
||||
set.seed(112358) # set RNG seed for repeatable randomness
|
||||
r <- rnorm(N) # N values from a normal distribution
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
# Let's see what the distribution looks like:
|
||||
|
||||
(h <- hist(r))
|
||||
|
||||
# The histogram details are now available in the list h - e.g. h$counts
|
||||
|
||||
# Where is the value we have drawn previously?
|
||||
abline(v = x, col = "#EE0000")
|
||||
|
||||
# How many values are smaller?
|
||||
sum(r < x)
|
||||
|
||||
# Let's color the bars:
|
||||
# first, make a vector of red and green colors for the bars with breaks
|
||||
# smaller and larger than x, white for the bar that contains x ...
|
||||
idx <- sum(h$breaks < x)     # number of breaks below x: index of the bar with x
hCol <- c(rep("#EE000044", idx - 1),                # bars below x
          "#FFFFFFFF",                              # the bar that contains x
          rep("#00EE0044", sum(h$breaks > x) - 1))  # bars above x

# ... then plot the histogram, with colored bars ...
hist(r, col = hCol)

# ... add two colored rectangles into the white bar, split at x ...
xMin <- h$breaks[idx]        # left edge of the bar that contains x
xMax <- h$breaks[idx + 1]    # right edge of that bar
y <- h$counts[idx]           # height of that bar
rect(xMin, 0, x, y, col = "#EE000044", border = TRUE)
rect(x, 0, xMax, y, col = "#00EE0044", border = TRUE)

# ... and a red line for our observation.
abline(v = x, col = "#EE0000", lwd = 2)
|
||||
|
||||
# The p-value of our observation is the red area as a fraction of the
|
||||
# whole histogram (red + green).
|
||||
|
||||
|
||||
# Task:
|
||||
# Explain how the expression sum(r < x) works to give us a count of values
|
||||
# with the property we are looking for. E.g., examine -4:4 < x
|
||||
|
||||
# Task:
|
||||
# Write an expression to estimate the probability that a value
|
||||
# drawn from the vector r is less-or-equal to x. The result you get
|
||||
# will depend on the exact values that went into the vector r but it should
|
||||
# be close to 0.185 That expression is the p-value associated with x.
|
||||
# (Sample solution 6.1)
|
||||
|
||||
|
||||
# = 2 One- or two-sided ===================================================
|
||||
|
||||
# The shape of our histogram confirms that the rnorm() function has returned
|
||||
# values that appear distributed according to a normal distribution. In a normal
|
||||
# distribution, readily available tables tell us that 5% of the values (i.e. our
|
||||
# significance level) lie 1.96 (or approximately 2) standard deviations away
|
||||
# from the mean. Is this the case here? How many values in our vector r are
|
||||
# larger than 1.96?
|
||||
|
||||
sum(r > 1.96)
|
||||
# [1] 24589
|
||||
|
||||
# Wait - that's about 2.5% of 1,000,000, not 5% as expected. Why?
|
||||
|
||||
# The answer is: we have to be careful with two-sided distributions. 2 standard
|
||||
# deviations away from the mean means either larger or smaller than 1.96 . This
|
||||
# can give rise to errors. If we are simply interested in outliers, no
|
||||
# matter larger or smaller, then the 1.96 SD cutoff for significance is correct.
|
||||
# But if we are specifically interested in, say, larger values, because a
|
||||
# smaller value is not meaningful, then the significance cutoff, expressed as
|
||||
# standard deviations, is relaxed. We can use the quantile function to see what
|
||||
# the cutoff values are:
|
||||
|
||||
quantile(r)
|
||||
quantile(r, probs = c(0.025, 0.975)) # for the symmetric 2.5% boundaries
|
||||
# close to ± 1.96, as expected
|
||||
quantile(r, probs = 0.95) # for the single 5% boundary
|
||||
# close to 1.64 . Check counts to confirm:
|
||||
sum(r > quantile(r, probs = 0.95))
|
||||
# [1] 50000
|
||||
# which is 5%, as expected.
|
||||
|
||||
# Task:
|
||||
# Use abline() to add the p = 0.05 boundary for smaller values to the histogram.
|
||||
# (Sample solution 6.2)
|
||||
|
||||
# To summarize: when we evaluate the significance of an event, we divide a
|
||||
# probability distribution into two parts at the point where the event was
|
||||
# observed. We then ask whether the integral over the more extreme part is less
|
||||
# or more than 5% of the whole. If it is less, we deem the event to be
|
||||
# significant.
|
||||
#
|
||||
|
||||
|
||||
# = 3 Significance by integration =========================================
|
||||
|
||||
# If the underlying probability distribution can be analytically or numerically
|
||||
# integrated, the significance of an observation can be directly computed.
|
||||
|
||||
|
||||
# = 4 Significance by simulation or permutation ===========================
|
||||
|
||||
# But whether the integration is correct, or relies on assumptions that may not
|
||||
# be warranted for biological data, can be a highly technical question.
|
||||
# Fortunately, we can often simply run a simulation, a random resampling, or a
|
||||
# permutation and then count the number of outcomes, just as we did with our
|
||||
# rnorm() samples. We call this an empirical p-value. (Actually, the "empirical
|
||||
# p-value" is defined as (Nobs + 1) / (N + 1). )
|
||||
|
||||
# Here is an example. Assume you have a protein sequence and
|
||||
# you speculate that positively charged residues are close to negatively charged
|
||||
# residues to balance charge locally. A statistic that would capture this is the
|
||||
# mean minimum distance between all D,E residues and the closest R,K,H
|
||||
# residue. Let's compute this for the sequence of yeast Mbp1.
|
||||
|
||||
MBP1 <- paste0("MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK",
|
||||
"ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA",
|
||||
"SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR",
|
||||
"KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ",
|
||||
"QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS",
|
||||
"PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY",
|
||||
"FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS",
|
||||
"IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP",
|
||||
"SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT",
|
||||
"ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP",
|
||||
"VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK",
|
||||
"IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR",
|
||||
"QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK",
|
||||
"IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA")
|
||||
|
||||
# first we split this string into individual characters:
|
||||
v <- unlist(strsplit(MBP1, ""))
|
||||
|
||||
# and find the positions of our charged residues
|
||||
|
||||
ED <- grep("[ED]", v)       # positions of the acidic residues
RKH <- grep("[RKH]", v)     # positions of the basic residues

# For every D/E position: the distance to the closest R/K/H position.
# vapply() calls the function once per element of ED and collects the
# results in a numeric vector of the same length.
sep <- vapply(ED,
              function(pos) {
                min(abs(RKH - pos))
              },
              numeric(1))
|
||||
|
||||
# Task: read and explain this bit of code
|
||||
|
||||
# Now that sep is computed, what does it look like?
|
||||
|
||||
table(sep) # these are the minimum distances
|
||||
# 24 of D,E residues are adjacent to R,K,H;
|
||||
# the longest separation is 28 residues.
|
||||
|
||||
# What is the mean separation?
|
||||
mean(sep)
|
||||
|
||||
# The value is 4.1 . Is this significant? Honestly, I would be hard pressed
|
||||
# to solve this analytically. But by permutation it's soooo easy.
|
||||
|
||||
# First, we combine what we have done above into a function:
|
||||
|
||||
chSep <- function(v) {
  # Mean minimum separation of oppositely charged residues.
  # Parameter:
  #   v  (char)  a vector of amino acids in the one-letter code
  #              (upper or lower case)
  # Value:
  #   (numeric) the mean, over all D/E positions in v, of the distance
  #             to the closest R/K/H position

  acidic <- grep("[EDed]", v)
  basic <- grep("[RKHrkh]", v)

  # one minimum distance per acidic position
  nearest <- vapply(acidic,
                    function(pos) {
                      min(abs(basic - pos))
                    },
                    numeric(1))

  return(mean(nearest))
}
|
||||
|
||||
# Execute the function to define it.
|
||||
|
||||
# Confirm that the function gives the same result as the number we
|
||||
# calculated above:
|
||||
chSep(v)
|
||||
|
||||
# Now we can produce a random permutation of v, and recalculate
|
||||
|
||||
set.seed(pi) # set RNG seed for repeatable randomness
|
||||
w <- sample(v, length(v)) # This shuffles the vector v. Memorize this
|
||||
# code paradigm. It is very useful.
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
|
||||
|
||||
chSep(w)
|
||||
# 3.773 ... that's actually less than what we had before.
|
||||
|
||||
# Let's do this 10000 times and record the results (takes a few seconds):
|
||||
|
||||
N <- 10000
# One value per trial: shuffle v, then compute the mean minimum charge
# separation of the shuffled sequence.
chs <- vapply(seq_len(N),
              function(trial) {
                chSep(sample(v, length(v)))
              },
              numeric(1))
|
||||
|
||||
hist(chs, breaks = 50)
|
||||
abline(v = chSep(v), col = "#EE0000")
|
||||
|
||||
# Contrary to our expectations, the actual observed mean minimum charge
|
||||
# separation seems to be larger than what we observe in randomly permuted
|
||||
# sequences. But is this significant? Your task to find out.
|
||||
|
||||
# Task:
|
||||
# Calculate the empirical p-value for chSep(v)
|
||||
# (Sample solution 6.3)
|
||||
|
||||
|
||||
# = 5 Final tasks =========================================================
|
||||
|
||||
# From chs, compute the empirical p-value of a mean minimum charge separation to
|
||||
# be larger or equal to the value observed for the yeast MBP1 sequence. Note
|
||||
# the result in your journal. Is it significant? Also note the result of
|
||||
# the following expression for validation:
|
||||
seal(sum(chs))
|
||||
|
||||
|
||||
# = 6 Sample solutions ====================================================
|
||||
|
||||
# == 6.1 ==================================================================
|
||||
#
|
||||
sum(r <= x) / length(r)
|
||||
|
||||
# == 6.2 ==================================================================
|
||||
#
|
||||
abline(v = quantile(r, probs = c(0.05)))
|
||||
|
||||
# == 6.3 ==================================================================
|
||||
#
|
||||
( x <- (sum(chs >= chSep(v)) + 1) / (length(chs) + 1) )
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "FND-STA-Significance.R"
|
||||
#
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the FND-STA-Significance unit.
|
||||
#
|
||||
# Version: 1.3
|
||||
#
|
||||
# Date: 2017-09 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.3 2020 Maintenance. Add sample solution.
|
||||
# 1.2 Update set.seed() usage
|
||||
# 1.1 Corrected treatment of empirical p-value
|
||||
# 1.0 First contents
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ------------------------------------------------------------------
|
||||
#TOC> 1 Significance and p-value 49
|
||||
#TOC> 1.1 Significance levels 60
|
||||
#TOC> 1.2 probability and p-value 77
|
||||
#TOC> 1.2.1 p-value illustrated 109
|
||||
#TOC> 2 One- or two-sided 165
|
||||
#TOC> 3 Significance by integration 209
|
||||
#TOC> 4 Significance by simulation or permutation 215
|
||||
#TOC> 5 Final tasks 327
|
||||
#TOC> 6 Sample solutions 336
|
||||
#TOC> 6.1 338
|
||||
#TOC> 6.2 342
|
||||
#TOC> 6.3 346
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Significance and p-value ============================================
|
||||
|
||||
# The idea of the probability of an event has a precise mathematical
|
||||
# interpretation, but how is it useful to know the probability? Usually we are
|
||||
# interested in whether we should accept or reject a hypothesis based on the
|
||||
# observations we have. A rational way to do this is to say: if the probability
|
||||
# of observing the data is very small under the null-hypothesis, then we will
|
||||
# assume the observation is due to something other than the null-hypothesis. But
|
||||
# what do we mean by the "probability of our observation"? And what is "very
|
||||
# small"?
|
||||
|
||||
# == 1.1 Significance levels ===============================================
|
||||
|
||||
# A "very small" probability is purely a matter of convention - a cultural
|
||||
# convention. In the biomedical field we usually call probabilities of less than
|
||||
# 0.05 (5%) small enough to reject the null-hypothesis. Thus we call
|
||||
# observations with a probability of less than 0.05 "significant" and if we want
|
||||
# to highlight this in text or in a graph, we often mark them with an asterisk
|
||||
# (*). Also we often call observations with a probability of less than 0.01
|
||||
# "highly significant" and mark them with two asterisks (**). But there is no
|
||||
# special significance in these numbers, the cutoff point for significance could
|
||||
# also be 0.0498631, or 0.03, or 1/(pi^3). 0.05 is just the value that the
|
||||
# British statistician Ronald Fisher happened to propose for this purpose in
|
||||
# 1925. Incidentally, Fisher later recommended to use different cutoffs for
|
||||
# different purposes (cf.
|
||||
# https://en.wikipedia.org/wiki/Statistical_significance).
|
||||
|
||||
|
||||
# == 1.2 probability and p-value ===========================================
|
||||
|
||||
# But what do we even mean by the probability of an observation?
|
||||
# Assume I am drawing samples from a normal distribution with a mean of 0 and a
|
||||
# standard deviation of 1. The sample I get is ...
|
||||
|
||||
set.seed(sqrt(5))
|
||||
x <- rnorm(1)
|
||||
set.seed(NULL)
|
||||
|
||||
print(x, digits = 22)
|
||||
# [1] -0.8969145466249813791748
|
||||
|
||||
# So what's the probability of that number? Obviously, the probability of
|
||||
# getting exactly this number is very, very, very small. But also obviously,
|
||||
# this does not mean that observing this number is in any way significant - we
|
||||
# always observe some number. That's not what we mean in this case. There are
|
||||
# several implicit assumptions when we speak of the probability of an
|
||||
# observation:
|
||||
|
||||
# 1: the observation can be compared to a probability distribution;
|
||||
# 2: that distribution can be integrated between any specific value
|
||||
# and its upper and lower bounds (or +- infinity).
|
||||
|
||||
# Then what we really mean by the probability of an observation in the context
|
||||
# of that distribution is: the probability of observing that value, or a value
|
||||
# more extreme than the one we have. We call this the p-value. Note that we are
|
||||
# not talking about an individual number anymore, we are talking about the area
|
||||
# under the curve between our observation and the upper (or lower) bound of the
|
||||
# curve, as a fraction of the whole.
|
||||
|
||||
|
||||
# === 1.2.1 p-value illustrated
|
||||
|
||||
# Let's illustrate. First we draw a million random values from our
|
||||
# standard, normal distribution:
|
||||
|
||||
N <- 1e6 # one million
|
||||
set.seed(112358) # set RNG seed for repeatable randomness
|
||||
r <- rnorm(N) # N values from a normal distribution
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
# Let's see what the distribution looks like:
|
||||
|
||||
(h <- hist(r))
|
||||
|
||||
# The histogram details are now available in the list h - e.g. h$counts
|
||||
|
||||
# Where is the value we have drawn previously?
|
||||
abline(v = x, col = "#EE0000")
|
||||
|
||||
# How many values are smaller?
|
||||
sum(r < x)
|
||||
|
||||
# Let's color the bars:
|
||||
# first, make a vector of red and green colors for the bars with breaks
|
||||
# smaller and larger than x, white for the bar that contains x ...
|
||||
hCol <- rep("#EE000044", sum(h$breaks < x) - 1)
|
||||
hCol <- c(hCol, "#FFFFFFFF")
|
||||
hCol <- c(hCol, rep("#00EE0044", sum(h$breaks > x) - 1))
|
||||
# ... then plot the histogram, with colored bars ...
|
||||
hist(r, col = hCol)
|
||||
# ... add two colored rectangles into the white bar ...
|
||||
idx <- sum(h$breaks < x)
|
||||
xMin <- h$breaks[idx]
|
||||
xMax <- h$breaks[idx + 1]
|
||||
y <- h$counts[idx]
|
||||
rect(xMin, 0, x, y, col = "#EE000044", border = TRUE)
|
||||
rect(x, 0, xMax, y, col = "#00EE0044", border = TRUE)
|
||||
# ... and a red line for our observation.
|
||||
abline(v = x, col = "#EE0000", lwd = 2)
|
||||
|
||||
# The p-value of our observation is the red area as a fraction of the
|
||||
# whole histogram (red + green).
|
||||
|
||||
|
||||
# Task:
|
||||
# Explain how the expression sum(r < x) works to give us a count of values
|
||||
# with the property we are looking for. E.g., examine -4:4 < x
|
||||
|
||||
# Task:
|
||||
# Write an expression to estimate the probability that a value
|
||||
# drawn from the vector r is less-or-equal to x. The result you get
|
||||
# will depend on the exact values that went into the vector r but it should
|
||||
# be close to 0.185. That expression is the p-value associated with x.
|
||||
# (Sample solution 6.1)
|
||||
|
||||
|
||||
# = 2 One- or two-sided ===================================================
|
||||
|
||||
# The shape of our histogram confirms that the rnorm() function has returned
|
||||
# values that appear distributed according to a normal distribution. In a normal
|
||||
# distribution, readily available tables tell us that 5% of the values (i.e. our
|
||||
# significance level) lie 1.96 (or approximately 2) standard deviations away
|
||||
# from the mean. Is this the case here? How many values in our vector r are
|
||||
# larger than 1.96?
|
||||
|
||||
sum(r > 1.96)
|
||||
# [1] 24589
|
||||
|
||||
# Wait - that's about 2.5% of 1,000,000, not 5% as expected. Why?
|
||||
|
||||
# The answer is: we have to be careful with two-sided distributions. 2 standard
|
||||
# deviations away from the mean means either larger or smaller than 1.96 . This
|
||||
# can give rise to errors. If we are simply interested in outliers, no
|
||||
# matter larger or smaller, then the 1.96 SD cutoff for significance is correct.
|
||||
# But if we are specifically interested in, say, larger values, because a
|
||||
# smaller value is not meaningful, then the significance cutoff, expressed as
|
||||
# standard deviations, is relaxed. We can use the quantile function to see what
|
||||
# the cutoff values are:
|
||||
|
||||
quantile(r)
|
||||
quantile(r, probs = c(0.025, 0.975)) # for the symmetric 2.5% boundaries
|
||||
# close to ± 1.96, as expected
|
||||
quantile(r, probs = 0.95) # for the single 5% boundary
|
||||
# close to 1.64 . Check counts to confirm:
|
||||
sum(r > quantile(r, probs = 0.95))
|
||||
# [1] 50000
|
||||
# which is 5%, as expected.
|
||||
|
||||
# Task:
|
||||
# Use abline() to add the p = 0.05 boundary for smaller values to the histogram.
|
||||
# (Sample solution 6.2)
|
||||
|
||||
# To summarize: when we evaluate the significance of an event, we divide a
|
||||
# probability distribution into two parts at the point where the event was
|
||||
# observed. We then ask whether the integral over the more extreme part is less
|
||||
# or more than 5% of the whole. If it is less, we deem the event to be
|
||||
# significant.
|
||||
#
|
||||
|
||||
|
||||
# = 3 Significance by integration =========================================
|
||||
|
||||
# If the underlying probability distribution can be analytically or numerically
|
||||
# integrated, the significance of an observation can be directly computed.
|
||||
|
||||
|
||||
# = 4 Significance by simulation or permutation ===========================
|
||||
|
||||
# But whether the integration is correct, or relies on assumptions that may not
|
||||
# be warranted for biological data, can be a highly technical question.
|
||||
# Fortunately, we can often simply run a simulation, a random resampling, or a
|
||||
# permutation and then count the number of outcomes, just as we did with our
|
||||
# rnorm() samples. We call this an empirical p-value. (Actually, the "empirical
|
||||
# p-value" is defined as (Nobs + 1) / (N + 1). )
|
||||
|
||||
# Here is an example. Assume you have a protein sequence and
|
||||
# you speculate that positively charged residues are close to negatively charged
|
||||
# residues to balance charge locally. A statistic that would capture this is the
|
||||
# mean minimum distance between all D,E residues and the closest R,K,H
|
||||
# residue. Let's compute this for the sequence of yeast Mbp1.
|
||||
|
||||
MBP1 <- paste0("MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK",
|
||||
"ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA",
|
||||
"SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR",
|
||||
"KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ",
|
||||
"QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS",
|
||||
"PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY",
|
||||
"FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS",
|
||||
"IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP",
|
||||
"SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT",
|
||||
"ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP",
|
||||
"VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK",
|
||||
"IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR",
|
||||
"QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK",
|
||||
"IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA")
|
||||
|
||||
# first we split this string into individual characters:
|
||||
v <- unlist(strsplit(MBP1, ""))
|
||||
|
||||
# and find the positions of our charged residues
|
||||
|
||||
ED <- grep("[ED]", v)
|
||||
RKH <- grep("[RKH]", v)
|
||||
|
||||
sep <- numeric(length(ED)) # this vector will hold the distances
|
||||
for (i in seq_along(ED)) {
|
||||
sep[i] <- min(abs(RKH - ED[i]))
|
||||
}
|
||||
|
||||
# Task: read and explain this bit of code
|
||||
|
||||
# Now that sep is computed, what does it look like?
|
||||
|
||||
table(sep) # these are the minimum distances
|
||||
# 24 of D,E residues are adjacent to R,K,H;
|
||||
# the longest separation is 28 residues.
|
||||
|
||||
# What is the mean separation?
|
||||
mean(sep)
|
||||
|
||||
# The value is 4.1 . Is this significant? Honestly, I would be hard pressed
|
||||
# to solve this analytically. But by permutation it's soooo easy.
|
||||
|
||||
# First, we combine what we have done above into a function:
|
||||
|
||||
chSep <- function(v) {
  # Mean minimum separation of oppositely charged residues.
  # Parameter: v (char) a vector of amino acids in the one-letter code
  #              (upper- or lower-case letters are both recognized)
  # Value:       (numeric) for every D or E in v, the distance to the
  #              nearest R, K or H is determined; the mean of these
  #              distances is returned.

  acidic <- grep("[EDed]", v)      # positions of D and E
  basic  <- grep("[RKHrkh]", v)    # positions of R, K and H

  # One nearest-neighbour distance per acidic position.
  nearest <- vapply(acidic,
                    function(pos) min(abs(basic - pos)),
                    numeric(1))

  mean(nearest)
}
|
||||
|
||||
# Execute the function to define it.
|
||||
|
||||
# Confirm that the function gives the same result as the number we
|
||||
# calculated above:
|
||||
chSep(v)
|
||||
|
||||
# Now we can produce a random permutation of v, and recalculate
|
||||
|
||||
set.seed(pi) # set RNG seed for repeatable randomness
|
||||
w <- sample(v, length(v)) # This shuffles the vector v. Memorize this
|
||||
# code paradigm. It is very useful.
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
|
||||
|
||||
chSep(w)
|
||||
# 3.773 ... that's actually less than what we had before.
|
||||
|
||||
# Let's do this 10000 times and record the results (takes a few seconds):
|
||||
|
||||
N <- 10000
|
||||
chs <- numeric(N)
|
||||
for (i in 1:N) {
|
||||
chs[i] <- chSep(sample(v, length(v))) # charge
|
||||
}
|
||||
|
||||
hist(chs, breaks = 50)
|
||||
abline(v = chSep(v), col = "#EE0000")
|
||||
|
||||
# Contrary to our expectations, the actual observed mean minimum charge
|
||||
# separation seems to be larger than what we observe in randomly permuted
|
||||
# sequences. But is this significant? Your task to find out.
|
||||
|
||||
# Task:
|
||||
# Calculate the empirical p-value for chSep(v)
|
||||
# (Sample solution 6.3)
|
||||
|
||||
|
||||
# = 5 Final tasks =========================================================
|
||||
|
||||
# From chs, compute the empirical p-value of a mean minimum charge separation to
|
||||
# be larger or equal to the value observed for the yeast MBP1 sequence. Note
|
||||
# the result in your journal. Is it significant? Also note the result of
|
||||
# the following expression for validation:
|
||||
seal(sum(chs))
|
||||
|
||||
|
||||
# = 6 Sample solutions ====================================================
|
||||
|
||||
# == 6.1 ==================================================================
|
||||
#
|
||||
sum(r <= x) / length(r)
|
||||
|
||||
# == 6.2 ==================================================================
|
||||
#
|
||||
abline(v = quantile(r, probs = c(0.05)))
|
||||
|
||||
# == 6.3 ==================================================================
|
||||
#
|
||||
( x <- (sum(chs >= chSep(v)) + 1) / (length(chs) + 1) )
|
||||
|
||||
|
||||
# [END]
|
||||
|
@ -1,3 +1,3 @@
|
||||
# BCH441-WORK-ABC-units
|
||||
|
||||
# BCH441-WORK-ABC-units
|
||||
|
||||
This is a fork of the project [ABC-units](https://github.com/hyginn/ABC-units) designed for BCH441. This setup allows changes to be committed here but updates pushed to the original repository can be fetched and pulled to keep up to date.
|
490
RPR-Biostrings.R
490
RPR-Biostrings.R
@ -1,245 +1,245 @@
|
||||
# tocID <- "RPR-Biostrings.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-Biostrings unit.
|
||||
#
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017-10 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 2020 Updates
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# use BiocManager:: not biocLite()
|
||||
# 1.0 2017 Revisions
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -----------------------------------------------------------------
|
||||
#TOC> 1 The Biostrings:: Package 56
|
||||
#TOC> 2 Getting Data into Biostrings:: Objects 88
|
||||
#TOC> 3 Working with Biostrings:: Objects 110
|
||||
#TOC> 3.1 Properties 127
|
||||
#TOC> 3.2 Subsetting 168
|
||||
#TOC> 3.3 Operators 180
|
||||
#TOC> 3.4 Transformations 187
|
||||
#TOC> 4 Getting Data out of Biostrings:: Objects 194
|
||||
#TOC> 5 More 203
|
||||
#TOC> 5.1 Views 205
|
||||
#TOC> 5.2 Iranges 219
|
||||
#TOC> 5.3 StringSets 225
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# This is a very brief introduction to the Biostrings:: package, other units will
|
||||
# be using more of the Biostrings:: functions.
|
||||
|
||||
|
||||
# = 1 The Biostrings:: Package ============================================
|
||||
|
||||
|
||||
# First, we install and load the Biostrings:: package from bioconductor (if we
|
||||
# haven't done so already).
|
||||
|
||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
if (! requireNamespace("Biostrings", quietly = TRUE)) {
|
||||
BiocManager::install("Biostrings")
|
||||
}
|
||||
# Examine the package information:
|
||||
library(help = Biostrings) # basic information
|
||||
browseVignettes("Biostrings") # available vignettes
|
||||
data(package = "Biostrings") # available datasets
|
||||
|
||||
|
||||
# At its core, Biostrings:: objects are "classes" of type XString (you can think
|
||||
# of a "class" in R as a special kind of list), that can take on particular
|
||||
# flavours for RNA, DNA or amino acid sequence information.
|
||||
|
||||
class(Biostrings::RNAString("AUG"))
|
||||
class(Biostrings::DNAString("ATG"))
|
||||
class(Biostrings::AAString("M"))
|
||||
|
||||
# An essential property of Biostrings:: objects is that they only allow letters
|
||||
# from the applicable IUPAC alphabet:
|
||||
Biostrings::RNAString("AUG")
|
||||
Biostrings::DNAString("AUG") # Error! No "U" in IUPAC DNA codes
|
||||
|
||||
|
||||
# = 2 Getting Data into Biostrings:: Objects ==============================
|
||||
|
||||
|
||||
# Example: read FASTA. Extract sequence. Convert to DNAString object.
|
||||
rawSeq <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
|
||||
rawSeq <- dbSanitizeSequence(rawSeq)
|
||||
biosDNAseq <- Biostrings::DNAString(rawSeq) # converts the nucleotide sequence
|
||||
# into an object of class DNAString
|
||||
|
||||
# Multi FASTA files can be read directly as an "XStringSet" ...
|
||||
rawMFAfile <- "./data/S288C_YDL056W_MBP1_coding.fsa"
|
||||
(biosDNASet <- Biostrings::readDNAStringSet(rawMFAfile))
|
||||
|
||||
# ... and if you subset one sequence from the set, you get an XString object
|
||||
# back again.
|
||||
(Xseq <- biosDNASet[[1]])
|
||||
|
||||
biosDNAseq == Xseq # the comparison evaluates to TRUE ...
|
||||
identical(biosDNAseq, Xseq) # ... and indeed the objects are deemed identical.
|
||||
|
||||
|
||||
|
||||
# = 3 Working with Biostrings:: Objects ===================================
|
||||
|
||||
# Biostrings:: is a highly engineered package that is tightly integrated into
|
||||
# the Bioconductor world - unfortunately that brings with it a somewhat
|
||||
# undesirable level of computational overhead and dependencies. Using the
|
||||
# package as we normally do - i.e. calling required functions with their
|
||||
# explicit package prefix is therefore not advisable. There are generics
|
||||
# that won't be properly dispatched. If you only need a small number of
|
||||
# functions for a very specific context, you will probably get away with
|
||||
# Biostrings::<function>() - but even in the demonstration code of this script
|
||||
# not everything works out of the box. We'll therefore load the library,
|
||||
# but we'll (redundantly) use the prefix anyway so as to emphasize where
|
||||
# the functions come from.
|
||||
|
||||
library(Biostrings)
|
||||
|
||||
|
||||
# == 3.1 Properties ========================================================
|
||||
str(rawSeq)
|
||||
str(biosDNAseq)
|
||||
|
||||
length(rawSeq) # ... is 1: one string only. To get the number of
|
||||
# characters in a string, you need nchar().
|
||||
length(biosDNAseq) # but the length of a "Bstring" is the number of elements
|
||||
nchar(rawSeq)
|
||||
nchar(biosDNAseq) # ... but nchar() works too.
|
||||
|
||||
(uL <- Biostrings::uniqueLetters(biosDNAseq))
|
||||
|
||||
# Count frequencies - with strings, you would strsplit() into a character
|
||||
# vector and then use table(). Biostrings:: provides alphabetFrequency().
|
||||
Biostrings::alphabetFrequency(biosDNAseq)
|
||||
|
||||
# letterFrequency() works with a defined alphabet - such as what uniqueLetters()
|
||||
# returns.
|
||||
Biostrings::letterFrequency(biosDNAseq, uL)
|
||||
sum(Biostrings::letterFrequency(biosDNAseq, c("G", "C"))) /
|
||||
length(biosDNAseq) # GC contents
|
||||
|
||||
Biostrings::dinucleotideFrequency(biosDNAseq)
|
||||
barplot(sort(Biostrings::dinucleotideFrequency(biosDNAseq)), cex.names = 0.5)
|
||||
|
||||
(triNuc <- Biostrings::trinucleotideFrequency(biosDNAseq))
|
||||
barplot(sort(triNuc), col="#4499EE33")
|
||||
triNuc[triNuc == max(triNuc)]
|
||||
triNuc[triNuc == min(triNuc)]
|
||||
max(triNuc) / min(triNuc) # AAA is more than 13 times as frequent as CGT
|
||||
|
||||
# compare to a shuffled sequence:
|
||||
(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))
|
||||
barplot(sort(triNuc), col="#EEEE4433", add = TRUE)
|
||||
max(triNuc)
|
||||
# Interpret this plot.
|
||||
(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))
|
||||
barplot(sort(triNuc), col="#EEEE4433")
|
||||
max(triNuc)
|
||||
|
||||
|
||||
# == 3.2 Subsetting ========================================================
|
||||
|
||||
# Subsetting any XString object works as expected:
|
||||
biosDNAseq[4:15]
|
||||
|
||||
# ... well - maybe not expected, because rawSeq[4:15] would not work.
|
||||
|
||||
# Alternatively to the "[" operator, use the subseq() function - especially for
|
||||
# long sequences. This is far more efficient.
|
||||
Biostrings::subseq(biosDNAseq, start = 1, end = 30)
|
||||
|
||||
|
||||
# == 3.3 Operators =========================================================
|
||||
|
||||
# RNAString and DNAString objects compare U and T as equals!
|
||||
Biostrings::RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") ==
|
||||
Biostrings::DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT")
|
||||
|
||||
|
||||
# == 3.4 Transformations ===================================================
|
||||
|
||||
biosDNAseq[4:15]
|
||||
Biostrings::reverseComplement(biosDNAseq[4:15])
|
||||
Biostrings::translate(biosDNAseq[4:15])
|
||||
|
||||
|
||||
# = 4 Getting Data out of Biostrings:: Objects ============================
|
||||
|
||||
# If you need a character object, use toString():
|
||||
|
||||
Biostrings::toString(biosDNAseq[4:15])
|
||||
|
||||
# saveRDS() and readRDS() work like on all other R objects.
|
||||
|
||||
|
||||
# = 5 More ================================================================
|
||||
|
||||
# == 5.1 Views =============================================================
|
||||
|
||||
# Biostring "Views" are objects that store multiple substrings of one
|
||||
# Biostring object.
|
||||
|
||||
(myView <- Biostrings::Views(biosDNAseq,
|
||||
start = c(1, 19, 37),
|
||||
end = c(15, 30, 45)))
|
||||
|
||||
# Views are convenient to store feature annotations
|
||||
names(myView) <- c("Feature-A", "Feature-B", "Feature-C")
|
||||
cat(sprintf("\n%s\t(%d)\t%s", names(myView), width(myView), myView ))
|
||||
|
||||
|
||||
# == 5.2 Iranges ===========================================================
|
||||
|
||||
# Biostrings:: Iranges are like Views with a common start point. These can be
|
||||
# useful for feature annotations. Instead of start/end you store start/width.
|
||||
|
||||
|
||||
# == 5.3 StringSets ========================================================
|
||||
|
||||
# Biostring "StringSets" store multiple sequences.
|
||||
#
|
||||
ompA <- Biostrings::AAString("MKKTAIAIAVALAGFATVAQA")
|
||||
sample(ompA) # sample can work directly on a Biostring object to shuffle it
|
||||
|
||||
x <- Biostrings::toString(ompA)
|
||||
for (i in 2:10) {
|
||||
x[i] <- Biostrings::toString(sample(ompA))
|
||||
}
|
||||
shuffledPeptideSet <- Biostrings::AAStringSet(x)
|
||||
names(shuffledPeptideSet) <- c("ompA", paste("shuffle.", 1:9, sep=""))
|
||||
shuffledPeptideSet
|
||||
|
||||
length(shuffledPeptideSet)
|
||||
Biostrings::width(shuffledPeptideSet)
|
||||
Biostrings::alphabetFrequency(shuffledPeptideSet)
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "RPR-Biostrings.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-Biostrings unit.
|
||||
#
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017-10 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 2020 Updates
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# use BiocManager:: not biocLite()
|
||||
# 1.0 2017 Revisions
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -----------------------------------------------------------------
|
||||
#TOC> 1 The Biostrings:: Package 56
|
||||
#TOC> 2 Getting Data into Biostrings:: Objects 88
|
||||
#TOC> 3 Working with Biostrings:: Objects 110
|
||||
#TOC> 3.1 Properties 127
|
||||
#TOC> 3.2 Subsetting 168
|
||||
#TOC> 3.3 Operators 180
|
||||
#TOC> 3.4 Transformations 187
|
||||
#TOC> 4 Getting Data out of Biostrings:: Objects 194
|
||||
#TOC> 5 More 203
|
||||
#TOC> 5.1 Views 205
|
||||
#TOC> 5.2 Iranges 219
|
||||
#TOC> 5.3 StringSets 225
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# This is a very brief introduction to the Biostrings:: package, other units will
|
||||
# be using more of the Biostrings:: functions.
|
||||
|
||||
|
||||
# = 1 The Biostrings:: Package ============================================
|
||||
|
||||
|
||||
# First, we install and load the Biostrings:: package from bioconductor (if we
|
||||
# haven't done so already).
|
||||
|
||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
if (! requireNamespace("Biostrings", quietly = TRUE)) {
|
||||
BiocManager::install("Biostrings")
|
||||
}
|
||||
# Examine the package information:
|
||||
library(help = Biostrings) # basic information
|
||||
browseVignettes("Biostrings") # available vignettes
|
||||
data(package = "Biostrings") # available datasets
|
||||
|
||||
|
||||
# At its core, Biostrings:: objects are "classes" of type XString (you can think
|
||||
# of a "class" in R as a special kind of list), that can take on particular
|
||||
# flavours for RNA, DNA or amino acid sequence information.
|
||||
|
||||
class(Biostrings::RNAString("AUG"))
|
||||
class(Biostrings::DNAString("ATG"))
|
||||
class(Biostrings::AAString("M"))
|
||||
|
||||
# An essential property of Biostrings:: objects is that they only allow letters
|
||||
# from the applicable IUPAC alphabet:
|
||||
Biostrings::RNAString("AUG")
|
||||
Biostrings::DNAString("AUG") # Error! No "U" in IUPAC DNA codes
|
||||
|
||||
|
||||
# = 2 Getting Data into Biostrings:: Objects ==============================
|
||||
|
||||
|
||||
# Example: read FASTA. Extract sequence. Convert to DNAString object.
|
||||
rawSeq <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
|
||||
rawSeq <- dbSanitizeSequence(rawSeq)
|
||||
biosDNAseq <- Biostrings::DNAString(rawSeq) # converts the nucleotide sequence
|
||||
# into an object of class DNAString
|
||||
|
||||
# Multi FASTA files can be read directly as an "XStringSet" ...
|
||||
rawMFAfile <- "./data/S288C_YDL056W_MBP1_coding.fsa"
|
||||
(biosDNASet <- Biostrings::readDNAStringSet(rawMFAfile))
|
||||
|
||||
# ... and if you subset one sequence from the set, you get an XString object
|
||||
# back again.
|
||||
(Xseq <- biosDNASet[[1]])
|
||||
|
||||
biosDNAseq == Xseq # the comparison evaluates to TRUE ...
|
||||
identical(biosDNAseq, Xseq) # ... and indeed the objects are deemed identical.
|
||||
|
||||
|
||||
|
||||
# = 3 Working with Biostrings:: Objects ===================================
|
||||
|
||||
# Biostrings:: is a highly engineered package that is tightly integrated into
|
||||
# the Bioconductor world - unfortunately that brings with it a somewhat
|
||||
# undesirable level of computational overhead and dependencies. Using the
|
||||
# package as we normally do - i.e. calling required functions with their
|
||||
# explicit package prefix is therefore not advisable. There are generics
|
||||
# that won't be properly dispatched. If you only need a small number of
|
||||
# functions for a very specific context, you will probably get away with
|
||||
# Biostrings::<function>() - but even in the demonstration code of this script
|
||||
# not everything works out of the box. We'll therefore load the library,
|
||||
# but we'll (redundantly) use the prefix anyway so as to emphasize where
|
||||
# the functions come from.
|
||||
|
||||
library(Biostrings)
|
||||
|
||||
|
||||
# == 3.1 Properties ========================================================
|
||||
str(rawSeq)
|
||||
str(biosDNAseq)
|
||||
|
||||
length(rawSeq) # ... is 1: one string only. To get the number of
|
||||
# characters in a string, you need nchar().
|
||||
length(biosDNAseq) # but the length of a "Bstring" is the number of elements
|
||||
nchar(rawSeq)
|
||||
nchar(biosDNAseq) # ... but nchar() works too.
|
||||
|
||||
(uL <- Biostrings::uniqueLetters(biosDNAseq))
|
||||
|
||||
# Count frequencies - with strings, you would strsplit() into a character
|
||||
# vector and then use table(). Biostrings:: provides alphabetFrequency().
|
||||
Biostrings::alphabetFrequency(biosDNAseq)
|
||||
|
||||
# letterFrequency() works with a defined alphabet - such as what uniqueLetters()
|
||||
# returns.
|
||||
Biostrings::letterFrequency(biosDNAseq, uL)
|
||||
sum(Biostrings::letterFrequency(biosDNAseq, c("G", "C"))) /
|
||||
length(biosDNAseq) # GC contents
|
||||
|
||||
Biostrings::dinucleotideFrequency(biosDNAseq)
|
||||
barplot(sort(Biostrings::dinucleotideFrequency(biosDNAseq)), cex.names = 0.5)
|
||||
|
||||
(triNuc <- Biostrings::trinucleotideFrequency(biosDNAseq))
|
||||
barplot(sort(triNuc), col="#4499EE33")
|
||||
triNuc[triNuc == max(triNuc)]
|
||||
triNuc[triNuc == min(triNuc)]
|
||||
max(triNuc) / min(triNuc) # AAA is more than 13 times as frequent as CGT
|
||||
|
||||
# compare to a shuffled sequence:
|
||||
(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))
|
||||
barplot(sort(triNuc), col="#EEEE4433", add = TRUE)
|
||||
max(triNuc)
|
||||
# Interpret this plot.
|
||||
(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))
|
||||
barplot(sort(triNuc), col="#EEEE4433")
|
||||
max(triNuc)
|
||||
|
||||
|
||||
# == 3.2 Subsetting ========================================================
|
||||
|
||||
# Subsetting any XString object works as expected:
|
||||
biosDNAseq[4:15]
|
||||
|
||||
# ... well - maybe not expected, because rawSeq[4:15] would not work.
|
||||
|
||||
# Alternatively to the "[" operator, use the subseq() function - especially for
|
||||
# long sequences. This is far more efficient.
|
||||
Biostrings::subseq(biosDNAseq, start = 1, end = 30)
|
||||
|
||||
|
||||
# == 3.3 Operators =========================================================
|
||||
|
||||
# RNAString and DNAString objects compare U and T as equals!
|
||||
Biostrings::RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") ==
|
||||
Biostrings::DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT")
|
||||
|
||||
|
||||
# == 3.4 Transformations ===================================================
|
||||
|
||||
biosDNAseq[4:15]
|
||||
Biostrings::reverseComplement(biosDNAseq[4:15])
|
||||
Biostrings::translate(biosDNAseq[4:15])
|
||||
|
||||
|
||||
# = 4 Getting Data out of Biostrings:: Objects ============================
|
||||
|
||||
# If you need a character object, use toString():
|
||||
|
||||
Biostrings::toString(biosDNAseq[4:15])
|
||||
|
||||
# saveRDS() and readRDS() work like on all other R objects.
|
||||
|
||||
|
||||
# = 5 More ================================================================
|
||||
|
||||
# == 5.1 Views =============================================================
|
||||
|
||||
# Biostring "Views" are objects that store multiple substrings of one
|
||||
# Biostring object.
|
||||
|
||||
(myView <- Biostrings::Views(biosDNAseq,
|
||||
start = c(1, 19, 37),
|
||||
end = c(15, 30, 45)))
|
||||
|
||||
# Views are convenient to store feature annotations
|
||||
names(myView) <- c("Feature-A", "Feature-B", "Feature-C")
|
||||
cat(sprintf("\n%s\t(%d)\t%s", names(myView), width(myView), myView ))
|
||||
|
||||
|
||||
# == 5.2 Iranges ===========================================================
|
||||
|
||||
# Biostrings:: Iranges are like Views with a common start point. These can be
|
||||
# useful for feature annotations. Instead of start/end you store start/width.
|
||||
|
||||
|
||||
# == 5.3 StringSets ========================================================
|
||||
|
||||
# Biostring "StringSets" store multiple sequences.
|
||||
#
|
||||
ompA <- Biostrings::AAString("MKKTAIAIAVALAGFATVAQA")
|
||||
sample(ompA) # sample can work directly on a Biostring object to shuffle it
|
||||
|
||||
x <- Biostrings::toString(ompA)
|
||||
for (i in 2:10) {
|
||||
x[i] <- Biostrings::toString(sample(ompA))
|
||||
}
|
||||
shuffledPeptideSet <- Biostrings::AAStringSet(x)
|
||||
names(shuffledPeptideSet) <- c("ompA", paste("shuffle.", 1:9, sep=""))
|
||||
shuffledPeptideSet
|
||||
|
||||
length(shuffledPeptideSet)
|
||||
Biostrings::width(shuffledPeptideSet)
|
||||
Biostrings::alphabetFrequency(shuffledPeptideSet)
|
||||
|
||||
|
||||
# [END]
|
||||
|
@ -1,165 +1,165 @@
|
||||
# tocID <- "RPR-ChimeraX_remote.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code demonstrating remote scripting of ChimeraX.
|
||||
#
|
||||
# Version: 1.0.1
|
||||
#
|
||||
# Date: 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.0.1 2021 Minimal updates
|
||||
# 1.0 First ABC units version
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
# %-encode and escape quotes, or just pass-through?
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ------------------------------------------------------
|
||||
#TOC> 1 ChimeraX REMOTE SCRIPTING 41
|
||||
#TOC> 1.1 Defining a Port 59
|
||||
#TOC> 1.2 Open ChimeraX 81
|
||||
#TOC> 2 WORKED EXAMPLE: SUPERPOSITION 113
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 ChimeraX REMOTE SCRIPTING ===========================================
|
||||
|
||||
|
||||
# One of the cool features of ChimeraX is that it can be driven by Python code,
|
||||
# both within a running session and through Python scripts. What I find even
|
||||
# cooler though is that ChimeraX can be driven from any programming language via
|
||||
# its remote control function that can listen to commands sent from any other
|
||||
# application. The interface that is used here is the standard REST (method) -
|
||||
# the GET and POST verbs that ubiquitously underlie the communication of clients
|
||||
# and servers on the Web.
|
||||
|
||||
# In order to establish the communication between this script and ChimeraX, all
|
||||
# we need to do is:
|
||||
# - open ChimeraX;
|
||||
# - tell it to listen on a specific "port";
|
||||
# - send commands to that port via httr::
|
||||
|
||||
|
||||
# == 1.1 Defining a Port ===================================================
|
||||
|
||||
# The httr:: package needs to be available
|
||||
|
||||
if (! requireNamespace("httr", quietly = TRUE)) {
|
||||
install.packages("httr")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = httr) # basic information
|
||||
# browseVignettes("httr") # available vignettes
|
||||
# data(package = "httr") # available datasets
|
||||
|
||||
# We need to think of a port. Any available port number between 49152-65535 is
|
||||
# fine. We'll choose 61803 because that's the fractional part of the golden
|
||||
# ratio. But one could choose another.
|
||||
|
||||
CXPORT <- 61803
|
||||
|
||||
# Check that our current version of R supports sockets (default since V 3.3)
|
||||
capabilities("sockets") # MUST be TRUE. If not, don't continue.
|
||||
|
||||
|
||||
# == 1.2 Open ChimeraX =====================================================
|
||||
|
||||
# - Open a fresh, new session of a recently updated version of ChimeraX
|
||||
# - type:
|
||||
#
|
||||
# remotecontrol rest start port 61803
|
||||
#
|
||||
# ... or whatever the value of CXPORT is.
|
||||
|
||||
# Now watch what happens in ChimeraX when you execute the following line:
|
||||
( x <- httr::GET("http://127.0.0.1:61803/run?command=open+1BM8") )
|
||||
|
||||
# The .utilities.R script includes the function CX(), based on this principle,
|
||||
# through which you can send commands to ChimeraX
|
||||
|
||||
CX("camera sbs")
|
||||
CX("lighting soft")
|
||||
CX("color sequential #1 & protein target abc palette powderblue:orchid:white")
|
||||
|
||||
# The command echoes Chimera's response if the parameter "quietly" is
|
||||
# FALSE (default), and we can silence output with quietly = TRUE :
|
||||
CX("info models #1 attribute num_residues")
|
||||
CX("info models #1 attribute num_residues", quietly = TRUE)
|
||||
|
||||
# Either way, the command also returns Chimera's responses "invisibly";
|
||||
# i.e. we can use the results by assigning the output to a variable:
|
||||
hBonds <- CX("hbonds #1 & protein makePseudobonds false log true", quietly=TRUE)
|
||||
x <- read.table(file = textConnection(hBonds), skip = 9,
|
||||
blank.lines.skip = TRUE, fill = TRUE)
|
||||
hist(x[,13], main="H-bonds", xlab="D···A (Å)", ylab="counts", col="#c9dcff")
|
||||
|
||||
|
||||
# = 2 WORKED EXAMPLE: SUPERPOSITION =======================================
|
||||
|
||||
# We superimpose the 1BM8 structure with the 1DUX crystal structure to be able
|
||||
# to explore possible DNA binding regions in 1BM8
|
||||
|
||||
# The model for 1BM8 is already open as model 1 (#1)
|
||||
CX("hide #1 cartoons") # hide model 1 cartoon representation
|
||||
CX("open 1DUX") # assume this is opened as model #2
|
||||
CX("hide #2") # hide everything ...
|
||||
CX("select #2/C") # chain c (protein)
|
||||
CX("show sel cartoons") # ... and show cartoons of chain c (protein)
|
||||
CX("color sequential sel target c palette steelblue:darkmagenta")
|
||||
CX("view #2/C") # re-center the display
|
||||
CX("cofr #2/C:62@CA") # set pivot to an interface residue
|
||||
CX("select #2/A,B & nucleic-acid") # chains A, B are the cognate DNA
|
||||
CX("style sel stick")
|
||||
CX("show sel target ab") # show atoms/bonds
|
||||
CX("color sequential #2/A & nucleic-acid target ab palette teal:lightcyan")
|
||||
CX("color sequential #2/B & nucleic-acid target ab palette teal:lightcyan")
|
||||
CX("surface sel enclose sel") # compute joint accessible surface of both chains
|
||||
CX("transparency 50")
|
||||
CX("select clear")
|
||||
|
||||
# Now superimpose the 1BM8 chain onto 1DUX chain C
|
||||
CX("show #1 cartoons")
|
||||
CX("matchmaker #1/A to #2/C pairing ss") # the actual superposition
|
||||
|
||||
# study the general layout, and the position of the 1BM8 secondary structure
|
||||
# elements relative to 1DUX
|
||||
|
||||
# Let's examine side chain orientations in more detail
|
||||
CX("hide #2/C cartoons") # hide the 1DUX protein
|
||||
|
||||
# select all residues in 1BM8 that are within 3.5 A of the DNA chains (a, b)
|
||||
CX("select zone #2/A,B 3.5 #1 & protein residues true")
|
||||
CX("~select sel & H") # de-select H atoms
|
||||
CX("show sel target ab")
|
||||
CX("size stickRadius 0.4")
|
||||
CX("select clear")
|
||||
|
||||
# The overall architecture of the Mbp1 APSES domain is a good match for the Elk
|
||||
# transcription factor binding mode; the detailed conformations of side chains
|
||||
# would need to change only to a minor degree. There is a very significant
|
||||
# degree of structural similarity; remarkable, given that the DNA is not the
|
||||
# target sequence of the Mbp1 transcription factor, AND the 1BM8 structure was
|
||||
# determined without a DNA ligand.
|
||||
|
||||
CX("remotecontrol rest stop") # release the socket
|
||||
# Done.
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "RPR-ChimeraX_remote.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code demonstrating remote scripting of ChimeraX.
|
||||
#
|
||||
# Version: 1.0.1
|
||||
#
|
||||
# Date: 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.0.1 2021 Minimal updates
|
||||
# 1.0 First ABC units version
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
# %-encode and escape quotes, or just pass-through?
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ------------------------------------------------------
|
||||
#TOC> 1 ChimeraX REMOTE SCRIPTING 41
|
||||
#TOC> 1.1 Defining a Port 59
|
||||
#TOC> 1.2 Open ChimeraX 81
|
||||
#TOC> 2 WORKED EXAMPLE: SUPERPOSITION 113
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 ChimeraX REMOTE SCRIPTING ===========================================
|
||||
|
||||
|
||||
# One of the cool features of ChimeraX is that it can be driven by Python code,
|
||||
# both within a running session and through Python scripts. What I find even
|
||||
# cooler though is that ChimeraX can be driven from any programming language via
|
||||
# its remote control function that can listen to commands sent from any other
|
||||
# application. The interface that is used here is the standard REST (method) -
|
||||
# the GET and POST verbs that ubiquitously underlie the communication of clients
|
||||
# and servers on the Web.
|
||||
|
||||
# In order to establish the communication between this script and ChimeraX, all
|
||||
# we need to do is:
|
||||
# - open ChimeraX;
|
||||
# - tell it to listen on a specific "port";
|
||||
# - send commands to that port via httr::
|
||||
|
||||
|
||||
# == 1.1 Defining a Port ===================================================
|
||||
|
||||
# The httr:: package needs to be available
|
||||
|
||||
if (! requireNamespace("httr", quietly = TRUE)) {
|
||||
install.packages("httr")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = httr) # basic information
|
||||
# browseVignettes("httr") # available vignettes
|
||||
# data(package = "httr") # available datasets
|
||||
|
||||
# We need to think of a port. Any available port number between 49152-65535 is
|
||||
# fine. We'll choose 61803 because that's the fractional part of the golden
|
||||
# ratio. But one could choose another.
|
||||
|
||||
CXPORT <- 61803
|
||||
|
||||
# Check that our current version of R supports sockets (default since V 3.3)
|
||||
capabilities("sockets") # MUST be TRUE. If not, don't continue.
|
||||
|
||||
|
||||
# == 1.2 Open ChimeraX =====================================================
|
||||
|
||||
# - Open a fresh, new session of a recently updated version of ChimeraX
|
||||
# - type:
|
||||
#
|
||||
# remotecontrol rest start port 61803
|
||||
#
|
||||
# ... or whatever the value of CXPORT is.
|
||||
|
||||
# Now watch what happens in ChimeraX when you execute the following line:
|
||||
( x <- httr::GET("http://127.0.0.1:61803/run?command=open+1BM8") )
|
||||
|
||||
# The .utilities.R script includes the function CX(), based on this principle,
|
||||
# through which you can send commands to ChimeraX
|
||||
|
||||
CX("camera sbs")
|
||||
CX("lighting soft")
|
||||
CX("color sequential #1 & protein target abc palette powderblue:orchid:white")
|
||||
|
||||
# The command echoes Chimera's response if the parameter "quietly" is
|
||||
# FALSE (default), and we can silence output with quietly = TRUE :
|
||||
CX("info models #1 attribute num_residues")
|
||||
CX("info models #1 attribute num_residues", quietly = TRUE)
|
||||
|
||||
# Either way, the command also returns Chimera's responses "invisibly";
|
||||
# i.e. we can use the results by assigning the output to a variable:
|
||||
hBonds <- CX("hbonds #1 & protein makePseudobonds false log true", quietly=TRUE)
|
||||
x <- read.table(file = textConnection(hBonds), skip = 9,
|
||||
blank.lines.skip = TRUE, fill = TRUE)
|
||||
hist(x[,13], main="H-bonds", xlab="D···A (Å)", ylab="counts", col="#c9dcff")
|
||||
|
||||
|
||||
# = 2 WORKED EXAMPLE: SUPERPOSITION =======================================
|
||||
|
||||
# We superimpose the 1BM8 structure with the 1DUX crystal structure to be able
|
||||
# to explore possible DNA binding regions in 1BM8
|
||||
|
||||
# The model for 1BM8 is already open as model 1 (#1)
|
||||
CX("hide #1 cartoons") # hide model 1 cartoon representation
|
||||
CX("open 1DUX") # assume this is opened as model #2
|
||||
CX("hide #2") # hide everything ...
|
||||
CX("select #2/C") # chain c (protein)
|
||||
CX("show sel cartoons") # ... and show cartoons of chain c (protein)
|
||||
CX("color sequential sel target c palette steelblue:darkmagenta")
|
||||
CX("view #2/C") # re-center the display
|
||||
CX("cofr #2/C:62@CA") # set pivot to an interface residue
|
||||
CX("select #2/A,B & nucleic-acid") # chains A, B are the cognate DNA
|
||||
CX("style sel stick")
|
||||
CX("show sel target ab") # show atoms/bonds
|
||||
CX("color sequential #2/A & nucleic-acid target ab palette teal:lightcyan")
|
||||
CX("color sequential #2/B & nucleic-acid target ab palette teal:lightcyan")
|
||||
CX("surface sel enclose sel") # compute joint accessible surface of both chains
|
||||
CX("transparency 50")
|
||||
CX("select clear")
|
||||
|
||||
# Now superimpose the 1BM8 chain onto 1DUX chain C
|
||||
CX("show #1 cartoons")
|
||||
CX("matchmaker #1/A to #2/C pairing ss") # the actual superposition
|
||||
|
||||
# study the general layout, and the position of the 1BM8 secondary structure
|
||||
# elements relative to 1DUX
|
||||
|
||||
# Let's examine side chain orientations in more detail
|
||||
CX("hide #2/C cartoons") # hide the 1DUX protein
|
||||
|
||||
# select all residues in 1BM8 that are within 3.5 A of the DNA chains (a, b)
|
||||
CX("select zone #2/A,B 3.5 #1 & protein residues true")
|
||||
CX("~select sel & H") # de-select H atoms
|
||||
CX("show sel target ab")
|
||||
CX("size stickRadius 0.4")
|
||||
CX("select clear")
|
||||
|
||||
# The overall architecture of the Mbp1 APSES domain is a good match for the Elk
|
||||
# transcription factor binding mode; the detailed conformations of side chains
|
||||
# would need to change only to a minor degree. There is a very significant
|
||||
# degree of structural similarity; remarkable, given that the DNA is not the
|
||||
# target sequence of the Mbp1 transcription factor, AND the 1BM8 structure was
|
||||
# determined without a DNA ligand.
|
||||
|
||||
CX("remotecontrol rest stop") # release the socket
|
||||
# Done.
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
|
644
RPR-FASTA.R
644
RPR-FASTA.R
@ -1,322 +1,322 @@
|
||||
# tocID <- "RPR-FASTA.R"
|
||||
#
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-FASTA unit.
|
||||
#
|
||||
# Version: 1.1.2
|
||||
#
|
||||
# Date: 2017-10 - 2021-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.1.2 style update
|
||||
# 1.1.1 bugfix - wrong function name
|
||||
# 1.1 2020 Maintenance. Rewrite validation logic. Add data
|
||||
# to utilities. Define AACOLS
|
||||
# 1.0 New unit.
|
||||
#
|
||||
#
|
||||
# TODO: Make a simple solution first, then extend it to error checking, and
|
||||
# to handle .mfa files.
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -----------------------------------------------------
|
||||
#TOC> 1 Reading and validating FASTA 45
|
||||
#TOC> 1.1 Validating FASTA 81
|
||||
#TOC> 2 Parsing FASTA 227
|
||||
#TOC> 3 Interpreting FASTA 247
|
||||
#TOC> 4 Writing FASTA 274
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Reading and validating FASTA ========================================
|
||||
|
||||
# FASTA is a text based format, structured in lines that are separated by
|
||||
# line-feed or paragraph-break characters. Which one of these is used, depends
|
||||
# on your operating system. But R's readLines() function knows how to handle
|
||||
# these correctly, across platforms. Don't try to read such files "by hand".
|
||||
# Here is the yeast Mbp1 gene, via SGD.
|
||||
|
||||
file.show("./data/S288C_YDL056W_MBP1_coding.fsa")
|
||||
faMBP1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
|
||||
|
||||
# The warning is generated because the programmer at the NCBI who implemented
|
||||
# the code to write this FASTA file neglected to place a line-break character
|
||||
# after the last sequence character. While this is not technically incorrect,
|
||||
# it is poor practice: the resulting file can't be distinguished from one that
|
||||
# has been truncated in transmission.
|
||||
|
||||
head(faMBP1)
|
||||
|
||||
# Note that there are NO line-break characters ("\n") at the end of these
|
||||
# strings, even though they were present in the original file. readLines()
|
||||
# has "consumed" these characters while reading - but every single line is in
|
||||
# a vector of its own.
|
||||
|
||||
tail(faMBP1)
|
||||
|
||||
# Also note that the last line has fewer characters - this means readLines()
|
||||
# imported the whole line, despite it not being terminated by "\n".
|
||||
|
||||
# It's very straightforward to work with such data, for example by collapsing
|
||||
# everything except the first line into a single string ...
|
||||
|
||||
f <- c(faMBP1[1], paste(faMBP1[-1], sep = "", collapse = ""))
|
||||
|
||||
f[1]
|
||||
nchar(f[2])
|
||||
|
||||
# == 1.1 Validating FASTA ==================================================
|
||||
|
||||
# The code above is making the assumption that everything from line 2 until
|
||||
# the end IS sequence, the whole sequence and nothing but sequence.
|
||||
# That assumption can break down in many ways:
|
||||
#
|
||||
# - there could be more than one header line. The specification says otherwise,
|
||||
# but some older files use multiple, consecutive header lines. You don't
|
||||
# want that to end up in your sequence.
|
||||
# - this could be not a FASTA file at all. It could be raw sequence, a
|
||||
# different sequence file format, or a wholly different file altogether.
|
||||
# If you look at the file, you can immediately tell, but if you are
|
||||
# reading the file in a complex workflow, you could easily import wrong
|
||||
# data into your analysis.
|
||||
# - there could be more than one sequence in the file. Such Multi-FASTA files
|
||||
# occur commonly, as downloads of ORFs from genome regions or other
|
||||
# sets of genes or proteins, or as the input / output for multiple
|
||||
# sequence alignment programs.
|
||||
#
|
||||
# Data "from the wild" can (and usually does) have the most unexpected
|
||||
# variations and it is really, really important to be clear about the
|
||||
# assumptions that you are making. It is possible to "fix" things, according
|
||||
# to the "Robustness Principle" :
|
||||
# "Be conservative in what you send,
|
||||
# be liberal in what you accept".
|
||||
# (cf. https://en.wikipedia.org/wiki/Robustness_principle )
|
||||
# ... but if you think about this, that's actually a really poor idea,
|
||||
# which is much more likely to dilute standards, make unwarranted
|
||||
# assumptions, and allow errors to pass silently and corrupt data.
|
||||
#
|
||||
# Let's discard this principle on the trash-heap of
|
||||
# things-that-sound-like-a-good-idea-but-aren't. What we do instead is test,
|
||||
# identify problems, and follow the principle: "crash early, crash often". Of
|
||||
# course I can write code that would reformat any possible input as a FASTA
|
||||
# file - but what good will it do me if it parses the file I receive
|
||||
# from a server into FASTA format like:
|
||||
#
|
||||
# >404- Page Not Found</title</head>
|
||||
# dyh-PagentfndhpThepageyreqesteddesnteistnthisserverCheckthe
|
||||
# spellingrcntacttheadministratrsdyhtml
|
||||
#
|
||||
# Therefore, we write ourselves a FASTA checker that will enforce the following:
|
||||
# (1) a FASTA file contains one or more sequences separated by zero or
|
||||
# more empty lines
|
||||
# (2) a sequence contains one header line followed by
|
||||
# one or more sequence lines
|
||||
# (3) a sequence line contains one or more uppercase or lowercase single
|
||||
# letter amino acid codes, hyphens (gap character), or * (stop).
|
||||
#
|
||||
# Anything else should generate an error.
|
||||
|
||||
# (Case 1): Header(s) exist
|
||||
fX <- c("ABC",
|
||||
"defghi",
|
||||
"klmnpq")
|
||||
sel <- grepl("^>", fX) # "^>" is a regular expression that
|
||||
# means: the exact character ">" at the
|
||||
# beginning ("^") of the line.
|
||||
if ( ! any(sel) ) { stop("no header lines in input.") }
|
||||
|
||||
|
||||
# (Case 2) No adjacent header lines
|
||||
fX <- c(">ABC",
|
||||
">123",
|
||||
"defghi",
|
||||
"klmnpq")
|
||||
sel <- grepl("^>", fX)
|
||||
sel <- sel[- length(sel)] & sel[-1] # comparing shifted vectors
|
||||
if ( any(sel)) { stop("adjacent header lines in input.") }
|
||||
|
||||
# (Case 3.1) all sequence lines contain only valid characters
|
||||
# (constants for valid characters AAVALID, NUCVALID, and NUCAMBIG
|
||||
# are defined with the .utilities.R script)
|
||||
AAVALID
|
||||
fX <- c(">ABC",
|
||||
"def ;-) ghi",
|
||||
"klmnpq")
|
||||
myRegex <- sprintf("[^%s]", AAVALID) # NOT a valid character
|
||||
sel <- ! grepl("^>", fX) # NOT headers
|
||||
# Stop if any non-header line (selected by sel) contains a character that
# matches myRegex, i.e. a character that is not in AAVALID. With the example
# input defined above, this is expected to stop with an error.
if (any(grepl(myRegex, fX[sel]))) {
  stop("invalid character(s) outside of header lines.")
}
|
||||
|
||||
# (Case 3.2) all headers are followed directly by
|
||||
# at least one letter of sequence
|
||||
fX <- c(">ABC",
|
||||
"",
|
||||
">123",
|
||||
"defghi",
|
||||
"klmnpq")
|
||||
sel <- grep("^>", fX) + 1 # indexes of headers + 1
|
||||
myRegex <- sprintf("[%s]+", AAVALID) # at least one valid character
|
||||
if (! all(grepl(myRegex, fX[sel]))) {
|
||||
stop("a header has no adjacent sequence.")
|
||||
}
|
||||
# Ah, you might ask - couldn't we just have dropped all empty lines, and
|
||||
# then caught this in Case 2? No - for two reasons: we would still miss headers
|
||||
# at the end of file, and, we would have changed the line numbering - and
|
||||
# ideally our "production" function will create information about where the
|
||||
# error is to be found.
|
||||
|
||||
|
||||
# Now combine this into a function ...
|
||||
|
||||
# val() - minimal validator for FASTA formatted input.
#
# Parameters:
#   fa   chr   vector of lines of FASTA data, e.g. as returned by readLines().
# Value:
#   NULL (invisibly) if all checks pass. Calls stop() with a descriptive
#   message at the first check that fails.
# Note:
#   Uses the constant AAVALID (valid sequence characters), which is defined
#   with the .utilities.R script and must exist when val() is called.
val <- function(fa) {

  isHeader <- grepl("^>", fa)     # TRUE for each line that begins with ">"

  # (Case 1) at least one header line must exist
  if ( ! any(isHeader) ) {
    stop("no header lines in input.")
  }

  # (Case 2) no two header lines may be adjacent: compare the vector with
  # itself, shifted by one position
  if ( any(isHeader[- length(isHeader)] & isHeader[-1]) ) {
    stop("adjacent header lines in input.")
  }

  # (Case 3.1) non-header lines must not contain any character
  # that is not in AAVALID
  if ( any(grepl(sprintf("[^%s]", AAVALID), fa[! isHeader])) ) {
    stop("invalid character(s) outside of header lines.")
  }

  # (Case 3.2) the line directly following each header must contain at least
  # one valid sequence character. If a header is the last line, the index
  # is out of range, fa[...] is NA, grepl() returns FALSE for it, and the
  # check fails as intended.
  nextLine <- which(isHeader) + 1
  if ( ! all(grepl(sprintf("[%s]+", AAVALID), fa[nextLine])) ) {
    stop("a header has no adjacent sequence.")
  }

  invisible(NULL)
}
|
||||
|
||||
# Here is an example
|
||||
FA <- c(">head1",
|
||||
"acdef",
|
||||
"ghi",
|
||||
"",
|
||||
">head2",
|
||||
"kl",
|
||||
">head3",
|
||||
"mn",
|
||||
"pqrs")
|
||||
val(FA) # ... should not create an error
|
||||
|
||||
|
||||
# A somewhat more elaborate validateFA() function was loaded with the
|
||||
# ./.utilities.R script. It needs a bit more bookkeeping, since NCBI multi-
|
||||
# fasta files have space-characters in their spacer lines. Try it ...
|
||||
validateFA(FA)
|
||||
|
||||
# = 2 Parsing FASTA =======================================================
|
||||
|
||||
# Once we have validated our assumptions about our input, it's quite
|
||||
# painless to parse it. I have put this together as a function and the function
|
||||
# gets loaded from ./.utilities.R
|
||||
#
|
||||
|
||||
# Let's try this:
|
||||
# - the first 3 elements of faMBP1:
|
||||
readFASTA(faMBP1[1:3])
|
||||
|
||||
# - a multi FASTA file of aligned APSES domain sequences:
|
||||
|
||||
refAPSES <- readFASTA("./data/refAPSES.mfa")
|
||||
|
||||
# Subset the sequence with "P39678" in the header
|
||||
refAPSES[grep("P39678", refAPSES$head) ,]
|
||||
|
||||
|
||||
|
||||
# = 3 Interpreting FASTA ==================================================
|
||||
|
||||
|
||||
# FASTA files are straightforward to interpret - just one thing may be of note:
|
||||
# when working with strings, we can use substr(<string>, <start>, <stop>) to
|
||||
# extract substrings, but more often we expand the string into a vector of
|
||||
# single characters with strsplit(<string>, ""). strsplit() returns a list,
|
||||
# to accommodate that <string> could be a vector of many elements, therefore
|
||||
# we usually unlist() the result if we use it only on a single string.
|
||||
|
||||
# Example: How many positive charged residues in "MBP1_SACCE"?
|
||||
|
||||
s <- unlist(strsplit(refAPSES$seq[grep("MBP1_SACCE", refAPSES$head)], ""))
|
||||
s
|
||||
|
||||
sum(grepl("[HKR]", s)) # 20 (+) charged residues. grepl() returns TRUE and FALSE
|
||||
# for the characters, sum() coerces to 1 and 0
|
||||
# respectively, and that gives us the result.
|
||||
|
||||
100 * sum(grepl("[HKR]", s)) / length(s) # in percent: 20.2 %
|
||||
|
||||
# residue distribution
|
||||
x <- factor(s, levels = names(AACOLS))
|
||||
pie(table(x)[names(AACOLS)], col = AACOLS)
|
||||
|
||||
|
||||
|
||||
# = 4 Writing FASTA =======================================================
|
||||
|
||||
|
||||
# Writing FASTA files is mostly just the reverse of reading, with one
|
||||
# twist: we need to break the long sequence string into chunks of the desired
|
||||
# width. The FASTA specification calls for a maximum of 120 characters per line,
|
||||
# but writing out much less than that is common, since it lets us comfortably
|
||||
# view lines on the console, or print them on a sheet of paper (do we still
|
||||
# do that actually?). How do we break a string into chunks? A combination of
|
||||
# seq(<from>, <to>, <by>) with substring(<string>, <start>, <stop>) will work
|
||||
# nicely. (Note that substring() is vectorized, whereas substr() is not!) As we
|
||||
# loop through our FASTA object in memory, we can build the output by c()'ing
|
||||
# blocks of header + sequence to each other. For VERY large objects this might
|
||||
# be slow - in that case, we might want to precalculate the size of the output
|
||||
# object. But that's more of a hypothetical consideration.
|
||||
|
||||
( s <- refAPSES$seq[2] )
|
||||
nchar(s)
|
||||
w <- 30 # width of chunk
|
||||
(starts <- seq(1, nchar(s), by = w)) # starting index of chunk
|
||||
(ends <- c((starts - 1)[-1], nchar(s))) # ending index of chunk
|
||||
|
||||
# Task: Is this safe? What happens if nchar(s) is shorter than w?
|
||||
# What happens if nchar(s) is an exact multiple of w?
|
||||
|
||||
substring(s, starts, ends)
|
||||
# confirm that the output contains the first and last residue, and both
|
||||
# residues adjacent to the breaks
|
||||
|
||||
# As always, the function has been defined in ".utilities.R" for you to use
|
||||
# any time... type writeFASTA to examine it.
|
||||
|
||||
# Let's try this...
|
||||
|
||||
writeFASTA(refAPSES, width = 40)
|
||||
|
||||
# roundtrip for validation: write refAPSES with a different format,
|
||||
# read it back in - the new dataframe must be identical
|
||||
# to the original dataframe.
|
||||
fname <- tempfile()
|
||||
writeFASTA(refAPSES, fn = fname, width = 30)
|
||||
identical(refAPSES, readFASTA(fname))
|
||||
|
||||
# ...works for me :-)
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "RPR-FASTA.R"
|
||||
#
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-FASTA unit.
|
||||
#
|
||||
# Version: 1.1.2
|
||||
#
|
||||
# Date: 2017-10 - 2021-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.1.2 style update
|
||||
# 1.1.1 bugfix - wrong function name
|
||||
# 1.1 2020 Maintenance. Rewrite validation logic. Add data
|
||||
# to utilities. Define AACOLS
|
||||
# 1.0 New unit.
|
||||
#
|
||||
#
|
||||
# TODO: Make a simple solution first, then extend it to error checking, and
|
||||
# to handle .mfa files.
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -----------------------------------------------------
|
||||
#TOC> 1 Reading and validating FASTA 45
|
||||
#TOC> 1.1 Validating FASTA 81
|
||||
#TOC> 2 Parsing FASTA 227
|
||||
#TOC> 3 Interpreting FASTA 247
|
||||
#TOC> 4 Writing FASTA 274
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Reading and validating FASTA ========================================
|
||||
|
||||
# FASTA is a text based format, structured in lines that are separated by
|
||||
# line-feed or paragraph-break characters. Which one of these is used, depends
|
||||
# on your operating system. But R's readLines() function knows how to handle
|
||||
# these correctly, across platforms. Don't try to read such files "by hand".
|
||||
# Here is the yeast Mbp1 gene, via SGD.
|
||||
|
||||
file.show("./data/S288C_YDL056W_MBP1_coding.fsa")
|
||||
faMBP1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
|
||||
|
||||
# The warning is generated because the programmer at the NCBI who implemented
|
||||
# the code to write this FASTA file neglected to place a line-break character
|
||||
# after the last sequence character. While this is not technically incorrect,
|
||||
# it is poor practice: the resulting file can't be distinguished from one that
|
||||
# has been truncated in transmission.
|
||||
|
||||
head(faMBP1)
|
||||
|
||||
# Note that there are NO line-break characters ("\n") at the end of these
|
||||
# strings, even though they were present in the original file. readLines()
|
||||
# has "consumed" these characters while reading - but every single line is in
|
||||
# an element of its own in the resulting vector.
|
||||
|
||||
tail(faMBP1)
|
||||
|
||||
# Also note that the last line has fewer characters - this means readLines()
|
||||
# imported the whole line, despite it not being terminated by "\n".
|
||||
|
||||
# It's very straightforward to work with such data, for example by collapsing
|
||||
# everything except the first line into a single string ...
|
||||
|
||||
f <- c(faMBP1[1], paste(faMBP1[-1], sep = "", collapse = ""))
|
||||
|
||||
f[1]
|
||||
nchar(f[2])
|
||||
|
||||
# == 1.1 Validating FASTA ==================================================
|
||||
|
||||
# The code above is making the assumption that everything from line 2 until
|
||||
# the end IS sequence, the whole sequence and nothing but sequence.
|
||||
# That assumption can break down in many ways:
|
||||
#
|
||||
# - there could be more than one header line. The specification says otherwise,
|
||||
# but some older files use multiple, consecutive header lines. You don't
|
||||
# want that to end up in your sequence.
|
||||
# - this could be not a FASTA file at all. It could be raw sequence, a
|
||||
# different sequence file format, or a wholly different file altogether.
|
||||
# If you look at the file, you can immediately tell, but if you are
|
||||
#      reading the file in a complex workflow, you could easily import wrong
|
||||
# data into your analysis.
|
||||
# - there could be more than one sequence in the file. Such Multi-FASTA files
|
||||
# occur commonly, as downloads of ORFs from genome regions or other
|
||||
# sets of genes or proteins, or as the input / output for multiple
|
||||
# sequence alignment programs.
|
||||
#
|
||||
# Data "from the wild" can (and usually does) have the most unexpected
|
||||
# variations and it is really, really important to be clear about the
|
||||
# assumptions that you are making. It is possible to "fix" things, according
|
||||
# to the "Robustness Principle" :
|
||||
# "Be conservative in what you send,
|
||||
# be liberal in what you accept".
|
||||
# (cf. https://en.wikipedia.org/wiki/Robustness_principle )
|
||||
# ... but if you think about this, that's actually a really poor idea,
|
||||
# which is much more likely to dilute standards, make unwarranted
|
||||
# assumptions, and allow errors to pass silently and corrupt data.
|
||||
#
|
||||
# Let's discard this principle on the trash-heap of
|
||||
# things-that-sound-like-a-good-idea-but-aren't. What we do instead is test,
|
||||
# identify problems, and follow the principle: "crash early, crash often". Of
|
||||
# course I can write code that would reformat any possible input as a FASTA
|
||||
# file - but what good will it do me if it parses the file I receive
|
||||
# from a server into FASTA format like:
|
||||
#
|
||||
# >404- Page Not Found</title</head>
|
||||
# dyh-PagentfndhpThepageyreqesteddesnteistnthisserverCheckthe
|
||||
# spellingrcntacttheadministratrsdyhtml
|
||||
#
|
||||
# Therefore, we write ourselves a FASTA checker that will enforce the following:
|
||||
# (1) a FASTA file contains one or more sequences separated by zero or
|
||||
# more empty lines
|
||||
# (2) a sequence contains one header line followed by
|
||||
# one or more sequence lines
|
||||
# (3) a sequence line contains one or more uppercase or lowercase single
|
||||
# letter amino acid codes, hyphens (gap character), or * (stop).
|
||||
#
|
||||
# Anything else should generate an error.
|
||||
|
||||
# (Case 1): Header(s) exist
|
||||
fX <- c("ABC",
|
||||
"defghi",
|
||||
"klmnpq")
|
||||
sel <- grepl("^>", fX) # "^>" is a regular expression that
|
||||
# means: the exact character ">" at the
|
||||
# beginning ("^") of the line.
|
||||
if ( ! any(sel) ) { stop("no header lines in input.") }
|
||||
|
||||
|
||||
# (Case 2) No adjacent header lines
|
||||
fX <- c(">ABC",
|
||||
">123",
|
||||
"defghi",
|
||||
"klmnpq")
|
||||
sel <- grepl("^>", fX)
|
||||
sel <- sel[- length(sel)] & sel[-1] # comparing shifted vectors
|
||||
if ( any(sel)) { stop("adjacent header lines in input.") }
|
||||
|
||||
# (Case 3.1) all sequence lines contain only valid characters
|
||||
# (constants for valid characters AAVALID, NUCVALID, and NUCAMBIG
|
||||
# are defined with the .utilities.R script)
|
||||
AAVALID
|
||||
fX <- c(">ABC",
|
||||
"def ;-) ghi",
|
||||
"klmnpq")
|
||||
myRegex <- sprintf("[^%s]", AAVALID) # NOT a valid character
|
||||
sel <- ! grepl("^>", fX) # NOT headers
|
||||
if (any(grepl(myRegex, fX[sel]))) {
|
||||
stop("invalid chracter(s) outside of header lines.")
|
||||
}
|
||||
|
||||
# (Case 3.2) all headers are followed directly by
|
||||
# at least one letter of sequence
|
||||
fX <- c(">ABC",
|
||||
"",
|
||||
">123",
|
||||
"defghi",
|
||||
"klmnpq")
|
||||
sel <- grep("^>", fX) + 1 # indexes of headers + 1
|
||||
myRegex <- sprintf("[%s]+", AAVALID) # at least one valid character
|
||||
if (! all(grepl(myRegex, fX[sel]))) {
|
||||
stop("a header has no adjacent sequence.")
|
||||
}
|
||||
# Ah, you might ask - couldn't we just have dropped all empty lines, and
|
||||
# then caught this in Case 2? No - for two reasons: we would still miss headers
|
||||
# at the end of file, and, we would have changed the line numbering - and
|
||||
# ideally our "production" function will create information about where the
|
||||
# error is to be found.
|
||||
|
||||
|
||||
# Now combine this into a function ...
|
||||
|
||||
val <- function(fa) {
  # Validate that a vector of lines is structured like a (multi-)FASTA file.
  # Parameters:
  #   fa   chr   the lines of the input, one line per element (e.g. the
  #              result of readLines()).
  # Value: NULL (invisibly) if all checks pass. stop() is called at the
  #        first rule that is violated.
  # Note:  uses the constant AAVALID, which is not defined in this function;
  #        it is pasted into a regex character class, so it must hold the
  #        characters that are allowed in sequence lines.

  isHeader <- grepl("^>", fa)    # TRUE for every line that begins with ">"

  # (1) there must be at least one header line
  if (! any(isHeader)) {
    stop("no header lines in input.")
  }

  # (2) no header may directly follow another header: AND the mask with
  #     itself, shifted by one position
  if (any(isHeader[-length(isHeader)] & isHeader[-1])) {
    stop("adjacent header lines in input.")
  }

  # (3.1) non-header lines may contain only valid characters (empty lines
  #       contain no invalid character and therefore pass)
  if (any(grepl(sprintf("[^%s]", AAVALID), fa[! isHeader]))) {
    stop("invalid character(s) outside of header lines.")
  }

  # (3.2) the line after each header must contain at least one valid
  #       character. For a header on the last line the index points past
  #       the end, fa[] yields NA, grepl() returns FALSE, and we stop.
  iNext <- which(isHeader) + 1
  if (! all(grepl(sprintf("[%s]+", AAVALID), fa[iNext]))) {
    stop("a header has no adjacent sequence.")
  }

  return(invisible(NULL))
}
|
||||
|
||||
# Here is an example
|
||||
FA <- c(">head1",
|
||||
"acdef",
|
||||
"ghi",
|
||||
"",
|
||||
">head2",
|
||||
"kl",
|
||||
">head3",
|
||||
"mn",
|
||||
"pqrs")
|
||||
val(FA) # ... should not create an error
|
||||
|
||||
|
||||
# A somewhat more elaborate validateFA() function was loaded with the
|
||||
# ./.utilities.R script. It needs a bit more bookkeeping, since NCBI multi-
|
||||
# fasta files have space-characters in their spacer lines. Try it ...
|
||||
validateFA(FA)
|
||||
|
||||
# = 2 Parsing FASTA =======================================================
|
||||
|
||||
# Once we have validated our assumptions about our input, it's quite
|
||||
# painless to parse it. I have put this together as a function and the function
|
||||
# gets loaded from ./.utilities.R
|
||||
#
|
||||
|
||||
# Let's try this:
|
||||
# - the first 3 elements of faMBP1:
|
||||
readFASTA(faMBP1[1:3])
|
||||
|
||||
# - a multi FASTA file of aligned APSES domain sequences:
|
||||
|
||||
refAPSES <- readFASTA("./data/refAPSES.mfa")
|
||||
|
||||
# Subset the sequence with "P39678" in the header
|
||||
refAPSES[grep("P39678", refAPSES$head) ,]
|
||||
|
||||
|
||||
|
||||
# = 3 Interpreting FASTA ==================================================
|
||||
|
||||
|
||||
# FASTA files are straightforward to interpret - just one thing may be of note:
|
||||
# when working with strings, we can use substr(<string>, <start>, <stop>) to
|
||||
# extract substrings, but more often we expand the string into a vector of
|
||||
# single characters with strsplit(<string>, ""). strsplit() returns a list,
|
||||
# to accommodate that <string> could be a vector of many elements, therefore
|
||||
# we usually unlist() the result if we use it only on a single string.
|
||||
|
||||
# Example: How many positively charged residues in "MBP1_SACCE"?
|
||||
|
||||
s <- unlist(strsplit(refAPSES$seq[grep("MBP1_SACCE", refAPSES$head)], ""))
|
||||
s
|
||||
|
||||
sum(grepl("[HKR]", s)) # 20 (+) charged residues. grepl() returns TRUE and FALSE
|
||||
# for the characters, sum() coerces to 1 and 0
|
||||
# respectively, and that gives us the result.
|
||||
|
||||
100 * sum(grepl("[HKR]", s)) / length(s) # in percent: 20.2 %
|
||||
|
||||
# residue distribution
|
||||
x <- factor(s, levels = names(AACOLS))
|
||||
pie(table(x)[names(AACOLS)], col = AACOLS)
|
||||
|
||||
|
||||
|
||||
# = 4 Writing FASTA =======================================================
|
||||
|
||||
|
||||
# Writing FASTA files is mostly just the reverse of reading, with one
|
||||
# twist: we need to break the long sequence string into chunks of the desired
|
||||
# width. The FASTA specification calls for a maximum of 120 characters per line,
|
||||
# but writing out much less than that is common, since it allows to comfortably
|
||||
# view lines on the console, or printing them on a sheet of paper (do we still
|
||||
# do that actually?). How do we break a string into chunks? A combination of
|
||||
# seq(<from>, <to>, <by>) with substring(<string>, <start>, <stop>) will work
|
||||
# nicely. (Note that substring() is vectorized, whereas substr() is not!) As we
|
||||
# loop through our FASTA object in memory, we can build the output by c()'ing
|
||||
# blocks of header + sequence to each other. For VERY large objects this might
|
||||
# be slow - in that case, we might want to precalculate the size of the output
|
||||
# object. But that's more of a hypothetical consideration.
|
||||
|
||||
( s <- refAPSES$seq[2] )
|
||||
nchar(s)
|
||||
w <- 30 # width of chunk
|
||||
(starts <- seq(1, nchar(s), by = w)) # starting index of chunk
|
||||
(ends <- c((starts - 1)[-1], nchar(s))) # ending index of chunk
|
||||
|
||||
# Task: Is this safe? What happens if nchar(s) is shorter than w?
|
||||
# What happens if nchar(s) is an exact multiple of w?
|
||||
|
||||
substring(s, starts, ends)
|
||||
# confirm that the output contains the first and last residue, and both
|
||||
# residues adjacent to the breaks
|
||||
|
||||
# As always, the function has been defined in ".utilities.R" for you to use
|
||||
# any time... type writeFASTA to examine it.
|
||||
|
||||
# Let's try this...
|
||||
|
||||
writeFASTA(refAPSES, width = 40)
|
||||
|
||||
# roundtrip for validation: write refAPSES with a different format,
|
||||
# read it back in - the new dataframe must be identical
|
||||
# to the original dataframe.
|
||||
fname <- tempfile()
|
||||
writeFASTA(refAPSES, fn = fname, width = 30)
|
||||
identical(refAPSES, readFASTA(fname))
|
||||
|
||||
# ...works for me :-)
|
||||
|
||||
|
||||
# [END]
|
||||
|
1348
RPR-GEO2R.R
1348
RPR-GEO2R.R
File diff suppressed because it is too large
Load Diff
@ -1,385 +1,385 @@
|
||||
# tocID <- "RPR-Genetic_code_optimality.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-Genetic_code_optimality unit.
|
||||
#
|
||||
# Version: 1.3
|
||||
#
|
||||
# Date: 2017-10 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.3 2020 Maintenance
|
||||
# 1.2 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
#                  use BiocManager:: not biocLite()
|
||||
# 1.1 Update set.seed() usage
|
||||
# 1.0.1 Fixed two bugs discovered by Suan Chin Yeo.
|
||||
# 1.0 New material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> --------------------------------------------------------------
|
||||
#TOC> 1 Designing a computational experiment 58
|
||||
#TOC> 2 Setting up the tools 74
|
||||
#TOC> 2.1 Natural and alternative genetic codes 77
|
||||
#TOC> 2.2 Effect of mutations 135
|
||||
#TOC> 2.2.1 reverse-translate 146
|
||||
#TOC> 2.2.2 Randomly mutate 171
|
||||
#TOC> 2.2.3 Forward- translate 196
|
||||
#TOC> 2.2.4 measure effect 213
|
||||
#TOC> 3 Run the experiment 267
|
||||
#TOC> 4 Task solutions 363
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# This unit demonstrates R code to simulate alternate genetic codes and evaluate
|
||||
# their robustness to code changes. The approaches are quite simple and you
|
||||
# will be able to come up with obvious refinements; the point of this code is to
|
||||
# demonstrate some R programming techniques, in preparation for more
|
||||
# sophisticated questions later.
|
||||
|
||||
|
||||
# = 1 Designing a computational experiment ================================
|
||||
|
||||
# Computational experiments are conducted like wet-lab experiments. We begin
|
||||
# with a hypothesis, then define the observables that relate to the hypothesis,
|
||||
# then define the measures we apply to observations, and finally we interpret
|
||||
# our observations. If we want to learn something about the evolution of the
|
||||
# genetic code ...
|
||||
|
||||
# - we construct a hypothesis such as: the genetic code has evolved so as to
|
||||
# minimize the effect of mutations;
|
||||
# - we define the observables: the effect of mutations in
|
||||
# sequences, given the natural and possible alternative codes;
|
||||
# - we define the measures to quantify the effect of mutations;
|
||||
# - then we compute alternatives and interpret the results.
|
||||
|
||||
|
||||
# = 2 Setting up the tools ================================================
|
||||
|
||||
|
||||
# == 2.1 Natural and alternative genetic codes =============================
|
||||
|
||||
# Load genetic code tables from the Biostrings package
|
||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
if (! requireNamespace("Biostrings", quietly = TRUE)) {
|
||||
BiocManager::install("Biostrings")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = Biostrings) # basic information
|
||||
# browseVignettes("Biostrings") # available vignettes
|
||||
# data(package = "Biostrings") # available datasets
|
||||
|
||||
|
||||
# There are many ways to generate alternative codes. The simplest way is to
|
||||
# randomly assign amino acids to codons. A more sophisticated way is to keep the
|
||||
# redundancy of codons intact, since it may reflect some form of symmetry
|
||||
# breaking that ignores the third nucleotide of a codon for the most part;
|
||||
# therefore we only replace the amino acids of the existing code with random
|
||||
# others. Here are two functions that implement these two ideas about alternate
|
||||
# codes.
|
||||
|
||||
randomGC <- function(GC) {
  # Return a genetic code with randomly assigned amino acids.
  # Parameters:
  #   GC   named chr  character vector of amino acid one-letter codes plus
  #                   "*" (stop), named with the codon triplet. The standard
  #                   code has length 64, but any length is handled.
  # Value: named chr  vector of the same length and with the same names as
  #                   GC, holding random amino acid assignments in which
  #                   every symbol of the input code occurs at least once.

  aa <- unique(GC)                  # the symbols used by the input code

  # Draw random assignments for all codons; a draw could miss one or more
  # symbols, in that case we simply draw again until the code is complete.
  # Assigning into GC[] keeps the codon names.
  repeat {
    GC[] <- sample(aa, length(GC), replace = TRUE)
    if (length(unique(GC)) == length(aa)) {
      break
    }
  }

  return(GC)
}
|
||||
|
||||
swappedGC <- function(GC) {
  # Return a genetic code in which the amino acids have been randomly
  # exchanged for each other, while the codon blocks stay intact.
  # Parameters:
  #   GC   named chr  character vector of amino acid one-letter codes plus
  #                   "*" (stop), named with the codon triplet. The standard
  #                   code has length 64, but any length is handled.
  # Value: named chr  vector of the same length and with the same names as
  #                   GC. All codons that encoded the same symbol in GC
  #                   encode one common (shuffled) symbol in the result.

  aaOrig <- unique(GC)                       # symbols of the input code
  aaSwap <- sample(aaOrig, length(aaOrig))   # the same symbols, shuffled
  names(aaSwap) <- aaOrig                    # lookup: original -> shuffled

  # Look up the replacement for every codon at once. Assigning into GC[]
  # keeps the codon names and the original length.
  GC[] <- aaSwap[GC]

  return(GC)
}
|
||||
|
||||
|
||||
# == 2.2 Effect of mutations ===============================================
|
||||
|
||||
|
||||
# To evaluate the effects of mutations we will do the following:
|
||||
# - we take an amino acid sequence (Mbp1 will do just nicely);
|
||||
# - we reverse-translate it into a nucleotide sequence;
|
||||
# - we mutate it randomly;
|
||||
# - we translate it back to amino acids;
|
||||
# - we count the number of mutations and evaluate their severity.
|
||||
|
||||
|
||||
# === 2.2.1 reverse-translate
|
||||
|
||||
# To reverse-translate an amino acid vector, we randomly pick one of its
|
||||
# codons from a genetic code, and assemble all codons to a sequence.
|
||||
|
||||
traRev <- function(s, GC) {
  # Reverse-translate an amino acid vector into codons.
  # Parameters:
  #   s    chr        a sequence vector, one amino acid per element
  #   GC   named chr  a genetic code: amino acid symbols named by codon
  # Value:
  #   chr  a vector of codons of the same length as s. Where an amino acid
  #        has several codons in GC, one of them is picked at random.

  # Fail early and clearly if a residue can not be encoded at all. Without
  # this check the assignment in the loop fails with the rather cryptic
  # message "replacement has length zero".
  unknown <- setdiff(s, GC)
  if (length(unknown) > 0) {
    stop("residue(s) not encoded by the genetic code: ",
         paste(unknown, collapse = ", "))
  }

  vC <- character(length(s))

  for (i in seq_along(s)) {
    codon <- names(GC)[GC == s[i]]   # get all codons for this AA
    # sample() is only called when there is a real choice: a single codon
    # is used as is.
    if (length(codon) > 1) {
      codon <- sample(codon, 1)      # pick one at random ...
    }
    vC[i] <- codon                   # store it
  }

  return(vC)
}
|
||||
|
||||
|
||||
# === 2.2.2 Randomly mutate
|
||||
|
||||
# To mutate, we split a codon into its three nucleotides, then randomly replace
|
||||
# one of the three with another nucleotide.
|
||||
|
||||
randMut <- function(vC) {
  # Introduce one random point mutation into every codon.
  # Parameter:
  #   vC   chr   a vector of codons
  # Value: chr   a vector of the same length in which every codon differs
  #              from the corresponding codon of vC at exactly one position

  bases <- c("A", "C", "G", "T")

  mutateCodon <- function(codon) {
    # Mutate a single codon: split it, pick one of the three positions,
    # replace the nucleotide there with one of the other three nucleotides,
    # and put the codon back together.
    letters3 <- strsplit(codon, "")[[1]]
    pos <- sample(1:3, 1)
    letters3[pos] <- sample(bases[bases != letters3[pos]], 1)
    paste0(letters3, collapse = "")
  }

  # Process the codons in order; assigning into vC[] keeps its attributes.
  vC[] <- vapply(vC, mutateCodon, character(1), USE.NAMES = FALSE)

  return(vC)
}
|
||||
|
||||
|
||||
|
||||
# === 2.2.3 Forward- translate
|
||||
|
||||
traFor <- function(vC, GC) {
  # Translate a vector of codons into amino acids.
  # Parameters:
  #   vC   chr        a codon vector
  #   GC   named chr  a genetic code: amino acid symbols named by codon
  # Value:
  #   chr  an unnamed vector of amino acids of the same length as vC. A
  #        codon that is not a name in GC yields NA.

  # Subsetting by name is vectorized: all codons are looked up at once,
  # there is no need to loop. unname() drops the codon names, which the
  # lookup attaches to the result.
  return(unname(GC[vC]))
}
|
||||
|
||||
|
||||
# === 2.2.4 measure effect
|
||||
|
||||
# How do we evaluate the effect of the mutation? We'll take a simple ad hoc
|
||||
# approach: we divide amino acids into hydrophobic, hydrophilic, and neutral
|
||||
# categories, according to their free energy of transfer from water to octanol:
|
||||
aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
|
||||
aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
|
||||
aaNeutral <- c("A", "H", "T", "S", "V", "G")
|
||||
|
||||
# Then we will penalize as follows:
|
||||
# Changes within one category: 0.1
|
||||
# Changes from hydrophobic or hydrophilic to neutral or back: 0.3
|
||||
# Changes from hydrophobic to hydrophilic or back: 1.0
|
||||
# Changes to stop-codon: 3.0
|
||||
|
||||
evalMut <- function(nat, mut) {
  # Evaluate severity of mutations between amino acid sequence vectors nat and
  # mut in an ad hoc approach based on hydrophobicity changes.
  # Parameters:
  #   nat   chr   the original amino acid sequence, one residue per element
  #   mut   chr   the mutated sequence, same length as nat
  # Value:
  #   num   the sum of the penalties over all positions:
  #           change within one category:                        0.1
  #           hydrophobic or hydrophilic <-> neutral:            0.3
  #           hydrophobic <-> hydrophilic:                       1.0
  #           change to a stop codon ("*"):                      3.0
  #         Unchanged positions contribute 0.
  # Note: the stop-codon penalty is part of the scheme described in this
  #       script but was previously not applied. Penalty totals quoted in
  #       comments elsewhere in the script may predate this and can differ.

  aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
  aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
  aaNeutral <- c("A", "H", "T", "S", "V", "G")

  penalties <- numeric(length(nat))
  lMut <- nat != mut          # logical TRUE for all mutated positions

  natHphobic <- nat %in% aaHphobic
  natHphilic <- nat %in% aaHphilic
  natNeutral <- nat %in% aaNeutral
  mutHphobic <- mut %in% aaHphobic
  mutHphilic <- mut %in% aaHphilic
  mutNeutral <- mut %in% aaNeutral

  # changes within one category
  penalties[lMut & natHphobic & mutHphobic] <- 0.1
  penalties[lMut & natHphilic & mutHphilic] <- 0.1
  penalties[lMut & natNeutral & mutNeutral] <- 0.1

  # changes between neutral and one of the other two categories
  penalties[lMut & natHphobic & mutNeutral] <- 0.3
  penalties[lMut & natHphilic & mutNeutral] <- 0.3
  penalties[lMut & natNeutral & mutHphobic] <- 0.3
  penalties[lMut & natNeutral & mutHphilic] <- 0.3

  # changes between hydrophobic and hydrophilic
  penalties[lMut & natHphobic & mutHphilic] <- 1.0
  penalties[lMut & natHphilic & mutHphobic] <- 1.0

  # changes to a stop codon truncate the protein: heaviest penalty
  penalties[lMut & (mut %in% "*")] <- 3.0

  return(sum(penalties))
}
|
||||
|
||||
# A more sophisticated approach could take additional quantities into account,
|
||||
# such as charge, size, or flexibility - and it could add heuristics, such as:
|
||||
# proline is always bad in secondary structure, charged amino acids are terrible
|
||||
# in the folded core of a protein, replacing a small by a large amino acid in
|
||||
# the core is very disruptive ... etc.
|
||||
#
|
||||
# For our experiment, we should not use a mutation data matrix however:
|
||||
# empirical mutation probabilities are superbly suited to estimate evolutionary
|
||||
# relationships. Here however, as we are trying to evaluate effects of random
|
||||
# mutations on genetic codes, our reasoning would be circular - we would
|
||||
# discover that the natural genetic code is optimal ... because it is most
|
||||
# similar to the natural genetic code. That would be Cargo Cult bioinformatics.
|
||||
|
||||
|
||||
# = 3 Run the experiment ==================================================
|
||||
|
||||
# Fetch the standard Genetic code from Biostrings::
|
||||
|
||||
stdCode <- Biostrings::GENETIC_CODE
|
||||
|
||||
# Fetch the nucleotide sequence for MBP1:
|
||||
|
||||
myDNA <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")[-1]
|
||||
myDNA <- paste0(myDNA, collapse = "")
|
||||
myDNA <- as.character(Biostrings::codons(Biostrings::DNAString(myDNA)))
|
||||
myDNA <- myDNA[-length(myDNA)] # drop the stop codon
|
||||
|
||||
myAA <- traFor(myDNA, stdCode)
|
||||
|
||||
# Mutate and evaluate
|
||||
set.seed(112358)
|
||||
x <- randMut(myDNA)
|
||||
set.seed(NULL)
|
||||
x <- traFor(x, stdCode)
|
||||
evalMut(myAA, x) # 166.4
|
||||
|
||||
# Try this 200 times, and see how the values are distributed.
|
||||
N <- 200
|
||||
valSTDC <- numeric(N)
|
||||
|
||||
set.seed(112358) # set RNG seed for repeatable randomness
|
||||
for (i in 1:N) { # this takes a few seconds ...
|
||||
x <- randMut(myDNA) # mutate
|
||||
x <- traFor(x, stdCode) # translate
|
||||
valSTDC[i] <- evalMut(myAA, x) # evaluate
|
||||
}
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
hist(valSTDC,
|
||||
breaks = 15,
|
||||
col = "palegoldenrod",
|
||||
xlim = c(0, 400),
|
||||
ylim = c(0, N/4),
|
||||
main = "Standard vs. Synthetic Genetic Code",
|
||||
xlab = "Mutation penalty")
|
||||
|
||||
# This looks like a normal distribution. Let's assume the effect of mutations
|
||||
# under the standard genetic code is the mean of this distribution:
|
||||
effectSTDC <- mean(valSTDC) # 178.1
|
||||
|
||||
# Now we can look at the effects of alternate genetic codes:
|
||||
|
||||
set.seed(112358)
|
||||
# choose a new code
|
||||
GC <- randomGC(stdCode)
|
||||
set.seed(NULL)
|
||||
|
||||
# reverse translate hypothetical sequence according to the new code
|
||||
x <- traRev(myAA, GC)
|
||||
|
||||
x <- randMut(x) # randomly mutate hypothetical nucleotide sequence
|
||||
x <- traFor(x, GC) # translate back, with the new code
|
||||
evalMut(myAA, x) # evaluate mutation effects: 298.5
|
||||
|
||||
# That seems a fair bit higher than what we saw as "effectSTDC"
|
||||
# Let's try with different genetic codes. 200 trials - but this time every trial
|
||||
# is with a different, synthetic genetic code.
|
||||
|
||||
N <- 200
|
||||
valXGC <- numeric(N)
|
||||
|
||||
set.seed(1414214) # set RNG seed for repeatable randomness
|
||||
for (i in 1:N) {
|
||||
GC <- randomGC(stdCode) # Choose code
|
||||
x <- traRev(myAA, GC) # reverse translate
|
||||
x <- randMut(x) # mutate
|
||||
x <- traFor(x, GC) # translate
|
||||
valXGC[i] <- evalMut(myAA, x) # evaluate
|
||||
}
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
hist(valXGC,
|
||||
col = "plum",
|
||||
breaks = 15,
|
||||
add = TRUE)
|
||||
|
||||
# These two distributions are very widely separated!
|
||||
|
||||
# Task: Perform the same experiment with the swapped genetic code.
|
||||
# Compare the distributions. Interpret the result.
|
||||
|
||||
|
||||
# These are simple experiments, under assumptions that can be refined in
|
||||
# meaningful ways. Yet, even those simple computational experiments show
|
||||
# that the Universal Genetic Code has features that one would predict if
|
||||
# it has evolved under selective pressure to minimize the effects of mutations.
|
||||
# Gradual change under mutation is beneficial to evolution, disruptive
|
||||
# change is not.
|
||||
|
||||
|
||||
# = 4 Task solutions ======================================================
|
||||
|
||||
N <- 200
|
||||
valSGC <- numeric(N)
|
||||
|
||||
set.seed(2718282) # set RNG seed for repeatable randomness
|
||||
for (i in 1:N) {
|
||||
GC <- swappedGC(stdCode) # Choose code
|
||||
x <- traRev(myAA, GC) # reverse translate
|
||||
x <- randMut(x) # mutate
|
||||
x <- traFor(x, GC) # translate
|
||||
valSGC[i] <- evalMut(myAA, x) # evaluate
|
||||
}
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
hist(valSGC,
|
||||
col = "#6688FF88",
|
||||
breaks = 15,
|
||||
add = TRUE)
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "RPR-Genetic_code_optimality.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-Genetic_code_optimality unit.
|
||||
#
|
||||
# Version: 1.3
|
||||
#
|
||||
# Date: 2017-10 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.3 2020 Maintenance
|
||||
# 1.2 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
#                  use BiocManager:: not biocLite()
|
||||
# 1.1 Update set.seed() usage
|
||||
# 1.0.1 Fixed two bugs discovered by Suan Chin Yeo.
|
||||
# 1.0 New material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> --------------------------------------------------------------
|
||||
#TOC> 1 Designing a computational experiment 58
|
||||
#TOC> 2 Setting up the tools 74
|
||||
#TOC> 2.1 Natural and alternative genetic codes 77
|
||||
#TOC> 2.2 Effect of mutations 135
|
||||
#TOC> 2.2.1 reverse-translate 146
|
||||
#TOC> 2.2.2 Randomly mutate 171
|
||||
#TOC> 2.2.3 Forward- translate 196
|
||||
#TOC> 2.2.4 measure effect 213
|
||||
#TOC> 3 Run the experiment 267
|
||||
#TOC> 4 Task solutions 363
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# This unit demonstrates R code to simulate alternate genetic codes and evaluate
|
||||
# their robustness to code changes. The approaches are quite simple and you
|
||||
# will be able to come up with obvious refinements; the point of this code is to
|
||||
# demonstrate some R programming techniques, in preparation for more
|
||||
# sophisticated questions later.
|
||||
|
||||
|
||||
# = 1 Designing a computational experiment ================================
|
||||
|
||||
# Computational experiments are conducted like wet-lab experiments. We begin
|
||||
# with a hypothesis, then define the observables that relate to the hypothesis,
|
||||
# then define the measures we apply to observations, and finally we interpret
|
||||
# our observations. If we want to learn something about the evolution of the
|
||||
# genetic code ...
|
||||
|
||||
# - we construct a hypothesis such as: the genetic code has evolved so as to
|
||||
# minimize the effect of mutations;
|
||||
# - we define the observables: the effect of mutations in
|
||||
# sequences, given the natural and possible alternative codes;
|
||||
# - we define the measures to quantify the effect of mutations;
|
||||
# - then we compute alternatives and interpret the results.
|
||||
|
||||
|
||||
# = 2 Setting up the tools ================================================
|
||||
|
||||
|
||||
# == 2.1 Natural and alternative genetic codes =============================
|
||||
|
||||
# Load genetic code tables from the Biostrings package
|
||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
if (! requireNamespace("Biostrings", quietly = TRUE)) {
|
||||
BiocManager::install("Biostrings")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = Biostrings) # basic information
|
||||
# browseVignettes("Biostrings") # available vignettes
|
||||
# data(package = "Biostrings") # available datasets
|
||||
|
||||
|
||||
# There are many ways to generate alternative codes. The simplest way is to
|
||||
# randomly assign amino acids to codons. A more sophisticated way is to keep the
|
||||
# redundancy of codons intact, since it may reflect some form of symmetry
|
||||
# breaking that ignores the third nucleotide of a codon for the most part;
|
||||
# therefore we only replace the amino acids of the existing code with random
|
||||
# others. Here are two functions that implement these two ideas about alternate
|
||||
# codes.
|
||||
|
||||
randomGC <- function(GC) {
  # Return a genetic code with randomly assigned amino acids.
  # Parameters:
  #   GC   named chr  character vector of amino acid one-letter codes plus
  #                   "*" (stop), named with the codon triplet. A complete
  #                   triplet code has length 64, but any length is handled.
  # Value:  named chr  vector with the same length and names as GC, holding
  #                    random assignments in which every symbol that occurs
  #                    in GC is encoded at least once.

  aa <- unique(GC)                  # the symbols used by the input code

  repeat {
    # Sampling with replacement can miss some symbols; if that happens we
    # simply sample again. "GC[] <-" replaces all values but keeps the names.
    GC[] <- sample(aa, length(GC), replace = TRUE)
    if (length(unique(GC)) == length(aa)) {
      break
    }
  }

  return(GC)
}
|
||||
|
||||
swappedGC <- function(GC) {
  # Return a genetic code with randomly swapped amino acids: the codon
  # groups (redundancy pattern) of the input code are kept intact, but the
  # amino acids assigned to the groups are shuffled.
  # Parameters:
  #   GC   named chr  character vector of amino acid one-letter codes plus
  #                   "*" (stop), named with the codon triplet. A complete
  #                   triplet code has length 64, but any length is handled.
  # Value:  named chr  vector with the same length and names as GC, in which
  #                    each original symbol has been consistently replaced
  #                    by a (randomly chosen) symbol of the input code.

  aaOrig <- unique(GC)                      # the symbols in the input code
  aaSwap <- sample(aaOrig, length(aaOrig))  # the same symbols, shuffled
  names(aaSwap) <- aaOrig                   # lookup table: original -> new

  # Look up the replacement for every codon by name. "GC[] <-" replaces all
  # values but keeps the codon names.
  GC[] <- aaSwap[GC]

  return(GC)
}
|
||||
|
||||
|
||||
# == 2.2 Effect of mutations ===============================================
|
||||
|
||||
|
||||
# To evaluate the effects of mutations we will do the following:
|
||||
# - we take an amino acid sequence (Mbp1 will do just nicely);
|
||||
# - we reverse-translate it into a nucleotide sequence;
|
||||
# - we mutate it randomly;
|
||||
# - we translate it back to amino acids;
|
||||
# - we count the number of mutations and evaluate their severity.
|
||||
|
||||
|
||||
# === 2.2.1 reverse-translate
|
||||
|
||||
# To reverse-translate an amino acid vector, we randomly pick one of its
|
||||
# codons from a genetic code, and assemble all codons to a sequence.
|
||||
|
||||
traRev <- function(s, GC) {
  # Reverse-translate an amino acid sequence into codons.
  # Parameters:
  #   s    chr   a sequence vector of amino acid one-letter codes
  #   GC   chr   a genetic code: amino acids, named with their codon
  # Value:
  #   chr  A vector of codons, one for each element of s. Where the code
  #        has several codons for an amino acid, one is picked at random.

  allCodons <- names(GC)
  out <- character(length(s))

  for (k in seq_along(s)) {
    candidates <- allCodons[GC == s[k]]   # every codon encoding this AA
    # Only draw a random number if there is actually a choice to make.
    out[k] <- if (length(candidates) > 1) {
      sample(candidates, 1)
    } else {
      candidates
    }
  }

  return(out)
}
|
||||
|
||||
|
||||
# === 2.2.2 Randomly mutate
|
||||
|
||||
# To mutate, we split a codon into its three nucleotides, then randomly replace
|
||||
# one of the three with another nucleotide.
|
||||
|
||||
randMut <- function(vC) {
  # Introduce one random point mutation into every codon of a sequence.
  # Parameter:
  #   vC   chr   a vector of codons (triplets of A, C, G, T)
  # Value: chr   a vector of the same length as vC, in which each codon has
  #              one randomly chosen position replaced by a different
  #              nucleotide

  bases <- c("A", "C", "G", "T")

  mutated <- vC
  for (k in seq_along(mutated)) {
    letters3 <- strsplit(mutated[k], "")[[1]]         # the three nucleotides
    pos <- sample(1:3, 1)                             # position to mutate
    alternatives <- bases[bases != letters3[pos]]     # anything but the original
    letters3[pos] <- sample(alternatives, 1)          # choose the replacement
    mutated[k] <- paste0(letters3, collapse = "")     # reassemble the codon
  }

  return(mutated)
}
|
||||
|
||||
|
||||
|
||||
# === 2.2.3 Forward- translate
|
||||
|
||||
traFor <- function(vC, GC) {
  # Translate a vector of codons into amino acids.
  # Parameters:
  #   vC   chr   a codon vector
  #   GC   chr   a genetic code: amino acids, named with their codon
  # Value:
  #   chr  An unnamed vector of amino acids, one for each element of vC.
  #        Codons that are not names of GC are translated to NA.

  # Subsetting by name is vectorized: a single lookup translates the whole
  # sequence, so no element-by-element loop is needed.
  vAA <- as.character(unname(GC[vC]))

  return(vAA)
}
|
||||
|
||||
|
||||
# === 2.2.4 measure effect
|
||||
|
||||
# How do we evaluate the effect of the mutation? We'll take a simple ad hoc
|
||||
# approach: we divide amino acids into hydrophobic, hydrophilic, and neutral
|
||||
# categories, according to their free energy of transfer from water to octanol:
|
||||
aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
|
||||
aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
|
||||
aaNeutral <- c("A", "H", "T", "S", "V", "G")
|
||||
|
||||
# Then we will penalize as follows:
|
||||
# Changes within one category: 0.1
|
||||
# Changes from hydrophobic or hydrophilic to neutral or back: 0.3
|
||||
# Changes from hydrophobic to hydrophilic or back: 1.0
|
||||
# Changes to stop-codon: 3.0
|
||||
|
||||
evalMut <- function(nat, mut, stopPenalty = 3.0) {
  # Evaluate severity of mutations between amino acid sequence vectors nat and
  # mut in an ad hoc approach based on hydrophobicity changes.
  # Parameters:
  #   nat          chr  the native amino acid sequence, one letter per element
  #   mut          chr  the mutated sequence; must have the same length as nat
  #   stopPenalty  num  penalty for a position that was changed to a stop
  #                     codon ("*"). Defaults to 3.0. Use 0 to ignore such
  #                     changes; totals computed that way are lower.
  # Value:  num  the sum of all penalties:
  #                changes within one category:                 0.1
  #                hydrophobic or hydrophilic <-> neutral:      0.3
  #                hydrophobic <-> hydrophilic:                 1.0
  #                changes to a stop codon:                     stopPenalty
  #              Unchanged positions, and changes involving symbols that are
  #              in none of the categories, contribute 0.

  # Comparing vectors of different length would silently recycle the shorter.
  stopifnot(length(nat) == length(mut))

  aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
  aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
  aaNeutral <- c("A", "H", "T", "S", "V", "G")

  # Category membership of every position, computed once.
  natPhobic  <- nat %in% aaHphobic
  natPhilic  <- nat %in% aaHphilic
  natNeutral <- nat %in% aaNeutral
  mutPhobic  <- mut %in% aaHphobic
  mutPhilic  <- mut %in% aaHphilic
  mutNeutral <- mut %in% aaNeutral

  penalties <- numeric(length(nat))
  lMut <- nat != mut               # logical TRUE for all mutated positions

  penalties[lMut & natPhobic  & mutPhobic]  <- 0.1
  penalties[lMut & natPhobic  & mutPhilic]  <- 1.0
  penalties[lMut & natPhobic  & mutNeutral] <- 0.3

  penalties[lMut & natPhilic  & mutPhobic]  <- 1.0
  penalties[lMut & natPhilic  & mutPhilic]  <- 0.1
  penalties[lMut & natPhilic  & mutNeutral] <- 0.3

  penalties[lMut & natNeutral & mutPhobic]  <- 0.3
  penalties[lMut & natNeutral & mutPhilic]  <- 0.3
  penalties[lMut & natNeutral & mutNeutral] <- 0.1

  # A change to a stop codon truncates the protein: penalize it explicitly.
  penalties[lMut & (mut == "*")] <- stopPenalty

  return(sum(penalties))
}
|
||||
|
||||
# A more sophisticated approach could take additional quantities into account,
|
||||
# such as charge, size, or flexibility - and it could add heuristics, such as:
|
||||
# proline is always bad in secondary structure, charged amino acids are terrible
|
||||
# in the folded core of a protein, replacing a small by a large amino acid in
|
||||
# the core is very disruptive ... etc.
|
||||
#
|
||||
# For our experiment, we should not use a mutation data matrix however:
|
||||
# empirical mutation probabilities are superbly suited to estimate evolutionary
|
||||
# relationships. Here however, as we are trying to evaluate effects of random
|
||||
# mutations on genetic codes, our reasoning would be circular - we would
|
||||
# discover that the natural genetic code is optimal ... because it is most
|
||||
# similar to the natural genetic code. That would be Cargo Cult bioinformatics.
|
||||
|
||||
|
||||
# = 3 Run the experiment ==================================================
|
||||
|
||||
# Fetch the standard Genetic code from Biostrings::
|
||||
|
||||
stdCode <- Biostrings::GENETIC_CODE
|
||||
|
||||
# Fetch the nucleotide sequence for MBP1:
|
||||
|
||||
myDNA <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")[-1]
|
||||
myDNA <- paste0(myDNA, collapse = "")
|
||||
myDNA <- as.character(Biostrings::codons(Biostrings::DNAString(myDNA)))
|
||||
myDNA <- myDNA[-length(myDNA)] # drop the stop codon
|
||||
|
||||
myAA <- traFor(myDNA, stdCode)
|
||||
|
||||
# Mutate and evaluate
|
||||
set.seed(112358)
|
||||
x <- randMut(myDNA)
|
||||
set.seed(NULL)
|
||||
x <- traFor(x, stdCode)
|
||||
evalMut(myAA, x) # 166.4
|
||||
|
||||
# Try this 200 times, and see how the values are distributed.
|
||||
N <- 200
|
||||
valSTDC <- numeric(N)
|
||||
|
||||
set.seed(112358) # set RNG seed for repeatable randomness
|
||||
for (i in 1:N) { # this takes a few seconds ...
|
||||
x <- randMut(myDNA) # mutate
|
||||
x <- traFor(x, stdCode) # translate
|
||||
valSTDC[i] <- evalMut(myAA, x) # evaluate
|
||||
}
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
hist(valSTDC,
|
||||
breaks = 15,
|
||||
col = "palegoldenrod",
|
||||
xlim = c(0, 400),
|
||||
ylim = c(0, N/4),
|
||||
main = "Standard vs. Synthetic Genetic Code",
|
||||
xlab = "Mutation penalty")
|
||||
|
||||
# This looks like a normal distribution. Let's assume the effect of mutations
|
||||
# under the standard genetic code is the mean of this distribution:
|
||||
effectSTDC <- mean(valSTDC) # 178.1
|
||||
|
||||
# Now we can look at the effects of alternate genetic codes:
|
||||
|
||||
set.seed(112358)
|
||||
# choose a new code
|
||||
GC <- randomGC(stdCode)
|
||||
set.seed(NULL)
|
||||
|
||||
# reverse translate hypothetical sequence according to the new code
|
||||
x <- traRev(myAA, GC)
|
||||
|
||||
x <- randMut(x) # randomly mutate hypothetical nucleotide sequence
|
||||
x <- traFor(x, GC) # translate back, with the new code
|
||||
evalMut(myAA, x) # evaluate mutation effects: 298.5
|
||||
|
||||
# That seems a fair bit higher than what we saw as "effectSTDC".
|
||||
# Let's try with different genetic codes. 200 trials - but this time every trial
|
||||
# is with a different, synthetic genetic code.
|
||||
|
||||
N <- 200
|
||||
valXGC <- numeric(N)
|
||||
|
||||
set.seed(1414214) # set RNG seed for repeatable randomness
|
||||
for (i in 1:N) {
|
||||
GC <- randomGC(stdCode) # Choose code
|
||||
x <- traRev(myAA, GC) # reverse translate
|
||||
x <- randMut(x) # mutate
|
||||
x <- traFor(x, GC) # translate
|
||||
valXGC[i] <- evalMut(myAA, x) # evaluate
|
||||
}
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
hist(valXGC,
|
||||
col = "plum",
|
||||
breaks = 15,
|
||||
add = TRUE)
|
||||
|
||||
# These two distributions are very widely separated!
|
||||
|
||||
# Task: Perform the same experiment with the swapped genetic code.
|
||||
# Compare the distributions. Interpret the result.
|
||||
|
||||
|
||||
# These are simple experiments, under assumptions that can be refined in
|
||||
# meaningful ways. Yet, even those simple computational experiments show
|
||||
# that the Universal Genetic Code has features that one would predict if
|
||||
# it has evolved under selective pressure to minimize the effects of mutations.
|
||||
# Gradual change under mutation is beneficial to evolution, disruptive
|
||||
# change is not.
|
||||
|
||||
|
||||
# = 4 Task solutions ======================================================
|
||||
|
||||
N <- 200
|
||||
valSGC <- numeric(N)
|
||||
|
||||
set.seed(2718282) # set RNG seed for repeatable randomness
|
||||
for (i in 1:N) {
|
||||
GC <- swappedGC(stdCode) # Choose code
|
||||
x <- traRev(myAA, GC) # reverse translate
|
||||
x <- randMut(x) # mutate
|
||||
x <- traFor(x, GC) # translate
|
||||
valSGC[i] <- evalMut(myAA, x) # evaluate
|
||||
}
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
hist(valSGC,
|
||||
col = "#6688FF88",
|
||||
breaks = 15,
|
||||
add = TRUE)
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
|
@ -1,50 +1,50 @@
|
||||
# tocID <- "RPR-Introduction.R"
|
||||
#
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-Introduction unit
|
||||
#
|
||||
# Version: 1.0
|
||||
#
|
||||
# Date: 2020-09-18
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# V 1.0 Updated workflow; live
|
||||
# V 0.1 First code
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
|
||||
#
|
||||
# DO NOT SIMPLY source() THESE FILES!
|
||||
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
# === TASK: Local script
|
||||
#
|
||||
# - Open the file myScript.R
|
||||
#
|
||||
# - Create a section header with a date.
|
||||
# - Enter an R-expression that will produce the first 11 powers of 2 (starting
|
||||
# from 0). Not a loop - a single expression. The first number you get must
|
||||
# be 1. The last number you get must be 1024.
|
||||
#
|
||||
# - Save the file in the myScripts folder, and close it.
|
||||
#
|
||||
# - Open the file again, select the expression and type Cmd+Enter (or Cmd+R)
|
||||
# to execute it.
|
||||
#
|
||||
# - Done
|
||||
|
||||
# (This task is meant to make sure that writing R expressions, saving
|
||||
# them in scripts, opening script files and executing code in the file works
|
||||
# for you. If there is an issue, get in touch.)
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "RPR-Introduction.R"
|
||||
#
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-Introduction unit
|
||||
#
|
||||
# Version: 1.0
|
||||
#
|
||||
# Date: 2020-09-18
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# V 1.0 Updated workflow; live
|
||||
# V 0.1 First code
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
|
||||
#
|
||||
# DO NOT SIMPLY source() THESE FILES!
|
||||
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
# === TASK: Local script
|
||||
#
|
||||
# - Open the file myScript.R
|
||||
#
|
||||
# - Create a section header with a date.
|
||||
# - Enter an R-expression that will produce the first 11 powers of 2 (starting
|
||||
# from 0). Not a loop - a single expression. The first number you get must
|
||||
# be 1. The last number you get must be 1024.
|
||||
#
|
||||
# - Save the file in the myScripts folder, and close it.
|
||||
#
|
||||
# - Open the file again, select the expression and type Cmd+Enter (or Cmd+R)
|
||||
# to execute it.
|
||||
#
|
||||
# - Done
|
||||
|
||||
# (This task is meant to make sure that writing R expressions, saving
|
||||
# them in scripts, opening script files and executing code in the file works
|
||||
# for you. If there is an issue, get in touch.)
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
|
@ -1,168 +1,168 @@
|
||||
# tocID <- "RPR-PROSITE_POST.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-Scripting_data_downloads unit.
|
||||
#
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017-10 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 2020 Maintenance
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# 1.0.1 Updates for slightly changed interfaces
|
||||
# 1.0 First ABC units version
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------------------------------------
|
||||
#TOC> 1 Constructing a POST command from a Web query 43
|
||||
#TOC> 1.1 Task - fetchPrositeFeatures() function 148
|
||||
#TOC> 2 Task solutions 156
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Constructing a POST command from a Web query ========================
|
||||
|
||||
|
||||
if (! requireNamespace("httr", quietly = TRUE)) {
|
||||
install.packages("httr")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = httr) # basic information
|
||||
# browseVignettes("httr") # available vignettes
|
||||
# data(package = "httr") # available datasets
|
||||
|
||||
|
||||
|
||||
|
||||
# We have reverse engineered the Web form for a ScanProsite request, and can
|
||||
# construct a valid POST request from knowing the required field names. The POST
|
||||
# command is similar to GET(), but we need an explicit request body that
|
||||
# contains a list of key/value pairs
|
||||
|
||||
UniProtID <- "P39678"
|
||||
|
||||
URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
|
||||
|
||||
response <- httr::POST(URL,
|
||||
body = list(meta = "opt1",
|
||||
meta1_protein = "opt1",
|
||||
seq = UniProtID,
|
||||
skip = "on",
|
||||
output = "tabular"))
|
||||
|
||||
# Send off this request, and you should have a response in a few
|
||||
# seconds. Let's check the status first:
|
||||
|
||||
httr::status_code(response) # If this is not 200, something went wrong and it
|
||||
# makes no sense to continue. If this persists, ask
|
||||
# on the Discussion Board what to do.
|
||||
|
||||
|
||||
# The text contents of the response is available with the
|
||||
# content() function:
|
||||
httr::content(response, "text")
|
||||
|
||||
# ... should show you the same as the page contents that you have seen in the
|
||||
# browser. Now we need to extract the data from the page. For this simple
|
||||
# example we can get away with using regular expressions, but in general we need
|
||||
# a real XML parser to parse HTML. We'll cover that in a later unit. Here, we
|
||||
# strsplit() the response into individual lines, since each of our data elements
|
||||
# is on its own line, and then capture the contents. The way Prosite has
|
||||
# formatted their HTML we can simply split on the "\\n" newline character - but
|
||||
# they could write the same valid HTML without any newline-characters at all.
|
||||
# Understand that we are working with a bit of a "hack" here: exploiting
|
||||
# empirical assumptions rather than a formal specification. But sometimes quick
|
||||
# and dirty is fine, because quick.
|
||||
|
||||
lines <- unlist(strsplit(httr::content(response, "text"), "\\n"))
|
||||
head(lines)
|
||||
|
||||
# Now we define a query pattern for the lines we want:
|
||||
# we can use the uID, bracketed by two "|" pipe
|
||||
# characters:
|
||||
|
||||
patt <- sprintf("\\|%s\\|", UniProtID)
|
||||
|
||||
# ... and select only the lines that match this
|
||||
# pattern:
|
||||
|
||||
( lines <- lines[grep(patt, lines)] )
|
||||
|
||||
# ... captures the three lines of output.
|
||||
|
||||
# Now we break the lines apart into tokens: this is another application of
|
||||
# strsplit(), but this time we split either on "pipe" characters, "|" OR on tabs
|
||||
# "\t". Look at the regex "\\t|\\|" in the strsplit() call:
|
||||
|
||||
unlist(strsplit(lines[1], "\\t|\\|"))
|
||||
|
||||
# Its parts are (\\t)=tab (|)=or (\\|)=pipe. Both "t" and "|" need to be escaped
|
||||
# with a backslash. "t" has to be escaped because we want to match a tab (\t),
|
||||
# not the literal character "t". And "|" has to be escaped because we mean the
|
||||
# literal pipe character, not its metacharacter meaning OR. Thus sometimes the
|
||||
# backslash turns a special meaning off, and sometimes it turns a special
|
||||
# meaning on. Unfortunately there's no easy way to tell - you just need to
|
||||
# remember the characters - or have a reference handy. The metacharacters are
|
||||
# (){}[]^$?*+.|&- ... and some of them have different meanings depending on
|
||||
# where in the regex they are.
|
||||
|
||||
# Let's put the tokens into named slots of a data frame
|
||||
|
||||
features <- data.frame()
|
||||
for (line in lines) {
|
||||
tokens <- unlist(strsplit(line, "\\t|\\|"))
|
||||
features <- rbind(features,
|
||||
data.frame(uID = tokens[2],
|
||||
start = as.numeric(tokens[4]),
|
||||
end = as.numeric(tokens[5]),
|
||||
psID = tokens[6],
|
||||
psName = tokens[7],
|
||||
psSeq = tokens[11]))
|
||||
}
|
||||
features
|
||||
|
||||
# This forms the base of a function that collects the features automatically
|
||||
# from a PrositeScan result. You can write this!
|
||||
|
||||
|
||||
# == 1.1 Task - fetchPrositeFeatures() function ============================
|
||||
|
||||
|
||||
# Task: write a function that takes as input a UniProt ID, fetches the
|
||||
# features it contains from ScanProsite and returns a data frame as given above, or
|
||||
# an empty data frame if there is an error.
|
||||
|
||||
|
||||
# = 2 Task solutions ======================================================
|
||||
|
||||
|
||||
# I have placed such a function into the ABC-dbUtilities.R script: look it up by
|
||||
# clicking on dbFetchPrositeFeatures() in the Environment pane.
|
||||
|
||||
# Test:
|
||||
dbFetchPrositeFeatures("Q5KMQ9")
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "RPR-PROSITE_POST.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-Scripting_data_downloads unit.
|
||||
#
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017-10 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 2020 Maintenance
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# 1.0.1 Updates for slightly changed interfaces
|
||||
# 1.0 First ABC units version
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------------------------------------
|
||||
#TOC> 1 Constructing a POST command from a Web query 43
|
||||
#TOC> 1.1 Task - fetchPrositeFeatures() function 148
|
||||
#TOC> 2 Task solutions 156
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Constructing a POST command from a Web query ========================
|
||||
|
||||
|
||||
if (! requireNamespace("httr", quietly = TRUE)) {
|
||||
install.packages("httr")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = httr) # basic information
|
||||
# browseVignettes("httr") # available vignettes
|
||||
# data(package = "httr") # available datasets
|
||||
|
||||
|
||||
|
||||
|
||||
# We have reverse engineered the Web form for a ScanProsite request, and can
|
||||
# construct a valid POST request from knowing the required field names. The POST
|
||||
# command is similar to GET(), but we need an explicit request body that
|
||||
# contains a list of key/value pairs
|
||||
|
||||
UniProtID <- "P39678"
|
||||
|
||||
URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
|
||||
|
||||
response <- httr::POST(URL,
|
||||
body = list(meta = "opt1",
|
||||
meta1_protein = "opt1",
|
||||
seq = UniProtID,
|
||||
skip = "on",
|
||||
output = "tabular"))
|
||||
|
||||
# Send off this request, and you should have a response in a few
|
||||
# seconds. Let's check the status first:
|
||||
|
||||
httr::status_code(response) # If this is not 200, something went wrong and it
|
||||
# makes no sense to continue. If this persists, ask
|
||||
# on the Discussion Board what to do.
|
||||
|
||||
|
||||
# The text contents of the response is available with the
|
||||
# content() function:
|
||||
httr::content(response, "text")
|
||||
|
||||
# ... should show you the same as the page contents that you have seen in the
|
||||
# browser. Now we need to extract the data from the page. For this simple
|
||||
# example we can get away with using regular expressions, but in general we need
|
||||
# a real XML parser to parse HTML. We'll cover that in a later unit. Here, we
|
||||
# strsplit() the response into individual lines, since each of our data elements
|
||||
# is on its own line, and then capture the contents. The way Prosite has
|
||||
# formatted their HTML we can simply split on the "\\n" newline character - but
|
||||
# they could write the same valid HTML without any newline-characters at all.
|
||||
# Understand that we are working with a bit of a "hack" here: exploiting
|
||||
# empirical assumptions rather than a formal specification. But sometimes quick
|
||||
# and dirty is fine, because quick.
|
||||
|
||||
lines <- unlist(strsplit(httr::content(response, "text"), "\\n"))
|
||||
head(lines)
|
||||
|
||||
# Now we define a query pattern for the lines we want:
|
||||
# we can use the uID, bracketed by two "|" pipe
|
||||
# characters:
|
||||
|
||||
patt <- sprintf("\\|%s\\|", UniProtID)
|
||||
|
||||
# ... and select only the lines that match this
|
||||
# pattern:
|
||||
|
||||
( lines <- lines[grep(patt, lines)] )
|
||||
|
||||
# ... captures the three lines of output.
|
||||
|
||||
# Now we break the lines apart into tokens: this is another application of
|
||||
# strsplit(), but this time we split either on "pipe" characters, "|" OR on tabs
|
||||
# "\t". Look at the regex "\\t|\\|" in the strsplit() call:
|
||||
|
||||
unlist(strsplit(lines[1], "\\t|\\|"))
|
||||
|
||||
# Its parts are (\\t)=tab (|)=or (\\|)=pipe. Both "t" and "|" need to be escaped
|
||||
# with a backslash. "t" has to be escaped because we want to match a tab (\t),
|
||||
# not the literal character "t". And "|" has to be escaped because we mean the
|
||||
# literal pipe character, not its metacharacter meaning OR. Thus sometimes the
|
||||
# backslash turns a special meaning off, and sometimes it turns a special
|
||||
# meaning on. Unfortunately there's no easy way to tell - you just need to
|
||||
# remember the characters - or have a reference handy. The metacharacters are
|
||||
# (){}[]^$?*+.|&- ... and some of them have different meanings depending on
|
||||
# where in the regex they are.
|
||||
|
||||
# Let's put the tokens into named slots of a data frame
|
||||
|
||||
# Parse every line into a single-row data frame, then combine all rows in one
# step. (Growing a data frame with rbind() inside a loop copies the whole
# data frame on every iteration.)
featureRows <- lapply(lines, function(line) {
  tokens <- unlist(strsplit(line, "\\t|\\|"))   # split on tab or pipe
  data.frame(uID    = tokens[2],
             start  = as.numeric(tokens[4]),
             end    = as.numeric(tokens[5]),
             psID   = tokens[6],
             psName = tokens[7],
             psSeq  = tokens[11])
})

if (length(featureRows) > 0) {
  features <- do.call(rbind, featureRows)
} else {
  features <- data.frame()         # no matching lines: empty data frame
}
features
|
||||
|
||||
# This forms the base of a function that collects the features automatically
|
||||
# from a PrositeScan result. You can write this!
|
||||
|
||||
|
||||
# == 1.1 Task - fetchPrositeFeatures() function ============================
|
||||
|
||||
|
||||
# Task: write a function that takes as input a UniProt ID, fetches the
|
||||
# features it contains from ScanProsite and returns a data frame as given above, or
|
||||
# an empty data frame if there is an error.
|
||||
|
||||
|
||||
# = 2 Task solutions ======================================================
|
||||
|
||||
|
||||
# I have placed such a function into the ABC-dbUtilities.R script: look it up by
|
||||
# clicking on dbFetchPrositeFeatures() in the Environment pane.
|
||||
|
||||
# Test:
|
||||
dbFetchPrositeFeatures("Q5KMQ9")
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
|
270
RPR-Pipe.R
270
RPR-Pipe.R
@ -1,135 +1,135 @@
|
||||
# tocID <- "RPR-Pipe.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# Discussing pipe operators.
|
||||
#
|
||||
# Version: 1.0
|
||||
#
|
||||
# Date: 2021 10
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.0 New code
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
# - find more interesting examples
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ------------------------------------------------
|
||||
#TOC> 1 Pipe Concept 41
|
||||
#TOC> 2 Nested Expression 73
|
||||
#TOC> 3 magrittr:: Pipe 78
|
||||
#TOC> 4 Base R Pipe 93
|
||||
#TOC> 5 Intermediate Assignment 108
|
||||
#TOC> 6 Postscript 127
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Pipe Concept =======================================================
|
||||
|
||||
# Pipes are actually an awesome idea for any code that implements a workflow -
|
||||
# a sequence of operations, each of which transforms data in a specialized way.
|
||||
#
|
||||
# This principle is familiar from maths: chained functions. If I have a function
|
||||
# y = f(x) and want to use those results as in z = g(y), I can just write
|
||||
# z = g(f(x))
|
||||
#
|
||||
# On the unix command line, pipes were used from the very beginning, implemented
|
||||
# with the "|" pipe character.
|
||||
#
|
||||
# In R, the magrittr package provided the %>% operator, and recently the |>
|
||||
# operator has been introduced into base R.
|
||||
#
|
||||
# However there are alternatives: intermediate assignment, and nested functions
|
||||
# that have always existed in base R anyway.
|
||||
#
|
||||
# Let us look at an example. In writing this, I found out that virtually
|
||||
# ALL non-trivial examples I came up with don't translate well into this idiom
|
||||
# at all. It is actually quite limited to simple filtering operations on
|
||||
# data. A more interesting example might be added in the future, let me know if
|
||||
# you have a good idea.
|
||||
#
|
||||
# A somewhat contrived example is to sort a list of files by the
|
||||
# length of the file names:
|
||||
|
||||
myFiles <- list.files(pattern = "\\.R$")
|
||||
|
||||
# nchar() gives the number of characters in a string, order() produces indices
|
||||
# that map an array to its sorted form.
|
||||
#
|
||||
# = 2 Nested Expression ===================================================
|
||||
|
||||
myFiles[order(nchar(myFiles))]
|
||||
|
||||
|
||||
# = 3 magrittr:: Pipe =====================================================
|
||||
|
||||
if (! requireNamespace("magrittr", quietly = TRUE)) {
|
||||
install.packages("magrittr")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = magrittr) # basic information
|
||||
# browseVignettes("magrittr") # available vignettes
|
||||
# data(package = "magrittr") # available datasets
|
||||
|
||||
|
||||
library(magrittr)
|
||||
|
||||
myFiles %>% nchar %>% order %>% myFiles[.]
|
||||
|
||||
# = 4 Base R Pipe =========================================================
|
||||
|
||||
# Since version 4.1, base R now supports a pipe operator without the need
|
||||
# to load a special package. Such an introduction of external functionality
|
||||
# into the language is very rare.
|
||||
#
|
||||
# Unfortunately it won't (yet) work with the '[' function, so we need to write
|
||||
# an intermediate function for this example
|
||||
extract <- function(x, v) {
  # Subset a vector by a set of indices. The indices are the FIRST argument,
  # so the function can be used as the last step of a pipe that produces them.
  # Parameters:
  #    x    index vector (e.g. the output of order())
  #    v    the vector to subset
  # Value:
  #    the elements of v selected by x
  selected <- v[x]
  return(selected)
}
|
||||
|
||||
myFiles |> nchar() |> order() |> extract(myFiles)
|
||||
|
||||
|
||||
# = 5 Intermediate Assignment =============================================
|
||||
|
||||
# So what's the problem? As you can see, the piped code may be concise and
|
||||
# expressive. But there is also a large amount of implicit assignment and
|
||||
# processing going on and that is usually a bad idea because it makes code hard
|
||||
# to maintain. I am NOT a big fan of the nested syntax, but I don't think that
|
||||
# replacing it with the pipe makes things much better. My preferred idiom is
|
||||
# to use intermediate assignments. Only then is it convenient to examine
|
||||
# the code step by step and validate every single step. And that is the most
|
||||
# important objective of all: no code is good if it does not compute
|
||||
# correctly.
|
||||
|
||||
|
||||
x <- nchar(myFiles)
|
||||
x <- order(x)
|
||||
myFiles[x]
|
||||
|
||||
|
||||
|
||||
# = 6 Postscript ==========================================================
|
||||
|
||||
# I tried to write an example that strips all comments from a list of files, and
|
||||
# another example that finds all files that were not yet updated this year
|
||||
# (according to the "# Date:" in the header). Neither example can be well
|
||||
# written without intermediate assignments, or at least sapply() functions
|
||||
# that are not simpler at all than the intermediate assignment.
|
||||
|
||||
# [END]
|
||||
# tocID <- "RPR-Pipe.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# Discussing pipe operators.
|
||||
#
|
||||
# Version: 1.0
|
||||
#
|
||||
# Date: 2021 10
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.0 New code
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
# - find more interesting examples
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ------------------------------------------------
|
||||
#TOC> 1 Pipe Concept 41
|
||||
#TOC> 2 Nested Expression 73
|
||||
#TOC> 3 magrittr:: Pipe 78
|
||||
#TOC> 4 Base R Pipe 93
|
||||
#TOC> 5 Intermediate Assignment 108
|
||||
#TOC> 6 Postscript 127
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Pipe Concept =======================================================
|
||||
|
||||
# Pipes are actually an awesome idea for any code that implements a workflow -
|
||||
# a sequence of operations, each of which transforms data in a specialized way.
|
||||
#
|
||||
# This principle is familiar from maths: chained functions. If I have a function
|
||||
# y = f(x) and want to use those results as in z = g(y), I can just write
|
||||
# z = g(f(x))
|
||||
#
|
||||
# On the unix command line, pipes were used from the very beginning, implemented
|
||||
# with the "|" pipe character.
|
||||
#
|
||||
# In R, the magrittr package provided the %>% operator, and recently the |>
|
||||
# operator has been introduced into base R.
|
||||
#
|
||||
# However there are alternatives: intermediate assignment, and nested functions
|
||||
# that have always existed in base R anyway.
|
||||
#
|
||||
# Let us look at an example. In writing this, I found out that virtually
|
||||
# ALL non-trivial examples I came up with don't translate well into this idiom
|
||||
# at all. It is actually quite limited to simple filtering operations on
|
||||
# data. A more interesting example might be added in the future, let me know if
|
||||
# you have a good idea.
|
||||
#
|
||||
# A somewhat contrived example is to sort a list of files by the
|
||||
# length of the file names:
|
||||
|
||||
myFiles <- list.files(pattern = "\\.R$")
|
||||
|
||||
# nchar() gives the number of characters in a string, order() produces indices
|
||||
# that map an array to its sorted form.
|
||||
#
|
||||
# = 2 Nested Expression ===================================================
|
||||
|
||||
myFiles[order(nchar(myFiles))]
|
||||
|
||||
|
||||
# = 3 magrittr:: Pipe =====================================================
|
||||
|
||||
if (! requireNamespace("magrittr", quietly = TRUE)) {
|
||||
install.packages("magrittr")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = magrittr) # basic information
|
||||
# browseVignettes("magrittr") # available vignettes
|
||||
# data(package = "magrittr") # available datasets
|
||||
|
||||
|
||||
library(magrittr)
|
||||
|
||||
myFiles %>% nchar %>% order %>% myFiles[.]
|
||||
|
||||
# = 4 Base R Pipe =========================================================
|
||||
|
||||
# Since version 4.1, base R now supports a pipe operator without the need
|
||||
# to load a special package. Such an introduction of external functionality
|
||||
# into the language is very rare.
|
||||
#
|
||||
# Unfortunately it won't (yet) work with the '[' function, so we need to write
|
||||
# an intermediate function for this example
|
||||
extract <- function(x, v) {
  # Subset a vector by a set of indices. The indices are the FIRST argument,
  # so the function can be used as the last step of a pipe that produces them.
  # Parameters:
  #    x    index vector (e.g. the output of order())
  #    v    the vector to subset
  # Value:
  #    the elements of v selected by x
  selected <- v[x]
  return(selected)
}
|
||||
|
||||
myFiles |> nchar() |> order() |> extract(myFiles)
|
||||
|
||||
|
||||
# = 5 Intermediate Assignment =============================================
|
||||
|
||||
# So what's the problem? As you can see, the piped code may be concise and
|
||||
# expressive. But there is also a large amount of implicit assignment and
|
||||
# processing going on and that is usually a bad idea because it makes code hard
|
||||
# to maintain. I am NOT a big fan of the nested syntax, but I don't think that
|
||||
# replacing it with the pipe makes things much better. My preferred idiom is
|
||||
# to use intermediate assignments. Only then is it convenient to examine
|
||||
# the code step by step and validate every single step. And that is the most
|
||||
# important objective of all: no code is good if it does not compute
|
||||
# correctly.
|
||||
|
||||
|
||||
x <- nchar(myFiles)
|
||||
x <- order(x)
|
||||
myFiles[x]
|
||||
|
||||
|
||||
|
||||
# = 6 Postscript ==========================================================
|
||||
|
||||
# I tried to write an example that strips all comments from a list of files, and
|
||||
# another example that finds all files that were not yet updated this year
|
||||
# (according to the "# Date:" in the header). Neither example can be well
|
||||
# written without intermediate assignments, or at least sapply() functions
|
||||
# that are not simpler at all than the intermediate assignment.
|
||||
|
||||
# [END]
|
||||
|
360
RPR-RegEx.R
360
RPR-RegEx.R
@ -1,180 +1,180 @@
|
||||
# tocID <- "RPR-RegEx.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-RegEx unit
|
||||
#
|
||||
# Version: 1.0
|
||||
#
|
||||
# Date: 2017-08 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# V 1.0 Maintenance 2020
|
||||
# V 0.1 First code
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
|
||||
#
|
||||
# DO NOT SIMPLY source() THESE FILES!
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ----------------------------------------------------
|
||||
#TOC> 1 A regex example 41
|
||||
#TOC> 2 Counting lines 108
|
||||
#TOC> 2.1 Counting C-alpha atoms only 126
|
||||
#TOC> 3 Code Solutions 142
|
||||
#TOC> 3.1 Counting atoms 144
|
||||
#TOC> 3.2 Counting C-alpha records 160
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 A regex example =====================================================
|
||||
|
||||
# The canonical FASTA version of yeast Mbp1 at Uniprot
|
||||
s <- ">sp|P39678|MBP1_YEAST Transcription factor MBP1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) GN=MBP1 PE=1 SV=1
|
||||
MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK
|
||||
ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA
|
||||
SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR
|
||||
KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ
|
||||
QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS
|
||||
PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY
|
||||
FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS
|
||||
IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP
|
||||
SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT
|
||||
ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP
|
||||
VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK
|
||||
IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR
|
||||
QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK
|
||||
IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"
|
||||
|
||||
nchar(s)
|
||||
# Must be 969
|
||||
|
||||
# Task: Fetch the Uniprot ID by retrieving the first string that appears between
|
||||
# two vertical bars ("pipes") in the header record.
|
||||
#
|
||||
|
||||
# Develop the regular expression:
|
||||
# Just five characters returned, so we know we are using
|
||||
patt <- "^>(.{5})" # the right functions
|
||||
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
|
||||
|
||||
patt <- "^>(.*)|" # everything to the pipe character
|
||||
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
|
||||
|
||||
# Ooops - "|" is a metacharacter - we must escape it
|
||||
|
||||
patt <- "^>(.*)\|" # using "\|"
|
||||
# Ooops - that's not how we escape: must double the \ to send a literal
|
||||
# "\" plus the character "|" to the regex engine.
|
||||
|
||||
patt <- "^>(.*)\\|" # using "\\|"
|
||||
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
|
||||
|
||||
# Good. Now let's first match everything that is not a "|", then match a "|"
|
||||
patt <- "^>([^|]*)\\|"
|
||||
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
|
||||
|
||||
# the same thing again, but capture the second match. And insist that there
|
||||
# must be at least one character captured
|
||||
|
||||
patt <- "^>[^|]*\\|([^|]+)\\|"
|
||||
# Analyze this pattern:
|
||||
# ^ anchor the match at the beginning of the line
|
||||
# > ">" must be the first character
|
||||
# [^|]* all-characters-except-a-vertical-bar, 0 or more times because
|
||||
# we don't know what other versions of the string "sp"
|
||||
# might appear. Note that within the brackets "|" is NOT a
|
||||
# metacharacter.
|
||||
# \\| "|" character: outside of square brackets "|" is a metacharacter
|
||||
# and means "OR"; we need to escape it to match a literal "|".
|
||||
# ( open parenthesis: capture what comes next ...
|
||||
# [^|]+ all-characters-except-a-vertical-bar, 1 or more times
|
||||
# ) close parenthesis: stop capturing here
|
||||
# \\| second "|" character, escaped
|
||||
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
|
||||
|
||||
|
||||
# = 2 Counting lines ======================================================
|
||||
|
||||
# Task: Write a function that returns the number of atoms in a PDB file. Call it
|
||||
# atomCount(). Sample data is here:
|
||||
myPDB <- readLines("./data/0TST.pdb")
|
||||
|
||||
# Specification:
|
||||
# Read a file from its path given as the only argument.
|
||||
# Return the number of lines in that file that begin with "ATOM "
|
||||
# or with "HETATM".
|
||||
|
||||
# Try this. Write a function. Solution code is at the end of this file.
|
||||
# Don't peek.
|
||||
|
||||
atomCount("./data/0TST.pdb") # must return 6
|
||||
|
||||
|
||||
|
||||
# == 2.1 Counting C-alpha atoms only =======================================
|
||||
|
||||
# Task: write a function based on the previous one that matches only CA records,
|
||||
# i.e. it can be used to count the number of amino acids. Don't get
|
||||
# fooled by calcium atoms, or the string CA appearing elsewhere.
|
||||
# cf. https://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM
|
||||
|
||||
# Specification:
|
||||
# Read a file from its path given as the only argument.
|
||||
# Return the number of lines in that file that have a C-alpha atom.
|
||||
|
||||
# Try this. Solution code is at the end of this file. Don't peek.
|
||||
|
||||
CAcount("./data/0TST.pdb") # must return 1
|
||||
|
||||
|
||||
# = 3 Code Solutions ======================================================
|
||||
|
||||
# == 3.1 Counting atoms ====================================================
|
||||
|
||||
atomCount <- function(IN) {
  # Count the atom records in a PDB formatted file.
  # Parameters:
  #    IN   chr   path of the file to read
  # Value:
  #    int   number of lines that match "^ATOM " or "^HETATM"
  # Note: both alternatives are anchored to the start of the line, so the
  #       keywords are not counted when they merely appear somewhere inside
  #       a comment or remark.
  isAtomRecord <- grepl("(^ATOM )|(^HETATM)", readLines(IN))
  return(sum(isAtomRecord))
}
|
||||
|
||||
|
||||
# == 3.2 Counting C-alpha records ==========================================
|
||||
|
||||
|
||||
CAcount <- function(IN) {
  # Count the C-alpha atoms in a PDB formatted file.
  # Parameters:
  #    IN   chr   path of the file to read
  # Value:
  #    int   number of ATOM records that have " CA " in columns 13 - 16
  # Note: the atom name MUST be matched in exactly the right columns.
  #       C-alpha is written " CA " in columns 13 - 16, whereas calcium is
  #       written "CA  ", i.e. shifted one column to the left. A pattern
  #       that is off by one column counts calcium and misses C-alpha.
  #
  # Column layout of the pattern:
  #    ^ATOM {2}   record name "ATOM" padded with blanks to columns 1 - 6
  #    .{6}        atom serial number (columns 7 - 11) plus column 12
  #    " CA "      atom name in columns 13 - 16
  # Explicit repeat counts are used instead of runs of blanks and dots, so
  # the column arithmetic cannot be changed silently by whitespace edits.
  x <- readLines(IN)
  patt <- "^ATOM {2}.{6} CA "
  return(length(grep(patt, x)))
}
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "RPR-RegEx.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-RegEx unit
|
||||
#
|
||||
# Version: 1.0
|
||||
#
|
||||
# Date: 2017-08 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# V 1.0 Maintenance 2020
|
||||
# V 0.1 First code
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
|
||||
#
|
||||
# DO NOT SIMPLY source() THESE FILES!
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ----------------------------------------------------
|
||||
#TOC> 1 A regex example 41
|
||||
#TOC> 2 Counting lines 108
|
||||
#TOC> 2.1 Counting C-alpha atoms only 126
|
||||
#TOC> 3 Code Solutions 142
|
||||
#TOC> 3.1 Counting atoms 144
|
||||
#TOC> 3.2 Counting C-alpha records 160
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 A regex example =====================================================
|
||||
|
||||
# The canonical FASTA version of yeast Mbp1 at Uniprot
|
||||
s <- ">sp|P39678|MBP1_YEAST Transcription factor MBP1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) GN=MBP1 PE=1 SV=1
|
||||
MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK
|
||||
ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA
|
||||
SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR
|
||||
KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ
|
||||
QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS
|
||||
PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY
|
||||
FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS
|
||||
IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP
|
||||
SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT
|
||||
ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP
|
||||
VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK
|
||||
IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR
|
||||
QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK
|
||||
IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"
|
||||
|
||||
nchar(s)
|
||||
# Must be 969
|
||||
|
||||
# Task: Fetch the Uniprot ID by retrieving the first string that appears between
|
||||
# two vertical bars ("pipes") in the header record.
|
||||
#
|
||||
|
||||
# Develop the regular expression:
|
||||
# Just five characters returned, so we know we are using
|
||||
patt <- "^>(.{5})" # the right functions
|
||||
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
|
||||
|
||||
patt <- "^>(.*)|" # everything to the pipe character
|
||||
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
|
||||
|
||||
# Ooops - "|" is a metacharacter - we must escape it
|
||||
|
||||
patt <- "^>(.*)\|" # using "\|"
|
||||
# Ooops - that's not how we escape: must double the \ to send a literal
|
||||
# "\" plus the character "|" to the regex engine.
|
||||
|
||||
patt <- "^>(.*)\\|" # using "\\|"
|
||||
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
|
||||
|
||||
# Good. Now let's first match everything that is not a "|", then match a "|"
|
||||
patt <- "^>([^|]*)\\|"
|
||||
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
|
||||
|
||||
# the same thing again, but capture the second match. And insist that there
|
||||
# must be at least one character captured
|
||||
|
||||
patt <- "^>[^|]*\\|([^|]+)\\|"
|
||||
# Analyze this pattern:
|
||||
# ^ anchor the match at the beginning of the line
|
||||
# > ">" must be the first character
|
||||
# [^|]* all-characters-except-a-vertical-bar, 0 or more times because
|
||||
# we don't know what other versions of the string "sp"
|
||||
# might appear. Note that within the brackets "|" is NOT a
|
||||
# metacharacter.
|
||||
# \\| "|" character: outside of square brackets "|" is a metacharacter
|
||||
# and means "OR"; we need to escape it to match a literal "|".
|
||||
# ( open parenthesis: capture what comes next ...
|
||||
# [^|]+ all-characters-except-a-vertical-bar, 1 or more times
|
||||
# ) close parenthesis: stop capturing here
|
||||
# \\| second "|" character, escaped
|
||||
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
|
||||
|
||||
|
||||
# = 2 Counting lines ======================================================
|
||||
|
||||
# Task: Write a function that returns the number of atoms in a PDB file. Call it
|
||||
# atomCount(). Sample data is here:
|
||||
myPDB <- readLines("./data/0TST.pdb")
|
||||
|
||||
# Specification:
|
||||
# Read a file from its path given as the only argument.
|
||||
# Return the number of lines in that file that begin with "ATOM "
|
||||
# or with "HETATM".
|
||||
|
||||
# Try this. Write a function. Solution code is at the end of this file.
|
||||
# Don't peek.
|
||||
|
||||
atomCount("./data/0TST.pdb") # must return 6
|
||||
|
||||
|
||||
|
||||
# == 2.1 Counting C-alpha atoms only =======================================
|
||||
|
||||
# Task: write a function based on the previous one that matches only CA records,
|
||||
# i.e. it can be used to count the number of amino acids. Don't get
|
||||
# fooled by calcium atoms, or the string CA appearing elsewhere.
|
||||
# cf. https://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM
|
||||
|
||||
# Specification:
|
||||
# Read a file from its path given as the only argument.
|
||||
# Return the number of lines in that file that have a C-alpha atom.
|
||||
|
||||
# Try this. Solution code is at the end of this file. Don't peek.
|
||||
|
||||
CAcount("./data/0TST.pdb") # must return 1
|
||||
|
||||
|
||||
# = 3 Code Solutions ======================================================
|
||||
|
||||
# == 3.1 Counting atoms ====================================================
|
||||
|
||||
atomCount <- function(IN) {
  # Count the atom records in a PDB formatted file.
  # Parameters:
  #    IN   chr   path of the file to read
  # Value:
  #    int   number of lines that match "^ATOM " or "^HETATM"
  # Note: both alternatives are anchored to the start of the line, so the
  #       keywords are not counted when they merely appear somewhere inside
  #       a comment or remark.
  isAtomRecord <- grepl("(^ATOM )|(^HETATM)", readLines(IN))
  return(sum(isAtomRecord))
}
|
||||
|
||||
|
||||
# == 3.2 Counting C-alpha records ==========================================
|
||||
|
||||
|
||||
CAcount <- function(IN) {
  # Count the C-alpha atoms in a PDB formatted file.
  # Parameters:
  #    IN   chr   path of the file to read
  # Value:
  #    int   number of ATOM records that have " CA " in columns 13 - 16
  # Note: the atom name MUST be matched in exactly the right columns.
  #       C-alpha is written " CA " in columns 13 - 16, whereas calcium is
  #       written "CA  ", i.e. shifted one column to the left. A pattern
  #       that is off by one column counts calcium and misses C-alpha.
  #
  # Column layout of the pattern:
  #    ^ATOM {2}   record name "ATOM" padded with blanks to columns 1 - 6
  #    .{6}        atom serial number (columns 7 - 11) plus column 12
  #    " CA "      atom name in columns 13 - 16
  # Explicit repeat counts are used instead of runs of blanks and dots, so
  # the column arithmetic cannot be changed silently by whitespace edits.
  x <- readLines(IN)
  patt <- "^ATOM {2}.{6} CA "
  return(length(grep(patt, x)))
}
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
|
1658
RPR-SX-PDB.R
1658
RPR-SX-PDB.R
File diff suppressed because it is too large
Load Diff
@ -1,135 +1,135 @@
|
||||
# tocID <- "RPR-UniProt_GET.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-Scripting_data_downloads unit.
|
||||
#
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017-10 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 2020 Maintenance. Made dbFetchUniProtSeq() vector-safe and
|
||||
# added FASTA headers as attribute
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 1.0 First ABC units version
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ----------------------------------------------------------
|
||||
#TOC> 1 UniProt files via GET 43
|
||||
#TOC> 1.1 Task - fetchUniProtSeq() function 105
|
||||
#TOC> 2 Task solutions 118
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 UniProt files via GET ===============================================
|
||||
|
||||
|
||||
# Perhaps the simplest example of scripted download is to retrieve a protein
|
||||
# FASTA sequence from UniProt. All we need is to construct a URL with the
|
||||
# correct UniProt ID.
|
||||
|
||||
# An interface between R scripts and Web servers is provided by the httr::
|
||||
# package. This sends and receives information via the http protocol, just like
|
||||
# a Web browser. Since this is a short and simple request, the GET verb is the
|
||||
# right tool:
|
||||
|
||||
if (! requireNamespace("httr", quietly = TRUE)) {
|
||||
install.packages("httr")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = httr) # basic information
|
||||
# browseVignettes("httr") # available vignettes
|
||||
# data(package = "httr") # available datasets
|
||||
|
||||
|
||||
# The UniProt ID for Mbp1 is ...
|
||||
|
||||
UniProtID <- "P39678"
|
||||
|
||||
# and the base URL to retrieve data is ...
|
||||
# http://www.uniprot.org/uniprot/ . We can construct a simple URL to
|
||||
# retrieve a FASTA sequence:
|
||||
|
||||
(URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID))
|
||||
|
||||
# the GET() function from httr will get the data.
|
||||
response <- httr::GET(URL)
|
||||
|
||||
str(response) # the response object is a bit complex ...
|
||||
as.character(response) # ... but it is easy to pull out the data.
|
||||
|
||||
# to process ...
|
||||
x <- as.character(response)
|
||||
x <- strsplit(x, "\n")
|
||||
dbSanitizeSequence(x)
|
||||
|
||||
# Simple.
|
||||
# But what happens if there is an error, e.g. the uniprot ID does not exist?
|
||||
|
||||
response <- httr::GET("http://www.uniprot.org/uniprot/X000000.fasta")
|
||||
as.character(response)
|
||||
# this is a large HTML page that tells us the URL was not found. So we need to
|
||||
# check for errors. The Right Way to do this is to evaluate the status code that
|
||||
# every Web server returns for every transaction.
|
||||
#
|
||||
httr::status_code(response) # 404 == Page Not Found
|
||||
|
||||
# There are many possible codes, but the only code we will be happy with
|
||||
# is 200 - OK.
|
||||
# (cf. https://en.wikipedia.org/wiki/List_of_HTTP_status_codes )
|
||||
|
||||
URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID)
|
||||
response <- httr::GET(URL)
|
||||
httr::status_code(response)
|
||||
|
||||
|
||||
# == 1.1 Task - fetchUniProtSeq() function =================================
|
||||
|
||||
# Task: write a function that
|
||||
# - takes as input a vector of UniProt IDs,
|
||||
# - fetches the FASTA sequence for each
|
||||
# - returns a vector of the same length as the input, where an element is:
|
||||
# - ... the sequence, if the query was successful
|
||||
# - ... NA if there was an error
|
||||
# - each element has the UniProt ID as the name()
|
||||
# - bonus: the output has an attribute "headers" that is a vector of the
|
||||
# FASTA headers ( cf. ?attr )
|
||||
|
||||
|
||||
# = 2 Task solutions ======================================================
|
||||
|
||||
|
||||
# I have placed such a function - dbFetchUniProtSeq() - into
|
||||
# "./scripts/ABC-dbUtilities.R": look it up by clicking on dbFetchUniProtSeq()
|
||||
# in the Environment pane.
|
||||
|
||||
# Test this:
|
||||
( x <- dbFetchUniProtSeq("P39678") )
|
||||
names(x)[1]
|
||||
attr(x, "headers")[1]
|
||||
x[1]
|
||||
cat(writeFASTA(data.frame(head = attr(x, "headers")[1], seq =x[1]),
|
||||
width = 40), sep = "\n")
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "RPR-UniProt_GET.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-Scripting_data_downloads unit.
|
||||
#
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017-10 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 2020 Maintenance. Made dbFetchUniProtSeq() vector-safe and
|
||||
# added FASTA headers as attribute
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 1.0 First ABC units version
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ----------------------------------------------------------
|
||||
#TOC> 1 UniProt files via GET 43
|
||||
#TOC> 1.1 Task - fetchUniProtSeq() function 105
|
||||
#TOC> 2 Task solutions 118
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 UniProt files via GET ===============================================
|
||||
|
||||
|
||||
# Perhaps the simplest example of scripted download is to retrieve a protein
|
||||
# FASTA sequence from UniProt. All we need is to construct an URL with the
|
||||
# correct UniProt ID.
|
||||
|
||||
# An interface between R scripts and Web servers is provided by the httr::
|
||||
# package. This sends and receives information via the http protocol, just like
|
||||
# a Web browser. Since this is a short and simple request, the GET verb is the
|
||||
# right tool:
|
||||
|
||||
if (! requireNamespace("httr", quietly = TRUE)) {
|
||||
install.packages("httr")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = httr) # basic information
|
||||
# browseVignettes("httr") # available vignettes
|
||||
# data(package = "httr") # available datasets
|
||||
|
||||
|
||||
# The UniProt ID for Mbp1 is ...
|
||||
|
||||
UniProtID <- "P39678"
|
||||
|
||||
# and the base URL to retrieve data is ...
|
||||
# http://www.uniprot.org/uniprot/ . We can construct a simple URL to
|
||||
# retrieve a FASTA sequence:
|
||||
|
||||
(URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID))
|
||||
|
||||
# the GET() function from httr will get the data.
|
||||
response <- httr::GET(URL)
|
||||
|
||||
str(response) # the response object is a bit complex ...
|
||||
as.character(response) # ... but it is easy to pull out the data.
|
||||
|
||||
# to process ...
|
||||
x <- as.character(response)
|
||||
x <- strsplit(x, "\n")
|
||||
dbSanitizeSequence(x)
|
||||
|
||||
# Simple.
|
||||
# But what happens if there is an error, e.g. the uniprot ID does not exist?
|
||||
|
||||
response <- httr::GET("http://www.uniprot.org/uniprot/X000000.fasta")
|
||||
as.character(response)
|
||||
# this is a large HTML page that tells us the URL was not found. So we need to
|
||||
# check for errors. The Right Way to do this is to evaluate the status code that
|
||||
# every Web server returns for every transaction.
|
||||
#
|
||||
httr::status_code(response) # 404 == Page Not Found
|
||||
|
||||
# There are many possible codes, but the only code we will be happy with
|
||||
# is 200 - oK.
|
||||
# (cf. https://en.wikipedia.org/wiki/List_of_HTTP_status_codes )
|
||||
|
||||
URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID)
|
||||
response <- httr::GET(URL)
|
||||
httr::status_code(response)
|
||||
|
||||
|
||||
# == 1.1 Task - fetchUniProtSeq() function =================================
|
||||
|
||||
# Task: write a function that
|
||||
# - takes as input a vector of UniProt IDs,
|
||||
# - fetches the FASTA sequence for each
|
||||
# - returns a vector of the same length as the input, where an element is:
|
||||
# - ... the sequence, if the query was successful
|
||||
# - ... NA if there was an error
|
||||
# - each element has the UniProt ID as the name()
|
||||
# - bonus: the output has an attribute "headers" that is a vector of the
|
||||
# FASTA headers ( cf. ?attr )
|
||||
|
||||
|
||||
# = 2 Task solutions ======================================================
|
||||
|
||||
|
||||
# I have placed such a function - dbFetchUniProtSeq() - into
|
||||
# "./scripts/ABC-dbUtilities.R": look it up by clicking on dbFetchUniProtSeq()
|
||||
# in the Environment pane.
|
||||
|
||||
# Test this:
|
||||
( x <- dbFetchUniProtSeq("P39678") )
|
||||
names(x)[1]
|
||||
attr(x, "headers")[1]
|
||||
x[1]
|
||||
cat(writeFASTA(data.frame(head = attr(x, "headers")[1], seq =x[1]),
|
||||
width = 40), sep = "\n")
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
|
@ -1,234 +1,234 @@
|
||||
# tocID <- "RPR-Unit_testing.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-Unit_testing unit.
|
||||
#
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017 10 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 2020 Updates. Discuss local tests.
|
||||
# 1.1 Change from require() to requireNamespace()
|
||||
# 1.0 New code
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -------------------------------------------------
|
||||
#TOC> 1 Unit Tests with testthat 42
|
||||
#TOC> 2 Organizing your tests 165
|
||||
#TOC> 2.1 Testing scripts 189
|
||||
#TOC> 2.2 Rethinking testing 202
|
||||
#TOC> 3 Task solutions 220
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Unit Tests with testthat ============================================
|
||||
|
||||
# The testthat package supports writing and executing unit tests in many ways.
|
||||
|
||||
if (! requireNamespace("testthat", quietly = TRUE)) {
|
||||
install.packages("testthat")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = testthat) # basic information
|
||||
# browseVignettes("testthat") # available vignettes
|
||||
# data(package = "testthat") # available datasets
|
||||
|
||||
# testthat is one of those packages that we either use A LOT in a script,
|
||||
# or not at all. Therefore it's more reasonable to depart from our usual
|
||||
# <package>::<function>() idiom, and load the entire library. In fact, if
|
||||
# we author packages, it is common practice to load testthat in the part
|
||||
# of the package that automates testing.
|
||||
|
||||
library(testthat)
|
||||
|
||||
# An atomic test consists of an expectation about the behaviour of a function or
|
||||
# the existence of an object. testthat provides a number of useful expectations:
|
||||
|
||||
# At the most basic level, you can use expect_true() and expect_false():
|
||||
|
||||
expect_true(file.exists("./data/S288C_YDL056W_MBP1_coding.fsa"))
|
||||
expect_true(file.exists("NO-SUCH-FILE.txt"))
|
||||
|
||||
expect_false(is.integer(NA))
|
||||
|
||||
# More commonly, you will test for equality of an output with a given result.
|
||||
# But you need to consider what it means for two numbers to be "equal" on a
|
||||
# digital computer. Consider:
|
||||
|
||||
49*(1/49) == 1 # Surprised? Read FAQ 7.31
|
||||
# https://cran.r-project.org/doc/FAQ/R-FAQ.html
|
||||
49*(1/49) - 1 # NOT zero (but almost)
|
||||
|
||||
# This is really unpredictable ...
|
||||
0.1 + 0.05 == 0.15
|
||||
0.2 + 0.07 == 0.27
|
||||
|
||||
# It's easy to be caught on the wrong foot with numeric comparisons, therefore
|
||||
# R uses the function all.equal() to test whether two numbers are equal for
|
||||
# practical purposes up to machine precision.
|
||||
49*(1/49) == 1
|
||||
all.equal(49*(1/49), 1)
|
||||
|
||||
# The testthat function expect_equal() uses all.equal internally:
|
||||
expect_equal(49*(1/49), 1)
|
||||
|
||||
# ... which is reasonable, or, if things MUST be exactly the same ...
|
||||
expect_identical(49*(1/49), 1)
|
||||
|
||||
# ... but consider:
|
||||
expect_identical(2, 2L) # one is typeof() "double", the other is "integer"
|
||||
|
||||
# Some very useful expectations are expect_warning(), and expect_error(), for
|
||||
# constructing tests that check for erroneous output:
|
||||
|
||||
as.integer(c("1", "2", "three"))
|
||||
expect_warning(as.integer(c("1", "2", "three"))) # Note that the warning is NOT
|
||||
# printed.
|
||||
1/"x"
|
||||
expect_warning(1/"x")
|
||||
expect_error(1/"x") # Again: note that the error is NOT printed, as well
|
||||
# code execution will continue.
|
||||
|
||||
# Even better, you can check if the warning or error is what you expect it
|
||||
# to be - because it could actually have occurred somewhere else in your code.
|
||||
|
||||
v <- c("1", "x")
|
||||
log(v[1:2])
|
||||
expect_error(log(v[1:2]), "non-numeric argument to mathematical function")
|
||||
expect_error(log(v[1:2]), "non-numeric") # We can abbreviate the error message.
|
||||
expect_error(log(v[1,2])) # This appears oK, but ...
|
||||
expect_error(log(v[1,2]), "non-numeric") # ... it's actually a different error!
|
||||
|
||||
# Producing unit tests simply means: we define a function, and then we check
|
||||
# whether all tests pass. Consider a function that is loaded on startup from
|
||||
# the .utilities.R script:
|
||||
|
||||
biCode
|
||||
|
||||
# We could test it like so:
|
||||
|
||||
expect_equal(biCode(""), ".....")
|
||||
expect_equal(biCode(" "), ".....")
|
||||
expect_equal(biCode("123 12"), ".....")
|
||||
expect_equal(biCode("h sapiens"), "H..SA")
|
||||
expect_equal(biCode("homo sapiens"), "HOMSA")
|
||||
expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA")
|
||||
expect_equal(biCode(c("Phascolarctos cinereus", "Macropus rufus")),
|
||||
c("PHACI", "MACRU"))
|
||||
expect_error(biCode(), "argument \"s\" is missing, with no default")
|
||||
|
||||
# The test_that() function allows to group related tests, include an informative
|
||||
# message which test is being executed, and run a number of tests that are
|
||||
# passed to the function inside a code block - i.e. {...}
|
||||
# test_that("<descriptive string>", {<code block>})
|
||||
|
||||
test_that("NA values are preserved", {
|
||||
# biCode() respects vector length: input and output must have the same length.
|
||||
# Therefore NA's can't be simply skipped, but must be properly passed
|
||||
# into output:
|
||||
expect_true(is.na((biCode(NA))))
|
||||
expect_equal(biCode(c("first", NA, "last")),
|
||||
c("FIRST", NA, "LAST."))
|
||||
})
|
||||
|
||||
|
||||
# Task: Write a function calcGC() that calculates GC content in a sequence.
|
||||
# Hint: you could strsplit() the sequence into a vector, and count
|
||||
# G's and C's; or you could use gsub("[AT]", "", <sequence>) to remove
|
||||
# A's and T's, and use nchar() before and after to calculate the content
|
||||
# from the length difference.
|
||||
# Then write tests that:
|
||||
# confirm that calcGC("AATT") is 0;
|
||||
# confirm that calcGC("ATGC") is 0.5;
|
||||
# confirm that calcGC("AC") is 0.5;
|
||||
# confirm that calcGC("CGCG") is 1;
|
||||
|
||||
|
||||
# = 2 Organizing your tests ===============================================
|
||||
|
||||
|
||||
# Tests are only useful if they are actually executed and we need to make sure
|
||||
# there are no barriers to do that. The testthat package supports automatic
|
||||
# execution of tests:
|
||||
# - put your tests into an R-script,
|
||||
# - save your tests in a file called "test_<my-function-name>.R"
|
||||
# - execute the test with test_file("test_<my-function-name>.R") ...
|
||||
# ... or, if you are working on a project ...
|
||||
# - place the file in a test-directory (e.g. the directory "tests" in this
|
||||
# project),
|
||||
# - execute all your tests with test_dir("<my-test-directory>")
|
||||
|
||||
# For example I have provided a "tests" directory with this project, and
|
||||
# placed the file "test_biCode.R" inside.
|
||||
file.show("./tests/test_biCode.R")
|
||||
|
||||
# Execute the file ...
|
||||
test_file("./tests/test_biCode.R")
|
||||
|
||||
# .. or execute all the test files in the directory:
|
||||
test_dir("./tests")
|
||||
|
||||
# == 2.1 Testing scripts ===================================================
|
||||
|
||||
# Scripts need special consideration since we do not necessarily source() them
|
||||
# entirely. Therefore automated testing is not reasonable. What you can do
|
||||
# instead is to place a conditional block at the end of your script, that
|
||||
# never gets executed - then you can manually execute the code in the block
|
||||
# whenever you wish to test your functions. For example:
|
||||
|
||||
if (FALSE) {
|
||||
# ... your tests go here
|
||||
|
||||
}
|
||||
|
||||
# == 2.2 Rethinking testing ================================================
|
||||
|
||||
# However, it is important to keep in mind that different objectives lead to
|
||||
# different ideas of what works best. There is never a "best" in and of itself,
|
||||
# the question is always: "Best for what?" While automated unit testing is a
|
||||
# great way to assure the integrity of packages and larger software artefacts as
|
||||
# they are being developed, more loosely conceived aggregates of code - like the
|
||||
# scripts for this course for example - have different objectives and in this
|
||||
# case I find the testthat approach to actually be inferior. The reason is its
|
||||
# tendency to physically separate code and tests. Keeping assets, and functions
|
||||
# that operate on those assets separated is always poor design. I have found
|
||||
# over time that a more stable approach is to move individual functions into
|
||||
# their individual scripts, all in one folder, one function (and its helpers)
|
||||
# per file, and examples, demos and tests in an if (FALSE) { ... } block, as
|
||||
# explained above.
|
||||
|
||||
|
||||
|
||||
# = 3 Task solutions ======================================================
|
||||
|
||||
# calcGC(): GC content of a nucleotide sequence string.
#   s: character; the sequence(s). Upper- and lower-case letters are accepted.
#   Value: (number of G/C characters) / (number of A/C/G/T characters).
calcGC <- function(s) {
|
||||
# Keep only A, C, G, T (either case); everything else is discarded first so
# that it does not count towards the sequence length.
s <- gsub("[^agctAGCT]", "", s)
|
||||
# Removing A/T leaves only G/C: the length ratio is the GC fraction.
# Note: if no A/C/G/T characters remain, this is 0/0, i.e. NaN.
return(nchar(gsub("[atAT]", "", s)) / nchar(s))
|
||||
}
|
||||
|
||||
expect_equal(calcGC("AATT"), 0)
|
||||
expect_equal(calcGC("ATGC"), 0.5)
|
||||
expect_equal(calcGC("AC"), 0.5)
|
||||
expect_equal(calcGC("CGCG"), 1)
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "RPR-Unit_testing.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-Unit_testing unit.
|
||||
#
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017 10 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 2020 Updates. Discuss local tests.
|
||||
# 1.1 Change from require() to requireNamespace()
|
||||
# 1.0 New code
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -------------------------------------------------
|
||||
#TOC> 1 Unit Tests with testthat 42
|
||||
#TOC> 2 Organizing your tests 165
|
||||
#TOC> 2.1 Testing scripts 189
|
||||
#TOC> 2.2 Rethinking testing 202
|
||||
#TOC> 3 Task solutions 220
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Unit Tests with testthat ============================================
|
||||
|
||||
# The testthat package supports writing and executing unit tests in many ways.
|
||||
|
||||
if (! requireNamespace("testthat", quietly = TRUE)) {
|
||||
install.packages("testthat")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = testthat) # basic information
|
||||
# browseVignettes("testthat") # available vignettes
|
||||
# data(package = "testthat") # available datasets
|
||||
|
||||
# testthat is one of those packages that we either use A LOT in a script,
|
||||
# or not at all. Therefore it's more reasonable to depart from our usual
|
||||
# <package>::<function>() idiom, and load the entire library. In fact, if
|
||||
# we author packages, it is common practice to load testthat in the part
|
||||
# of the package that automates testing.
|
||||
|
||||
library(testthat)
|
||||
|
||||
# An atomic test consists of an expectation about the behaviour of a function or
|
||||
# the existence of an object. testthat provides a number of useful expectations:
|
||||
|
||||
# At the most basic level, you can use expect_true() and expect_false():
|
||||
|
||||
expect_true(file.exists("./data/S288C_YDL056W_MBP1_coding.fsa"))
|
||||
expect_true(file.exists("NO-SUCH-FILE.txt"))
|
||||
|
||||
expect_false(is.integer(NA))
|
||||
|
||||
# More commonly, you will test for equality of an output with a given result.
|
||||
# But you need to consider what it means for two numbers to be "equal" on a
|
||||
# digital computer. Consider:
|
||||
|
||||
49*(1/49) == 1 # Surprised? Read FAQ 7.31
|
||||
# https://cran.r-project.org/doc/FAQ/R-FAQ.html
|
||||
49*(1/49) - 1 # NOT zero (but almost)
|
||||
|
||||
# This is really unpredictable ...
|
||||
0.1 + 0.05 == 0.15
|
||||
0.2 + 0.07 == 0.27
|
||||
|
||||
# It's easy to be caught on the wrong foot with numeric comparisons, therefore
|
||||
# R uses the function all.equal() to test whether two numbers are equal for
|
||||
# practical purposes up to machine precision.
|
||||
49*(1/49) == 1
|
||||
all.equal(49*(1/49), 1)
|
||||
|
||||
# The testthat function expect_equal() uses all.equal internally:
|
||||
expect_equal(49*(1/49), 1)
|
||||
|
||||
# ... which is reasonable, or, if things MUST be exactly the same ...
|
||||
expect_identical(49*(1/49), 1)
|
||||
|
||||
# ... but consider:
|
||||
expect_identical(2, 2L) # one is typeof() "double", the other is "integer"
|
||||
|
||||
# Some very useful expectations are expect_warning(), and expect_error(), for
|
||||
# constructing tests that check for erroneous output:
|
||||
|
||||
as.integer(c("1", "2", "three"))
|
||||
expect_warning(as.integer(c("1", "2", "three"))) # Note that the warning is NOT
|
||||
# printed.
|
||||
1/"x"
|
||||
expect_warning(1/"x")
|
||||
expect_error(1/"x") # Again: note that the error is NOT printed, as well
|
||||
# code execution will continue.
|
||||
|
||||
# Even better, you can check if the warning or error is what you expect it
|
||||
# to be - because it could actually have occurred somewhere else in your code.
|
||||
|
||||
v <- c("1", "x")
|
||||
log(v[1:2])
|
||||
expect_error(log(v[1:2]), "non-numeric argument to mathematical function")
|
||||
expect_error(log(v[1:2]), "non-numeric") # We can abbreviate the error message.
|
||||
expect_error(log(v[1,2])) # This appears oK, but ...
|
||||
expect_error(log(v[1,2]), "non-numeric") # ... it's actually a different error!
|
||||
|
||||
# Producing unit tests simply means: we define a function, and then we check
|
||||
# whether all tests pass. Consider a function that is loaded on startup from
|
||||
# the .utilities.R script:
|
||||
|
||||
biCode
|
||||
|
||||
# We could test it like so:
|
||||
|
||||
expect_equal(biCode(""), ".....")
|
||||
expect_equal(biCode(" "), ".....")
|
||||
expect_equal(biCode("123 12"), ".....")
|
||||
expect_equal(biCode("h sapiens"), "H..SA")
|
||||
expect_equal(biCode("homo sapiens"), "HOMSA")
|
||||
expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA")
|
||||
expect_equal(biCode(c("Phascolarctos cinereus", "Macropus rufus")),
|
||||
c("PHACI", "MACRU"))
|
||||
expect_error(biCode(), "argument \"s\" is missing, with no default")
|
||||
|
||||
# The test_that() function allows to group related tests, include an informative
|
||||
# message which test is being executed, and run a number of tests that are
|
||||
# passed to the function inside a code block - i.e. {...}
|
||||
# test_that("<descriptive string>", {<code block>})
|
||||
|
||||
test_that("NA values are preserved", {
|
||||
# biCode() respects vector length: input and output must have the same length.
|
||||
# Therefore NA's can't be simply skipped, but must be properly passed
|
||||
# into output:
|
||||
expect_true(is.na((biCode(NA))))
|
||||
expect_equal(biCode(c("first", NA, "last")),
|
||||
c("FIRST", NA, "LAST."))
|
||||
})
|
||||
|
||||
|
||||
# Task: Write a function calcGC() that calculates GC content in a sequence.
|
||||
# Hint: you could strsplit() the sequence into a vector, and count
|
||||
# G's and C's; or you could use gsub("[AT]", "", <sequence>) to remove
|
||||
# A's and T's, and use nchar() before and after to calculate the content
|
||||
# from the length difference.
|
||||
# Then write tests that:
|
||||
# confirm that calcGC("AATT") is 0;
|
||||
# confirm that calcGC("ATGC") is 0.5;
|
||||
# confirm that calcGC("AC") is 0.5;
|
||||
# confirm that calcGC("CGCG") is 1;
|
||||
|
||||
|
||||
# = 2 Organizing your tests ===============================================
|
||||
|
||||
|
||||
# Tests are only useful if they are actually executed and we need to make sure
|
||||
# there are no barriers to do that. The testthat package supports automatic
|
||||
# execution of tests:
|
||||
# - put your tests into an R-script,
|
||||
# - save your tests in a file called "test_<my-function-name>.R"
|
||||
# - execute the test with test_file("test_<my-function-name>.R") ...
|
||||
# ... or, if you are working on a project ...
|
||||
# - place the file in a test-directory (e.g. the directory "tests" in this
|
||||
# project),
|
||||
# - execute all your tests with test_dir("<my-test-directory>")
|
||||
|
||||
# For example I have provided a "tests" directory with this project, and
|
||||
# placed the file "test_biCode.R" inside.
|
||||
file.show("./tests/test_biCode.R")
|
||||
|
||||
# Execute the file ...
|
||||
test_file("./tests/test_biCode.R")
|
||||
|
||||
# .. or execute all the test files in the directory:
|
||||
test_dir("./tests")
|
||||
|
||||
# == 2.1 Testing scripts ===================================================
|
||||
|
||||
# Scripts need special consideration since we do not necessarily source() them
|
||||
# entirely. Therefore automated testing is not reasonable. What you can do
|
||||
# instead is to place a conditional block at the end of your script, that
|
||||
# never gets executed - then you can manually execute the code in the block
|
||||
# whenever you wish to test your functions. For example:
|
||||
|
||||
if (FALSE) {
|
||||
# ... your tests go here
|
||||
|
||||
}
|
||||
|
||||
# == 2.2 Rethinking testing ================================================
|
||||
|
||||
# However, it is important to keep in mind that different objectives lead to
|
||||
# different ideas of what works best. There is never a "best" in and of itself,
|
||||
# the question is always: "Best for what?" While automated unit testing is a
|
||||
# great way to assure the integrity of packages and larger software artefacts as
|
||||
# they are being developed, more loosely conceived aggregates of code - like the
|
||||
# scripts for this course for example - have different objectives and in this
|
||||
# case I find the testthat approach to actually be inferior. The reason is its
|
||||
# tendency to physically separate code and tests. Keeping assets, and functions
|
||||
# that operate on those assets separated is always poor design. I have found
|
||||
# over time that a more stable approach is to move individual functions into
|
||||
# their individual scripts, all in one folder, one function (and its helpers)
|
||||
# per file, and examples, demos and tests in an if (FALSE) { ... } block, as
|
||||
# explained above.
|
||||
|
||||
|
||||
|
||||
# = 3 Task solutions ======================================================
|
||||
|
||||
# calcGC(): GC content of a nucleotide sequence string.
#   s: character; the sequence(s). Upper- and lower-case letters are accepted.
#   Value: (number of G/C characters) / (number of A/C/G/T characters).
calcGC <- function(s) {
|
||||
# Keep only A, C, G, T (either case); everything else is discarded first so
# that it does not count towards the sequence length.
s <- gsub("[^agctAGCT]", "", s)
|
||||
# Removing A/T leaves only G/C: the length ratio is the GC fraction.
# Note: if no A/C/G/T characters remain, this is 0/0, i.e. NaN.
return(nchar(gsub("[atAT]", "", s)) / nchar(s))
|
||||
}
|
||||
|
||||
expect_equal(calcGC("AATT"), 0)
|
||||
expect_equal(calcGC("ATGC"), 0.5)
|
||||
expect_equal(calcGC("AC"), 0.5)
|
||||
expect_equal(calcGC("CGCG"), 1)
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
|
332
RPR-eUtils_XML.R
332
RPR-eUtils_XML.R
@ -1,166 +1,166 @@
|
||||
# tocID <- "RPR-eUtils_XML.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-Scripting_data_downloads unit.
|
||||
#
|
||||
# Version: 1.2.1
|
||||
#
|
||||
# Date: 2017-10 - 2021-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2.1 2021 Maintenance
|
||||
# 1.2 2020 Updates
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 1.0 First ABC units version
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -----------------------------------------------------------
|
||||
#TOC> 1 Working with NCBI eUtils 43
|
||||
#TOC> 1.1 Task - fetchNCBItaxData() function 145
|
||||
#TOC> 2 Task solutions 152
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Working with NCBI eUtils ============================================
|
||||
|
||||
|
||||
# To begin, we load the xml2 package that contains functions
|
||||
# we need to receive and parse html data. NCBI's eUtils send information in
|
||||
# XML format so we need to be able to parse XML.
|
||||
if (! requireNamespace("xml2", quietly=TRUE)) {
|
||||
install.packages("xml2")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = xml2) # basic information
|
||||
# browseVignettes("xml2") # available vignettes
|
||||
# data(package = "xml2") # available datasets
|
||||
|
||||
|
||||
|
||||
# We will walk through the process with the refSeqID
|
||||
# of yeast Mbp1
|
||||
refSeqID <- "NP_010227"
|
||||
|
||||
|
||||
# First we build a query URL...
|
||||
eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
|
||||
|
||||
|
||||
# Then we assemble an URL that will search for and get the
|
||||
# unique, NCBI internal identifier,
|
||||
# for our refSeqID...
|
||||
URL <- paste(eUtilsBase,
|
||||
"esearch.fcgi?", # ...using the esearch program
|
||||
# that finds an entry in an
|
||||
# NCBI database
|
||||
"db=protein",
|
||||
"&term=", refSeqID,
|
||||
sep="")
|
||||
# Copy the URL and paste it into your browser to see
|
||||
# what the response should look like.
|
||||
URL
|
||||
|
||||
# To fetch a response in R, we use the function read_xml()
|
||||
# with our URL as its argument.
|
||||
( myXML <- xml2::read_xml(URL) )
|
||||
|
||||
# This is XML. We can take the response apart into
|
||||
# its individual components with the as_list() function.
|
||||
|
||||
xml2::as_list(myXML)
|
||||
|
||||
# Note how the XML "tree" is represented as a list of
|
||||
# lists of lists ...
|
||||
# If we know exactly what element we are looking for,
|
||||
# we can extract it from this structure:
|
||||
xml2::as_list(myXML)[["eSearchResult"]][["IdList"]][["Id"]][[1]]
|
||||
|
||||
# But this is not very robust, it would break with the
|
||||
# slightest change that the NCBI makes to their data format -
|
||||
# and the NCBI changes things A LOT!
|
||||
|
||||
# Somewhat more robust is to specify the type of element
|
||||
# we want - it's the text contained in an <Id>...</Id>
|
||||
# element, and use the XPath XML parsing language to
|
||||
# retrieve it.
|
||||
|
||||
xml2::xml_find_all(myXML, "//Id") # returns a "node set"
|
||||
|
||||
xml2::xml_text(xml2::xml_find_all(myXML, "//Id")) # returns the contents
|
||||
# of the node set
|
||||
|
||||
# We will need to do this more than once, so we write a function
|
||||
# for it...
|
||||
node2text <- function(doc, tag) {
|
||||
# Extractor for the text contents of all <tag> ... </tag> elements
|
||||
# in an XML response.
|
||||
#   doc: the XML document that is searched with xml2::xml_find_all()
|
||||
#   tag: the element name without angle brackets, e.g. "Id"
# Value: the contents of all matching elements, as a vector of strings.
|
||||
# XPath "//<tag>" selects matching elements anywhere in the document.
path <- paste0("//", tag)
|
||||
nodes <- xml2::xml_find_all(doc, path)
|
||||
return(xml2::xml_text(nodes))
|
||||
}
|
||||
|
||||
# using node2text() ...
|
||||
(GID <- node2text(myXML, "Id"))
|
||||
|
||||
# The GI is the pivot for data requests at the
|
||||
# NCBI.
|
||||
|
||||
# Let's first get the associated data for this GI
|
||||
URL <- paste0(eUtilsBase,
|
||||
"esummary.fcgi?",
|
||||
"db=protein",
|
||||
"&id=",
|
||||
GID,
|
||||
"&version=2.0")
|
||||
(myXML <- xml2::read_xml(URL))
|
||||
|
||||
(taxID <- node2text(myXML, "TaxId"))
|
||||
(organism <- node2text(myXML, "Organism"))
|
||||
|
||||
# This forms the base of a function that gets taxonomy data
|
||||
# from an Entrez result. You can write this!
|
||||
|
||||
|
||||
# == 1.1 Task - fetchNCBItaxData() function ================================
|
||||
|
||||
# Task: write a function that takes as input a RefSeq ID, fetches the taxonomy
|
||||
# information, returns a list with taxID and organism, if the operation is
|
||||
# successful, or a list of length 0 if there is an error.
|
||||
|
||||
|
||||
# = 2 Task solutions ======================================================
|
||||
|
||||
# I have placed such a function into the dbUtilities script: look it up by
|
||||
# clicking on dbFetchNCBItaxData() in the Environment pane.
|
||||
|
||||
# Test:
|
||||
dbFetchNCBItaxData("XP_001837394")
|
||||
|
||||
# Expected output:
|
||||
# ----------------
|
||||
# taxID organism
|
||||
# 1 240176 Coprinopsis cinerea okayama7#130
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "RPR-eUtils_XML.R"
|
||||
#
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-Scripting_data_downloads unit.
|
||||
#
|
||||
# Version: 1.2.1
|
||||
#
|
||||
# Date: 2017-10 - 2021-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2.1 2021 Maintenance
|
||||
# 1.2 2020 Updates
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 1.0 First ABC units version
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
#
|
||||
# == DO NOT SIMPLY source() THIS FILE! =======================================
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -----------------------------------------------------------
|
||||
#TOC> 1 Working with NCBI eUtils 43
|
||||
#TOC> 1.1 Task - fetchNCBItaxData() function 145
|
||||
#TOC> 2 Task solutions 152
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Working with NCBI eUtils ============================================
|
||||
|
||||
|
||||
# To begin, we load the xml2 package that contains functions
|
||||
# we need to receive and parse html data. NCBI's eUtils send information in
|
||||
# XML format so we need to be able to parse XML.
|
||||
if (! requireNamespace("xml2", quietly=TRUE)) {
|
||||
install.packages("xml2")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = xml2) # basic information
|
||||
# browseVignettes("xml2") # available vignettes
|
||||
# data(package = "xml2") # available datasets
|
||||
|
||||
|
||||
|
||||
# We will walk through the process with the refSeqID
|
||||
# of yeast Mbp1
|
||||
refSeqID <- "NP_010227"

# First we define the base URL that all eUtils queries share ...
eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

# Then we assemble a URL that will search for the
# unique, NCBI internal identifier
# for our refSeqID ...
# (paste0() concatenates without a separator, just like the esummary
# query that is built further below.)
URL <- paste0(eUtilsBase,
              "esearch.fcgi?",     # ...using the esearch program
                                   # that finds an entry in an
                                   # NCBI database
              "db=protein",
              "&term=", refSeqID)
|
||||
# Copy the URL and paste it into your browser to see
|
||||
# what the response should look like.
|
||||
URL
|
||||
|
||||
# To fetch a response in R, we use the function read_xml()
|
||||
# with our URL as its argument.
|
||||
( myXML <- xml2::read_xml(URL) )
|
||||
|
||||
# This is XML. We can take the response apart into
|
||||
# its individual components with the as_list() function.
|
||||
|
||||
xml2::as_list(myXML)
|
||||
|
||||
# Note how the XML "tree" is represented as a list of
|
||||
# lists of lists ...
|
||||
# If we know exactly what element we are looking for,
|
||||
# we can extract it from this structure:
|
||||
xml2::as_list(myXML)[["eSearchResult"]][["IdList"]][["Id"]][[1]]
|
||||
|
||||
# But this is not very robust, it would break with the
|
||||
# slightest change that the NCBI makes to their data format -
|
||||
# and the NCBI changes things A LOT!
|
||||
|
||||
# Somewhat more robust is to specify the type of element
|
||||
# we want - it's the text contained in an <Id>...</Id>
|
||||
# element, and use the XPath XML parsing language to
|
||||
# retrieve it.
|
||||
|
||||
xml2::xml_find_all(myXML, "//Id") # returns a "node set"
|
||||
|
||||
xml2::xml_text(xml2::xml_find_all(myXML, "//Id")) # returns the contents
|
||||
# of the node set
|
||||
|
||||
# We will need to do this more than once, so we write a function
|
||||
# for it...
|
||||
node2text <- function(doc, tag) {
  # Collect the text held by every <tag> element of an XML response.
  #   doc:  the XML object to search, e.g. the result of xml2::read_xml()
  #   tag:  the element name to look for, without angle brackets
  # The contents of all matching elements come back as a vector of strings.
  xpath <- paste0("//", tag)
  matches <- xml2::xml_find_all(doc, xpath)
  xml2::xml_text(matches)
}
|
||||
|
||||
# using node2text() ...
|
||||
(GID <- node2text(myXML, "Id"))
|
||||
|
||||
# The GI is the pivot for data requests at the
|
||||
# NCBI.
|
||||
|
||||
# Let's first get the associated data for this GI
|
||||
URL <- paste0(eUtilsBase,
|
||||
"esummary.fcgi?",
|
||||
"db=protein",
|
||||
"&id=",
|
||||
GID,
|
||||
"&version=2.0")
|
||||
(myXML <- xml2::read_xml(URL))
|
||||
|
||||
(taxID <- node2text(myXML, "TaxId"))
|
||||
(organism <- node2text(myXML, "Organism"))
|
||||
|
||||
# This forms the base of a function that gets taxonomy data
|
||||
# from an Entrez result. You can write this!
|
||||
|
||||
|
||||
# == 1.1 Task - fetchNCBItaxData() function ================================
|
||||
|
||||
# Task: write a function that takes as input a RefSeq ID, fetches the taxonomy
|
||||
# information, returns a list with taxID and organism, if the operation is
|
||||
# successful, or a list of length 0 if there is an error.
|
||||
|
||||
|
||||
# = 2 Task solutions ======================================================
|
||||
|
||||
# I have placed such a function into the dbUtilities script: look it up by
|
||||
# clicking on dbFetchNCBItaxData() in the Environment pane.
|
||||
|
||||
# Test:
|
||||
dbFetchNCBItaxData("XP_001837394")
|
||||
|
||||
# Expected output:
|
||||
# ----------------
|
||||
# taxID organism
|
||||
# 1 240176 Coprinopsis cinerea okayama7#130
|
||||
|
||||
|
||||
# [END]
|
||||
|
@ -1,10 +1,10 @@
|
||||
HEADER TEST 0TST 0TST 1
|
||||
REMARK A CATALOGUE OF ATOM AND HETATM RECORDS 0TST 2
|
||||
ATOM 1 N GLY 1 -6.253 75.745 53.559 1.00 36.34 0TST 3
|
||||
ATOM 2 CA GLY 1 -5.789 75.223 52.264 1.00 44.94 0TST 4
|
||||
ATOM 3 C GLY 1 -5.592 73.702 52.294 1.00 32.28 0TST 5
|
||||
ATOM 4 O GLY 1 -5.140 73.148 53.304 1.00 19.32 0TST 6
|
||||
TER 5 GLY 1 0TST 7
|
||||
HETATM 6 O HOH 1 -4.169 60.050 40.145 1.00 3.00 0TST 8
|
||||
HETATM 7 CA CA 1 -1.258 -71.579 50.253 1.00 3.00 0TST 9
|
||||
END 0TST 10
|
||||
HEADER TEST 0TST 0TST 1
|
||||
REMARK A CATALOGUE OF ATOM AND HETATM RECORDS 0TST 2
|
||||
ATOM 1 N GLY 1 -6.253 75.745 53.559 1.00 36.34 0TST 3
|
||||
ATOM 2 CA GLY 1 -5.789 75.223 52.264 1.00 44.94 0TST 4
|
||||
ATOM 3 C GLY 1 -5.592 73.702 52.294 1.00 32.28 0TST 5
|
||||
ATOM 4 O GLY 1 -5.140 73.148 53.304 1.00 19.32 0TST 6
|
||||
TER 5 GLY 1 0TST 7
|
||||
HETATM 6 O HOH 1 -4.169 60.050 40.145 1.00 3.00 0TST 8
|
||||
HETATM 7 CA CA 1 -1.258 -71.579 50.253 1.00 3.00 0TST 9
|
||||
END 0TST 10
|
||||
|
3104
data/1BM8.pdb
3104
data/1BM8.pdb
File diff suppressed because it is too large
Load Diff
@ -1,5 +1,5 @@
|
||||
>2F1C:X|PDBID|CHAIN|SEQUENCE
|
||||
EERNDWHFNIGAMYEIENVEGYGEDMDGLAEPSVYFNAANGPWRIALAYYQEGPVDYSAGKRGTWFDRPELEVHYQFLEN
|
||||
DDFSFGLTGGFRNYGYHYVDEPGKDTANMQRWKIAPDWDVKLTDDLRFNGWLSMYKFANDLNTTGYADTRVETETGLQYT
|
||||
FNETVALRVNYYLERGFNMDDSRNNGEFSTQEIRAYLPLTLGNHSVTPYTRIGLDRWSNWDWQDDIEREGHDFNRVGLFY
|
||||
>2F1C:X|PDBID|CHAIN|SEQUENCE
|
||||
EERNDWHFNIGAMYEIENVEGYGEDMDGLAEPSVYFNAANGPWRIALAYYQEGPVDYSAGKRGTWFDRPELEVHYQFLEN
|
||||
DDFSFGLTGGFRNYGYHYVDEPGKDTANMQRWKIAPDWDVKLTDDLRFNGWLSMYKFANDLNTTGYADTRVETETGLQYT
|
||||
FNETVALRVNYYLERGFNMDDSRNNGEFSTQEIRAYLPLTLGNHSVTPYTRIGLDRWSNWDWQDDIEREGHDFNRVGLFY
|
||||
GYDFQNGLSVSLEYAFEWQDHDEGDSDKFHYAGVGVNYSFHHHHHH
|
12
data/3FG7.fa
12
data/3FG7.fa
@ -1,6 +1,6 @@
|
||||
>3FG7:A|PDBID|CHAIN|SEQUENCE
|
||||
MAEEHHHHHHHHLEVLFQGPGRPKTHTVGSVAKVEQVKFDATSMHVKPQVAAQQKMVDDGSGEVQVWRIENLELVPVDSK
|
||||
WLGHFYGGDCYLLLYTYLIGEKQHYLLYVWQGSQASQDEITASAYQAVILDQKYNGEPVQIRVPMGKEPPHLMSIFKGRM
|
||||
VVYQGGTSRTNNLETGPSTRLFQVQGTGANNTKAFEVPARANFLNSNDVFVLKTQSCCYLWCGKGCSGDEREMAKMVADT
|
||||
ISRTEKQVVVEGQEPANFWMALGGKAPYANTKRLQEENLVITPRLFECSNKTGRFLATEIPDFNQDDLEEDDVFLLDVWD
|
||||
QVFFWIGKHANEEEKKAAATTAQEYLKTHPSGRDPETPIIVVKQGHEPPTFTGWFLAWDPFKWSGIHVVPNLSPLSNN
|
||||
>3FG7:A|PDBID|CHAIN|SEQUENCE
|
||||
MAEEHHHHHHHHLEVLFQGPGRPKTHTVGSVAKVEQVKFDATSMHVKPQVAAQQKMVDDGSGEVQVWRIENLELVPVDSK
|
||||
WLGHFYGGDCYLLLYTYLIGEKQHYLLYVWQGSQASQDEITASAYQAVILDQKYNGEPVQIRVPMGKEPPHLMSIFKGRM
|
||||
VVYQGGTSRTNNLETGPSTRLFQVQGTGANNTKAFEVPARANFLNSNDVFVLKTQSCCYLWCGKGCSGDEREMAKMVADT
|
||||
ISRTEKQVVVEGQEPANFWMALGGKAPYANTKRLQEENLVITPRLFECSNKTGRFLATEIPDFNQDDLEEDDVFLLDVWD
|
||||
QVFFWIGKHANEEEKKAAATTAQEYLKTHPSGRDPETPIIVVKQGHEPPTFTGWFLAWDPFKWSGIHVVPNLSPLSNN
|
||||
|
@ -1,20 +1,20 @@
|
||||
[
|
||||
{ "name" : "MBP1_SACCE",
|
||||
"RefSeqID" : "NP_010227",
|
||||
"UniProtID" : "P39678",
|
||||
"taxonomyID" : 559292,
|
||||
"sequence" : [
|
||||
"MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLKETHEKVQGGF",
|
||||
"GKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHASKVDRKKAIRSASTSAIMET",
|
||||
"KRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRRKLGVNLQRSQSDMGFPRPAIPNSSISTTQL",
|
||||
"PSIRSTMGPQSPTLGILEEERHDSRQQQPQQNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQ",
|
||||
"QSSLIQTQQTESMATSVSSSPSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKV",
|
||||
"NKYLSKLVDYFISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS",
|
||||
"IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTPSAVYYLDVVL",
|
||||
"SKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTTISNKEGLTANEIMNQQYEQM",
|
||||
"MIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSPVSPSDYITYPSQIATNISRNIPNVVNSMKQ",
|
||||
"MASIYNDLHEQHDNEIKSLQKTLKSISKTKIQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTK",
|
||||
"KLRKRLIRYKRLIKQKLEYRQTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSS",
|
||||
"LVKKFEDNAKIHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"]
|
||||
}
|
||||
]
|
||||
[
|
||||
{ "name" : "MBP1_SACCE",
|
||||
"RefSeqID" : "NP_010227",
|
||||
"UniProtID" : "P39678",
|
||||
"taxonomyID" : 559292,
|
||||
"sequence" : [
|
||||
"MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLKETHEKVQGGF",
|
||||
"GKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHASKVDRKKAIRSASTSAIMET",
|
||||
"KRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRRKLGVNLQRSQSDMGFPRPAIPNSSISTTQL",
|
||||
"PSIRSTMGPQSPTLGILEEERHDSRQQQPQQNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQ",
|
||||
"QSSLIQTQQTESMATSVSSSPSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKV",
|
||||
"NKYLSKLVDYFISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS",
|
||||
"IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTPSAVYYLDVVL",
|
||||
"SKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTTISNKEGLTANEIMNQQYEQM",
|
||||
"MIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSPVSPSDYITYPSQIATNISRNIPNVVNSMKQ",
|
||||
"MASIYNDLHEQHDNEIKSLQKTLKSISKTKIQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTK",
|
||||
"KLRKRLIRYKRLIKQKLEYRQTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSS",
|
||||
"LVKKFEDNAKIHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"]
|
||||
}
|
||||
]
|
||||
|
@ -1,30 +1,30 @@
|
||||
>PTPN5-201 cds:protein_coding (ENST00000358540.7)
|
||||
ATGAATTATGAGGGAGCCAGGAGTGAGAGAGAGAACCACGCTGCTGATGACTCCGAGGGA
|
||||
GGGGCCCTGGACATGTGCTGCAGTGAGAGGCTACCGGGTCTCCCCCAGCCGATAGTGATG
|
||||
GAGGCACTGGACGAGGCTGAAGGGCTCCAGGACTCACAGAGAGAGATGCCGCCACCCCCT
|
||||
CCTCCCTCGCCGCCCTCAGATCCAGCTCAGAAGCCACCACCTCGAGGCGCTGGGAGCCAC
|
||||
TCCCTCACTGTCAGGAGCAGCCTGTGCCTGTTCGCTGCCTCACAGTTCCTGCTTGCCTGT
|
||||
GGGGTGCTCTGGTTCAGCGGTTATGGCCACATCTGGTCACAGAACGCCACAAACCTCGTC
|
||||
TCCTCTTTGCTGACGCTCCTGAAACAGCTGGAACCCACGGCCTGGCTTGACTCTGGGACG
|
||||
TGGGGAGTCCCCAGTCTGCTGCTGGTCTTTCTGTCCGTGGGCCTGGTCCTCGTTACCACC
|
||||
CTGGTGTGGCACCTCCTGAGGACACCCCCAGAGCCACCCACCCCACTGCCCCCTGAGGAC
|
||||
AGGCGCCAGTCAGTGAGCCGCCAGCCCTCCTTCACCTACTCAGAGTGGATGGAGGAGAAG
|
||||
ATCGAGGATGACTTCCTGGACCTCGACCCGGTGCCCGAGACTCCTGTGTTTGATTGTGTG
|
||||
ATGGACATCAAGCCTGAGGCTGACCCCACCTCACTCACCGTCAAGTCCATGGGTCTGCAG
|
||||
GAGAGGAGGGGTTCCAATGTCTCCCTGACCCTGGACATGTGCACTCCGGGCTGCAACGAG
|
||||
GAGGGCTTTGGCTATCTCATGTCCCCACGTGAGGAGTCCGCCCGCGAGTACCTGCTCAGC
|
||||
GCCTCCCGTGTCCTCCAAGCAGAAGAGCTTCATGAAAAGGCCCTGGACCCTTTCCTGCTG
|
||||
CAGGCGGAATTCTTTGAAATCCCCATGAACTTTGTGGATCCGAAAGAGTACGACATCCCT
|
||||
GGGCTGGTGCGGAAGAACCGGTACAAAACCATACTTCCCAACCCTCACAGCAGAGTGTGT
|
||||
CTGACCTCACCAGACCCTGACGACCCTCTGAGTTCCTACATCAATGCCAACTACATCCGG
|
||||
GGCTATGGTGGGGAGGAGAAGGTGTACATCGCCACTCAGGGACCCATCGTCAGCACGGTC
|
||||
GCCGACTTCTGGCGCATGGTGTGGCAGGAGCACACGCCCATCATTGTCATGATCACCAAC
|
||||
ATCGAGGAGATGAACGAGAAATGCACCGAGTATTGGCCGGAGGAGCAGGTGGCGTACGAC
|
||||
GGTGTTGAGATCACTGTGCAGAAAGTCATTCACACGGAGGATTACCGGCTGCGACTCATC
|
||||
TCCCTCAAGAGTGGGACTGAGGAGCGAGGCCTGAAGCATTACTGGTTCACATCCTGGCCC
|
||||
GACCAGAAGACCCCAGACCGGGCCCCCCCACTCCTGCACCTGGTGCGGGAGGTGGAGGAG
|
||||
GCAGCCCAGCAGGAGGGGCCCCACTGTGCCCCCATCATCGTCCACTGCAGTGCAGGGATT
|
||||
GGGAGGACCGGCTGCTTCATTGCCACCAGCATCTGCTGCCAGCAGCTGCGGCAGGAGGGT
|
||||
GTGGTGGACATCCTGAAGACCACGTGCCAGCTCCGTCAGGACAGGGGCGGCATGATCCAG
|
||||
ACATGCGAGCAGTACCAGTTTGTGCACCACGTCATGAGCCTCTACGAAAAGCAGCTGTCC
|
||||
CACCAGTCCCCAGAATGA
|
||||
>PTPN5-201 cds:protein_coding (ENST00000358540.7)
|
||||
ATGAATTATGAGGGAGCCAGGAGTGAGAGAGAGAACCACGCTGCTGATGACTCCGAGGGA
|
||||
GGGGCCCTGGACATGTGCTGCAGTGAGAGGCTACCGGGTCTCCCCCAGCCGATAGTGATG
|
||||
GAGGCACTGGACGAGGCTGAAGGGCTCCAGGACTCACAGAGAGAGATGCCGCCACCCCCT
|
||||
CCTCCCTCGCCGCCCTCAGATCCAGCTCAGAAGCCACCACCTCGAGGCGCTGGGAGCCAC
|
||||
TCCCTCACTGTCAGGAGCAGCCTGTGCCTGTTCGCTGCCTCACAGTTCCTGCTTGCCTGT
|
||||
GGGGTGCTCTGGTTCAGCGGTTATGGCCACATCTGGTCACAGAACGCCACAAACCTCGTC
|
||||
TCCTCTTTGCTGACGCTCCTGAAACAGCTGGAACCCACGGCCTGGCTTGACTCTGGGACG
|
||||
TGGGGAGTCCCCAGTCTGCTGCTGGTCTTTCTGTCCGTGGGCCTGGTCCTCGTTACCACC
|
||||
CTGGTGTGGCACCTCCTGAGGACACCCCCAGAGCCACCCACCCCACTGCCCCCTGAGGAC
|
||||
AGGCGCCAGTCAGTGAGCCGCCAGCCCTCCTTCACCTACTCAGAGTGGATGGAGGAGAAG
|
||||
ATCGAGGATGACTTCCTGGACCTCGACCCGGTGCCCGAGACTCCTGTGTTTGATTGTGTG
|
||||
ATGGACATCAAGCCTGAGGCTGACCCCACCTCACTCACCGTCAAGTCCATGGGTCTGCAG
|
||||
GAGAGGAGGGGTTCCAATGTCTCCCTGACCCTGGACATGTGCACTCCGGGCTGCAACGAG
|
||||
GAGGGCTTTGGCTATCTCATGTCCCCACGTGAGGAGTCCGCCCGCGAGTACCTGCTCAGC
|
||||
GCCTCCCGTGTCCTCCAAGCAGAAGAGCTTCATGAAAAGGCCCTGGACCCTTTCCTGCTG
|
||||
CAGGCGGAATTCTTTGAAATCCCCATGAACTTTGTGGATCCGAAAGAGTACGACATCCCT
|
||||
GGGCTGGTGCGGAAGAACCGGTACAAAACCATACTTCCCAACCCTCACAGCAGAGTGTGT
|
||||
CTGACCTCACCAGACCCTGACGACCCTCTGAGTTCCTACATCAATGCCAACTACATCCGG
|
||||
GGCTATGGTGGGGAGGAGAAGGTGTACATCGCCACTCAGGGACCCATCGTCAGCACGGTC
|
||||
GCCGACTTCTGGCGCATGGTGTGGCAGGAGCACACGCCCATCATTGTCATGATCACCAAC
|
||||
ATCGAGGAGATGAACGAGAAATGCACCGAGTATTGGCCGGAGGAGCAGGTGGCGTACGAC
|
||||
GGTGTTGAGATCACTGTGCAGAAAGTCATTCACACGGAGGATTACCGGCTGCGACTCATC
|
||||
TCCCTCAAGAGTGGGACTGAGGAGCGAGGCCTGAAGCATTACTGGTTCACATCCTGGCCC
|
||||
GACCAGAAGACCCCAGACCGGGCCCCCCCACTCCTGCACCTGGTGCGGGAGGTGGAGGAG
|
||||
GCAGCCCAGCAGGAGGGGCCCCACTGTGCCCCCATCATCGTCCACTGCAGTGCAGGGATT
|
||||
GGGAGGACCGGCTGCTTCATTGCCACCAGCATCTGCTGCCAGCAGCTGCGGCAGGAGGGT
|
||||
GTGGTGGACATCCTGAAGACCACGTGCCAGCTCCGTCAGGACAGGGGCGGCATGATCCAG
|
||||
ACATGCGAGCAGTACCAGTTTGTGCACCACGTCATGAGCCTCTACGAAAAGCAGCTGTCC
|
||||
CACCAGTCCCCAGAATGA
|
||||
|
@ -1,12 +1,12 @@
|
||||
>RAB39B cds:protein_coding (ENST00000369454.4)
|
||||
ATGGAGGCCATCTGGCTGTACCAGTTCCGGCTCATTGTCATCGGGGATTCCACAGTGGGC
|
||||
AAGTCCTGCCTGATCCGCCGCTTCACCGAGGGTCGCTTTGCCCAGGTTTCTGACCCCACC
|
||||
GTGGGGGTGGATTTTTTCTCCCGCTTGGTGGAGATCGAGCCAGGAAAACGCATCAAGCTC
|
||||
CAGATCTGGGATACCGCGGGTCAAGAGAGGTTCAGATCCATCACTCGCGCCTACTACAGG
|
||||
AACTCAGTAGGTGGTCTTCTCTTATTTGACATTACCAACCGCAGGTCCTTCCAGAATGTC
|
||||
CATGAGTGGTTAGAAGAGACCAAAGTACACGTTCAGCCCTACCAAATTGTATTTGTTCTG
|
||||
GTGGGTCACAAGTGTGACCTGGATACACAGAGGCAAGTGACTCGCCACGAGGCCGAGAAA
|
||||
CTGGCTGCTGCATACGGCATGAAGTACATTGAAACGTCAGCCCGAGATGCCATTAATGTG
|
||||
GAGAAAGCCTTCACAGACCTGACAAGAGACATATATGAGCTGGTTAAAAGGGGGGAGATT
|
||||
ACAATCCAGGAGGGCTGGGAAGGGGTGAAGAGTGGATTTGTACCAAATGTGGTTCACTCT
|
||||
TCAGAAGAGGTTGTCAAATCAGAGAGGAGATGTTTGTGCTAG
|
||||
>RAB39B cds:protein_coding (ENST00000369454.4)
|
||||
ATGGAGGCCATCTGGCTGTACCAGTTCCGGCTCATTGTCATCGGGGATTCCACAGTGGGC
|
||||
AAGTCCTGCCTGATCCGCCGCTTCACCGAGGGTCGCTTTGCCCAGGTTTCTGACCCCACC
|
||||
GTGGGGGTGGATTTTTTCTCCCGCTTGGTGGAGATCGAGCCAGGAAAACGCATCAAGCTC
|
||||
CAGATCTGGGATACCGCGGGTCAAGAGAGGTTCAGATCCATCACTCGCGCCTACTACAGG
|
||||
AACTCAGTAGGTGGTCTTCTCTTATTTGACATTACCAACCGCAGGTCCTTCCAGAATGTC
|
||||
CATGAGTGGTTAGAAGAGACCAAAGTACACGTTCAGCCCTACCAAATTGTATTTGTTCTG
|
||||
GTGGGTCACAAGTGTGACCTGGATACACAGAGGCAAGTGACTCGCCACGAGGCCGAGAAA
|
||||
CTGGCTGCTGCATACGGCATGAAGTACATTGAAACGTCAGCCCGAGATGCCATTAATGTG
|
||||
GAGAAAGCCTTCACAGACCTGACAAGAGACATATATGAGCTGGTTAAAAGGGGGGAGATT
|
||||
ACAATCCAGGAGGGCTGGGAAGGGGTGAAGAGTGGATTTGTACCAAATGTGGTTCACTCT
|
||||
TCAGAAGAGGTTGTCAAATCAGAGAGGAGATGTTTGTGCTAG
|
||||
|
@ -1,131 +1,131 @@
|
||||
|
||||
|
||||
```{css, echo = FALSE}
|
||||
|
||||
.striped tr:nth-child(even) {
|
||||
background: #eaf1ff;
|
||||
}
|
||||
.striped {
|
||||
padding: 5px;
|
||||
}
|
||||
```
|
||||
<small>Random Phobias - .Rmd sample code for BCH441 at the University of Toronto. (c) Boris Steipe 2020 -->
|
||||
|
||||
|
||||
```{r setup, include=FALSE}
|
||||
knitr::opts_chunk$set(echo = TRUE)
|
||||
```
|
||||
|
||||
## Phobias! ##
|
||||
We all have some, but we could always use more. How to know them all? With this code we access the [Wikipedia list of phobias](https://en.wikipedia.org/wiki/List_of_phobias), scrape the contents and assemble a dataframe. Then we write a function to retrieve a random phobia, which we can subsequently ponder on - either to delight in the fact that we don't have that fear, or to add to our daily quota of anxieties <small>(like our well-founded [fear of bad programming practice](http://xkcd.com/292/))</small>.
|
||||
|
||||
To load the list, we will "screenscrape" the contents of Wikipedia's [List of Phobias](https://en.wikipedia.org/wiki/List_of_phobias). First, we install the `rvest` package and the `xml2` package from CRAN, if we don't have them yet.
|
||||
```{r packages}
|
||||
if (! requireNamespace("rvest", quietly=TRUE)) {
|
||||
install.packages("rvest")
|
||||
}
|
||||
if (! requireNamespace("xml2", quietly=TRUE)) {
|
||||
install.packages("xml2")
|
||||
}
|
||||
```
|
||||
As we customarily do, we avoid using the `library()` function to make the package contents accessible, but use the `package::` syntax instead. This makes our code more explicit and maintainable.
|
||||
|
||||
`xml2` handles reading and parsing of documents. The `rvest` package was designed for screenscraping and has functions to make our life very easy: it accesses the response of an `xml2` query, looks for all HTML formatted tables, parses them with an XPATH expression and returns them as lists from which we can get data frames.
|
||||
|
||||
```{r getPageData, cache=TRUE}
|
||||
webPage <- xml2::read_html("https://en.wikipedia.org/wiki/List_of_phobias")
|
||||
allTables <- rvest::html_table(webPage, fill = TRUE)
|
||||
```
|
||||
|
||||
There are ```r length(allTables)``` tables in the list, but the ones we are interested in are data frames with two columns named `Phobia` and `Condition`.
|
||||
|
||||
```{r collateTables, cache=TRUE}
|
||||
# Start from an empty data frame and append every scraped table whose
# column names are exactly "Phobia" and "Condition".
phobiaTable <- data.frame(Phobia = character(), Condition = character())
for (i in seq_along(allTables)) {
  df <- allTables[[i]]
  # identical() always yields a single TRUE or FALSE. An element-wise `==`
  # would recycle for tables with a different number of columns and give
  # NA for missing column names, which breaks the if() condition.
  if (identical(colnames(df), c("Phobia", "Condition"))) {
    phobiaTable <- rbind(phobiaTable, df)
  }
}
|
||||
```
|
||||
|
||||
Done, we collected ```r nrow(phobiaTable)``` phobias. Let's randomly select a few and print them.
|
||||
|
||||
<p>
|
||||
<p>
|
||||
|
||||
```{r , ref.label="randRow", echo=FALSE}
|
||||
```
|
||||
|
||||
**Table**: seven random phobias<br/>
|
||||
```{r renderPhobiaTable, echo=FALSE, results='asis'}
|
||||
sel <- sample(1:nrow(phobiaTable), 7)
|
||||
knitr::kable(phobiaTable[sel, ], table.attr = "class=\"striped\"", format = "html")
|
||||
```
|
||||
|
||||
<p>
|
||||
<p>
|
||||
To pick a single random phobia from the list, we take a (pseudo) random sample of size 1 from the row numbers of the `phobiaTable` object. Our function thus returns a random row from a matrix or dataframe, and it uses an optional argument: `seed`. This can either be Boolean `FALSE` (the default), or an integer that is used in R's `set.seed()` function.
|
||||
|
||||
```{r randRow}
|
||||
randRow <- function(M, seed = FALSE) {
  # Return one randomly chosen row from a matrix or data frame M.
  #
  # Parameters:
  #   M     a matrix or data frame
  #   seed  FALSE (the default) to draw from the current RNG stream, or an
  #         integer that is passed to set.seed() to make the draw
  #         reproducible. Any value other than FALSE, including 0, is
  #         treated as a seed.
  # Value:
  #   The selected row, M[i, ]. If M has no rows, the zero-row subset of M
  #   is returned.

  if (nrow(M) == 0) {
    return(M[integer(0), ])              # nothing to sample from
  }

  if (! identical(seed, FALSE)) {
    # Play nice: put the caller's RNG state back when we are done.
    # .Random.seed lives in the global environment, so it has to be
    # restored there - a plain assignment inside this function would only
    # create a local variable. It does not exist at all until the RNG has
    # been used once in the session; in that case we remove it again.
    if (exists(".Random.seed", envir = globalenv(), inherits = FALSE)) {
      oldSeed <- get(".Random.seed", envir = globalenv(), inherits = FALSE)
      on.exit(assign(".Random.seed", oldSeed, envir = globalenv()),
              add = TRUE)
    } else {
      on.exit(rm(".Random.seed", envir = globalenv()), add = TRUE)
    }
    set.seed(as.integer(seed))
  }

  # sample.int(n, 1) draws the same index as sample(1:n, 1) for n >= 1
  M[sample.int(nrow(M), 1), ]            # fetch one random row
}
|
||||
```
|
||||
<p>
|
||||
<p>
|
||||
With this useful tool we can ponder on our favourite phobia of the day. For today, let it be **`r randRow(phobiaTable, seed=1123581321)[2]`**, the `r randRow(phobiaTable, seed=1123581321)[1]`.
|
||||
|
||||
_`r randRow(phobiaTable, seed=1123581321)[1]`_! Really!!? Awful.
|
||||
|
||||
<p>
|
||||
<p>
|
||||
|
||||
Finally: let's plot a histogram of phobia name lengths just to illustrate plots. A little preprocessing is required, since some names collate synonyms, like _"Hypnophobia, somniphobia"_. We'll break these up.
|
||||
|
||||
```{r preProcess}
|
||||
|
||||
# select only single-word phobias that end with "phobia"
|
||||
sel <- ! grepl(" ", phobiaTable$Phobia) & grepl(".phobia$", phobiaTable$Phobia)
|
||||
names <- phobiaTable$Phobia[sel]
|
||||
|
||||
# extract the ones we did _not_ select
|
||||
x <- phobiaTable$Phobia[! sel]
|
||||
# use strsplit() to split them apart and flatten the resulting list
|
||||
x <- unlist(strsplit(x, ", "))
|
||||
x <- unlist(strsplit(x, " "))
|
||||
x <- unlist(strsplit(x, "/"))
|
||||
# use the same selection as above, and append the result to our "names"
|
||||
sel <- ! grepl(" ", x) & grepl(".phobia$", x)
|
||||
names <- c(names, x[sel])
|
||||
|
||||
```
|
||||
|
||||
Done, we collected ```r length(names)``` names for phobias. Here is a histogram of their lengths.
|
||||
|
||||
```{r showHist}
|
||||
|
||||
x <- nchar(names)
|
||||
pShort <- names[which(x == min(x))[1]] # pull out the shortest name ...
|
||||
pLong <- names[which(x == max(x))[1]] # ... and the longest name too.
|
||||
hist(x,
|
||||
main = "Length of phobia-names",
|
||||
sub = sprintf("Shortest: %s (%d), Longest: %s (%d)",
|
||||
pShort, nchar(pShort), pLong, nchar(pLong)),
|
||||
cex.sub = 0.8,
|
||||
xlab = "name",
|
||||
ylab = "counts",
|
||||
col ="#aef5ee")
|
||||
|
||||
```
|
||||
|
||||
That's all.
|
||||
|
||||
<!-- [END] -->
|
||||
|
||||
|
||||
```{css, echo = FALSE}
|
||||
|
||||
.striped tr:nth-child(even) {
|
||||
background: #eaf1ff;
|
||||
}
|
||||
.striped {
|
||||
padding: 5px;
|
||||
}
|
||||
```
|
||||
<small>Random Phobias - .Rmd sample code for BCH441 at the University of Toronto. (c) Boris Steipe 2020 -->
|
||||
|
||||
|
||||
```{r setup, include=FALSE}
|
||||
knitr::opts_chunk$set(echo = TRUE)
|
||||
```
|
||||
|
||||
## Phobias! ##
|
||||
We all have some, but we could always use more. How to know them all? With this code we access the [Wikipedia list of phobias](https://en.wikipedia.org/wiki/List_of_phobias), scrape the contents and assemble a dataframe. Then we write a function to retrieve a random phobia, which we can subsequently ponder on - either to delight in the fact that we don't have that fear, or to add to our daily quota of anxieties <small>(like our well-founded [fear of bad programming practice](http://xkcd.com/292/))</small>.
|
||||
|
||||
To load the list, we will "screenscrape" the contents of Wikipedia's [List of Phobias](https://en.wikipedia.org/wiki/List_of_phobias). First, we install the `rvest` package and the `xml2` package from CRAN, if we don't have them yet.
|
||||
```{r packages}
|
||||
if (! requireNamespace("rvest", quietly=TRUE)) {
|
||||
install.packages("rvest")
|
||||
}
|
||||
if (! requireNamespace("xml2", quietly=TRUE)) {
|
||||
install.packages("xml2")
|
||||
}
|
||||
```
|
||||
As we customarily do, we avoid using the `library()` function to make the package contents accessible, but use the `package::` syntax instead. This makes our code more explicit and maintainable.
|
||||
|
||||
`xml2` handles reading and parsing of documents. The `rvest` package was designed for screenscraping and has functions to make our life very easy: it accesses the response of an `xml2` query, looks for all HTML formatted tables, parses them with an XPATH expression and returns them as lists from which we can get data frames.
|
||||
|
||||
```{r getPageData, cache=TRUE}
|
||||
webPage <- xml2::read_html("https://en.wikipedia.org/wiki/List_of_phobias")
|
||||
allTables <- rvest::html_table(webPage, fill = TRUE)
|
||||
```
|
||||
|
||||
There are ```r length(allTables)``` tables in the list, but the ones we are interested in are data frames with two columns named `Phobia` and `Condition`.
|
||||
|
||||
```{r collateTables, cache=TRUE}
|
||||
# Start from an empty data frame and append every scraped table whose
# column names are exactly "Phobia" and "Condition".
phobiaTable <- data.frame(Phobia = character(), Condition = character())
for (i in seq_along(allTables)) {
  df <- allTables[[i]]
  # identical() always yields a single TRUE or FALSE. An element-wise `==`
  # would recycle for tables with a different number of columns and give
  # NA for missing column names, which breaks the if() condition.
  if (identical(colnames(df), c("Phobia", "Condition"))) {
    phobiaTable <- rbind(phobiaTable, df)
  }
}
|
||||
```
|
||||
|
||||
Done, we collected ```r nrow(phobiaTable)``` phobias. Let's randomly select a few and print them.
|
||||
|
||||
<p>
|
||||
<p>
|
||||
|
||||
```{r , ref.label="randRow", echo=FALSE}
|
||||
```
|
||||
|
||||
**Table**: seven random phobias<br/>
|
||||
```{r renderPhobiaTable, echo=FALSE, results='asis'}
|
||||
sel <- sample(1:nrow(phobiaTable), 7)
|
||||
knitr::kable(phobiaTable[sel, ], table.attr = "class=\"striped\"", format = "html")
|
||||
```
|
||||
|
||||
<p>
|
||||
<p>
|
||||
To pick a single random phobia from the list, we take a (pseudo) random sample of size 1 from the row numbers of the `phobiaTable` object. Our function thus returns a random row from a matrix or dataframe, and it uses an optional argument: `seed`. This can either be Boolean `FALSE` (the default), or an integer that is used in R's `set.seed()` function.
|
||||
|
||||
```{r randRow}
|
||||
randRow <- function(M, seed = FALSE) {
  # Return one randomly chosen row from a matrix or data frame M.
  #
  # Parameters:
  #   M     a matrix or data frame
  #   seed  FALSE (the default) to draw from the current RNG stream, or an
  #         integer that is passed to set.seed() to make the draw
  #         reproducible. Any value other than FALSE, including 0, is
  #         treated as a seed.
  # Value:
  #   The selected row, M[i, ]. If M has no rows, the zero-row subset of M
  #   is returned.

  if (nrow(M) == 0) {
    return(M[integer(0), ])              # nothing to sample from
  }

  if (! identical(seed, FALSE)) {
    # Play nice: put the caller's RNG state back when we are done.
    # .Random.seed lives in the global environment, so it has to be
    # restored there - a plain assignment inside this function would only
    # create a local variable. It does not exist at all until the RNG has
    # been used once in the session; in that case we remove it again.
    if (exists(".Random.seed", envir = globalenv(), inherits = FALSE)) {
      oldSeed <- get(".Random.seed", envir = globalenv(), inherits = FALSE)
      on.exit(assign(".Random.seed", oldSeed, envir = globalenv()),
              add = TRUE)
    } else {
      on.exit(rm(".Random.seed", envir = globalenv()), add = TRUE)
    }
    set.seed(as.integer(seed))
  }

  # sample.int(n, 1) draws the same index as sample(1:n, 1) for n >= 1
  M[sample.int(nrow(M), 1), ]            # fetch one random row
}
|
||||
```
|
||||
<p>
|
||||
<p>
|
||||
With this useful tool we can ponder on our favourite phobia of the day. For today, let it be **`r randRow(phobiaTable, seed=1123581321)[2]`**, the `r randRow(phobiaTable, seed=1123581321)[1]`.
|
||||
|
||||
_`r randRow(phobiaTable, seed=1123581321)[1]`_! Really!!? Awful.
|
||||
|
||||
<p>
|
||||
<p>
|
||||
|
||||
Finally: let's plot a histogram of phobia name lengths just to illustrate plots. A little preprocessing is required, since some names collate synonyms, like _"Hypnophobia, somniphobia"_. We'll break these up.
|
||||
|
||||
```{r preProcess}
|
||||
|
||||
# select only single-word phobias that end with "phobia"
|
||||
sel <- ! grepl(" ", phobiaTable$Phobia) & grepl(".phobia$", phobiaTable$Phobia)
|
||||
names <- phobiaTable$Phobia[sel]
|
||||
|
||||
# extract the ones we did _not_ select
|
||||
x <- phobiaTable$Phobia[! sel]
|
||||
# use strsplit() to split them apart and flatten the resulting list
|
||||
x <- unlist(strsplit(x, ", "))
|
||||
x <- unlist(strsplit(x, " "))
|
||||
x <- unlist(strsplit(x, "/"))
|
||||
# use the same selection as above, and append the result to our "names"
|
||||
sel <- ! grepl(" ", x) & grepl(".phobia$", x)
|
||||
names <- c(names, x[sel])
|
||||
|
||||
```
|
||||
|
||||
Done, we collected ```r length(names)``` names for phobias. Here is a histogram of their lengths.
|
||||
|
||||
```{r showHist}
|
||||
|
||||
x <- nchar(names)
|
||||
pShort <- names[which(x == min(x))[1]] # pull out the shortest name ...
|
||||
pLong <- names[which(x == max(x))[1]] # ... and the longest name too.
|
||||
hist(x,
|
||||
main = "Length of phobia-names",
|
||||
sub = sprintf("Shortest: %s (%d), Longest: %s (%d)",
|
||||
pShort, nchar(pShort), pLong, nchar(pLong)),
|
||||
cex.sub = 0.8,
|
||||
xlab = "name",
|
||||
ylab = "counts",
|
||||
col ="#aef5ee")
|
||||
|
||||
```
|
||||
|
||||
That's all.
|
||||
|
||||
<!-- [END] -->
|
||||
|
@ -1,43 +1,43 @@
|
||||
>MBP1 YDL056W SGDID:S000002214
|
||||
ATGTCTAACCAAATATACTCAGCGAGATATTCGGGGGTTGATGTTTATGAATTCATTCAT
|
||||
TCTACAGGATCTATCATGAAAAGGAAAAAGGATGATTGGGTCAATGCTACACATATTTTA
|
||||
AAGGCCGCCAATTTTGCCAAGGCTAAAAGAACAAGGATTCTAGAGAAGGAAGTACTTAAG
|
||||
GAAACTCATGAAAAAGTTCAGGGTGGATTTGGTAAATATCAGGGTACATGGGTCCCACTG
|
||||
AACATAGCGAAACAACTGGCAGAAAAATTTAGTGTCTACGATCAGCTGAAACCGTTGTTC
|
||||
GACTTTACGCAAACAGATGGGTCTGCTTCTCCACCTCCTGCTCCAAAACATCACCATGCC
|
||||
TCGAAGGTGGATAGGAAAAAGGCTATTAGAAGTGCAAGTACTTCCGCAATTATGGAAACA
|
||||
AAAAGAAACAACAAGAAAGCCGAGGAAAATCAATTTCAAAGCAGCAAAATATTGGGAAAT
|
||||
CCCACGGCTGCACCAAGGAAAAGAGGTAGACCGGTAGGATCTACGAGGGGAAGTAGGCGG
|
||||
AAGTTAGGTGTCAATTTACAACGTTCTCAAAGTGATATGGGATTTCCTAGACCGGCGATA
|
||||
CCGAATTCTTCAATATCGACAACGCAACTTCCCTCTATTAGATCCACCATGGGACCACAA
|
||||
TCCCCTACATTGGGTATTCTGGAAGAAGAAAGGCACGATTCTCGACAGCAGCAGCCGCAA
|
||||
CAAAATAATTCTGCACAGTTCAAAGAAATTGATCTTGAGGACGGCTTATCAAGCGATGTG
|
||||
GAACCTTCACAACAATTACAACAAGTTTTTAATCAAAATACTGGATTTGTACCCCAACAA
|
||||
CAATCTTCCTTGATACAGACACAGCAAACAGAATCAATGGCCACGTCCGTATCTTCCTCT
|
||||
CCTTCATTACCTACGTCACCGGGCGATTTTGCCGATAGTAATCCATTTGAAGAGCGATTT
|
||||
CCCGGTGGTGGAACATCTCCTATTATTTCCATGATCCCGCGTTATCCTGTAACTTCAAGG
|
||||
CCTCAAACATCGGATATTAATGATAAAGTTAACAAATACCTTTCAAAATTGGTTGATTAT
|
||||
TTTATTTCCAATGAAATGAAGTCAAATAAGTCCCTACCACAAGTGTTATTGCACCCACCT
|
||||
CCACACAGCGCTCCCTATATAGATGCTCCAATCGATCCAGAATTACATACTGCCTTCCAT
|
||||
TGGGCTTGTTCTATGGGTAATTTACCAATTGCTGAGGCGTTGTACGAAGCCGGAACAAGT
|
||||
ATCAGATCGACAAATTCTCAAGGCCAAACTCCATTGATGAGAAGTTCCTTATTCCACAAT
|
||||
TCATACACTAGAAGAACTTTCCCTAGAATTTTCCAGCTACTGCACGAGACCGTATTTGAT
|
||||
ATCGATTCGCAATCACAAACAGTAATTCACCATATTGTGAAACGAAAATCAACAACACCT
|
||||
TCTGCAGTTTATTATCTTGATGTTGTGCTATCTAAGATCAAGGATTTTTCCCCACAGTAT
|
||||
AGAATTGAATTACTTTTAAACACACAAGACAAAAATGGCGATACCGCACTTCATATTGCT
|
||||
TCTAAAAATGGAGATGTTGTTTTTTTTAATACACTGGTCAAAATGGGTGCATTAACTACT
|
||||
ATTTCCAATAAGGAAGGATTAACCGCCAATGAAATAATGAATCAACAATATGAGCAAATG
|
||||
ATGATACAAAATGGTACAAATCAACATGTCAATTCTTCAAACACGGACTTGAATATCCAC
|
||||
GTTAATACAAACAACATTGAAACGAAAAATGATGTTAATTCAATGGTAATCATGTCGCCT
|
||||
GTTTCTCCTTCGGATTACATAACCTATCCATCTCAAATTGCCACCAATATATCAAGAAAT
|
||||
ATTCCAAATGTAGTGAATTCTATGAAGCAAATGGCTAGCATATACAACGATCTTCATGAA
|
||||
CAGCATGACAACGAAATAAAAAGTTTGCAAAAAACTTTAAAAAGCATTTCTAAGACGAAA
|
||||
ATACAGGTAAGCCTAAAAACTTTAGAGGTATTGAAAGAGAGCAGTAAAGATGAAAACGGC
|
||||
GAAGCTCAGACTAATGATGACTTCGAAATTTTATCTCGTCTACAAGAACAAAATACTAAG
|
||||
AAATTGAGAAAAAGGCTCATACGATACAAACGGTTGATAAAACAAAAGCTGGAATACAGG
|
||||
CAAACGGTTTTATTGAACAAATTAATAGAAGATGAAACTCAGGCTACCACCAATAACACA
|
||||
GTTGAGAAAGATAATAATACGCTGGAAAGGTTGGAATTGGCTCAAGAACTAACGATGTTG
|
||||
CAATTACAAAGGAAAAACAAATTGAGTTCCTTGGTGAAGAAATTTGAAGACAATGCCAAG
|
||||
ATTCATAAATATAGACGGATTATCAGGGAAGGTACGGAAATGAATATTGAAGAAGTAGAT
|
||||
AGTTCGCTGGATGTAATACTACAGACATTGATAGCCAACAATAATAAAAATAAGGGCGCA
|
||||
>MBP1 YDL056W SGDID:S000002214
|
||||
ATGTCTAACCAAATATACTCAGCGAGATATTCGGGGGTTGATGTTTATGAATTCATTCAT
|
||||
TCTACAGGATCTATCATGAAAAGGAAAAAGGATGATTGGGTCAATGCTACACATATTTTA
|
||||
AAGGCCGCCAATTTTGCCAAGGCTAAAAGAACAAGGATTCTAGAGAAGGAAGTACTTAAG
|
||||
GAAACTCATGAAAAAGTTCAGGGTGGATTTGGTAAATATCAGGGTACATGGGTCCCACTG
|
||||
AACATAGCGAAACAACTGGCAGAAAAATTTAGTGTCTACGATCAGCTGAAACCGTTGTTC
|
||||
GACTTTACGCAAACAGATGGGTCTGCTTCTCCACCTCCTGCTCCAAAACATCACCATGCC
|
||||
TCGAAGGTGGATAGGAAAAAGGCTATTAGAAGTGCAAGTACTTCCGCAATTATGGAAACA
|
||||
AAAAGAAACAACAAGAAAGCCGAGGAAAATCAATTTCAAAGCAGCAAAATATTGGGAAAT
|
||||
CCCACGGCTGCACCAAGGAAAAGAGGTAGACCGGTAGGATCTACGAGGGGAAGTAGGCGG
|
||||
AAGTTAGGTGTCAATTTACAACGTTCTCAAAGTGATATGGGATTTCCTAGACCGGCGATA
|
||||
CCGAATTCTTCAATATCGACAACGCAACTTCCCTCTATTAGATCCACCATGGGACCACAA
|
||||
TCCCCTACATTGGGTATTCTGGAAGAAGAAAGGCACGATTCTCGACAGCAGCAGCCGCAA
|
||||
CAAAATAATTCTGCACAGTTCAAAGAAATTGATCTTGAGGACGGCTTATCAAGCGATGTG
|
||||
GAACCTTCACAACAATTACAACAAGTTTTTAATCAAAATACTGGATTTGTACCCCAACAA
|
||||
CAATCTTCCTTGATACAGACACAGCAAACAGAATCAATGGCCACGTCCGTATCTTCCTCT
|
||||
CCTTCATTACCTACGTCACCGGGCGATTTTGCCGATAGTAATCCATTTGAAGAGCGATTT
|
||||
CCCGGTGGTGGAACATCTCCTATTATTTCCATGATCCCGCGTTATCCTGTAACTTCAAGG
|
||||
CCTCAAACATCGGATATTAATGATAAAGTTAACAAATACCTTTCAAAATTGGTTGATTAT
|
||||
TTTATTTCCAATGAAATGAAGTCAAATAAGTCCCTACCACAAGTGTTATTGCACCCACCT
|
||||
CCACACAGCGCTCCCTATATAGATGCTCCAATCGATCCAGAATTACATACTGCCTTCCAT
|
||||
TGGGCTTGTTCTATGGGTAATTTACCAATTGCTGAGGCGTTGTACGAAGCCGGAACAAGT
|
||||
ATCAGATCGACAAATTCTCAAGGCCAAACTCCATTGATGAGAAGTTCCTTATTCCACAAT
|
||||
TCATACACTAGAAGAACTTTCCCTAGAATTTTCCAGCTACTGCACGAGACCGTATTTGAT
|
||||
ATCGATTCGCAATCACAAACAGTAATTCACCATATTGTGAAACGAAAATCAACAACACCT
|
||||
TCTGCAGTTTATTATCTTGATGTTGTGCTATCTAAGATCAAGGATTTTTCCCCACAGTAT
|
||||
AGAATTGAATTACTTTTAAACACACAAGACAAAAATGGCGATACCGCACTTCATATTGCT
|
||||
TCTAAAAATGGAGATGTTGTTTTTTTTAATACACTGGTCAAAATGGGTGCATTAACTACT
|
||||
ATTTCCAATAAGGAAGGATTAACCGCCAATGAAATAATGAATCAACAATATGAGCAAATG
|
||||
ATGATACAAAATGGTACAAATCAACATGTCAATTCTTCAAACACGGACTTGAATATCCAC
|
||||
GTTAATACAAACAACATTGAAACGAAAAATGATGTTAATTCAATGGTAATCATGTCGCCT
|
||||
GTTTCTCCTTCGGATTACATAACCTATCCATCTCAAATTGCCACCAATATATCAAGAAAT
|
||||
ATTCCAAATGTAGTGAATTCTATGAAGCAAATGGCTAGCATATACAACGATCTTCATGAA
|
||||
CAGCATGACAACGAAATAAAAAGTTTGCAAAAAACTTTAAAAAGCATTTCTAAGACGAAA
|
||||
ATACAGGTAAGCCTAAAAACTTTAGAGGTATTGAAAGAGAGCAGTAAAGATGAAAACGGC
|
||||
GAAGCTCAGACTAATGATGACTTCGAAATTTTATCTCGTCTACAAGAACAAAATACTAAG
|
||||
AAATTGAGAAAAAGGCTCATACGATACAAACGGTTGATAAAACAAAAGCTGGAATACAGG
|
||||
CAAACGGTTTTATTGAACAAATTAATAGAAGATGAAACTCAGGCTACCACCAATAACACA
|
||||
GTTGAGAAAGATAATAATACGCTGGAAAGGTTGGAATTGGCTCAAGAACTAACGATGTTG
|
||||
CAATTACAAAGGAAAAACAAATTGAGTTCCTTGGTGAAGAAATTTGAAGACAATGCCAAG
|
||||
ATTCATAAATATAGACGGATTATCAGGGAAGGTACGGAAATGAATATTGAAGAAGTAGAT
|
||||
AGTTCGCTGGATGTAATACTACAGACATTGATAGCCAACAATAATAAAAATAAGGGCGCA
|
||||
GAACAGATCATCACAATCTCAAACGCGAATAGTCATGCATAA
|
@ -1,47 +1,47 @@
|
||||
SGD_features.tab
|
||||
|
||||
The latest version of the SGD_features.tab file is based on Genome Version R64-2-1.
|
||||
|
||||
The SGD_features.tab file is updated weekly (Saturday).
|
||||
|
||||
NOTE: On 4 September 2004, the SGD_features.tab file replaced the previously
|
||||
used chromosomal_feature.tab file.
|
||||
|
||||
File contents:
|
||||
|
||||
1. Information on current chromosomal features in SGD, including Dubious ORFs.
|
||||
Also contains coordinates of intron, exons, and other subfeatures that are located
|
||||
within a chromosomal feature.
|
||||
|
||||
2. The relationship between subfeatures and the feature in which they
|
||||
are located is identified by the feature name in column #7 (parent
|
||||
feature). For example, the parent feature of the intron found in
|
||||
ACT1/YFL039C will be YFL039C. The parent feature of YFL039C is
|
||||
chromosome 6.
|
||||
|
||||
3. The coordinates of all features are in chromosomal coordinates.
|
||||
|
||||
|
||||
Columns within SGD_features.tab:
|
||||
|
||||
1. Primary SGDID (mandatory)
|
||||
2. Feature type (mandatory)
|
||||
3. Feature qualifier (optional)
|
||||
4. Feature name (optional)
|
||||
5. Standard gene name (optional)
|
||||
6. Alias (optional, multiples separated by |)
|
||||
7. Parent feature name (optional)
|
||||
8. Secondary SGDID (optional, multiples separated by |)
|
||||
9. Chromosome (optional)
|
||||
10. Start_coordinate (optional)
|
||||
11. Stop_coordinate (optional)
|
||||
12. Strand (optional)
|
||||
13. Genetic position (optional)
|
||||
14. Coordinate version (optional)
|
||||
15. Sequence version (optional)
|
||||
16. Description (optional)
|
||||
|
||||
Note that "chromosome 17" is the mitochondrial chromosome.
|
||||
|
||||
The SGD_features.tab file is complemented by GFF3 file saccharomyces_cerevisiae.gff
|
||||
|
||||
SGD_features.tab
|
||||
|
||||
The latest version of the SGD_features.tab file is based on Genome Version R64-2-1.
|
||||
|
||||
The SGD_features.tab file is updated weekly (Saturday).
|
||||
|
||||
NOTE: On 4 September 2004, the SGD_features.tab file replaced the previously
|
||||
used chromosomal_feature.tab file.
|
||||
|
||||
File contents:
|
||||
|
||||
1. Information on current chromosomal features in SGD, including Dubious ORFs.
|
||||
Also contains coordinates of intron, exons, and other subfeatures that are located
|
||||
within a chromosomal feature.
|
||||
|
||||
2. The relationship between subfeatures and the feature in which they
|
||||
are located is identified by the feature name in column #7 (parent
|
||||
feature). For example, the parent feature of the intron found in
|
||||
ACT1/YFL039C will be YFL039C. The parent feature of YFL039C is
|
||||
chromosome 6.
|
||||
|
||||
3. The coordinates of all features are in chromosomal coordinates.
|
||||
|
||||
|
||||
Columns within SGD_features.tab:
|
||||
|
||||
1. Primary SGDID (mandatory)
|
||||
2. Feature type (mandatory)
|
||||
3. Feature qualifier (optional)
|
||||
4. Feature name (optional)
|
||||
5. Standard gene name (optional)
|
||||
6. Alias (optional, multiples separated by |)
|
||||
7. Parent feature name (optional)
|
||||
8. Secondary SGDID (optional, multiples separated by |)
|
||||
9. Chromosome (optional)
|
||||
10. Start_coordinate (optional)
|
||||
11. Stop_coordinate (optional)
|
||||
12. Strand (optional)
|
||||
13. Genetic position (optional)
|
||||
14. Coordinate version (optional)
|
||||
15. Sequence version (optional)
|
||||
16. Description (optional)
|
||||
|
||||
Note that "chromosome 17" is the mitochondrial chromosome.
|
||||
|
||||
The SGD_features.tab file is complemented by GFF3 file saccharomyces_cerevisiae.gff
|
||||
|
||||
|
32908
data/SGD_features.tab
32908
data/SGD_features.tab
File diff suppressed because it is too large
Load Diff
2030
data/Species.csv
2030
data/Species.csv
File diff suppressed because it is too large
Load Diff
@ -1,179 +1,179 @@
|
||||
MUTS_PAM STRAND MOST_SEVERE START MUTS_PAM_SAMPLES REF MUTS_CS ALT AA_CHANGE CHR MUTS_CS_SAMPLES PROTEIN_POS GENE TRANSCRIPT
|
||||
93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000311936
|
||||
93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000557334
|
||||
93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000256078
|
||||
93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000556131
|
||||
86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000311936
|
||||
86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000557334
|
||||
86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000556131
|
||||
86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000256078
|
||||
72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000556131
|
||||
72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000256078
|
||||
72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000557334
|
||||
72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000311936
|
||||
63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000557334
|
||||
63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000556131
|
||||
63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000256078
|
||||
63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000311936
|
||||
36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000311936
|
||||
36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000256078
|
||||
36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000556131
|
||||
36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000557334
|
||||
24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000256078
|
||||
24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000311936
|
||||
24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000557334
|
||||
24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000556131
|
||||
23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000556131
|
||||
23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000311936
|
||||
23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000557334
|
||||
23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000256078
|
||||
16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000556131
|
||||
16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000311936
|
||||
16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000557334
|
||||
16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000256078
|
||||
13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000311936
|
||||
13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000556131
|
||||
13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000557334
|
||||
13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000256078
|
||||
11 + missense_variant 25380275 11 T 11 G Q/H 12 11 61 ENSG00000133703 ENST00000311936
|
||||
11 + missense_variant 25380275 11 T 11 G Q/H 12 11 61 ENSG00000133703 ENST00000256078
|
||||
10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000557334
|
||||
10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000311936
|
||||
10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000556131
|
||||
10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000256078
|
||||
9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000557334
|
||||
9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000556131
|
||||
9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000311936
|
||||
9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000256078
|
||||
7 + missense_variant 25380276 7 T 7 A Q/L 12 7 61 ENSG00000133703 ENST00000256078
|
||||
7 + missense_variant 25378562 7 C 7 T A/T 12 7 146 ENSG00000133703 ENST00000256078
|
||||
7 + missense_variant 25378562 7 C 7 T A/T 12 7 146 ENSG00000133703 ENST00000311936
|
||||
7 + missense_variant 25380276 7 T 7 A Q/L 12 7 61 ENSG00000133703 ENST00000311936
|
||||
5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000311936
|
||||
5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000256078
|
||||
5 + missense_variant 25380276 5 T 5 C Q/R 12 5 61 ENSG00000133703 ENST00000311936
|
||||
5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000557334
|
||||
5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000556131
|
||||
5 + missense_variant 25380276 5 T 5 C Q/R 12 5 61 ENSG00000133703 ENST00000256078
|
||||
4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000256078
|
||||
4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000557334
|
||||
4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000311936
|
||||
4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000556131
|
||||
3 + missense_variant 25380277 3 G 3 T Q/K 12 3 61 ENSG00000133703 ENST00000256078
|
||||
3 + missense_variant 25380275 3 T 3 A Q/H 12 3 61 ENSG00000133703 ENST00000256078
|
||||
3 + missense_variant 25378647 3 T 3 G K/N 12 3 117 ENSG00000133703 ENST00000256078
|
||||
3 + missense_variant 25380275 3 T 3 A Q/H 12 3 61 ENSG00000133703 ENST00000311936
|
||||
3 + missense_variant 25378647 3 T 3 G K/N 12 3 117 ENSG00000133703 ENST00000311936
|
||||
3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000256078
|
||||
3 - missense_variant 25380275 3 A 3 C Q/H 12 3 61 ENSG00000133703 ENST00000256078
|
||||
3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000256078
|
||||
3 + missense_variant 25380277 3 G 3 T Q/K 12 3 61 ENSG00000133703 ENST00000311936
|
||||
3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000311936
|
||||
3 - missense_variant 25380275 3 A 3 C Q/H 12 3 61 ENSG00000133703 ENST00000311936
|
||||
3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000311936
|
||||
3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000556131
|
||||
3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000557334
|
||||
3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000557334
|
||||
3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000556131
|
||||
2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000556131
|
||||
2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000311936
|
||||
2 - missense_variant 25380276 2 A 2 G Q/R 12 2 61 ENSG00000133703 ENST00000311936
|
||||
2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000557334
|
||||
2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000556131
|
||||
2 - missense_variant 25378562 2 G 2 A A/T 12 2 146 ENSG00000133703 ENST00000311936
|
||||
2 - missense_variant 25378562 2 G 2 A A/T 12 2 146 ENSG00000133703 ENST00000256078
|
||||
2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000256078
|
||||
2 - missense_variant 25380276 2 A 2 G Q/R 12 2 61 ENSG00000133703 ENST00000256078
|
||||
2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000311936
|
||||
2 + missense_variant 25378561 2 G 2 A A/V 12 2 146 ENSG00000133703 ENST00000311936
|
||||
2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000256078
|
||||
2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000557334
|
||||
2 + missense_variant 25378561 2 G 2 A A/V 12 2 146 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000557334
|
||||
1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000557334
|
||||
1 - missense_variant 25362743 1 A 1 T S/C 12 1 72 ENSG00000133703 ENST00000557334
|
||||
1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000557334
|
||||
1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000557334
|
||||
1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000557334
|
||||
0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000557334
|
||||
1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000557334
|
||||
0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000557334
|
||||
0 + synonymous_variant 25380278 0 A 1 G - 12 1 60 ENSG00000133703 ENST00000311936
|
||||
1 - missense_variant 25378647 1 A 1 T K/N 12 1 117 ENSG00000133703 ENST00000256078
|
||||
1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000256078
|
||||
1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25362743 1 A 1 G C/R 12 1 185 ENSG00000133703 ENST00000311936
|
||||
0 + inframe_deletion 25362744 0 CTTTGT 1 - - 12 1 183-184 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25378557 1 C 1 G K/N 12 1 147 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25378562 1 C 1 G A/P 12 1 146 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25378562 1 C 1 T A/T 12 1 146.0 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25378594 1 C 1 G R/T 12 1 135 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25378645 1 C 1 G C/S 12 1 118 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25380240 1 C 1 A R/M 12 1 73.0 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25380254 1 C 1 A R/S 12 1 68 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25380271 1 C 1 T E/K 12 1 63.0 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25380274 1 C 1 T E/K 12 1 62 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25380275 1 T 1 G Q/H 12 1 61.0 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25380282 1 G 1 C A/G 12 1 59 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25380271 1 C 1 T E/K 12 1 63.0 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25380274 1 C 1 T E/K 12 1 62 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25380275 1 T 1 G Q/H 12 1 61.0 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25380277 1 GA 1 TT GQ/GK 12 1 60-61 ENSG00000133703 ENST00000256078
|
||||
0 + synonymous_variant 25380278 0 A 1 G - 12 1 60 ENSG00000133703 ENST00000256078
|
||||
0 + synonymous_variant 25380278 0 A 1 T - 12 1 60 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25380282 1 G 1 T A/E 12 1 59 ENSG00000133703 ENST00000256078
|
||||
0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000256078
|
||||
0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000256078
|
||||
0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25380277 1 GA 1 TT GQ/GK 12 1 60-61 ENSG00000133703 ENST00000311936
|
||||
0 + synonymous_variant 25380278 0 A 1 T - 12 1 60 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25380240 1 C 1 A R/M 12 1 73.0 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25380282 1 G 1 C A/G 12 1 59 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000556131
|
||||
0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000556131
|
||||
1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000556131
|
||||
0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000556131
|
||||
1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000556131
|
||||
1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000556131
|
||||
1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000556131
|
||||
1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000556131
|
||||
1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000556131
|
||||
1 + missense_variant 25362743 1 A 1 G C/R 12 1 72 ENSG00000133703 ENST00000557334
|
||||
0 + inframe_deletion 25362744 0 CTTTGT 1 - - 12 1 70-71 ENSG00000133703 ENST00000557334
|
||||
1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000557334
|
||||
1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000557334
|
||||
0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000557334
|
||||
1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000557334
|
||||
0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000556131
|
||||
1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000556131
|
||||
1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000556131
|
||||
1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25380282 1 G 1 T A/E 12 1 59 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000311936
|
||||
0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000311936
|
||||
0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000311936
|
||||
0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000311936
|
||||
1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000311936
|
||||
1 - missense_variant 25362743 1 A 1 T S/C 12 1 185 ENSG00000133703 ENST00000311936
|
||||
1 - missense_variant 25378647 1 A 1 T K/N 12 1 117 ENSG00000133703 ENST00000311936
|
||||
1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25380254 1 C 1 A R/S 12 1 68 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25378645 1 C 1 G C/S 12 1 118 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25378594 1 C 1 G R/T 12 1 135 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25368454 1 C 1 T R/Q 12 1 164 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25368473 1 T 1 C T/A 12 1 158 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25378557 1 C 1 G K/N 12 1 147 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25378562 1 C 1 G A/P 12 1 146 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25378562 1 C 1 T A/T 12 1 146.0 ENSG00000133703 ENST00000256078
|
||||
MUTS_PAM STRAND MOST_SEVERE START MUTS_PAM_SAMPLES REF MUTS_CS ALT AA_CHANGE CHR MUTS_CS_SAMPLES PROTEIN_POS GENE TRANSCRIPT
|
||||
93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000311936
|
||||
93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000557334
|
||||
93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000256078
|
||||
93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000556131
|
||||
86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000311936
|
||||
86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000557334
|
||||
86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000556131
|
||||
86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000256078
|
||||
72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000556131
|
||||
72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000256078
|
||||
72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000557334
|
||||
72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000311936
|
||||
63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000557334
|
||||
63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000556131
|
||||
63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000256078
|
||||
63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000311936
|
||||
36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000311936
|
||||
36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000256078
|
||||
36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000556131
|
||||
36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000557334
|
||||
24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000256078
|
||||
24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000311936
|
||||
24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000557334
|
||||
24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000556131
|
||||
23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000556131
|
||||
23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000311936
|
||||
23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000557334
|
||||
23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000256078
|
||||
16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000556131
|
||||
16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000311936
|
||||
16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000557334
|
||||
16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000256078
|
||||
13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000311936
|
||||
13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000556131
|
||||
13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000557334
|
||||
13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000256078
|
||||
11 + missense_variant 25380275 11 T 11 G Q/H 12 11 61 ENSG00000133703 ENST00000311936
|
||||
11 + missense_variant 25380275 11 T 11 G Q/H 12 11 61 ENSG00000133703 ENST00000256078
|
||||
10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000557334
|
||||
10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000311936
|
||||
10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000556131
|
||||
10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000256078
|
||||
9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000557334
|
||||
9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000556131
|
||||
9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000311936
|
||||
9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000256078
|
||||
7 + missense_variant 25380276 7 T 7 A Q/L 12 7 61 ENSG00000133703 ENST00000256078
|
||||
7 + missense_variant 25378562 7 C 7 T A/T 12 7 146 ENSG00000133703 ENST00000256078
|
||||
7 + missense_variant 25378562 7 C 7 T A/T 12 7 146 ENSG00000133703 ENST00000311936
|
||||
7 + missense_variant 25380276 7 T 7 A Q/L 12 7 61 ENSG00000133703 ENST00000311936
|
||||
5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000311936
|
||||
5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000256078
|
||||
5 + missense_variant 25380276 5 T 5 C Q/R 12 5 61 ENSG00000133703 ENST00000311936
|
||||
5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000557334
|
||||
5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000556131
|
||||
5 + missense_variant 25380276 5 T 5 C Q/R 12 5 61 ENSG00000133703 ENST00000256078
|
||||
4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000256078
|
||||
4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000557334
|
||||
4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000311936
|
||||
4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000556131
|
||||
3 + missense_variant 25380277 3 G 3 T Q/K 12 3 61 ENSG00000133703 ENST00000256078
|
||||
3 + missense_variant 25380275 3 T 3 A Q/H 12 3 61 ENSG00000133703 ENST00000256078
|
||||
3 + missense_variant 25378647 3 T 3 G K/N 12 3 117 ENSG00000133703 ENST00000256078
|
||||
3 + missense_variant 25380275 3 T 3 A Q/H 12 3 61 ENSG00000133703 ENST00000311936
|
||||
3 + missense_variant 25378647 3 T 3 G K/N 12 3 117 ENSG00000133703 ENST00000311936
|
||||
3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000256078
|
||||
3 - missense_variant 25380275 3 A 3 C Q/H 12 3 61 ENSG00000133703 ENST00000256078
|
||||
3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000256078
|
||||
3 + missense_variant 25380277 3 G 3 T Q/K 12 3 61 ENSG00000133703 ENST00000311936
|
||||
3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000311936
|
||||
3 - missense_variant 25380275 3 A 3 C Q/H 12 3 61 ENSG00000133703 ENST00000311936
|
||||
3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000311936
|
||||
3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000556131
|
||||
3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000557334
|
||||
3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000557334
|
||||
3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000556131
|
||||
2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000556131
|
||||
2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000311936
|
||||
2 - missense_variant 25380276 2 A 2 G Q/R 12 2 61 ENSG00000133703 ENST00000311936
|
||||
2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000557334
|
||||
2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000556131
|
||||
2 - missense_variant 25378562 2 G 2 A A/T 12 2 146 ENSG00000133703 ENST00000311936
|
||||
2 - missense_variant 25378562 2 G 2 A A/T 12 2 146 ENSG00000133703 ENST00000256078
|
||||
2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000256078
|
||||
2 - missense_variant 25380276 2 A 2 G Q/R 12 2 61 ENSG00000133703 ENST00000256078
|
||||
2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000311936
|
||||
2 + missense_variant 25378561 2 G 2 A A/V 12 2 146 ENSG00000133703 ENST00000311936
|
||||
2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000256078
|
||||
2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000557334
|
||||
2 + missense_variant 25378561 2 G 2 A A/V 12 2 146 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000557334
|
||||
1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000557334
|
||||
1 - missense_variant 25362743 1 A 1 T S/C 12 1 72 ENSG00000133703 ENST00000557334
|
||||
1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000557334
|
||||
1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000557334
|
||||
1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000557334
|
||||
0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000557334
|
||||
1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000557334
|
||||
0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000557334
|
||||
0 + synonymous_variant 25380278 0 A 1 G - 12 1 60 ENSG00000133703 ENST00000311936
|
||||
1 - missense_variant 25378647 1 A 1 T K/N 12 1 117 ENSG00000133703 ENST00000256078
|
||||
1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000256078
|
||||
1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25362743 1 A 1 G C/R 12 1 185 ENSG00000133703 ENST00000311936
|
||||
0 + inframe_deletion 25362744 0 CTTTGT 1 - - 12 1 183-184 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25378557 1 C 1 G K/N 12 1 147 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25378562 1 C 1 G A/P 12 1 146 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25378562 1 C 1 T A/T 12 1 146.0 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25378594 1 C 1 G R/T 12 1 135 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25378645 1 C 1 G C/S 12 1 118 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25380240 1 C 1 A R/M 12 1 73.0 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25380254 1 C 1 A R/S 12 1 68 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25380271 1 C 1 T E/K 12 1 63.0 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25380274 1 C 1 T E/K 12 1 62 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25380275 1 T 1 G Q/H 12 1 61.0 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25380282 1 G 1 C A/G 12 1 59 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25380271 1 C 1 T E/K 12 1 63.0 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25380274 1 C 1 T E/K 12 1 62 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25380275 1 T 1 G Q/H 12 1 61.0 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25380277 1 GA 1 TT GQ/GK 12 1 60-61 ENSG00000133703 ENST00000256078
|
||||
0 + synonymous_variant 25380278 0 A 1 G - 12 1 60 ENSG00000133703 ENST00000256078
|
||||
0 + synonymous_variant 25380278 0 A 1 T - 12 1 60 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25380282 1 G 1 T A/E 12 1 59 ENSG00000133703 ENST00000256078
|
||||
0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000256078
|
||||
0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000256078
|
||||
0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25380277 1 GA 1 TT GQ/GK 12 1 60-61 ENSG00000133703 ENST00000311936
|
||||
0 + synonymous_variant 25380278 0 A 1 T - 12 1 60 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25380240 1 C 1 A R/M 12 1 73.0 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25380282 1 G 1 C A/G 12 1 59 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000556131
|
||||
0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000556131
|
||||
1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000556131
|
||||
0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000556131
|
||||
1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000556131
|
||||
1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000556131
|
||||
1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000556131
|
||||
1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000556131
|
||||
1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000556131
|
||||
1 + missense_variant 25362743 1 A 1 G C/R 12 1 72 ENSG00000133703 ENST00000557334
|
||||
0 + inframe_deletion 25362744 0 CTTTGT 1 - - 12 1 70-71 ENSG00000133703 ENST00000557334
|
||||
1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000557334
|
||||
1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000557334
|
||||
0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000557334
|
||||
1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000557334
|
||||
0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000556131
|
||||
1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000556131
|
||||
1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000556131
|
||||
1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25380282 1 G 1 T A/E 12 1 59 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000311936
|
||||
0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000311936
|
||||
0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000311936
|
||||
0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000311936
|
||||
1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000311936
|
||||
1 - missense_variant 25362743 1 A 1 T S/C 12 1 185 ENSG00000133703 ENST00000311936
|
||||
1 - missense_variant 25378647 1 A 1 T K/N 12 1 117 ENSG00000133703 ENST00000311936
|
||||
1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000311936
|
||||
1 + missense_variant 25380254 1 C 1 A R/S 12 1 68 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25378645 1 C 1 G C/S 12 1 118 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25378594 1 C 1 G R/T 12 1 135 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25368454 1 C 1 T R/Q 12 1 164 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25368473 1 T 1 C T/A 12 1 158 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25378557 1 C 1 G K/N 12 1 147 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25378562 1 C 1 G A/P 12 1 146 ENSG00000133703 ENST00000256078
|
||||
1 + missense_variant 25378562 1 C 1 T A/T 12 1 146.0 ENSG00000133703 ENST00000256078
|
||||
|
|
@ -1,49 +1,49 @@
|
||||
MUTS_PAM STRAND MOST_SEVERE START MUTS_PAM_SAMPLES REF MUTS_CS ALT AA_CHANGE CHR MUTS_CS_SAMPLES PROTEIN_POS GENE TRANSCRIPT
|
||||
2 + missense_variant 3119330 2 G 2 A R/Q 17 2 139 ENSG00000172146 ENST00000304094
|
||||
2 + missense_variant 3119138 2 C 2 T S/L 17 2 75 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119772 0 C 2 T - 17 2 286 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119791 1 C 1 T R/W 17 1 293 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119799 1 G 1 A M/I 17 1 295 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119805 0 T 1 C - 17 1 297 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119823 0 C 1 T - 17 1 303 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119786 1 G 1 A R/K 17 1 291 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119744 1 C 1 G T/R 17 1 277 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119691 0 C 1 T - 17 1 259 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119589 0 C 1 T - 17 1 225 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119408 1 G 1 A S/N 17 1 165 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119431 1 G 1 A E/K 17 1 173 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119462 1 C 1 T P/L 17 1 183 ENSG00000172146 ENST00000304094
|
||||
1 + stop_gained 3119514 1 C 1 G - 17 1 200 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119530 1 T 1 G F/V 17 1 206 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119581 1 A 1 G T/A 17 1 223 ENSG00000172146 ENST00000304094
|
||||
1 + stop_gained 3119590 1 C 1 T - 17 1 226 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119679 1 G 1 T M/I 17 1 255 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119592 0 G 1 A - 17 1 226 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119596 1 C 1 T P/S 17 1 228 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119610 0 C 1 T - 17 1 232 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119627 1 C 1 T S/F 17 1 238 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119640 0 C 1 A - 17 1 242 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119672 1 C 1 T T/I 17 1 253 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119395 1 C 1 A L/M 17 1 161 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119403 0 A 1 G - 17 1 163 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119386 1 C 1 T P/S 17 1 158 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119289 0 C 1 A - 17 1 125 ENSG00000172146 ENST00000304094
|
||||
1 + stop_gained 3118972 1 C 1 T - 17 1 20 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3118978 1 G 1 A E/K 17 1 22 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3118986 1 A 1 C E/D 17 1 24 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119002 1 C 1 T L/F 17 1 30 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119029 0 T 1 C - 17 1 39 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119074 1 C 1 T R/C 17 1 54 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119075 1 G 1 A R/H 17 1 54 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119076 0 C 1 T - 17 1 54 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119115 0 C 1 T - 17 1 67 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119139 0 G 1 A - 17 1 75 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119187 0 C 1 T - 17 1 91 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119210 1 C 1 T T/M 17 1 99 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119217 1 G 1 A M/I 17 1 101 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119264 1 C 1 T A/V 17 1 117 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119269 1 G 1 A A/T 17 1 119 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3118961 1 G 1 A G/E 17 1 16 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3118956 0 C 1 A - 17 1 14 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3118944 0 G 1 A - 17 1 10 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3118928 1 A 1 C N/T 17 1 5 ENSG00000172146 ENST00000304094
|
||||
MUTS_PAM STRAND MOST_SEVERE START MUTS_PAM_SAMPLES REF MUTS_CS ALT AA_CHANGE CHR MUTS_CS_SAMPLES PROTEIN_POS GENE TRANSCRIPT
|
||||
2 + missense_variant 3119330 2 G 2 A R/Q 17 2 139 ENSG00000172146 ENST00000304094
|
||||
2 + missense_variant 3119138 2 C 2 T S/L 17 2 75 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119772 0 C 2 T - 17 2 286 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119791 1 C 1 T R/W 17 1 293 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119799 1 G 1 A M/I 17 1 295 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119805 0 T 1 C - 17 1 297 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119823 0 C 1 T - 17 1 303 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119786 1 G 1 A R/K 17 1 291 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119744 1 C 1 G T/R 17 1 277 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119691 0 C 1 T - 17 1 259 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119589 0 C 1 T - 17 1 225 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119408 1 G 1 A S/N 17 1 165 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119431 1 G 1 A E/K 17 1 173 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119462 1 C 1 T P/L 17 1 183 ENSG00000172146 ENST00000304094
|
||||
1 + stop_gained 3119514 1 C 1 G - 17 1 200 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119530 1 T 1 G F/V 17 1 206 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119581 1 A 1 G T/A 17 1 223 ENSG00000172146 ENST00000304094
|
||||
1 + stop_gained 3119590 1 C 1 T - 17 1 226 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119679 1 G 1 T M/I 17 1 255 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119592 0 G 1 A - 17 1 226 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119596 1 C 1 T P/S 17 1 228 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119610 0 C 1 T - 17 1 232 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119627 1 C 1 T S/F 17 1 238 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119640 0 C 1 A - 17 1 242 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119672 1 C 1 T T/I 17 1 253 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119395 1 C 1 A L/M 17 1 161 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119403 0 A 1 G - 17 1 163 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119386 1 C 1 T P/S 17 1 158 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119289 0 C 1 A - 17 1 125 ENSG00000172146 ENST00000304094
|
||||
1 + stop_gained 3118972 1 C 1 T - 17 1 20 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3118978 1 G 1 A E/K 17 1 22 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3118986 1 A 1 C E/D 17 1 24 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119002 1 C 1 T L/F 17 1 30 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119029 0 T 1 C - 17 1 39 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119074 1 C 1 T R/C 17 1 54 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119075 1 G 1 A R/H 17 1 54 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119076 0 C 1 T - 17 1 54 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119115 0 C 1 T - 17 1 67 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119139 0 G 1 A - 17 1 75 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3119187 0 C 1 T - 17 1 91 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119210 1 C 1 T T/M 17 1 99 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119217 1 G 1 A M/I 17 1 101 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119264 1 C 1 T A/V 17 1 117 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3119269 1 G 1 A A/T 17 1 119 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3118961 1 G 1 A G/E 17 1 16 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3118956 0 C 1 A - 17 1 14 ENSG00000172146 ENST00000304094
|
||||
0 + synonymous_variant 3118944 0 G 1 A - 17 1 10 ENSG00000172146 ENST00000304094
|
||||
1 + missense_variant 3118928 1 A 1 C N/T 17 1 5 ENSG00000172146 ENST00000304094
|
||||
|
|
@ -1,113 +1,113 @@
|
||||
MUTS_PAM STRAND MOST_SEVERE START MUTS_PAM_SAMPLES REF MUTS_CS ALT AA_CHANGE CHR MUTS_CS_SAMPLES PROTEIN_POS GENE TRANSCRIPT
|
||||
5 + missense_variant 112926888 5 G 5 T G/V 12 5 503 ENSG00000179295 ENST00000351677
|
||||
4 + missense_variant 112926270 4 C 4 T T/M 12 4 468 ENSG00000179295 ENST00000351677
|
||||
3 + missense_variant 112888198 3 G 3 A A/T 12 3 72 ENSG00000179295 ENST00000392597
|
||||
3 + missense_variant 112888198 3 G 3 A A/T 12 3 72 ENSG00000179295 ENST00000351677
|
||||
2 + missense_variant 112926910 2 G 2 C Q/H 12 2 510 ENSG00000179295 ENST00000351677
|
||||
2 + missense_variant 112926909 2 A 2 T Q/L 12 2 510 ENSG00000179295 ENST00000351677
|
||||
2 + missense_variant 112926900 2 C 2 A T/K 12 2 507 ENSG00000179295 ENST00000351677
|
||||
2 + missense_variant 112891006 2 C 2 T H/Y 12 2 114 ENSG00000179295 ENST00000392597
|
||||
2 + missense_variant 112888210 2 G 2 A E/K 12 2 76 ENSG00000179295 ENST00000392597
|
||||
2 + missense_variant 112888199 2 C 2 T A/V 12 2 72 ENSG00000179295 ENST00000392597
|
||||
2 + missense_variant 112888199 2 C 2 A A/D 12 2 72 ENSG00000179295 ENST00000392597
|
||||
2 + missense_variant 112891006 2 C 2 T H/Y 12 2 114 ENSG00000179295 ENST00000351677
|
||||
2 + missense_variant 112888210 2 G 2 A E/K 12 2 76 ENSG00000179295 ENST00000351677
|
||||
2 + missense_variant 112888199 2 C 2 T A/V 12 2 72 ENSG00000179295 ENST00000351677
|
||||
2 + missense_variant 112888199 2 C 2 A A/D 12 2 72 ENSG00000179295 ENST00000351677
|
||||
0 + synonymous_variant 112893822 0 T 1 C - 12 1 82 ENSG00000179295 ENST00000530818
|
||||
1 + missense_variant 112910837 1 C 1 G I/M 12 1 282 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112910844 1 T 1 G F/V 12 1 285.0 ENSG00000179295 ENST00000392597
|
||||
0 + synonymous_variant 112915507 0 A 1 G - 12 1 302 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112915523 1 A 1 G N/D 12 1 308 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112915743 1 A 1 G N/S 12 1 339 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112919908 1 T 1 G Y/D 12 1 375 ENSG00000179295 ENST00000392597
|
||||
1 + frameshift_variant 112920002 1 - 1 T - 12 1 406 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112924286 1 C 1 T T/M 12 1 411 ENSG00000179295 ENST00000392597
|
||||
1 + stop_gained 112924308 1 C 1 A - 12 1 418 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112924331 1 A 1 T H/L 12 1 426 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112924336 1 G 1 A V/M 12 1 428 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112892383 1 G 1 C V/L 12 1 26 ENSG00000179295 ENST00000530818
|
||||
0 + synonymous_variant 112892409 0 T 1 C - 12 1 34 ENSG00000179295 ENST00000530818
|
||||
1 + stop_gained 112893784 1 G 1 T - 12 1 70 ENSG00000179295 ENST00000530818
|
||||
0 + synonymous_variant 112893798 0 A 1 G - 12 1 74 ENSG00000179295 ENST00000530818
|
||||
1 + missense_variant 112910775 1 C 1 T L/F 12 1 262 ENSG00000179295 ENST00000392597
|
||||
0 + synonymous_variant 112893822 0 T 1 C - 12 1 237 ENSG00000179295 ENST00000392597
|
||||
0 + synonymous_variant 112893802 0 C 1 A - 12 1 231 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112888211 1 A 1 C E/A 12 1 76 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112888165 1 G 1 T D/Y 12 1 61 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112888189 1 G 1 A E/K 12 1 69.0 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112888189 1 G 1 A E/K 12 1 69 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112888195 1 T 1 C F/L 12 1 71 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112888197 1 T 1 A F/L 12 1 71 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112888211 1 A 1 C E/A 12 1 76.0 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112891015 1 C 1 T L/F 12 1 117 ENSG00000179295 ENST00000392597
|
||||
0 + synonymous_variant 112893798 0 A 1 G - 12 1 229 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112891073 1 T 1 A L/H 12 1 136 ENSG00000179295 ENST00000392597
|
||||
0 + synonymous_variant 112891116 0 T 1 C - 12 1 150 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112891129 1 G 1 T D/Y 12 1 155 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112892383 1 G 1 C V/L 12 1 181 ENSG00000179295 ENST00000392597
|
||||
0 + synonymous_variant 112892409 0 T 1 C - 12 1 189 ENSG00000179295 ENST00000392597
|
||||
1 + stop_gained 112893784 1 G 1 T - 12 1 225 ENSG00000179295 ENST00000392597
|
||||
0 + synonymous_variant 112893802 0 C 1 A - 12 1 76 ENSG00000179295 ENST00000530818
|
||||
1 + missense_variant 112888163 1 G 1 T G/V 12 1 60 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112888165 1 G 1 A D/N 12 1 61 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112888162 1 G 1 C G/R 12 1 60 ENSG00000179295 ENST00000392597
|
||||
0 + synonymous_variant 112893822 0 T 1 C - 12 1 237 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112888165 1 G 1 T D/Y 12 1 61 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112888189 1 G 1 A E/K 12 1 69.0 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112888189 1 G 1 A E/K 12 1 69 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112888195 1 T 1 C F/L 12 1 71 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112888197 1 T 1 A F/L 12 1 71 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112888211 1 A 1 C E/A 12 1 76.0 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112888211 1 A 1 C E/A 12 1 76 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112891015 1 C 1 T L/F 12 1 117 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112891073 1 T 1 A L/H 12 1 136 ENSG00000179295 ENST00000351677
|
||||
0 + synonymous_variant 112891116 0 T 1 C - 12 1 150 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112891129 1 G 1 T D/Y 12 1 155 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112892383 1 G 1 C V/L 12 1 181 ENSG00000179295 ENST00000351677
|
||||
0 + synonymous_variant 112892409 0 T 1 C - 12 1 189 ENSG00000179295 ENST00000351677
|
||||
1 + stop_gained 112893784 1 G 1 T - 12 1 225 ENSG00000179295 ENST00000351677
|
||||
0 + synonymous_variant 112893798 0 A 1 G - 12 1 229 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112888165 1 G 1 A D/N 12 1 61 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112888163 1 G 1 T G/V 12 1 60 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112888162 1 G 1 C G/R 12 1 60 ENSG00000179295 ENST00000351677
|
||||
0 + synonymous_variant 112888161 0 T 1 C - 12 1 59 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112884103 1 G 1 A G/D 12 1 13 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112888139 1 C 1 G T/S 12 1 52 ENSG00000179295 ENST00000351677
|
||||
0 + synonymous_variant 112893802 0 C 1 A - 12 1 231 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112910775 1 C 1 T L/F 12 1 262 ENSG00000179295 ENST00000351677
|
||||
0 + synonymous_variant 112888161 0 T 1 C - 12 1 59 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112910837 1 C 1 G I/M 12 1 282 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112926887 1 G 1 C G/R 12 1 503 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112926908 1 C 1 G Q/E 12 1 510.0 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112939963 1 G 1 C G/R 12 1 539 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112939970 1 A 1 T E/V 12 1 541 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112939981 1 A 1 C I/L 12 1 545 ENSG00000179295 ENST00000351677
|
||||
0 + synonymous_variant 112939993 0 C 1 T - 12 1 549 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112939999 1 G 1 A D/N 12 1 551 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112940012 1 G 1 A G/E 12 1 555 ENSG00000179295 ENST00000351677
|
||||
0 + synonymous_variant 112940025 0 T 1 C - 12 1 559 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112940027 1 T 1 C L/P 12 1 560 ENSG00000179295 ENST00000351677
|
||||
0 + synonymous_variant 112940031 0 G 1 A - 12 1 561 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112940036 1 G 1 T C/F 12 1 563 ENSG00000179295 ENST00000351677
|
||||
0 + synonymous_variant 112940052 0 C 1 T - 12 1 568 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112884103 1 G 1 A G/D 12 1 13 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112888139 1 C 1 G T/S 12 1 52 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112926885 1 C 1 T S/L 12 1 502 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112926884 1 T 1 C S/P 12 1 502 ENSG00000179295 ENST00000351677
|
||||
0 + synonymous_variant 112926862 0 C 1 T - 12 1 494 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112924286 1 C 1 T T/M 12 1 411 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112910844 1 T 1 G F/V 12 1 285.0 ENSG00000179295 ENST00000351677
|
||||
0 + synonymous_variant 112915507 0 A 1 G - 12 1 302 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112915523 1 A 1 G N/D 12 1 308 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112915743 1 A 1 G N/S 12 1 339 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112919908 1 T 1 G Y/D 12 1 375 ENSG00000179295 ENST00000351677
|
||||
1 + frameshift_variant 112920002 1 - 1 T - 12 1 406 ENSG00000179295 ENST00000351677
|
||||
1 + stop_gained 112924308 1 C 1 A - 12 1 418 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112926852 1 C 1 T P/L 12 1 491 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112924331 1 A 1 T H/L 12 1 426 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112924336 1 G 1 A V/M 12 1 428 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112926248 1 G 1 A A/T 12 1 461 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112926249 1 C 1 G A/G 12 1 461 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112926291 1 TT 1 CA L/P 12 1 475 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112926839 1 G 1 T D/Y 12 1 487 ENSG00000179295 ENST00000351677
|
||||
MUTS_PAM STRAND MOST_SEVERE START MUTS_PAM_SAMPLES REF MUTS_CS ALT AA_CHANGE CHR MUTS_CS_SAMPLES PROTEIN_POS GENE TRANSCRIPT
|
||||
5 + missense_variant 112926888 5 G 5 T G/V 12 5 503 ENSG00000179295 ENST00000351677
|
||||
4 + missense_variant 112926270 4 C 4 T T/M 12 4 468 ENSG00000179295 ENST00000351677
|
||||
3 + missense_variant 112888198 3 G 3 A A/T 12 3 72 ENSG00000179295 ENST00000392597
|
||||
3 + missense_variant 112888198 3 G 3 A A/T 12 3 72 ENSG00000179295 ENST00000351677
|
||||
2 + missense_variant 112926910 2 G 2 C Q/H 12 2 510 ENSG00000179295 ENST00000351677
|
||||
2 + missense_variant 112926909 2 A 2 T Q/L 12 2 510 ENSG00000179295 ENST00000351677
|
||||
2 + missense_variant 112926900 2 C 2 A T/K 12 2 507 ENSG00000179295 ENST00000351677
|
||||
2 + missense_variant 112891006 2 C 2 T H/Y 12 2 114 ENSG00000179295 ENST00000392597
|
||||
2 + missense_variant 112888210 2 G 2 A E/K 12 2 76 ENSG00000179295 ENST00000392597
|
||||
2 + missense_variant 112888199 2 C 2 T A/V 12 2 72 ENSG00000179295 ENST00000392597
|
||||
2 + missense_variant 112888199 2 C 2 A A/D 12 2 72 ENSG00000179295 ENST00000392597
|
||||
2 + missense_variant 112891006 2 C 2 T H/Y 12 2 114 ENSG00000179295 ENST00000351677
|
||||
2 + missense_variant 112888210 2 G 2 A E/K 12 2 76 ENSG00000179295 ENST00000351677
|
||||
2 + missense_variant 112888199 2 C 2 T A/V 12 2 72 ENSG00000179295 ENST00000351677
|
||||
2 + missense_variant 112888199 2 C 2 A A/D 12 2 72 ENSG00000179295 ENST00000351677
|
||||
0 + synonymous_variant 112893822 0 T 1 C - 12 1 82 ENSG00000179295 ENST00000530818
|
||||
1 + missense_variant 112910837 1 C 1 G I/M 12 1 282 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112910844 1 T 1 G F/V 12 1 285.0 ENSG00000179295 ENST00000392597
|
||||
0 + synonymous_variant 112915507 0 A 1 G - 12 1 302 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112915523 1 A 1 G N/D 12 1 308 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112915743 1 A 1 G N/S 12 1 339 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112919908 1 T 1 G Y/D 12 1 375 ENSG00000179295 ENST00000392597
|
||||
1 + frameshift_variant 112920002 1 - 1 T - 12 1 406 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112924286 1 C 1 T T/M 12 1 411 ENSG00000179295 ENST00000392597
|
||||
1 + stop_gained 112924308 1 C 1 A - 12 1 418 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112924331 1 A 1 T H/L 12 1 426 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112924336 1 G 1 A V/M 12 1 428 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112892383 1 G 1 C V/L 12 1 26 ENSG00000179295 ENST00000530818
|
||||
0 + synonymous_variant 112892409 0 T 1 C - 12 1 34 ENSG00000179295 ENST00000530818
|
||||
1 + stop_gained 112893784 1 G 1 T - 12 1 70 ENSG00000179295 ENST00000530818
|
||||
0 + synonymous_variant 112893798 0 A 1 G - 12 1 74 ENSG00000179295 ENST00000530818
|
||||
1 + missense_variant 112910775 1 C 1 T L/F 12 1 262 ENSG00000179295 ENST00000392597
|
||||
0 + synonymous_variant 112893822 0 T 1 C - 12 1 237 ENSG00000179295 ENST00000392597
|
||||
0 + synonymous_variant 112893802 0 C 1 A - 12 1 231 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112888211 1 A 1 C E/A 12 1 76 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112888165 1 G 1 T D/Y 12 1 61 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112888189 1 G 1 A E/K 12 1 69.0 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112888189 1 G 1 A E/K 12 1 69 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112888195 1 T 1 C F/L 12 1 71 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112888197 1 T 1 A F/L 12 1 71 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112888211 1 A 1 C E/A 12 1 76.0 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112891015 1 C 1 T L/F 12 1 117 ENSG00000179295 ENST00000392597
|
||||
0 + synonymous_variant 112893798 0 A 1 G - 12 1 229 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112891073 1 T 1 A L/H 12 1 136 ENSG00000179295 ENST00000392597
|
||||
0 + synonymous_variant 112891116 0 T 1 C - 12 1 150 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112891129 1 G 1 T D/Y 12 1 155 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112892383 1 G 1 C V/L 12 1 181 ENSG00000179295 ENST00000392597
|
||||
0 + synonymous_variant 112892409 0 T 1 C - 12 1 189 ENSG00000179295 ENST00000392597
|
||||
1 + stop_gained 112893784 1 G 1 T - 12 1 225 ENSG00000179295 ENST00000392597
|
||||
0 + synonymous_variant 112893802 0 C 1 A - 12 1 76 ENSG00000179295 ENST00000530818
|
||||
1 + missense_variant 112888163 1 G 1 T G/V 12 1 60 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112888165 1 G 1 A D/N 12 1 61 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112888162 1 G 1 C G/R 12 1 60 ENSG00000179295 ENST00000392597
|
||||
0 + synonymous_variant 112893822 0 T 1 C - 12 1 237 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112888165 1 G 1 T D/Y 12 1 61 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112888189 1 G 1 A E/K 12 1 69.0 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112888189 1 G 1 A E/K 12 1 69 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112888195 1 T 1 C F/L 12 1 71 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112888197 1 T 1 A F/L 12 1 71 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112888211 1 A 1 C E/A 12 1 76.0 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112888211 1 A 1 C E/A 12 1 76 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112891015 1 C 1 T L/F 12 1 117 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112891073 1 T 1 A L/H 12 1 136 ENSG00000179295 ENST00000351677
|
||||
0 + synonymous_variant 112891116 0 T 1 C - 12 1 150 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112891129 1 G 1 T D/Y 12 1 155 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112892383 1 G 1 C V/L 12 1 181 ENSG00000179295 ENST00000351677
|
||||
0 + synonymous_variant 112892409 0 T 1 C - 12 1 189 ENSG00000179295 ENST00000351677
|
||||
1 + stop_gained 112893784 1 G 1 T - 12 1 225 ENSG00000179295 ENST00000351677
|
||||
0 + synonymous_variant 112893798 0 A 1 G - 12 1 229 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112888165 1 G 1 A D/N 12 1 61 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112888163 1 G 1 T G/V 12 1 60 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112888162 1 G 1 C G/R 12 1 60 ENSG00000179295 ENST00000351677
|
||||
0 + synonymous_variant 112888161 0 T 1 C - 12 1 59 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112884103 1 G 1 A G/D 12 1 13 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112888139 1 C 1 G T/S 12 1 52 ENSG00000179295 ENST00000351677
|
||||
0 + synonymous_variant 112893802 0 C 1 A - 12 1 231 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112910775 1 C 1 T L/F 12 1 262 ENSG00000179295 ENST00000351677
|
||||
0 + synonymous_variant 112888161 0 T 1 C - 12 1 59 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112910837 1 C 1 G I/M 12 1 282 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112926887 1 G 1 C G/R 12 1 503 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112926908 1 C 1 G Q/E 12 1 510.0 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112939963 1 G 1 C G/R 12 1 539 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112939970 1 A 1 T E/V 12 1 541 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112939981 1 A 1 C I/L 12 1 545 ENSG00000179295 ENST00000351677
|
||||
0 + synonymous_variant 112939993 0 C 1 T - 12 1 549 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112939999 1 G 1 A D/N 12 1 551 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112940012 1 G 1 A G/E 12 1 555 ENSG00000179295 ENST00000351677
|
||||
0 + synonymous_variant 112940025 0 T 1 C - 12 1 559 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112940027 1 T 1 C L/P 12 1 560 ENSG00000179295 ENST00000351677
|
||||
0 + synonymous_variant 112940031 0 G 1 A - 12 1 561 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112940036 1 G 1 T C/F 12 1 563 ENSG00000179295 ENST00000351677
|
||||
0 + synonymous_variant 112940052 0 C 1 T - 12 1 568 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112884103 1 G 1 A G/D 12 1 13 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112888139 1 C 1 G T/S 12 1 52 ENSG00000179295 ENST00000392597
|
||||
1 + missense_variant 112926885 1 C 1 T S/L 12 1 502 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112926884 1 T 1 C S/P 12 1 502 ENSG00000179295 ENST00000351677
|
||||
0 + synonymous_variant 112926862 0 C 1 T - 12 1 494 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112924286 1 C 1 T T/M 12 1 411 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112910844 1 T 1 G F/V 12 1 285.0 ENSG00000179295 ENST00000351677
|
||||
0 + synonymous_variant 112915507 0 A 1 G - 12 1 302 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112915523 1 A 1 G N/D 12 1 308 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112915743 1 A 1 G N/S 12 1 339 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112919908 1 T 1 G Y/D 12 1 375 ENSG00000179295 ENST00000351677
|
||||
1 + frameshift_variant 112920002 1 - 1 T - 12 1 406 ENSG00000179295 ENST00000351677
|
||||
1 + stop_gained 112924308 1 C 1 A - 12 1 418 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112926852 1 C 1 T P/L 12 1 491 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112924331 1 A 1 T H/L 12 1 426 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112924336 1 G 1 A V/M 12 1 428 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112926248 1 G 1 A A/T 12 1 461 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112926249 1 C 1 G A/G 12 1 461 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112926291 1 TT 1 CA L/P 12 1 475 ENSG00000179295 ENST00000351677
|
||||
1 + missense_variant 112926839 1 G 1 T D/Y 12 1 487 ENSG00000179295 ENST00000351677
|
||||
|
|
@ -1,39 +1,39 @@
|
||||
>MBP1_ASPNI AN3154 XP_660758 Q5B8H6
|
||||
-VYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRI
|
||||
LEREVQKGVHEKVQGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDY
|
||||
|
||||
>MBP1_BIPOR COCMIDRAFT_338 XP_007682304 W6ZM86
|
||||
KIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRI
|
||||
LEREVQKGVHEKVQGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDY
|
||||
|
||||
>MBP1_COPCI - XP_001837394 A8NYC6
|
||||
QIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRV
|
||||
LEREVQKGEHEKVQGGYGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEF
|
||||
|
||||
>MBP1_CRYNE - XP_569090 Q5KMQ9
|
||||
DYVPTSVSPPPAPKHSVA--PPSKARRDKEKETGRTKATPSRTGPTSAAA
|
||||
LQAQAQLN-RAKMHDSTPDADASFRSFEERVSLTEDDSSSDTPSPVASV
|
||||
|
||||
>MBP1_NEUCR Swi4 XP_955821 Q7RW59
|
||||
-IYSATYSGIPVWEYQFGVDHVMRRRHDDWVNATHILKAAGFDKPARTRI
|
||||
LEREVQKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEF
|
||||
|
||||
>MBP1_PUCGR PGTG_08863 XP_003327086 E3KED4
|
||||
-IYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRV
|
||||
LEREIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNF
|
||||
|
||||
>MBP1_SACCE Mbp1 NP_010227 P39678
|
||||
QIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRI
|
||||
LEKEVLKETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDF
|
||||
|
||||
>MBP1_SCHPO Res2 NP_593032 P41412
|
||||
-VHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRV
|
||||
LERQVQIGAHEKVQGGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILS-
|
||||
|
||||
>MBP1_USTMA UMAG_11222 XP_011392621 A0A0D1DP35
|
||||
-IFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRV
|
||||
LEREIQKGIHEKVQGGYGKYQGTWIPLDVAIELAERYNIQGLLQPITSY
|
||||
|
||||
>MBP1_WALME - XP_006957051 I4YGC0
|
||||
-IYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRI
|
||||
LEREVQKGTHEKVQGGYGKYQGTWIPMERSVELARQYRIELLLDPIINY
|
||||
>MBP1_ASPNI AN3154 XP_660758 Q5B8H6
|
||||
-VYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRI
|
||||
LEREVQKGVHEKVQGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDY
|
||||
|
||||
>MBP1_BIPOR COCMIDRAFT_338 XP_007682304 W6ZM86
|
||||
KIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRI
|
||||
LEREVQKGVHEKVQGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDY
|
||||
|
||||
>MBP1_COPCI - XP_001837394 A8NYC6
|
||||
QIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRV
|
||||
LEREVQKGEHEKVQGGYGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEF
|
||||
|
||||
>MBP1_CRYNE - XP_569090 Q5KMQ9
|
||||
DYVPTSVSPPPAPKHSVA--PPSKARRDKEKETGRTKATPSRTGPTSAAA
|
||||
LQAQAQLN-RAKMHDSTPDADASFRSFEERVSLTEDDSSSDTPSPVASV
|
||||
|
||||
>MBP1_NEUCR Swi4 XP_955821 Q7RW59
|
||||
-IYSATYSGIPVWEYQFGVDHVMRRRHDDWVNATHILKAAGFDKPARTRI
|
||||
LEREVQKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEF
|
||||
|
||||
>MBP1_PUCGR PGTG_08863 XP_003327086 E3KED4
|
||||
-IYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRV
|
||||
LEREIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNF
|
||||
|
||||
>MBP1_SACCE Mbp1 NP_010227 P39678
|
||||
QIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRI
|
||||
LEKEVLKETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDF
|
||||
|
||||
>MBP1_SCHPO Res2 NP_593032 P41412
|
||||
-VHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRV
|
||||
LERQVQIGAHEKVQGGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILS-
|
||||
|
||||
>MBP1_USTMA UMAG_11222 XP_011392621 A0A0D1DP35
|
||||
-IFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRV
|
||||
LEREIQKGIHEKVQGGYGKYQGTWIPLDVAIELAERYNIQGLLQPITSY
|
||||
|
||||
>MBP1_WALME - XP_006957051 I4YGC0
|
||||
-IYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRI
|
||||
LEREVQKGTHEKVQGGYGKYQGTWIPMERSVELARQYRIELLLDPIINY
|
||||
|
@ -1,490 +1,490 @@
|
||||
[
|
||||
{ "name" : "68476_WALME",
|
||||
"RefSeqID" : "XP_006957790",
|
||||
"UniProtID" : "I4YDD8",
|
||||
"taxonomyID" : "671144",
|
||||
"sequence" : [
|
||||
"MKEEKEKTPPNNITGPPTPAQNILHSTPAAFGTAGTVGQGAGGFGSQLYQSPYVDSQQSVIGSPVTPAPLPKKATLKTPQ",
|
||||
"PRIYSAVYSGVGVYEAMIRGIAVMRRRADGYMNATQILKVAGVDKGRRTKILEREILAGLHEKIQGGYGKYQGTWIPFER",
|
||||
"GRELALQYGCDHLLAPIFDFNPSVMQPSAGRSAKSPSKKRQNSIVLSPTQERHQSSIIALNTARASGIYVGGADDPNDDG",
|
||||
"LSKKEKSPVKKSKYDEVPVNVSKRPYVPPPGTNAHILTRTQQSLTALFQQPTTNSDFIPEAVAILDTTSGALHPDLAIDE",
|
||||
"LGHTALHWAASLGRISNVQQLIKKGADMKRGNIEGETPLERSVLVNDNYDKKTFAYLLQELGSSIRVVDRTGRSILHHIA",
|
||||
"LIAAVNGRSMSAKYYMENVLEYIARYENGEFKSLVDLQDEHGDTALNISARVGNRNLVKMLVDAGANKTVVNKLGLKASD",
|
||||
"FGVEHETLNSVTGDEMLSNLQPPPPLNVDSSASVLENIHNLLNGITQQYTDETSGKNALLFEIQAELKQHSHELADVRKE",
|
||||
"IQYWQNKATQMAEVDQKIKNINEAIENEKVQTWSLLGEANADKMEGIETSSSSNTSEIKIPTGDNEESLKQLRKLSKWLE",
|
||||
"GTQKLTEERVASIDGLSASKEVKYKSIVSVCTGVPVNEVEGMLAQLLEAMESDANADLNKVQEFLAREC"]
|
||||
},
|
||||
{ "name" : "00846_COPCI",
|
||||
"RefSeqID" : "XP_001831299",
|
||||
"UniProtID" : "A8N8X1",
|
||||
"taxonomyID" : "240176",
|
||||
"sequence" : [
|
||||
"MQASTRPPGSNQPPVKIYNAVYSSVQVYECMVRGIAVMRRRNDSYVNATQILKVAGVDKGRRTKILEKEILPGKHEIVQG",
|
||||
"GYGKYQGTWIPLERGRDIAAQYGVAPLLSPLFDFQPSTNSLGALPVSTPGGTASPRPLSASSSYSSMGVAGQYIPSSIPS",
|
||||
"NLPPAPIMPGSALRLLNQGRAQGLFTPSTTSATLRPAGYHSPGPYGTSYAPSPQPQSSQTPPPGSGLKRNRSEAEVEGYH",
|
||||
"SQPHDVQMADAPPPNTASQPNEDNPSPAKRLRTDGSITTEPASSQGQWQQQQPLPYASQQRSGPGLSQLSGHNGHGSSRP",
|
||||
"PSSLSAPNGNRPAHTNPEDQTRKTRFSSKPSMPRGMDPHMPFKDARRSALIALICHRDDPTSVIDLLREISADHLNPPSF",
|
||||
"DVDTVLDDQGHTALHLAASMARTQTVDMLIQTGADMHRGNHLGETPLIRACLATPNSDQQSFATLVNYLHDSIWTLDTSK",
|
||||
"KSVVHHIVSLAGVKGRAVVARYYLDQIFYWIAQHEGGDFRSLVDLQDEHGDTAINIAARVGNRSLVRTLLDVGANRVLAN",
|
||||
"KLGLRPGDFGVETEELSSGLRAEDLISSLRTGPPAPVQKSQDVIADMTSMIQSLSTEFQAEIKSKQDSLDVTQAHLRAAT",
|
||||
"RELSEQRKQIQTWQARCGDLDQINQRVRNVEKAIAEEDMFDWTGRTELDGKDGKEKGGPAFAYRGSKSTMVGVGGSVDVS",
|
||||
"FSVESEPPLPTTDTAASLVKLRRLKMWHQRMEELVKGRLKGLQGASAEKEYQCKKIVALCTGIPLDKVEEMLDNLVIAVE",
|
||||
"SEAQVVDIGRVSGFMQKVRDGII"]
|
||||
},
|
||||
{ "name" : "8533_BIPOR",
|
||||
"RefSeqID" : "XP_007691662",
|
||||
"UniProtID" : "W6ZE71",
|
||||
"taxonomyID" : "930090",
|
||||
"sequence" : [
|
||||
"MSTSHSFPAASPSHQQSALYANSPHGHALMAAPAALNRSFSDMSAFHHHAMDKPQIYTAVYSGVSVYEMEVNRVAVMRRR",
|
||||
"SDGWLNATQILKVAGVDKGKRTKVLEKEILTGEHEKVQGGYGKYQGTWINYRRGREFCRQYGVEDVLRPLLDYDITLDGS",
|
||||
"HAPGHAIETPTKEQAMAANRKRFYTQSIDGRTTTQNLTGTFFSNISSTATSALAAMNKVARLNSPAPRPSSSSQRRTSAT",
|
||||
"RPSQSQPPLASQDSFRTSSQQSITSEPSFAGHNGQTDSAYATAVDESQEPPRKRIRASHDDSYSQPTAADMSIHPLSSPT",
|
||||
"EPSESFDQHHPAQPITLADGDVPTALPPLPYPDTKQDEEKQAMLTDLFADQTRSDFTNHPAILHLSGPDLDMPIDNSSNT",
|
||||
"ALHWAATLARVSLIRLLVSKGANMFRGNASGQTALMSAVSVNNSLDHSCFPETLEILAPLIELRDSQGRTILHHIAVTCA",
|
||||
"IKGRAASSKYYLEALLEYLVRSNIGGGQPPPFHDTSNHSKPIGLMRFMQEMVNARDKAGNTALNLAARIGNRNIISQLME",
|
||||
"VQADPTIPNHKGTRPMDFGVGTDLGDGQGIITATSPTKAKAPLSKAEETSREIQPLMSGILQSASLQFTQEARLKQDAID",
|
||||
"QTNELITQLSSQQKQEQQKLQTLRARLRQRQDRAKRISNLKRWLEPQRHMLSVNDGAIDLHDKKRIGYADTQGAGLLIKE",
|
||||
"DDLPYELRQAGDHLDRRASDGPIYLSTSVPLDPSTLSQVSHQPQCQNFLLQQLPAASVLRQRIETYTATNTALLKRSRML",
|
||||
"KEKDGQLEMMYRKVVSLCTKVEENRIEECLEGLVAALDSEEGEGVEVGRVREFLRKVEGVD"]
|
||||
},
|
||||
{ "name" : "PGTG_02039",
|
||||
"RefSeqID" : "XP_003320997",
|
||||
"UniProtID" : "E3JX03",
|
||||
"taxonomyID" : "418459",
|
||||
"sequence" : [
|
||||
"MAAHKTTNDIPVSSSHHINPESGTGTSSTQAFPIPNIKNNPHVYMAVYSSVPVYEMMVRGIGVMRRRSDSYMNATQILKV",
|
||||
"AGLDKSKRTRILEREIIQGEHEKIQGGYGRYQGTWVPFTRAQELATQLNVAQLLAPLFDYRPEPNSEVNIRSTNTKPSSS",
|
||||
"ASRANSHKTTLARQTSRQSLNEKRERSGDTTPLPHDPPEAGPSKRSRLNTPSRQSNGSANTPSSLIDHSHSAMDPDFIIP",
|
||||
"HSQSQPTAASQCTTSTFAPIHGATVEYPAGPSHLRKSNSSSRSHLEVALKAERNIHTLMALFSNPPDGDELESETHHENP",
|
||||
"NSVAEVNEVLEDPELEIDTPIDEHCHTALHWASSLARLGLVRAFLRSGADVNRGNDVGETPLMRSTLVTNNFERESFNQL",
|
||||
"LELLHPSLWTLDNQDRTVLHHICLTASIKGRGESSRYYLECICEWIVNKHGAQFDSQLFDAVDLNGDTALNIAARVGNKH",
|
||||
"LVRMLLDVGADMTIGNNLGLKPIDFGVGAGETSASYTDDMISAPLRRNPTASAPARSSRDIITSITSSVNSLSEDFENEI",
|
||||
"RSKTDRLESVRAQLMVATRQLTTQRRQLESLKHDLDERALLELRLKKLRMAIAEEDGFDWTGRSDLDGRPAQAGKLFEQN",
|
||||
"GIASTLAGLSASQIQLELEPDPFIPPENNQDSLVYLRRLEKWYVRVLSLLRERIGRMKGSNLEQEAKYLKVIGSFIGNTC",
|
||||
"TNDLSSSGSSMTGRPANQTTSTTQEVPSRATQNVNPADIHDLESMDGHRRKVSTTDAVNKSHEFGRTRSELLKASMIDNK",
|
||||
"LLKQLMAAIESDGPELDLNRVAGFMQRVQSGSL"]
|
||||
},
|
||||
{ "name" : "MBPA_ASPNI",
|
||||
"RefSeqID" : "XP_664319",
|
||||
"UniProtID" : "Q5AYB5",
|
||||
"taxonomyID" : "227321",
|
||||
"sequence" : [
|
||||
"MTTSNHHQQRPSLSMSYSQGSIGSANGMSFSQSQMSSLNASQSVASTPRATPPPKSSQQSAMSFNYSNGLPNGARASFSG",
|
||||
"FEDMNGYGTMIYHEEFKPQIYRAVYSNVSVYEMEVNGVAVMKRRSDGWLNATQILKVAGVVKARRTKTLEKEIAAGEHEK",
|
||||
"VQGGYGKYQGTWVNYQRGVELCREYHVEELLRPLLEYDMNPNGTAASGQDSLDTPTKEQAMAAQRKRLYSGMENRSMSQP",
|
||||
"QQGTFFQNISRTAATAVNAMSKARFESPAARGGDSRRLSVIRKPSQQMGSQDAQPPFGSQQSFYSAASDSGFASNIPTNG",
|
||||
"RYAPQDAMSFEQEEPMEPPRKRIRSSQAFSLPIDGTSMSMSEPTPTEPNDSFYQDMEPLHHIDEGRHGLDPLPPATTPER",
|
||||
"FQKMKLIMTLFLDKTTKDFSTHPALIQLSGEDLEVPLDEYRNNALHWAAMLARMPLVYALVKKGVNIARLNGAGETALQK",
|
||||
"AVGTRNNLDYRSFPRLLQVLAPTIDMVDRSGRTILHHIAVMAATGHGGHVSAKHYLEALLEFIVRHGGTSLNQQSNGTAS",
|
||||
"QPGMPLSNEVITLGRFISEIVNLRDDQGDTALNLAGRARSVLVPQLLEVGADPHIPNHTGLRPADYGVGVDMVDGSSQPA",
|
||||
"GSRSDTFLAQLAKTRKEILEATTAQVTAIVQETLGTFDKELAASLTSKQEKFDHWHAKIRESAKARQIEQKQLDELKRRS",
|
||||
"IDRTETSRRLKNLEKSSTDLLEAHKEILTNLGDTSKPVSLGDADQESGFEIAEFEALFPETFDPASGFSEAQIAYLRKLP",
|
||||
"SAEILEQRVSCYRAFNKETLDEIDALRSKNVVLGQNYRRMVMACTGWSAEQVDEAAEGLTQCVKELNDNPVPEDEAIEIL",
|
||||
"MRDRGQDW"]
|
||||
},
|
||||
{ "name" : "05520_CRYNE",
|
||||
"RefSeqID" : "XP_570545",
|
||||
"UniProtID" : "Q5KHS0",
|
||||
"taxonomyID" : "214684",
|
||||
"sequence" : [
|
||||
"MEPPSNPIQPPVTPSHHSLLSAISPALSEQTPAPIHTLPPHLRPSIPQPHIAPPRPSSVQPTMEEQQRMHHIQQHQQQQH",
|
||||
"FQQQQNDENVFGSVMGAPGHVPGHEAPMSTQPKVYASVYSGVPVFEAMIRGISVMRRASDSWVNATQILKVAGVHKSART",
|
||||
"KILEKEVLNGIHEKIQGGYGKYQGTWVPLDRGRDLAEQYGVGSYLSSVFDFVPSASVIAALPVIRTGTPDRSGQQTPSGL",
|
||||
"PGHPNQRVISPFANHGQTTPHMPPPQFIHQGNEQMMNLPPHPSSLAYPTQPKPYFSMPLQHTVGPQYDERHEGMTMTPTM",
|
||||
"SMDGLAPPADIARMGFPYNPSDIYIDQYGQPHATYQASPYGKESGHPSKRQRSDAEGSYIESGAAVQQHVEQDEEADDGL",
|
||||
"DNDSTASDDARDPPPLPSSMLLPHKPIRPKATPANGRIKSRLVQIFNVEGQVNLRSVFGLAPDQLPNFDIDMVIDDQGHS",
|
||||
"ALHWACALARLSIVQQLIELGADIHRGNYAGETPLIRAVLTSNHAEAGSFTDLLHLLSPSIRTLDHAYRTVLHHIALVAG",
|
||||
"VKGRVPAARTYMASVLEWVAREQQANNTHSITNPPNPADRNELAPINLRTLVDVQDVHGDTALNVAARVGNKGLVGLLLD",
|
||||
"AGADKTRANKLGLRPENFGLEIEALKISNGEAVMANLKSEVSKPERKSRDVQKNIATIFESISSTFSSEMLAKQTKLNAT",
|
||||
"EASVRHATRALADKRQHLHRAQEKLATMQLFEQRSENVRRIMDAIAAGTLLTPAEFTGRTQTMHEKSTGQLPPLAFRHVP",
|
||||
"GLALDASSQSQLNGAPPSTPLSVEDQEDIALPERDDPECLVKLRRMALWEDRIAEVLEDKIRAMEGEGVDRAVKYRKLVS",
|
||||
"VCAKVPVDKVDSMLDGLVAAVESEGQGLDFSRASNFVNRIKATKS"]
|
||||
},
|
||||
{ "name" : "RES1_SCHPO",
|
||||
"RefSeqID" : "NP_595496",
|
||||
"UniProtID" : "P33520",
|
||||
"taxonomyID" : "284812",
|
||||
"sequence" : [
|
||||
"MYNDQIHKITYSGVEVFEYTINGFPLMKRCHDNWLNATQILKIAELDKPRRTRILEKFAQKGLHEKIQGGCGKYQGTWVP",
|
||||
"SERAVELAHEYNVFDLIQPLIEYSGSAFMPMSTFTPQSNRKPTEAYRRNSPVKKSFSRPSHSLLYPYTSSNNMTSTSRMS",
|
||||
"GIHDALSLQSDFTRSPDMPSDSFTGSLHDIKASPFSSNNYAQSLLDYFLLPNTTQPPDFVYDRPSDWDVNAGIDEDGHTA",
|
||||
"LHWAAAMGNLEMMHALLQAGANVVAVNYLQQTSLMRCVMFTMNYDLQTFEVVSELLQSAICMNDSFGQTVFHHIALLASS",
|
||||
"KSKMEAARYYMDILLQNLTATQSVDVAAQIINLQDDHGDTALLICARNGAKKCARLLLSFYASSSIPNNQGQYPTDFLSS",
|
||||
"KDMSFPENDDSPLNSKIEDNLIDNLKYPQSLDDHLSSKKPISYFSNKLTHQTLPNVFTQLSELSKCHEASLAEKQLTYNL",
|
||||
"AMEALEQTVRETETCQRLWNERTNNDENYLVNQREDLIHQCKKFLHTLKTARYYLETVQLHQLKKYVTYFSQIWSTDELA",
|
||||
"DISETKNLVGHDTKTNRSSLSSKHEVDLFTAENEAAREKLVEQLCSLQAQRKQKINEILNLLSMGMYNTINTDQSGS"]
|
||||
},
|
||||
{ "name" : "CDC10_SCHPO",
|
||||
"RefSeqID" : "NP_596132",
|
||||
"UniProtID" : "P01129",
|
||||
"taxonomyID" : "284812",
|
||||
"sequence" : [
|
||||
"MASANFIRQFELGNDSFSYQKRPEDEPSQPLSNRNINKLNDSSTLKDSSSRIFINSQVLRDGRPVELYAVECSGMKYMEL",
|
||||
"SCGDNVALRRCPDSYFNISQILRLAGTSSSENAKELDDIIESGDYENVDSKHPQIDGVWVPYDRAISIAKRYGVYEILQP",
|
||||
"LISFNLDLFPKFSKQQQIESSSISKNLNTSSFNTRSPLRNHNFSNPSKSSKNGVHTINNMQSSPSPSSSFLLPLTQIDSQ",
|
||||
"NVKRSNNYLSTSPPILEQRLKRHRIDVSDEDLHPSSQLNDNEASSLFPDTPRLNHSLSFVSLVSSLPPLDQNIMQDYHTS",
|
||||
"KDILTSIFLDVNFADSSALEAKLSDSLDLDVPIDELGHAALHWAAAVAKMPLLQALIHKGANPLRGNLTGETALMRSVLV",
|
||||
"TNHLNQNSFGDLLDLLYASLPCTDRAGRTVVHHICLTAGIKGRGSASRYYLETLLNWAKKHASGNNGYMLKDFINYLNHQ",
|
||||
"DKNGDTALNIAARIGNKNIVEVLMQAGASAYIPNRAGLSVANFGIFVENALKQPEDSKQTKVSLMSENLSSKEKTAVPPR",
|
||||
"QKSRDIIASVTDVISSLDKDFQDEMAAKQSMIDSAYTQLRESTKKLSDLREQLHVSETQRTLFLELRQRCKNLMTSIEEQ",
|
||||
"KSELSNLYESFDPNGIHDSLSLDADAPFTVNENNNKNLSIAELKFQVAAYERNEARLNELANKLWQRNSNIKSKCRRVVS",
|
||||
"LCTGVDESRVDSLLESLLQAVESDGQQGEVDMGRVAGFLRVVKEHQA"]
|
||||
},
|
||||
{ "name" : "05338_USTMA",
|
||||
"RefSeqID" : "XP_011392041",
|
||||
"UniProtID" : "A0A0D1BWD8",
|
||||
"taxonomyID" : "237631",
|
||||
"sequence" : [
|
||||
"MPLNYFANQDQTASDTYAHEASSFPAPSSILTDTSKPLQPVQEVAASSLVDGVSFTSPHASIIHASKQSPRAASSLSFTT",
|
||||
"SALQRAGLLPANPNMSTTATSGTSAASESLQRVITQGTASAAAINGASTPAHSGPLTPAHLKNLTPAQANAALQNPVGNI",
|
||||
"PTVYLATYSNVPVYEITVRGIAVMRRRGDGWLNATQILKIAGIEKTRRTKILEKSILTGEHEKIQGGYGKFQGTWIPLQR",
|
||||
"AQQVAAEYNVSHLLQPILEFDPATADQIPKLYQRKKPAASARNSSASAINDARGSTPSKIYSPAPASLGGPSQQPRFLSL",
|
||||
"RPPKETHEQEISSAIFMPPGTAGLLSNGTFVDDRAASALAYPGPPAIPPGSTPAEQAALRSYNVYGYTPQGVPLPSSAAA",
|
||||
"DGNGTEAAATAASTGAGKREASETDQDGASAAKRSRLTSPQQQRRDDGLLLGPSPVKDLNALGPAGGSLRAASAPRGHRI",
|
||||
"TVGPPDAAGRDGAVPRYADRALPPKPYDEGEKRMRDRLVSLFSDDGVLPGVSEATGAGASQSAADEDDDAYVAKLDSLLA",
|
||||
"DLREKASLGGLGASGTDGPKATVDLITDDHGHTALHWASALCRVKLVRTLVARPPWQGGANIHAGNHAGETALHRSVLVT",
|
||||
"NSYDASSFPTLLNLLSSSLNTRDFKKRTVLHHISLVAALKGRAASARYYLACVLEHISAEKNSKYKGLIDAQDEDGETAL",
|
||||
"GIVARLGNASMVRMLLDVGARKDLANALGIRPSDWGIESSADGASLTPSQNDGTNTVASLPPLTAADLASQNPSDIISAL",
|
||||
"TRPAQVPVMKSSDVRDQLSSTLDDLQSSFERELKEKQDAVSTVQSHLQAATRDLAARRKTVSAAQAKLAEKDEARQRVQN",
|
||||
"LRRAIVAQLGLEEADADLSLEQLVEEAANAASAAPADKSADKMDIDGAEDVKPVRASNLETLIDDILSFDTIQSDLKAVG",
|
||||
"TSAVTQEVVEQDELVRLRWLVSFYQSSCDELSSTISELEDSSAKKESQCQQVVAICANIPQDKVESMLDELLTAMESDGP",
|
||||
"DVDLARVANFMQKVGKTRENGDQPGVGAQLSSSTSLSTAVSSGGTAASSVVPAVERDGEDAKPDA"]
|
||||
},
|
||||
{ "name" : "SWI4_SACCE",
|
||||
"RefSeqID" : "NP_011036",
|
||||
"UniProtID" : "P25302",
|
||||
"taxonomyID" : "559292",
|
||||
"sequence" : [
|
||||
"MPFDVLISNQKDNTNHQNITPISKSVLLAPHSNHPVIEIATYSETDVYECYIRGFETKIVMRRTKDDWINITQVFKIAQF",
|
||||
"SKTKRTKILEKESNDMQHEKVQGGYGRFQGTWIPLDSAKFLVNKYEIIDPVVNSILTFQFDPNNPPPKRSKNSILRKTSP",
|
||||
"GTKITSPSSYNKTPRKKNSSSSTSATTTAANKKGKKNASINQPNPSPLQNLVFQTPQQFQVNSSMNIMNNNDNHTTMNFN",
|
||||
"NDTRHNLINNISNNSNQSTIIQQQKSIHENSFNNNYSATQKPLQFFPIPTNLQNKNVALNNPNNNDSNSYSHNIDNVINS",
|
||||
"SNNNNNGNNNNLIIVPDGPMQSQQQQQHHHEYLTNNFNHSMMDSITNGNSKKRRKKLNQSNEQQFYNQQEKIQRHFKLMK",
|
||||
"QPLLWQSFQNPNDHHNEYCDSNGSNNNNNTVASNGSSIEVFSSNENDNSMNMSSRSMTPFSAGNTSSQNKLENKMTDQEY",
|
||||
"KQTILTILSSERSSDVDQALLATLYPAPKNFNINFEIDDQGHTPLHWATAMANIPLIKMLITLNANALQCNKLGFNCITK",
|
||||
"SIFYNNCYKENAFDEIISILKICLITPDVNGRLPFHYLIELSVNKSKNPMIIKSYMDSIILSLGQQDYNLLKICLNYQDN",
|
||||
"IGNTPLHLSALNLNFEVYNRLVYLGASTDILNLDNESPASIMNKFNTPAGGSNSRNNNTKADRKLARNLPQKNYYQQQQQ",
|
||||
"QQQPQNNVKIPKIIKTQHPDKEDSTADVNIAKTDSEVNESQYLHSNQPNSTNMNTIMEDLSNINSFVTSSVIKDIKSTPS",
|
||||
"KILENSPILYRRRSQSISDEKEKAKDNENQVEKKKDPLNSVKTAMPSLESPSSLLPIQMSPLGKYSKPLSQQINKLNTKV",
|
||||
"SSLQRIMGEEIKNLDNEVVETESSISNNKKRLITIAHQIEDAFDSVSNKTPINSISDLQSRIKETSSKLNSEKQNFIQSL",
|
||||
"EKSQALKLATIVQDEESKVDMNTNSSSHPEKQEDEEPIPKSTSETSSPKNTKADAKFSNTVQESYDVNETLRLATELTIL",
|
||||
"QFKRRMTTLKISEAKSKINSSVKLDKYRNLIGITIENIDSKLDDIEKDLRANA"]
|
||||
},
|
||||
{ "name" : "SWI6_NEUCR",
|
||||
"RefSeqID" : "XP_962967",
|
||||
"UniProtID" : "Q7SBG9",
|
||||
"taxonomyID" : "367110",
|
||||
"sequence" : [
|
||||
"MQPPQLGGASQQSQPSSQQSFSMSQSSQSVYRQYTDPPNRLHNDHAVPTIYSATYSGVGVYEMEVNNVAVMRRQKDGWVN",
|
||||
"ATQILKVANIDKGRRTKILEKEIQIGEHEKVQGGYGKYQGTWIPFERGLEVCRQYGVEELLSKLLTHNRGQEGETGNVDT",
|
||||
"PTKEQAMAAQRKRMYNASSQENRGIGSTGTFFKNISSTASTAVAAISKARFDSPAPRNRSGPSRAPSFNRQSSMQDVADF",
|
||||
"PNSQQSLVSTEYATQTQNADSGFGSQTTQPLAGDGLEQPPRKRQRVLTPARSFGGQTPGHQPLDPFNAGNIANGDSGSPT",
|
||||
"EPSNSFNYDQVTANDGDASYALGPLRPLPYENNADAEAKRGMLMGLFMDANGPEEAIQAALCNVSPQELDSPIDTQSHTA",
|
||||
"LHWAATLSRMPLLRALIHAGANPWRVNACGETALMRACTVTNSMENNTFPELLDLLGCTLDVTDDKGRTVLHHIAVTSAV",
|
||||
"KGRHYASRYYLESLLEWVVRQGSAPSSQENGIGDRKGRRMGIARFMSEIVNAQDNSGDTALNVAARVGNRSIISQLLEVG",
|
||||
"ADPTIPNRANLKPLDFGIGIADAETNDDPAQEKTGATTGSGHKSRETSDEVVRSITHLIGESASIFQNELKKKQESIDTL",
|
||||
"HSQLRVTSSQVGDARRTLESLQEKLKAQQLAKQKIVNFNRACEEEEQILIELEQRHGRLDVASANAWEMELESALEIVKT",
|
||||
"QSPKGLDPDSRPSLPSAAVLRARIKALRARSSKTRQAVAALQAQSKEKELKYRRLVSLCTRRPEIEVEALLDTLTRAVES",
|
||||
"EKPELEIARVRRFLGGVEGVVH"]
|
||||
},
|
||||
{ "name" : "15042_USTMA",
|
||||
"RefSeqID" : "XP_011388143",
|
||||
"UniProtID" : "A0A0D1CVS5",
|
||||
"taxonomyID" : "237631",
|
||||
"sequence" : [
|
||||
"MSTASPLHHGHGNGSYANSPAPTGVTGRDAGVAAAAVADSAVRSGSVPASASGSAPGSASGSMYGEAHTQHHTGHHHYSA",
|
||||
"HHTHSHGALTSPVNGGHSSSWSPYGYPAAPVYGGSPSPYGHNAYSQYASGYGYANGTAHHVATAPTTPSATSTAYHTGVN",
|
||||
"GMMMHHGQHAGYGYSSHHLGSHTPTHTHTHSSAYFMNGDGAHSHLNSSAHLTSPSYTTAPQYSTQLPLAGRHRVTTTLWE",
|
||||
"DEGTLCFQVDARGVCVARRHDNNMINGTKLLNVCGMSRGKRDGILKNEKERIVVKVGAMHLKGVWISFARAKQLAEQNGI",
|
||||
"ADALYPLFEPNIQSFLYHPDNYPRTAAVIAAAQERQAQRQRAPGGQPSPGANGTSQAPPLMRANTTPSNGDTSTFSSGLS",
|
||||
"SLGSWTGSHDQGHASAPTTAQPSPSSMHNGATQMHMSLSNHGTASPTYAQSQQQQQQQQQQQQQQQQQQQQQQQQAYPMT",
|
||||
"AAQQLARPSVGDRRQSAPISLNNSVGHAENPYGATNLGGAANGGLVNGARKVSGLKRSWNDADDLNGSAAASPTERDMQR",
|
||||
"SGSGGSNGLKLDGDDLHSPDSSDDRLAKKTRGMPQRGGGATTAMPSMSTNMLMGVGNGSGIHHE"]
|
||||
},
|
||||
{ "name" : "04778_USTMA",
|
||||
"RefSeqID" : "XP_011391646",
|
||||
"UniProtID" : "A0A0D1DQM4",
|
||||
"taxonomyID" : "237631",
|
||||
"sequence" : [
|
||||
"MNQAPLSATGVNFYISGPRPARLFPTPIHEFRKGKYATAGGESGFMTVFEYDVRGHTMMIDVDTSFVRFTSITQALGKNK",
|
||||
"VNFGRLVKTCPALDPHITKLKGGYLSIQGTWLPFDLAKELSRRIAWEIRDHLVPLFGYDFPSTCLRPDSEGFGQLAIGMS",
|
||||
"QKRARKRHNNGGPHQTSCYGPSLPISIELWQHSTDPLRDLGESSVVGGQAIEHVSAKNSAVQPCYGSSQPATFHYSKGYG",
|
||||
"LESRPWYGQDYLESNSLESMWNSAQAGGGSVGLQVPISTCGATASPCLAAIGANGGSPILSSPPSSNASSSSNQSYTAAG",
|
||||
"YGLMVPPTVPSHSVNSEAGANQAEGPTPIDGSRSYASLTAHGYATGYGDANASLSTWNDATHASTFTLHVHAHVHFQPPD",
|
||||
"PESAQLFTIHDFGSDPFYAEQVERG"]
|
||||
},
|
||||
{ "name" : "STUA_ASPNI",
|
||||
"RefSeqID" : "XP_663440",
|
||||
"UniProtID" : "P36011",
|
||||
"taxonomyID" : "227321",
|
||||
"sequence" : [
|
||||
"MASMNQPQPYMDVHSHLSSGQTYASHPATAGALTHYQYPQQPPVLQPTSTYGPASSYSQYPYPNSVASSQSVPPPTTSIS",
|
||||
"SQVPAQLLPLPVTNHPVPTHGYGNNSGTPMQGYVYDPTGQMAPPGAKPRVTATLWEDEGSLCYQVEAKGVCVARREDNGM",
|
||||
"INGTKLLNVAGMTRGRRDGILKSEKVRNVVKIGPMHLKGVWIPFDRALEFANKEKITDLLYPLFVQHISNLLYHPANQNQ",
|
||||
"RNMTVPDSRRLEGPQPVVRTPQAQQPPSLHHHSLQTPVPSHMSQPGGRPSLDRAHTFPTPPARMNSSVPNTQPLSIDTSL",
|
||||
"SNARSMPTTPATTPPGNNLQGMQSYQPQSGYDSKPYYSAAPSTHPQYAPQQPLPQQSMAQYGHSMPTSSYRDMAPPSSQR",
|
||||
"GSVTEIESDVKTERYGQGTVAKTEPEQEQEYAQPDSGYNTGRGSYYTTNPSVGGLAHDHSQLTPDMTGSPQQNGSGRMTP",
|
||||
"RTSNTAPQWAPGYTTPPRPAAASSLYNIVSDTRGTSGANGSTSDNYSVASNSGYSTGMNGSMGSNKRMRDDDDDRIVPPD",
|
||||
"SRGEFDTKRRKTLTETPVGGPVGGVPLGLQPMKAGGSLISARR"]
|
||||
},
|
||||
{ "name" : "STUA_NEUCR",
|
||||
"RefSeqID" : "XP_960837",
|
||||
"UniProtID" : "Q1K6U0",
|
||||
"taxonomyID" : "367110",
|
||||
"sequence" : [
|
||||
"MNPNTPADVYYGQMSQGSSMPVTTVPSHSHYASQQPPPLLQPGSTYAHQYGTPQYGYANALSSPASIPPSLPPSMNSMAG",
|
||||
"QSVLPLPGSGSMNPAVYASGGFDTTGQVAPPGMKPRVTATLWEDEGSLCFQVEARGICVARREDNAMINGTKLLNVAGMT",
|
||||
"RGRRDGILKSEKVRHVVKIGPMHLKGVWIPFERALDFANKEKITELLYPLFVHNIGALLYHPTNQSRTSQVMAAAEQRRK",
|
||||
"DSHGQLRGPPGLPSLQQHHHHHSMLPGPPSLPSHPSMGRPALDRAHTFPTPPTSASSVMGPMGNSDGYQWSQQSMSGTQG",
|
||||
"NSSLSLDTSLGSNARSMPSTPATTPPGSTIQSMQNYPPVSQSYESSRQMYQGQSAQQAQYQSQQHYSSQPQHQERPVYSQ",
|
||||
"SSYIKNDMGPPSGRPTGQSNDASDSKPPTGMIHQGQGQSDPGTHAGSEEDDDANNEAEYTHDSGGYDANRGSYNYNTQAV",
|
||||
"NSLPHDHGLAPEIGGSPHQAGSGRATPRTAAAPSSYYSAQGYHTPPRGQPSSSLYNVMSNERTGSNGTQGNEMYAGQADM",
|
||||
"PSSLPNGYSAQPSVMNGSSGGLKRGRDDDDDGGRPTTSAPNLGPGMDMKRRKTMMDGGSLPSPTYTATIAQAAPSAIAAH",
|
||||
"RRR"]
|
||||
},
|
||||
{ "name" : "PHD1_SACCE",
|
||||
"RefSeqID" : "NP_012881",
|
||||
"UniProtID" : "P36093",
|
||||
"taxonomyID" : "559292",
|
||||
"sequence" : [
|
||||
"MYHVPEMRLHYPLVNTQSNAAITPTRSYDNTLPSFNELSHQSTINLPFVQRETPNAYANVAQLATSPTQAKSGYYCRYYA",
|
||||
"VPFPTYPQQPQSPYQQAVLPYATIPNSNFQPSSFPVMAVMPPEVQFDGSFLNTLHPHTELPPIIQNTNDTSVARPNNLKS",
|
||||
"IAAASPTVTATTRTPGVSSTSVLKPRVITTMWEDENTICYQVEANGISVVRRADNNMINGTKLLNVTKMTRGRRDGILRS",
|
||||
"EKVREVVKIGSMHLKGVWIPFERAYILAQREQILDHLYPLFVKDIESIVDARKPSNKASLTPKSSPAPIKQEPSDNKHEI",
|
||||
"ATEIKPKSIDALSNGASTQGAGELPHLKINHIDTEAQTSRAKNELS"]
|
||||
},
|
||||
{ "name" : "08099_COPCI",
|
||||
"RefSeqID" : "XP_001836714",
|
||||
"UniProtID" : "A8NVH3",
|
||||
"taxonomyID" : "240176",
|
||||
"sequence" : [
|
||||
"MSTGMLQETLQTTSASTSGTRFRPYASPNHQVTKGRYITSNDPRGYIPVYEYPLNGQWIMMDIDDGYILWTGIWKALGNS",
|
||||
"KADIVKMIDSQPDLAPLIRRVRGGYLKIQGTWMPYEVALKLSRRVAWPIRHDLVPLFGPTFPSTCLSPDQPGYGQVVASS",
|
||||
"NVRRRARRNTQATAQPPREAHSNWTVMTPGPMVGLSFPHSQFSRPPLPPLAPTPARSPSDYAPSSHYGNQLDPQDARRYS",
|
||||
"HSPYSPLASPPERKSSISSKALSLEIPPVRPSSSKAREDISLPPLKQPDGADPEMSPYALPPISALEDLRGVDTQDSAAV",
|
||||
"LRRLRLDDDYPSSSRSSTSQDSIWGRRHSLSAHSPHPRSSDNSRFQPYLSSRSYQDSTLKRSRSPAESYADRRRASDFSQ",
|
||||
"EDSTSAYSPISPATPNSSILSHSSFSDLKKLASSTDTRYNFPRISGRDWAPLKGDTDHIRSSYRSGPSPLELDSDSESSA",
|
||||
"PHRPW"]
|
||||
},
|
||||
{ "name" : "68479_WALME",
|
||||
"RefSeqID" : "XP_006957792",
|
||||
"UniProtID" : "I4YDE0",
|
||||
"taxonomyID" : "671144",
|
||||
"sequence" : [
|
||||
"MTNKVQELWWEENKTRVWQVEVDNGNYVARRQDNDQINGTKLLNITKITRGKRDGILKNEKSRQVVKTGTITLKGVWIPF",
|
||||
"ERAIILARQFNIEQQLYPLFETNLGDYVENSIGSHQIKRKSLNNLMDSLTTNRELVSKRRSTVSTYNPATSAYVSPYGFS",
|
||||
"PQHCYQTEFEDMNQHSGEIQSGRPRNTSSASDWMTNWSTSSSSPVIPATPNTFSPVMNTFQSLALHSPPIPIPNYYYDSS",
|
||||
"SSYFPSYHQKQQQQQVQMQMQMHTTASIGGDRQSNEYIQR"]
|
||||
},
|
||||
{ "name" : "11943_PUCGR",
|
||||
"RefSeqID" : "XP_003330006",
|
||||
"UniProtID" : "E3KMR2",
|
||||
"taxonomyID" : "418459",
|
||||
"sequence" : [
|
||||
"MAAAPTSSFLTSMSAQPPRTVQALVNEEVRAPPPVRLYPSQHRVSMTRYATSTDPRGYIPVFEYPLNGQYIMIDCETGMV",
|
||||
"HFTGIWKALGHTKADVVKLVESDPTIAPYLRKVRGGYLKIQGTWLPFDTAQTLARRVAWQVRYDLVPLFGPDFPDTCLGP",
|
||||
"GEPGFGQLLLSAPKPRGRRGAKKAAAAPTVAHERTASPQDNRSQSRPGPYPSQESFGNRCSGRVEAVGAMNGYSPMLSQA",
|
||||
"RYSPYTRAPVHRITQLEPLPSLIQPNQSCPHPTADSMYSSHYHQSPRQSMMTSHGAGPYGQQHLTGSTASGMQSTAPLPS",
|
||||
"MRPHQAHQSENNFFETYRGPDSFEALSNKWLAPEVANPSLNDSGLLHGEGGCLPPLQYSNNPVLRNGPSGSPTNQYNFPN",
|
||||
"QIDSAHSSHHIDSNQTQHVHRHAGFPYESQHQSNFRHDLSTEEAAHHPASPSQQPPPSVTYDKAHNSEPQAGSQAANVTA",
|
||||
"GCYAASGSNSTGNPAGSPGSHSSHVPKSPTPSSASTSTHMQNSHNPNSHRSPSNTLTNMSNNGGFNSNTQGEEAIQFSVL",
|
||||
"TSPAHLETSGPSENSIPPAQSSDSDWNPAQNTTGLSPSQAPRQ"]
|
||||
},
|
||||
{ "name" : "03082_PUCGR",
|
||||
"RefSeqID" : "XP_003321545",
|
||||
"UniProtID" : "E3JYK1",
|
||||
"taxonomyID" : "418459",
|
||||
"sequence" : [
|
||||
"MILISPTRTLPSPRPIDTDPILNYRHIQPAAAAAAVGPWLGQNQHHHHHHDTLAKSPNITTAPATHSPSELSASPAPSAV",
|
||||
"STGSSLLDPQSVPHIKIPHSSSPPAIMLPQPSSDDDSSTAEEEQPSAQSSNATLNTPTPHTNAPHQLDSHASSVGLYDLP",
|
||||
"PTSSSAPTTSSSSSPFPSNVPSHQQPSPYSSSPHPNQEHHPHHPHHGNQFYQQSPPALHSPLQSAHHPQQSFDARPHSSL",
|
||||
"FAHQHYHSRPQSAPHSTSQFSLDPHVLAAAAANVEVKKWDEENTYYYQVAHKGVTVGRLKGSGLVNGTKLLNLAGISRGK",
|
||||
"RDGILKNEKIRKVVKHGTMHLKGVWIAFDRAVFLAEQHSIADKIFPLLVVNLEHYVPIEPPLMAGGTKLGPGSLFHHHHP",
|
||||
"RHPRLLPQPIKFPPSTISLAPASANSFSSTGGWPSGPSSALPSIGYNEPFSAPPIPRSAATADTSPSIYEQAQFQYLNSA",
|
||||
"QANNPDLLERRHTLPNNSFHGYNSVPSFGSSQPPPPVSYSFHYNSTHVPGYPPRSSTAESATPNQFEYQSKNHNGNGNGD",
|
||||
"AAGSYPATLYHSQPAARPVSSTTAQPSPALNSAPLLLGDLSPGSSTQIVDHGAGDFRLSTGTSNGQVKQEGDDESCNEKR",
|
||||
"LIMEWNPSC"]
|
||||
},
|
||||
{ "name" : "SOK2_SACCE",
|
||||
"RefSeqID" : "NP_013729",
|
||||
"UniProtID" : "P53438",
|
||||
"taxonomyID" : "559292",
|
||||
"sequence" : [
|
||||
"MPIGNPINTNDIKSNRMRQESNMSAVSNSESTIGQSTQQQQQQQQYLGQSVQPLMPVSYQYVVPEQWPYPQYYQQPQSQS",
|
||||
"QQQLQSQPQMYQVQESFQSSGSDSNASNPPSTSVGVPSNATATALPNGSAITTKKSNNSTNISNNVPYYYYFPQMQAQQS",
|
||||
"MAYSYPQAYYYYPANGDGTTNGATPSVTSNQVQNPNLEKTYSTFEQQQQHQQQQQLQAQTYPAQPPKIGNAFSKFSKSGP",
|
||||
"PSDSSSGSMSPNSNRTSRNSNSISSLAQQPPMSNYPQPSTYQYPGFHKTSSIPNSHSPIPPRSLTTPTQGPTSQNGPLSY",
|
||||
"NLPQVGLLPPQQQQQVSPLYDGNSITPPVKPSTDQETYLTANRHGVSDQQYDSMAKTMNSFQTTTIRHPMPLIATTNATG",
|
||||
"SNTSGTSASIIRPRVTTTMWEDEKTLCYQVEANGISVVRRADNDMVNGTKLLNVTKMTRGRRDGILKAEKIRHVVKIGSM",
|
||||
"HLKGVWIPFERALAIAQREKIADYLYPLFIRDIQSVLKQNNPSNDSSSSSSSTGIKSISPRTYYQPINNYQNPNGPSNIS",
|
||||
"AAQLTYSSMNLNNKIIPNNSIPAVSTIAAGEKPLKKCTMPNSNQLEGHTITNLQTLSATMPMKQQLMGNIASPLSYPRNA",
|
||||
"TMNSASTLGITPADSKPLTPSPTTTNTNQSSESNVGSIHTGITLPRVESESASHSKWSKEADSGNTVPDNQTLKEPRSSQ",
|
||||
"LPISALTSTDTDKIKTSTSDEATQPNEPSEAEPVKESESSKSQVDGAGDVSNEEIAADDTKKQEK"]
|
||||
},
|
||||
{ "name" : "14426_COPCI",
|
||||
"RefSeqID" : "XP_002911429",
|
||||
"UniProtID" : "D6RMB0",
|
||||
"taxonomyID" : "240176",
|
||||
"sequence" : [
|
||||
"MTARPPLPLRHANPSLRDGNATIPPVKYQILSCQGKDILVGRLKIDTTDGGHAFILRRFDTQAISLTTMFRAAFPTASEA",
|
||||
"EEKDEINYVKANFDLFGNNGSSKEPHITRLAGTWVNRDTAGQLAHDYNMVDLINTMVEAEPDPNGQYRRSNKSAQNNNPP",
|
||||
"TNAPEPTPATNVHATRSPAKQSPKPPSKTLPTPSPGSGDAQPPAPKRRREGSPATFTSGIPVASSPAVPKTPGPRRSTRT",
|
||||
"KSPAPSRVPQPLTATKPRSRASVAPPSPKKRPVDLPKSSPIKAEEDTAVEDNVAGNELYAQDISEQKKLIADLKAAASSK",
|
||||
"KPADTVKEDDDQQMEEEGQGPSKLKRIRQDEEKPLQFEFKEPEREERQIATNRRVGRFDMQPERKSLAWGIAAFAFGMTA",
|
||||
"ITYLPNFL"]
|
||||
},
|
||||
{ "name" : "BQT4_SCHPO",
|
||||
"RefSeqID" : "NP_596166",
|
||||
"UniProtID" : "O60158",
|
||||
"taxonomyID" : "284812",
|
||||
"sequence" : [
|
||||
"MTENEKSRSLPAERNPLYKDDTLDHTPLIPKCRAQVIEFPDGPATFVRLKCTNPESKVPHFLMRMAKDSSISATSMFRSA",
|
||||
"FPKATQEEEDLEMRWIRDNLNPIEDKRVAGLWVPPADALALAKDYSMTPFINALLEASSTPSTYATPSRPTAQKSETSEG",
|
||||
"EPESSTSATTTSVARRTRQRLAEHLENSKKTILQHDNKEEDKEIHSEENETKDEIKSEKKEPEIKKQEGGSSTEKVGQPS",
|
||||
"SSDDKAKGSTSKDQPSEEEEKTSDIQDRKIKTPIKPSLLGKIRSSVNKGMTDVASQVNRGMTDVASQVNKGVNGVASQVN",
|
||||
"KGMNGVANQVNKGVTGVASQVRKPVGKLEKKFENLEKSIGDTLKSSIRSSPKSKKRSREDFEENEDYNAMVPVKRSRITK",
|
||||
"LESEVYYEKRKVRALGGIAIGLGVGAILPFLF"]
|
||||
},
|
||||
{ "name" : "PGTG_05590",
|
||||
"RefSeqID" : "XP_003323688",
|
||||
"UniProtID" : "E3K4V4",
|
||||
"taxonomyID" : "418459",
|
||||
"sequence" : [
|
||||
"MPKSSSCCEPEQKQSIPTNANPISAGGAGLDIRLAGMRSAHATLRGCSFSPYMVTQHPPLRDSVNRNKQQPTNNSTNPYT",
|
||||
"KKASRMSQTNLYKSNNPPNLPQDEFNQTLVNYQGKLRSIRIQDININGHTITIARIKIPSPEKLSSHLIKRFDTNAISAS",
|
||||
"SFFRSAFPHSTEEEEAIQMRYLHQIYDTHTAGAVEFGSARKLTGVWVPIENAAELAEVYGLTRFAEPLLAFPNPKENPRS",
|
||||
"PTGTKIGGEDESSTTQTPKASQQSKLTGQISVTRSSKRSRAGPLSFGNTSPSSFSLNSFNKPPTETNKSGTHDDSKSTND",
|
||||
"ENDEKPASPTDRVAGRGARNSPSKKPTTVDENHEHTEHEDHQLIGTDELAQRAKQEALKLVSELKNSQPCTQSSLESPTN",
|
||||
"TLETELTRTTSPAKSNKVTRKRSSDEVSFEGEEQGEDEDEERTADETATHRSFLPKLLWRKSAAQAHPNSKKHKRTQLGG",
|
||||
"GGSSSSSSKSFVPLLTNSATPSVDDSSSTHNPNKRNLAIAGIVIAGAAA"]
|
||||
},
|
||||
{ "name" : "06560_NEUCR",
|
||||
"RefSeqID" : "XP_962267",
|
||||
"UniProtID" : "Q7S9H5",
|
||||
"taxonomyID" : "367110",
|
||||
"sequence" : [
|
||||
"MAQVARHLPARRNPLMLEDVPSHTDLASRRRLGQTQLTPRMVTAVPGAEVDPSSLLAFDYAHLRAPLPKGIVSGIFKSSP",
|
||||
"PSYFLMRRSQDGYISATGMFKATFPYASQEEEEAERKYIKSIPTTSSEETAGNVWIPPEQALILAEEYQITPWIRALLDP",
|
||||
"SDIAVTATDSSAPKQIAPPPKFFGAQPPLVAPTPPTTRSTRSRPSSRRSSSPAKSTTTSKRGTTPRNTKRTVTTEASATT",
|
||||
"VTTTATATAVPSAETPATSFADSQAPTLINGEIPTSTPINTVPVTKIQTTEAELKVESIEKEPVVVLEPIEEEPKIKVRV",
|
||||
"DEDVKLDKDGEEVKHTKVELEVPLMAGEPPSKEEARKMIEEAKAMVEAAVKADAEAAAALVEASKAGAEDEKAEDEAKAE",
|
||||
"TEATKEEEADSKGKRKAEKISVDEDEKAADEAEQPRQAKRVKTEAELRKDRIRKRAYLGLTATFAVGALGALLPIITPYV",
|
||||
"ANVL"]
|
||||
},
|
||||
{ "name" : "81480_BIPOR",
|
||||
"RefSeqID" : "XP_007682909",
|
||||
"UniProtID" : "W6ZKJ4",
|
||||
"taxonomyID" : "930090",
|
||||
"sequence" : [
|
||||
"MVVDRVLPERKNPLLEPTDSTSIEILIERRRLGQTNLGVKAGVSGIANATKPENMGTFDYAHLRVPLPKDLTGSGIFSRN",
|
||||
"RMSAFPESYFLMRRSSDGYISATGMFKAAFPWASLQEEDLERKYQKTFPSAGDEEVAGSVWIAPEEALALSEEYSMRHWI",
|
||||
"EALLDPAPIEKGGKDKSNAAIQMPPRFDVANAQPATLPTFGFRQTRARSARSVSPSKAMTPGRKYATPRKGRSTRSAMKP",
|
||||
"DATHADDMFRPIEAVTPSTALQNSIARRIAPAETIASSIEGEVKEVEQEVKAALDAEKKPEPELEVQEGTVHIEVKQTVE",
|
||||
"TNGDTEKTSTSVTVDVPHDHAALPEPEDPTAMIEEAKRMVAEAQKLEGGSPSVTRSSKRGIEEVLDEEDLADERLNKLAK",
|
||||
"KAYTTEQKMTKEKVTRRALVGLGVMAAIGTAFQYFV"]
|
||||
},
|
||||
{ "name" : "01622_ASPNI",
|
||||
"RefSeqID" : "XP_657766",
|
||||
"UniProtID" : "Q5BH18",
|
||||
"taxonomyID" : "227321",
|
||||
"sequence" : [
|
||||
"MVRSLPKKNNPFVTPDAAPPYEELLMRRRLGKTNLAVKPTQVGTSNATKPENLGPFEYAHLRAPLPKDLKGSEIFPSHSP",
|
||||
"QQHPETYFLMRRSKDGYVSATGMFKIAFPWAKLEEERSEREYLKTRPETSEDEIAGNVWISPVLALELAAEYKMYDWVRA",
|
||||
"LLDPTEIIQSPSSAKKQITPPPKFELPPIQAPEALVPSSRTRSRRSASPSKKAGTPRKPRQTKAQKEAAVAATNEANATL",
|
||||
"QSALDDTVSNADGEINGDVLPSVEDKREPETSPVKGKKAAAKAKKQAVSEEDQEDKVKIEIKSDAAEGSDVQAAQTTISV",
|
||||
"EMPISLPEAPSAEDTQEMIAKAKEMVKEAVKLQQEPAESSATAKKRGAEEAELGEEEEDEETKTLRTKRAKVLEEKLKRE",
|
||||
"RVRNRALMGVTAAFALAKPALVLLEA"]
|
||||
},
|
||||
{ "name" : "05405_ASPNI",
|
||||
"RefSeqID" : "XP_663009",
|
||||
"UniProtID" : "Q5B225",
|
||||
"taxonomyID" : "227321",
|
||||
"sequence" : [
|
||||
"MASIQFLLNPLPSLPSSDRCPLPTPSPTISSSTAMLRSPRQKKQKMAKDAPIFQRGKPRGEVRYPPYEDRDGKFSCQHQD",
|
||||
"FRIHPLGNIADYPRHIPYNSDKKSFQERTGRESFEVFQYTFQLPGEEKQWTVMWDYNIGLVRTTHLFKCNDYSKTTPAKM",
|
||||
"LNQNPGLRDICHSITGGALAAQGYWMPYEAAKAIAATFCWKIRFALTPLFGDNFPDLCIHPDDRARFGRMVIDPGIVRIA",
|
||||
"TEKANLYRMLELRCSTTNSLRADYVLRPSSAPDIDRTDPNLERDRVALGRHILPKSHRHHHHRSKTSPSTNTSLVGYGSS",
|
||||
"PEVEYYSCGTEPYCVSPESPIRSSFTPVNTPRSTDIYPSSSSTNFLRSPHELLASLSSSASIARARIERASKISGARVIP",
|
||||
"SSVPSNVTSITTKGRDNTGHSALMEESDIDADAETDSGHEHDLDFELSSSDESSTSSTVSSSTSSASLGFAANSRNRPYR",
|
||||
"DDDEPHRDTDEEMVDYRAPKRIATAGARDRRWGRGRRVIHQEHSDIETSRRARKHAQRSSNARLVCEMTAAHALISLLHD",
|
||||
"ATGSDVDVDTHNRLECGRSPDGGVKNNLKGSYFGIRLNHNPSTESGQKRRRASA"]
|
||||
},
|
||||
{ "name" : "105954_BIPOR",
|
||||
"RefSeqID" : "XP_007691967",
|
||||
"UniProtID" : "W6Z1H5",
|
||||
"taxonomyID" : "930090",
|
||||
"sequence" : [
|
||||
"MNIQDLLNPSCGDRHDHRRSESATPPSRPVAILPALRRQKIPKDAPIFSEGNRTVGIVNFAPHEAGNDEELLAQHCRFQI",
|
||||
"YPLGEISRKGVRHIPYNSDKKDFLEKTGRDAFEMFQYTYKLPGEDKPYVVVWDYNVGLVRMTPFFKSCKYSKTIPAKTLR",
|
||||
"ENPGLKDISYSITGGALVCQGYWIPYQAARAIAATFCYDIRWALTPVFGNDFPSICLTPDDPSFAKFVIDPAIVRYCTEE",
|
||||
"TTKFRELGSAYEVHRPVAPTQVEAPTSRSDQPLSTSIVRQRRARPIDIESGYGTDTERNDRCLFSPEVSPRTRFTPINRP",
|
||||
"RSPYSPRTAESSFVSSPVSIRAPPGLHTPTSTPYEHSGEVFRAKRSHSKVAFCEHPADEAVIRPPTAATVDSAHGCEMCV",
|
||||
"GDDNHSHLDMDAAEMLLSLRTADSAMPPSKRTRRGS"]
|
||||
},
|
||||
{ "name" : "69819_WALME",
|
||||
"RefSeqID" : "XP_006959479",
|
||||
"UniProtID" : "I4Y911",
|
||||
"taxonomyID" : "671144",
|
||||
"sequence" : [
|
||||
"MTSPGLPKDFNELLDKSEIPSPKWQQITRDDRPITIARLKLPHPREKHTFILRRYDCNGISFGSLFKAAYPYATDEEEKI",
|
||||
"ESGFVKKNYDVTLVPTEEYQERKLAKLAGFWIPIAIAEELGQRYAMAEYVDALAKADTPDLTDFKKRSSNRQTSEDIKSS",
|
||||
"PAKAQASLESPAKSASKIPTPTKNPAPRRSARHQSRSPSPSPLTHNLTPGKKKAKKAPKEAVIEESVEETIVVDKKESPL",
|
||||
"KKALNDDQVLADIERAKDLVDDIKQSKNLSQSSPVKVVKEEVLETIQPSVSTESLEGEGKRKRELEDETGNEIKVVSFGQ",
|
||||
"NPPANPEEIQQRPVVQRRGVAAAVGAFALGVGFAASNILPRFLF"]
|
||||
},
|
||||
{ "name" : "02840_CRYNE",
|
||||
"RefSeqID" : "XP_568872",
|
||||
"UniProtID" : "Q5KM59",
|
||||
"taxonomyID" : "214684",
|
||||
"sequence" : [
|
||||
"MSHPAADAPPPYPGTTDDAQYDLTPLPHTANRPRLPEDKRNPHLNNLPEDTKIVKFQTIVRENKEIVVGRIKVPTENANG",
|
||||
"THHAFILRRYDTNAISLTTMYKVAFPSATEEEEKREMDWVKSSFDTRGTNGGRDSEVVRLAGQWVSRNLAIHIAPAYNLV",
|
||||
"QLVAALSRAVPDPNVAYRKSQRSQAAADELARTKAKQSQAPSSVPAISNVPVRKPQAAIPSMATEISSPASKRQRKDSVT",
|
||||
"EASGSATQTITEAQPSADTSETDDTRHITIEATTTITSPSGANVDMDAEIEQAKQLVKDLRQEIQLRNEAGDSLEDQGVA",
|
||||
"VADDVRGVKRGKHEDEAVVISGGAGGKDRVVRTNKRIPQTAGGDVGQRFGWGAFVFSIGLGASLTLFSQYASSLL"]
|
||||
},
|
||||
{ "name" : "11055_USTMA",
|
||||
"RefSeqID" : "XP_011390537",
|
||||
"UniProtID" : "A0A0D1DZM8",
|
||||
"taxonomyID" : "237631",
|
||||
"sequence" : [
|
||||
"MPAAASARKSTPTRKSTPRRARSSSVTSNASTGVPASPSASPRKTKKQKEAAAAAAAAVAAAAATAEQVNDDESDLLRPK",
|
||||
"LPTKRNPRLKEVDEAVVKLQIIKREGHNIIIGRVKLPTVNGQDHAFLLKRFDTNAMAASSMFRLAFPFADGTAEAAEMRF",
|
||||
"LDTKYDTNRANGGYIVEEVKVPETPKKRGRTRKTAENSKKESTPDTESVSADKQIRVLPEGSTGVRLQGTWIPAEDAIEV",
|
||||
"AEDYGIAKYALALIHATAEHAEDGGAPILTSEPVAEVKTPRKRQRVSAAAATASDTPDSPQLVQRVTRLENADGSISKVR",
|
||||
"VESTLEAPSSNGVPVALSQAEIEEQIAQAKALAAGIQQSITAGSGSASTRGQKRRAVNDRPTAEIDPLADDEDYSESGRV",
|
||||
"VRAFRRGTRVARRRPIATTAGAVAAAGAVGAGALAWVSGGNPEVAIQTLQASMQSIGLQNLQNLGLQNLQQIGTQLGAHL",
|
||||
"ASILPW"]
|
||||
},
|
||||
{ "name" : "XBP1_NEUCR",
|
||||
"RefSeqID" : "XP_962373",
|
||||
"UniProtID" : "Q7S9W7",
|
||||
"taxonomyID" : "367110",
|
||||
"sequence" : [
|
||||
"MLNQNPGLKDIAYSITGGAIKAQGYWMPYACAKAVCATFCYQIAGALIPLFGPDFPSECISPGEPRYGIMIIKPELISDT",
|
||||
"MRKAQELYRRYGNWGGGCTSSSPARRPLRTASSGSQERHHHHPYPNQEHLDHQQQQQRTVCSRRCPAEENSCVDARPQLR",
|
||||
"GISAPMPPAGEWTPPLLRSSAGRPRPVMPTSTHSSISYPERAPHRSAWTAVNHQPPNNSLDRYSLKRPLPSNEPDESVSH",
|
||||
"SNWPSRSQAPNPWLTAIPRSPRKTSSSPWASQPGSASRSRAGSIDSMASQHPQGLPSPSLILSSPSSSMVSLSSSNSPSP",
|
||||
"RPQLPPISQLCSLPVPSGRRRLPNGRPSRVGGDATSSHSRQDHSTCGAYQFSAGYQRALTPPSSTSAPMHWRSQRRPSLQ",
|
||||
"DQHEHEHIEDTQPRRIAVEANMECGDDNESHLHLPLPLPRTSSSASIVADKNANDTTSDNSSSRNFNSASIGSGRDDGQT",
|
||||
"SLAARKTAALTLLHLRQQEEEKEAAAAAAAAAAAAYSSTKRPESPSSSLSSPVSPPPTSGQPSPTLSAVVTATNLRRGTT",
|
||||
"TATATAVIDTTEPLAPPPSPSSNYLGSPISTSIASSSSSFSPSTSCNGTRENSVVANEMTRYAGQEADAGGPRHCNGDAD",
|
||||
"DEGDYEHEQQYRRKRRRLLLVGRAKSF"]
|
||||
},
|
||||
{ "name" : "XBP1_SACCE",
|
||||
"RefSeqID" : "NP_012165",
|
||||
"UniProtID" : "P40489",
|
||||
"taxonomyID" : "559292",
|
||||
"sequence" : [
|
||||
"MKYPAFSINSDTVHLTDNPLDDYQRLYLVSVLDRDSPPASFSAGLNIRKVNYKSSIAAQFTHPNFIISARDAGNGEEAAA",
|
||||
"QNVLNCFEYQFPNLQTIQSLVHEQTLLSQLASSATPHSALHLHDKNILMGKIILPSRSNKTPVSASPTKQEKKALSTASR",
|
||||
"ENATSSLTKNQQFKLTKMDHNLINDKLINPNNCVIWSHDSGYVFMTGIWRLYQDVMKGLINLPRGDSVSTSQQQFFCKAE",
|
||||
"FEKILSFCFYNHSSFTSEESSSVLLSSSTSSPPKRRTSTGSTFLDANASSSSTSSTQANNYIDFHWNNIKPELRDLICQS",
|
||||
"YKDFLINELGPDQIDLPNLNPANFTKRIRGGYIKIQGTWLPMEISRLLCLRFCFPIRYFLVPIFGPDFPKDCESWYLAHQ",
|
||||
"NVTFASSTTGAGAATAATAAANTSTNFTSTAVARPRQKPRPRPRQRSTSMSHSKAQKLVIEDALPSFDSFVENLGLSSND",
|
||||
"KNFIKKNSKRQKSSTYTSQTSSPIGPRDPTVQILSNLASFYNTHGHRYSYPGNIYIPQQRYSLPPPNQLSSPQRQLNYTY",
|
||||
"DHIHPVPSQYQSPRHYNVPSSPIAPAPPTFPQPYGDDHYHFLKYASEVYKQQNQRPAHNTNTNMDTSFSPRANNSLNNFK",
|
||||
"FKTNSKQ"]
|
||||
}
|
||||
]
|
||||
[
|
||||
{ "name" : "68476_WALME",
|
||||
"RefSeqID" : "XP_006957790",
|
||||
"UniProtID" : "I4YDD8",
|
||||
"taxonomyID" : "671144",
|
||||
"sequence" : [
|
||||
"MKEEKEKTPPNNITGPPTPAQNILHSTPAAFGTAGTVGQGAGGFGSQLYQSPYVDSQQSVIGSPVTPAPLPKKATLKTPQ",
|
||||
"PRIYSAVYSGVGVYEAMIRGIAVMRRRADGYMNATQILKVAGVDKGRRTKILEREILAGLHEKIQGGYGKYQGTWIPFER",
|
||||
"GRELALQYGCDHLLAPIFDFNPSVMQPSAGRSAKSPSKKRQNSIVLSPTQERHQSSIIALNTARASGIYVGGADDPNDDG",
|
||||
"LSKKEKSPVKKSKYDEVPVNVSKRPYVPPPGTNAHILTRTQQSLTALFQQPTTNSDFIPEAVAILDTTSGALHPDLAIDE",
|
||||
"LGHTALHWAASLGRISNVQQLIKKGADMKRGNIEGETPLERSVLVNDNYDKKTFAYLLQELGSSIRVVDRTGRSILHHIA",
|
||||
"LIAAVNGRSMSAKYYMENVLEYIARYENGEFKSLVDLQDEHGDTALNISARVGNRNLVKMLVDAGANKTVVNKLGLKASD",
|
||||
"FGVEHETLNSVTGDEMLSNLQPPPPLNVDSSASVLENIHNLLNGITQQYTDETSGKNALLFEIQAELKQHSHELADVRKE",
|
||||
"IQYWQNKATQMAEVDQKIKNINEAIENEKVQTWSLLGEANADKMEGIETSSSSNTSEIKIPTGDNEESLKQLRKLSKWLE",
|
||||
"GTQKLTEERVASIDGLSASKEVKYKSIVSVCTGVPVNEVEGMLAQLLEAMESDANADLNKVQEFLAREC"]
|
||||
},
|
||||
{ "name" : "00846_COPCI",
|
||||
"RefSeqID" : "XP_001831299",
|
||||
"UniProtID" : "A8N8X1",
|
||||
"taxonomyID" : "240176",
|
||||
"sequence" : [
|
||||
"MQASTRPPGSNQPPVKIYNAVYSSVQVYECMVRGIAVMRRRNDSYVNATQILKVAGVDKGRRTKILEKEILPGKHEIVQG",
|
||||
"GYGKYQGTWIPLERGRDIAAQYGVAPLLSPLFDFQPSTNSLGALPVSTPGGTASPRPLSASSSYSSMGVAGQYIPSSIPS",
|
||||
"NLPPAPIMPGSALRLLNQGRAQGLFTPSTTSATLRPAGYHSPGPYGTSYAPSPQPQSSQTPPPGSGLKRNRSEAEVEGYH",
|
||||
"SQPHDVQMADAPPPNTASQPNEDNPSPAKRLRTDGSITTEPASSQGQWQQQQPLPYASQQRSGPGLSQLSGHNGHGSSRP",
|
||||
"PSSLSAPNGNRPAHTNPEDQTRKTRFSSKPSMPRGMDPHMPFKDARRSALIALICHRDDPTSVIDLLREISADHLNPPSF",
|
||||
"DVDTVLDDQGHTALHLAASMARTQTVDMLIQTGADMHRGNHLGETPLIRACLATPNSDQQSFATLVNYLHDSIWTLDTSK",
|
||||
"KSVVHHIVSLAGVKGRAVVARYYLDQIFYWIAQHEGGDFRSLVDLQDEHGDTAINIAARVGNRSLVRTLLDVGANRVLAN",
|
||||
"KLGLRPGDFGVETEELSSGLRAEDLISSLRTGPPAPVQKSQDVIADMTSMIQSLSTEFQAEIKSKQDSLDVTQAHLRAAT",
|
||||
"RELSEQRKQIQTWQARCGDLDQINQRVRNVEKAIAEEDMFDWTGRTELDGKDGKEKGGPAFAYRGSKSTMVGVGGSVDVS",
|
||||
"FSVESEPPLPTTDTAASLVKLRRLKMWHQRMEELVKGRLKGLQGASAEKEYQCKKIVALCTGIPLDKVEEMLDNLVIAVE",
|
||||
"SEAQVVDIGRVSGFMQKVRDGII"]
|
||||
},
|
||||
{ "name" : "8533_BIPOR",
|
||||
"RefSeqID" : "XP_007691662",
|
||||
"UniProtID" : "W6ZE71",
|
||||
"taxonomyID" : "930090",
|
||||
"sequence" : [
|
||||
"MSTSHSFPAASPSHQQSALYANSPHGHALMAAPAALNRSFSDMSAFHHHAMDKPQIYTAVYSGVSVYEMEVNRVAVMRRR",
|
||||
"SDGWLNATQILKVAGVDKGKRTKVLEKEILTGEHEKVQGGYGKYQGTWINYRRGREFCRQYGVEDVLRPLLDYDITLDGS",
|
||||
"HAPGHAIETPTKEQAMAANRKRFYTQSIDGRTTTQNLTGTFFSNISSTATSALAAMNKVARLNSPAPRPSSSSQRRTSAT",
|
||||
"RPSQSQPPLASQDSFRTSSQQSITSEPSFAGHNGQTDSAYATAVDESQEPPRKRIRASHDDSYSQPTAADMSIHPLSSPT",
|
||||
"EPSESFDQHHPAQPITLADGDVPTALPPLPYPDTKQDEEKQAMLTDLFADQTRSDFTNHPAILHLSGPDLDMPIDNSSNT",
|
||||
"ALHWAATLARVSLIRLLVSKGANMFRGNASGQTALMSAVSVNNSLDHSCFPETLEILAPLIELRDSQGRTILHHIAVTCA",
|
||||
"IKGRAASSKYYLEALLEYLVRSNIGGGQPPPFHDTSNHSKPIGLMRFMQEMVNARDKAGNTALNLAARIGNRNIISQLME",
|
||||
"VQADPTIPNHKGTRPMDFGVGTDLGDGQGIITATSPTKAKAPLSKAEETSREIQPLMSGILQSASLQFTQEARLKQDAID",
|
||||
"QTNELITQLSSQQKQEQQKLQTLRARLRQRQDRAKRISNLKRWLEPQRHMLSVNDGAIDLHDKKRIGYADTQGAGLLIKE",
|
||||
"DDLPYELRQAGDHLDRRASDGPIYLSTSVPLDPSTLSQVSHQPQCQNFLLQQLPAASVLRQRIETYTATNTALLKRSRML",
|
||||
"KEKDGQLEMMYRKVVSLCTKVEENRIEECLEGLVAALDSEEGEGVEVGRVREFLRKVEGVD"]
|
||||
},
|
||||
{ "name" : "PGTG_02039",
|
||||
"RefSeqID" : "XP_003320997",
|
||||
"UniProtID" : "E3JX03",
|
||||
"taxonomyID" : "418459",
|
||||
"sequence" : [
|
||||
"MAAHKTTNDIPVSSSHHINPESGTGTSSTQAFPIPNIKNNPHVYMAVYSSVPVYEMMVRGIGVMRRRSDSYMNATQILKV",
|
||||
"AGLDKSKRTRILEREIIQGEHEKIQGGYGRYQGTWVPFTRAQELATQLNVAQLLAPLFDYRPEPNSEVNIRSTNTKPSSS",
|
||||
"ASRANSHKTTLARQTSRQSLNEKRERSGDTTPLPHDPPEAGPSKRSRLNTPSRQSNGSANTPSSLIDHSHSAMDPDFIIP",
|
||||
"HSQSQPTAASQCTTSTFAPIHGATVEYPAGPSHLRKSNSSSRSHLEVALKAERNIHTLMALFSNPPDGDELESETHHENP",
|
||||
"NSVAEVNEVLEDPELEIDTPIDEHCHTALHWASSLARLGLVRAFLRSGADVNRGNDVGETPLMRSTLVTNNFERESFNQL",
|
||||
"LELLHPSLWTLDNQDRTVLHHICLTASIKGRGESSRYYLECICEWIVNKHGAQFDSQLFDAVDLNGDTALNIAARVGNKH",
|
||||
"LVRMLLDVGADMTIGNNLGLKPIDFGVGAGETSASYTDDMISAPLRRNPTASAPARSSRDIITSITSSVNSLSEDFENEI",
|
||||
"RSKTDRLESVRAQLMVATRQLTTQRRQLESLKHDLDERALLELRLKKLRMAIAEEDGFDWTGRSDLDGRPAQAGKLFEQN",
|
||||
"GIASTLAGLSASQIQLELEPDPFIPPENNQDSLVYLRRLEKWYVRVLSLLRERIGRMKGSNLEQEAKYLKVIGSFIGNTC",
|
||||
"TNDLSSSGSSMTGRPANQTTSTTQEVPSRATQNVNPADIHDLESMDGHRRKVSTTDAVNKSHEFGRTRSELLKASMIDNK",
|
||||
"LLKQLMAAIESDGPELDLNRVAGFMQRVQSGSL"]
|
||||
},
|
||||
{ "name" : "MBPA_ASPNI",
|
||||
"RefSeqID" : "XP_664319",
|
||||
"UniProtID" : "Q5AYB5",
|
||||
"taxonomyID" : "227321",
|
||||
"sequence" : [
|
||||
"MTTSNHHQQRPSLSMSYSQGSIGSANGMSFSQSQMSSLNASQSVASTPRATPPPKSSQQSAMSFNYSNGLPNGARASFSG",
|
||||
"FEDMNGYGTMIYHEEFKPQIYRAVYSNVSVYEMEVNGVAVMKRRSDGWLNATQILKVAGVVKARRTKTLEKEIAAGEHEK",
|
||||
"VQGGYGKYQGTWVNYQRGVELCREYHVEELLRPLLEYDMNPNGTAASGQDSLDTPTKEQAMAAQRKRLYSGMENRSMSQP",
|
||||
"QQGTFFQNISRTAATAVNAMSKARFESPAARGGDSRRLSVIRKPSQQMGSQDAQPPFGSQQSFYSAASDSGFASNIPTNG",
|
||||
"RYAPQDAMSFEQEEPMEPPRKRIRSSQAFSLPIDGTSMSMSEPTPTEPNDSFYQDMEPLHHIDEGRHGLDPLPPATTPER",
|
||||
"FQKMKLIMTLFLDKTTKDFSTHPALIQLSGEDLEVPLDEYRNNALHWAAMLARMPLVYALVKKGVNIARLNGAGETALQK",
|
||||
"AVGTRNNLDYRSFPRLLQVLAPTIDMVDRSGRTILHHIAVMAATGHGGHVSAKHYLEALLEFIVRHGGTSLNQQSNGTAS",
|
||||
"QPGMPLSNEVITLGRFISEIVNLRDDQGDTALNLAGRARSVLVPQLLEVGADPHIPNHTGLRPADYGVGVDMVDGSSQPA",
|
||||
"GSRSDTFLAQLAKTRKEILEATTAQVTAIVQETLGTFDKELAASLTSKQEKFDHWHAKIRESAKARQIEQKQLDELKRRS",
|
||||
"IDRTETSRRLKNLEKSSTDLLEAHKEILTNLGDTSKPVSLGDADQESGFEIAEFEALFPETFDPASGFSEAQIAYLRKLP",
|
||||
"SAEILEQRVSCYRAFNKETLDEIDALRSKNVVLGQNYRRMVMACTGWSAEQVDEAAEGLTQCVKELNDNPVPEDEAIEIL",
|
||||
"MRDRGQDW"]
|
||||
},
|
||||
{ "name" : "05520_CRYNE",
|
||||
"RefSeqID" : "XP_570545",
|
||||
"UniProtID" : "Q5KHS0",
|
||||
"taxonomyID" : "214684",
|
||||
"sequence" : [
|
||||
"MEPPSNPIQPPVTPSHHSLLSAISPALSEQTPAPIHTLPPHLRPSIPQPHIAPPRPSSVQPTMEEQQRMHHIQQHQQQQH",
|
||||
"FQQQQNDENVFGSVMGAPGHVPGHEAPMSTQPKVYASVYSGVPVFEAMIRGISVMRRASDSWVNATQILKVAGVHKSART",
|
||||
"KILEKEVLNGIHEKIQGGYGKYQGTWVPLDRGRDLAEQYGVGSYLSSVFDFVPSASVIAALPVIRTGTPDRSGQQTPSGL",
|
||||
"PGHPNQRVISPFANHGQTTPHMPPPQFIHQGNEQMMNLPPHPSSLAYPTQPKPYFSMPLQHTVGPQYDERHEGMTMTPTM",
|
||||
"SMDGLAPPADIARMGFPYNPSDIYIDQYGQPHATYQASPYGKESGHPSKRQRSDAEGSYIESGAAVQQHVEQDEEADDGL",
|
||||
"DNDSTASDDARDPPPLPSSMLLPHKPIRPKATPANGRIKSRLVQIFNVEGQVNLRSVFGLAPDQLPNFDIDMVIDDQGHS",
|
||||
"ALHWACALARLSIVQQLIELGADIHRGNYAGETPLIRAVLTSNHAEAGSFTDLLHLLSPSIRTLDHAYRTVLHHIALVAG",
|
||||
"VKGRVPAARTYMASVLEWVAREQQANNTHSITNPPNPADRNELAPINLRTLVDVQDVHGDTALNVAARVGNKGLVGLLLD",
|
||||
"AGADKTRANKLGLRPENFGLEIEALKISNGEAVMANLKSEVSKPERKSRDVQKNIATIFESISSTFSSEMLAKQTKLNAT",
|
||||
"EASVRHATRALADKRQHLHRAQEKLATMQLFEQRSENVRRIMDAIAAGTLLTPAEFTGRTQTMHEKSTGQLPPLAFRHVP",
|
||||
"GLALDASSQSQLNGAPPSTPLSVEDQEDIALPERDDPECLVKLRRMALWEDRIAEVLEDKIRAMEGEGVDRAVKYRKLVS",
|
||||
"VCAKVPVDKVDSMLDGLVAAVESEGQGLDFSRASNFVNRIKATKS"]
|
||||
},
|
||||
{ "name" : "RES1_SCHPO",
|
||||
"RefSeqID" : "NP_595496",
|
||||
"UniProtID" : "P33520",
|
||||
"taxonomyID" : "284812",
|
||||
"sequence" : [
|
||||
"MYNDQIHKITYSGVEVFEYTINGFPLMKRCHDNWLNATQILKIAELDKPRRTRILEKFAQKGLHEKIQGGCGKYQGTWVP",
|
||||
"SERAVELAHEYNVFDLIQPLIEYSGSAFMPMSTFTPQSNRKPTEAYRRNSPVKKSFSRPSHSLLYPYTSSNNMTSTSRMS",
|
||||
"GIHDALSLQSDFTRSPDMPSDSFTGSLHDIKASPFSSNNYAQSLLDYFLLPNTTQPPDFVYDRPSDWDVNAGIDEDGHTA",
|
||||
"LHWAAAMGNLEMMHALLQAGANVVAVNYLQQTSLMRCVMFTMNYDLQTFEVVSELLQSAICMNDSFGQTVFHHIALLASS",
|
||||
"KSKMEAARYYMDILLQNLTATQSVDVAAQIINLQDDHGDTALLICARNGAKKCARLLLSFYASSSIPNNQGQYPTDFLSS",
|
||||
"KDMSFPENDDSPLNSKIEDNLIDNLKYPQSLDDHLSSKKPISYFSNKLTHQTLPNVFTQLSELSKCHEASLAEKQLTYNL",
|
||||
"AMEALEQTVRETETCQRLWNERTNNDENYLVNQREDLIHQCKKFLHTLKTARYYLETVQLHQLKKYVTYFSQIWSTDELA",
|
||||
"DISETKNLVGHDTKTNRSSLSSKHEVDLFTAENEAAREKLVEQLCSLQAQRKQKINEILNLLSMGMYNTINTDQSGS"]
|
||||
},
|
||||
{ "name" : "CDC10_SCHPO",
|
||||
"RefSeqID" : "NP_596132",
|
||||
"UniProtID" : "P01129",
|
||||
"taxonomyID" : "284812",
|
||||
"sequence" : [
|
||||
"MASANFIRQFELGNDSFSYQKRPEDEPSQPLSNRNINKLNDSSTLKDSSSRIFINSQVLRDGRPVELYAVECSGMKYMEL",
|
||||
"SCGDNVALRRCPDSYFNISQILRLAGTSSSENAKELDDIIESGDYENVDSKHPQIDGVWVPYDRAISIAKRYGVYEILQP",
|
||||
"LISFNLDLFPKFSKQQQIESSSISKNLNTSSFNTRSPLRNHNFSNPSKSSKNGVHTINNMQSSPSPSSSFLLPLTQIDSQ",
|
||||
"NVKRSNNYLSTSPPILEQRLKRHRIDVSDEDLHPSSQLNDNEASSLFPDTPRLNHSLSFVSLVSSLPPLDQNIMQDYHTS",
|
||||
"KDILTSIFLDVNFADSSALEAKLSDSLDLDVPIDELGHAALHWAAAVAKMPLLQALIHKGANPLRGNLTGETALMRSVLV",
|
||||
"TNHLNQNSFGDLLDLLYASLPCTDRAGRTVVHHICLTAGIKGRGSASRYYLETLLNWAKKHASGNNGYMLKDFINYLNHQ",
|
||||
"DKNGDTALNIAARIGNKNIVEVLMQAGASAYIPNRAGLSVANFGIFVENALKQPEDSKQTKVSLMSENLSSKEKTAVPPR",
|
||||
"QKSRDIIASVTDVISSLDKDFQDEMAAKQSMIDSAYTQLRESTKKLSDLREQLHVSETQRTLFLELRQRCKNLMTSIEEQ",
|
||||
"KSELSNLYESFDPNGIHDSLSLDADAPFTVNENNNKNLSIAELKFQVAAYERNEARLNELANKLWQRNSNIKSKCRRVVS",
|
||||
"LCTGVDESRVDSLLESLLQAVESDGQQGEVDMGRVAGFLRVVKEHQA"]
|
||||
},
|
||||
{ "name" : "05338_USTMA",
|
||||
"RefSeqID" : "XP_011392041",
|
||||
"UniProtID" : "A0A0D1BWD8",
|
||||
"taxonomyID" : "237631",
|
||||
"sequence" : [
|
||||
"MPLNYFANQDQTASDTYAHEASSFPAPSSILTDTSKPLQPVQEVAASSLVDGVSFTSPHASIIHASKQSPRAASSLSFTT",
|
||||
"SALQRAGLLPANPNMSTTATSGTSAASESLQRVITQGTASAAAINGASTPAHSGPLTPAHLKNLTPAQANAALQNPVGNI",
|
||||
"PTVYLATYSNVPVYEITVRGIAVMRRRGDGWLNATQILKIAGIEKTRRTKILEKSILTGEHEKIQGGYGKFQGTWIPLQR",
|
||||
"AQQVAAEYNVSHLLQPILEFDPATADQIPKLYQRKKPAASARNSSASAINDARGSTPSKIYSPAPASLGGPSQQPRFLSL",
|
||||
"RPPKETHEQEISSAIFMPPGTAGLLSNGTFVDDRAASALAYPGPPAIPPGSTPAEQAALRSYNVYGYTPQGVPLPSSAAA",
|
||||
"DGNGTEAAATAASTGAGKREASETDQDGASAAKRSRLTSPQQQRRDDGLLLGPSPVKDLNALGPAGGSLRAASAPRGHRI",
|
||||
"TVGPPDAAGRDGAVPRYADRALPPKPYDEGEKRMRDRLVSLFSDDGVLPGVSEATGAGASQSAADEDDDAYVAKLDSLLA",
|
||||
"DLREKASLGGLGASGTDGPKATVDLITDDHGHTALHWASALCRVKLVRTLVARPPWQGGANIHAGNHAGETALHRSVLVT",
|
||||
"NSYDASSFPTLLNLLSSSLNTRDFKKRTVLHHISLVAALKGRAASARYYLACVLEHISAEKNSKYKGLIDAQDEDGETAL",
|
||||
"GIVARLGNASMVRMLLDVGARKDLANALGIRPSDWGIESSADGASLTPSQNDGTNTVASLPPLTAADLASQNPSDIISAL",
|
||||
"TRPAQVPVMKSSDVRDQLSSTLDDLQSSFERELKEKQDAVSTVQSHLQAATRDLAARRKTVSAAQAKLAEKDEARQRVQN",
|
||||
"LRRAIVAQLGLEEADADLSLEQLVEEAANAASAAPADKSADKMDIDGAEDVKPVRASNLETLIDDILSFDTIQSDLKAVG",
|
||||
"TSAVTQEVVEQDELVRLRWLVSFYQSSCDELSSTISELEDSSAKKESQCQQVVAICANIPQDKVESMLDELLTAMESDGP",
|
||||
"DVDLARVANFMQKVGKTRENGDQPGVGAQLSSSTSLSTAVSSGGTAASSVVPAVERDGEDAKPDA"]
|
||||
},
|
||||
{ "name" : "SWI4_SACCE",
|
||||
"RefSeqID" : "NP_011036",
|
||||
"UniProtID" : "P25302",
|
||||
"taxonomyID" : "559292",
|
||||
"sequence" : [
|
||||
"MPFDVLISNQKDNTNHQNITPISKSVLLAPHSNHPVIEIATYSETDVYECYIRGFETKIVMRRTKDDWINITQVFKIAQF",
|
||||
"SKTKRTKILEKESNDMQHEKVQGGYGRFQGTWIPLDSAKFLVNKYEIIDPVVNSILTFQFDPNNPPPKRSKNSILRKTSP",
|
||||
"GTKITSPSSYNKTPRKKNSSSSTSATTTAANKKGKKNASINQPNPSPLQNLVFQTPQQFQVNSSMNIMNNNDNHTTMNFN",
|
||||
"NDTRHNLINNISNNSNQSTIIQQQKSIHENSFNNNYSATQKPLQFFPIPTNLQNKNVALNNPNNNDSNSYSHNIDNVINS",
|
||||
"SNNNNNGNNNNLIIVPDGPMQSQQQQQHHHEYLTNNFNHSMMDSITNGNSKKRRKKLNQSNEQQFYNQQEKIQRHFKLMK",
|
||||
"QPLLWQSFQNPNDHHNEYCDSNGSNNNNNTVASNGSSIEVFSSNENDNSMNMSSRSMTPFSAGNTSSQNKLENKMTDQEY",
|
||||
"KQTILTILSSERSSDVDQALLATLYPAPKNFNINFEIDDQGHTPLHWATAMANIPLIKMLITLNANALQCNKLGFNCITK",
|
||||
"SIFYNNCYKENAFDEIISILKICLITPDVNGRLPFHYLIELSVNKSKNPMIIKSYMDSIILSLGQQDYNLLKICLNYQDN",
|
||||
"IGNTPLHLSALNLNFEVYNRLVYLGASTDILNLDNESPASIMNKFNTPAGGSNSRNNNTKADRKLARNLPQKNYYQQQQQ",
|
||||
"QQQPQNNVKIPKIIKTQHPDKEDSTADVNIAKTDSEVNESQYLHSNQPNSTNMNTIMEDLSNINSFVTSSVIKDIKSTPS",
|
||||
"KILENSPILYRRRSQSISDEKEKAKDNENQVEKKKDPLNSVKTAMPSLESPSSLLPIQMSPLGKYSKPLSQQINKLNTKV",
|
||||
"SSLQRIMGEEIKNLDNEVVETESSISNNKKRLITIAHQIEDAFDSVSNKTPINSISDLQSRIKETSSKLNSEKQNFIQSL",
|
||||
"EKSQALKLATIVQDEESKVDMNTNSSSHPEKQEDEEPIPKSTSETSSPKNTKADAKFSNTVQESYDVNETLRLATELTIL",
|
||||
"QFKRRMTTLKISEAKSKINSSVKLDKYRNLIGITIENIDSKLDDIEKDLRANA"]
|
||||
},
|
||||
{ "name" : "SWI6_NEUCR",
|
||||
"RefSeqID" : "XP_962967",
|
||||
"UniProtID" : "Q7SBG9",
|
||||
"taxonomyID" : "367110",
|
||||
"sequence" : [
|
||||
"MQPPQLGGASQQSQPSSQQSFSMSQSSQSVYRQYTDPPNRLHNDHAVPTIYSATYSGVGVYEMEVNNVAVMRRQKDGWVN",
|
||||
"ATQILKVANIDKGRRTKILEKEIQIGEHEKVQGGYGKYQGTWIPFERGLEVCRQYGVEELLSKLLTHNRGQEGETGNVDT",
|
||||
"PTKEQAMAAQRKRMYNASSQENRGIGSTGTFFKNISSTASTAVAAISKARFDSPAPRNRSGPSRAPSFNRQSSMQDVADF",
|
||||
"PNSQQSLVSTEYATQTQNADSGFGSQTTQPLAGDGLEQPPRKRQRVLTPARSFGGQTPGHQPLDPFNAGNIANGDSGSPT",
|
||||
"EPSNSFNYDQVTANDGDASYALGPLRPLPYENNADAEAKRGMLMGLFMDANGPEEAIQAALCNVSPQELDSPIDTQSHTA",
|
||||
"LHWAATLSRMPLLRALIHAGANPWRVNACGETALMRACTVTNSMENNTFPELLDLLGCTLDVTDDKGRTVLHHIAVTSAV",
|
||||
"KGRHYASRYYLESLLEWVVRQGSAPSSQENGIGDRKGRRMGIARFMSEIVNAQDNSGDTALNVAARVGNRSIISQLLEVG",
|
||||
"ADPTIPNRANLKPLDFGIGIADAETNDDPAQEKTGATTGSGHKSRETSDEVVRSITHLIGESASIFQNELKKKQESIDTL",
|
||||
"HSQLRVTSSQVGDARRTLESLQEKLKAQQLAKQKIVNFNRACEEEEQILIELEQRHGRLDVASANAWEMELESALEIVKT",
|
||||
"QSPKGLDPDSRPSLPSAAVLRARIKALRARSSKTRQAVAALQAQSKEKELKYRRLVSLCTRRPEIEVEALLDTLTRAVES",
|
||||
"EKPELEIARVRRFLGGVEGVVH"]
|
||||
},
|
||||
{ "name" : "15042_USTMA",
|
||||
"RefSeqID" : "XP_011388143",
|
||||
"UniProtID" : "A0A0D1CVS5",
|
||||
"taxonomyID" : "237631",
|
||||
"sequence" : [
|
||||
"MSTASPLHHGHGNGSYANSPAPTGVTGRDAGVAAAAVADSAVRSGSVPASASGSAPGSASGSMYGEAHTQHHTGHHHYSA",
|
||||
"HHTHSHGALTSPVNGGHSSSWSPYGYPAAPVYGGSPSPYGHNAYSQYASGYGYANGTAHHVATAPTTPSATSTAYHTGVN",
|
||||
"GMMMHHGQHAGYGYSSHHLGSHTPTHTHTHSSAYFMNGDGAHSHLNSSAHLTSPSYTTAPQYSTQLPLAGRHRVTTTLWE",
|
||||
"DEGTLCFQVDARGVCVARRHDNNMINGTKLLNVCGMSRGKRDGILKNEKERIVVKVGAMHLKGVWISFARAKQLAEQNGI",
|
||||
"ADALYPLFEPNIQSFLYHPDNYPRTAAVIAAAQERQAQRQRAPGGQPSPGANGTSQAPPLMRANTTPSNGDTSTFSSGLS",
|
||||
"SLGSWTGSHDQGHASAPTTAQPSPSSMHNGATQMHMSLSNHGTASPTYAQSQQQQQQQQQQQQQQQQQQQQQQQQAYPMT",
|
||||
"AAQQLARPSVGDRRQSAPISLNNSVGHAENPYGATNLGGAANGGLVNGARKVSGLKRSWNDADDLNGSAAASPTERDMQR",
|
||||
"SGSGGSNGLKLDGDDLHSPDSSDDRLAKKTRGMPQRGGGATTAMPSMSTNMLMGVGNGSGIHHE"]
|
||||
},
|
||||
{ "name" : "04778_USTMA",
|
||||
"RefSeqID" : "XP_011391646",
|
||||
"UniProtID" : "A0A0D1DQM4",
|
||||
"taxonomyID" : "237631",
|
||||
"sequence" : [
|
||||
"MNQAPLSATGVNFYISGPRPARLFPTPIHEFRKGKYATAGGESGFMTVFEYDVRGHTMMIDVDTSFVRFTSITQALGKNK",
|
||||
"VNFGRLVKTCPALDPHITKLKGGYLSIQGTWLPFDLAKELSRRIAWEIRDHLVPLFGYDFPSTCLRPDSEGFGQLAIGMS",
|
||||
"QKRARKRHNNGGPHQTSCYGPSLPISIELWQHSTDPLRDLGESSVVGGQAIEHVSAKNSAVQPCYGSSQPATFHYSKGYG",
|
||||
"LESRPWYGQDYLESNSLESMWNSAQAGGGSVGLQVPISTCGATASPCLAAIGANGGSPILSSPPSSNASSSSNQSYTAAG",
|
||||
"YGLMVPPTVPSHSVNSEAGANQAEGPTPIDGSRSYASLTAHGYATGYGDANASLSTWNDATHASTFTLHVHAHVHFQPPD",
|
||||
"PESAQLFTIHDFGSDPFYAEQVERG"]
|
||||
},
|
||||
{ "name" : "STUA_ASPNI",
|
||||
"RefSeqID" : "XP_663440",
|
||||
"UniProtID" : "P36011",
|
||||
"taxonomyID" : "227321",
|
||||
"sequence" : [
|
||||
"MASMNQPQPYMDVHSHLSSGQTYASHPATAGALTHYQYPQQPPVLQPTSTYGPASSYSQYPYPNSVASSQSVPPPTTSIS",
|
||||
"SQVPAQLLPLPVTNHPVPTHGYGNNSGTPMQGYVYDPTGQMAPPGAKPRVTATLWEDEGSLCYQVEAKGVCVARREDNGM",
|
||||
"INGTKLLNVAGMTRGRRDGILKSEKVRNVVKIGPMHLKGVWIPFDRALEFANKEKITDLLYPLFVQHISNLLYHPANQNQ",
|
||||
"RNMTVPDSRRLEGPQPVVRTPQAQQPPSLHHHSLQTPVPSHMSQPGGRPSLDRAHTFPTPPARMNSSVPNTQPLSIDTSL",
|
||||
"SNARSMPTTPATTPPGNNLQGMQSYQPQSGYDSKPYYSAAPSTHPQYAPQQPLPQQSMAQYGHSMPTSSYRDMAPPSSQR",
|
||||
"GSVTEIESDVKTERYGQGTVAKTEPEQEQEYAQPDSGYNTGRGSYYTTNPSVGGLAHDHSQLTPDMTGSPQQNGSGRMTP",
|
||||
"RTSNTAPQWAPGYTTPPRPAAASSLYNIVSDTRGTSGANGSTSDNYSVASNSGYSTGMNGSMGSNKRMRDDDDDRIVPPD",
|
||||
"SRGEFDTKRRKTLTETPVGGPVGGVPLGLQPMKAGGSLISARR"]
|
||||
},
|
||||
{ "name" : "STUA_NEUCR",
|
||||
"RefSeqID" : "XP_960837",
|
||||
"UniProtID" : "Q1K6U0",
|
||||
"taxonomyID" : "367110",
|
||||
"sequence" : [
|
||||
"MNPNTPADVYYGQMSQGSSMPVTTVPSHSHYASQQPPPLLQPGSTYAHQYGTPQYGYANALSSPASIPPSLPPSMNSMAG",
|
||||
"QSVLPLPGSGSMNPAVYASGGFDTTGQVAPPGMKPRVTATLWEDEGSLCFQVEARGICVARREDNAMINGTKLLNVAGMT",
|
||||
"RGRRDGILKSEKVRHVVKIGPMHLKGVWIPFERALDFANKEKITELLYPLFVHNIGALLYHPTNQSRTSQVMAAAEQRRK",
|
||||
"DSHGQLRGPPGLPSLQQHHHHHSMLPGPPSLPSHPSMGRPALDRAHTFPTPPTSASSVMGPMGNSDGYQWSQQSMSGTQG",
|
||||
"NSSLSLDTSLGSNARSMPSTPATTPPGSTIQSMQNYPPVSQSYESSRQMYQGQSAQQAQYQSQQHYSSQPQHQERPVYSQ",
|
||||
"SSYIKNDMGPPSGRPTGQSNDASDSKPPTGMIHQGQGQSDPGTHAGSEEDDDANNEAEYTHDSGGYDANRGSYNYNTQAV",
|
||||
"NSLPHDHGLAPEIGGSPHQAGSGRATPRTAAAPSSYYSAQGYHTPPRGQPSSSLYNVMSNERTGSNGTQGNEMYAGQADM",
|
||||
"PSSLPNGYSAQPSVMNGSSGGLKRGRDDDDDGGRPTTSAPNLGPGMDMKRRKTMMDGGSLPSPTYTATIAQAAPSAIAAH",
|
||||
"RRR"]
|
||||
},
|
||||
{ "name" : "PHD1_SACCE",
|
||||
"RefSeqID" : "NP_012881",
|
||||
"UniProtID" : "P36093",
|
||||
"taxonomyID" : "559292",
|
||||
"sequence" : [
|
||||
"MYHVPEMRLHYPLVNTQSNAAITPTRSYDNTLPSFNELSHQSTINLPFVQRETPNAYANVAQLATSPTQAKSGYYCRYYA",
|
||||
"VPFPTYPQQPQSPYQQAVLPYATIPNSNFQPSSFPVMAVMPPEVQFDGSFLNTLHPHTELPPIIQNTNDTSVARPNNLKS",
|
||||
"IAAASPTVTATTRTPGVSSTSVLKPRVITTMWEDENTICYQVEANGISVVRRADNNMINGTKLLNVTKMTRGRRDGILRS",
|
||||
"EKVREVVKIGSMHLKGVWIPFERAYILAQREQILDHLYPLFVKDIESIVDARKPSNKASLTPKSSPAPIKQEPSDNKHEI",
|
||||
"ATEIKPKSIDALSNGASTQGAGELPHLKINHIDTEAQTSRAKNELS"]
|
||||
},
|
||||
{ "name" : "08099_COPCI",
|
||||
"RefSeqID" : "XP_001836714",
|
||||
"UniProtID" : "A8NVH3",
|
||||
"taxonomyID" : "240176",
|
||||
"sequence" : [
|
||||
"MSTGMLQETLQTTSASTSGTRFRPYASPNHQVTKGRYITSNDPRGYIPVYEYPLNGQWIMMDIDDGYILWTGIWKALGNS",
|
||||
"KADIVKMIDSQPDLAPLIRRVRGGYLKIQGTWMPYEVALKLSRRVAWPIRHDLVPLFGPTFPSTCLSPDQPGYGQVVASS",
|
||||
"NVRRRARRNTQATAQPPREAHSNWTVMTPGPMVGLSFPHSQFSRPPLPPLAPTPARSPSDYAPSSHYGNQLDPQDARRYS",
|
||||
"HSPYSPLASPPERKSSISSKALSLEIPPVRPSSSKAREDISLPPLKQPDGADPEMSPYALPPISALEDLRGVDTQDSAAV",
|
||||
"LRRLRLDDDYPSSSRSSTSQDSIWGRRHSLSAHSPHPRSSDNSRFQPYLSSRSYQDSTLKRSRSPAESYADRRRASDFSQ",
|
||||
"EDSTSAYSPISPATPNSSILSHSSFSDLKKLASSTDTRYNFPRISGRDWAPLKGDTDHIRSSYRSGPSPLELDSDSESSA",
|
||||
"PHRPW"]
|
||||
},
|
||||
{ "name" : "68479_WALME",
|
||||
"RefSeqID" : "XP_006957792",
|
||||
"UniProtID" : "I4YDE0",
|
||||
"taxonomyID" : "671144",
|
||||
"sequence" : [
|
||||
"MTNKVQELWWEENKTRVWQVEVDNGNYVARRQDNDQINGTKLLNITKITRGKRDGILKNEKSRQVVKTGTITLKGVWIPF",
|
||||
"ERAIILARQFNIEQQLYPLFETNLGDYVENSIGSHQIKRKSLNNLMDSLTTNRELVSKRRSTVSTYNPATSAYVSPYGFS",
|
||||
"PQHCYQTEFEDMNQHSGEIQSGRPRNTSSASDWMTNWSTSSSSPVIPATPNTFSPVMNTFQSLALHSPPIPIPNYYYDSS",
|
||||
"SSYFPSYHQKQQQQQVQMQMQMHTTASIGGDRQSNEYIQR"]
|
||||
},
|
||||
{ "name" : "11943_PUCGR",
|
||||
"RefSeqID" : "XP_003330006",
|
||||
"UniProtID" : "E3KMR2",
|
||||
"taxonomyID" : "418459",
|
||||
"sequence" : [
|
||||
"MAAAPTSSFLTSMSAQPPRTVQALVNEEVRAPPPVRLYPSQHRVSMTRYATSTDPRGYIPVFEYPLNGQYIMIDCETGMV",
|
||||
"HFTGIWKALGHTKADVVKLVESDPTIAPYLRKVRGGYLKIQGTWLPFDTAQTLARRVAWQVRYDLVPLFGPDFPDTCLGP",
|
||||
"GEPGFGQLLLSAPKPRGRRGAKKAAAAPTVAHERTASPQDNRSQSRPGPYPSQESFGNRCSGRVEAVGAMNGYSPMLSQA",
|
||||
"RYSPYTRAPVHRITQLEPLPSLIQPNQSCPHPTADSMYSSHYHQSPRQSMMTSHGAGPYGQQHLTGSTASGMQSTAPLPS",
|
||||
"MRPHQAHQSENNFFETYRGPDSFEALSNKWLAPEVANPSLNDSGLLHGEGGCLPPLQYSNNPVLRNGPSGSPTNQYNFPN",
|
||||
"QIDSAHSSHHIDSNQTQHVHRHAGFPYESQHQSNFRHDLSTEEAAHHPASPSQQPPPSVTYDKAHNSEPQAGSQAANVTA",
|
||||
"GCYAASGSNSTGNPAGSPGSHSSHVPKSPTPSSASTSTHMQNSHNPNSHRSPSNTLTNMSNNGGFNSNTQGEEAIQFSVL",
|
||||
"TSPAHLETSGPSENSIPPAQSSDSDWNPAQNTTGLSPSQAPRQ"]
|
||||
},
|
||||
{ "name" : "03082_PUCGR",
|
||||
"RefSeqID" : "XP_003321545",
|
||||
"UniProtID" : "E3JYK1",
|
||||
"taxonomyID" : "418459",
|
||||
"sequence" : [
|
||||
"MILISPTRTLPSPRPIDTDPILNYRHIQPAAAAAAVGPWLGQNQHHHHHHDTLAKSPNITTAPATHSPSELSASPAPSAV",
|
||||
"STGSSLLDPQSVPHIKIPHSSSPPAIMLPQPSSDDDSSTAEEEQPSAQSSNATLNTPTPHTNAPHQLDSHASSVGLYDLP",
|
||||
"PTSSSAPTTSSSSSPFPSNVPSHQQPSPYSSSPHPNQEHHPHHPHHGNQFYQQSPPALHSPLQSAHHPQQSFDARPHSSL",
|
||||
"FAHQHYHSRPQSAPHSTSQFSLDPHVLAAAAANVEVKKWDEENTYYYQVAHKGVTVGRLKGSGLVNGTKLLNLAGISRGK",
|
||||
"RDGILKNEKIRKVVKHGTMHLKGVWIAFDRAVFLAEQHSIADKIFPLLVVNLEHYVPIEPPLMAGGTKLGPGSLFHHHHP",
|
||||
"RHPRLLPQPIKFPPSTISLAPASANSFSSTGGWPSGPSSALPSIGYNEPFSAPPIPRSAATADTSPSIYEQAQFQYLNSA",
|
||||
"QANNPDLLERRHTLPNNSFHGYNSVPSFGSSQPPPPVSYSFHYNSTHVPGYPPRSSTAESATPNQFEYQSKNHNGNGNGD",
|
||||
"AAGSYPATLYHSQPAARPVSSTTAQPSPALNSAPLLLGDLSPGSSTQIVDHGAGDFRLSTGTSNGQVKQEGDDESCNEKR",
|
||||
"LIMEWNPSC"]
|
||||
},
|
||||
{ "name" : "SOK2_SACCE",
|
||||
"RefSeqID" : "NP_013729",
|
||||
"UniProtID" : "P53438",
|
||||
"taxonomyID" : "559292",
|
||||
"sequence" : [
|
||||
"MPIGNPINTNDIKSNRMRQESNMSAVSNSESTIGQSTQQQQQQQQYLGQSVQPLMPVSYQYVVPEQWPYPQYYQQPQSQS",
|
||||
"QQQLQSQPQMYQVQESFQSSGSDSNASNPPSTSVGVPSNATATALPNGSAITTKKSNNSTNISNNVPYYYYFPQMQAQQS",
|
||||
"MAYSYPQAYYYYPANGDGTTNGATPSVTSNQVQNPNLEKTYSTFEQQQQHQQQQQLQAQTYPAQPPKIGNAFSKFSKSGP",
|
||||
"PSDSSSGSMSPNSNRTSRNSNSISSLAQQPPMSNYPQPSTYQYPGFHKTSSIPNSHSPIPPRSLTTPTQGPTSQNGPLSY",
|
||||
"NLPQVGLLPPQQQQQVSPLYDGNSITPPVKPSTDQETYLTANRHGVSDQQYDSMAKTMNSFQTTTIRHPMPLIATTNATG",
|
||||
"SNTSGTSASIIRPRVTTTMWEDEKTLCYQVEANGISVVRRADNDMVNGTKLLNVTKMTRGRRDGILKAEKIRHVVKIGSM",
|
||||
"HLKGVWIPFERALAIAQREKIADYLYPLFIRDIQSVLKQNNPSNDSSSSSSSTGIKSISPRTYYQPINNYQNPNGPSNIS",
|
||||
"AAQLTYSSMNLNNKIIPNNSIPAVSTIAAGEKPLKKCTMPNSNQLEGHTITNLQTLSATMPMKQQLMGNIASPLSYPRNA",
|
||||
"TMNSASTLGITPADSKPLTPSPTTTNTNQSSESNVGSIHTGITLPRVESESASHSKWSKEADSGNTVPDNQTLKEPRSSQ",
|
||||
"LPISALTSTDTDKIKTSTSDEATQPNEPSEAEPVKESESSKSQVDGAGDVSNEEIAADDTKKQEK"]
|
||||
},
|
||||
{ "name" : "14426_COPCI",
|
||||
"RefSeqID" : "XP_002911429",
|
||||
"UniProtID" : "D6RMB0",
|
||||
"taxonomyID" : "240176",
|
||||
"sequence" : [
|
||||
"MTARPPLPLRHANPSLRDGNATIPPVKYQILSCQGKDILVGRLKIDTTDGGHAFILRRFDTQAISLTTMFRAAFPTASEA",
|
||||
"EEKDEINYVKANFDLFGNNGSSKEPHITRLAGTWVNRDTAGQLAHDYNMVDLINTMVEAEPDPNGQYRRSNKSAQNNNPP",
|
||||
"TNAPEPTPATNVHATRSPAKQSPKPPSKTLPTPSPGSGDAQPPAPKRRREGSPATFTSGIPVASSPAVPKTPGPRRSTRT",
|
||||
"KSPAPSRVPQPLTATKPRSRASVAPPSPKKRPVDLPKSSPIKAEEDTAVEDNVAGNELYAQDISEQKKLIADLKAAASSK",
|
||||
"KPADTVKEDDDQQMEEEGQGPSKLKRIRQDEEKPLQFEFKEPEREERQIATNRRVGRFDMQPERKSLAWGIAAFAFGMTA",
|
||||
"ITYLPNFL"]
|
||||
},
|
||||
{ "name" : "BQT4_SCHPO",
|
||||
"RefSeqID" : "NP_596166",
|
||||
"UniProtID" : "O60158",
|
||||
"taxonomyID" : "284812",
|
||||
"sequence" : [
|
||||
"MTENEKSRSLPAERNPLYKDDTLDHTPLIPKCRAQVIEFPDGPATFVRLKCTNPESKVPHFLMRMAKDSSISATSMFRSA",
|
||||
"FPKATQEEEDLEMRWIRDNLNPIEDKRVAGLWVPPADALALAKDYSMTPFINALLEASSTPSTYATPSRPTAQKSETSEG",
|
||||
"EPESSTSATTTSVARRTRQRLAEHLENSKKTILQHDNKEEDKEIHSEENETKDEIKSEKKEPEIKKQEGGSSTEKVGQPS",
|
||||
"SSDDKAKGSTSKDQPSEEEEKTSDIQDRKIKTPIKPSLLGKIRSSVNKGMTDVASQVNRGMTDVASQVNKGVNGVASQVN",
|
||||
"KGMNGVANQVNKGVTGVASQVRKPVGKLEKKFENLEKSIGDTLKSSIRSSPKSKKRSREDFEENEDYNAMVPVKRSRITK",
|
||||
"LESEVYYEKRKVRALGGIAIGLGVGAILPFLF"]
|
||||
},
|
||||
{ "name" : "PGTG_05590",
|
||||
"RefSeqID" : "XP_003323688",
|
||||
"UniProtID" : "E3K4V4",
|
||||
"taxonomyID" : "418459",
|
||||
"sequence" : [
|
||||
"MPKSSSCCEPEQKQSIPTNANPISAGGAGLDIRLAGMRSAHATLRGCSFSPYMVTQHPPLRDSVNRNKQQPTNNSTNPYT",
|
||||
"KKASRMSQTNLYKSNNPPNLPQDEFNQTLVNYQGKLRSIRIQDININGHTITIARIKIPSPEKLSSHLIKRFDTNAISAS",
|
||||
"SFFRSAFPHSTEEEEAIQMRYLHQIYDTHTAGAVEFGSARKLTGVWVPIENAAELAEVYGLTRFAEPLLAFPNPKENPRS",
|
||||
"PTGTKIGGEDESSTTQTPKASQQSKLTGQISVTRSSKRSRAGPLSFGNTSPSSFSLNSFNKPPTETNKSGTHDDSKSTND",
|
||||
"ENDEKPASPTDRVAGRGARNSPSKKPTTVDENHEHTEHEDHQLIGTDELAQRAKQEALKLVSELKNSQPCTQSSLESPTN",
|
||||
"TLETELTRTTSPAKSNKVTRKRSSDEVSFEGEEQGEDEDEERTADETATHRSFLPKLLWRKSAAQAHPNSKKHKRTQLGG",
|
||||
"GGSSSSSSKSFVPLLTNSATPSVDDSSSTHNPNKRNLAIAGIVIAGAAA"]
|
||||
},
|
||||
{ "name" : "06560_NEUCR",
|
||||
"RefSeqID" : "XP_962267",
|
||||
"UniProtID" : "Q7S9H5",
|
||||
"taxonomyID" : "367110",
|
||||
"sequence" : [
|
||||
"MAQVARHLPARRNPLMLEDVPSHTDLASRRRLGQTQLTPRMVTAVPGAEVDPSSLLAFDYAHLRAPLPKGIVSGIFKSSP",
|
||||
"PSYFLMRRSQDGYISATGMFKATFPYASQEEEEAERKYIKSIPTTSSEETAGNVWIPPEQALILAEEYQITPWIRALLDP",
|
||||
"SDIAVTATDSSAPKQIAPPPKFFGAQPPLVAPTPPTTRSTRSRPSSRRSSSPAKSTTTSKRGTTPRNTKRTVTTEASATT",
|
||||
"VTTTATATAVPSAETPATSFADSQAPTLINGEIPTSTPINTVPVTKIQTTEAELKVESIEKEPVVVLEPIEEEPKIKVRV",
|
||||
"DEDVKLDKDGEEVKHTKVELEVPLMAGEPPSKEEARKMIEEAKAMVEAAVKADAEAAAALVEASKAGAEDEKAEDEAKAE",
|
||||
"TEATKEEEADSKGKRKAEKISVDEDEKAADEAEQPRQAKRVKTEAELRKDRIRKRAYLGLTATFAVGALGALLPIITPYV",
|
||||
"ANVL"]
|
||||
},
|
||||
{ "name" : "81480_BIPOR",
|
||||
"RefSeqID" : "XP_007682909",
|
||||
"UniProtID" : "W6ZKJ4",
|
||||
"taxonomyID" : "930090",
|
||||
"sequence" : [
|
||||
"MVVDRVLPERKNPLLEPTDSTSIEILIERRRLGQTNLGVKAGVSGIANATKPENMGTFDYAHLRVPLPKDLTGSGIFSRN",
|
||||
"RMSAFPESYFLMRRSSDGYISATGMFKAAFPWASLQEEDLERKYQKTFPSAGDEEVAGSVWIAPEEALALSEEYSMRHWI",
|
||||
"EALLDPAPIEKGGKDKSNAAIQMPPRFDVANAQPATLPTFGFRQTRARSARSVSPSKAMTPGRKYATPRKGRSTRSAMKP",
|
||||
"DATHADDMFRPIEAVTPSTALQNSIARRIAPAETIASSIEGEVKEVEQEVKAALDAEKKPEPELEVQEGTVHIEVKQTVE",
|
||||
"TNGDTEKTSTSVTVDVPHDHAALPEPEDPTAMIEEAKRMVAEAQKLEGGSPSVTRSSKRGIEEVLDEEDLADERLNKLAK",
|
||||
"KAYTTEQKMTKEKVTRRALVGLGVMAAIGTAFQYFV"]
|
||||
},
|
||||
{ "name" : "01622_ASPNI",
|
||||
"RefSeqID" : "XP_657766",
|
||||
"UniProtID" : "Q5BH18",
|
||||
"taxonomyID" : "227321",
|
||||
"sequence" : [
|
||||
"MVRSLPKKNNPFVTPDAAPPYEELLMRRRLGKTNLAVKPTQVGTSNATKPENLGPFEYAHLRAPLPKDLKGSEIFPSHSP",
|
||||
"QQHPETYFLMRRSKDGYVSATGMFKIAFPWAKLEEERSEREYLKTRPETSEDEIAGNVWISPVLALELAAEYKMYDWVRA",
|
||||
"LLDPTEIIQSPSSAKKQITPPPKFELPPIQAPEALVPSSRTRSRRSASPSKKAGTPRKPRQTKAQKEAAVAATNEANATL",
|
||||
"QSALDDTVSNADGEINGDVLPSVEDKREPETSPVKGKKAAAKAKKQAVSEEDQEDKVKIEIKSDAAEGSDVQAAQTTISV",
|
||||
"EMPISLPEAPSAEDTQEMIAKAKEMVKEAVKLQQEPAESSATAKKRGAEEAELGEEEEDEETKTLRTKRAKVLEEKLKRE",
|
||||
"RVRNRALMGVTAAFALAKPALVLLEA"]
|
||||
},
|
||||
{ "name" : "05405_ASPNI",
|
||||
"RefSeqID" : "XP_663009",
|
||||
"UniProtID" : "Q5B225",
|
||||
"taxonomyID" : "227321",
|
||||
"sequence" : [
|
||||
"MASIQFLLNPLPSLPSSDRCPLPTPSPTISSSTAMLRSPRQKKQKMAKDAPIFQRGKPRGEVRYPPYEDRDGKFSCQHQD",
|
||||
"FRIHPLGNIADYPRHIPYNSDKKSFQERTGRESFEVFQYTFQLPGEEKQWTVMWDYNIGLVRTTHLFKCNDYSKTTPAKM",
|
||||
"LNQNPGLRDICHSITGGALAAQGYWMPYEAAKAIAATFCWKIRFALTPLFGDNFPDLCIHPDDRARFGRMVIDPGIVRIA",
|
||||
"TEKANLYRMLELRCSTTNSLRADYVLRPSSAPDIDRTDPNLERDRVALGRHILPKSHRHHHHRSKTSPSTNTSLVGYGSS",
|
||||
"PEVEYYSCGTEPYCVSPESPIRSSFTPVNTPRSTDIYPSSSSTNFLRSPHELLASLSSSASIARARIERASKISGARVIP",
|
||||
"SSVPSNVTSITTKGRDNTGHSALMEESDIDADAETDSGHEHDLDFELSSSDESSTSSTVSSSTSSASLGFAANSRNRPYR",
|
||||
"DDDEPHRDTDEEMVDYRAPKRIATAGARDRRWGRGRRVIHQEHSDIETSRRARKHAQRSSNARLVCEMTAAHALISLLHD",
|
||||
"ATGSDVDVDTHNRLECGRSPDGGVKNNLKGSYFGIRLNHNPSTESGQKRRRASA"]
|
||||
},
|
||||
{ "name" : "105954_BIPOR",
|
||||
"RefSeqID" : "XP_007691967",
|
||||
"UniProtID" : "W6Z1H5",
|
||||
"taxonomyID" : "930090",
|
||||
"sequence" : [
|
||||
"MNIQDLLNPSCGDRHDHRRSESATPPSRPVAILPALRRQKIPKDAPIFSEGNRTVGIVNFAPHEAGNDEELLAQHCRFQI",
|
||||
"YPLGEISRKGVRHIPYNSDKKDFLEKTGRDAFEMFQYTYKLPGEDKPYVVVWDYNVGLVRMTPFFKSCKYSKTIPAKTLR",
|
||||
"ENPGLKDISYSITGGALVCQGYWIPYQAARAIAATFCYDIRWALTPVFGNDFPSICLTPDDPSFAKFVIDPAIVRYCTEE",
|
||||
"TTKFRELGSAYEVHRPVAPTQVEAPTSRSDQPLSTSIVRQRRARPIDIESGYGTDTERNDRCLFSPEVSPRTRFTPINRP",
|
||||
"RSPYSPRTAESSFVSSPVSIRAPPGLHTPTSTPYEHSGEVFRAKRSHSKVAFCEHPADEAVIRPPTAATVDSAHGCEMCV",
|
||||
"GDDNHSHLDMDAAEMLLSLRTADSAMPPSKRTRRGS"]
|
||||
},
|
||||
{ "name" : "69819_WALME",
|
||||
"RefSeqID" : "XP_006959479",
|
||||
"UniProtID" : "I4Y911",
|
||||
"taxonomyID" : "671144",
|
||||
"sequence" : [
|
||||
"MTSPGLPKDFNELLDKSEIPSPKWQQITRDDRPITIARLKLPHPREKHTFILRRYDCNGISFGSLFKAAYPYATDEEEKI",
|
||||
"ESGFVKKNYDVTLVPTEEYQERKLAKLAGFWIPIAIAEELGQRYAMAEYVDALAKADTPDLTDFKKRSSNRQTSEDIKSS",
|
||||
"PAKAQASLESPAKSASKIPTPTKNPAPRRSARHQSRSPSPSPLTHNLTPGKKKAKKAPKEAVIEESVEETIVVDKKESPL",
|
||||
"KKALNDDQVLADIERAKDLVDDIKQSKNLSQSSPVKVVKEEVLETIQPSVSTESLEGEGKRKRELEDETGNEIKVVSFGQ",
|
||||
"NPPANPEEIQQRPVVQRRGVAAAVGAFALGVGFAASNILPRFLF"]
|
||||
},
|
||||
{ "name" : "02840_CRYNE",
|
||||
"RefSeqID" : "XP_568872",
|
||||
"UniProtID" : "Q5KM59",
|
||||
"taxonomyID" : "214684",
|
||||
"sequence" : [
|
||||
"MSHPAADAPPPYPGTTDDAQYDLTPLPHTANRPRLPEDKRNPHLNNLPEDTKIVKFQTIVRENKEIVVGRIKVPTENANG",
|
||||
"THHAFILRRYDTNAISLTTMYKVAFPSATEEEEKREMDWVKSSFDTRGTNGGRDSEVVRLAGQWVSRNLAIHIAPAYNLV",
|
||||
"QLVAALSRAVPDPNVAYRKSQRSQAAADELARTKAKQSQAPSSVPAISNVPVRKPQAAIPSMATEISSPASKRQRKDSVT",
|
||||
"EASGSATQTITEAQPSADTSETDDTRHITIEATTTITSPSGANVDMDAEIEQAKQLVKDLRQEIQLRNEAGDSLEDQGVA",
|
||||
"VADDVRGVKRGKHEDEAVVISGGAGGKDRVVRTNKRIPQTAGGDVGQRFGWGAFVFSIGLGASLTLFSQYASSLL"]
|
||||
},
|
||||
{ "name" : "11055_USTMA",
|
||||
"RefSeqID" : "XP_011390537",
|
||||
"UniProtID" : "A0A0D1DZM8",
|
||||
"taxonomyID" : "237631",
|
||||
"sequence" : [
|
||||
"MPAAASARKSTPTRKSTPRRARSSSVTSNASTGVPASPSASPRKTKKQKEAAAAAAAAVAAAAATAEQVNDDESDLLRPK",
|
||||
"LPTKRNPRLKEVDEAVVKLQIIKREGHNIIIGRVKLPTVNGQDHAFLLKRFDTNAMAASSMFRLAFPFADGTAEAAEMRF",
|
||||
"LDTKYDTNRANGGYIVEEVKVPETPKKRGRTRKTAENSKKESTPDTESVSADKQIRVLPEGSTGVRLQGTWIPAEDAIEV",
|
||||
"AEDYGIAKYALALIHATAEHAEDGGAPILTSEPVAEVKTPRKRQRVSAAAATASDTPDSPQLVQRVTRLENADGSISKVR",
|
||||
"VESTLEAPSSNGVPVALSQAEIEEQIAQAKALAAGIQQSITAGSGSASTRGQKRRAVNDRPTAEIDPLADDEDYSESGRV",
|
||||
"VRAFRRGTRVARRRPIATTAGAVAAAGAVGAGALAWVSGGNPEVAIQTLQASMQSIGLQNLQNLGLQNLQQIGTQLGAHL",
|
||||
"ASILPW"]
|
||||
},
|
||||
{ "name" : "XBP1_NEUCR",
|
||||
"RefSeqID" : "XP_962373",
|
||||
"UniProtID" : "Q7S9W7",
|
||||
"taxonomyID" : "367110",
|
||||
"sequence" : [
|
||||
"MLNQNPGLKDIAYSITGGAIKAQGYWMPYACAKAVCATFCYQIAGALIPLFGPDFPSECISPGEPRYGIMIIKPELISDT",
|
||||
"MRKAQELYRRYGNWGGGCTSSSPARRPLRTASSGSQERHHHHPYPNQEHLDHQQQQQRTVCSRRCPAEENSCVDARPQLR",
|
||||
"GISAPMPPAGEWTPPLLRSSAGRPRPVMPTSTHSSISYPERAPHRSAWTAVNHQPPNNSLDRYSLKRPLPSNEPDESVSH",
|
||||
"SNWPSRSQAPNPWLTAIPRSPRKTSSSPWASQPGSASRSRAGSIDSMASQHPQGLPSPSLILSSPSSSMVSLSSSNSPSP",
|
||||
"RPQLPPISQLCSLPVPSGRRRLPNGRPSRVGGDATSSHSRQDHSTCGAYQFSAGYQRALTPPSSTSAPMHWRSQRRPSLQ",
|
||||
"DQHEHEHIEDTQPRRIAVEANMECGDDNESHLHLPLPLPRTSSSASIVADKNANDTTSDNSSSRNFNSASIGSGRDDGQT",
|
||||
"SLAARKTAALTLLHLRQQEEEKEAAAAAAAAAAAAYSSTKRPESPSSSLSSPVSPPPTSGQPSPTLSAVVTATNLRRGTT",
|
||||
"TATATAVIDTTEPLAPPPSPSSNYLGSPISTSIASSSSSFSPSTSCNGTRENSVVANEMTRYAGQEADAGGPRHCNGDAD",
|
||||
"DEGDYEHEQQYRRKRRRLLLVGRAKSF"]
|
||||
},
|
||||
{ "name" : "XBP1_SACCE",
|
||||
"RefSeqID" : "NP_012165",
|
||||
"UniProtID" : "P40489",
|
||||
"taxonomyID" : "559292",
|
||||
"sequence" : [
|
||||
"MKYPAFSINSDTVHLTDNPLDDYQRLYLVSVLDRDSPPASFSAGLNIRKVNYKSSIAAQFTHPNFIISARDAGNGEEAAA",
|
||||
"QNVLNCFEYQFPNLQTIQSLVHEQTLLSQLASSATPHSALHLHDKNILMGKIILPSRSNKTPVSASPTKQEKKALSTASR",
|
||||
"ENATSSLTKNQQFKLTKMDHNLINDKLINPNNCVIWSHDSGYVFMTGIWRLYQDVMKGLINLPRGDSVSTSQQQFFCKAE",
|
||||
"FEKILSFCFYNHSSFTSEESSSVLLSSSTSSPPKRRTSTGSTFLDANASSSSTSSTQANNYIDFHWNNIKPELRDLICQS",
|
||||
"YKDFLINELGPDQIDLPNLNPANFTKRIRGGYIKIQGTWLPMEISRLLCLRFCFPIRYFLVPIFGPDFPKDCESWYLAHQ",
|
||||
"NVTFASSTTGAGAATAATAAANTSTNFTSTAVARPRQKPRPRPRQRSTSMSHSKAQKLVIEDALPSFDSFVENLGLSSND",
|
||||
"KNFIKKNSKRQKSSTYTSQTSSPIGPRDPTVQILSNLASFYNTHGHRYSYPGNIYIPQQRYSLPPPNQLSSPQRQLNYTY",
|
||||
"DHIHPVPSQYQSPRHYNVPSSPIAPAPPTFPQPYGDDHYHFLKYASEVYKQQNQRPAHNTNTNMDTSFSPRANNSLNNFK",
|
||||
"FKTNSKQ"]
|
||||
}
|
||||
]
|
||||
|
@ -1,116 +1,116 @@
|
||||
[
|
||||
{"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"},
|
||||
{"pName" : "MBP1_SACCE", "fName" : "KilA-N", "start" : "22", "end" : "105"},
|
||||
{"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "108", "end" : "122"},
|
||||
{"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "236", "end" : "241"},
|
||||
{"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "279", "end" : "307"},
|
||||
{"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "700", "end" : "717"},
|
||||
{"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "394", "end" : "423"},
|
||||
{"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "427", "end" : "463"},
|
||||
{"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "512", "end" : "541"},
|
||||
{"pName" : "MBP1_SACCE", "fName" : "Swi6 fold", "start" : "381", "end" : "547"},
|
||||
{"pName" : "MBP1_SACCE", "fName" : "coiled coil", "start" : "633", "end" : "655"},
|
||||
|
||||
{"pName" : "MBP1_ASPNI", "fName" : "APSES fold", "start" : "9", "end" : "106"},
|
||||
{"pName" : "MBP1_ASPNI", "fName" : "KilA-N", "start" : "26", "end" : "109"},
|
||||
{"pName" : "MBP1_ASPNI", "fName" : "low complexity", "start" : "529", "end" : "534"},
|
||||
{"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "260", "end" : "289"},
|
||||
{"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "381", "end" : "413"},
|
||||
{"pName" : "MBP1_ASPNI", "fName" : "Swi6 fold", "start" : "193", "end" : "402"},
|
||||
{"pName" : "MBP1_ASPNI", "fName" : "coiled coil", "start" : "509", "end" : "572"},
|
||||
|
||||
{"pName" : "MBP1_BIPOR", "fName" : "APSES fold", "start" : "8", "end" : "106"},
|
||||
{"pName" : "MBP1_BIPOR", "fName" : "KilA-N", "start" : "26", "end" : "109"},
|
||||
{"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "134", "end" : "152"},
|
||||
{"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "267", "end" : "278"},
|
||||
{"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "670", "end" : "685"},
|
||||
{"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "266", "end" : "295"},
|
||||
{"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "387", "end" : "416"},
|
||||
{"pName" : "MBP1_BIPOR", "fName" : "Swi6 fold", "start" : "253", "end" : "421"},
|
||||
{"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "659", "end" : "681"},
|
||||
{"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "500", "end" : "590"},
|
||||
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "APSES fold", "start" : "14", "end" : "114"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "KilA-N", "start" : "34", "end" : "117"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "130", "end" : "141"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "253", "end" : "266"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "514", "end" : "525"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "554", "end" : "564"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "601", "end" : "618"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "620", "end" : "629"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "636", "end" : "652"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "658", "end" : "672"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "725", "end" : "735"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "752", "end" : "771"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "268", "end" : "297"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "390", "end" : "419"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "Swi6 fold", "start" : "270", "end" : "426"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "coiled coil", "start" : "500", "end" : "550"},
|
||||
|
||||
{"pName" : "MBP1_SCHPO", "fName" : "APSES fold", "start" : "8", "end" : "104"},
|
||||
{"pName" : "MBP1_SCHPO", "fName" : "KilA-N", "start" : "25", "end" : "113"},
|
||||
{"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "111", "end" : "125"},
|
||||
{"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "136", "end" : "145"},
|
||||
{"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "176", "end" : "191"},
|
||||
{"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "422", "end" : "447"},
|
||||
{"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "247", "end" : "276"},
|
||||
{"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "368", "end" : "397"},
|
||||
{"pName" : "MBP1_SCHPO", "fName" : "Swi6 fold", "start" : "234", "end" : "400"},
|
||||
{"pName" : "MBP1_SCHPO", "fName" : "coiled coil", "start" : "457", "end" : "538"},
|
||||
|
||||
{"pName" : "MBP1_COPCI", "fName" : "APSES fold", "start" : "5", "end" : "103"},
|
||||
{"pName" : "MBP1_COPCI", "fName" : "KilA-N", "start" : "23", "end" : "106"},
|
||||
{"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "170", "end" : "191"},
|
||||
{"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "435", "end" : "450"},
|
||||
{"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "611", "end" : "626"},
|
||||
{"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "270", "end" : "299"},
|
||||
{"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "389", "end" : "418"},
|
||||
{"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "474", "end" : "509"},
|
||||
{"pName" : "MBP1_COPCI", "fName" : "Swi6 fold", "start" : "257", "end" : "429"},
|
||||
{"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "500", "end" : "570"},
|
||||
{"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "651", "end" : "678"},
|
||||
|
||||
{"pName" : "MBP1_CRYNE", "fName" : "APSES fold", "start" : "16", "end" : "114"},
|
||||
{"pName" : "MBP1_CRYNE", "fName" : "KilA-N", "start" : "34", "end" : "117"},
|
||||
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "66", "end" : "85"},
|
||||
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "413", "end" : "423"},
|
||||
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "633", "end" : "644"},
|
||||
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "697", "end" : "709"},
|
||||
{"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "477", "end" : "506"},
|
||||
{"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "618", "end" : "647"},
|
||||
{"pName" : "MBP1_CRYNE", "fName" : "Swi6 fold", "start" : "452", "end" : "663"},
|
||||
|
||||
{"pName" : "MBP1_PUCGR", "fName" : "APSES fold", "start" : "90", "end" : "187"},
|
||||
{"pName" : "MBP1_PUCGR", "fName" : "KilA-N", "start" : "107", "end" : "190"},
|
||||
{"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "208", "end" : "227"},
|
||||
{"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "273", "end" : "291"},
|
||||
{"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "442", "end" : "271"},
|
||||
{"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "475", "end" : "509"},
|
||||
{"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "561", "end" : "590"},
|
||||
{"pName" : "MBP1_PUCGR", "fName" : "Swi6 fold", "start" : "429", "end" : "601"},
|
||||
{"pName" : "MBP1_PUCGR", "fName" : "coiled coil", "start" : "827", "end" : "863"},
|
||||
|
||||
{"pName" : "MBP1_USTMA", "fName" : "APSES fold", "start" : "7", "end" : "104"},
|
||||
{"pName" : "MBP1_USTMA", "fName" : "KilA-N", "start" : "24", "end" : "107"},
|
||||
{"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "106", "end" : "116"},
|
||||
{"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "161", "end" : "183"},
|
||||
{"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "666", "end" : "681"},
|
||||
{"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "688", "end" : "700"},
|
||||
{"pName" : "MBP1_USTMA", "fName" : "AT hook", "start" : "134", "end" : "146"},
|
||||
{"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "245", "end" : "274"},
|
||||
{"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "278", "end" : "314"},
|
||||
{"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "364", "end" : "393"},
|
||||
{"pName" : "MBP1_USTMA", "fName" : "Swi6 fold", "start" : "232", "end" : "404"},
|
||||
{"pName" : "MBP1_USTMA", "fName" : "coiled coil", "start" : "590", "end" : "618"},
|
||||
|
||||
{"pName" : "MBP1_WALME", "fName" : "APSES fold", "start" : "6", "end" : "103"},
|
||||
{"pName" : "MBP1_WALME", "fName" : "KilA-N", "start" : "23", "end" : "106"},
|
||||
{"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "149", "end" : "162"},
|
||||
{"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "171", "end" : "188"},
|
||||
{"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "618", "end" : "628"},
|
||||
{"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "634", "end" : "660"},
|
||||
{"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "250", "end" : "279"},
|
||||
{"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "369", "end" : "398"},
|
||||
{"pName" : "MBP1_WALME", "fName" : "Swi6 fold", "start" : "237", "end" : "409"},
|
||||
{"pName" : "MBP1_WALME", "fName" : "coiled coil", "start" : "461", "end" : "585"}
|
||||
]
|
||||
[
|
||||
{"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"},
|
||||
{"pName" : "MBP1_SACCE", "fName" : "KilA-N", "start" : "22", "end" : "105"},
|
||||
{"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "108", "end" : "122"},
|
||||
{"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "236", "end" : "241"},
|
||||
{"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "279", "end" : "307"},
|
||||
{"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "700", "end" : "717"},
|
||||
{"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "394", "end" : "423"},
|
||||
{"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "427", "end" : "463"},
|
||||
{"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "512", "end" : "541"},
|
||||
{"pName" : "MBP1_SACCE", "fName" : "Swi6 fold", "start" : "381", "end" : "547"},
|
||||
{"pName" : "MBP1_SACCE", "fName" : "coiled coil", "start" : "633", "end" : "655"},
|
||||
|
||||
{"pName" : "MBP1_ASPNI", "fName" : "APSES fold", "start" : "9", "end" : "106"},
|
||||
{"pName" : "MBP1_ASPNI", "fName" : "KilA-N", "start" : "26", "end" : "109"},
|
||||
{"pName" : "MBP1_ASPNI", "fName" : "low complexity", "start" : "529", "end" : "534"},
|
||||
{"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "260", "end" : "289"},
|
||||
{"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "381", "end" : "413"},
|
||||
{"pName" : "MBP1_ASPNI", "fName" : "Swi6 fold", "start" : "193", "end" : "402"},
|
||||
{"pName" : "MBP1_ASPNI", "fName" : "coiled coil", "start" : "509", "end" : "572"},
|
||||
|
||||
{"pName" : "MBP1_BIPOR", "fName" : "APSES fold", "start" : "8", "end" : "106"},
|
||||
{"pName" : "MBP1_BIPOR", "fName" : "KilA-N", "start" : "26", "end" : "109"},
|
||||
{"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "134", "end" : "152"},
|
||||
{"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "267", "end" : "278"},
|
||||
{"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "670", "end" : "685"},
|
||||
{"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "266", "end" : "295"},
|
||||
{"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "387", "end" : "416"},
|
||||
{"pName" : "MBP1_BIPOR", "fName" : "Swi6 fold", "start" : "253", "end" : "421"},
|
||||
{"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "659", "end" : "681"},
|
||||
{"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "500", "end" : "590"},
|
||||
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "APSES fold", "start" : "14", "end" : "114"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "KilA-N", "start" : "34", "end" : "117"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "130", "end" : "141"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "253", "end" : "266"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "514", "end" : "525"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "554", "end" : "564"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "601", "end" : "618"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "620", "end" : "629"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "636", "end" : "652"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "658", "end" : "672"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "725", "end" : "735"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "752", "end" : "771"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "268", "end" : "297"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "390", "end" : "419"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "Swi6 fold", "start" : "270", "end" : "426"},
|
||||
{"pName" : "MBP1_NEUCR", "fName" : "coiled coil", "start" : "500", "end" : "550"},
|
||||
|
||||
{"pName" : "MBP1_SCHPO", "fName" : "APSES fold", "start" : "8", "end" : "104"},
|
||||
{"pName" : "MBP1_SCHPO", "fName" : "KilA-N", "start" : "25", "end" : "113"},
|
||||
{"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "111", "end" : "125"},
|
||||
{"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "136", "end" : "145"},
|
||||
{"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "176", "end" : "191"},
|
||||
{"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "422", "end" : "447"},
|
||||
{"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "247", "end" : "276"},
|
||||
{"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "368", "end" : "397"},
|
||||
{"pName" : "MBP1_SCHPO", "fName" : "Swi6 fold", "start" : "234", "end" : "400"},
|
||||
{"pName" : "MBP1_SCHPO", "fName" : "coiled coil", "start" : "457", "end" : "538"},
|
||||
|
||||
{"pName" : "MBP1_COPCI", "fName" : "APSES fold", "start" : "5", "end" : "103"},
|
||||
{"pName" : "MBP1_COPCI", "fName" : "KilA-N", "start" : "23", "end" : "106"},
|
||||
{"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "170", "end" : "191"},
|
||||
{"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "435", "end" : "450"},
|
||||
{"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "611", "end" : "626"},
|
||||
{"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "270", "end" : "299"},
|
||||
{"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "389", "end" : "418"},
|
||||
{"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "474", "end" : "509"},
|
||||
{"pName" : "MBP1_COPCI", "fName" : "Swi6 fold", "start" : "257", "end" : "429"},
|
||||
{"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "500", "end" : "570"},
|
||||
{"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "651", "end" : "678"},
|
||||
|
||||
{"pName" : "MBP1_CRYNE", "fName" : "APSES fold", "start" : "16", "end" : "114"},
|
||||
{"pName" : "MBP1_CRYNE", "fName" : "KilA-N", "start" : "34", "end" : "117"},
|
||||
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "66", "end" : "85"},
|
||||
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "413", "end" : "423"},
|
||||
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "633", "end" : "644"},
|
||||
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "697", "end" : "709"},
|
||||
{"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "477", "end" : "506"},
|
||||
{"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "618", "end" : "647"},
|
||||
{"pName" : "MBP1_CRYNE", "fName" : "Swi6 fold", "start" : "452", "end" : "663"},
|
||||
|
||||
{"pName" : "MBP1_PUCGR", "fName" : "APSES fold", "start" : "90", "end" : "187"},
|
||||
{"pName" : "MBP1_PUCGR", "fName" : "KilA-N", "start" : "107", "end" : "190"},
|
||||
{"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "208", "end" : "227"},
|
||||
{"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "273", "end" : "291"},
|
||||
{"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "442", "end" : "271"},
|
||||
{"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "475", "end" : "509"},
|
||||
{"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "561", "end" : "590"},
|
||||
{"pName" : "MBP1_PUCGR", "fName" : "Swi6 fold", "start" : "429", "end" : "601"},
|
||||
{"pName" : "MBP1_PUCGR", "fName" : "coiled coil", "start" : "827", "end" : "863"},
|
||||
|
||||
{"pName" : "MBP1_USTMA", "fName" : "APSES fold", "start" : "7", "end" : "104"},
|
||||
{"pName" : "MBP1_USTMA", "fName" : "KilA-N", "start" : "24", "end" : "107"},
|
||||
{"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "106", "end" : "116"},
|
||||
{"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "161", "end" : "183"},
|
||||
{"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "666", "end" : "681"},
|
||||
{"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "688", "end" : "700"},
|
||||
{"pName" : "MBP1_USTMA", "fName" : "AT hook", "start" : "134", "end" : "146"},
|
||||
{"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "245", "end" : "274"},
|
||||
{"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "278", "end" : "314"},
|
||||
{"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "364", "end" : "393"},
|
||||
{"pName" : "MBP1_USTMA", "fName" : "Swi6 fold", "start" : "232", "end" : "404"},
|
||||
{"pName" : "MBP1_USTMA", "fName" : "coiled coil", "start" : "590", "end" : "618"},
|
||||
|
||||
{"pName" : "MBP1_WALME", "fName" : "APSES fold", "start" : "6", "end" : "103"},
|
||||
{"pName" : "MBP1_WALME", "fName" : "KilA-N", "start" : "23", "end" : "106"},
|
||||
{"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "149", "end" : "162"},
|
||||
{"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "171", "end" : "188"},
|
||||
{"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "618", "end" : "628"},
|
||||
{"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "634", "end" : "660"},
|
||||
{"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "250", "end" : "279"},
|
||||
{"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "369", "end" : "398"},
|
||||
{"pName" : "MBP1_WALME", "fName" : "Swi6 fold", "start" : "237", "end" : "409"},
|
||||
{"pName" : "MBP1_WALME", "fName" : "coiled coil", "start" : "461", "end" : "585"}
|
||||
]
|
||||
|
@ -1,47 +1,47 @@
|
||||
[
|
||||
{ "name" : "APSES fold",
|
||||
"description " : "DNA binding domain by similarity to structure",
|
||||
"sourceDB" : "PDB",
|
||||
"accession" : "1BM8_A_1_99"},
|
||||
|
||||
{ "name" : "KilA-N",
|
||||
"description " : "DNA binding domain by Pfam annotation",
|
||||
"sourceDB" : "Pfam",
|
||||
"accession" : "PF04383"},
|
||||
|
||||
{ "name" : "AT hook",
|
||||
"description " : "DNA interaction motif by SMART annotation",
|
||||
"sourceDB" : "SMART",
|
||||
"accession" : null},
|
||||
|
||||
{ "name" : "low complexity",
|
||||
"description " : "SEG annotation by SMART",
|
||||
"sourceDB" : "SMART",
|
||||
"accession" : null},
|
||||
|
||||
{ "name" : "Ankyrin fold",
|
||||
"description " : "Ankyrin domain by SMART annotation",
|
||||
"sourceDB" : "SMART",
|
||||
"accession" : "SM00248"},
|
||||
|
||||
{ "name" : "Swi6 fold",
|
||||
"description " : "Swi6 fold by similarity to structure",
|
||||
"sourceDB" : "PDB",
|
||||
"accession" : "1SW6_B"},
|
||||
|
||||
{ "name" : "coiled coil",
|
||||
"description " : "Coiled coil by SMART annotation",
|
||||
"sourceDB" : "SMART",
|
||||
"accession" : null},
|
||||
|
||||
{ "name" : "McInerny 2011",
|
||||
"description " : "Yeast cell cycle review",
|
||||
"sourceDB" : "PubMed",
|
||||
"accession" : "21310294"}
|
||||
]
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
[
|
||||
{ "name" : "APSES fold",
|
||||
"description " : "DNA binding domain by similarity to structure",
|
||||
"sourceDB" : "PDB",
|
||||
"accession" : "1BM8_A_1_99"},
|
||||
|
||||
{ "name" : "KilA-N",
|
||||
"description " : "DNA binding domain by Pfam annotation",
|
||||
"sourceDB" : "Pfam",
|
||||
"accession" : "PF04383"},
|
||||
|
||||
{ "name" : "AT hook",
|
||||
"description " : "DNA interaction motif by SMART annotation",
|
||||
"sourceDB" : "SMART",
|
||||
"accession" : null},
|
||||
|
||||
{ "name" : "low complexity",
|
||||
"description " : "SEG annotation by SMART",
|
||||
"sourceDB" : "SMART",
|
||||
"accession" : null},
|
||||
|
||||
{ "name" : "Ankyrin fold",
|
||||
"description " : "Ankyrin domain by SMART annotation",
|
||||
"sourceDB" : "SMART",
|
||||
"accession" : "SM00248"},
|
||||
|
||||
{ "name" : "Swi6 fold",
|
||||
"description " : "Swi6 fold by similarity to structure",
|
||||
"sourceDB" : "PDB",
|
||||
"accession" : "1SW6_B"},
|
||||
|
||||
{ "name" : "coiled coil",
|
||||
"description " : "Coiled coil by SMART annotation",
|
||||
"sourceDB" : "SMART",
|
||||
"accession" : null},
|
||||
|
||||
{ "name" : "McInerny 2011",
|
||||
"description " : "Yeast cell cycle review",
|
||||
"sourceDB" : "PubMed",
|
||||
"accession" : "21310294"}
|
||||
]
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -1,155 +1,155 @@
|
||||
[
|
||||
{ "name" : "MBP1_SCHPO",
|
||||
"RefSeqID" : "NP_593032",
|
||||
"UniProtID" : "P41412",
|
||||
"taxonomyID" : 284812,
|
||||
"sequence" : [
|
||||
"MAPRSSAVHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRVLERQVQIGAHEKVQ",
|
||||
"GGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILSLDIDEGKAIAPKKKQTKQKKPSVRGRRGRKPSSLSS",
|
||||
"STLHSVNEKQPNSSISPTIESSMNKVNLPGAEEQVSATPLPASPNALLSPNDNTIKPVEELGMLEAPLDK",
|
||||
"YEESLLDFFLHPEEGRIPSFLYSPPPDFQVNSVIDDDGHTSLHWACSMGHIEMIKLLLRANADIGVCNRL",
|
||||
"SQTPLMRSVIFTNNYDCQTFGQVLELLQSTIYAVDTNGQSIFHHIVQSTSTPSKVAAAKYYLDCILEKLI",
|
||||
"SIQPFENVVRLVNLQDSNGDTSLLIAARNGAMDCVNSLLSYNANPSIPNRQRRTASEYLLEADKKPHSLL",
|
||||
"QSNSNASHSAFSFSGISPAIISPSCSSHAFVKAIPSISSKFSQLAEEYESQLREKEEDLIRANRLKQDTL",
|
||||
"NEISRTYQELTFLQKNNPTYSQSMENLIREAQETYQQLSKRLLIWLEARQIFDLERSLKPHTSLSISFPS",
|
||||
"DFLKKEDGLSLNNDFKKPACNNVTNSDEYEQLINKLTSLQASRKKDTLYIRKLYEELGIDDTVNSYRRLI",
|
||||
"AMSCGINPEDLSLEILDAVEEALTREK"]
|
||||
},
|
||||
{ "name" : "MBP1_ASPNI",
|
||||
"RefSeqID" : "XP_660758",
|
||||
"UniProtID" : "Q5B8H6",
|
||||
"taxonomyID" : 227321,
|
||||
"sequence" : [
|
||||
"MAAVDFSNVYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRILEREVQKGVHEKV",
|
||||
"QGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDYVAGDRSPPPAPKHTSAASKPRAPKINKRVVKEDV",
|
||||
"FSAVNHHRSMGPPSFHHEHYDVNTGLDEDESIEQATLESSSMIADEDMISMSQNGPYSSRKRKRGINEVA",
|
||||
"AMSLSEQEHILYGDQLLDYFMTVGDAPEATRIPPPQPPANFQVDRPIDDSGNTALHWACAMGDLEIVKDL",
|
||||
"LRRGADMKALSIHEETPLVRAVLFTNNYEKRTFPALLDLLLDTISFRDWFGATLFHHIAQTTKSKGKWKS",
|
||||
"SRYYCEVALEKLRTTFSPEEVDLLLSCQDSVGDTAVLVAARNGVFRLVDLLLSRCPRAGDLVNKRGETAS",
|
||||
"SIMQRAHLAERDIPPPPSSITMGNDHIDGEVGAPTSLEPQSVTLHHESSPATAQLLSQIGAIMAEASRKL",
|
||||
"TSSYGAAKPSQKDSDDVANPEALYEQLEQDRQKIRRQYDALAAKEAAEESSDAQLGRYEQMRDNYESLLE",
|
||||
"QIQRARLKERLASTPVPTQTAVIGSSSPEQDRLLTTFQLSRALCSEQKIRRAAVKELAQQRADAGVSTKF",
|
||||
"DVHRKLVALATGLKEEELDPMAAELAETLEFDRMNGKGVGPESPEADHKDSASLPFPGPVVSVDA"]
|
||||
},
|
||||
{ "name" : "MBP1_BIPOR",
|
||||
"RefSeqID" : "XP_007682304",
|
||||
"UniProtID" : "W6ZM86",
|
||||
"taxonomyID" : 930090,
|
||||
"sequence" : [
|
||||
"MPPAPDGKIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRILEREVQKGVHEKV",
|
||||
"QGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDYVPGDRSPPPAPKHATAASNRMKPPRQTAAAVAAA",
|
||||
"AVAAAAAAAAVANHNALMSNSRSQASEDPYENSQRSQIYREDTPDNETVISESMLGDADLMDMSQYSADG",
|
||||
"NRKRKRGMDQMSLLDQQHQIWADQLLDYFMLLDHEAAVSWPEPPPSINLDRPIDEKGHAAMHWAAAMGDV",
|
||||
"GVVKELIHRGARLDCLSNNLETPLMRAVMFTNNFDKETMPSMVKIFQQTVHRTDWFGSTVFHHIAATTSS",
|
||||
"SNKYVCARWYLDCIINKLSETWIPEEVTRLLNAADQNGDTAIMIAARNGARKCVRSLLGRNVAVDIPNKK",
|
||||
"GETADDLIRELNQRRRMHGRTRQASSSPFAPAPEHRLNGHVPHFDGGPLMSVPVPSMAVRESVQYRSQTA",
|
||||
"SHLMTKVAPTLLEKCEELATAYEAELQEKEAEFFDAERVVKRRQAELEAVRKQVAELQSMSKGLHIDLND",
|
||||
"EEAERQQEDELRLLVEEAESLLEIEQKAELRRLCSSMPQQNSDSSPVDITEKMRLALLLHRAQLERRELV",
|
||||
"REVVGNLSVAGMSEKQGTYKKLIAKALGEREEDVESMLPEILQELEEAETQERAEGLDGSPV"]
|
||||
},
|
||||
{ "name" : "MBP1_NEUCR",
|
||||
"RefSeqID" : "XP_955821",
|
||||
"UniProtID" : "Q7RW59",
|
||||
"taxonomyID" : 367110,
|
||||
"sequence" : [
|
||||
"MVKENVGGNPEPGIYSATYSGIPVWEYQFGVDLKEHVMRRRHDDWVNATHILKAAGFDKPARTRILEREV",
|
||||
"QKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEFQPGNESPPPAPRHASKPKAPKVKPAV",
|
||||
"PTWGSKSAKNANPPQPGTFLPPGRKGLPAQAPDYNDADTHMHDDDTPDNLTVASASYMAEDDRYDHSHFS",
|
||||
"TGHRKRKRDELIEDMTEQQHAVYGDELLDYFLLSRNEQPAVRPDPPPNFKPDWPIDNERHTCLHWASAMG",
|
||||
"DVDVMRQLKKFGASLDAQNVRGETPFMRAVNFTNCFEKQTFPQVMKELFSTIDCRDLSGCTVIHHAAVMK",
|
||||
"IGRVNSQSCSRYYLDIILNRLQETHHPEFVQQLLDAQDNDGNTAVHLAAMRDARKCIRALLGRGASTDIP",
|
||||
"NKQGIRAEELIKELNASISKSRSNLPQRSSSPFAPDTQRHDAFHEAISESMVTSRKNSQPNYSSDAANTV",
|
||||
"QNRITPLVLQKLKDLTATYDSEFKEKDDAEKEARRILNKTQSELKALTASIDDYNSRLDTDDVAAKTAAE",
|
||||
"MATARHKVLAFVTHQNRISVQEAVKQELAALDRANAVTNGTSTKSKSSSPSKKPKLSPIPDQKDKPPKDE",
|
||||
"NETESEAEHPDPPAAQAHQQQPGPSSQDTEVEDQDREEEEDDYTHRLSLAAELRSILQEQRSAENDYVEA",
|
||||
"RGMLGTGERIDKYKHLLMSCLPPDEQENLEENLEEMIKLMEQEDESVTDLPAGAVGGGGGGNAADGSGGG",
|
||||
"GQPSNGRRESVLPALRGGNGDGEMSRRGSRTAAAAAAQVDGEREINGRAGAERTERIQEIAAV"]
|
||||
},
|
||||
{ "name" : "MBP1_COPCI",
|
||||
"RefSeqID" : "XP_001837394",
|
||||
"UniProtID" : "A8NYC6",
|
||||
"taxonomyID" : 240176,
|
||||
"sequence" : [
|
||||
"MPEAQIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRVLEREVQKGEHEKVQGG",
|
||||
"YGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEFTPAAKSPPLAPKHLVATAGNRPVRKPLTTDLSAAVIN",
|
||||
"TRSTRKQVADGVGEESDHDTHSLRGSEDGSMTPSPSEASSSSRTPSPIHSPGTYHSNGLDGPSSGGRNRY",
|
||||
"RQSNDRYDEDDDASRHNGMGDPRSYGDQILEYFISDTNQIPPILITPPPDFDPNMAIDDDGHTSLHWACA",
|
||||
"MGRIRIVKLLLSAGADIFKVNKAGQTALMRSVMFANNYDVRKFPELYELLHRSTLNIDNSNRTVFHHVVD",
|
||||
"VAMSKGKTHAARYYMETILTRLADYPKELADVINFQDEDGETALTMAARCRSKRLVKLLIDHGADPKINN",
|
||||
"HDGKNAEDYILEDERFRSSPAPSSRVAAMSYRNAQVAYPPPGAPSTYSFAPANHDRPPLHYSAAAQKAST",
|
||||
"RCVNDMASMLDSLAASFDQELRDKERDMAQAQALLTNIQAEILESQRTVLQLRQQAEGLSQAKQRLADLE",
|
||||
"NALQDKMGRRYRLGFEKWIKDEETREKVIRDAANGDLVLTPATTSYTVDEDGDSDSGSNGDKNKGKRKAQ",
|
||||
"VQQEEVSDLVELYSNIPTDPEELRKQCEALREEVSQSRKRRKAMFDELVTFQAEAGTSGRMSDYRRLIAA",
|
||||
"GCGGLEPLEIDSVLGMLLETLEAEDPSSTSATWSGSKGQQTG"]
|
||||
},
|
||||
{ "name" : "MBP1_CRYNE",
|
||||
"RefSeqID" : "XP_569090",
|
||||
"UniProtID" : "Q5KMQ9",
|
||||
"taxonomyID" : 214684,
|
||||
"sequence" : [
|
||||
"MGKKVIASGGDNGPNTIYKATYSGVPVYEMVCRDVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREV",
|
||||
"QKGEHEKVQGGYGKYQGTWIPIERGLALAKQYGVEDILRPIIDYVPTSVSPPPAPKHSVAPPSKARRDKE",
|
||||
"KETGRTKATPSRTGPTSAAALQAQAQLNRAKMHDSTPDADASFRSFEERVSLTPEDDSSSDTPSPVASVM",
|
||||
"TDQDMEVDKMGMHMSMPNVTLSQNMEELGAGSRKRSAAMMMEDEDQFGQLRSIRGNSAVHTPHGTPRHLG",
|
||||
"IGMPPEPIGPEQYTDIILNYFVSETSQIPSILVSPPHDFDPNAPIDDDGHTALHWACAMGRVRVVKLLLT",
|
||||
"AGASIFAGNNAEQTPLMRSVMFSNNYDMRKFPELYELLHRSTLNIDKQNRTVFHHIANLALTKGKTHAAK",
|
||||
"YYMETILARLADYPQELADVINFQDEEGETALTIAARARSRRLVKALLDHGANPKIKNRDSRSAEDYILE",
|
||||
"DERFRSSPVPAPNGGIGKASTSAAAEKPLFAPQLYFSEAARLCGGQALTDITSHMQSLARSFDAELQGKE",
|
||||
"RDILQAKALLTNIHTEVTENGRSITAITNQAAPLEEKRRELEALQASLKTRVKDALKKGYIGWLEGELVR",
|
||||
"EQRWENGELEGNEEEKAAVQALRDVPTGGQEVVQAEEEKLRWEIEEKRKRRAMFVEKFVRAQTEAGTSEQ",
|
||||
"IAKYRKLVSAGLGGVSTNEVDELMNQLLEGLEEENDNQVYNTTAGESGPSSWVQ"]
|
||||
},
|
||||
{ "name" : "MBP1_PUCGR",
|
||||
"RefSeqID" : "XP_003327086",
|
||||
"UniProtID" : "E3KED4",
|
||||
"taxonomyID" : 418459,
|
||||
"sequence" : [
|
||||
"MAYGGSIQPLRPPSRESATLHLHQPDLTVTSPPLSLTHCPPCVYSHFTHTPTSLIVIQVSLHSLLDQETY",
|
||||
"HLLPSRSPPTVSVRMGTTTIYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRVLE",
|
||||
"REIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNFQPSSNESPPLAPKHVTALSTRVK",
|
||||
"VSKVSAASAARAARAVVPSLPSTSGLGGRNTNNSWSNFDSDNEPGLPPAASSRESNGNWATQSKLARSSN",
|
||||
"LARARANINNSHPEDLPVPAPDQLQASPLPSMQTADPENDNSLTPSELSLPSRTPSPIEDLPLTVNTASS",
|
||||
"QSTRNKGKSRDLPDDEDLSRGQKRKYDTSLVEDTSYSDGADDQYINGNPSNAASAKYAKLILDYFVSESS",
|
||||
"QIPNFLNDPPSDFDPNVVIDDDGHTALHWACAMGRIKIIKLLLTCGADIFRANNAGQTALMRAVMFTNNH",
|
||||
"DLRTFPELFESFSGSVINIDRTDRTVFHYVIDIALTKGKVPAARYYLETILSQLSEYPKELIDILNFQDE",
|
||||
"DGETALTLAARCRSKKLVKILLDHGANPKTANRDGKSAEDYILEDDKFRALSPTPCSSGPIRQLDQNSPG",
|
||||
"GTSNRSDFVDLVDPVPIDSNLIPQRSPNASPPHYSETGQRVTKQLLPEVTSMIELLATTFDTELQDKERD",
|
||||
"LDHAVGLLSNIEKEYLEGQRKILNYERMLSDFGEKKLALGDLEKELNDKLGKRYRFGWEKYVRDEEERAR",
|
||||
"RITEQRSKYLQELSIEDRKLLDSSNLRFADPSKQEVLMKLQADERENSDLLNLIRTNSTDVESECDLLRE",
|
||||
"SVQKLSEERERLFKEFINLSSENTGGENEEDDGANHTSANTSRLNNYRKLISLGCGGIGLDEVDEVIESL",
|
||||
"NEGIDVNELNDNGFLTEQDEELGNHQNYHNIHTQGR"]
|
||||
},
|
||||
{ "name" : "MBP1_USTMA",
|
||||
"RefSeqID" : "XP_011392621",
|
||||
"UniProtID" : "A0A0D1DP35",
|
||||
"taxonomyID" : 237631,
|
||||
"sequence" : [
|
||||
"MSGDKTIFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRVLEREIQKGIHEKVQG",
|
||||
"GYGKYQGTWIPLDVAIELAERYNIQGLLQPITSYVPSAADSPPPAPKHTISTSNRSKKIIPADPGALGRS",
|
||||
"RRATSIETESEVIGAAPNNVSEGSMSPSPSDISSSSRTPSPLPADRAHPLHANHALAGYNGRDANNHARY",
|
||||
"ADIILDYFVTENTTVPSLLINPPPDFNPDMSIDDDEHTALHWACAMGRIRVVKLLLSAGADIFRVNSNQQ",
|
||||
"TALMRATMFSNNYDLRKFPELFELLHRSILNIDRNDRTVFHHVVDLALSRGKPHAARYYMETMINRLADY",
|
||||
"GDQLADILNFQDDEGETPLTMAARARSKRLVRLLLEHGADPKIRNKEGKNAEDYIIEDERFRSSPSRTGP",
|
||||
"AGIELGADGLPVLPTSSLHTSEAGQRTAGRAVTLMSNLLHSLADSYDSEINTAEKKLTQAHGLLKQIQTE",
|
||||
"IEDSAKVAEALHHEAQGVDEERKRVDSLQLALKHAINKRARDDLERRWSEGKQAIKRARLQAGLEPGALS",
|
||||
"TSNATNAPATGDQKSKDDAKSLIEALPAGTNVKTAIAELRKQLSQVQANKTELVDKFVARAREQGTGRTM",
|
||||
"AAYRRLIAAGCGGIAPDEVDAVVGVLCELLQESHTGARAGAGGERDDRARDVAMMLKGAGAAALAANAGA",
|
||||
"P"]
|
||||
},
|
||||
{ "name" : "MBP1_WALME",
|
||||
"RefSeqID" : "XP_006957051",
|
||||
"UniProtID" : "I4YGC0",
|
||||
"taxonomyID" : 671144,
|
||||
"sequence" : [
|
||||
"MSAPPIYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRILEREVQKGTHEKVQGG",
|
||||
"YGKYQGTWIPMERSVELARQYRIELLLDPIINYLPGPQSPPLAPKHATNVGSRARKSTAPAAQTLPSTSK",
|
||||
"VFHPLSSTKHPAKLAAATNAKAEISDGEDASIPSSPSFKSNSSRTPSPIRINARKRKLEDEATIPSSAID",
|
||||
"GSISYEDIILDYFISESTQIPALLIHPPSDFNPNMSIDDEGHTAMHWACAMGKVRVVKLLLSAGADIFRV",
|
||||
"NHSEQTALMRSVMFSNNYDIRKFPQLYELLHRSTLNLDKHDRTVLHHIVDLALTKSKTHAARYYMECVLS",
|
||||
"KLANYPDELADVINFQDDEGESALTLAARARSKRLVKLLLEHGADSKLPNKDGKTAEDYILEDERFRQSP",
|
||||
"LLNSNHLRLHPPDTSIYAPPAHLFNSETSQNIANTSMSSVANLLESLAQSYDKEITQKERDYQQAQVILR",
|
||||
"NIKTDIVEAKSNIEKMTIDSSEFEHLKHKLRELEMKLEEHSNDVYNKGWEEYSRNVDDPAIDAPSDNVQE",
|
||||
"ECASLRNKIKDLQEKRISSMQELIKRQKEVGTGKKMSEYRKLISVGCGIPTTEIDAVLEMLLESLESENA",
|
||||
"NKKAALASGISGALSSTSSAPSQATTSAPTGVATPGAPVPASSEKAGLLPPAPVMQ"]
|
||||
}
|
||||
]
|
||||
[
|
||||
{ "name" : "MBP1_SCHPO",
|
||||
"RefSeqID" : "NP_593032",
|
||||
"UniProtID" : "P41412",
|
||||
"taxonomyID" : 284812,
|
||||
"sequence" : [
|
||||
"MAPRSSAVHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRVLERQVQIGAHEKVQ",
|
||||
"GGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILSLDIDEGKAIAPKKKQTKQKKPSVRGRRGRKPSSLSS",
|
||||
"STLHSVNEKQPNSSISPTIESSMNKVNLPGAEEQVSATPLPASPNALLSPNDNTIKPVEELGMLEAPLDK",
|
||||
"YEESLLDFFLHPEEGRIPSFLYSPPPDFQVNSVIDDDGHTSLHWACSMGHIEMIKLLLRANADIGVCNRL",
|
||||
"SQTPLMRSVIFTNNYDCQTFGQVLELLQSTIYAVDTNGQSIFHHIVQSTSTPSKVAAAKYYLDCILEKLI",
|
||||
"SIQPFENVVRLVNLQDSNGDTSLLIAARNGAMDCVNSLLSYNANPSIPNRQRRTASEYLLEADKKPHSLL",
|
||||
"QSNSNASHSAFSFSGISPAIISPSCSSHAFVKAIPSISSKFSQLAEEYESQLREKEEDLIRANRLKQDTL",
|
||||
"NEISRTYQELTFLQKNNPTYSQSMENLIREAQETYQQLSKRLLIWLEARQIFDLERSLKPHTSLSISFPS",
|
||||
"DFLKKEDGLSLNNDFKKPACNNVTNSDEYEQLINKLTSLQASRKKDTLYIRKLYEELGIDDTVNSYRRLI",
|
||||
"AMSCGINPEDLSLEILDAVEEALTREK"]
|
||||
},
|
||||
{ "name" : "MBP1_ASPNI",
|
||||
"RefSeqID" : "XP_660758",
|
||||
"UniProtID" : "Q5B8H6",
|
||||
"taxonomyID" : 227321,
|
||||
"sequence" : [
|
||||
"MAAVDFSNVYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRILEREVQKGVHEKV",
|
||||
"QGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDYVAGDRSPPPAPKHTSAASKPRAPKINKRVVKEDV",
|
||||
"FSAVNHHRSMGPPSFHHEHYDVNTGLDEDESIEQATLESSSMIADEDMISMSQNGPYSSRKRKRGINEVA",
|
||||
"AMSLSEQEHILYGDQLLDYFMTVGDAPEATRIPPPQPPANFQVDRPIDDSGNTALHWACAMGDLEIVKDL",
|
||||
"LRRGADMKALSIHEETPLVRAVLFTNNYEKRTFPALLDLLLDTISFRDWFGATLFHHIAQTTKSKGKWKS",
|
||||
"SRYYCEVALEKLRTTFSPEEVDLLLSCQDSVGDTAVLVAARNGVFRLVDLLLSRCPRAGDLVNKRGETAS",
|
||||
"SIMQRAHLAERDIPPPPSSITMGNDHIDGEVGAPTSLEPQSVTLHHESSPATAQLLSQIGAIMAEASRKL",
|
||||
"TSSYGAAKPSQKDSDDVANPEALYEQLEQDRQKIRRQYDALAAKEAAEESSDAQLGRYEQMRDNYESLLE",
|
||||
"QIQRARLKERLASTPVPTQTAVIGSSSPEQDRLLTTFQLSRALCSEQKIRRAAVKELAQQRADAGVSTKF",
|
||||
"DVHRKLVALATGLKEEELDPMAAELAETLEFDRMNGKGVGPESPEADHKDSASLPFPGPVVSVDA"]
|
||||
},
|
||||
{ "name" : "MBP1_BIPOR",
|
||||
"RefSeqID" : "XP_007682304",
|
||||
"UniProtID" : "W6ZM86",
|
||||
"taxonomyID" : 930090,
|
||||
"sequence" : [
|
||||
"MPPAPDGKIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRILEREVQKGVHEKV",
|
||||
"QGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDYVPGDRSPPPAPKHATAASNRMKPPRQTAAAVAAA",
|
||||
"AVAAAAAAAAVANHNALMSNSRSQASEDPYENSQRSQIYREDTPDNETVISESMLGDADLMDMSQYSADG",
|
||||
"NRKRKRGMDQMSLLDQQHQIWADQLLDYFMLLDHEAAVSWPEPPPSINLDRPIDEKGHAAMHWAAAMGDV",
|
||||
"GVVKELIHRGARLDCLSNNLETPLMRAVMFTNNFDKETMPSMVKIFQQTVHRTDWFGSTVFHHIAATTSS",
|
||||
"SNKYVCARWYLDCIINKLSETWIPEEVTRLLNAADQNGDTAIMIAARNGARKCVRSLLGRNVAVDIPNKK",
|
||||
"GETADDLIRELNQRRRMHGRTRQASSSPFAPAPEHRLNGHVPHFDGGPLMSVPVPSMAVRESVQYRSQTA",
|
||||
"SHLMTKVAPTLLEKCEELATAYEAELQEKEAEFFDAERVVKRRQAELEAVRKQVAELQSMSKGLHIDLND",
|
||||
"EEAERQQEDELRLLVEEAESLLEIEQKAELRRLCSSMPQQNSDSSPVDITEKMRLALLLHRAQLERRELV",
|
||||
"REVVGNLSVAGMSEKQGTYKKLIAKALGEREEDVESMLPEILQELEEAETQERAEGLDGSPV"]
|
||||
},
|
||||
{ "name" : "MBP1_NEUCR",
|
||||
"RefSeqID" : "XP_955821",
|
||||
"UniProtID" : "Q7RW59",
|
||||
"taxonomyID" : 367110,
|
||||
"sequence" : [
|
||||
"MVKENVGGNPEPGIYSATYSGIPVWEYQFGVDLKEHVMRRRHDDWVNATHILKAAGFDKPARTRILEREV",
|
||||
"QKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEFQPGNESPPPAPRHASKPKAPKVKPAV",
|
||||
"PTWGSKSAKNANPPQPGTFLPPGRKGLPAQAPDYNDADTHMHDDDTPDNLTVASASYMAEDDRYDHSHFS",
|
||||
"TGHRKRKRDELIEDMTEQQHAVYGDELLDYFLLSRNEQPAVRPDPPPNFKPDWPIDNERHTCLHWASAMG",
|
||||
"DVDVMRQLKKFGASLDAQNVRGETPFMRAVNFTNCFEKQTFPQVMKELFSTIDCRDLSGCTVIHHAAVMK",
|
||||
"IGRVNSQSCSRYYLDIILNRLQETHHPEFVQQLLDAQDNDGNTAVHLAAMRDARKCIRALLGRGASTDIP",
|
||||
"NKQGIRAEELIKELNASISKSRSNLPQRSSSPFAPDTQRHDAFHEAISESMVTSRKNSQPNYSSDAANTV",
|
||||
"QNRITPLVLQKLKDLTATYDSEFKEKDDAEKEARRILNKTQSELKALTASIDDYNSRLDTDDVAAKTAAE",
|
||||
"MATARHKVLAFVTHQNRISVQEAVKQELAALDRANAVTNGTSTKSKSSSPSKKPKLSPIPDQKDKPPKDE",
|
||||
"NETESEAEHPDPPAAQAHQQQPGPSSQDTEVEDQDREEEEDDYTHRLSLAAELRSILQEQRSAENDYVEA",
|
||||
"RGMLGTGERIDKYKHLLMSCLPPDEQENLEENLEEMIKLMEQEDESVTDLPAGAVGGGGGGNAADGSGGG",
|
||||
"GQPSNGRRESVLPALRGGNGDGEMSRRGSRTAAAAAAQVDGEREINGRAGAERTERIQEIAAV"]
|
||||
},
|
||||
{ "name" : "MBP1_COPCI",
|
||||
"RefSeqID" : "XP_001837394",
|
||||
"UniProtID" : "A8NYC6",
|
||||
"taxonomyID" : 240176,
|
||||
"sequence" : [
|
||||
"MPEAQIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRVLEREVQKGEHEKVQGG",
|
||||
"YGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEFTPAAKSPPLAPKHLVATAGNRPVRKPLTTDLSAAVIN",
|
||||
"TRSTRKQVADGVGEESDHDTHSLRGSEDGSMTPSPSEASSSSRTPSPIHSPGTYHSNGLDGPSSGGRNRY",
|
||||
"RQSNDRYDEDDDASRHNGMGDPRSYGDQILEYFISDTNQIPPILITPPPDFDPNMAIDDDGHTSLHWACA",
|
||||
"MGRIRIVKLLLSAGADIFKVNKAGQTALMRSVMFANNYDVRKFPELYELLHRSTLNIDNSNRTVFHHVVD",
|
||||
"VAMSKGKTHAARYYMETILTRLADYPKELADVINFQDEDGETALTMAARCRSKRLVKLLIDHGADPKINN",
|
||||
"HDGKNAEDYILEDERFRSSPAPSSRVAAMSYRNAQVAYPPPGAPSTYSFAPANHDRPPLHYSAAAQKAST",
|
||||
"RCVNDMASMLDSLAASFDQELRDKERDMAQAQALLTNIQAEILESQRTVLQLRQQAEGLSQAKQRLADLE",
|
||||
"NALQDKMGRRYRLGFEKWIKDEETREKVIRDAANGDLVLTPATTSYTVDEDGDSDSGSNGDKNKGKRKAQ",
|
||||
"VQQEEVSDLVELYSNIPTDPEELRKQCEALREEVSQSRKRRKAMFDELVTFQAEAGTSGRMSDYRRLIAA",
|
||||
"GCGGLEPLEIDSVLGMLLETLEAEDPSSTSATWSGSKGQQTG"]
|
||||
},
|
||||
{ "name" : "MBP1_CRYNE",
|
||||
"RefSeqID" : "XP_569090",
|
||||
"UniProtID" : "Q5KMQ9",
|
||||
"taxonomyID" : 214684,
|
||||
"sequence" : [
|
||||
"MGKKVIASGGDNGPNTIYKATYSGVPVYEMVCRDVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREV",
|
||||
"QKGEHEKVQGGYGKYQGTWIPIERGLALAKQYGVEDILRPIIDYVPTSVSPPPAPKHSVAPPSKARRDKE",
|
||||
"KETGRTKATPSRTGPTSAAALQAQAQLNRAKMHDSTPDADASFRSFEERVSLTPEDDSSSDTPSPVASVM",
|
||||
"TDQDMEVDKMGMHMSMPNVTLSQNMEELGAGSRKRSAAMMMEDEDQFGQLRSIRGNSAVHTPHGTPRHLG",
|
||||
"IGMPPEPIGPEQYTDIILNYFVSETSQIPSILVSPPHDFDPNAPIDDDGHTALHWACAMGRVRVVKLLLT",
|
||||
"AGASIFAGNNAEQTPLMRSVMFSNNYDMRKFPELYELLHRSTLNIDKQNRTVFHHIANLALTKGKTHAAK",
|
||||
"YYMETILARLADYPQELADVINFQDEEGETALTIAARARSRRLVKALLDHGANPKIKNRDSRSAEDYILE",
|
||||
"DERFRSSPVPAPNGGIGKASTSAAAEKPLFAPQLYFSEAARLCGGQALTDITSHMQSLARSFDAELQGKE",
|
||||
"RDILQAKALLTNIHTEVTENGRSITAITNQAAPLEEKRRELEALQASLKTRVKDALKKGYIGWLEGELVR",
|
||||
"EQRWENGELEGNEEEKAAVQALRDVPTGGQEVVQAEEEKLRWEIEEKRKRRAMFVEKFVRAQTEAGTSEQ",
|
||||
"IAKYRKLVSAGLGGVSTNEVDELMNQLLEGLEEENDNQVYNTTAGESGPSSWVQ"]
|
||||
},
|
||||
{ "name" : "MBP1_PUCGR",
|
||||
"RefSeqID" : "XP_003327086",
|
||||
"UniProtID" : "E3KED4",
|
||||
"taxonomyID" : 418459,
|
||||
"sequence" : [
|
||||
"MAYGGSIQPLRPPSRESATLHLHQPDLTVTSPPLSLTHCPPCVYSHFTHTPTSLIVIQVSLHSLLDQETY",
|
||||
"HLLPSRSPPTVSVRMGTTTIYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRVLE",
|
||||
"REIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNFQPSSNESPPLAPKHVTALSTRVK",
|
||||
"VSKVSAASAARAARAVVPSLPSTSGLGGRNTNNSWSNFDSDNEPGLPPAASSRESNGNWATQSKLARSSN",
|
||||
"LARARANINNSHPEDLPVPAPDQLQASPLPSMQTADPENDNSLTPSELSLPSRTPSPIEDLPLTVNTASS",
|
||||
"QSTRNKGKSRDLPDDEDLSRGQKRKYDTSLVEDTSYSDGADDQYINGNPSNAASAKYAKLILDYFVSESS",
|
||||
"QIPNFLNDPPSDFDPNVVIDDDGHTALHWACAMGRIKIIKLLLTCGADIFRANNAGQTALMRAVMFTNNH",
|
||||
"DLRTFPELFESFSGSVINIDRTDRTVFHYVIDIALTKGKVPAARYYLETILSQLSEYPKELIDILNFQDE",
|
||||
"DGETALTLAARCRSKKLVKILLDHGANPKTANRDGKSAEDYILEDDKFRALSPTPCSSGPIRQLDQNSPG",
|
||||
"GTSNRSDFVDLVDPVPIDSNLIPQRSPNASPPHYSETGQRVTKQLLPEVTSMIELLATTFDTELQDKERD",
|
||||
"LDHAVGLLSNIEKEYLEGQRKILNYERMLSDFGEKKLALGDLEKELNDKLGKRYRFGWEKYVRDEEERAR",
|
||||
"RITEQRSKYLQELSIEDRKLLDSSNLRFADPSKQEVLMKLQADERENSDLLNLIRTNSTDVESECDLLRE",
|
||||
"SVQKLSEERERLFKEFINLSSENTGGENEEDDGANHTSANTSRLNNYRKLISLGCGGIGLDEVDEVIESL",
|
||||
"NEGIDVNELNDNGFLTEQDEELGNHQNYHNIHTQGR"]
|
||||
},
|
||||
{ "name" : "MBP1_USTMA",
|
||||
"RefSeqID" : "XP_011392621",
|
||||
"UniProtID" : "A0A0D1DP35",
|
||||
"taxonomyID" : 237631,
|
||||
"sequence" : [
|
||||
"MSGDKTIFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRVLEREIQKGIHEKVQG",
|
||||
"GYGKYQGTWIPLDVAIELAERYNIQGLLQPITSYVPSAADSPPPAPKHTISTSNRSKKIIPADPGALGRS",
|
||||
"RRATSIETESEVIGAAPNNVSEGSMSPSPSDISSSSRTPSPLPADRAHPLHANHALAGYNGRDANNHARY",
|
||||
"ADIILDYFVTENTTVPSLLINPPPDFNPDMSIDDDEHTALHWACAMGRIRVVKLLLSAGADIFRVNSNQQ",
|
||||
"TALMRATMFSNNYDLRKFPELFELLHRSILNIDRNDRTVFHHVVDLALSRGKPHAARYYMETMINRLADY",
|
||||
"GDQLADILNFQDDEGETPLTMAARARSKRLVRLLLEHGADPKIRNKEGKNAEDYIIEDERFRSSPSRTGP",
|
||||
"AGIELGADGLPVLPTSSLHTSEAGQRTAGRAVTLMSNLLHSLADSYDSEINTAEKKLTQAHGLLKQIQTE",
|
||||
"IEDSAKVAEALHHEAQGVDEERKRVDSLQLALKHAINKRARDDLERRWSEGKQAIKRARLQAGLEPGALS",
|
||||
"TSNATNAPATGDQKSKDDAKSLIEALPAGTNVKTAIAELRKQLSQVQANKTELVDKFVARAREQGTGRTM",
|
||||
"AAYRRLIAAGCGGIAPDEVDAVVGVLCELLQESHTGARAGAGGERDDRARDVAMMLKGAGAAALAANAGA",
|
||||
"P"]
|
||||
},
|
||||
{ "name" : "MBP1_WALME",
|
||||
"RefSeqID" : "XP_006957051",
|
||||
"UniProtID" : "I4YGC0",
|
||||
"taxonomyID" : 671144,
|
||||
"sequence" : [
|
||||
"MSAPPIYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRILEREVQKGTHEKVQGG",
|
||||
"YGKYQGTWIPMERSVELARQYRIELLLDPIINYLPGPQSPPLAPKHATNVGSRARKSTAPAAQTLPSTSK",
|
||||
"VFHPLSSTKHPAKLAAATNAKAEISDGEDASIPSSPSFKSNSSRTPSPIRINARKRKLEDEATIPSSAID",
|
||||
"GSISYEDIILDYFISESTQIPALLIHPPSDFNPNMSIDDEGHTAMHWACAMGKVRVVKLLLSAGADIFRV",
|
||||
"NHSEQTALMRSVMFSNNYDIRKFPQLYELLHRSTLNLDKHDRTVLHHIVDLALTKSKTHAARYYMECVLS",
|
||||
"KLANYPDELADVINFQDDEGESALTLAARARSKRLVKLLLEHGADSKLPNKDGKTAEDYILEDERFRQSP",
|
||||
"LLNSNHLRLHPPDTSIYAPPAHLFNSETSQNIANTSMSSVANLLESLAQSYDKEITQKERDYQQAQVILR",
|
||||
"NIKTDIVEAKSNIEKMTIDSSEFEHLKHKLRELEMKLEEHSNDVYNKGWEEYSRNVDDPAIDAPSDNVQE",
|
||||
"ECASLRNKIKDLQEKRISSMQELIKRQKEVGTGKKMSEYRKLISVGCGIPTTEIDAVLEMLLESLESENA",
|
||||
"NKKAALASGISGALSSTSSAPSQATTSAPTGVATPGAPVPASSEKAGLLPPAPVMQ"]
|
||||
}
|
||||
]
|
||||
|
@ -1,22 +1,22 @@
|
||||
[
|
||||
{ "ID" : 227321,
|
||||
"species" : "Aspergillus nidulans FGSC A4"},
|
||||
{ "ID" : 930090,
|
||||
"species" : "Bipolaris oryzae ATCC 44560"},
|
||||
{ "ID" : 240176,
|
||||
"species" : "Coprinopsis cinerea okayama7#130"},
|
||||
{ "ID" : 214684,
|
||||
"species" : "Cryptococcus neoformans var. neoformans JEC21"},
|
||||
{ "ID" : 367110,
|
||||
"species" : "Neurospora crassa OR74A"},
|
||||
{ "ID" : 418459,
|
||||
"species" : "Puccinia graminis f. sp. tritici CRL 75-36-700-3"},
|
||||
{ "ID" : 559292,
|
||||
"species" : "Saccharomyces cerevisiae S288C"},
|
||||
{ "ID" : 284812,
|
||||
"species" : "Schizosaccharomyces pombe 972h-"},
|
||||
{ "ID" : 237631,
|
||||
"species" : "Ustilago maydis 521"},
|
||||
{ "ID" : 671144,
|
||||
"species" : "Wallemia mellicola CBS 633.66"}
|
||||
]
|
||||
[
|
||||
{ "ID" : 227321,
|
||||
"species" : "Aspergillus nidulans FGSC A4"},
|
||||
{ "ID" : 930090,
|
||||
"species" : "Bipolaris oryzae ATCC 44560"},
|
||||
{ "ID" : 240176,
|
||||
"species" : "Coprinopsis cinerea okayama7#130"},
|
||||
{ "ID" : 214684,
|
||||
"species" : "Cryptococcus neoformans var. neoformans JEC21"},
|
||||
{ "ID" : 367110,
|
||||
"species" : "Neurospora crassa OR74A"},
|
||||
{ "ID" : 418459,
|
||||
"species" : "Puccinia graminis f. sp. tritici CRL 75-36-700-3"},
|
||||
{ "ID" : 559292,
|
||||
"species" : "Saccharomyces cerevisiae S288C"},
|
||||
{ "ID" : 284812,
|
||||
"species" : "Schizosaccharomyces pombe 972h-"},
|
||||
{ "ID" : 237631,
|
||||
"species" : "Ustilago maydis 521"},
|
||||
{ "ID" : 671144,
|
||||
"species" : "Wallemia mellicola CBS 633.66"}
|
||||
]
|
||||
|
@ -1,115 +1,115 @@
|
||||
ID protein.ID feature.ID start end note
|
||||
# MBP1_SACCE
|
||||
NA ref_pro_4 ref_ftr_1 4 102 APSES fold
|
||||
NA ref_pro_4 ref_ftr_2 22 105 KilA-N
|
||||
NA ref_pro_4 ref_ftr_4 108 122 low complexity
|
||||
NA ref_pro_4 ref_ftr_4 236 241 low complexity
|
||||
NA ref_pro_4 ref_ftr_4 279 307 low complexity
|
||||
NA ref_pro_4 ref_ftr_4 700 717 low complexity
|
||||
NA ref_pro_4 ref_ftr_4 700 717 low complexity
|
||||
NA ref_pro_4 ref_ftr_5 394 423 Ankyrin
|
||||
NA ref_pro_4 ref_ftr_5 427 463 Ankyrin
|
||||
NA ref_pro_4 ref_ftr_5 512 541 Ankyrin
|
||||
NA ref_pro_4 ref_ftr_6 381 547 Swi6 fold
|
||||
NA ref_pro_4 ref_ftr_7 633 655 coiled coil
|
||||
# MBP1_ASPNI
|
||||
NA ref_pro_1 ref_ftr_1 9 106 APSES fold
|
||||
NA ref_pro_1 ref_ftr_2 26 109 KilA-N
|
||||
NA ref_pro_1 ref_ftr_4 529 534 low complexity
|
||||
NA ref_pro_1 ref_ftr_5 260 289 Ankyrin
|
||||
NA ref_pro_1 ref_ftr_5 381 413 Ankyrin
|
||||
NA ref_pro_1 ref_ftr_6 193 402 Swi6 fold
|
||||
NA ref_pro_1 ref_ftr_7 509 572 coiled coil
|
||||
# MBP1_BIPOR
|
||||
NA ref_pro_2 ref_ftr_1 8 106 APSES fold
|
||||
NA ref_pro_2 ref_ftr_2 26 109 KilA-N
|
||||
NA ref_pro_2 ref_ftr_4 134 152 low complexity
|
||||
NA ref_pro_2 ref_ftr_4 267 278 low complexity
|
||||
NA ref_pro_2 ref_ftr_4 670 685 low complexity
|
||||
NA ref_pro_2 ref_ftr_5 266 295 Ankyrin
|
||||
NA ref_pro_2 ref_ftr_5 387 416 Ankyrin
|
||||
NA ref_pro_2 ref_ftr_6 253 421 Swi6 fold
|
||||
NA ref_pro_2 ref_ftr_7 659 681 coiled coil
|
||||
NA ref_pro_2 ref_ftr_7 500 590 coiled coil
|
||||
# MBP1_NEUCR
|
||||
NA ref_pro_3 ref_ftr_1 14 114 APSES fold
|
||||
NA ref_pro_3 ref_ftr_2 34 117 KilA-N
|
||||
NA ref_pro_3 ref_ftr_4 130 141 low complexity
|
||||
NA ref_pro_3 ref_ftr_4 253 266 low complexity
|
||||
NA ref_pro_3 ref_ftr_4 514 525 low complexity
|
||||
NA ref_pro_3 ref_ftr_4 554 564 low complexity
|
||||
NA ref_pro_3 ref_ftr_4 601 618 low complexity
|
||||
NA ref_pro_3 ref_ftr_4 620 629 low complexity
|
||||
NA ref_pro_3 ref_ftr_4 636 652 low complexity
|
||||
NA ref_pro_3 ref_ftr_4 658 672 low complexity
|
||||
NA ref_pro_3 ref_ftr_4 725 735 low complexity
|
||||
NA ref_pro_3 ref_ftr_4 752 771 low complexity
|
||||
NA ref_pro_3 ref_ftr_5 268 297 Ankyrin
|
||||
NA ref_pro_3 ref_ftr_5 390 419 Ankyrin
|
||||
NA ref_pro_3 ref_ftr_6 270 426 Swi6 fold
|
||||
NA ref_pro_3 ref_ftr_7 500 550 coiled coil
|
||||
# MBP1_SCHPO
|
||||
NA ref_pro_5 ref_ftr_1 8 104 APSES fold
|
||||
NA ref_pro_5 ref_ftr_2 25 113 KilA-N
|
||||
NA ref_pro_5 ref_ftr_4 111 125 low complexity
|
||||
NA ref_pro_5 ref_ftr_4 136 145 low complexity
|
||||
NA ref_pro_5 ref_ftr_4 176 191 low complexity
|
||||
NA ref_pro_5 ref_ftr_4 422 447 low complexity
|
||||
NA ref_pro_5 ref_ftr_5 247 276 Ankyrin
|
||||
NA ref_pro_5 ref_ftr_5 368 397 Ankyrin
|
||||
NA ref_pro_5 ref_ftr_6 234 400 Swi6 fold
|
||||
NA ref_pro_5 ref_ftr_7 457 538 coiled coil
|
||||
# MBP1_COPCI
|
||||
NA ref_pro_6 ref_ftr_1 5 103 APSES fold
|
||||
NA ref_pro_6 ref_ftr_2 23 106 KilA-N
|
||||
NA ref_pro_6 ref_ftr_4 170 191 low complexity
|
||||
NA ref_pro_6 ref_ftr_4 435 450 low complexity
|
||||
NA ref_pro_6 ref_ftr_4 611 626 low complexity
|
||||
NA ref_pro_6 ref_ftr_5 270 299 Ankyrin
|
||||
NA ref_pro_6 ref_ftr_5 389 418 Ankyrin
|
||||
NA ref_pro_6 ref_ftr_5 474 509 Ankyrin
|
||||
NA ref_pro_6 ref_ftr_6 257 429 Swi6 fold
|
||||
NA ref_pro_6 ref_ftr_7 500 570 coiled coil
|
||||
NA ref_pro_6 ref_ftr_7 651 678 coiled coil
|
||||
# MBP1_CRYNE
|
||||
NA ref_pro_7 ref_ftr_1 113 211 APSES fold
|
||||
NA ref_pro_7 ref_ftr_2 131 215 KilA-N
|
||||
NA ref_pro_7 ref_ftr_4 66 85 low complexity
|
||||
NA ref_pro_7 ref_ftr_4 413 423 low complexity
|
||||
NA ref_pro_7 ref_ftr_4 633 644 low complexity
|
||||
NA ref_pro_7 ref_ftr_4 697 709 low complexity
|
||||
NA ref_pro_7 ref_ftr_5 477 506 Ankyrin
|
||||
NA ref_pro_7 ref_ftr_5 618 647 Ankyrin
|
||||
NA ref_pro_7 ref_ftr_6 452 663 Swi6 fold
|
||||
# MBP1_PUCGR
|
||||
NA ref_pro_8 ref_ftr_1 90 187 APSES fold
|
||||
NA ref_pro_8 ref_ftr_2 107 190 KilA-N
|
||||
NA ref_pro_8 ref_ftr_4 208 227 low complexity
|
||||
NA ref_pro_8 ref_ftr_4 273 291 low complexity
|
||||
NA ref_pro_8 ref_ftr_5 442 271 Ankyrin
|
||||
NA ref_pro_8 ref_ftr_5 475 509 Ankyrin
|
||||
NA ref_pro_8 ref_ftr_5 561 590 Ankyrin
|
||||
NA ref_pro_8 ref_ftr_6 429 601 Swi6 fold
|
||||
NA ref_pro_8 ref_ftr_7 827 863 coiled coil
|
||||
# MBP1_USTMA
|
||||
NA ref_pro_9 ref_ftr_1 7 104 APSES fold
|
||||
NA ref_pro_9 ref_ftr_2 24 107 KilA-N
|
||||
NA ref_pro_9 ref_ftr_4 106 116 low complexity
|
||||
NA ref_pro_9 ref_ftr_4 161 183 low complexity
|
||||
NA ref_pro_9 ref_ftr_4 657 672 low complexity
|
||||
NA ref_pro_9 ref_ftr_4 776 796 low complexity
|
||||
NA ref_pro_9 ref_ftr_5 245 274 Ankyrin
|
||||
NA ref_pro_9 ref_ftr_5 355 384 Ankyrin
|
||||
NA ref_pro_9 ref_ftr_6 232 395 Swi6 fold
|
||||
NA ref_pro_9 ref_ftr_7 581 609 coiled coil
|
||||
# MBP1_WALME
|
||||
NA ref_pro_10 ref_ftr_1 6 103 APSES fold
|
||||
NA ref_pro_10 ref_ftr_2 23 106 KilA-N
|
||||
NA ref_pro_10 ref_ftr_4 149 162 low complexity
|
||||
NA ref_pro_10 ref_ftr_4 171 188 low complexity
|
||||
NA ref_pro_10 ref_ftr_4 618 628 low complexity
|
||||
NA ref_pro_10 ref_ftr_4 634 660 low complexity
|
||||
NA ref_pro_10 ref_ftr_5 250 279 Ankyrin
|
||||
NA ref_pro_10 ref_ftr_5 369 398 Ankyrin
|
||||
NA ref_pro_10 ref_ftr_6 237 409 Swi6 fold
|
||||
NA ref_pro_10 ref_ftr_7 461 585 coiled coil
|
||||
ID protein.ID feature.ID start end note
|
||||
# MBP1_SACCE
|
||||
NA ref_pro_4 ref_ftr_1 4 102 APSES fold
|
||||
NA ref_pro_4 ref_ftr_2 22 105 KilA-N
|
||||
NA ref_pro_4 ref_ftr_4 108 122 low complexity
|
||||
NA ref_pro_4 ref_ftr_4 236 241 low complexity
|
||||
NA ref_pro_4 ref_ftr_4 279 307 low complexity
|
||||
NA ref_pro_4 ref_ftr_4 700 717 low complexity
|
||||
NA ref_pro_4 ref_ftr_4 700 717 low complexity
|
||||
NA ref_pro_4 ref_ftr_5 394 423 Ankyrin
|
||||
NA ref_pro_4 ref_ftr_5 427 463 Ankyrin
|
||||
NA ref_pro_4 ref_ftr_5 512 541 Ankyrin
|
||||
NA ref_pro_4 ref_ftr_6 381 547 Swi6 fold
|
||||
NA ref_pro_4 ref_ftr_7 633 655 coiled coil
|
||||
# MBP1_ASPNI
|
||||
NA ref_pro_1 ref_ftr_1 9 106 APSES fold
|
||||
NA ref_pro_1 ref_ftr_2 26 109 KilA-N
|
||||
NA ref_pro_1 ref_ftr_4 529 534 low complexity
|
||||
NA ref_pro_1 ref_ftr_5 260 289 Ankyrin
|
||||
NA ref_pro_1 ref_ftr_5 381 413 Ankyrin
|
||||
NA ref_pro_1 ref_ftr_6 193 402 Swi6 fold
|
||||
NA ref_pro_1 ref_ftr_7 509 572 coiled coil
|
||||
# MBP1_BIPOR
|
||||
NA ref_pro_2 ref_ftr_1 8 106 APSES fold
|
||||
NA ref_pro_2 ref_ftr_2 26 109 KilA-N
|
||||
NA ref_pro_2 ref_ftr_4 134 152 low complexity
|
||||
NA ref_pro_2 ref_ftr_4 267 278 low complexity
|
||||
NA ref_pro_2 ref_ftr_4 670 685 low complexity
|
||||
NA ref_pro_2 ref_ftr_5 266 295 Ankyrin
|
||||
NA ref_pro_2 ref_ftr_5 387 416 Ankyrin
|
||||
NA ref_pro_2 ref_ftr_6 253 421 Swi6 fold
|
||||
NA ref_pro_2 ref_ftr_7 659 681 coiled coil
|
||||
NA ref_pro_2 ref_ftr_7 500 590 coiled coil
|
||||
# MBP1_NEUCR
|
||||
NA ref_pro_3 ref_ftr_1 14 114 APSES fold
|
||||
NA ref_pro_3 ref_ftr_2 34 117 KilA-N
|
||||
NA ref_pro_3 ref_ftr_4 130 141 low complexity
|
||||
NA ref_pro_3 ref_ftr_4 253 266 low complexity
|
||||
NA ref_pro_3 ref_ftr_4 514 525 low complexity
|
||||
NA ref_pro_3 ref_ftr_4 554 564 low complexity
|
||||
NA ref_pro_3 ref_ftr_4 601 618 low complexity
|
||||
NA ref_pro_3 ref_ftr_4 620 629 low complexity
|
||||
NA ref_pro_3 ref_ftr_4 636 652 low complexity
|
||||
NA ref_pro_3 ref_ftr_4 658 672 low complexity
|
||||
NA ref_pro_3 ref_ftr_4 725 735 low complexity
|
||||
NA ref_pro_3 ref_ftr_4 752 771 low complexity
|
||||
NA ref_pro_3 ref_ftr_5 268 297 Ankyrin
|
||||
NA ref_pro_3 ref_ftr_5 390 419 Ankyrin
|
||||
NA ref_pro_3 ref_ftr_6 270 426 Swi6 fold
|
||||
NA ref_pro_3 ref_ftr_7 500 550 coiled coil
|
||||
# MBP1_SCHPO
|
||||
NA ref_pro_5 ref_ftr_1 8 104 APSES fold
|
||||
NA ref_pro_5 ref_ftr_2 25 113 KilA-N
|
||||
NA ref_pro_5 ref_ftr_4 111 125 low complexity
|
||||
NA ref_pro_5 ref_ftr_4 136 145 low complexity
|
||||
NA ref_pro_5 ref_ftr_4 176 191 low complexity
|
||||
NA ref_pro_5 ref_ftr_4 422 447 low complexity
|
||||
NA ref_pro_5 ref_ftr_5 247 276 Ankyrin
|
||||
NA ref_pro_5 ref_ftr_5 368 397 Ankyrin
|
||||
NA ref_pro_5 ref_ftr_6 234 400 Swi6 fold
|
||||
NA ref_pro_5 ref_ftr_7 457 538 coiled coil
|
||||
# MBP1_COPCI
|
||||
NA ref_pro_6 ref_ftr_1 5 103 APSES fold
|
||||
NA ref_pro_6 ref_ftr_2 23 106 KilA-N
|
||||
NA ref_pro_6 ref_ftr_4 170 191 low complexity
|
||||
NA ref_pro_6 ref_ftr_4 435 450 low complexity
|
||||
NA ref_pro_6 ref_ftr_4 611 626 low complexity
|
||||
NA ref_pro_6 ref_ftr_5 270 299 Ankyrin
|
||||
NA ref_pro_6 ref_ftr_5 389 418 Ankyrin
|
||||
NA ref_pro_6 ref_ftr_5 474 509 Ankyrin
|
||||
NA ref_pro_6 ref_ftr_6 257 429 Swi6 fold
|
||||
NA ref_pro_6 ref_ftr_7 500 570 coiled coil
|
||||
NA ref_pro_6 ref_ftr_7 651 678 coiled coil
|
||||
# MBP1_CRYNE
|
||||
NA ref_pro_7 ref_ftr_1 113 211 APSES fold
|
||||
NA ref_pro_7 ref_ftr_2 131 215 KilA-N
|
||||
NA ref_pro_7 ref_ftr_4 66 85 low complexity
|
||||
NA ref_pro_7 ref_ftr_4 413 423 low complexity
|
||||
NA ref_pro_7 ref_ftr_4 633 644 low complexity
|
||||
NA ref_pro_7 ref_ftr_4 697 709 low complexity
|
||||
NA ref_pro_7 ref_ftr_5 477 506 Ankyrin
|
||||
NA ref_pro_7 ref_ftr_5 618 647 Ankyrin
|
||||
NA ref_pro_7 ref_ftr_6 452 663 Swi6 fold
|
||||
# MBP1_PUCGR
|
||||
NA ref_pro_8 ref_ftr_1 90 187 APSES fold
|
||||
NA ref_pro_8 ref_ftr_2 107 190 KilA-N
|
||||
NA ref_pro_8 ref_ftr_4 208 227 low complexity
|
||||
NA ref_pro_8 ref_ftr_4 273 291 low complexity
|
||||
NA ref_pro_8 ref_ftr_5 442 271 Ankyrin
|
||||
NA ref_pro_8 ref_ftr_5 475 509 Ankyrin
|
||||
NA ref_pro_8 ref_ftr_5 561 590 Ankyrin
|
||||
NA ref_pro_8 ref_ftr_6 429 601 Swi6 fold
|
||||
NA ref_pro_8 ref_ftr_7 827 863 coiled coil
|
||||
# MBP1_USTMA
|
||||
NA ref_pro_9 ref_ftr_1 7 104 APSES fold
|
||||
NA ref_pro_9 ref_ftr_2 24 107 KilA-N
|
||||
NA ref_pro_9 ref_ftr_4 106 116 low complexity
|
||||
NA ref_pro_9 ref_ftr_4 161 183 low complexity
|
||||
NA ref_pro_9 ref_ftr_4 657 672 low complexity
|
||||
NA ref_pro_9 ref_ftr_4 776 796 low complexity
|
||||
NA ref_pro_9 ref_ftr_5 245 274 Ankyrin
|
||||
NA ref_pro_9 ref_ftr_5 355 384 Ankyrin
|
||||
NA ref_pro_9 ref_ftr_6 232 395 Swi6 fold
|
||||
NA ref_pro_9 ref_ftr_7 581 609 coiled coil
|
||||
# MBP1_WALME
|
||||
NA ref_pro_10 ref_ftr_1 6 103 APSES fold
|
||||
NA ref_pro_10 ref_ftr_2 23 106 KilA-N
|
||||
NA ref_pro_10 ref_ftr_4 149 162 low complexity
|
||||
NA ref_pro_10 ref_ftr_4 171 188 low complexity
|
||||
NA ref_pro_10 ref_ftr_4 618 628 low complexity
|
||||
NA ref_pro_10 ref_ftr_4 634 660 low complexity
|
||||
NA ref_pro_10 ref_ftr_5 250 279 Ankyrin
|
||||
NA ref_pro_10 ref_ftr_5 369 398 Ankyrin
|
||||
NA ref_pro_10 ref_ftr_6 237 409 Swi6 fold
|
||||
NA ref_pro_10 ref_ftr_7 461 585 coiled coil
|
||||
|
@ -1,37 +1,37 @@
|
||||
# functionTemplate.R
|
||||
#
|
||||
# Purpose: (General)
|
||||
#
|
||||
# ToDo:
|
||||
# Notes:
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
myFunction <- function(a, b=1) {
|
||||
# Purpose:
|
||||
# Describe ...
|
||||
# Version:
|
||||
# Date:
|
||||
# Author:
|
||||
#
|
||||
# Parameters:
|
||||
# a: ...
|
||||
# b: ...
|
||||
# Value:
|
||||
# result: ...
|
||||
# Example: <example invocation>
|
||||
|
||||
# code ...
|
||||
|
||||
return(result)
|
||||
}
|
||||
|
||||
|
||||
# ==== TESTS =================================================================
|
||||
# Enter your function tests here...
|
||||
|
||||
if (FALSE) {
|
||||
# test ...
|
||||
}
|
||||
|
||||
# [END]
|
||||
# functionTemplate.R
|
||||
#
|
||||
# Purpose: (General)
|
||||
#
|
||||
# ToDo:
|
||||
# Notes:
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
myFunction <- function(a, b=1) {
|
||||
# Purpose:
|
||||
# Describe ...
|
||||
# Version:
|
||||
# Date:
|
||||
# Author:
|
||||
#
|
||||
# Parameters:
|
||||
# a: ...
|
||||
# b: ...
|
||||
# Value:
|
||||
# result: ...
|
||||
# Example: <example invocation>
|
||||
|
||||
# code ...
|
||||
|
||||
return(result)
|
||||
}
|
||||
|
||||
|
||||
# ==== TESTS =================================================================
|
||||
# Enter your function tests here...
|
||||
|
||||
if (FALSE) {
|
||||
# test ...
|
||||
}
|
||||
|
||||
# [END]
|
||||
|
@ -1,21 +1,21 @@
|
||||
# .myProfile.R
|
||||
# This contains information which the course framework needs from time to time
|
||||
# to personalize assignments, validate submissions etc. Make sure that
|
||||
# the information correctly matches our official records.
|
||||
# myEMail char A string with your eMail address. Use your official
|
||||
# UofT eMail address.
|
||||
# myStudentNumber numeric Your UofT student number. Take care to have this
|
||||
# correct.
|
||||
#
|
||||
# NOTE:
|
||||
# After you have updated this script, move the file to your "myScripts" folder.
|
||||
# Utility scripts will look for it on the path: "./myScripts/.myProfile.R"
|
||||
#
|
||||
# ==============================================================================
|
||||
# options(stringsAsFactors = FALSE)
|
||||
|
||||
myEMail <- "yh.deng@mail.utoronto.ca" # e.g. "u.franklin@utoronto.ca"
|
||||
myStudentNumber <- 1005845285 # e.g. 1003141592
|
||||
MYSPE <- "Cutaneotrichosporon oleaginosum"
|
||||
|
||||
# [END]
|
||||
# .myProfile.R
|
||||
# This contains information which the course framework needs from time to time
|
||||
# to personalize assignments, validate submissions etc. Make sure that
|
||||
# the information correctly matches our official records.
|
||||
# myEMail char A string with your eMail address. Use your official
|
||||
# UofT eMail address.
|
||||
# myStudentNumber numeric Your UofT student number. Take care to have this
|
||||
# correct.
|
||||
#
|
||||
# NOTE:
|
||||
# After you have updated this script, move the file to your "myScripts" folder.
|
||||
# Utility scripts will look for it on the path: "./myScripts/.myProfile.R"
|
||||
#
|
||||
# ==============================================================================
|
||||
# options(stringsAsFactors = FALSE)
|
||||
|
||||
myEMail <- "yh.deng@mail.utoronto.ca" # e.g. "u.franklin@utoronto.ca"
|
||||
myStudentNumber <- 1005845285 # e.g. 1003141592
|
||||
MYSPE <- "Cutaneotrichosporon oleaginosum"
|
||||
|
||||
# [END]
|
||||
|
@ -1,54 +1,51 @@
|
||||
# Read the four coding sequences and stack them row by row, in the same
# left-to-right order as chained rbind() calls would.
myFA <- Reduce(
  rbind,
  lapply(
    c("data/RAB39B_HSa_coding.fa",
      "data/PTPN5_HSa_coding.fa",
      "data/PTPN11_HSa_coding.fa",
      "data/KRAS_HSa_coding.fa"),
    readFASTA
  )
)
# Address each record by its gene symbol.
rownames(myFA) <- c("RAB39B", "PTPN5", "PTPN11", "KRAS")
|
||||
|
||||
gen_mutations <- function(seq, N) {
  # Purpose:
  #   Apply N independent random point mutations (insertion, deletion or
  #   substitution, chosen with equal probability) to a coding sequence and
  #   tally the effect of each one on the translated protein.
  #
  # Parameters:
  #   seq:  coding DNA sequence; anything Biostrings::DNAString() accepts
  #   N:    number of mutation trials to run
  # Value:
  #   3 x 1 numeric matrix with rows "silent", "missense", "nonsense" and
  #   column "occurrences".

  stats <- matrix(0, nrow = 3, ncol = 1,
                  dimnames = list(c("silent", "missense", "nonsense"),
                                  "occurrences"))

  # The reference sequence, its translation and its first stop codon are the
  # same for every trial, so compute them once.
  working_seq <- Biostrings::DNAString(seq)
  seq_chars <- as.character(working_seq)
  seq_length <- length(working_seq)
  aa_seq <- Biostrings::translate(working_seq, no.init.codon = TRUE)
  aa_seq_stop <- match("*", Biostrings::as.matrix(aa_seq))

  # seq_len(N) honours the requested trial count (previously fixed at 217)
  # and runs zero times for N = 0.
  for (i in seq_len(N)) {
    mut_action <- sample(c("ins", "del", "sub"), 1, TRUE)

    if (mut_action == "sub") {
      # Replace one base with a different base.
      mut_index <- sample(seq_len(seq_length), 1, replace = TRUE)
      possible_mutations <- setdiff(Biostrings::DNA_BASES,
                                    as.character(working_seq[mut_index]))
      mut_change <- sample(possible_mutations, 1, replace = TRUE)
      mut_seq <- Biostrings::replaceLetterAt(working_seq, mut_index,
                                             mut_change)
    } else if (mut_action == "ins") {
      # Insert a random base in front of position mut_index.
      # The original `1:length(x) - 2` evaluated as `(1:n) - 2`, producing
      # indices -1 and 0; the range is now 1..(n - 2), at least 1.
      mut_index <- sample(seq_len(max(seq_length - 2L, 1L)), 1,
                          replace = TRUE)
      mut_seq <- Biostrings::DNAString(paste0(
        substring(seq_chars, 1, mut_index - 1),
        sample(Biostrings::DNA_BASES, 1),
        substring(seq_chars, mut_index)
      ))
    } else {
      # Delete one base.
      mut_index <- sample(seq_len(seq_length), 1, replace = TRUE)
      mut_seq <- working_seq[-mut_index]
    }

    # Indels change the length; keep only whole codons before translating.
    usable <- length(mut_seq) - (length(mut_seq) %% 3)
    mut_seq <- Biostrings::DNAString(
      substring(as.character(mut_seq), 1, usable)
    )
    mut_aa <- Biostrings::translate(mut_seq, no.init.codon = TRUE)

    # Nonsense: the mutant has a stop codon and the reference either has none
    # or has its first stop further downstream. Scalar && / || short-circuit,
    # so an NA reference stop never reaches the `<` comparison.
    mut_aa_stop <- match("*", Biostrings::as.matrix(mut_aa))
    if (!is.na(mut_aa_stop) &&
        (is.na(aa_seq_stop) || mut_aa_stop < aa_seq_stop)) {
      stats["nonsense", "occurrences"] <- 1 + stats["nonsense", "occurrences"]
    } else if (mut_aa == aa_seq) {
      stats["silent", "occurrences"] <- 1 + stats["silent", "occurrences"]
    } else {
      stats["missense", "occurrences"] <- 1 + stats["missense", "occurrences"]
    }
  }

  return(stats)
}
|
||||
# Number of mutation trials used for every synthetic test sequence below.
N_test <- 1200

# Homopolymeric / repetitive codon runs with known mutation profiles.
gen_mutations(seq = "ATGATGATGATGATGATG", N = N_test)
gen_mutations(seq = "CCCCCCCCCCCCCCCCCC", N = N_test)
gen_mutations(seq = "TATTACTATTACTATTAC", N = N_test)
gen_mutations(seq = "TGGTGGTGGTGGTGGTGGTGGTGG", N = N_test)
gen_mutations(seq = "TGTTGTTGTTGTTGTTGTTGTTGT", N = N_test)
|
||||
gen_mutations <- function(seq, N) {
  # Purpose:
  #   Apply N independent random single-base substitutions to a coding
  #   sequence and tally the effect of each one on the translated protein.
  #
  # Parameters:
  #   seq:  coding DNA sequence; anything Biostrings::DNAString() accepts
  #   N:    number of substitution trials to run
  # Value:
  #   3 x 1 numeric matrix with rows "silent", "missense", "nonsense" and
  #   column "occurrences".
  #
  # sealKey() is defined outside this block; it is called on entry and just
  # before returning, as in the linked submission instructions.

  sealKey() # See: http://steipe.biochemistry.utoronto.ca/abc/index.php/BCH441_Code_submisson_instructions

  stats <- matrix(0, nrow = 3, ncol = 1,
                  dimnames = list(c("silent", "missense", "nonsense"),
                                  "occurrences"))

  # The reference sequence, its translation and the position of its first
  # stop codon are identical for every trial, so compute them once.
  original_seq <- Biostrings::DNAString(seq)
  aa_seq <- Biostrings::translate(original_seq, no.init.codon = TRUE)
  term_aa <- as.integer(regexpr(pattern = "\\*", aa_seq))  # -1: no stop
  positions <- seq_len(length(original_seq))

  # seq_len(N) runs zero times for N = 0, whereas 1:N would run twice.
  for (i in seq_len(N)) {
    # Pick a position and replace its base with one of the other bases.
    mut_index <- sample(positions, 1, replace = TRUE)
    possible_mutations <- setdiff(Biostrings::DNA_BASES,
                                  as.character(original_seq[mut_index]))
    mut_seq <- Biostrings::replaceLetterAt(
      original_seq, mut_index,
      sample(possible_mutations, 1, replace = TRUE)
    )
    mut_aa <- Biostrings::translate(mut_seq, no.init.codon = TRUE)

    # Nonsense: the mutant gains a stop codon where the reference has none,
    # or its first stop lies upstream of the reference's first stop.
    term_mut_aa <- as.integer(regexpr(pattern = "\\*", mut_aa))
    if ((term_aa == -1 && term_mut_aa != -1) ||
        (term_mut_aa != -1 && term_mut_aa < term_aa)) {
      stats["nonsense", "occurrences"] <- 1 + stats["nonsense", "occurrences"]
    } else if (mut_aa == aa_seq) {
      stats["silent", "occurrences"] <- 1 + stats["silent", "occurrences"]
    } else {
      stats["missense", "occurrences"] <- 1 + stats["missense", "occurrences"]
    }
  }

  sealKey()
  return(stats)
}
|
||||
|
||||
# Synthetic test sequences.
gen_mutations(seq = "ATGATGATGATGATGATG", N = 1000)
gen_mutations(seq = "CCCCCCCCCCCCCCCCCC", N = 500)
gen_mutations(seq = "TATTACTATTACTATTAC", N = 500)
gen_mutations(seq = "TGGTGGTGGTGGTGGTGGTGGTGG", N = 500)
gen_mutations(seq = "TGTTGTTGTTGTTGTTGTTGTTGT", N = 500)
gen_mutations(seq = "TGTTGTTGTTGTTGTTGTTGTTGA", N = 500)

# Read the four coding sequences and stack them row by row, in the same
# left-to-right order as chained rbind() calls would.
myFA <- Reduce(
  rbind,
  lapply(
    c("data/RAB39B_HSa_coding.fa",
      "data/PTPN5_HSa_coding.fa",
      "data/PTPN11_HSa_coding.fa",
      "data/KRAS_HSa_coding.fa"),
    readFASTA
  )
)
# Address each record by its gene symbol.
rownames(myFA) <- c("RAB39B", "PTPN5", "PTPN11", "KRAS")

# Run the simulation on column 2 of each gene's record.
gen_mutations(seq = myFA["RAB39B", 2], N = 10000)
gen_mutations(seq = myFA["PTPN5", 2], N = 10000)
gen_mutations(seq = myFA["PTPN11", 2], N = 10000)
gen_mutations(seq = myFA["KRAS", 2], N = 10000)
|
||||
|
@ -1,41 +1,41 @@
|
||||
# == 1.3 Task: submit for credit (part 1/2) ================================
|
||||
# == Submission - Code to add another philosopher to the datamodel:
|
||||
|
||||
# Register the philosopher; the new key is reused for the works rows below.
pID <- autoincrement(philDB$person)
immanuelKant <- data.frame(
  id = pID,
  name = "Immanuel Kant",
  born = "1724",
  died = "1804",
  school = "Enlightenment Philosophy"
)
philDB$person <- rbind(philDB$person, immanuelKant)

# First book, followed by the works row that links it to the author.
bID <- autoincrement(philDB$books)
immanuelKantWork <- data.frame(
  id = bID,
  title = "Critique of Pure Reason",
  published = "1781"
)
philDB$books <- rbind(philDB$books, immanuelKantWork)
philDB$works <- rbind(
  philDB$works,
  data.frame(
    id = autoincrement(philDB$works),
    personID = pID,
    bookID = bID
  )
)

# Second book, linked the same way.
bID <- autoincrement(philDB$books)
immanuelKantWork <- data.frame(
  id = bID,
  title = "Critique of Judgement",
  published = "1790"
)
philDB$books <- rbind(philDB$books, immanuelKantWork)
philDB$works <- rbind(
  philDB$works,
  data.frame(
    id = autoincrement(philDB$works),
    personID = pID,
    bookID = bID
  )
)
|
||||
|
||||
# == Submission: Code to list the philosophical schools in alphabetical order as well as their respective books in alphabetical order.
|
||||
|
||||
# Print every school (alphabetically) followed by the books written by its
# members (alphabetically by title).
schools <- sort(unique(philDB$person$school))

for (s in schools) {
  cat(sprintf("%s\n", s))

  # Use the person IDs of this school's members: works$personID stores IDs,
  # not row numbers of the person table.
  authorIDs <- philDB$person$id[which(philDB$person$school == s)]
  bookIDs <- philDB$works$bookID[philDB$works$personID %in% authorIDs]

  # Collect the matching books and order them by title.
  books <- philDB$books[philDB$books$id %in% bookIDs, , drop = FALSE]
  books <- books[order(as.character(books$title)), , drop = FALSE]

  for (i in seq_len(nrow(books))) {
    cat(sprintf("\t%s - (%s)\n",
                as.character(books$title[i]),
                as.character(books$published[i])))
  }
}
|
||||
# == 1.3 Task: submit for credit (part 1/2) ================================
|
||||
# == Submission - Code to add another philosopher to the datamodel:
|
||||
|
||||
# Register the philosopher; the new key is reused for the works rows below.
pID <- autoincrement(philDB$person)
immanuelKant <- data.frame(
  id = pID,
  name = "Immanuel Kant",
  born = "1724",
  died = "1804",
  school = "Enlightenment Philosophy"
)
philDB$person <- rbind(philDB$person, immanuelKant)

# First book, followed by the works row that links it to the author.
bID <- autoincrement(philDB$books)
immanuelKantWork <- data.frame(
  id = bID,
  title = "Critique of Pure Reason",
  published = "1781"
)
philDB$books <- rbind(philDB$books, immanuelKantWork)
philDB$works <- rbind(
  philDB$works,
  data.frame(
    id = autoincrement(philDB$works),
    personID = pID,
    bookID = bID
  )
)

# Second book, linked the same way.
bID <- autoincrement(philDB$books)
immanuelKantWork <- data.frame(
  id = bID,
  title = "Critique of Judgement",
  published = "1790"
)
philDB$books <- rbind(philDB$books, immanuelKantWork)
philDB$works <- rbind(
  philDB$works,
  data.frame(
    id = autoincrement(philDB$works),
    personID = pID,
    bookID = bID
  )
)
|
||||
|
||||
# == Submission: Code to list the philosophical schools in alphabetical order as well as their respective books in alphabetical order.
|
||||
|
||||
# Print every school (alphabetically) followed by the books written by its
# members (alphabetically by title).
schools <- sort(unique(philDB$person$school))

for (s in schools) {
  cat(sprintf("%s\n", s))

  # Use the person IDs of this school's members: works$personID stores IDs,
  # not row numbers of the person table.
  authorIDs <- philDB$person$id[which(philDB$person$school == s)]
  bookIDs <- philDB$works$bookID[philDB$works$personID %in% authorIDs]

  # Collect the matching books and order them by title.
  books <- philDB$books[philDB$books$id %in% bookIDs, , drop = FALSE]
  books <- books[order(as.character(books$title)), , drop = FALSE]

  for (i in seq_len(nrow(books))) {
    cat(sprintf("\t%s - (%s)\n",
                as.character(books$title[i]),
                as.character(books$published[i])))
  }
}
|
@ -1,4 +1,4 @@
|
||||
[{
|
||||
"ID": 879819,
|
||||
"species": "Cutaneotrichosporon oleaginosum"}
|
||||
]
|
||||
[{
|
||||
"ID": 879819,
|
||||
"species": "Cutaneotrichosporon oleaginosum"}
|
||||
]
|
||||
|
@ -1,19 +1,19 @@
|
||||
[
|
||||
{ "name" : "MBP1_CUTOL",
|
||||
"RefSeqID" : "XP_018278493.1",
|
||||
"UniProtID" : "A0A0J0XLN0",
|
||||
"taxonomyID" : 879819,
|
||||
"sequence" : [
|
||||
"MGKKAAAAGDGGPNTIYKATYSGVPVFEFICRNVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREVQ",
|
||||
"KGEHEKVQGGYGKYQGTWVPIERGLALAKQYNVEDLLRPIIDFVPRESVSPPPAPKHAVAPPTKRNKEPK",
|
||||
"PKEGLVPIKSAGVLSGTGRHQTPDSVGEDVESEVMDDMSESQTPSPLNGTSLLPAVDERSIDGMDIDGFS",
|
||||
"MMNGGGHARKRSAAMMDDEDEYEQLKRARGNSAVHTPPPPGQSPRYGGMQHPLTQDEYNDIVLNYFVSEA",
|
||||
"TQIPAVMTNPPYNWDPNGIIDDDHHTALHWAAAMGRTRVIKLLLSAGARIFDKNNLDQTPLMRSVMFTNN",
|
||||
"YDLRKFPEVFELLHRSTLNIDKNNRTVFHHIANLALYKGKTHAARYYMEVILSRLADYPQELADVINFAD",
|
||||
"EDGETALTLAARARSKRIVKALLDHGADPKLRNRDHKSAEDYILEDERFRSSPDVMLNRTQPSAAPRNPT",
|
||||
"SLGAAVFSQGLPPQLYNSEAARLASGPHSSDILQQMQALARSFEAEKLNKERDVLEAKAMLTSIHTEVND",
|
||||
"AGRTLHNLGEQMKPLEAKQGELDGLVERLQSKLQKDLARGARKWKAADEGRENRWKNGDDPSQAGEDYSD",
|
||||
"LPELTAIPDNAEAEEERLRGEIEKMRARRGELVTRLVKAQTQTGTTDKMAQYRRLITAGCGGDINPGEID",
|
||||
"DIVGQLLDMLENEAQSGRPAPPPQAAPSWVTS"]
|
||||
}
|
||||
]
|
||||
[
|
||||
{ "name" : "MBP1_CUTOL",
|
||||
"RefSeqID" : "XP_018278493.1",
|
||||
"UniProtID" : "A0A0J0XLN0",
|
||||
"taxonomyID" : 879819,
|
||||
"sequence" : [
|
||||
"MGKKAAAAGDGGPNTIYKATYSGVPVFEFICRNVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREVQ",
|
||||
"KGEHEKVQGGYGKYQGTWVPIERGLALAKQYNVEDLLRPIIDFVPRESVSPPPAPKHAVAPPTKRNKEPK",
|
||||
"PKEGLVPIKSAGVLSGTGRHQTPDSVGEDVESEVMDDMSESQTPSPLNGTSLLPAVDERSIDGMDIDGFS",
|
||||
"MMNGGGHARKRSAAMMDDEDEYEQLKRARGNSAVHTPPPPGQSPRYGGMQHPLTQDEYNDIVLNYFVSEA",
|
||||
"TQIPAVMTNPPYNWDPNGIIDDDHHTALHWAAAMGRTRVIKLLLSAGARIFDKNNLDQTPLMRSVMFTNN",
|
||||
"YDLRKFPEVFELLHRSTLNIDKNNRTVFHHIANLALYKGKTHAARYYMEVILSRLADYPQELADVINFAD",
|
||||
"EDGETALTLAARARSKRIVKALLDHGADPKLRNRDHKSAEDYILEDERFRSSPDVMLNRTQPSAAPRNPT",
|
||||
"SLGAAVFSQGLPPQLYNSEAARLASGPHSSDILQQMQALARSFEAEKLNKERDVLEAKAMLTSIHTEVND",
|
||||
"AGRTLHNLGEQMKPLEAKQGELDGLVERLQSKLQKDLARGARKWKAADEGRENRWKNGDDPSQAGEDYSD",
|
||||
"LPELTAIPDNAEAEEERLRGEIEKMRARRGELVTRLVKAQTQTGTTDKMAQYRRLITAGCGGDINPGEID",
|
||||
"DIVGQLLDMLENEAQSGRPAPPPQAAPSWVTS"]
|
||||
}
|
||||
]
|
||||
|
@ -1,8 +1,8 @@
|
||||
README - myScripts folder:
|
||||
==========================
|
||||
|
||||
The "myScripts" folder is a place to keep your personal files
|
||||
safe. No files will be committed to this folder in the GitHub master
|
||||
copy. Therefore everything you put into this folder is safe from being
|
||||
inadvertently overwritten by a file with the same name that would be
|
||||
downloaded in a GitHub "pull" request.
|
||||
README - myScripts folder:
|
||||
==========================
|
||||
|
||||
The "myScripts" folder is a place to keep your personal files
|
||||
safe. No files will be committed to this folder in the GitHub master
|
||||
copy. Therefore everything you put into this folder is safe from being
|
||||
inadvertently overwritten by a file with the same name that would be
|
||||
downloaded in a GitHub "pull" request.
|
||||
|
@ -1,4 +1,4 @@
|
||||
source("./scripts/ABC-createRefDB.R")
|
||||
|
||||
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./myScripts/MBP1_CUTOL.json"))
|
||||
myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./myScripts/CUTOLtaxonomy.json"))
|
||||
# Run the reference-database script first; the statements below expect it
# to have left the database in the variable myDB.
source("./scripts/ABC-createRefDB.R")
|
||||
|
||||
# Extend myDB with the personal data kept in ./myScripts: protein data and
# taxonomy data, each parsed from its JSON file.
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./myScripts/MBP1_CUTOL.json"))
|
||||
myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./myScripts/CUTOLtaxonomy.json"))
|
||||
|
@ -1,38 +1,38 @@
|
||||
# myScript.R
|
||||
#
|
||||
# --- As you work with this file, you can delete the instructions below --------
|
||||
# Write your notes and code experiments into this document. Save it
|
||||
# from time to time - however I recommend that you do not _commit_
|
||||
# your saved version.
|
||||
#
|
||||
# As long as you do not _commit_ this script to version control,
|
||||
# you can _pull_ updated versions of the entire project from GitHub
|
||||
# by using the RStudio version control interface. However, once
|
||||
# you _commit_ any file in your local version, RStudio will require
|
||||
# you to resolve conflicts before you can _pull_ updates.
|
||||
# --- As you work with this file, you can delete the instructions above --------
|
||||
#
|
||||
## Purpose: <...>
|
||||
#
|
||||
# Version: <...>
|
||||
#
|
||||
# Date: <...>
|
||||
# Author: <Name> (<name@mail.utoronto.ca>)
|
||||
#
|
||||
# Versions:
|
||||
#
|
||||
# <number> <Features>
|
||||
#
|
||||
# TODO:
|
||||
# <...>
|
||||
#
|
||||
# ====================================================================
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
|
||||
# myScript.R
|
||||
#
|
||||
# --- As you work with this file, you can delete the instructions below --------
|
||||
# Write your notes and code experiments into this document. Save it
|
||||
# from time to time - however I recommend that you do not _commit_
|
||||
# your saved version.
|
||||
#
|
||||
# As long as you do not _commit_ this script to version control,
|
||||
# you can _pull_ updated versions of the entire project from GitHub
|
||||
# by using the RStudio version control interface. However, once
|
||||
# you _commit_ any file in your local version, RStudio will require
|
||||
# you to resolve conflicts before you can _pull_ updates.
|
||||
# --- As you work with this file, you can delete the instructions above --------
|
||||
#
|
||||
## Purpose: <...>
|
||||
#
|
||||
# Version: <...>
|
||||
#
|
||||
# Date: <...>
|
||||
# Author: <Name> (<name@mail.utoronto.ca>)
|
||||
#
|
||||
# Versions:
|
||||
#
|
||||
# <number> <Features>
|
||||
#
|
||||
# TODO:
|
||||
# <...>
|
||||
#
|
||||
# ====================================================================
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
|
||||
|
2868
plottingReference.R
2868
plottingReference.R
File diff suppressed because it is too large
Load Diff
150
scriptTemplate.R
150
scriptTemplate.R
@ -1,75 +1,75 @@
|
||||
# scriptTemplate.R
|
||||
#
|
||||
# Purpose:
|
||||
# Version:
|
||||
# Date:
|
||||
# Author:
|
||||
#
|
||||
# Input:
|
||||
# Output:
|
||||
# Dependencies:
|
||||
#
|
||||
# ToDo:
|
||||
# Notes:
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
setwd("<your/project/directory>")
|
||||
|
||||
# ==== PARAMETERS ============================================================
|
||||
# Define and explain all parameters. No "magic numbers" in your code below.
|
||||
|
||||
|
||||
|
||||
# ==== PACKAGES ==============================================================
|
||||
# Check that required packages have been installed. Install if needed.
|
||||
|
||||
if (! requireNamespace("seqinr", quietly=TRUE)) {
|
||||
install.packages("seqinr")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = seqinr) # basic information
|
||||
# browseVignettes("seqinr") # available vignettes
|
||||
# data(package = "seqinr") # available datasets
|
||||
|
||||
# Note: use package functions with the :: operator - eg.
|
||||
# seqinr::aaa("K")
|
||||
|
||||
|
||||
|
||||
# ==== FUNCTIONS =============================================================
|
||||
|
||||
# Define functions or source external files
|
||||
source("<myUtilityFunctionsScript.R>")
|
||||
|
||||
myFunction <- function(a, b=1) {
|
||||
# Purpose:
|
||||
# Describe ...
|
||||
# Parameters:
|
||||
# a: ...
|
||||
# b: ...
|
||||
# Value:
|
||||
# result: ...
|
||||
|
||||
# code ...
|
||||
|
||||
return(result)
|
||||
}
|
||||
|
||||
|
||||
|
||||
# ==== PROCESS ===============================================================
|
||||
# Enter the step-by-step process of your project here. Strive to write your
|
||||
# code so that you can simply run this entire file and re-create all
|
||||
# intermediate results.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# ==== TESTS =================================================================
|
||||
# Enter your function tests here...
|
||||
|
||||
|
||||
# [END]
|
||||
# scriptTemplate.R
|
||||
#
|
||||
# Purpose:
|
||||
# Version:
|
||||
# Date:
|
||||
# Author:
|
||||
#
|
||||
# Input:
|
||||
# Output:
|
||||
# Dependencies:
|
||||
#
|
||||
# ToDo:
|
||||
# Notes:
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
setwd("<your/project/directory>")
|
||||
|
||||
# ==== PARAMETERS ============================================================
|
||||
# Define and explain all parameters. No "magic numbers" in your code below.
|
||||
|
||||
|
||||
|
||||
# ==== PACKAGES ==============================================================
|
||||
# Check that required packages have been installed. Install if needed.
|
||||
|
||||
# Install seqinr only if its namespace cannot be loaded. quietly = TRUE
# suppresses the message requireNamespace() would otherwise emit when the
# package is missing.
if (isFALSE(requireNamespace("seqinr", quietly = TRUE))) {
  install.packages("seqinr")
}
|
||||
# Package information:
|
||||
# library(help = seqinr) # basic information
|
||||
# browseVignettes("seqinr") # available vignettes
|
||||
# data(package = "seqinr") # available datasets
|
||||
|
||||
# Note: use package functions with the :: operator - eg.
|
||||
# seqinr::aaa("K")
|
||||
|
||||
|
||||
|
||||
# ==== FUNCTIONS =============================================================
|
||||
|
||||
# Define functions or source external files
|
||||
source("<myUtilityFunctionsScript.R>")
|
||||
|
||||
# Template for a documented function: fill in the header fields below and
# replace the "# code ..." placeholder with the implementation.
myFunction <- function(a, b=1) {
|
||||
# Purpose:
|
||||
# Describe ...
|
||||
# Parameters:
|
||||
# a: ...
|
||||
# b: ...
|
||||
# Value:
|
||||
# result: ...
|
||||
|
||||
# code ...
|
||||
|
||||
# NOTE: the skeleton never assigns `result`; the code that replaces the
# placeholder above has to create it before this line is reached.
return(result)
|
||||
}
|
||||
|
||||
|
||||
|
||||
# ==== PROCESS ===============================================================
|
||||
# Enter the step-by-step process of your project here. Strive to write your
|
||||
# code so that you can simply run this entire file and re-create all
|
||||
# intermediate results.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# ==== TESTS =================================================================
|
||||
# Enter your function tests here...
|
||||
|
||||
|
||||
# [END]
|
||||
|
@ -1,30 +1,30 @@
|
||||
# ABC-createRefDB.R
|
||||
#
|
||||
# Create a reference protein database for Mbp1-like proteins
|
||||
#
|
||||
# Boris Steipe for ABC learning units
|
||||
#
|
||||
# For the species, see:
|
||||
# http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
|
||||
#
|
||||
# For the data model, see
|
||||
# https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0
|
||||
# For the schema, see dbInit() in ./scripts/ABC-dbUtilities.R
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
myDB <- dbInit()
|
||||
|
||||
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/MBP1_SACCE.json"))
|
||||
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refMBP1Proteins.json"))
|
||||
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refAPSES_PSI-BLAST.json"))
|
||||
|
||||
myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./data/refTaxonomy.json"))
|
||||
|
||||
myDB <- dbAddFeature(myDB, jsonlite::fromJSON("./data/refFeatures.json"))
|
||||
|
||||
myDB <- dbAddAnnotation( myDB, jsonlite::fromJSON("./data/refAnnotations.json"))
|
||||
|
||||
|
||||
# [END]
|
||||
# ABC-createRefDB.R
|
||||
#
|
||||
# Create a reference protein database for Mbp1-like proteins
|
||||
#
|
||||
# Boris Steipe for ABC learning units
|
||||
#
|
||||
# For the species, see:
|
||||
# http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
|
||||
#
|
||||
# For the data model, see
|
||||
# https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0
|
||||
# For the schema, see dbInit() in ./scripts/ABC-dbUtilities.R
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
# Build the reference database. dbInit() supplies the initial database
# object; the three protein files are then folded into it one after the
# other, in the order listed.
myDB <- Reduce(
  function(db, jsonFile) dbAddProtein(db, jsonlite::fromJSON(jsonFile)),
  c("./data/MBP1_SACCE.json",
    "./data/refMBP1Proteins.json",
    "./data/refAPSES_PSI-BLAST.json"),
  dbInit()
)

# Taxonomy, feature and annotation data follow the proteins.
myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./data/refTaxonomy.json"))

myDB <- dbAddFeature(myDB, jsonlite::fromJSON("./data/refFeatures.json"))

myDB <- dbAddAnnotation(myDB, jsonlite::fromJSON("./data/refAnnotations.json"))
|
||||
|
||||
|
||||
# [END]
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,443 +1,443 @@
|
||||
# tocID <- "scripts/ABC-makeMYSPElist.R"
|
||||
#
|
||||
# Purpose: Create a list of genome sequenced fungi with protein annotations and
|
||||
# Mbp1 homologues.
|
||||
#
|
||||
# Version: 1.4
|
||||
#
|
||||
# Date: 2016 09 - 2021 09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions
|
||||
# 1.4 New retrieval logic
|
||||
# 1.3 Rewrite to change datasource. NCBI has not been updated
|
||||
# since 2012. Use ensembl fungi as initial source.
|
||||
# 1.2 Change from require() to requireNamespace()
|
||||
# 1.1.2 Moved BLAST.R to ./scripts directory
|
||||
# 1.1 Update 2017
|
||||
# 1.0 First code 2016
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
# ==============================================================================
|
||||
#
|
||||
# DO NOT source() THIS FILE!
|
||||
#
|
||||
# This file is code I provide for your deeper understanding of a process and
|
||||
# to provide you with useful sample code. It is not actually necessary for
|
||||
# you to run this code, but I encourage you to read it carefully and discuss
|
||||
# if there are parts you don't understand.
|
||||
#
|
||||
# Run the commands that interact with the NCBI servers only if you want to
|
||||
# experiment specifically with the code and/or parameters. I have commented out
|
||||
# those parts. If you only want to study the general workflow, just load()
|
||||
# the respective intermediate results.
|
||||
#
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> --------------------------------------------------------
|
||||
#TOC> 1 The strategy 55
|
||||
#TOC> 2 PACKAGES AND INITIALIZATIONS 67
|
||||
#TOC> 3 ENSEMBL FUNGI 75
|
||||
#TOC> 3.1 Import 78
|
||||
#TOC> 4 BLAST SEARCH 155
|
||||
#TOC> 4.1 find homologous proteins 161
|
||||
#TOC> 4.2 Identify species in "hits" 192
|
||||
#TOC> 5 MERGE ENSEMBL AND BLAST RESULTS 282
|
||||
#TOC> 6 STUDENT NUMBERS 375
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 The strategy ========================================================
|
||||
|
||||
# This script will create a list of "MYSPE" species and save it in an R object
|
||||
# MYSPEspecies that is stored in the data subdirectory of this project from
|
||||
# where it can be loaded. The strategy is as follows: we download a list of
|
||||
# annotated fungal genomes from ensembl.fungi. All these are genome-sequenced
|
||||
# species that have been annotated.
|
||||
# Next we perform a BLAST search, to identify fungal species that have
|
||||
# genes that are homologous to yeast MBP1.
|
||||
#
|
||||
# ...
|
||||
|
||||
# = 2 PACKAGES AND INITIALIZATIONS ========================================
|
||||
|
||||
# httr provides interfaces to Webservers on the Internet
|
||||
# Install httr only if its namespace cannot be loaded; quietly = TRUE keeps
# the check silent when the package is missing.
if (isFALSE(requireNamespace("httr", quietly = TRUE))) {
  install.packages("httr")
}
|
||||
|
||||
|
||||
# = 3 ENSEMBL FUNGI =======================================================
|
||||
|
||||
|
||||
# == 3.1 Import ============================================================
|
||||
|
||||
# Navigate to https://fungi.ensembl.org and click on the link to the full
|
||||
# list of all species: https://fungi.ensembl.org/species.html
|
||||
# On the page, click on the spreadsheet symbol top right and choose
|
||||
# "download whole table". The file will be named "Species.csv", in your
|
||||
# usual downloads folder. Move it to the data folder, and read it.
|
||||
|
||||
sDat <- read.csv("./data/Species.csv")
|
||||
str(sDat)
|
||||
|
||||
# The most obvious way to partition these is according to Classification ...
|
||||
# (poking around a bit in the UniProt taxonomy database shows that the
|
||||
# classification used here is the taxonomic rank of "order").
|
||||
# how many classifications do we have?
|
||||
length(unique(sDat$Classification)) # 66
|
||||
|
||||
# To have a good set for the class, we should have about 100.
|
||||
# Let's see for which of these we can find Mbp1 homologues.
|
||||
# First, we'll keep only the columns for name, classification, and taxID, and
|
||||
# drop the rest ...
|
||||
sDat <- sDat[ , c("Name", "Classification", "Taxon.ID")]
|
||||
colnames(sDat) <- c("name", "order", "taxID")
|
||||
|
||||
# Next, we make an extra column: genus - the first part of the binomial name.
|
||||
# We'll use the gsub() function, and for that we need a "regular expression"
|
||||
# that matches to all characters from the first blank to the end of the string:
|
||||
myPatt <- "\\s.*$" # one whitespace (\\s) ...
|
||||
# followed by any character (.) 0..n times (*) ...
|
||||
# until the end of the string
|
||||
|
||||
# using gsub() we substitute all matching characters with the empty string "" -
|
||||
# this deletes the matching characters
|
||||
# Test this:
|
||||
gsub(myPatt, "", "Genus") # one word: unchanged
|
||||
gsub(myPatt, "", "gEnus species") # two words: return only first
|
||||
gsub(myPatt, "", "geNus species strain 123") # many words: return only first
|
||||
|
||||
# apply this to the "name" column and add the result as a separate column
|
||||
# called "genus"
|
||||
sDat$genus <- gsub(myPatt, "", sDat$name)
|
||||
|
||||
# what do we get?
|
||||
c(head(unique(sDat$genus)),
|
||||
tail(unique(sDat$genus))) # inspect the first and last few. Note that there
|
||||
# is a problem that we have to keep in mind.
|
||||
# (Always inspect your results!)
|
||||
# Drop all rows for which the genus contains special characters -
|
||||
# like "[Candida]"
|
||||
sDat <- sDat[ ! grepl("[^a-zA-Z]", sDat$genus) , ]
|
||||
|
||||
length(table(sDat$genus)) # how many genus?
|
||||
hist(table(sDat$genus), col = "#E9F4FF") # Distribution ...
|
||||
# most genus have very few, but
|
||||
# some have very many species.
|
||||
sort(table(sDat$genus), decreasing = TRUE)[1:10] # Top ten...
|
||||
|
||||
# We should have at least one species from each taxonomic order, but we can
|
||||
# add a few genus until we have about 100 validated species.
|
||||
|
||||
# Let's add a column for species, by changing our regular expression a bit,
|
||||
# using ^ (start of string), \\S (NOT a whitespace),
|
||||
# and + (one or more matches), capturing the match (...), and returning
|
||||
# it as the substitution (\\1) ...
|
||||
|
||||
myPatt <- "^(\\S+\\s\\S+)\\s.*$"
|
||||
sDat$species <- gsub(myPatt, "\\1", sDat$name)
|
||||
|
||||
# And we reorder the columns, just for aesthetics:
|
||||
sDat <- sDat[ , c("name", "species", "genus", "order", "taxID")]
|
||||
|
||||
# Final check:
|
||||
any(grepl("[^a-zA-Z -]", sDat$species)) # FALSE means no special characters
|
||||
|
||||
#
|
||||
# Now we check which of these have Mbp1 homologues ...
|
||||
|
||||
# = 4 BLAST SEARCH ========================================================
|
||||
|
||||
|
||||
# We run a BLAST search to find all proteins related to yeast Mbp1 in any
|
||||
# fungus. With the results, we'll annotate our sDat table.
|
||||
|
||||
# == 4.1 find homologous proteins ==========================================
|
||||
#
|
||||
# Use BLAST to fetch proteins related to Mbp1 and identify the species that
|
||||
# contain them.
|
||||
|
||||
# Scripting against NCBI APIs is not exactly enjoyable - there is usually a fair
|
||||
# amount of error handling involved that is not supported by the API in a
|
||||
# principled way but requires rather ad hoc solutions. The code I threw together
|
||||
# to make a BLAST interface (demo-quality, not research-quality) is in the file
|
||||
# ./scripts/BLAST.R Feel encouraged to study how this works. It's a pretty
|
||||
# standard task of communicating with servers and parsing responses - everyday
|
||||
# fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST
|
||||
# parser in currently available packages.
|
||||
#
|
||||
# DON'T use this for BLAST searches unless you have read the NCBI policy
|
||||
# for automated tasks. If you indiscriminately pound on the NCBI's BLAST
|
||||
# server, they will blacklist your IP-address. See:
|
||||
# https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
|
||||
#
|
||||
# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
|
||||
# BLASThits <- BLAST("NP_010227", # Yeast Mbp1 RefSeq ID
|
||||
# db = "refseq_protein", # database to search in
|
||||
# nHits = 3000, # 945 hits in 2020
|
||||
# E = 0.01, #
|
||||
# limits = "txid4751[ORGN]") # = fungi
|
||||
# saveRDS(BLASThits, file="data/BLASThits.rds")
|
||||
#
|
||||
# NO NEED TO ACTUALLY RUN THIS: you can load the results from the data directory
|
||||
#
|
||||
BLASThits <- readRDS(file = "data/BLASThits.rds")
|
||||
|
||||
# == 4.2 Identify species in "hits" ========================================
|
||||
|
||||
# This is a very big list that can't be usefully analyzed manually. Here
|
||||
# we are only interested in the species names that it contains.
|
||||
|
||||
# How many hits in the list?
|
||||
length(BLASThits$hits) # 1,134
|
||||
|
||||
# Let's look at a hit somewhere down the list
|
||||
# Spell out the element name: "$hit" only resolved to "$hits" through
# partial matching, which silently breaks once another element starting
# with "hit" exists.
str(BLASThits$hits[[277]])
|
||||
|
||||
# A fair amount of parsing has gone into the BLAST.R code to prepare the results
|
||||
# in a useful way. The species information is in the $species element of every
|
||||
# hit.
|
||||
|
||||
# Run a loop to extract all the species names into a vector. We subset ...
|
||||
# Blasthits$hits ... the list of hits, from which we choose ...
|
||||
# Blasthits$hits[[i]] ... the i-th hit, and get ...
|
||||
# Blasthits$hits[[i]]$species ... the species element from that.
|
||||
# Subsetting FTW.
|
||||
|
||||
# Collect the $species element of every hit, in hit order. vapply() builds
# the vector in one step instead of growing it element by element, and
# stops with an error if any hit does not carry exactly one species string.
BLASTspecies <- vapply(BLASThits$hits,
                       function(hit) hit$species,
                       character(1),
                       USE.NAMES = FALSE)
|
||||
|
||||
# You can confirm that BLASTspecies has the expected size.
|
||||
length(BLASTspecies)
|
||||
|
||||
# if we delete some of these later on, we still want to remember which hit
|
||||
# they came from. Thus we name() the elements with their index, which is the
|
||||
# same as the index of the hit in BLASThits
|
||||
# seq_along() instead of 1:length(): for an empty vector the latter yields
# c(1, 0) and the assignment fails.
names(BLASTspecies) <- seq_along(BLASTspecies)
|
||||
|
||||
|
||||
# let's plot the distribution of E-values
|
||||
# Collect the $E element (E-value) of every hit, in hit order. vapply()
# avoids growing the vector inside a loop and checks that every hit
# contributes exactly one number.
eVals <- vapply(BLASThits$hits,
                function(hit) hit$E,
                numeric(1),
                USE.NAMES = FALSE)
|
||||
range(eVals)
|
||||
sum(eVals == 0)
|
||||
|
||||
# let's plot the log of all values > 0 to see how they are distributed
|
||||
# plotting only one vector of numbers plots their index as x, and
|
||||
# their value as y ...
|
||||
plot(log(eVals[eVals > 0]), col = "#CC0000")
|
||||
|
||||
# This is very informative: I would suspect that the first ten or so are
|
||||
# virtually identical to the yeast protein, then we have about 800 hits with
|
||||
# decreasing similarity, and then about 200 more that may actually be false
|
||||
# positives. Also - we plotted them by index, that means the table is SORTED:
|
||||
# Lower E-values strictly come before higher E-values.
|
||||
|
||||
# Again, some species appear more than once, e.g. ...
|
||||
sum(BLASTspecies == "Saccharomyces cerevisiae")
|
||||
|
||||
# ... corresponding to the five homologous gene sequences (paralogues) of yeast.
|
||||
|
||||
# Therefore we remove duplicates. Removing duplicates will leave the FIRST
|
||||
# in a list alone, and only remove the SUBSEQUENT ones. Which means, from each
|
||||
# species, we will retain only the protein that has the highest similarity
|
||||
# to yeast Mbp1, not any of its more distant paralogues.
|
||||
sel <- ! duplicated(BLASTspecies)
|
||||
BLASTspecies <- BLASTspecies[sel]
|
||||
|
||||
length(BLASTspecies)
|
||||
# i.e. we got rid of about two thirds of the hits.
|
||||
tail(BLASTspecies) # see how the names are useful!
|
||||
# again - there are some special characters ...
|
||||
# what are they?
|
||||
BLASTspecies[grep("[^a-zA-Z ]", BLASTspecies)]
|
||||
|
||||
# remove the brackets ...
|
||||
BLASTspecies <- gsub("\\[|\\]", "", BLASTspecies)
|
||||
# drop any new duplicates ...
|
||||
BLASTspecies <- BLASTspecies[ ! duplicated(BLASTspecies)]
|
||||
|
||||
# check the number again:
|
||||
length(BLASTspecies)
|
||||
# Think a bit about this: what may be the biological reason to find that
|
||||
# on average, in 388 fungi across the entire phylogenetic tree, we have
|
||||
# three sequences that are homologous to yeast Mbp1?
|
||||
|
||||
# Let's look at the distribution of E-values in this selection (Subsetting FTW):
|
||||
# we plot all values that are TRUE in the vector "sel" that we created above,
|
||||
# AND greater than 0
|
||||
plot(log(eVals[sel & eVals > 0]), col = "#00CC00")
|
||||
|
||||
|
||||
# = 5 MERGE ENSEMBL AND BLAST RESULTS =====================================
|
||||
|
||||
# Next we add the blast result to our sDat dataframe. We'll store the index,
|
||||
# the E-value, and the Query-bounds from which we can estimate which domains
|
||||
# of Mbp1 are actually covered by the hit. (True orthologues MUST align with
|
||||
# Mbp1's N-terminal APSES domain.)
|
||||
#
|
||||
# First we pull the hits we wanted from the BLASTspecies:
|
||||
iHits <- as.numeric(names(BLASTspecies))
|
||||
length(iHits) # one index for each TRUE in sel
|
||||
|
||||
# add columns to sDat
|
||||
l <- nrow(sDat)
|
||||
sDat$iHit <- numeric(l) # index of the hit in the BLAST results
|
||||
sDat$eVal <- numeric(l) # E-value of the hit
|
||||
sDat$lAli <- numeric(l) # length of the aligned region
|
||||
|
||||
# extract and merge
|
||||
# For every retained BLAST hit, find the sDat rows of the same species and
# record the hit's index, E-value and alignment length in those rows.
for (iHit in iHits) {
  thisHit <- BLASThits$hits[[iHit]]
  sel <- sDat$species == thisHit$species   # rows describing this species

  sDat$iHit[sel] <- iHit
  sDat$eVal[sel] <- thisHit$E
  sDat$lAli[sel] <- thisHit$lengthAli
}
|
||||
|
||||
# Are all reference species accounted for?
|
||||
selA <- sDat$iHit != 0 # all rows which matched to a BLAST hit
|
||||
REFspecies %in% sDat$species[selA] # yes, all there
|
||||
|
||||
selB <- sDat$species %in% REFspecies # all rows which have one of REF species
|
||||
|
||||
sum(selA & selB) # How many rows?
|
||||
|
||||
# sDat of course includes all duplicates. Some may be multiply sequenced, some
|
||||
# may be different strains. We'll use the same strategy as before and keep
|
||||
# only the best hit: order the rows by E-value, then drop all rows which
|
||||
# are duplicated.
|
||||
|
||||
|
||||
# drop all rows without BLAST hits ...
|
||||
sDat <- sDat[ ! (sDat$iHit == 0) , ]
|
||||
|
||||
# order sDat by E-value ...
|
||||
sDat <- sDat[order(sDat$eVal, decreasing = FALSE) , ]
|
||||
|
||||
# drop all rows with duplicated species ...
|
||||
sDat <- sDat[ ! duplicated(sDat$species) , ]
|
||||
|
||||
# Let's look at the E-values ...
|
||||
plot(log(sDat$eVal[sDat$eVal > 0]), col = "#00CC00")
|
||||
|
||||
# and alignment lengths ...
|
||||
plot(sDat$lAli, col = "#00DDAA")
|
||||
|
||||
# How many ...
|
||||
length(unique(sDat$name))
|
||||
length(unique(sDat$species))
|
||||
length(unique(sDat$genus))
|
||||
length(unique(sDat$order))
|
||||
|
||||
# I need an extra species for admin purposes later on ...
|
||||
# Split off the "Sporothrix schenckii" rows. A logical mask is used rather
# than grep() indices: with no match, sDat[-integer(0), ] would select zero
# rows and silently empty the whole table.
sel <- grepl("Sporothrix schenckii", sDat$species)
SPOSCdat <- sDat[sel, ]
sDat <- sDat[! sel, ]
|
||||
|
||||
# To get the final dataset, we remove the reference species with their
|
||||
# entire orders ...
|
||||
REForders <- unique(sDat$order[sDat$species %in% REFspecies])
|
||||
sel <- sDat$order %in% REForders
|
||||
REFdat <- sDat[sel , ]
|
||||
sDat <- sDat[ ! sel , ]
|
||||
|
||||
# REFdat should now contain only the REFspecies ...
|
||||
( REFdat <- REFdat[REFdat$species %in% REFspecies , ] )
|
||||
|
||||
# ... but all of them
|
||||
sum(REFspecies %in% REFdat$species)
|
||||
|
||||
# ... and we have enough left in sDat to prune sDat to unique genus
|
||||
sDat <- sDat[ ! duplicated(sDat$genus) , ]
|
||||
nrow(sDat) # 84
|
||||
|
||||
# I add back "Sporothrix schenckii" ...
|
||||
sDat <- rbind(SPOSCdat, sDat)
|
||||
|
||||
# ... and save for future use.
|
||||
# saveRDS(sDat, file = "data/sDat.rds")
|
||||
# saveRDS(REFdat, file = "data/REFdat.rds")
|
||||
|
||||
|
||||
|
||||
# = 6 STUDENT NUMBERS =====================================================
|
||||
#
|
||||
# An asymmetric function to retrieve a MYSPE species
|
||||
#
|
||||
sDat <- readRDS(file = "data/sDat.rds")
|
||||
|
||||
students <- read.csv("../BCH441-2021-students.csv")
|
||||
sN <- students$Integration.ID
|
||||
sN <- sN[! is.na(sN)]
|
||||
sN <- as.character(sN)
|
||||
sN <- c("1003141593", sN) # will map to "Sporothrix schenckii"
|
||||
|
||||
# Shuffle the species table reproducibly (fixed seed), confirm that the
# shuffle kept every row, then move "Sporothrix schenckii" to the top.
set.seed(112358)
theseSpecies <- sDat[sample(seq_len(nrow(sDat))), ]
all(sort(theseSpecies$name) == sort(sDat$name))
nrow(theseSpecies)
(iX <- grep("Sporothrix schenckii", theseSpecies$name))

# Reorder through a logical mask: with no match, theseSpecies[-iX, ] would
# return zero rows and the table would be lost.
isX <- seq_len(nrow(theseSpecies)) %in% iX
theseSpecies <- rbind(theseSpecies[isX, ], theseSpecies[! isX, ])
|
||||
rndMin <- 992000000
|
||||
rndMax <- 1020000000
|
||||
N <- 10000
|
||||
keys <- as.character(sample(rndMin:rndMax, N + 1000))
|
||||
keys <- keys[! (keys %in% sN)]
|
||||
keys <- keys[1:N]
|
||||
keys[1:length(sN)] <- sN
|
||||
|
||||
# Build a table of exactly N rows by cycling through the rows of
# theseSpecies as often as needed (row 1, 2, ..., n, 1, 2, ...). This
# replaces repeated rbind() calls followed by truncation to N rows.
stopifnot(nrow(theseSpecies) > 0)
rowCycle <- rep(seq_len(nrow(theseSpecies)), length.out = N)
MYSPEdat <- theseSpecies[rowCycle, ]

# Name row i with the md5 digest of keys[i]. All names are computed first
# and assigned once, instead of rewriting the row names N times.
rownames(MYSPEdat) <- vapply(keys[seq_len(N)],
                             digest::digest,
                             character(1),
                             algo = "md5",
                             USE.NAMES = FALSE)
|
||||
set.seed(NULL)
|
||||
MYSPEdat <- MYSPEdat[sample(1:N), ]
|
||||
|
||||
# saveRDS(MYSPEdat, file = "data/MYSPEdat.rds")
|
||||
|
||||
# === validate
|
||||
# Look up the species for every student number and collect the results in
# x. A lookup that does not return exactly one value aborts the run; the
# offending number is printed and also used as the error message.
x <- character(length(sN))
for (k in seq_along(sN)) {
  n <- sN[k]
  sp <- getMYSPE(n)
  if (length(sp) != 1) {
    stop(print(as.character(n)))
  }
  x[k] <- sp
}
|
||||
|
||||
# === species for late-comers
|
||||
y <- unique(MYSPEdat$species)
|
||||
print(y[!(y %in% x)])
|
||||
|
||||
|
||||
# === validate
|
||||
# Retrieve the species for every student number into a preallocated vector.
# seq_len(l) instead of 1:l so the loop body is skipped when sN is empty
# (1:0 would iterate over 1 and 0).
l <- length(sN)
sp <- character(l)
for (i in seq_len(l)) {
  sp[i] <- getMYSPE(sN[i])
}
|
||||
any(duplicated(sp))
|
||||
length(unique(sp))
|
||||
which(! sDat$species %in% sp) # these can be assigned to late-comers
|
||||
|
||||
# Done.
|
||||
|
||||
# [END]
|
||||
# tocID <- "scripts/ABC-makeMYSPElist.R"
|
||||
#
|
||||
# Purpose: Create a list of genome sequenced fungi with protein annotations and
|
||||
# Mbp1 homologues.
|
||||
#
|
||||
# Version: 1.4
|
||||
#
|
||||
# Date: 2016 09 - 2021 09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions
|
||||
# 1.4 New retrieval logic
|
||||
# 1.3 Rewrite to change datasource. NCBI has not been updated
|
||||
# since 2012. Use ensembl fungi as initial source.
|
||||
# 1.2 Change from require() to requireNamespace()
|
||||
# 1.1.2 Moved BLAST.R to ./scripts directory
|
||||
# 1.1 Update 2017
|
||||
# 1.0 First code 2016
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
# ==============================================================================
|
||||
#
|
||||
# DO NOT source() THIS FILE!
|
||||
#
|
||||
# This file is code I provide for your deeper understanding of a process and
|
||||
# to provide you with useful sample code. It is not actually necessary for
|
||||
# you to run this code, but I encourage you to read it carefully and discuss
|
||||
# if there are parts you don't understand.
|
||||
#
|
||||
# Run the commands that interact with the NCBI servers only if you want to
|
||||
# experiment specifically with the code and/or parameters. I have commented out
|
||||
# those parts. If you only want to study the general workflow, just load()
|
||||
# the respective intermediate results.
|
||||
#
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> --------------------------------------------------------
|
||||
#TOC> 1 The strategy 55
|
||||
#TOC> 2 PACKAGES AND INITIALIZATIONS 67
|
||||
#TOC> 3 ENSEMBL FUNGI 75
|
||||
#TOC> 3.1 Import 78
|
||||
#TOC> 4 BLAST SEARCH 155
|
||||
#TOC> 4.1 find homologous proteins 161
|
||||
#TOC> 4.2 Identify species in "hits" 192
|
||||
#TOC> 5 MERGE ENSEMBL AND BLAST RESULTS 282
|
||||
#TOC> 6 STUDENT NUMBERS 375
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 The strategy ========================================================
|
||||
|
||||
# This script will create a list of "MYSPE" species and save it in an R object
|
||||
# MYSPEspecies that is stored in the data subdirectory of this project from
|
||||
# where it can be loaded. The strategy is as follows: we download a list of
|
||||
# annotated fungal genomes from ensembl.fungi. All these are genome-sequenced
|
||||
# species that have been annotated.
|
||||
# Next we perform a BLAST search, to identify fungal species that have
|
||||
# genes that are homologous to yeast MBP1.
|
||||
#
|
||||
# ...
|
||||
|
||||
# = 2 PACKAGES AND INITIALIZATIONS ========================================
|
||||
|
||||
# httr provides interfaces to Webservers on the Internet
|
||||
if (! requireNamespace("httr", quietly = TRUE)) {
|
||||
install.packages("httr")
|
||||
}
|
||||
|
||||
|
||||
# = 3 ENSEMBL FUNGI =======================================================
|
||||
|
||||
|
||||
# == 3.1 Import ============================================================
|
||||
|
||||
# Navigate to https://fungi.ensembl.org and click on the link to the full
|
||||
# list of all species: https://fungi.ensembl.org/species.html
|
||||
# On the page, click on the spreadsheet symbol top right and choose
|
||||
# "download whole table". The file will be named "Species.csv", in your
|
||||
# usual downloads folder. Move it to the data folder, and read it.
|
||||
|
||||
sDat <- read.csv("./data/Species.csv")
|
||||
str(sDat)
|
||||
|
||||
# The most obvious way to partition these is according to Classification ...
|
||||
# (poking around a bit in the UniProt taxonomy database shows that the
|
||||
# classification used here is the taxonomic rank of "order").
|
||||
# how many classifications do we have?
|
||||
length(unique(sDat$Classification)) # 66
|
||||
|
||||
# To have a good set for the class, we should have about 100.
|
||||
# Let's see for which of these we can find Mbp1 homologues.
|
||||
# First, we'll keep only the columns for name, classification, and taxID, and
|
||||
# drop the rest ...
|
||||
sDat <- sDat[ , c("Name", "Classification", "Taxon.ID")]
|
||||
colnames(sDat) <- c("name", "order", "taxID")
|
||||
|
||||
# Next, we make an extra column: genus - the first part of the binomial name.
|
||||
# We'll use the gsub() function, and for that we need a "regular expression"
|
||||
# that matches to all characters from the first blank to the end of the string:
|
||||
myPatt <- "\\s.*$" # one whitespace (\\s) ...
|
||||
# followed by any character (.) 0..n times (*) ...
|
||||
# until the end of the string
|
||||
|
||||
# using gsub() we substitute all matching characters with the empty string "" -
|
||||
# this deletes the matching characters
|
||||
# Test this:
|
||||
gsub(myPatt, "", "Genus") # one word: unchanged
|
||||
gsub(myPatt, "", "gEnus species") # two words: return only first
|
||||
gsub(myPatt, "", "geNus species strain 123") # many words: return only first
|
||||
|
||||
# apply this to the "name" column and add the result as a separate column
|
||||
# called "genus"
|
||||
sDat$genus <- gsub(myPatt, "", sDat$name)
|
||||
|
||||
# what do we get?
|
||||
c(head(unique(sDat$genus)),
|
||||
tail(unique(sDat$genus))) # inspect the first and last few. Note that there
|
||||
# is a problem that we have to keep in mind.
|
||||
# (Always inspect your results!)
|
||||
# Drop all rows for which the genus contains special characters -
|
||||
# like "[Candida]"
|
||||
sDat <- sDat[ ! grepl("[^a-zA-Z]", sDat$genus) , ]
|
||||
|
||||
length(table(sDat$genus)) # how many genus?
|
||||
hist(table(sDat$genus), col = "#E9F4FF") # Distribution ...
|
||||
# most genus have very few, but
|
||||
# some have very many species.
|
||||
sort(table(sDat$genus), decreasing = TRUE)[1:10] # Top ten...
|
||||
|
||||
# We should have at least one species from each taxonomic order, but we can
|
||||
# add a few genus until we have about 100 validated species.
|
||||
|
||||
# Let's add a column for species, by changing our regular expression a bit,
|
||||
# using ^ (start of string), \\S (NOT a whitespace),
|
||||
# and + (one or more matches), capturing the match (...), and returning
|
||||
# it as the substitution (\\1) ...
|
||||
|
||||
myPatt <- "^(\\S+\\s\\S+)\\s.*$"
|
||||
sDat$species <- gsub(myPatt, "\\1", sDat$name)
|
||||
|
||||
# And we reorder the columns, just for aesthetics:
|
||||
sDat <- sDat[ , c("name", "species", "genus", "order", "taxID")]
|
||||
|
||||
# Final check:
|
||||
any(grepl("[^a-zA-Z -]", sDat$species)) # FALSE means no special characters
|
||||
|
||||
#
|
||||
# Now we check which of these have Mbp1 homologues ...
|
||||
|
||||
# = 4 BLAST SEARCH ========================================================
|
||||
|
||||
|
||||
# We run a BLAST search to find all proteins related to yeast Mbp1 in any
|
||||
# fungus. With the results, we'll annotate our sDat table.
|
||||
|
||||
# == 4.1 find homologous proteins ==========================================
|
||||
#
|
||||
# Use BLAST to fetch proteins related to Mbp1 and identify the species that
|
||||
# contain them.
|
||||
|
||||
# Scripting against NCBI APIs is not exactly enjoyable - there is usually a fair
|
||||
# amount of error handling involved that is not supported by the API in a
|
||||
# principled way but requires rather ad hoc solutions. The code I threw together
|
||||
# to make a BLAST interface (demo-quality, not research-quality) is in the file
|
||||
# ./scripts/BLAST.R Feel encouraged to study how this works. It's a pretty
|
||||
# standard task of communicating with servers and parsing responses - everyday
|
||||
# fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST
|
||||
# parser in currently available packages.
|
||||
#
|
||||
# DON'T use this for BLAST searches unless you have read the NCBI policy
|
||||
# for automated tasks. If you indiscriminately pound on the NCBI's BLAST
|
||||
# server, they will blacklist your IP-address. See:
|
||||
# https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
|
||||
#
|
||||
# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
|
||||
# BLASThits <- BLAST("NP_010227", # Yeast Mbp1 RefSeq ID
|
||||
# db = "refseq_protein", # database to search in
|
||||
# nHits = 3000, # 945 hits in 2020
|
||||
# E = 0.01, #
|
||||
# limits = "txid4751[ORGN]") # = fungi
|
||||
# saveRDS(BLASThits, file="data/BLASThits.rds")
|
||||
#
|
||||
# NO NEED TO ACTUALLY RUN THIS: you can load the results from the data directory
|
||||
#
|
||||
BLASThits <- readRDS(file = "data/BLASThits.rds")
|
||||
|
||||
# == 4.2 Identify species in "hits" ========================================
|
||||
|
||||
# This is a very big list that can't be usefully analyzed manually. Here
|
||||
# we are only interested in the species names that it contains.
|
||||
|
||||
# How many hits in the list?
|
||||
length(BLASThits$hits) # 1,134
|
||||
|
||||
# Let's look at a hit somewhere down the list
|
||||
# Spell out the element name "hits": `$hit` only resolves to it through
# partial matching of list names.
str(BLASThits$hits[[277]])
|
||||
|
||||
# A fair amount of parsing has gone into the BLAST.R code to prepare the results
|
||||
# in a useful way. The species information is in the $species element of every
|
||||
# hit.
|
||||
|
||||
# Run a loop to extract all the species names into a vector. We subset ...
|
||||
# BLASThits$hits ... the list of hits, from which we choose ...
|
||||
# BLASThits$hits[[i]] ... the i-th hit, and get ...
|
||||
# BLASThits$hits[[i]]$species ... the species element from that.
|
||||
# Subsetting FTW.
|
||||
|
||||
# Collect the species name of every hit. The result vector is created at its
# final length (one slot per hit) and then filled by index.
BLASTspecies <- character(length(BLASThits$hits))
for (i in seq_along(BLASThits$hits)) {
  BLASTspecies[i] <- BLASThits$hits[[i]]$species
}
|
||||
|
||||
# You can confirm that BLASTspecies has the expected size.
|
||||
length(BLASTspecies)
|
||||
|
||||
# if we delete some of these later on, we still want to remember which hit
|
||||
# they came from. Thus we name() the elements with their index, which is the
|
||||
# same as the index of the hit in BLASThits
|
||||
# Label every species with the index of the hit it came from. seq_along()
# replaces 1:length(), which would yield c(1, 0) for an empty vector.
names(BLASTspecies) <- seq_along(BLASTspecies)
|
||||
|
||||
|
||||
# let's plot the distribution of E-values
|
||||
# Collect the E-value of every hit, in hit order, into a vector that is
# created at its final length and filled by index.
eVals <- numeric(length(BLASThits$hits))
for (i in seq_along(BLASThits$hits)) {
  eVals[i] <- BLASThits$hits[[i]]$E
}
|
||||
range(eVals)
|
||||
sum(eVals == 0)
|
||||
|
||||
# let's plot the log of all values > 0 to see how they are distributed
|
||||
# plotting only one vector of numbers plots their index as x, and
|
||||
# their value as y ...
|
||||
plot(log(eVals[eVals > 0]), col = "#CC0000")
|
||||
|
||||
# This is very informative: I would suspect that the first ten or so are
|
||||
# virtually identical to the yeast protein, then we have about 800 hits with
|
||||
# decreasing similarity, and then about 200 more that may actually be false
|
||||
# positives. Also - we plotted them by index, that means the table is SORTED:
|
||||
# Lower E-values strictly come before higher E-values.
|
||||
|
||||
# Again, some species appear more than once, e.g. ...
|
||||
sum(BLASTspecies == "Saccharomyces cerevisiae")
|
||||
|
||||
# ... corresponding to the five homologous gene sequences (paralogues) of yeast.
|
||||
|
||||
# Therefore we remove duplicates. Removing duplicates will leave the FIRST
|
||||
# in a list alone, and only remove the SUBSEQUENT ones. Which means, from each
|
||||
# species, we will retain only the protein that has the highest similarity
|
||||
# to yeast Mbp1, not any of its more distant paralogues.
|
||||
sel <- ! duplicated(BLASTspecies)
|
||||
BLASTspecies <- BLASTspecies[sel]
|
||||
|
||||
length(BLASTspecies)
|
||||
# i.e. we got rid of about two thirds of the hits.
|
||||
tail(BLASTspecies) # see how the names are useful!
|
||||
# again - there are some special characters ...
|
||||
# what are they?
|
||||
BLASTspecies[grep("[^a-zA-Z ]", BLASTspecies)]
|
||||
|
||||
# remove the brackets ...
|
||||
BLASTspecies <- gsub("\\[|\\]", "", BLASTspecies)
|
||||
# drop any new duplicates ...
|
||||
BLASTspecies <- BLASTspecies[ ! duplicated(BLASTspecies)]
|
||||
|
||||
# check the number again:
|
||||
length(BLASTspecies)
|
||||
# Think a bit about this: what may be the biological reason to find that
|
||||
# on average, in 388 fungi across the entire phylogenetic tree, we have
|
||||
# three sequences that are homologous to yeast Mbp1?
|
||||
|
||||
# Let's look at the distribution of E-values in this selection (Subsetting FTW):
|
||||
# we plot all values that are TRUE in the vector "sel" that we created above,
|
||||
# AND greater than 0
|
||||
plot(log(eVals[sel & eVals > 0]), col = "#00CC00")
|
||||
|
||||
|
||||
# = 5 MERGE ENSEMBL AND BLAST RESULTS =====================================
|
||||
|
||||
# Next we add the blast result to our sDat dataframe. We'll store the index,
|
||||
# the E-value, and the Query-bounds from which we can estimate which domains
|
||||
# of Mbp1 are actually covered by the hit. (True orthologues MUST align with
|
||||
# Mbp1's N-terminal APSES domain.)
|
||||
#
|
||||
# First we pull the hits we wanted from the BLASTspecies:
|
||||
iHits <- as.numeric(names(BLASTspecies))
|
||||
length(iHits) # one index for each TRUE in sel
|
||||
|
||||
# add columns to sDat
|
||||
l <- nrow(sDat)
|
||||
sDat$iHit <- numeric(l) # index of the hit in the BLAST results
|
||||
sDat$eVal <- numeric(l) # E-value of the hit
|
||||
sDat$lAli <- numeric(l) # length of the aligned region
|
||||
|
||||
# extract and merge
|
||||
# Copy the BLAST results into sDat: for each retained hit, every row of sDat
# whose species equals the species of that hit receives the hit's index, its
# E-value, and the length of its alignment.
for (iHit in iHits) {
  thisHit <- BLASThits$hits[[iHit]]   # fetch the hit once per iteration
  thisSp <- thisHit$species
  sel <- sDat$species == thisSp

  sDat$iHit[sel] <- iHit
  sDat$eVal[sel] <- thisHit$E
  sDat$lAli[sel] <- thisHit$lengthAli
}
|
||||
|
||||
# Are all reference species accounted for?
|
||||
selA <- sDat$iHit != 0 # all rows which matched to a BLAST hit
|
||||
REFspecies %in% sDat$species[selA] # yes, all there
|
||||
|
||||
selB <- sDat$species %in% REFspecies # all rows which have one of REF species
|
||||
|
||||
sum(selA & selB) # How many rows?
|
||||
|
||||
# sDat of course includes all duplicates. Some may be multiply sequenced, some
|
||||
# may be different strains. We'll use the same strategy as before and keep
|
||||
# only the best hit: order the rows by E-value, then drop all rows which
|
||||
# are duplicated.
|
||||
|
||||
|
||||
# drop all rows without BLAST hits ...
|
||||
sDat <- sDat[ ! (sDat$iHit == 0) , ]
|
||||
|
||||
# order sDat by E-value ...
|
||||
sDat <- sDat[order(sDat$eVal, decreasing = FALSE) , ]
|
||||
|
||||
# drop all rows with duplicated species ...
|
||||
sDat <- sDat[ ! duplicated(sDat$species) , ]
|
||||
|
||||
# Let's look at the E-values ...
|
||||
plot(log(sDat$eVal[sDat$eVal > 0]), col = "#00CC00")
|
||||
|
||||
# and alignment lengths ...
|
||||
plot(sDat$lAli, col = "#00DDAA")
|
||||
|
||||
# How many ...
|
||||
length(unique(sDat$name))
|
||||
length(unique(sDat$species))
|
||||
length(unique(sDat$genus))
|
||||
length(unique(sDat$order))
|
||||
|
||||
# I need an extra species for admin purposes later on ...
|
||||
# Set the "Sporothrix schenckii" rows aside in SPOSCdat and remove them from
# sDat. A logical mask is used instead of grep() indices: if nothing matches,
# grep() returns integer(0), and sDat[-integer(0), ] would drop EVERY row
# instead of none.
sel <- grepl("Sporothrix schenckii", sDat$species)
SPOSCdat <- sDat[sel, ]
sDat <- sDat[ ! sel, ]
|
||||
|
||||
# To get the final dataset, we remove the reference species with their
|
||||
# entire orders ...
|
||||
REForders <- unique(sDat$order[sDat$species %in% REFspecies])
|
||||
sel <- sDat$order %in% REForders
|
||||
REFdat <- sDat[sel , ]
|
||||
sDat <- sDat[ ! sel , ]
|
||||
|
||||
# REFdat should now contain only the REFspecies ...
|
||||
( REFdat <- REFdat[REFdat$species %in% REFspecies , ] )
|
||||
|
||||
# ... but all of them
|
||||
sum(REFspecies %in% REFdat$species)
|
||||
|
||||
# ... and we have enough left in sDat to prune sDat to unique genus
|
||||
sDat <- sDat[ ! duplicated(sDat$genus) , ]
|
||||
nrow(sDat) # 84
|
||||
|
||||
# I add back "Sporothrix schenckii" ...
|
||||
sDat <- rbind(SPOSCdat, sDat)
|
||||
|
||||
# ... and save for future use.
|
||||
# saveRDS(sDat, file = "data/sDat.rds")
|
||||
# saveRDS(REFdat, file = "data/REFdat.rds")
|
||||
|
||||
|
||||
|
||||
# = 6 STUDENT NUMBERS =====================================================
|
||||
#
|
||||
# An asymmetric function to retrieve a MYSPE species
|
||||
#
|
||||
sDat <- readRDS(file = "data/sDat.rds")
|
||||
|
||||
students <- read.csv("../BCH441-2021-students.csv")
|
||||
sN <- students$Integration.ID
|
||||
sN <- sN[! is.na(sN)]
|
||||
sN <- as.character(sN)
|
||||
sN <- c("1003141593", sN) # will map to "Sporothrix schenckii"
|
||||
|
||||
set.seed(112358)
|
||||
theseSpecies <- sDat[sample(1:nrow(sDat)), ]
|
||||
all(sort(theseSpecies$name) == sort(sDat$name))
|
||||
nrow((theseSpecies))
|
||||
# Move the "Sporothrix schenckii" row(s) to the top of the table (the index is
# printed for inspection). The remaining rows are selected with setdiff()
# rather than with a negative index: if grep() finds nothing,
# theseSpecies[-integer(0), ] would return zero rows and the table would be
# lost.
(iX <- grep("Sporothrix schenckii", theseSpecies$name))
iRest <- setdiff(seq_len(nrow(theseSpecies)), iX)
theseSpecies <- rbind(theseSpecies[iX, ], theseSpecies[iRest, ])
|
||||
rndMin <- 992000000
|
||||
rndMax <- 1020000000
|
||||
N <- 10000
|
||||
keys <- as.character(sample(rndMin:rndMax, N + 1000))
|
||||
keys <- keys[! (keys %in% sN)]
|
||||
keys <- keys[1:N]
|
||||
# The first length(sN) keys are the actual student numbers; seq_along() is
# the safe form of 1:length().
keys[seq_along(sN)] <- sN
|
||||
|
||||
# Build a table of exactly N rows by cycling through theseSpecies as often as
# needed. Indexing with a recycled row index selects the same rows in the same
# order as repeatedly rbind()-ing the whole table and truncating to N rows,
# without copying the growing table on every pass. (The automatic row names
# differ, but they are replaced by digests right below.)
MYSPEdat <- theseSpecies[rep(seq_len(nrow(theseSpecies)), length.out = N), ]
|
||||
# Use the MD5 digest of each key as the row name of the corresponding row.
# All N digests are computed first and assigned in one step: replacing row
# names one element at a time rewrites the whole row-name vector on every
# iteration.
if (! requireNamespace("digest", quietly = TRUE)) {
  install.packages("digest")
}
rownames(MYSPEdat) <- vapply(keys[seq_len(N)],
                             function(k) digest::digest(k, algo = "md5"),
                             character(1),
                             USE.NAMES = FALSE)
|
||||
set.seed(NULL)
|
||||
MYSPEdat <- MYSPEdat[sample(1:N), ]
|
||||
|
||||
# saveRDS(MYSPEdat, file = "data/MYSPEdat.rds")
|
||||
|
||||
# === validate
|
||||
# Look up the species for every student number and collect the results in x.
# If a lookup does not return exactly one value, print the offending number
# and abort.
x <- character()
for (n in sN) {
  sp <- getMYSPE(n)
  if (length(sp) == 1) {
    x <- c(x, sp)
  } else {
    stop(print(as.character(n)))
  }
}
|
||||
|
||||
# === species for late-comers
|
||||
y <- unique(MYSPEdat$species)
|
||||
print(y[!(y %in% x)])
|
||||
|
||||
|
||||
# === validate
|
||||
# Look up the species assigned to each student number. seq_len() is used
# instead of 1:l so that the loop body is skipped when sN is empty (1:0 would
# count 1, 0 and index past the end of sN).
l <- length(sN)
sp <- character(l)
for (i in seq_len(l)) {
  sp[i] <- getMYSPE(sN[i])
}
|
||||
any(duplicated(sp))
|
||||
length(unique(sp))
|
||||
which(! sDat$species %in% sp) # these can be assigned to late-comers
|
||||
|
||||
# Done.
|
||||
|
||||
# [END]
|
||||
|
@ -1,168 +1,168 @@
|
||||
# tocID <- "scripts/ABC-makeSTRINGedges.R"
|
||||
#
|
||||
# Create a subnetwork of high-confidence human STRING edges.
|
||||
#
|
||||
# Notes:
|
||||
#
|
||||
# The large source- datafile is NOT posted to github. If you want to
|
||||
# experiment with the original data, download it and place it into your
|
||||
# local ./data directory.
|
||||
#
|
||||
# STRING data source:
|
||||
# Download page:
|
||||
# https://string-db.org/cgi/download.pl?species_text=Homo+sapiens
|
||||
# Data: (127.6 Mb)
|
||||
# https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz
|
||||
#
|
||||
# Version: 1.0
|
||||
#
|
||||
# Date: 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.0 Rewrite
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -------------------------------------------------
|
||||
#TOC> 1 Initialize 44
|
||||
#TOC> 2 Read STRING Data 51
|
||||
#TOC> 3 Define cutoff and subset 63
|
||||
#TOC> 4 Drop duplicates 103
|
||||
#TOC> 5 Simple statistics 127
|
||||
#TOC> 6 Write to file 160
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Initialize ==========================================================
|
||||
|
||||
if (! requireNamespace("readr", quietly = TRUE)) {
|
||||
install.packages("readr")
|
||||
}
|
||||
|
||||
|
||||
# = 2 Read STRING Data ====================================================
|
||||
|
||||
# Read STRING Data (needs to be downloaded from database, see URL in Notes)
|
||||
# The .gz compressed version is 127.6MB, the uncompressed version is probably
|
||||
# 848 Mb. Fortunately readr:: can read from compressed
|
||||
# files, and does so automatically, based on the file extension.
|
||||
( fn <- file.path("~", "9606.protein.links.full.v11.0.txt.gz") )
|
||||
STR <- readr::read_delim(fn, delim = " ")
|
||||
nrow(STR) # 11,759,454 rows
|
||||
head(STR)
|
||||
|
||||
|
||||
# = 3 Define cutoff and subset ============================================
|
||||
|
||||
# approximate distribution of combined_score
|
||||
hist(sample(STR$combined_score, 10000), breaks = 50, col = "#6699FF")
|
||||
|
||||
# Let's table the counts >= 850 and plot them for better resolution.
|
||||
|
||||
myTb <- table(STR$combined_score[STR$combined_score >= 850])
|
||||
is.unsorted(as.integer(names(myTb))) # Good - they are all in order
|
||||
|
||||
plot(myTb, type = "b", cex = 0.5, col = "#BB0000")
|
||||
myTb[myTb == max(myTb)] # Apparently there is an algorithmic effect that
|
||||
# frequently assigns a combined score of 0.900
|
||||
|
||||
# Let's plot these counts as cumulative sums, in reverse order, scaled
|
||||
# as combined scores.
|
||||
# x-values: the tabulated scores themselves, highest first, rescaled by 1000.
# Taking them from names(myTb) keeps every x paired with its own count even
# if some score in the tabulated range does not occur in the data.
myX <- rev(as.integer(names(myTb))) / 1000 # x-values, decreasing
|
||||
plot(myX,
|
||||
cumsum(myTb[length(myTb):1]), # cumulative sum, decreasing
|
||||
xlim = c(1.0, 0.85), # reverse x-axis
|
||||
type = "l",
|
||||
main = "STRING interactions for 9606 (top 600,000)",
|
||||
xlab = "combined_score",
|
||||
ylab = "cumulative counts",
|
||||
col = "#CC0000")
|
||||
abline(h = seq(50000, sum(myTb), by = 50000), lwd = 0.5, col = "#DDDDFF")
|
||||
|
||||
# What's the cutoff for 100,000 edges?
|
||||
which(cumsum(myTb[length(myTb):1]) >= 100000)[1] # p = 0.964
|
||||
|
||||
# confirm
|
||||
sum(STR$combined_score >= 964) # 101,348
|
||||
abline(v = 0.964, lwd = 0.5, col = "#DDDDFF")
|
||||
|
||||
# subset the table, and use only the protein IDs and the combined_score
|
||||
STR <- STR[STR$combined_score >= 964,
|
||||
c("protein1", "protein2", "combined_score")]
|
||||
colnames(STR) <- c("a", "b", "score")
|
||||
|
||||
|
||||
# = 4 Drop duplicates ====================================================
|
||||
|
||||
# identify duplicate interactions by creating keys in a defined alphabetical
|
||||
# sort order, then checking for duplicated().
|
||||
# e.g. if we have (X:U, U:X), we change U:X to X:U and now find that
|
||||
# (X:U, X:U) has a duplicate.
|
||||
|
||||
AB <- STR$a < STR$b # logical vector: genes we need to swap
|
||||
tmp <- STR$b # copy column b
|
||||
STR$b[AB] <- STR$a[AB] # copy a's into b
|
||||
STR$a[AB] <- tmp[AB] # copy tmp's into a
|
||||
all(STR$a >= STR$b) # confirm: TRUE
|
||||
|
||||
# now, make combined keys, like this:
|
||||
paste0(STR$a[1:10], ":", STR$b[1:10])
|
||||
|
||||
tmp <- paste0(STR$a, ":", STR$b)
|
||||
sum(duplicated(tmp)) # That's half of them ... i.e. STRING reports
|
||||
# both a:b and b:a !
|
||||
|
||||
# drop all duplicated interactions from tmp
|
||||
STR <- STR[ ! duplicated(tmp), ] # 50,674 interactions remain
|
||||
|
||||
|
||||
# = 5 Simple statistics ===================================================
|
||||
|
||||
# how many unique genes?
|
||||
length(unique(c(STR$a, STR$b))) # 8,445
|
||||
|
||||
# how many self-edges?
|
||||
sum(STR$a == STR$b) # none
|
||||
|
||||
# log(rank) / log(frequency)
|
||||
myTbl <- table(c(STR$a, STR$b))
|
||||
myTbl <- myTbl[order(myTbl, decreasing = TRUE)]
|
||||
|
||||
hist(myTbl, breaks = 40, col = "#FFEEBB")
|
||||
|
||||
# number of singletons
|
||||
sum(myTbl == 1) # almost a quarter
|
||||
|
||||
# maximum?
|
||||
myTbl[which(myTbl == max(myTbl))] # 9606.ENSP00000360532: 465
|
||||
# Google: CDC5L
|
||||
|
||||
# Zipf-plot
|
||||
plot(log(1:length(myTbl)), log(as.numeric(myTbl)),
|
||||
type = "b", cex = 0.7,
|
||||
main = "STRINGedges - degrees",
|
||||
xlab = "log(rank)",
|
||||
ylab = "log(frequency)",
|
||||
col = "#FFBB88")
|
||||
|
||||
# Mean number of interactions per protein: every row of STR is one
# interaction that involves two proteins, so the average is
# 2 * edges / nodes (edges / nodes alone is only half of that).
sprintf("Average number of interactions: %5.2f",
        2 * nrow(STR) / length(unique(c(STR$a, STR$b))))
|
||||
|
||||
|
||||
# = 6 Write to file =======================================================
|
||||
|
||||
saveRDS(STR, file = "./data/STRINGedges.rds")
|
||||
|
||||
# STRINGedges <- readRDS("./data/STRINGedges.rds") # use this to restore the
|
||||
# object when needed
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "scripts/ABC-makeSTRINGedges.R"
|
||||
#
|
||||
# Create a subnetwork of high-confidence human STRING edges.
|
||||
#
|
||||
# Notes:
|
||||
#
|
||||
# The large source- datafile is NOT posted to github. If you want to
|
||||
# experiment with the original data, download it and place it into your
|
||||
# local ./data directory.
|
||||
#
|
||||
# STRING data source:
|
||||
# Download page:
|
||||
# https://string-db.org/cgi/download.pl?species_text=Homo+sapiens
|
||||
# Data: (127.6 Mb)
|
||||
# https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz
|
||||
#
|
||||
# Version: 1.0
|
||||
#
|
||||
# Date: 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.0 Rewrite
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -------------------------------------------------
|
||||
#TOC> 1 Initialize 44
|
||||
#TOC> 2 Read STRING Data 51
|
||||
#TOC> 3 Define cutoff and subset 63
|
||||
#TOC> 4 Drop duplicates 103
|
||||
#TOC> 5 Simple statistics 127
|
||||
#TOC> 6 Write to file 160
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Initialize ==========================================================
|
||||
|
||||
if (! requireNamespace("readr", quietly = TRUE)) {
|
||||
install.packages("readr")
|
||||
}
|
||||
|
||||
|
||||
# = 2 Read STRING Data ====================================================
|
||||
|
||||
# Read STRING Data (needs to be downloaded from database, see URL in Notes)
|
||||
# The .gz compressed version is 127.6MB, the uncompressed version is probably
|
||||
# 848 Mb. Fortunately readr:: can read from compressed
|
||||
# files, and does so automatically, based on the file extension.
|
||||
( fn <- file.path("~", "9606.protein.links.full.v11.0.txt.gz") )
|
||||
STR <- readr::read_delim(fn, delim = " ")
|
||||
nrow(STR) # 11,759,454 rows
|
||||
head(STR)
|
||||
|
||||
|
||||
# = 3 Define cutoff and subset ============================================
|
||||
|
||||
# approximate distribution of combined_score
|
||||
hist(sample(STR$combined_score, 10000), breaks = 50, col = "#6699FF")
|
||||
|
||||
# Let's table the counts >= 850 and plot them for better resolution.
|
||||
|
||||
myTb <- table(STR$combined_score[STR$combined_score >= 850])
|
||||
is.unsorted(as.integer(names(myTb))) # Good - they are all in order
|
||||
|
||||
plot(myTb, type = "b", cex = 0.5, col = "#BB0000")
|
||||
myTb[myTb == max(myTb)] # Apparently there is an algorithmic effect that
|
||||
# frequently assigns a combined score of 0.900
|
||||
|
||||
# Let's plot these counts as cumulative sums, in reverse order, scaled
|
||||
# as combined scores.
|
||||
# x-values: the tabulated scores themselves, highest first, rescaled by 1000.
# Taking them from names(myTb) keeps every x paired with its own count even
# if some score in the tabulated range does not occur in the data.
myX <- rev(as.integer(names(myTb))) / 1000 # x-values, decreasing
|
||||
plot(myX,
|
||||
cumsum(myTb[length(myTb):1]), # cumulative sum, decreasing
|
||||
xlim = c(1.0, 0.85), # reverse x-axis
|
||||
type = "l",
|
||||
main = "STRING interactions for 9606 (top 600,000)",
|
||||
xlab = "combined_score",
|
||||
ylab = "cumulative counts",
|
||||
col = "#CC0000")
|
||||
abline(h = seq(50000, sum(myTb), by = 50000), lwd = 0.5, col = "#DDDDFF")
|
||||
|
||||
# What's the cutoff for 100,000 edges?
|
||||
which(cumsum(myTb[length(myTb):1]) >= 100000)[1] # p = 0.964
|
||||
|
||||
# confirm
|
||||
sum(STR$combined_score >= 964) # 101,348
|
||||
abline(v = 0.964, lwd = 0.5, col = "#DDDDFF")
|
||||
|
||||
# subset the table, and use only the protein IDs and the combined_score
|
||||
STR <- STR[STR$combined_score >= 964,
|
||||
c("protein1", "protein2", "combined_score")]
|
||||
colnames(STR) <- c("a", "b", "score")
|
||||
|
||||
|
||||
# = 4 Drop duplicates ====================================================
|
||||
|
||||
# identify duplicate interactions by creating keys in a defined alphabetical
|
||||
# sort order, then checking for duplicated().
|
||||
# e.g. if we have (X:U, U:X), we change U:X to X:U and now find that
|
||||
# (X:U, X:U) has a duplicate.
|
||||
|
||||
AB <- STR$a < STR$b # logical vector: genes we need to swap
|
||||
tmp <- STR$b # copy column b
|
||||
STR$b[AB] <- STR$a[AB] # copy a's into b
|
||||
STR$a[AB] <- tmp[AB] # copy tmp's into a
|
||||
all(STR$a >= STR$b) # confirm: TRUE
|
||||
|
||||
# now, make combined keys, like this:
|
||||
paste0(STR$a[1:10], ":", STR$b[1:10])
|
||||
|
||||
tmp <- paste0(STR$a, ":", STR$b)
|
||||
sum(duplicated(tmp)) # That's half of them ... i.e. STRING reports
|
||||
# both a:b and b:a !
|
||||
|
||||
# drop all duplicated interactions from tmp
|
||||
STR <- STR[ ! duplicated(tmp), ] # 50,674 interactions remain
|
||||
|
||||
|
||||
# = 5 Simple statistics ===================================================
|
||||
|
||||
# how many unique genes?
|
||||
length(unique(c(STR$a, STR$b))) # 8,445
|
||||
|
||||
# how many self-edges?
|
||||
sum(STR$a == STR$b) # none
|
||||
|
||||
# log(rank) / log(frequency)
|
||||
myTbl <- table(c(STR$a, STR$b))
|
||||
myTbl <- myTbl[order(myTbl, decreasing = TRUE)]
|
||||
|
||||
hist(myTbl, breaks = 40, col = "#FFEEBB")
|
||||
|
||||
# number of singletons
|
||||
sum(myTbl == 1) # almost a quarter
|
||||
|
||||
# maximum?
|
||||
myTbl[which(myTbl == max(myTbl))] # 9606.ENSP00000360532: 465
|
||||
# Google: CDC5L
|
||||
|
||||
# Zipf-plot
|
||||
plot(log(1:length(myTbl)), log(as.numeric(myTbl)),
|
||||
type = "b", cex = 0.7,
|
||||
main = "STRINGedges - degrees",
|
||||
xlab = "log(rank)",
|
||||
ylab = "log(frequency)",
|
||||
col = "#FFBB88")
|
||||
|
||||
# Mean number of interactions per protein: every row of STR is one
# interaction that involves two proteins, so the average is
# 2 * edges / nodes (edges / nodes alone is only half of that).
sprintf("Average number of interactions: %5.2f",
        2 * nrow(STR) / length(unique(c(STR$a, STR$b))))
|
||||
|
||||
|
||||
# = 6 Write to file =======================================================
|
||||
|
||||
saveRDS(STR, file = "./data/STRINGedges.rds")
|
||||
|
||||
# STRINGedges <- readRDS("./data/STRINGedges.rds") # use this to restore the
|
||||
# object when needed
|
||||
|
||||
|
||||
# [END]
|
||||
|
@ -1,167 +1,167 @@
|
||||
# tocID <- "scripts/ABC-makeScCCnet.R"
|
||||
#
|
||||
# Create a subnetwork of high-confidence yeast genes with a "mitotic cell cycle"
|
||||
# GOSlim annotation.
|
||||
#
|
||||
# Boris Steipe for ABC learning units
|
||||
#
|
||||
# Notes:
|
||||
#
|
||||
# The large source- datafiles are NOT posted to github. If you want to
|
||||
# experiment with your own code, download them and place them into your
|
||||
# local ./data directory.
|
||||
#
|
||||
# STRING data source:
|
||||
# Download page:
|
||||
# https://string-db.org/cgi/download.pl?species_text=Saccharomyces+cerevisiae
|
||||
# Data: (20.1 mb)
|
||||
# https://stringdb-static.org/download/protein.links.full.v11.0/4932.protein.links.full.v11.0.txt.gz
|
||||
#
|
||||
# GOSlim data source: (Note: this has moved from GO to SGD)
|
||||
# Info page: https://www.yeastgenome.org/downloads
|
||||
# Info page: http://sgd-archive.yeastgenome.org/curation/literature/
|
||||
# Data: (3 mb)
|
||||
# http://sgd-archive.yeastgenome.org/curation/literature/go_slim_mapping.tab
|
||||
#
|
||||
#
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017-10 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 2020 Update. GO Slim Yeast now at SGD
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 1.0 First code copied from 2016 material.
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
# ==============================================================================
|
||||
# SRCDIR <- "./instructor"
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------------------------------
|
||||
#TOC> 1 INITIALIZE 58
|
||||
#TOC> 2 STRING FUNCTIONAL INTERACTION DATA 66
|
||||
#TOC> 3 GOSlim FUNCTIONAL ANNOTATIONS 96
|
||||
#TOC> 3.1 Intersect interactions and annotations 122
|
||||
#TOC> 4 DEFINE THE CELL-CYCLE NETWORK 128
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 INITIALIZE ==========================================================
|
||||
|
||||
SRCDIR <- "./data"
|
||||
if (! requireNamespace("readr", quietly = TRUE)) {
|
||||
install.packages("readr")
|
||||
}
|
||||
|
||||
|
||||
# = 2 STRING FUNCTIONAL INTERACTION DATA ==================================
|
||||
|
||||
# Read STRING Data (needs to be downloaded from database, see URL in Notes)
|
||||
# The .gz compressed version is 20MB, the uncompressed version is 110MB -
|
||||
# really not necessary to uncompress since readr:: can read from compressed
|
||||
# files, and does so automatically, based on the file extension.
|
||||
( fn <- file.path(SRCDIR, "4932.protein.links.full.v11.0.txt.gz") )
|
||||
STR <- readr::read_delim(fn, delim = " ")
|
||||
|
||||
# Subset only IDs and combined_score column
|
||||
STR <- STR[ , c("protein1", "protein2", "combined_score")]
|
||||
|
||||
# head(STR)
|
||||
# sum(STR$combined_score > 909) # 100270 edges
|
||||
# subset for 100,000 highest confidence edges
|
||||
STR <- STR[(STR$combined_score > 909), ]
|
||||
head(STR)
|
||||
|
||||
# IDs are formatted like 4932.YAL005C ... drop the "4932." prefix
|
||||
STR$protein1 <- gsub("^4932\\.", "", STR$protein1)
|
||||
STR$protein2 <- gsub("^4932\\.", "", STR$protein2)
|
||||
head(STR)
|
||||
|
||||
# get a vector of gene names in this list
|
||||
myIntxGenes <- unique(c(STR$protein1, STR$protein2)) # yeast systematic gene
|
||||
# names
|
||||
length(myIntxGenes)
|
||||
sample(myIntxGenes, 10) # choose 10 at random (sanity check)
|
||||
|
||||
|
||||
# = 3 GOSlim FUNCTIONAL ANNOTATIONS =======================================
|
||||
#
|
||||
# Read GOSlim data (needs to be downloaded from database, see URL in Notes)
|
||||
( fn <- file.path(SRCDIR, "go_slim_mapping.tab") )
|
||||
|
||||
Gsl <- readr::read_tsv(fn,
|
||||
col_names = c("ID",
|
||||
"name",
|
||||
"SGDId",
|
||||
"Ontology",
|
||||
"termName",
|
||||
"termID",
|
||||
"status"))
|
||||
|
||||
head(Gsl)
|
||||
|
||||
# What cell cycle names does it contain?
|
||||
myGslTermNames <- unique(Gsl$termName) # 169 unique terms
|
||||
myGslTermNames[grep("cycle", myGslTermNames)]
|
||||
# [1] "regulation of cell cycle" "mitotic cell cycle" "meiotic cell cycle"
|
||||
|
||||
# Choose "mitotic cell cycle" as the GOslim term to subset with
|
||||
|
||||
scCCgenes <- unique(Gsl$ID[Gsl$termName == "mitotic cell cycle"])
|
||||
length(scCCgenes) # 324 genes annotated to that term
|
||||
|
||||
# == 3.1 Intersect interactions and annotations ============================
|
||||
|
||||
sum(scCCgenes %in% myIntxGenes) # 307 of these have high-confidence
|
||||
# # functional interactions
|
||||
|
||||
|
||||
# = 4 DEFINE THE CELL-CYCLE NETWORK =======================================
|
||||
#
|
||||
# Define scCCnet ... the S. cerevisiae Cell Cycle network
|
||||
# Subset all rows for which BOTH genes are in the GOslim cell cycle set
|
||||
#
|
||||
scCCnet <- STR[(STR$protein1 %in% scCCgenes) &
|
||||
(STR$protein2 %in% scCCgenes), ]
|
||||
|
||||
# How many genes are there?
|
||||
length(unique(c(scCCnet$protein1, scCCnet$protein2))) #283
|
||||
|
||||
# Each edge is listed twice - now remove duplicates.
|
||||
|
||||
# Step 1: make a vector: sort two names so the first one is alphabetically
|
||||
# smaller than the second one. This brings the two names into a defined
|
||||
# order. Then concatenate them with a "." - the resulting string
|
||||
# is always the same, for any order. E.g. c("A", "B") gives "A.B"
|
||||
# and c("B", "A") also gives "A.B". This identifies duplicates.
|
||||
|
||||
x <- apply(cbind(scCCnet$protein1, scCCnet$protein2),
|
||||
1,
|
||||
FUN = function(x) { return(paste(sort(x), collapse = ".")) })
|
||||
head(x) # "YAL016W.YGR040W" "YAL016W.YOR014W" "YAL016W.YDL188C" ... etc.
|
||||
|
||||
sum(duplicated(x)) # 1453
|
||||
|
||||
# Step 2: drop all rows that contain duplicates in x
|
||||
scCCnet <- scCCnet[! duplicated(x), ]
|
||||
|
||||
# Confirm we didn't lose genes
|
||||
length(unique(c(scCCnet$protein1, scCCnet$protein2))) # 283, no change
|
||||
nrow(scCCnet)
|
||||
# Network has 283 nodes, 1453 edges
|
||||
|
||||
saveRDS(scCCnet, file = "./data/scCCnet.rds")
|
||||
|
||||
# scCCnet <- readRDS("./data/scCCnet.rds") # <<<- use this to restore the
|
||||
# object when needed
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "scripts/ABC-makeScCCnet.R"
|
||||
#
|
||||
# Create a subnetwork of high-confidence yeast genes with a "mitotic cell cycle"
|
||||
# GOSlim annotation.
|
||||
#
|
||||
# Boris Steipe for ABC learning units
|
||||
#
|
||||
# Notes:
|
||||
#
|
||||
# The large source- datafiles are NOT posted to github. If you want to
|
||||
# experiment with your own code, download them and place them into your
|
||||
# local ./data directory.
|
||||
#
|
||||
# STRING data source:
|
||||
# Download page:
|
||||
# https://string-db.org/cgi/download.pl?species_text=Saccharomyces+cerevisiae
|
||||
# Data: (20.1 mb)
|
||||
# https://stringdb-static.org/download/protein.links.full.v11.0/4932.protein.links.full.v11.0.txt.gz
|
||||
#
|
||||
# GOSlim data source: (Note: this has moved from GO to SGD)
|
||||
# Info page: https://www.yeastgenome.org/downloads
|
||||
# Info page: http://sgd-archive.yeastgenome.org/curation/literature/
|
||||
# Data: (3 mb)
|
||||
# http://sgd-archive.yeastgenome.org/curation/literature/go_slim_mapping.tab
|
||||
#
|
||||
#
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017-10 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 2020 Update. GO Slim Yeast now at SGD
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 1.0 First code copied from 2016 material.
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
# ==============================================================================
|
||||
# SRCDIR <- "./instructor"
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------------------------------
|
||||
#TOC> 1 INITIALIZE 58
|
||||
#TOC> 2 STRING FUNCTIONAL INTERACTION DATA 66
|
||||
#TOC> 3 GOSlim FUNCTIONAL ANNOTATIONS 96
|
||||
#TOC> 3.1 Intersect interactions and annotations 122
|
||||
#TOC> 4 DEFINE THE CELL-CYCLE NETWORK 128
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 INITIALIZE ==========================================================
|
||||
|
||||
SRCDIR <- "./data"
|
||||
if (! requireNamespace("readr", quietly = TRUE)) {
|
||||
install.packages("readr")
|
||||
}
|
||||
|
||||
|
||||
# = 2 STRING FUNCTIONAL INTERACTION DATA ==================================
|
||||
|
||||
# Read STRING Data (needs to be downloaded from database, see URL in Notes)
|
||||
# The .gz compressed version is 20MB, the uncompressed version is 110MB -
|
||||
# really not necessary to uncompress since readr:: can read from compressed
|
||||
# files, and does so automatically, based on the file extension.
|
||||
( fn <- file.path(SRCDIR, "4932.protein.links.full.v11.0.txt.gz") )
|
||||
STR <- readr::read_delim(fn, delim = " ")
|
||||
|
||||
# Subset only IDs and combined_score column
|
||||
STR <- STR[ , c("protein1", "protein2", "combined_score")]
|
||||
|
||||
# head(STR)
|
||||
# sum(STR$combined_score > 909) # 100270 edges
|
||||
# subset for 100,000 highest confidence edges
|
||||
STR <- STR[(STR$combined_score > 909), ]
|
||||
head(STR)
|
||||
|
||||
# IDs are formatted like 4932.YAL005C ... drop the "4932." prefix
|
||||
STR$protein1 <- gsub("^4932\\.", "", STR$protein1)
|
||||
STR$protein2 <- gsub("^4932\\.", "", STR$protein2)
|
||||
head(STR)
|
||||
|
||||
# get a vector of gene names in this list
|
||||
myIntxGenes <- unique(c(STR$protein1, STR$protein2)) # yeast systematic gene
|
||||
# names
|
||||
length(myIntxGenes)
|
||||
sample(myIntxGenes, 10) # choose 10 at random (sanity check)
|
||||
|
||||
|
||||
# = 3 GOSlim FUNCTIONAL ANNOTATIONS =======================================
|
||||
#
|
||||
# Read GOSlim data (needs to be downloaded from database, see URL in Notes)
|
||||
( fn <- file.path(SRCDIR, "go_slim_mapping.tab") )
|
||||
|
||||
Gsl <- readr::read_tsv(fn,
|
||||
col_names = c("ID",
|
||||
"name",
|
||||
"SGDId",
|
||||
"Ontology",
|
||||
"termName",
|
||||
"termID",
|
||||
"status"))
|
||||
|
||||
head(Gsl)
|
||||
|
||||
# What cell cycle names does it contain?
|
||||
myGslTermNames <- unique(Gsl$termName) # 169 unique terms
|
||||
myGslTermNames[grep("cycle", myGslTermNames)]
|
||||
# [1] "regulation of cell cycle" "mitotic cell cycle" "meiotic cell cycle"
|
||||
|
||||
# Choose "mitotic cell cycle" as the GOslim term to subset with
|
||||
|
||||
scCCgenes <- unique(Gsl$ID[Gsl$termName == "mitotic cell cycle"])
|
||||
length(scCCgenes) # 324 genes annotated to that term
|
||||
|
||||
# == 3.1 Intersect interactions and annotations ============================
|
||||
|
||||
sum(scCCgenes %in% myIntxGenes) # 307 of these have high-confidence
|
||||
# # functional interactions
|
||||
|
||||
|
||||
# = 4 DEFINE THE CELL-CYCLE NETWORK =======================================
|
||||
#
|
||||
# Define scCCnet ... the S. cerevisiae Cell Cycle network
|
||||
# Subset all rows for which BOTH genes are in the GOslim cell cycle set
|
||||
#
|
||||
scCCnet <- STR[(STR$protein1 %in% scCCgenes) &
|
||||
(STR$protein2 %in% scCCgenes), ]
|
||||
|
||||
# How many genes are there?
|
||||
length(unique(c(scCCnet$protein1, scCCnet$protein2))) #283
|
||||
|
||||
# Each edge is listed twice - now remove duplicates.
|
||||
|
||||
# Step 1: make a vector: sort two names so the first one is alphabetically
|
||||
# smaller than the second one. This brings the two names into a defined
|
||||
# order. Then concatenate them with a "." - the resulting string
|
||||
# is always the same, for any order. E.g. c("A", "B") gives "A.B"
|
||||
# and c("B", "A") also gives "A.B". This identifies duplicates.
|
||||
|
||||
x <- apply(cbind(scCCnet$protein1, scCCnet$protein2),
|
||||
1,
|
||||
FUN = function(x) { return(paste(sort(x), collapse = ".")) })
|
||||
head(x) # "YAL016W.YGR040W" "YAL016W.YOR014W" "YAL016W.YDL188C" ... etc.
|
||||
|
||||
sum(duplicated(x)) # 1453
|
||||
|
||||
# Step 2: drop all rows that contain duplicates in x
|
||||
scCCnet <- scCCnet[! duplicated(x), ]
|
||||
|
||||
# Confirm we didn't lose genes
|
||||
length(unique(c(scCCnet$protein1, scCCnet$protein2))) # 283, no change
|
||||
nrow(scCCnet)
|
||||
# Network has 283 nodes, 1453 edges
|
||||
|
||||
saveRDS(scCCnet, file = "./data/scCCnet.rds")
|
||||
|
||||
# scCCnet <- readRDS("./data/scCCnet.rds") # <<<- use this to restore the
|
||||
# object when needed
|
||||
|
||||
|
||||
# [END]
|
||||
|
@ -1,135 +1,135 @@
|
||||
# tocID <- "scripts/ABC-writeALN.R"
|
||||
#
|
||||
# ToDo: calculate consensus line
|
||||
# append sequence numbers
|
||||
# Notes:
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
writeALN <- function(ali,
                     range,
                     note = "",
                     myCon = stdout(),
                     blockWidth = 60) {
  # Purpose:
  #     Write an Msa{AA|DNA|RNA}MultipleAlignment, an {AA|DNA|RNA}StringSet
  #     or a named character vector to stdout() or a connection in
  #     CLUSTAL W format.
  # Version:  2.0
  # Date:     2017 10
  # Author:   Boris Steipe
  #
  # Parameters:
  #     ali         Msa*MultipleAlignment, *StringSet or character vector.
  #                   If the sequences carry no names, the names "seq1",
  #                   "seq2", ... are used.
  #     range       num  a two-integer vector of start and end positions if
  #                        only a range of the MSA should be written, e.g.
  #                        a domain. Defaults to the full alignment length.
  #     note        chr  text that is appended to the "CLUSTAL W format. "
  #                        header line.
  #     myCon       a connection (cf. the con argument for writeLines).
  #                   Defaults to stdout()
  #     blockWidth  int  width of sequence block. Default 60 characters.
  # Value:
  #     NULL (invisible). The function is invoked for its side effect of
  #     printing an alignment to stdout() or a connection.

  blockWidth <- as.integer(blockWidth)
  if (length(blockWidth) != 1 || is.na(blockWidth)) {
    stop("PANIC: parameter \"blockWidth\" must be numeric.")
  }
  if (blockWidth < 1) {
    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
  }
  if (blockWidth > 60) {
    warning("Programs that read CLUSTAL format might not expect blockWidth > 60.")
  }

  # Extract the raw data from the objects depending on their respective class
  # and put it into a named vector of strings. inherits() is used rather than
  # class(ali) == "..." because class() may return more than one element and
  # a condition of length > 1 is an error in if().

  # Extract XStringSet from MsaXMultipleAlignment ...
  if (inherits(ali, c("MsaAAMultipleAlignment",
                      "MsaDNAMultipleAlignment",
                      "MsaRNAMultipleAlignment"))) {
    ali <- ali@unmasked
  }

  # Process XStringSet
  if (inherits(ali, c("AAStringSet", "DNAStringSet", "RNAStringSet"))) {
    sSet <- as.character(ali)  # we use as.character(), not toString() thus
                               # we don't _have_ to load Biostrings
  } else if (inherits(ali, "character")) {
    sSet <- ali
  } else {
    stop(paste("Input object of class",
               paste(class(ali), collapse = "/"),
               "can't be handled by this function."))
  }

  if (length(sSet) == 0) {
    stop("PANIC: input contains no sequences.")
  }

  # Fetch names before padding: paste0() below drops the names attribute.
  sNames <- names(sSet)
  if (is.null(sNames)) {
    sNames <- paste0("seq", seq_along(sSet))
  }

  if (missing(range)) {
    range <- c(1L, max(nchar(sSet)))
    if (range[2] < 1) {
      stop("PANIC: all sequences are empty.")
    }
  } else {
    range <- as.integer(range)
    if (length(range) != 2 ||
        any(is.na(range)) ||
        range[1] > range[2] ||
        range[1] < 1) {
      stop("PANIC: \"range\" parameter must contain valid start and end index.")
    }
  }

  # Right-pad any sequence with "-" that is shorter than range[2]
  sSet <- paste0(sSet, strrep("-", pmax(0L, range[2] - nchar(sSet))))

  # Right-pad sequence names to a common width
  nameWidth <- max(nchar(sNames)) + 2  # longest name plus two spaces
  sNames <- paste0(sNames, strrep(" ", nameWidth - nchar(sNames)))

  # Header line, followed by an empty line
  txt <- paste0("CLUSTAL W format. ", note)
  txt[2] <- ""

  iStarts <- seq(range[1], range[2], by = blockWidth)
  iEnds <- c((iStarts[-1] - 1), range[2])

  # One group of lines per block: all sequence rows, then a blank consensus
  # line and a blank separator line.
  blocks <- lapply(seq_along(iStarts), function(i) {
    c(paste0(sNames, substring(sSet, iStarts[i], iEnds[i])), "", "")
  })

  txt <- c(txt, unlist(blocks, use.names = FALSE))

  writeLines(txt, con = myCon)

  invisible(NULL)
}
|
||||
|
||||
# ==== TESTS =================================================================
|
||||
# Enter your function tests here...
|
||||
|
||||
if (FALSE) {
|
||||
# test ...
|
||||
}
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "scripts/ABC-writeALN.R"
|
||||
#
|
||||
# ToDo: calculate consensus line
|
||||
# append sequence numbers
|
||||
# Notes:
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
writeALN <- function(ali,
                     range,
                     note = "",
                     myCon = stdout(),
                     blockWidth = 60) {
  # Purpose:
  #     Write an Msa{AA|DNA|RNA}MultipleAlignment, an {AA|DNA|RNA}StringSet
  #     or a named character vector to stdout() or a connection in
  #     CLUSTAL W format.
  # Version:  2.0
  # Date:     2017 10
  # Author:   Boris Steipe
  #
  # Parameters:
  #     ali         Msa*MultipleAlignment, *StringSet or character vector.
  #                   If the sequences carry no names, the names "seq1",
  #                   "seq2", ... are used.
  #     range       num  a two-integer vector of start and end positions if
  #                        only a range of the MSA should be written, e.g.
  #                        a domain. Defaults to the full alignment length.
  #     note        chr  text that is appended to the "CLUSTAL W format. "
  #                        header line.
  #     myCon       a connection (cf. the con argument for writeLines).
  #                   Defaults to stdout()
  #     blockWidth  int  width of sequence block. Default 60 characters.
  # Value:
  #     NULL (invisible). The function is invoked for its side effect of
  #     printing an alignment to stdout() or a connection.

  blockWidth <- as.integer(blockWidth)
  if (length(blockWidth) != 1 || is.na(blockWidth)) {
    stop("PANIC: parameter \"blockWidth\" must be numeric.")
  }
  if (blockWidth < 1) {
    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
  }
  if (blockWidth > 60) {
    warning("Programs that read CLUSTAL format might not expect blockWidth > 60.")
  }

  # Extract the raw data from the objects depending on their respective class
  # and put it into a named vector of strings. inherits() is used rather than
  # class(ali) == "..." because class() may return more than one element and
  # a condition of length > 1 is an error in if().

  # Extract XStringSet from MsaXMultipleAlignment ...
  if (inherits(ali, c("MsaAAMultipleAlignment",
                      "MsaDNAMultipleAlignment",
                      "MsaRNAMultipleAlignment"))) {
    ali <- ali@unmasked
  }

  # Process XStringSet
  if (inherits(ali, c("AAStringSet", "DNAStringSet", "RNAStringSet"))) {
    sSet <- as.character(ali)  # we use as.character(), not toString() thus
                               # we don't _have_ to load Biostrings
  } else if (inherits(ali, "character")) {
    sSet <- ali
  } else {
    stop(paste("Input object of class",
               paste(class(ali), collapse = "/"),
               "can't be handled by this function."))
  }

  if (length(sSet) == 0) {
    stop("PANIC: input contains no sequences.")
  }

  # Fetch names before padding: paste0() below drops the names attribute.
  sNames <- names(sSet)
  if (is.null(sNames)) {
    sNames <- paste0("seq", seq_along(sSet))
  }

  if (missing(range)) {
    range <- c(1L, max(nchar(sSet)))
    if (range[2] < 1) {
      stop("PANIC: all sequences are empty.")
    }
  } else {
    range <- as.integer(range)
    if (length(range) != 2 ||
        any(is.na(range)) ||
        range[1] > range[2] ||
        range[1] < 1) {
      stop("PANIC: \"range\" parameter must contain valid start and end index.")
    }
  }

  # Right-pad any sequence with "-" that is shorter than range[2]
  sSet <- paste0(sSet, strrep("-", pmax(0L, range[2] - nchar(sSet))))

  # Right-pad sequence names to a common width
  nameWidth <- max(nchar(sNames)) + 2  # longest name plus two spaces
  sNames <- paste0(sNames, strrep(" ", nameWidth - nchar(sNames)))

  # Header line, followed by an empty line
  txt <- paste0("CLUSTAL W format. ", note)
  txt[2] <- ""

  iStarts <- seq(range[1], range[2], by = blockWidth)
  iEnds <- c((iStarts[-1] - 1), range[2])

  # One group of lines per block: all sequence rows, then a blank consensus
  # line and a blank separator line.
  blocks <- lapply(seq_along(iStarts), function(i) {
    c(paste0(sNames, substring(sSet, iStarts[i], iEnds[i])), "", "")
  })

  txt <- c(txt, unlist(blocks, use.names = FALSE))

  writeLines(txt, con = myCon)

  invisible(NULL)
}
|
||||
|
||||
# ==== TESTS =================================================================
|
||||
# Enter your function tests here...
|
||||
|
||||
if (FALSE) {
|
||||
# test ...
|
||||
}
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
|
@ -1,121 +1,121 @@
|
||||
# ABC-writeMFA.R
|
||||
#
|
||||
# ToDo:
|
||||
# Notes: 2.1 bugfix: empty notes caused superfluous blank after header.
|
||||
#
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
writeMFA <- function(ali,
                     range,
                     note = "",
                     myCon = stdout(),
                     blockWidth = 80) {
  # Purpose:
  #     Write an Msa{AA|DNA|RNA}MultipleAlignment, an {AA|DNA|RNA}StringSet
  #     or a named character vector to stdout() or a connection in
  #     multi-FASTA format.
  # Version:  2.1
  # Date:     2017 10
  # Author:   Boris Steipe
  #
  # Parameters:
  #     ali         Msa*MultipleAlignment, *StringSet or character vector.
  #                   If the sequences carry no names, the names "seq1",
  #                   "seq2", ... are used for the headers.
  #     range       num  a two-integer vector of start and end positions if
  #                        only a range of the MSA should be written, e.g.
  #                        a domain. Defaults to the full sequence length.
  #     note        chr  a vector of character that is appended to the name
  #                        of a sequence in the FASTA header. Recycling of
  #                        shorter vectors applies, thus a vector of length one
  #                        is added to all headers. Empty strings add nothing.
  #     myCon       a connection (cf. the con argument for writeLines).
  #                   Defaults to stdout()
  #     blockWidth  int  width of sequence block. Default 80 characters.
  # Value:
  #     NULL (invisible). The function is invoked for its side effect of
  #     printing sequences to stdout() or a connection.

  blockWidth <- as.integer(blockWidth)
  if (length(blockWidth) != 1 || is.na(blockWidth)) {
    stop("PANIC: parameter \"blockWidth\" must be numeric.")
  }
  if (! blockWidth > 0) {
    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
  }

  # Extract the raw data from the objects depending on their respective class
  # and put it into a named vector of strings. inherits() is used rather than
  # class(ali) == "..." because class() may return more than one element and
  # a condition of length > 1 is an error in if().

  # Extract XStringSet from MsaXMultipleAlignment ...
  if (inherits(ali, c("MsaAAMultipleAlignment",
                      "MsaDNAMultipleAlignment",
                      "MsaRNAMultipleAlignment"))) {
    ali <- ali@unmasked
  }

  # Process XStringSet
  if (inherits(ali, c("AAStringSet", "DNAStringSet", "RNAStringSet"))) {
    sSet <- as.character(ali)  # we use as.character(), not toString() thus
                               # we don't _have_ to load Biostrings
  } else if (inherits(ali, "character")) {
    sSet <- ali
  } else {
    stop(paste("Input object of class",
               paste(class(ali), collapse = "/"),
               "can't be handled by this function."))
  }

  if (length(sSet) == 0) {
    stop("PANIC: input contains no sequences.")
  }

  if (missing(range)) {
    range <- c(1L, max(nchar(sSet)))
    if (range[2] < 1) {
      stop("PANIC: all sequences are empty.")
    }
  } else {
    range <- as.integer(range)
    if (length(range) != 2 ||
        any(is.na(range)) ||
        range[1] > range[2] ||
        range[1] < 1) {
      stop("PANIC: \"range\" parameter must contain valid start and end index.")
    }
  }

  # Construct one header per sequence. The note is recycled over all
  # sequences; an empty note must not leave a trailing blank after the name.
  sNames <- names(sSet)
  if (is.null(sNames)) {
    sNames <- paste0("seq", seq_along(sSet))
  }
  if (length(note) == 0) {
    note <- ""
  }
  note <- rep_len(as.character(note), length(sSet))
  hasNote <- !is.na(note) & nzchar(note)
  headers <- ifelse(hasNote, paste(sNames, note), sNames)

  # Block boundaries are the same for every sequence.
  iStarts <- seq(range[1], range[2], by = blockWidth)
  iEnds <- c((iStarts[-1] - 1), range[2])

  # One record per sequence: FASTA header, the sequence in blocks of
  # blockWidth per line (empty blocks dropped), and an empty line for
  # readability.
  records <- lapply(seq_along(sSet), function(i) {
    thisSeq <- substring(sSet[i], iStarts, iEnds)
    c(sprintf(">%s", headers[i]), thisSeq[nzchar(thisSeq)], "")
  })

  txt <- unlist(records, use.names = FALSE)

  writeLines(txt, con = myCon)

  invisible(NULL)
}
|
||||
|
||||
# ==== TESTS =================================================================
|
||||
# Enter your function tests here...
|
||||
|
||||
if (FALSE) {
|
||||
# test ...
|
||||
}
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
# ABC-writeMFA.R
|
||||
#
|
||||
# ToDo:
|
||||
# Notes: 2.1 bugfix: empty notes caused superfluous blank after header.
|
||||
#
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
writeMFA <- function(ali,
                     range,
                     note = "",
                     myCon = stdout(),
                     blockWidth = 80) {
  # Purpose:
  #     Write an Msa{AA|DNA|RNA}MultipleAlignment, an {AA|DNA|RNA}StringSet
  #     or a named character vector to stdout() or a connection in
  #     multi-FASTA format.
  # Version:  2.1
  # Date:     2017 10
  # Author:   Boris Steipe
  #
  # Parameters:
  #     ali         Msa*MultipleAlignment, *StringSet or character vector.
  #                   If the sequences carry no names, the names "seq1",
  #                   "seq2", ... are used for the headers.
  #     range       num  a two-integer vector of start and end positions if
  #                        only a range of the MSA should be written, e.g.
  #                        a domain. Defaults to the full sequence length.
  #     note        chr  a vector of character that is appended to the name
  #                        of a sequence in the FASTA header. Recycling of
  #                        shorter vectors applies, thus a vector of length one
  #                        is added to all headers. Empty strings add nothing.
  #     myCon       a connection (cf. the con argument for writeLines).
  #                   Defaults to stdout()
  #     blockWidth  int  width of sequence block. Default 80 characters.
  # Value:
  #     NULL (invisible). The function is invoked for its side effect of
  #     printing sequences to stdout() or a connection.

  blockWidth <- as.integer(blockWidth)
  if (length(blockWidth) != 1 || is.na(blockWidth)) {
    stop("PANIC: parameter \"blockWidth\" must be numeric.")
  }
  if (! blockWidth > 0) {
    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
  }

  # Extract the raw data from the objects depending on their respective class
  # and put it into a named vector of strings. inherits() is used rather than
  # class(ali) == "..." because class() may return more than one element and
  # a condition of length > 1 is an error in if().

  # Extract XStringSet from MsaXMultipleAlignment ...
  if (inherits(ali, c("MsaAAMultipleAlignment",
                      "MsaDNAMultipleAlignment",
                      "MsaRNAMultipleAlignment"))) {
    ali <- ali@unmasked
  }

  # Process XStringSet
  if (inherits(ali, c("AAStringSet", "DNAStringSet", "RNAStringSet"))) {
    sSet <- as.character(ali)  # we use as.character(), not toString() thus
                               # we don't _have_ to load Biostrings
  } else if (inherits(ali, "character")) {
    sSet <- ali
  } else {
    stop(paste("Input object of class",
               paste(class(ali), collapse = "/"),
               "can't be handled by this function."))
  }

  if (length(sSet) == 0) {
    stop("PANIC: input contains no sequences.")
  }

  if (missing(range)) {
    range <- c(1L, max(nchar(sSet)))
    if (range[2] < 1) {
      stop("PANIC: all sequences are empty.")
    }
  } else {
    range <- as.integer(range)
    if (length(range) != 2 ||
        any(is.na(range)) ||
        range[1] > range[2] ||
        range[1] < 1) {
      stop("PANIC: \"range\" parameter must contain valid start and end index.")
    }
  }

  # Construct one header per sequence. The note is recycled over all
  # sequences; an empty note must not leave a trailing blank after the name.
  sNames <- names(sSet)
  if (is.null(sNames)) {
    sNames <- paste0("seq", seq_along(sSet))
  }
  if (length(note) == 0) {
    note <- ""
  }
  note <- rep_len(as.character(note), length(sSet))
  hasNote <- !is.na(note) & nzchar(note)
  headers <- ifelse(hasNote, paste(sNames, note), sNames)

  # Block boundaries are the same for every sequence.
  iStarts <- seq(range[1], range[2], by = blockWidth)
  iEnds <- c((iStarts[-1] - 1), range[2])

  # One record per sequence: FASTA header, the sequence in blocks of
  # blockWidth per line (empty blocks dropped), and an empty line for
  # readability.
  records <- lapply(seq_along(sSet), function(i) {
    thisSeq <- substring(sSet[i], iStarts, iEnds)
    c(sprintf(">%s", headers[i]), thisSeq[nzchar(thisSeq)], "")
  })

  txt <- unlist(records, use.names = FALSE)

  writeLines(txt, con = myCon)

  invisible(NULL)
}
|
||||
|
||||
# ==== TESTS =================================================================
|
||||
# Enter your function tests here...
|
||||
|
||||
if (FALSE) {
|
||||
# test ...
|
||||
}
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
|
768
scripts/BLAST.R
768
scripts/BLAST.R
@ -1,384 +1,384 @@
|
||||
# BLAST.R
|
||||
#
|
||||
# Purpose: Send off one BLAST search and return parsed list of results
|
||||
# This script uses the BLAST URL-API
|
||||
# (Application Programming Interface) at the NCBI.
|
||||
# Read about the constraints here:
|
||||
# https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
|
||||
#
|
||||
#
|
||||
# Version: 3.2
|
||||
# Date: 2016 09 - 2020 09
|
||||
# Author: Boris Steipe
|
||||
#
|
||||
# Versions:
|
||||
# 3.2 2020 updates
|
||||
# 3.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 3.0 parsing logic had not been fully implemented; Fixed.
|
||||
# 2.1 bugfix in BLAST(), bug was blanking non-split deflines;
|
||||
# refactored parseBLASTalignment() to handle lists with multiple hits.
|
||||
# 2.0 Completely rewritten because the interface completely changed.
|
||||
# Code adapted in part from NCBI Perl sample code:
|
||||
# $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
|
||||
# 1.0 first version posted for BCH441 2016, based on BLAST - API
|
||||
#
|
||||
# ToDo: Return the organism/strain name in the output, and propagate
|
||||
# into MYSPE selection script.
|
||||
#
|
||||
# Notes: This is somewhat pedestrian, but apparently there are currently
|
||||
# no R packages that contain such code.
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
if (! requireNamespace("httr", quietly = TRUE)) {
|
||||
install.packages("httr")
|
||||
}
|
||||
|
||||
|
||||
BLAST <- function(Q,
|
||||
db = "refseq_protein",
|
||||
nHits = 30,
|
||||
E = 0.1,
|
||||
limits = "",
|
||||
rid = "",
|
||||
query = "",
|
||||
quietly = FALSE,
|
||||
myTimeout = 120) {
|
||||
# Purpose:
|
||||
# Basic BLAST search
|
||||
#
|
||||
# Parameters:
|
||||
# Q: query - either a valid ID or a sequence
|
||||
# db: "refseq_protein" by default,
|
||||
# other legal values include: "nr", "pdb", "swissprot" ...
|
||||
# nHits: number of hits to maximally return
|
||||
# E: E-value cutoff. Do not return hits whose score would be expected
|
||||
# to occur E or more times in a database of random sequence.
|
||||
# limits: a valid ENTREZ filter
|
||||
# rid: a request ID - to retrieve earlier search results
|
||||
# query: the actual query string (needed when retrieving results
|
||||
# with an rid)
|
||||
# quietly: controls printing of wait-time progress bar
|
||||
# myTimeout: how much longer _after_ rtoe to wait for a result
|
||||
# before giving up (seconds)
|
||||
# Value:
|
||||
# result: list of process status or resulting hits, and some metadata
|
||||
|
||||
|
||||
EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done
|
||||
|
||||
results <- list()
|
||||
results$query = query
|
||||
results$rid <- rid
|
||||
results$rtoe <- 0
|
||||
|
||||
if (rid == "") { # If no rid is available, spawn a search.
|
||||
# Else, proceed directly to retrieval.
|
||||
|
||||
# prepare query, GET(), and parse rid and rtoe from BLAST server response
|
||||
results$query <- paste0("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
|
||||
"?",
|
||||
"CMD=Put",
|
||||
"&PROGRAM=", "blastp",
|
||||
"&QUERY=", URLencode(Q),
|
||||
"&DATABASE=", db,
|
||||
"&MATRIX=", "BLOSUM62",
|
||||
"&EXPECT=", as.character(E),
|
||||
"&HITLIST_SIZE=", as.character(nHits),
|
||||
"&ALIGNMENTS=", as.character(nHits),
|
||||
"&FORMAT_TYPE=Text")
|
||||
|
||||
if (limits != "") {
|
||||
results$query <- paste0(
|
||||
results$query,
|
||||
"&ENTREZ_QUERY=", limits)
|
||||
}
|
||||
|
||||
# send it off ...
|
||||
response <- httr::GET(results$query)
|
||||
if (httr::http_status(response)$category != "Success" ) {
|
||||
stop(sprintf("PANIC: Can't send query. BLAST server status error: %s",
|
||||
httr::http_status(response)$message))
|
||||
}
|
||||
|
||||
txt <- httr::content(response, "text", encoding = "UTF-8")
|
||||
|
||||
patt <- "RID = (\\w+)" # match the request id
|
||||
results$rid <- regmatches(txt, regexec(patt, txt))[[1]][2]
|
||||
|
||||
patt <- "RTOE = (\\d+)" # match the expected completion time
|
||||
results$rtoe <- as.numeric(regmatches(txt, regexec(patt, txt))[[1]][2])
|
||||
|
||||
# Now we wait ...
|
||||
if (quietly) {
|
||||
Sys.sleep(results$rtoe)
|
||||
} else {
|
||||
cat(sprintf("BLAST is processing %s:\n", results$rid))
|
||||
waitTimer(results$rtoe)
|
||||
}
|
||||
|
||||
} # done sending query and retrieving rid, rtoe
|
||||
|
||||
# Enter an infinite loop to check for result availability
|
||||
checkStatus <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
|
||||
"?",
|
||||
"CMD=Get",
|
||||
"&RID=", results$rid,
|
||||
"&FORMAT_TYPE=Text",
|
||||
"&FORMAT_OBJECT=SearchInfo",
|
||||
sep = "")
|
||||
|
||||
while (TRUE) {
|
||||
# Check whether the result is ready
|
||||
response <- httr::GET(checkStatus)
|
||||
if (httr::http_status(response)$category != "Success" ) {
|
||||
stop(sprintf("PANIC: Can't check status. BLAST server status error: %s",
|
||||
httr::http_status(response)$message))
|
||||
}
|
||||
|
||||
txt <- httr::content(response, "text", encoding = "UTF-8")
|
||||
|
||||
if (length(grep("Status=WAITING", txt)) > 0) {
|
||||
myTimeout <- myTimeout - EXTRAWAIT
|
||||
|
||||
if (myTimeout <= 0) { # abort
|
||||
cat("BLAST search not concluded before timeout. Aborting.\n")
|
||||
cat(sprintf("%s BLASThits <- BLAST(rid=\"%s\")\n",
|
||||
"Trying checking back later with >",
|
||||
results$rid))
|
||||
return(results)
|
||||
}
|
||||
|
||||
if (quietly) {
|
||||
Sys.sleep(EXTRAWAIT)
|
||||
} else {
|
||||
cat(sprintf("Status: Waiting. Wait %d more seconds (max. %d more)",
|
||||
EXTRAWAIT,
|
||||
myTimeout))
|
||||
waitTimer(EXTRAWAIT)
|
||||
next
|
||||
}
|
||||
|
||||
} else if (length(grep("Status=FAILED", txt)) > 0) {
|
||||
cat("BLAST search returned status \"FAILED\". Aborting.\n")
|
||||
return(results)
|
||||
|
||||
} else if (length(grep("Status=UNKNOWN", txt)) > 0) {
|
||||
cat("BLAST search returned status \"UNKNOWN\".\n")
|
||||
cat("This probably means the rid has expired. Aborting.\n")
|
||||
return(results)
|
||||
|
||||
} else if (length(grep("Status=READY", txt)) > 0) { # Done
|
||||
|
||||
if (length(grep("ThereAreHits=yes", txt)) == 0) { # No hits
|
||||
cat("BLAST search ready but no hits found. Aborting.\n")
|
||||
return(results)
|
||||
|
||||
} else {
|
||||
break # done ... retrieve search result
|
||||
}
|
||||
}
|
||||
} # end result-check loop
|
||||
|
||||
# retrieve results from BLAST server
|
||||
retrieve <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
|
||||
"?",
|
||||
"&CMD=Get",
|
||||
"&RID=", results$rid,
|
||||
"&FORMAT_TYPE=Text",
|
||||
sep = "")
|
||||
|
||||
response <- httr::GET(retrieve)
|
||||
if (httr::http_status(response)$category != "Success" ) {
|
||||
stop(sprintf("PANIC: Can't retrieve. BLAST server status error: %s",
|
||||
httr::http_status(response)$message))
|
||||
}
|
||||
|
||||
txt <- httr::content(response, "text", encoding = "UTF-8")
|
||||
|
||||
# txt contains the whole set of results. Process:
|
||||
|
||||
# First, we strsplit() on linebreaks:
|
||||
txt <- unlist(strsplit(txt, "\n"))
|
||||
|
||||
# The alignments range from the first line that begins with ">" ...
|
||||
iFirst <- grep("^>", txt)[1]
|
||||
|
||||
# ... to the last line that begins with "Sbjct"
|
||||
x <- grep("^Sbjct", txt)
|
||||
iLast <- x[length(x)]
|
||||
|
||||
# Get the alignments block
|
||||
txt <- txt[iFirst:iLast]
|
||||
|
||||
# Drop empty lines
|
||||
txt <- txt[!(nchar(txt) == 0)]
|
||||
|
||||
# A line that ends "]" but does not begin ">" seems to be a split
|
||||
# defline ... eg.
|
||||
# [1] ">XP_013349208.1 AUEXF2481DRAFT_695809 [Aureobasidium subglaciale "
|
||||
# [2] "EXF-2481]"
|
||||
# Merge these lines to the preceding lines and delete them.
|
||||
#
|
||||
x <- which(grepl("]$", txt) & !(grepl("^>", txt)))
|
||||
if (length(x) > 0) {
|
||||
txt[x-1] <- paste0(txt[x-1], txt[x])
|
||||
txt <- txt[-x]
|
||||
}
|
||||
|
||||
# Special case: there may be multiple deflines when the BLAST hit is to
|
||||
# redundant, identical sequences. Keep only the first instance.
|
||||
iKeep <- ! grepl("^>", txt)
|
||||
x <- rle(iKeep)
|
||||
x$positions <- cumsum(x$lengths)
|
||||
i <- which(x$lengths > 1 & x$values == FALSE)
|
||||
if (length(i) > 0) {
|
||||
firsts <- x$positions[i] - x$lengths[i] + 1
|
||||
iKeep[firsts] <- TRUE
|
||||
txt <- txt[iKeep]
|
||||
}
|
||||
|
||||
# After this preprocessing the following should be true:
|
||||
# - Every alignment block begins with a defline in which the
|
||||
# first character is ">"
|
||||
# - There is only one defline in each block.
|
||||
# - Lines are not split.
|
||||
|
||||
# Make a dataframe of first and last indices of alignment blocks
|
||||
x <- grep("^>", txt)
|
||||
blocks <- data.frame(iFirst = x,
|
||||
iLast = c((x[-1] - 1), length(txt)))
|
||||
|
||||
# Build the hits list by parsing the blocks
|
||||
results$hits <- list()
|
||||
|
||||
for (i in seq_len(nrow(blocks))) {
|
||||
thisBlock <- txt[blocks$iFirst[i]:blocks$iLast[i]]
|
||||
results$hits[[i]] <- parseBLASTalignment(thisBlock)
|
||||
}
|
||||
|
||||
return(results)
|
||||
}
|
||||
|
||||
parseBLASTalignment <- function(hit) {
|
||||
# Parse data from a character vector containing a BLAST hit
|
||||
# Parameters:
|
||||
# hit char one BLAST hit as char vector
|
||||
# Value:
|
||||
# list $def chr defline
|
||||
# $accession chr accession number
|
||||
# $organism chr complete organism definition
|
||||
# $species chr binomial species
|
||||
# $E num E value
|
||||
# $lengthAli num length of the alignment
|
||||
# $nIdentities num number of identities
|
||||
# $nGaps num number of gaps
|
||||
# $Qbounds num 2-element vector of query start-end
|
||||
# $Sbounds num 2-element vector of subject start-end
|
||||
# $Qseq chr query sequence
|
||||
# $midSeq chr midline string
|
||||
# $Sseq chr subject sequence
|
||||
|
||||
getToken <- function(patt, v) {
|
||||
# get the first token identified by pattern patt in character vector v
|
||||
v <- v[grep(patt, v)]
|
||||
if (length(v) > 1) { v <- v[1] }
|
||||
if (length(v) == 0) { token <- NA
|
||||
} else {
|
||||
token <- regmatches(v, regexec(patt, v))[[1]][2] }
|
||||
return(token)
|
||||
}
|
||||
|
||||
h <- list()
|
||||
|
||||
# FASTA defline
|
||||
h$def <- hit[1]
|
||||
|
||||
# accession number (ID), use the first if there are several, separated by "|"
|
||||
patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
|
||||
h$accession <- regmatches(h$def, regexec(patt, h$def))[[1]][2]
|
||||
|
||||
# organism
|
||||
patt <- "\\[(.+)]"
|
||||
h$organism <- regmatches(h$def, regexec(patt, h$def))[[1]][2]
|
||||
|
||||
# species
|
||||
x <- unlist(strsplit(h$organism, "\\s+"))
|
||||
if (length(x) >= 2) {
|
||||
h$species <- paste(x[1], x[2])
|
||||
} else if (length(x) == 1) {
|
||||
h$species <- paste(x[1], "sp.")
|
||||
} else {
|
||||
h$species <- NA
|
||||
}
|
||||
|
||||
# E-value
|
||||
h$E <- as.numeric(getToken("Expect\\s*=(.+?), Method", hit))
|
||||
|
||||
# length of alignment
|
||||
h$lengthAli <- as.numeric(getToken("^\\s*Length\\s*=(.+)$", hit))
|
||||
|
||||
# number of identities
|
||||
h$nIdentities <- as.numeric(getToken("^\\s*Identities\\s*=(.+?)/", hit))
|
||||
|
||||
# number of gaps
|
||||
h$nGaps <- as.numeric(getToken("\\s*Gaps\\s*=(.+?)/", hit))
|
||||
|
||||
# split up alignment section
|
||||
idx <- grep("^Query ", hit)
|
||||
Que <- hit[idx]
|
||||
Mid <- hit[idx + 1]
|
||||
Sbj <- hit[idx + 2]
|
||||
|
||||
# first and last positions
|
||||
h$Qbounds <- c(start = 0, end = 0)
|
||||
h$Qbounds[1] <- as.numeric(getToken("^Query\\s*(\\d+)", Que[1]))
|
||||
h$Qbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Que[length(Que)]))
|
||||
|
||||
h$Sbounds <- c(start = 0, end = 0)
|
||||
h$Sbounds[1] <- as.numeric(getToken("^Sbjct\\s*(\\d+)", Sbj[1]))
|
||||
h$Sbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Sbj[length(Sbj)]))
|
||||
|
||||
# aligned sequences
|
||||
for (i in seq_along(Que)) {
|
||||
patt <- ("^\\s*Query\\s*\\d+\\s*([A-Za-z-]+)") # capture aligned string
|
||||
m <- regexec(patt, Que[i])
|
||||
iFirst <- m[[1]][2]
|
||||
iLast <- iFirst + attr(m[[1]], which = "match.length")[2] - 1
|
||||
Que[i] <- substring(Que[i], iFirst, iLast)
|
||||
Mid[i] <- substring(Mid[i], iFirst, iLast)
|
||||
Sbj[i] <- substring(Sbj[i], iFirst, iLast)
|
||||
}
|
||||
|
||||
h$Qseq <- paste0(Que, collapse = "")
|
||||
h$midSeq <- paste0(Mid, collapse = "")
|
||||
h$Sseq <- paste0(Sbj, collapse = "")
|
||||
|
||||
return(h)
|
||||
}
|
||||
|
||||
|
||||
# ==== TESTS ===================================================================
|
||||
|
||||
if (FALSE) {
|
||||
# define query:
|
||||
q <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain
|
||||
"LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
|
||||
"GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
|
||||
sep="")
|
||||
# or ...
|
||||
q <- "NP_010227" # refseq ID
|
||||
|
||||
test <- BLAST(q,
|
||||
nHits = 100,
|
||||
E = 0.001,
|
||||
rid = "",
|
||||
limits = "txid4751[ORGN]") # Fungi
|
||||
str(test)
|
||||
length(test$hits)
|
||||
}
|
||||
|
||||
# [END]
|
||||
|
||||
# BLAST.R
|
||||
#
|
||||
# Purpose: Send off one BLAST search and return parsed list of results
|
||||
# This script uses the BLAST URL-API
|
||||
# (Application Programming Interface) at the NCBI.
|
||||
# Read about the constraints here:
|
||||
# https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
|
||||
#
|
||||
#
|
||||
# Version: 3.2
|
||||
# Date: 2016 09 - 2020 09
|
||||
# Author: Boris Steipe
|
||||
#
|
||||
# Versions:
|
||||
# 3.2 2020 updates
|
||||
# 3.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 3.0 parsing logic had not been fully implemented; Fixed.
|
||||
# 2.1 bugfix in BLAST(), bug was blanking non-split deflines;
|
||||
# refactored parseBLASTalignment() to handle lists with multiple hits.
|
||||
# 2.0 Completely rewritten because the interface completely changed.
|
||||
# Code adapted in part from NCBI Perl sample code:
|
||||
# $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
|
||||
# 1.0 first version posted for BCH441 2016, based on BLAST - API
|
||||
#
|
||||
# ToDo: Return the organism/strain name in the output, and propagate
|
||||
# into MYSPE selection script.
|
||||
#
|
||||
# Notes: This is somewhat pedestrian, but apparently there are currently
|
||||
# no R packages that contain such code.
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
if (! requireNamespace("httr", quietly = TRUE)) {
|
||||
install.packages("httr")
|
||||
}
|
||||
|
||||
|
||||
BLAST <- function(Q,
|
||||
db = "refseq_protein",
|
||||
nHits = 30,
|
||||
E = 0.1,
|
||||
limits = "",
|
||||
rid = "",
|
||||
query = "",
|
||||
quietly = FALSE,
|
||||
myTimeout = 120) {
|
||||
# Purpose:
|
||||
# Basic BLAST search
|
||||
#
|
||||
# Parameters:
|
||||
# Q: query - either a valid ID or a sequence
|
||||
# db: "refseq_protein" by default,
|
||||
# other legal values include: "nr", "pdb", "swissprot" ...
|
||||
# nHits: number of hits to maximally return
|
||||
# E: E-value cutoff. Do not return hits whose score would be expected
|
||||
# to occur E or more times in a database of random sequence.
|
||||
# limits: a valid ENTREZ filter
|
||||
# rid: a request ID - to retrieve earlier search results
|
||||
# query: the actual query string (needed when retrieving results
|
||||
# with an rid)
|
||||
# quietly: controls printing of wait-time progress bar
|
||||
# myTimeout: how much longer _after_ rtoe to wait for a result
|
||||
# before giving up (seconds)
|
||||
# Value:
|
||||
# result: list of process status or resulting hits, and some metadata
|
||||
|
||||
|
||||
EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done
|
||||
|
||||
results <- list()
|
||||
results$query = query
|
||||
results$rid <- rid
|
||||
results$rtoe <- 0
|
||||
|
||||
if (rid == "") { # If no rid is available, spawn a search.
|
||||
# Else, proceed directly to retrieval.
|
||||
|
||||
# prepare query, GET(), and parse rid and rtoe from BLAST server response
|
||||
results$query <- paste0("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
|
||||
"?",
|
||||
"CMD=Put",
|
||||
"&PROGRAM=", "blastp",
|
||||
"&QUERY=", URLencode(Q),
|
||||
"&DATABASE=", db,
|
||||
"&MATRIX=", "BLOSUM62",
|
||||
"&EXPECT=", as.character(E),
|
||||
"&HITLIST_SIZE=", as.character(nHits),
|
||||
"&ALIGNMENTS=", as.character(nHits),
|
||||
"&FORMAT_TYPE=Text")
|
||||
|
||||
if (limits != "") {
|
||||
results$query <- paste0(
|
||||
results$query,
|
||||
"&ENTREZ_QUERY=", limits)
|
||||
}
|
||||
|
||||
# send it off ...
|
||||
response <- httr::GET(results$query)
|
||||
if (httr::http_status(response)$category != "Success" ) {
|
||||
stop(sprintf("PANIC: Can't send query. BLAST server status error: %s",
|
||||
httr::http_status(response)$message))
|
||||
}
|
||||
|
||||
txt <- httr::content(response, "text", encoding = "UTF-8")
|
||||
|
||||
patt <- "RID = (\\w+)" # match the request id
|
||||
results$rid <- regmatches(txt, regexec(patt, txt))[[1]][2]
|
||||
|
||||
patt <- "RTOE = (\\d+)" # match the expected completion time
|
||||
results$rtoe <- as.numeric(regmatches(txt, regexec(patt, txt))[[1]][2])
|
||||
|
||||
# Now we wait ...
|
||||
if (quietly) {
|
||||
Sys.sleep(results$rtoe)
|
||||
} else {
|
||||
cat(sprintf("BLAST is processing %s:\n", results$rid))
|
||||
waitTimer(results$rtoe)
|
||||
}
|
||||
|
||||
} # done sending query and retrieving rid, rtoe
|
||||
|
||||
# Enter an infinite loop to check for result availability
|
||||
checkStatus <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
|
||||
"?",
|
||||
"CMD=Get",
|
||||
"&RID=", results$rid,
|
||||
"&FORMAT_TYPE=Text",
|
||||
"&FORMAT_OBJECT=SearchInfo",
|
||||
sep = "")
|
||||
|
||||
while (TRUE) {
|
||||
# Check whether the result is ready
|
||||
response <- httr::GET(checkStatus)
|
||||
if (httr::http_status(response)$category != "Success" ) {
|
||||
stop(sprintf("PANIC: Can't check status. BLAST server status error: %s",
|
||||
httr::http_status(response)$message))
|
||||
}
|
||||
|
||||
txt <- httr::content(response, "text", encoding = "UTF-8")
|
||||
|
||||
if (length(grep("Status=WAITING", txt)) > 0) {
|
||||
myTimeout <- myTimeout - EXTRAWAIT
|
||||
|
||||
if (myTimeout <= 0) { # abort
|
||||
cat("BLAST search not concluded before timeout. Aborting.\n")
|
||||
cat(sprintf("%s BLASThits <- BLAST(rid=\"%s\")\n",
|
||||
"Trying checking back later with >",
|
||||
results$rid))
|
||||
return(results)
|
||||
}
|
||||
|
||||
if (quietly) {
|
||||
Sys.sleep(EXTRAWAIT)
|
||||
} else {
|
||||
cat(sprintf("Status: Waiting. Wait %d more seconds (max. %d more)",
|
||||
EXTRAWAIT,
|
||||
myTimeout))
|
||||
waitTimer(EXTRAWAIT)
|
||||
next
|
||||
}
|
||||
|
||||
} else if (length(grep("Status=FAILED", txt)) > 0) {
|
||||
cat("BLAST search returned status \"FAILED\". Aborting.\n")
|
||||
return(results)
|
||||
|
||||
} else if (length(grep("Status=UNKNOWN", txt)) > 0) {
|
||||
cat("BLAST search returned status \"UNKNOWN\".\n")
|
||||
cat("This probably means the rid has expired. Aborting.\n")
|
||||
return(results)
|
||||
|
||||
} else if (length(grep("Status=READY", txt)) > 0) { # Done
|
||||
|
||||
if (length(grep("ThereAreHits=yes", txt)) == 0) { # No hits
|
||||
cat("BLAST search ready but no hits found. Aborting.\n")
|
||||
return(results)
|
||||
|
||||
} else {
|
||||
break # done ... retrieve search result
|
||||
}
|
||||
}
|
||||
} # end result-check loop
|
||||
|
||||
# retrieve results from BLAST server
|
||||
retrieve <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
|
||||
"?",
|
||||
"&CMD=Get",
|
||||
"&RID=", results$rid,
|
||||
"&FORMAT_TYPE=Text",
|
||||
sep = "")
|
||||
|
||||
response <- httr::GET(retrieve)
|
||||
if (httr::http_status(response)$category != "Success" ) {
|
||||
stop(sprintf("PANIC: Can't retrieve. BLAST server status error: %s",
|
||||
httr::http_status(response)$message))
|
||||
}
|
||||
|
||||
txt <- httr::content(response, "text", encoding = "UTF-8")
|
||||
|
||||
# txt contains the whole set of results. Process:
|
||||
|
||||
# First, we strsplit() on linebreaks:
|
||||
txt <- unlist(strsplit(txt, "\n"))
|
||||
|
||||
# The alignments range from the first line that begins with ">" ...
|
||||
iFirst <- grep("^>", txt)[1]
|
||||
|
||||
# ... to the last line that begins with "Sbjct"
|
||||
x <- grep("^Sbjct", txt)
|
||||
iLast <- x[length(x)]
|
||||
|
||||
# Get the alignments block
|
||||
txt <- txt[iFirst:iLast]
|
||||
|
||||
# Drop empty lines
|
||||
txt <- txt[!(nchar(txt) == 0)]
|
||||
|
||||
# A line that ends "]" but does not begin ">" seems to be a split
|
||||
# defline ... eg.
|
||||
# [1] ">XP_013349208.1 AUEXF2481DRAFT_695809 [Aureobasidium subglaciale "
|
||||
# [2] "EXF-2481]"
|
||||
# Merge these lines to the preceding lines and delete them.
|
||||
#
|
||||
x <- which(grepl("]$", txt) & !(grepl("^>", txt)))
|
||||
if (length(x) > 0) {
|
||||
txt[x-1] <- paste0(txt[x-1], txt[x])
|
||||
txt <- txt[-x]
|
||||
}
|
||||
|
||||
# Special case: there may be multiple deflines when the BLAST hit is to
|
||||
# redundant, identical sequences. Keep only the first instance.
|
||||
iKeep <- ! grepl("^>", txt)
|
||||
x <- rle(iKeep)
|
||||
x$positions <- cumsum(x$lengths)
|
||||
i <- which(x$lengths > 1 & x$values == FALSE)
|
||||
if (length(i) > 0) {
|
||||
firsts <- x$positions[i] - x$lengths[i] + 1
|
||||
iKeep[firsts] <- TRUE
|
||||
txt <- txt[iKeep]
|
||||
}
|
||||
|
||||
# After this preprocessing the following should be true:
|
||||
# - Every alignment block begins with a defline in which the
|
||||
# first character is ">"
|
||||
# - There is only one defline in each block.
|
||||
# - Lines are not split.
|
||||
|
||||
# Make a dataframe of first and last indices of alignment blocks
|
||||
x <- grep("^>", txt)
|
||||
blocks <- data.frame(iFirst = x,
|
||||
iLast = c((x[-1] - 1), length(txt)))
|
||||
|
||||
# Build the hits list by parsing the blocks
|
||||
results$hits <- list()
|
||||
|
||||
for (i in seq_len(nrow(blocks))) {
|
||||
thisBlock <- txt[blocks$iFirst[i]:blocks$iLast[i]]
|
||||
results$hits[[i]] <- parseBLASTalignment(thisBlock)
|
||||
}
|
||||
|
||||
return(results)
|
||||
}
|
||||
|
||||
parseBLASTalignment <- function(hit) {
|
||||
# Parse data from a character vector containing a BLAST hit
|
||||
# Parameters:
|
||||
# hit char one BLAST hit as char vector
|
||||
# Value:
|
||||
# list $def chr defline
|
||||
# $accession chr accession number
|
||||
# $organism chr complete organism definition
|
||||
# $species chr binomial species
|
||||
# $E num E value
|
||||
# $lengthAli num length of the alignment
|
||||
# $nIdentities num number of identities
|
||||
# $nGaps num number of gaps
|
||||
# $Qbounds num 2-element vector of query start-end
|
||||
# $Sbounds num 2-element vector of subject start-end
|
||||
# $Qseq chr query sequence
|
||||
# $midSeq chr midline string
|
||||
# $Sseq chr subject sequence
|
||||
|
||||
getToken <- function(patt, v) {
|
||||
# get the first token identified by pattern patt in character vector v
|
||||
v <- v[grep(patt, v)]
|
||||
if (length(v) > 1) { v <- v[1] }
|
||||
if (length(v) == 0) { token <- NA
|
||||
} else {
|
||||
token <- regmatches(v, regexec(patt, v))[[1]][2] }
|
||||
return(token)
|
||||
}
|
||||
|
||||
h <- list()
|
||||
|
||||
# FASTA defline
|
||||
h$def <- hit[1]
|
||||
|
||||
# accession number (ID), use the first if there are several, separated by "|"
|
||||
patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
|
||||
h$accession <- regmatches(h$def, regexec(patt, h$def))[[1]][2]
|
||||
|
||||
# organism
|
||||
patt <- "\\[(.+)]"
|
||||
h$organism <- regmatches(h$def, regexec(patt, h$def))[[1]][2]
|
||||
|
||||
# species
|
||||
x <- unlist(strsplit(h$organism, "\\s+"))
|
||||
if (length(x) >= 2) {
|
||||
h$species <- paste(x[1], x[2])
|
||||
} else if (length(x) == 1) {
|
||||
h$species <- paste(x[1], "sp.")
|
||||
} else {
|
||||
h$species <- NA
|
||||
}
|
||||
|
||||
# E-value
|
||||
h$E <- as.numeric(getToken("Expect\\s*=(.+?), Method", hit))
|
||||
|
||||
# length of alignment
|
||||
h$lengthAli <- as.numeric(getToken("^\\s*Length\\s*=(.+)$", hit))
|
||||
|
||||
# number of identities
|
||||
h$nIdentities <- as.numeric(getToken("^\\s*Identities\\s*=(.+?)/", hit))
|
||||
|
||||
# number of gaps
|
||||
h$nGaps <- as.numeric(getToken("\\s*Gaps\\s*=(.+?)/", hit))
|
||||
|
||||
# split up alignment section
|
||||
idx <- grep("^Query ", hit)
|
||||
Que <- hit[idx]
|
||||
Mid <- hit[idx + 1]
|
||||
Sbj <- hit[idx + 2]
|
||||
|
||||
# first and last positions
|
||||
h$Qbounds <- c(start = 0, end = 0)
|
||||
h$Qbounds[1] <- as.numeric(getToken("^Query\\s*(\\d+)", Que[1]))
|
||||
h$Qbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Que[length(Que)]))
|
||||
|
||||
h$Sbounds <- c(start = 0, end = 0)
|
||||
h$Sbounds[1] <- as.numeric(getToken("^Sbjct\\s*(\\d+)", Sbj[1]))
|
||||
h$Sbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Sbj[length(Sbj)]))
|
||||
|
||||
# aligned sequences
|
||||
for (i in seq_along(Que)) {
|
||||
patt <- ("^\\s*Query\\s*\\d+\\s*([A-Za-z-]+)") # capture aligned string
|
||||
m <- regexec(patt, Que[i])
|
||||
iFirst <- m[[1]][2]
|
||||
iLast <- iFirst + attr(m[[1]], which = "match.length")[2] - 1
|
||||
Que[i] <- substring(Que[i], iFirst, iLast)
|
||||
Mid[i] <- substring(Mid[i], iFirst, iLast)
|
||||
Sbj[i] <- substring(Sbj[i], iFirst, iLast)
|
||||
}
|
||||
|
||||
h$Qseq <- paste0(Que, collapse = "")
|
||||
h$midSeq <- paste0(Mid, collapse = "")
|
||||
h$Sseq <- paste0(Sbj, collapse = "")
|
||||
|
||||
return(h)
|
||||
}
|
||||
|
||||
|
||||
# ==== TESTS ===================================================================
|
||||
|
||||
if (FALSE) {
|
||||
# define query:
|
||||
q <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain
|
||||
"LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
|
||||
"GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
|
||||
sep="")
|
||||
# or ...
|
||||
q <- "NP_010227" # refseq ID
|
||||
|
||||
test <- BLAST(q,
|
||||
nHits = 100,
|
||||
E = 0.001,
|
||||
rid = "",
|
||||
limits = "txid4751[ORGN]") # Fungi
|
||||
str(test)
|
||||
length(test$hits)
|
||||
}
|
||||
|
||||
# [END]
|
||||
|
||||
|
@ -1,32 +1,32 @@
|
||||
# test_biCode.R
|
||||
#
|
||||
|
||||
context("biCode() utility function tests") # A set of tests for some
|
||||
# functionality
|
||||
|
||||
test_that("expected input is processed correctly", { # Related expectations
|
||||
expect_equal(biCode("homo sapiens"), "HOMSA")
|
||||
expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA")
|
||||
expect_equal(biCode(c("Phascolarctos cinereus", "Macropus rufus")),
|
||||
c("PHACI", "MACRU"))
|
||||
})
|
||||
|
||||
test_that("unexpected input is managed", {
|
||||
expect_equal(biCode(""), ".....")
|
||||
expect_equal(biCode(" "), ".....")
|
||||
expect_equal(biCode("123 12"), ".....")
|
||||
expect_equal(biCode("h sapiens"), "H..SA")
|
||||
})
|
||||
|
||||
test_that("NA values are preserved", {
|
||||
expect_true(is.na((biCode(NA))))
|
||||
expect_equal(biCode(c("first", NA, "last")),
|
||||
c("FIRST", NA, "LAST."))
|
||||
})
|
||||
|
||||
test_that("Missing argument throws an error", {
|
||||
expect_error(biCode(), "argument \"s\" is missing, with no default")
|
||||
})
|
||||
|
||||
|
||||
# [END]
|
||||
# test_biCode.R
|
||||
#
|
||||
|
||||
context("biCode() utility function tests") # A set of tests for some
|
||||
# functionality
|
||||
|
||||
test_that("expected input is processed correctly", { # Related expectations
|
||||
expect_equal(biCode("homo sapiens"), "HOMSA")
|
||||
expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA")
|
||||
expect_equal(biCode(c("Phascolarctos cinereus", "Macropus rufus")),
|
||||
c("PHACI", "MACRU"))
|
||||
})
|
||||
|
||||
test_that("unexpected input is managed", {
|
||||
expect_equal(biCode(""), ".....")
|
||||
expect_equal(biCode(" "), ".....")
|
||||
expect_equal(biCode("123 12"), ".....")
|
||||
expect_equal(biCode("h sapiens"), "H..SA")
|
||||
})
|
||||
|
||||
test_that("NA values are preserved", {
|
||||
expect_true(is.na((biCode(NA))))
|
||||
expect_equal(biCode(c("first", NA, "last")),
|
||||
c("FIRST", NA, "LAST."))
|
||||
})
|
||||
|
||||
test_that("Missing argument throws an error", {
|
||||
expect_error(biCode(), "argument \"s\" is missing, with no default")
|
||||
})
|
||||
|
||||
|
||||
# [END]
|
||||
|
Loading…
x
Reference in New Issue
Block a user