Line termination change and old code.

This commit is contained in:
Harrison Deng 2021-11-16 00:31:48 -05:00
parent b1e00f52f7
commit affe00f6fb
86 changed files with 37873 additions and 37876 deletions

258
.Rprofile
View File

@ -1,129 +1,129 @@
# .Rprofile
#
# This script is automatically executed on startup
# ==============================================================================
init <- function() {
  # First-run helper: seeds a personal scratch script from the template (when
  # the template exists and the script does not), prints the profile setup
  # instructions, and opens "ABC-units.R" in the editor.
  # Returns NULL invisibly.
  scriptMissing <- ! file.exists("myScript.R")
  if (scriptMissing && file.exists(".tmp.R")) {
    file.copy(".tmp.R", "myScript.R")
    cat("A new file \"myScript.R\" was created. You can use it for\n",
        "notes and code experiments.\n\n",
        sep = "")
  }
  # sep = "" keeps the pieces glued together exactly as written.
  cat("\n\n",
      "Please open the file \".myProfile.R\" (click on the file-name in the\n",
      "\"files\" pane), edit it and save it.\n",
      "Then click the checkbox, and use the More -> Move... dialogue\n",
      "to move it into the \"myScripts\" folder.\n\n",
      sep = "")
  file.edit("ABC-units.R")
  invisible(NULL)
}
if (! file.exists("./myScripts/.myProfile.R")) {
  # No local profile yet: print the welcome banner that points to init().
  cat("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")
  cat(" =================")
  cat("\n\n")
  cat(" WELCOME !\n")
  cat("\n")
  cat(" Type 'init()' to begin\n\n")
  cat("\n")
  cat(" =================")
  cat("\n\n")
} else { # local profile exists ... validate state:
  cat("\n\nLoading local functions ...")
  source(".utilities.R") # local profile appears sane, source utilities
  source("./myScripts/.myProfile.R")
  if (! exists("myEMail")) { # ... has eMail been defined?
    cat("ERROR !\n")
    cat("=======\n")
    cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
    cat("the variable \"myEMail\" was not loaded.\n")
    cat("Please contact your instructor to continue.\n\n")
  }
  if (! exists("myStudentNumber")) { # ... has the Student Number been defined?
    cat("ERROR !\n")
    cat("=======\n")
    cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
    cat("the variable \"myStudentNumber\" was not loaded.\n")
    cat("Please contact your instructor to continue.\n\n")
  } else {
    # Everything in this branch reads myStudentNumber, so it only runs when
    # the variable exists. Otherwise startup would abort with an
    # "object not found" error right after the message above was printed.
    #
    # The alternation is grouped so that ^ and $ anchor BOTH alternatives:
    # "100" or "99", followed by exactly seven more characters.
    if (! grepl("^(100.{7}|99.{7})$", as.character(myStudentNumber))) {
      cat("ERROR !\n") # is the Student Number valid?
      cat("=======\n")
      cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
      cat("your Student Number could not be validated.\n")
      cat("Please examine the file \"./myScripts/.myProfile.R\"\n")
      cat(" and fix the problem or contact your instructor to continue.\n\n")
    }
    if (! exists("MYSPE")) { # if MYSPE has not yet been defined, define it now
      # ... and write it into the profile.
      prf <- readLines("./myScripts/.myProfile.R")
      iNumber <- grep("^\\s*myStudentNumber\\s*<-", prf)
      # Insert after the (last) myStudentNumber assignment. If no such line
      # is found, append at the end of the file instead of failing.
      iNumber <- if (length(iNumber) > 0) max(iNumber) else length(prf)
      # Compute the value once and reuse it for the file, the message and the
      # session. NOTE(review): assumes getMYSPE() (from .utilities.R) returns
      # the same value for the same student number - confirm.
      MYSPE <- getMYSPE(myStudentNumber)
      # append() also handles the case where the assignment is the last line.
      prf <- append(prf, sprintf("MYSPE <- \"%s\" ", MYSPE), after = iNumber)
      writeLines(prf, "./myScripts/.myProfile.R")
      cat("\n")
      cat(sprintf("MYSPE (%s) was added to \"./myScripts/.myProfile.R\"\n\n",
                  MYSPE))
      rm(prf, iNumber) # cleanup
    }
  }
  cat("... done.\n\n")
}
# Warn if data frames would still convert strings to factors by default.
# default.stringsAsFactors() is deprecated in R 4.x (and not usable in recent
# releases), so the option is read directly. When the option is unset, the
# fallback mirrors the built-in default: TRUE before R 4.0.0, FALSE afterwards.
if (isTRUE(getOption("stringsAsFactors",
                     default = getRversion() < "4.0.0"))) {
  cat("WARNING.\n")
  cat("========\n")
  cat("Your default \"stringsAsFactors\" parameter is set to \"TRUE\".\n")
  cat("This will break some of the code.\n")
  cat("Please contact your instructor to troubleshoot and fix this issue.\n")
  cat("\n")
}
# errText: named list of multi-line, user-facing error messages.
#   noProfileFile   - the profile file "./myScripts/.myProfile.R" is missing
#   noStudentNumber - the student number was not defined by the profile
errText <- list()
errText[["noProfileFile"]] <- '
Your PROFILE FILE does not exist. This problem must be fixed to continue.
The code expects the file "./myScripts/.myProfile.R" to exist and to
contain your correct eMail address and student number. Detailed
instructions were given when you first ran the init() command.
Try running init() again and follow the instructions. Reload your RStudio
session and start over with this file.
If this does not fix the problem, ask for help.
'
errText[["noStudentNumber"]] <- '
Your STUDENT NUMBER has not been defined. This problem must be fixed to continue.
The code expects the file "./myScripts/.myProfile.R" to exist and to
contain your correct eMail address and student number. This file gets
sourced when you start a new R-session, but since you see this error
message there was a problem.
Perhaps you need to restart your R-session. Try closing the RStudio
project and reopening it from the File > Recent Projects menu.
Perhaps there was a syntax error in your file. Then not all the
instructions in the file are executed. Check the file: is your
email perhaps not defined? Or did you type it without quotation
marks?
Try fixing problems, and then restart R as described above.
If none of this fixes the problem, ask for help.
'
# [END]
# .Rprofile
#
# This script is automatically executed on startup
# ==============================================================================
init <- function() {
  # First-run helper: seeds a personal scratch script from the template (when
  # the template exists and the script does not), prints the profile setup
  # instructions, and opens "ABC-units.R" in the editor.
  # Returns NULL invisibly.
  scriptMissing <- ! file.exists("myScript.R")
  if (scriptMissing && file.exists(".tmp.R")) {
    file.copy(".tmp.R", "myScript.R")
    cat("A new file \"myScript.R\" was created. You can use it for\n",
        "notes and code experiments.\n\n",
        sep = "")
  }
  # sep = "" keeps the pieces glued together exactly as written.
  cat("\n\n",
      "Please open the file \".myProfile.R\" (click on the file-name in the\n",
      "\"files\" pane), edit it and save it.\n",
      "Then click the checkbox, and use the More -> Move... dialogue\n",
      "to move it into the \"myScripts\" folder.\n\n",
      sep = "")
  file.edit("ABC-units.R")
  invisible(NULL)
}
if (! file.exists("./myScripts/.myProfile.R")) {
  # No local profile yet: print the welcome banner that points to init().
  cat("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")
  cat(" =================")
  cat("\n\n")
  cat(" WELCOME !\n")
  cat("\n")
  cat(" Type 'init()' to begin\n\n")
  cat("\n")
  cat(" =================")
  cat("\n\n")
} else { # local profile exists ... validate state:
  cat("\n\nLoading local functions ...")
  source(".utilities.R") # local profile appears sane, source utilities
  source("./myScripts/.myProfile.R")
  if (! exists("myEMail")) { # ... has eMail been defined?
    cat("ERROR !\n")
    cat("=======\n")
    cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
    cat("the variable \"myEMail\" was not loaded.\n")
    cat("Please contact your instructor to continue.\n\n")
  }
  if (! exists("myStudentNumber")) { # ... has the Student Number been defined?
    cat("ERROR !\n")
    cat("=======\n")
    cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
    cat("the variable \"myStudentNumber\" was not loaded.\n")
    cat("Please contact your instructor to continue.\n\n")
  } else {
    # Everything in this branch reads myStudentNumber, so it only runs when
    # the variable exists. Otherwise startup would abort with an
    # "object not found" error right after the message above was printed.
    #
    # The alternation is grouped so that ^ and $ anchor BOTH alternatives:
    # "100" or "99", followed by exactly seven more characters.
    if (! grepl("^(100.{7}|99.{7})$", as.character(myStudentNumber))) {
      cat("ERROR !\n") # is the Student Number valid?
      cat("=======\n")
      cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
      cat("your Student Number could not be validated.\n")
      cat("Please examine the file \"./myScripts/.myProfile.R\"\n")
      cat(" and fix the problem or contact your instructor to continue.\n\n")
    }
    if (! exists("MYSPE")) { # if MYSPE has not yet been defined, define it now
      # ... and write it into the profile.
      prf <- readLines("./myScripts/.myProfile.R")
      iNumber <- grep("^\\s*myStudentNumber\\s*<-", prf)
      # Insert after the (last) myStudentNumber assignment. If no such line
      # is found, append at the end of the file instead of failing.
      iNumber <- if (length(iNumber) > 0) max(iNumber) else length(prf)
      # Compute the value once and reuse it for the file, the message and the
      # session. NOTE(review): assumes getMYSPE() (from .utilities.R) returns
      # the same value for the same student number - confirm.
      MYSPE <- getMYSPE(myStudentNumber)
      # append() also handles the case where the assignment is the last line.
      prf <- append(prf, sprintf("MYSPE <- \"%s\" ", MYSPE), after = iNumber)
      writeLines(prf, "./myScripts/.myProfile.R")
      cat("\n")
      cat(sprintf("MYSPE (%s) was added to \"./myScripts/.myProfile.R\"\n\n",
                  MYSPE))
      rm(prf, iNumber) # cleanup
    }
  }
  cat("... done.\n\n")
}
# Warn if data frames would still convert strings to factors by default.
# default.stringsAsFactors() is deprecated in R 4.x (and not usable in recent
# releases), so the option is read directly. When the option is unset, the
# fallback mirrors the built-in default: TRUE before R 4.0.0, FALSE afterwards.
if (isTRUE(getOption("stringsAsFactors",
                     default = getRversion() < "4.0.0"))) {
  cat("WARNING.\n")
  cat("========\n")
  cat("Your default \"stringsAsFactors\" parameter is set to \"TRUE\".\n")
  cat("This will break some of the code.\n")
  cat("Please contact your instructor to troubleshoot and fix this issue.\n")
  cat("\n")
}
# errText: named list of multi-line, user-facing error messages.
#   noProfileFile   - the profile file "./myScripts/.myProfile.R" is missing
#   noStudentNumber - the student number was not defined by the profile
errText <- list()
errText[["noProfileFile"]] <- '
Your PROFILE FILE does not exist. This problem must be fixed to continue.
The code expects the file "./myScripts/.myProfile.R" to exist and to
contain your correct eMail address and student number. Detailed
instructions were given when you first ran the init() command.
Try running init() again and follow the instructions. Reload your RStudio
session and start over with this file.
If this does not fix the problem, ask for help.
'
errText[["noStudentNumber"]] <- '
Your STUDENT NUMBER has not been defined. This problem must be fixed to continue.
The code expects the file "./myScripts/.myProfile.R" to exist and to
contain your correct eMail address and student number. This file gets
sourced when you start a new R-session, but since you see this error
message there was a problem.
Perhaps you need to restart your R-session. Try closing the RStudio
project and reopening it from the File > Recent Projects menu.
Perhaps there was a syntax error in your file. Then not all the
instructions in the file are executed. Check the file: is your
email perhaps not defined? Or did you type it without quotation
marks?
Try fixing problems, and then restart R as described above.
If none of this fixes the problem, ask for help.
'
# [END]

88
.gitignore vendored
View File

@ -1,44 +1,44 @@
# Miscellaneous
.Ds_store
instructor/
dev/
# myScripts/ # We don't want to ignore this so we can save our work to our own fork.
# History files
.Rhistory
.Rapp.history
# Session Data files
# .RData
# Files produced in assignments
data/APSESphyloSet.mfa
data/APSEStreeRproml.rds
# Example code in package build process
*-Ex.R
# Output files from R CMD build
/*.tar.gz
# Output files from R CMD check
/*.Rcheck/
# RStudio files
.Rproj.user/
# produced vignettes
vignettes/*.html
vignettes/*.pdf
# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
.httr-oauth
# knitr and R markdown default cache directories
/*_cache/
/cache/
# Temporary files created by R markdown
*.utf8.md
*.knit.md
.Rproj.user
# Miscellaneous
.Ds_store
instructor/
dev/
# myScripts/ # We don't want to ignore this so we can save our work to our own fork.
# History files
.Rhistory
.Rapp.history
# Session Data files
# .RData
# Files produced in assignments
data/APSESphyloSet.mfa
data/APSEStreeRproml.rds
# Example code in package build process
*-Ex.R
# Output files from R CMD build
/*.tar.gz
# Output files from R CMD check
/*.Rcheck/
# RStudio files
.Rproj.user/
# produced vignettes
vignettes/*.html
vignettes/*.pdf
# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
.httr-oauth
# knitr and R markdown default cache directories
/*_cache/
/cache/
# Temporary files created by R markdown
*.utf8.md
*.knit.md
.Rproj.user

76
.tmp.R
View File

@ -1,38 +1,38 @@
# myScript.R
#
# --- As you work with this file, you can delete the instructions below --------
# Write your notes and code experiments into this document. Save it
# from time to time - however I recommend that you do not _commit_
# your saved version.
#
# As long as you do not _commit_ this script to version control,
# you can _pull_ updated versions of the entire project from GitHub
# by using the RStudio version control interface. However, once
# you _commit_ any file in your local version, RStudio will require
# you to resolve conflicts before you can _pull_ updates.
# --- As you work with this file, you can delete the instructions above --------
#
## Purpose: <...>
#
# Version: <...>
#
# Date: <...>
# Author: <Name> (<namee@mail.utoronto.ca>)
#
# Versions:
#
# <number> <Features>
#
# TODO:
# <...>
#
# ====================================================================
# [END]
# myScript.R
#
# --- As you work with this file, you can delete the instructions below --------
# Write your notes and code experiments into this document. Save it
# from time to time - however I recommend that you do not _commit_
# your saved version.
#
# As long as you do not _commit_ this script to version control,
# you can _pull_ updated versions of the entire project from GitHub
# by using the RStudio version control interface. However, once
# you _commit_ any file in your local version, RStudio will require
# you to resolve conflicts before you can _pull_ updates.
# --- As you work with this file, you can delete the instructions above --------
#
## Purpose: <...>
#
# Version: <...>
#
# Date: <...>
# Author: <Name> (<namee@mail.utoronto.ca>)
#
# Versions:
#
# <number> <Features>
#
# TODO:
# <...>
#
# ====================================================================
# [END]

File diff suppressed because it is too large Load Diff

View File

@ -1,257 +1,257 @@
# 2021-10-12_In-Class_exploration.R
#
# ===== T H E E V E N B E T T E R A M I N O A C I D =====
#
# Code and comments for BCH441 in-class exploration, Tuesday, 2021-10-12
# Explorers: Jocelyn Nurtanto, Yuzi Li, and Jerry Gu
# Scribe: boris.steipe@utoronto.ca
#
# ==============================================================================
#
# In our last session we explored some properties of amino acids and noted that
# we can arrange them in a scatter-plot according to some properties. But can
# we also arrange them according to generic properties, i.e. taking all
# published property scales into account? We will try to use all tables from
# the seqinr package.
# First we load the package - this makes all datasets immediately available and
# we don't have to load them one by one.
library(seqinr)
# Determine what datasets are available
#
# Using "find in topic" ... "amino acid"
data(aacost)
data(aaindex)
data(pK)
# We note that datasets may be sorted in different ways: for example
# alphabetically by one letter code (A, C, D, E, ...) or three-letter code (Ala,
# Arg, Asn, Asp, ...) - this means we need to ensure and validate that amino
# acids are sorted in the same way.
# Build a datastructure ...
# rows: amino acids
# columns: properties
# Are all lists in aaindex organized in the same way?
# Are all lists in aaindex organized in the same way? Take the names of the
# first index as the reference and compare every index against them.
refNames <- names(aaindex[[1]]$I) # names of the first list item's I-vector
# Loop over each list in aaindex. seq_along() is safe even for an empty list,
# where 1:length() would iterate over c(1, 0).
for (i in seq_along(aaindex)) {
  # get the names of the I-vector
  x <- names(aaindex[[i]]$I)
  # compare with the names of our reference list
  # the == and != operators are vectorized. Applying them to two vectors
  # gives TRUE or FALSE for each pair of elements. any() or all() can be
  # applied to logical vectors to analyse them and return a single result.
  # if (...) conditions evaluate only a single value and will complain if
  # there is more than one.
  # The length test comes first: with missing names or a different number of
  # names, != would recycle (or compare nothing) and the mismatch could go
  # unreported.
  if (length(x) != length(refNames) || any(x != refNames)) {
    # There was at least one not-equal pair - so: complain
    print(sprintf("Problem in list %d: names don't match", i))
  }
}
# If we get here without identifying problems, it means all pairs of
# rownames match throughout the aaindex list.
# Next: what is the correct syntax to add one vector (the "I" vector of
# one of the list elements) to our dataframe?
# Build the data frame aaData column by column: one column per entry of
# aaindex, plus one final column taken from aacost.
aaData <- as.data.frame(aaindex[[1]]$I) # Make a dataframe from the first index
aaData[,2] <- aaindex[[2]]$I # ... add the second index
str(aaData) # Confirm: we now have a two-column dataframe
# Next: add the rest ...
for (i in 3:length(aaindex)) {
# get the I-vector and write it into our dataframe
aaData[,i] <- aaindex[[i]]$I
}
# Sanity check
plot(aaData[,37], aaData[,544]) # plot two arbitrary indices against each other
# Looks good.
# We finished building our data structure ... but let's add the aacost table
# aacost is ordered differently:
rownames(aaData)
aacost[ , 1]
# using order(), applied to aacost - ordering the column with column-name
# "aaa"
sel <- order(aacost[ , "aaa"]) # alphabetic ordering of three-letter codes
aacost[sel, "aaa"] # applying the order vector sorts the column
# Is this the same order as refNames? (printed for visual inspection only;
# nothing stops the script if an element is FALSE)
refNames == aacost[sel, "aaa"] # Yes!
# add the data from column "tot" (i.e. total metabolic cost) after the
# last column of aaData
aaData[ , length(aaindex) + 1] <- aacost[sel, "tot"]
# Done.
str(aaData) # A dataframe with 20 rows and 545 columns
# To answer the question "Which amino acids are similar to each other?" we
# need to reduce this 545-dimensional dataset to fewer dimensions, otherwise
# we will succumb to the "Curse of Dimensionality":
#
# "in high dimensional data, however, all objects appear
# to be sparse and dissimilar in many ways..."
# https://en.wikipedia.org/wiki/Curse_of_dimensionality
#
# A classic way to do this is Principal Component Analysis (PCA) ...
# (Principal components analysis)
#
# PCA expects objects in columns, properties in rows. Therefore we need to
# transpose our dataset:
aaPCA <- prcomp(t(aaData))
# This creates an error, because some of our indices contain NA values!
# Which indices are these?
# We create a vector "sel" for which we check whether any element in each
# column is NA, and write FALSE if we encounter an NA, TRUE otherwise. We can
# then use this vector to subset our dataframe.
# Build the column mask in one vectorized step instead of growing a vector
# inside a loop: is.na() on a data frame gives a logical matrix, colSums()
# counts the NA values per column, and a count of zero means "keep".
# unname() drops the column names so sel is a plain logical vector.
sel <- unname(colSums(is.na(aaData)) == 0)
# Done. sel now subsets only the NA-free columns
# Use ncol() rather than a hard-coded column count so the number stays right
# if the data set changes.
ncol(aaData) - sum(sel) # 13 columns excluded
# Do the PCA ... use the prcomp() function
# Run the PCA twice - first unscaled, then with scale. = TRUE - and each time
# plot the amino acid names at the coordinates given by the first two columns
# of the rotation matrix. Because the data is transposed before the call, the
# rows of aaPCA$rotation carry the row names of aaData.
aaPCA <- prcomp(t(aaData[ ,sel])) # PCA of the transposed, selected data set
str(aaPCA) # structure of the result
plot(aaPCA) # plot the contributions of the
# components to the variance
plot(aaPCA$rotation[ , 1], # plot the first PC against the second PC
aaPCA$rotation[ , 2], # in a scatterplot, in an empty frame
type ="n") # just to set up the coordinate system
text(aaPCA$rotation[ , 1], # plot the names of the amino acids into
aaPCA$rotation[ , 2], # their respective (PC1, PC2) positions
labels = rownames(aaPCA$rotation))
# PCA results are sensitive to the absolute numeric value of the features that
# we are comparing. The prcomp() function has an option scale. = TRUE that
# scales each row of features so that the variance of the value is 1.0. This
# ensures that each feature is given approximately equal weight.
# NOTE(review): prcomp() scales the variables (columns) of the matrix it
# receives, i.e. the columns of t(aaData[ ,sel]) - confirm that this is the
# intended scaling direction.
aaPCA <- prcomp(t(aaData[ ,sel]), scale. = TRUE)
plot(aaPCA)
plot(aaPCA$rotation[ , 1],
aaPCA$rotation[ , 2],
type ="n")
text(aaPCA$rotation[ , 1],
aaPCA$rotation[ , 2],
labels = rownames(aaPCA$rotation))
# Next we try to identify what the PCs correspond to. We see whether there are
# specific features that are highly correlated with the PCs
# ==== Rotation 1 ===================
#
(PC1 <- aaPCA$rotation[ , 1]) # first column of the rotation matrix (printed)
# cor() computes Pearson correlation coefficients by default
cor(PC1, aaData[ , 37]) # e.g. correlate PC1 against index 37
# Correlate PC1 with every column of aaData in one pass: vapply() visits each
# column of the data frame and collects exactly one number per column.
cors <- vapply(aaData,
               function(idx) cor(PC1, idx),
               numeric(1),
               USE.NAMES = FALSE)
summary(cors)
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
# -0.54072 -0.13703 0.05654 0.03729 0.21349 0.59589 13
#
# The max correlation is ~0.6. That is not very high. Which index is it?
which(cors == max(cors, na.rm = TRUE))
aaindex[[504]] # Linker propensity ???
cor(PC1, aaindex[[504]]$I) # Did we get the right index?
# Plot this ... (the argument expressions are kept as they are because plot()
# derives its default axis labels from them)
plot(aaPCA$rotation[ , 1],
     aaindex[[504]]$I,
     type ="n")
text(aaPCA$rotation[ , 1],
     aaindex[[504]]$I,
     labels = rownames(aaPCA$rotation))
# This is essentially a random correlation but for Cysteine ...
# ==== Rotation 2 ===================
#
# same process
PC2 <- aaPCA$rotation[ , 2] # second column of the rotation matrix
# Correlate PC2 with every column of aaData; one value per column.
cors2 <- vapply(aaData,
                function(idx) cor(PC2, idx),
                numeric(1),
                USE.NAMES = FALSE)
summary(cors2)
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
# -0.95214 -0.56067 -0.12817 -0.05787 0.43046 0.94346 13
# Here we have quite strong correlations
which(cors2 == max(cors2, na.rm = TRUE))
aaindex[[148]]
# this index itself is correlated with many other indices
cor(PC2, aaindex[[148]]$I) # confirm that we have the right index
# Plot this too... (argument expressions kept as they are because plot()
# derives its default axis labels from them)
plot(aaPCA$rotation[ , 2],
     aaindex[[148]]$I,
     type ="n")
text(aaPCA$rotation[ , 2],
     aaindex[[148]]$I,
     labels = rownames(aaPCA$rotation))
# This correlates well with hydrophobicity measures. In this case the
# PC is to a certain degree interpretable - but this is not always the case
# with PCA (see the example of the first PC).
# [END]
# 2021-10-12_In-Class_exploration.R
#
# ===== T H E E V E N B E T T E R A M I N O A C I D =====
#
# Code and comments for BCH441 in-class exploration, Tuesday, 2021-10-12
# Explorers: Jocelyn Nurtanto, Yuzi Li, and Jerry Gu
# Scribe: boris.steipe@utoronto.ca
#
# ==============================================================================
#
# In our last session we explored some properties of amino acids and noted that
# we can arrange them in a scatter-plot according to some properties. But can
# we also arrange them according to generic properties, i.e. taking all
# published property scales into account? We will try to use all tables from
# the seqinr package.
# First we load the package - this makes all datasets immediately available and
# we don't have to load them one by one.
library(seqinr)
# Determine what datasets are available
#
# Using "find in topic" ... "amino acid"
data(aacost)
data(aaindex)
data(pK)
# We note that datasets may be sorted in different ways: for example
# alphabetically by one letter code (A, C, D, E, ...) or three-letter code (Ala,
# Arg, Asn, Asp, ...) - this means we need to ensure and validate that amino
# acids are sorted in the same way.
# Build a datastructure ...
# rows: amino acids
# columns: properties
# Are all lists in aaindex organized in the same way?
# Are all lists in aaindex organized in the same way? Take the names of the
# first index as the reference and compare every index against them.
refNames <- names(aaindex[[1]]$I) # names of the first list item's I-vector
# Loop over each list in aaindex. seq_along() is safe even for an empty list,
# where 1:length() would iterate over c(1, 0).
for (i in seq_along(aaindex)) {
  # get the names of the I-vector
  x <- names(aaindex[[i]]$I)
  # compare with the names of our reference list
  # the == and != operators are vectorized. Applying them to two vectors
  # gives TRUE or FALSE for each pair of elements. any() or all() can be
  # applied to logical vectors to analyse them and return a single result.
  # if (...) conditions evaluate only a single value and will complain if
  # there is more than one.
  # The length test comes first: with missing names or a different number of
  # names, != would recycle (or compare nothing) and the mismatch could go
  # unreported.
  if (length(x) != length(refNames) || any(x != refNames)) {
    # There was at least one not-equal pair - so: complain
    print(sprintf("Problem in list %d: names don't match", i))
  }
}
# If we get here without identifying problems, it means all pairs of
# rownames match throughout the aaindex list.
# Next: what is the correct syntax to add one vector (the "I" vector of
# one of the list elements) to our dataframe?
# Build the data frame aaData column by column: one column per entry of
# aaindex, plus one final column taken from aacost.
aaData <- as.data.frame(aaindex[[1]]$I) # Make a dataframe from the first index
aaData[,2] <- aaindex[[2]]$I # ... add the second index
str(aaData) # Confirm: we now have a two-column dataframe
# Next: add the rest ...
for (i in 3:length(aaindex)) {
# get the I-vector and write it into our dataframe
aaData[,i] <- aaindex[[i]]$I
}
# Sanity check
plot(aaData[,37], aaData[,544]) # plot two arbitrary indices against each other
# Looks good.
# We finished building our data structure ... but let's add the aacost table
# aacost is ordered differently:
rownames(aaData)
aacost[ , 1]
# using order(), applied to aacost - ordering the column with column-name
# "aaa"
sel <- order(aacost[ , "aaa"]) # alphabetic ordering of three-letter codes
aacost[sel, "aaa"] # applying the order vector sorts the column
# Is this the same order as refNames? (printed for visual inspection only;
# nothing stops the script if an element is FALSE)
refNames == aacost[sel, "aaa"] # Yes!
# add the data from column "tot" (i.e. total metabolic cost) after the
# last column of aaData
aaData[ , length(aaindex) + 1] <- aacost[sel, "tot"]
# Done.
str(aaData) # A dataframe with 20 rows and 545 columns
# To answer the question "Which amino acids are similar to each other?" we
# need to reduce this 545-dimensional dataset to fewer dimensions, otherwise
# we will succumb to the "Curse of Dimensionality":
#
# "in high dimensional data, however, all objects appear
# to be sparse and dissimilar in many ways..."
# https://en.wikipedia.org/wiki/Curse_of_dimensionality
#
# A classic way to do this is Principal Component Analysis (PCA) ...
# (Principal components analysis)
#
# PCA expects objects in columns, properties in rows. Therefore we need to
# transpose our dataset:
aaPCA <- prcomp(t(aaData))
# This creates an error, because some of our indices contain NA values!
# Which indices are these?
# We create a vector "sel" for which we check whether any element in each
# column is NA, and write FALSE if we encounter an NA, TRUE otherwise. We can
# then use this vector to subset our dataframe.
# Build the column mask in one vectorized step instead of growing a vector
# inside a loop: is.na() on a data frame gives a logical matrix, colSums()
# counts the NA values per column, and a count of zero means "keep".
# unname() drops the column names so sel is a plain logical vector.
sel <- unname(colSums(is.na(aaData)) == 0)
# Done. sel now subsets only the NA-free columns
# Use ncol() rather than a hard-coded column count so the number stays right
# if the data set changes.
ncol(aaData) - sum(sel) # 13 columns excluded
# Do the PCA ... use the prcomp() function
# Run the PCA twice - first unscaled, then with scale. = TRUE - and each time
# plot the amino acid names at the coordinates given by the first two columns
# of the rotation matrix. Because the data is transposed before the call, the
# rows of aaPCA$rotation carry the row names of aaData.
aaPCA <- prcomp(t(aaData[ ,sel])) # PCA of the transposed, selected data set
str(aaPCA) # structure of the result
plot(aaPCA) # plot the contributions of the
# components to the variance
plot(aaPCA$rotation[ , 1], # plot the first PC against the second PC
aaPCA$rotation[ , 2], # in a scatterplot, in an empty frame
type ="n") # just to set up the coordinate system
text(aaPCA$rotation[ , 1], # plot the names of the amino acids into
aaPCA$rotation[ , 2], # their respective (PC1, PC2) positions
labels = rownames(aaPCA$rotation))
# PCA results are sensitive to the absolute numeric value of the features that
# we are comparing. The prcomp() function has an option scale. = TRUE that
# scales each row of features so that the variance of the value is 1.0. This
# ensures that each feature is given approximately equal weight.
# NOTE(review): prcomp() scales the variables (columns) of the matrix it
# receives, i.e. the columns of t(aaData[ ,sel]) - confirm that this is the
# intended scaling direction.
aaPCA <- prcomp(t(aaData[ ,sel]), scale. = TRUE)
plot(aaPCA)
plot(aaPCA$rotation[ , 1],
aaPCA$rotation[ , 2],
type ="n")
text(aaPCA$rotation[ , 1],
aaPCA$rotation[ , 2],
labels = rownames(aaPCA$rotation))
# Next we try to identify what the PCs correspond to. We see whether there are
# specific features that are highly correlated with the PCs
# ==== Rotation 1 ===================
#
(PC1 <- aaPCA$rotation[ , 1]) # first column of the rotation matrix (printed)
# cor() computes Pearson correlation coefficients by default
cor(PC1, aaData[ , 37]) # e.g. correlate PC1 against index 37
# Correlate PC1 with every column of aaData in one pass: vapply() visits each
# column of the data frame and collects exactly one number per column.
cors <- vapply(aaData,
               function(idx) cor(PC1, idx),
               numeric(1),
               USE.NAMES = FALSE)
summary(cors)
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
# -0.54072 -0.13703 0.05654 0.03729 0.21349 0.59589 13
#
# The max correlation is ~0.6. That is not very high. Which index is it?
which(cors == max(cors, na.rm = TRUE))
aaindex[[504]] # Linker propensity ???
cor(PC1, aaindex[[504]]$I) # Did we get the right index?
# Plot this ... (the argument expressions are kept as they are because plot()
# derives its default axis labels from them)
plot(aaPCA$rotation[ , 1],
     aaindex[[504]]$I,
     type ="n")
text(aaPCA$rotation[ , 1],
     aaindex[[504]]$I,
     labels = rownames(aaPCA$rotation))
# This is essentially a random correlation but for Cysteine ...
# ==== Rotation 2 ===================
#
# same process
PC2 <- aaPCA$rotation[ , 2] # second column of the rotation matrix
# Correlate PC2 with every column of aaData; one value per column.
cors2 <- vapply(aaData,
                function(idx) cor(PC2, idx),
                numeric(1),
                USE.NAMES = FALSE)
summary(cors2)
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
# -0.95214 -0.56067 -0.12817 -0.05787 0.43046 0.94346 13
# Here we have quite strong correlations
which(cors2 == max(cors2, na.rm = TRUE))
aaindex[[148]]
# this index itself is correlated with many other indices
cor(PC2, aaindex[[148]]$I) # confirm that we have the right index
# Plot this too... (argument expressions kept as they are because plot()
# derives its default axis labels from them)
plot(aaPCA$rotation[ , 2],
     aaindex[[148]]$I,
     type ="n")
text(aaPCA$rotation[ , 2],
     aaindex[[148]]$I,
     labels = rownames(aaPCA$rotation))
# This correlates well with hydrophobicity measures. In this case the
# PC is to a certain degree interpretable - but this is not always the case
# with PCA (see the example of the first PC).
# [END]

View File

@ -1,161 +1,161 @@
# tocID <- "ABC-Install_all_packages.R"
#
# Purpose: A Bioinformatics Course:
# Installing all packages in this course
#
# Version: 1.0
#
# Date: 2021 10
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.0 New code
#
#
# TODO:
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ----------------------------------------------
#TOC> 1 Packages 33
#TOC> 2 CRAN packages 98
#TOC> 3 Bioconductor packages 127
#TOC> 4 Other package sources 142
#TOC> 5 Updating packages 148
#TOC>
#TOC> ==========================================================================
# = 1 Packages ============================================================
# Much of R's functionality is contributed in packages: bundles of R scripts
# or code in other languages, pre-configured objects, and datasets. Making this
# functionality available is often done by issuing a library(<package-name>)
# command, however this is not the preferred way, since it may override other
# R functions and it makes it harder to understand where the source code of
# a particular function is located. In this course we call the function name
# prefixed with the package name and two colons:
# <package-name>::<function-name>()
# This is the preferred way, since it is explicit.
#
# Regardless of which idiom one uses to call the actual function, the package
# needs to be "installed" first, i.e. the code must have been downloaded
# from CRAN, or using the BiocManager::install() function.
#
# This script contains download commands for all packages that are used in the
# course. You can execute the script line by line (or even source the entire
# script) to make sure all packages can be installed on your computer. Just
# one reminder: if you are ever asked to install from source, the correct
# answer is usually "no" - except if you really know what you are doing and why.
#
# Once packages are installed you can get additional information about
# the contents of a package with the commands:
# library(help=<package-name>) # basic information
# browseVignettes("<package-name>") # available vignettes
# data(package = "<package-name>") # available datasets
#
# ... and you can load data sets with:
# data(<data-set-name>, package = "<package-name>")
#
# All packages here are installed only when they have not been installed
# before, using the following idiom:
#
# if (! requireNamespace("<package-name>", quietly=TRUE)) {
# install.packages("<package-name>")
# }
#
# ... or its BiocManager::install() equivalent:
#
# if (! requireNamespace("<bioconductor-package-name>", quietly=TRUE)) {
# BiocManager::install("<bioconductor-package-name>")
# }
#
# If you want to _force_ a re-installation of the package, simply issue
# the install.packages("<package-name>") command on its own. For compactness
# we wrap the idiom into a function, which can also switch between CRAN
# and BIOconductor sources:
installIfNeeded <- function(package, s = "CRAN") {
  # Install a package only if its namespace cannot be loaded yet.
  #   package: name of the package (character)
  #   s:       package source, "CRAN" or "BIO" (Bioconductor); any other
  #            value stops with an "Unknown source" error
  # For s = "BIO", BiocManager itself is installed from CRAN first if needed.
  notAvailable <- function(pkg) {
    ! requireNamespace(pkg, quietly = TRUE)
  }
  if (s == "BIO") {
    if (notAvailable("BiocManager")) {
      install.packages("BiocManager")
    }
    if (notAvailable(package)) {
      BiocManager::install(package)
    }
  } else if (s == "CRAN") {
    if (notAvailable(package)) {
      install.packages(package)
    }
  } else {
    stop(sprintf("Unknown source \"%s\".", s))
  }
}
# = 2 CRAN packages =======================================================
# Install every CRAN package used in the course, in the listed order.
# invisible() suppresses printing of the list that lapply() returns.
invisible(lapply(
  c("ape", "BiocManager", "bio3d", "evd", "ggseqlogo", "ggtern", "hexbin",
    "httr", "igraph", "jsonlite", "magrittr", "MASS", "microbenchmark",
    "phangorn", "plotly", "plotrix", "profvis", "robustbase", "RColorBrewer",
    "Rphylip", "rvest", "seqinr", "stringi", "taxize", "testthat", "xml2"),
  installIfNeeded
))
# = 3 Bioconductor packages ===============================================
# Install every Bioconductor package used in the course, in the listed order.
# s = "BIO" is passed through lapply() to each installIfNeeded() call.
invisible(lapply(
  c("Biobase", "biomaRt", "Biostrings", "DECIPHER", "GEOquery", "GOSim",
    "limma", "msa", "org.Sc.sgd.db", "prada", "topGO"),
  installIfNeeded,
  s = "BIO"
))
# = 4 Other package sources ===============================================
# Using sources other than CRAN or Bioconductor to download general-purpose
# programs that run on your computer is not generally recommended.
# = 5 Updating packages ===================================================
# From time to time, update CRAN packages with the following command ...
# NOTE(review): both calls below are made without arguments, so they may ask
# for confirmation and need network access - confirm that this is acceptable
# when the whole script is sourced rather than run line by line.
update.packages()
# ... and also update Bioconductor packages as follows:
BiocManager::install()
# [END]
# tocID <- "ABC-Install_all_packages.R"
#
# Purpose: A Bioinformatics Course:
# Installing all packages in this course
#
# Version: 1.0
#
# Date: 2021 10
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.0 New code
#
#
# TODO:
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ----------------------------------------------
#TOC> 1 Packages 33
#TOC> 2 CRAN packages 98
#TOC> 3 Bioconductor packages 127
#TOC> 4 Other package sources 142
#TOC> 5 Updating packages 148
#TOC>
#TOC> ==========================================================================
# = 1 Packages ============================================================
# Much of R's functionality is contributed in packages: bundles of R scripts
# or code in other languages, pre-configured objects, and datasets. Making this
# functionality available is often done by issuing a library(<package-name>)
# command, however this is not the preferred way, since it may override other
# R functions and it makes it harder to understand where the source code of
# a particular function is located. In this course we call the function name
# prefixed with the package name and two colons:
# <package-name>::<function-name>()
# This is the preferred way, since it is explicit.
#
# Regardless of which idiom one uses to call the actual function, the package
# needs to be "installed" first, i.e. the code must have been downloaded
# from CRAN, or using the BiocManager::install() function.
#
# This script contains download commands for all packages that are used in the
# course. You can execute the script line by line (or even source the entire
# script) to make sure all packages can be installed on your computer. Just
# one reminder: if you are ever asked to install from source, the correct
# answer is usually "no" - except if you really know what you are doing and why.
#
# Once packages are installed you can get additional information about
# the contents of a package with the commands:
# library(help=<package-name>) # basic information
# browseVignettes("<package-name>") # available vignettes
# data(package = "<package-name>") # available datasets
#
# ... and you can load data sets with:
# data(<data-set-name>, package = "<package-name>")
#
# All packages here are installed only when they have not been installed
# before, using the following idiom:
#
# if (! requireNamespace("<package-name>", quietly=TRUE)) {
# install.packages("<package-name>")
# }
#
# ... or its BiocManager::install() equivalent:
#
# if (! requireNamespace("<bioconductor-package-name>", quietly=TRUE)) {
# BiocManager::install("<bioconductor-package-name>")
# }
#
# If you want to _force_ a re-installation of the package, simply issue
# the install.packages("<package-name>") command on its own. For compactness
# we wrap the idiom into a function, which can also switch between CRAN
# and BIOconductor sources:
installIfNeeded <- function(package, s = "CRAN") {
  # Purpose:
  #   Install packages, but only those that are not installed yet.
  # Parameters:
  #   package: character vector with one or more package names. Every
  #            element is checked and, if missing, installed.
  #   s:       a single string naming the source: "CRAN" or "BIO"
  #            (Bioconductor, installed through BiocManager::install()).
  # Value:
  #   NULL, invisibly. The function is called for its side effect.
  # Errors:
  #   Stops if package is not a character vector of non-empty names, if s is
  #   not a single string, or if s is neither "CRAN" nor "BIO".

  if (! is.character(package) || length(package) == 0 ||
      anyNA(package) || ! all(nzchar(package))) {
    stop("\"package\" must be a character vector of package names.")
  }
  if (! is.character(s) || length(s) != 1 || is.na(s)) {
    stop("\"s\" must be a single string: \"CRAN\" or \"BIO\".")
  }
  if (! s %in% c("CRAN", "BIO")) {
    stop(sprintf("Unknown source \"%s\".", s))
  }

  if (s == "BIO" && ! requireNamespace("BiocManager", quietly=TRUE)) {
    # BiocManager itself lives on CRAN and must be present before any
    # Bioconductor package can be installed.
    install.packages("BiocManager")
  }

  # requireNamespace() only looks at a single name, therefore each requested
  # package is checked separately.
  for (pkg in package) {
    if (! requireNamespace(pkg, quietly=TRUE)) {
      if (s == "CRAN") {
        install.packages(pkg)
      } else {
        BiocManager::install(pkg)
      }
    }
  }
  return(invisible(NULL))
}
# = 2 CRAN packages =======================================================
# Every CRAN package is handed to installIfNeeded() in turn, in the order
# listed. BiocManager is a CRAN package too; it is needed for section 3.
invisible(lapply(
  c("ape", "BiocManager", "bio3d", "evd", "ggseqlogo", "ggtern", "hexbin",
    "httr", "igraph", "jsonlite", "magrittr", "MASS", "microbenchmark",
    "phangorn", "plotly", "plotrix", "profvis", "robustbase", "RColorBrewer",
    "Rphylip", "rvest", "seqinr", "stringi", "taxize", "testthat", "xml2"),
  installIfNeeded
))
# = 3 Bioconductor packages ===============================================
# Same procedure, but s = "BIO" selects Bioconductor as the source.
invisible(lapply(
  c("Biobase", "biomaRt", "Biostrings", "DECIPHER", "GEOquery", "GOSim",
    "limma", "msa", "org.Sc.sgd.db", "prada", "topGO"),
  installIfNeeded,
  s = "BIO"
))
# = 4 Other package sources ===============================================
# Using sources other than CRAN or Bioconductor to download general-purpose
# programs that run on your computer is not generally recommended.
# = 5 Updating packages ===================================================
# From time to time, update CRAN packages with the following command ...
update.packages()
# ... and also update Bioconductor packages as follows:
BiocManager::install()
# [END]

View File

@ -1,100 +1,100 @@
# addSACCE_APSESproteins.R
# Adds the Saccharomyces cerevisiae APSES proteins to myDB
#
# Append the SWI4_SACCE record as a new row of the protein table in myDB.
# dbAutoincrement() is given the existing IDs and ns = "ref" to produce the
# new ID. The sequence is pasted with position numbers and block spacing;
# dbSanitizeSequence() presumably reduces it to a plain sequence string -
# TODO confirm against its definition.
myDB$protein <-
rbind(myDB$protein,
data.frame(
ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
name = "SWI4_SACCE",
RefSeqID = "NP_011036",
UniProtID = "P25302",
taxonomy.ID = as.integer(4932),
sequence = dbSanitizeSequence("
1 mpfdvlisnq kdntnhqnit pisksvllap hsnhpvieia tysetdvyec yirgfetkiv
61 mrrtkddwin itqvfkiaqf sktkrtkile kesndmqhek vqggygrfqg twipldsakf
121 lvnkyeiidp vvnsiltfqf dpnnpppkrs knsilrktsp gtkitspssy nktprkknss
181 sstsatttaa nkkgkknasi nqpnpsplqn lvfqtpqqfq vnssmnimnn ndnhttmnfn
241 ndtrhnlinn isnnsnqsti iqqqksihen sfnnnysatq kplqffpipt nlqnknvaln
301 npnnndsnsy shnidnvins snnnnngnnn nliivpdgpm qsqqqqqhhh eyltnnfnhs
361 mmdsitngns kkrrkklnqs neqqfynqqe kiqrhfklmk qpllwqsfqn pndhhneycd
421 sngsnnnnnt vasngssiev fssnendnsm nmssrsmtpf sagntssqnk lenkmtdqey
481 kqtiltilss erssdvdqal latlypapkn fninfeiddq ghtplhwata maniplikml
541 itlnanalqc nklgfncitk sifynncyke nafdeiisil kiclitpdvn grlpfhylie
601 lsvnksknpm iiksymdsii lslgqqdynl lkiclnyqdn igntplhlsa lnlnfevynr
661 lvylgastdi lnldnespas imnkfntpag gsnsrnnntk adrklarnlp qknyyqqqqq
721 qqqpqnnvki pkiiktqhpd kedstadvni aktdsevnes qylhsnqpns tnmntimedl
781 sninsfvtss vikdikstps kilenspily rrrsqsisde kekakdnenq vekkkdplns
841 vktampsles pssllpiqms plgkyskpls qqinklntkv sslqrimgee iknldnevve
901 tessisnnkk rlitiahqie dafdsvsnkt pinsisdlqs riketsskln sekqnfiqsl
961 eksqalklat ivqdeeskvd mntnssshpe kqedeepipk stsetsspkn tkadakfsnt
1021 vqesydvnet lrlateltil qfkrrmttlk iseakskins svkldkyrnl igitienids
1081 klddiekdlr ana"),
stringsAsFactors = FALSE))
# Append the PHD1_SACCE record as a new row of the protein table in myDB.
# dbAutoincrement() is given the existing IDs and ns = "ref" to produce the
# new ID. The sequence is pasted with position numbers and block spacing;
# dbSanitizeSequence() presumably reduces it to a plain sequence string -
# TODO confirm against its definition.
myDB$protein <-
rbind(myDB$protein,
data.frame(
ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
name = "PHD1_SACCE",
RefSeqID = "NP_012881",
UniProtID = "P36093",
taxonomy.ID = as.integer(4932),
sequence = dbSanitizeSequence("
1 myhvpemrlh yplvntqsna aitptrsydn tlpsfnelsh qstinlpfvq retpnayanv
61 aqlatsptqa ksgyycryya vpfptypqqp qspyqqavlp yatipnsnfq pssfpvmavm
121 ppevqfdgsf lntlhphtel ppiiqntndt svarpnnlks iaaasptvta ttrtpgvsst
181 svlkprvitt mwedenticy qveangisvv rradnnming tkllnvtkmt rgrrdgilrs
241 ekvrevvkig smhlkgvwip ferayilaqr eqildhlypl fvkdiesivd arkpsnkasl
301 tpksspapik qepsdnkhei ateikpksid alsngastqg agelphlkin hidteaqtsr
361 aknels"),
stringsAsFactors = FALSE))
# Append the SOK2_SACCE record as a new row of the protein table in myDB.
# dbAutoincrement() is given the existing IDs and ns = "ref" to produce the
# new ID. The sequence is pasted with position numbers and block spacing;
# dbSanitizeSequence() presumably reduces it to a plain sequence string -
# TODO confirm against its definition.
myDB$protein <-
rbind(myDB$protein,
data.frame(
ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
name = "SOK2_SACCE",
RefSeqID = "NP_013729",
UniProtID = "P53438",
taxonomy.ID = as.integer(4932),
sequence = dbSanitizeSequence("
1 mpignpintn diksnrmrqe snmsavsnse stigqstqqq qqqqqylgqs vqplmpvsyq
61 yvvpeqwpyp qyyqqpqsqs qqqlqsqpqm yqvqesfqss gsdsnasnpp stsvgvpsna
121 tatalpngsa ittkksnnst nisnnvpyyy yfpqmqaqqs maysypqayy yypangdgtt
181 ngatpsvtsn qvqnpnlekt ystfeqqqqh qqqqqlqaqt ypaqppkign afskfsksgp
241 psdsssgsms pnsnrtsrns nsisslaqqp pmsnypqpst yqypgfhkts sipnshspip
301 prslttptqg ptsqngplsy nlpqvgllpp qqqqqvsply dgnsitppvk pstdqetylt
361 anrhgvsdqq ydsmaktmns fqtttirhpm pliattnatg sntsgtsasi irprvtttmw
421 edektlcyqv eangisvvrr adndmvngtk llnvtkmtrg rrdgilkaek irhvvkigsm
481 hlkgvwipfe ralaiaqrek iadylyplfi rdiqsvlkqn npsndsssss sstgiksisp
541 rtyyqpinny qnpngpsnis aaqltyssmn lnnkiipnns ipavstiaag ekplkkctmp
601 nsnqleghti tnlqtlsatm pmkqqlmgni asplsyprna tmnsastlgi tpadskpltp
661 sptttntnqs sesnvgsiht gitlprvese sashskwske adsgntvpdn qtlkeprssq
721 lpisaltstd tdkiktstsd eatqpnepse aepvkesess ksqvdgagdv sneeiaaddt
781 kkqek"),
stringsAsFactors = FALSE))
# Append the XBP1_SACCE record as a new row of the protein table in myDB.
# dbAutoincrement() is given the existing IDs and ns = "ref" to produce the
# new ID. The sequence is pasted with position numbers and block spacing;
# dbSanitizeSequence() presumably reduces it to a plain sequence string -
# TODO confirm against its definition.
myDB$protein <-
rbind(myDB$protein,
data.frame(
ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
name = "XBP1_SACCE",
RefSeqID = "NP_012165",
UniProtID = "P40489",
taxonomy.ID = as.integer(4932),
sequence = dbSanitizeSequence("
1 mkypafsins dtvhltdnpl ddyqrlylvs vldrdsppas fsaglnirkv nykssiaaqf
61 thpnfiisar dagngeeaaa qnvlncfeyq fpnlqtiqsl vheqtllsql assatphsal
121 hlhdknilmg kiilpsrsnk tpvsasptkq ekkalstasr enatssltkn qqfkltkmdh
181 nlindklinp nncviwshds gyvfmtgiwr lyqdvmkgli nlprgdsvst sqqqffckae
241 fekilsfcfy nhssftsees ssvllsssts sppkrrtstg stfldanass sstsstqann
301 yidfhwnnik pelrdlicqs ykdflinelg pdqidlpnln panftkrirg gyikiqgtwl
361 pmeisrllcl rfcfpiryfl vpifgpdfpk dceswylahq nvtfassttg agaataataa
421 antstnftst avarprqkpr prprqrstsm shskaqklvi edalpsfdsf venlglssnd
481 knfikknskr qksstytsqt sspigprdpt vqilsnlasf ynthghrysy pgniyipqqr
541 yslpppnqls spqrqlnyty dhihpvpsqy qsprhynvps spiapapptf pqpygddhyh
601 flkyasevyk qqnqrpahnt ntnmdtsfsp rannslnnfk fktnskq"),
stringsAsFactors = FALSE))
# [END]
# addSACCE_APSESproteins.R
# Adds the Saccharomyces cerevisiae APSES proteins to myDB
#
# Append the SWI4_SACCE record as a new row of the protein table in myDB.
# dbAutoincrement() is given the existing IDs and ns = "ref" to produce the
# new ID. The sequence is pasted with position numbers and block spacing;
# dbSanitizeSequence() presumably reduces it to a plain sequence string -
# TODO confirm against its definition.
myDB$protein <-
rbind(myDB$protein,
data.frame(
ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
name = "SWI4_SACCE",
RefSeqID = "NP_011036",
UniProtID = "P25302",
taxonomy.ID = as.integer(4932),
sequence = dbSanitizeSequence("
1 mpfdvlisnq kdntnhqnit pisksvllap hsnhpvieia tysetdvyec yirgfetkiv
61 mrrtkddwin itqvfkiaqf sktkrtkile kesndmqhek vqggygrfqg twipldsakf
121 lvnkyeiidp vvnsiltfqf dpnnpppkrs knsilrktsp gtkitspssy nktprkknss
181 sstsatttaa nkkgkknasi nqpnpsplqn lvfqtpqqfq vnssmnimnn ndnhttmnfn
241 ndtrhnlinn isnnsnqsti iqqqksihen sfnnnysatq kplqffpipt nlqnknvaln
301 npnnndsnsy shnidnvins snnnnngnnn nliivpdgpm qsqqqqqhhh eyltnnfnhs
361 mmdsitngns kkrrkklnqs neqqfynqqe kiqrhfklmk qpllwqsfqn pndhhneycd
421 sngsnnnnnt vasngssiev fssnendnsm nmssrsmtpf sagntssqnk lenkmtdqey
481 kqtiltilss erssdvdqal latlypapkn fninfeiddq ghtplhwata maniplikml
541 itlnanalqc nklgfncitk sifynncyke nafdeiisil kiclitpdvn grlpfhylie
601 lsvnksknpm iiksymdsii lslgqqdynl lkiclnyqdn igntplhlsa lnlnfevynr
661 lvylgastdi lnldnespas imnkfntpag gsnsrnnntk adrklarnlp qknyyqqqqq
721 qqqpqnnvki pkiiktqhpd kedstadvni aktdsevnes qylhsnqpns tnmntimedl
781 sninsfvtss vikdikstps kilenspily rrrsqsisde kekakdnenq vekkkdplns
841 vktampsles pssllpiqms plgkyskpls qqinklntkv sslqrimgee iknldnevve
901 tessisnnkk rlitiahqie dafdsvsnkt pinsisdlqs riketsskln sekqnfiqsl
961 eksqalklat ivqdeeskvd mntnssshpe kqedeepipk stsetsspkn tkadakfsnt
1021 vqesydvnet lrlateltil qfkrrmttlk iseakskins svkldkyrnl igitienids
1081 klddiekdlr ana"),
stringsAsFactors = FALSE))
# Append the PHD1_SACCE record as a new row of the protein table in myDB.
# dbAutoincrement() is given the existing IDs and ns = "ref" to produce the
# new ID. The sequence is pasted with position numbers and block spacing;
# dbSanitizeSequence() presumably reduces it to a plain sequence string -
# TODO confirm against its definition.
myDB$protein <-
rbind(myDB$protein,
data.frame(
ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
name = "PHD1_SACCE",
RefSeqID = "NP_012881",
UniProtID = "P36093",
taxonomy.ID = as.integer(4932),
sequence = dbSanitizeSequence("
1 myhvpemrlh yplvntqsna aitptrsydn tlpsfnelsh qstinlpfvq retpnayanv
61 aqlatsptqa ksgyycryya vpfptypqqp qspyqqavlp yatipnsnfq pssfpvmavm
121 ppevqfdgsf lntlhphtel ppiiqntndt svarpnnlks iaaasptvta ttrtpgvsst
181 svlkprvitt mwedenticy qveangisvv rradnnming tkllnvtkmt rgrrdgilrs
241 ekvrevvkig smhlkgvwip ferayilaqr eqildhlypl fvkdiesivd arkpsnkasl
301 tpksspapik qepsdnkhei ateikpksid alsngastqg agelphlkin hidteaqtsr
361 aknels"),
stringsAsFactors = FALSE))
# Append the SOK2_SACCE record as a new row of the protein table in myDB.
# dbAutoincrement() is given the existing IDs and ns = "ref" to produce the
# new ID. The sequence is pasted with position numbers and block spacing;
# dbSanitizeSequence() presumably reduces it to a plain sequence string -
# TODO confirm against its definition.
myDB$protein <-
rbind(myDB$protein,
data.frame(
ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
name = "SOK2_SACCE",
RefSeqID = "NP_013729",
UniProtID = "P53438",
taxonomy.ID = as.integer(4932),
sequence = dbSanitizeSequence("
1 mpignpintn diksnrmrqe snmsavsnse stigqstqqq qqqqqylgqs vqplmpvsyq
61 yvvpeqwpyp qyyqqpqsqs qqqlqsqpqm yqvqesfqss gsdsnasnpp stsvgvpsna
121 tatalpngsa ittkksnnst nisnnvpyyy yfpqmqaqqs maysypqayy yypangdgtt
181 ngatpsvtsn qvqnpnlekt ystfeqqqqh qqqqqlqaqt ypaqppkign afskfsksgp
241 psdsssgsms pnsnrtsrns nsisslaqqp pmsnypqpst yqypgfhkts sipnshspip
301 prslttptqg ptsqngplsy nlpqvgllpp qqqqqvsply dgnsitppvk pstdqetylt
361 anrhgvsdqq ydsmaktmns fqtttirhpm pliattnatg sntsgtsasi irprvtttmw
421 edektlcyqv eangisvvrr adndmvngtk llnvtkmtrg rrdgilkaek irhvvkigsm
481 hlkgvwipfe ralaiaqrek iadylyplfi rdiqsvlkqn npsndsssss sstgiksisp
541 rtyyqpinny qnpngpsnis aaqltyssmn lnnkiipnns ipavstiaag ekplkkctmp
601 nsnqleghti tnlqtlsatm pmkqqlmgni asplsyprna tmnsastlgi tpadskpltp
661 sptttntnqs sesnvgsiht gitlprvese sashskwske adsgntvpdn qtlkeprssq
721 lpisaltstd tdkiktstsd eatqpnepse aepvkesess ksqvdgagdv sneeiaaddt
781 kkqek"),
stringsAsFactors = FALSE))
# Append the XBP1_SACCE record as a new row of the protein table in myDB.
# dbAutoincrement() is given the existing IDs and ns = "ref" to produce the
# new ID. The sequence is pasted with position numbers and block spacing;
# dbSanitizeSequence() presumably reduces it to a plain sequence string -
# TODO confirm against its definition.
myDB$protein <-
rbind(myDB$protein,
data.frame(
ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
name = "XBP1_SACCE",
RefSeqID = "NP_012165",
UniProtID = "P40489",
taxonomy.ID = as.integer(4932),
sequence = dbSanitizeSequence("
1 mkypafsins dtvhltdnpl ddyqrlylvs vldrdsppas fsaglnirkv nykssiaaqf
61 thpnfiisar dagngeeaaa qnvlncfeyq fpnlqtiqsl vheqtllsql assatphsal
121 hlhdknilmg kiilpsrsnk tpvsasptkq ekkalstasr enatssltkn qqfkltkmdh
181 nlindklinp nncviwshds gyvfmtgiwr lyqdvmkgli nlprgdsvst sqqqffckae
241 fekilsfcfy nhssftsees ssvllsssts sppkrrtstg stfldanass sstsstqann
301 yidfhwnnik pelrdlicqs ykdflinelg pdqidlpnln panftkrirg gyikiqgtwl
361 pmeisrllcl rfcfpiryfl vpifgpdfpk dceswylahq nvtfassttg agaataataa
421 antstnftst avarprqkpr prprqrstsm shskaqklvi edalpsfdsf venlglssnd
481 knfikknskr qksstytsqt sspigprdpt vqilsnlasf ynthghrysy pgniyipqqr
541 yslpppnqls spqrqlnyty dhihpvpsqy qsprhynvps spiapapptf pqpygddhyh
601 flkyasevyk qqnqrpahnt ntnmdtsfsp rannslnnfk fktnskq"),
stringsAsFactors = FALSE))
# [END]

View File

@ -1,69 +1,69 @@
# ABC-units.R
#
# Purpose: A Bioinformatics Course: R code for learning units
#
# Version: 4.0
#
# Date: 2020 09 16
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# V 4.0 2020 version
# V 3.0 2019 version
# V 2.0 2018 version
# V 1.0 2017 version
# V 0.1 First code
#
# TODO:
#
#
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
#
# The R-scripts and datasets in this project will be continuously updated,
# and updates will be posted on GitHub. To bring your version into the latest
# state use the Git-pane (top left) and "pull" (blue downward arrow) from the
# repository. However, this will overwrite locally edited versions of files.
# To edit code and experiment with it, for example to add your own comments and
# examples, save your edited version into the "myScripts" folder. Otherwise you
# may have problems with git when you update the project to a new version. It's
# good practice to change the filename, for example by prepending your initials.
# This helps distinguish the files you are working with e.g. in a list of
# recent files. For example if your name is Honjo Tasuku, your edited
# BIN-Sequence.R might be named HT-BIN-Sequence.R
# If you pull from github and get the following type of error ...
# ---------------
# error: Your local changes to the following files would be
# overwritten by merge
# ...
# Please commit your changes or stash them before you can merge.
# ---------------
# ... then, you need to bring the offending file into its original state.
# Open the Commit window, select the file, and click on the Revert button.
#
# When working with these scripts DO NOT SIMPLY source() THESE FILES!
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
#
# ==============================================================================
# Once you have typed and executed the function init(), you will find a file
# called myScript.R in the project directory.
#
# Open it, you can place all of your code-experiments and notes into that
# file. This will complement your "Course Journal". If you keep all contents in
# this one file, you can find everything by using the <cmd>-F find function. To
# cross-reference code in your journal, create section headings.
#
# ==============================================================================
# The individual learning units' files can be opened by simply clicking on them
# in the File pane.
# [END]
# ABC-units.R
#
# Purpose: A Bioinformatics Course: R code for learning units
#
# Version: 4.0
#
# Date: 2020 09 16
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# V 4.0 2020 version
# V 3.0 2019 version
# V 2.0 2018 version
# V 1.0 2017 version
# V 0.1 First code
#
# TODO:
#
#
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
#
# The R-scripts and datasets in this project will be continuously updated,
# and updates will be posted on GitHub. To bring your version into the latest
# state use the Git-pane (top left) and "pull" (blue downward arrow) from the
# repository. However, this will overwrite locally edited versions of files.
# To edit code and experiment with it, for example to add your own comments and
# examples, save your edited version into the "myScripts" folder. Otherwise you
# may have problems with git when you update the project to a new version. It's
# good practice to change the filename, for example by prepending your initials.
# This helps distinguish the files you are working with e.g. in a list of
# recent files. For example if your name is Honjo Tasuku, your edited
# BIN-Sequence.R might be named HT-BIN-Sequence.R
# If you pull from github and get the following type of error ...
# ---------------
# error: Your local changes to the following files would be
# overwritten by merge
# ...
# Please commit your changes or stash them before you can merge.
# ---------------
# ... then, you need to bring the offending file into its original state.
# Open the Commit window, select the file, and click on the Revert button.
#
# When working with these scripts DO NOT SIMPLY source() THESE FILES!
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
#
# ==============================================================================
# Once you have typed and executed the function init(), you will find a file
# called myScript.R in the project directory.
#
# Open it, you can place all of your code-experiments and notes into that
# file. This will complement your "Course Journal". If you keep all contents in
# this one file, you can find everything by using the <cmd>-F find function. To
# cross-reference code in your journal, create section headings.
#
# ==============================================================================
# The individual learning units' files can be opened by simply clicking on them
# in the File pane.
# [END]

View File

@ -1,16 +1,16 @@
Version: 1.0
RestoreWorkspace: No
SaveWorkspace: No
AlwaysSaveHistory: No
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8
RnwWeave: knitr
LaTeX: XeLaTeX
AutoAppendNewline: Yes
StripTrailingWhitespace: Yes
Version: 1.0
RestoreWorkspace: No
SaveWorkspace: No
AlwaysSaveHistory: No
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8
RnwWeave: knitr
LaTeX: XeLaTeX
AutoAppendNewline: Yes
StripTrailingWhitespace: Yes

View File

@ -1,111 +1,111 @@
# tocID <- "BIN-ALI-BLAST.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-BLAST unit.
#
# ==============================================================================
#
# Version: 1.3
#
# Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.3 2020 Maintenance
# 1.2 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 1.1 Fixed parsing logic.
# 1.0 First live version 2017.
# 0.1 First code copied from 2016 material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------------
#TOC> 1 Defining the APSES domain 45
#TOC> 2 Executing the BLAST search 75
#TOC> 3 Analysing results 97
#TOC>
#TOC> ==========================================================================
# = 1 Defining the APSES domain ===========================================
# Load your protein database
source("makeProteinDB.R")
# Get the APSES domain sequence via your MBP1_MYSPE feature annotation. (You
# have entered this data into your database in the
# BIN-ALI-Optimal_sequence_alignment unit.)
# The outer parentheses make R print each value as it is assigned, so that
# every intermediate result can be checked.
( myOrth <- sprintf("MBP1_%s", biCode(MYSPE)) ) # If this is not the correct
# name of the Mbp1 orthologue
# in your protein
# database, DON'T continue. We
# need to fix this problem.
# Get in touch.
# Look up the ID of the protein, the ID of the "APSES fold" feature, and the
# ID of the annotation that links the two ...
(proID <- myDB$protein$ID[myDB$protein$name == myOrth])
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
myDB$annotation$featureID == ftrID])
# ... then read start and end from that annotation and cut the domain out of
# the full-length sequence with substr().
# NOTE(review): the steps below presumably expect exactly one matching
# annotation - confirm if more than one ID was printed for fanID.
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
(end <- myDB$annotation$end[myDB$annotation$ID == fanID])
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
start,
end))
# The MYSPE "apses" sequence is the sequence that we will use for our reverse
# BLAST search.
# = 2 Executing the BLAST search ==========================================
# The ./scripts/BLAST.R code defines two functions to access the BLAST interface
# through its Web API, and to parse results. Have a look at the script, then
# source it:
source("./scripts/BLAST.R")
# Use BLAST() to find the best match to the MYSPE APSES domain in Saccharomyces
# cerevisiae:
BLASTresults <- BLAST(apses, # MYSPE APSES domain sequence
db = "refseq_protein", # database to search in
nHits = 10, # presumably the max. number of hits - see BLAST.R
E = 0.01, # presumably the E-value cutoff - see BLAST.R
limits = "txid559292[ORGN]") # S. cerevisiae S288c
length(BLASTresults$hits) # There should be at least one hit there. Ask for
# advice in case this step fails.
# = 3 Analysing results ===================================================
# NOTE: [[1]] stops with a "subscript out of bounds" error if the list of
# hits is empty; check length(BLASTresults$hits) first.
(topHit <- BLASTresults$hits[[1]]) # Get the top hit
# What is the refseq ID of the top hit?
topHit$accession
# If this is "NP_010227.1" you have confirmed the RBM (presumably: Reciprocal
# Best Match) of the MYSPE apses domain. If it is not, ask me for advice.
# [END]
# tocID <- "BIN-ALI-BLAST.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-BLAST unit.
#
# ==============================================================================
#
# Version: 1.3
#
# Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.3 2020 Maintenance
# 1.2 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 1.1 Fixed parsing logic.
# 1.0 First live version 2017.
# 0.1 First code copied from 2016 material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------------
#TOC> 1 Defining the APSES domain 45
#TOC> 2 Executing the BLAST search 75
#TOC> 3 Analysing results 97
#TOC>
#TOC> ==========================================================================
# = 1 Defining the APSES domain ===========================================
# Load your protein database
source("makeProteinDB.R")
# Get the APSES domain sequence via your MBP1_MYSPE feature annotation. (You
# have entered this data into your database in the
# BIN-ALI-Optimal_sequence_alignment unit.)
# The outer parentheses make R print each value as it is assigned, so that
# every intermediate result can be checked.
( myOrth <- sprintf("MBP1_%s", biCode(MYSPE)) ) # If this is not the correct
# name of the Mbp1 orthologue
# in your protein
# database, DON'T continue. We
# need to fix this problem.
# Get in touch.
# Look up the ID of the protein, the ID of the "APSES fold" feature, and the
# ID of the annotation that links the two ...
(proID <- myDB$protein$ID[myDB$protein$name == myOrth])
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
myDB$annotation$featureID == ftrID])
# ... then read start and end from that annotation and cut the domain out of
# the full-length sequence with substr().
# NOTE(review): the steps below presumably expect exactly one matching
# annotation - confirm if more than one ID was printed for fanID.
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
(end <- myDB$annotation$end[myDB$annotation$ID == fanID])
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
start,
end))
# The MYSPE "apses" sequence is the sequence that we will use for our reverse
# BLAST search.
# = 2 Executing the BLAST search ==========================================
# The ./scripts/BLAST.R code defines two functions to access the BLAST interface
# through its Web API, and to parse results. Have a look at the script, then
# source it:
source("./scripts/BLAST.R")
# Use BLAST() to find the best match to the MYSPE APSES domain in Saccharomyces
# cerevisiae:
BLASTresults <- BLAST(apses, # MYSPE APSES domain sequence
db = "refseq_protein", # database to search in
nHits = 10, # presumably the max. number of hits - see BLAST.R
E = 0.01, # presumably the E-value cutoff - see BLAST.R
limits = "txid559292[ORGN]") # S. cerevisiae S288c
length(BLASTresults$hits) # There should be at least one hit there. Ask for
# advice in case this step fails.
# = 3 Analysing results ===================================================
# NOTE: [[1]] stops with a "subscript out of bounds" error if the list of
# hits is empty; check length(BLASTresults$hits) first.
(topHit <- BLASTresults$hits[[1]]) # Get the top hit
# What is the refseq ID of the top hit?
topHit$accession
# If this is "NP_010227.1" you have confirmed the RBM (presumably: Reciprocal
# Best Match) of the MYSPE apses domain. If it is not, ask me for advice.
# [END]

View File

@ -1,195 +1,195 @@
# tocID <- "BIN-ALI-Dotplot.R"
#
#
# ==============================================================================
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-Dotplot unit.
#
# Version: 0.2
#
# Date: 2019 01 07
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 0.2 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 0.1 First code copied from 2016 material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------
#TOC> 1 ___Section___ 42
#TOC> 2 Tasks 190
#TOC>
#TOC> ==========================================================================
# = 1 ___Section___ =======================================================
# Make sure the packages used below are installed. Nothing is attached with
# library(); functions are called with the <package>::<function>() idiom.
# BiocManager is installed from CRAN first, because it is needed to install
# Biostrings from Bioconductor.
if (!requireNamespace("BiocManager", quietly=TRUE)) {
install.packages("BiocManager")
}
if (!requireNamespace("Biostrings", quietly=TRUE)) {
BiocManager::install("Biostrings")
}
# Package information:
# library(help = Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets
# seqinr provides s2c(), which is used below to split a sequence string into
# single characters.
if (!requireNamespace("seqinr", quietly=TRUE)) {
install.packages("seqinr")
}
# Let's load BLOSUM62. data() places the matrix into the workspace without
# attaching the Biostrings package.
data(BLOSUM62, package = "Biostrings")
# Now let's craft code for a dotplot. That's surprisingly simple. We build a
# matrix that has as many rows as one sequence, as many columns as another. Then
# we go through every cell of the matrix and enter the pairscore we encounter
# for the amino acid pair whose position corresponds to the row and column
# index. Finally we visualize the matrix in a plot.
#
# Fetch both sequences from the database and split each one into a vector
# of single characters.
sel <- myDB$protein$name == "MBP1_SACCE"
MBP1_SACCE <- seqinr::s2c(myDB$protein$sequence[sel])
sel <- myDB$protein$name == paste0("MBP1_", biCode(MYSPE))
MBP1_MYSPE <- seqinr::s2c(myDB$protein$sequence[sel])
# Both objects should now be character vectors of the expected length, with
# one amino acid per element.
str(MBP1_SACCE)
str(MBP1_MYSPE)
# Where do the pairscore values come from? Take one amino acid from each
# sequence, e.g. position 13 of SACCE and position 21 of MYSPE ...
MBP1_SACCE[13]
MBP1_MYSPE[21]
# ... and use the two letters as row and column names to subset the
# mutation data matrix (MDM): the result is the pairscore.
BLOSUM62[MBP1_SACCE[13], MBP1_MYSPE[21]]
# First we build an empty matrix that will hold all pairscores: one row per
# residue of MBP1_SACCE, one column per residue of MBP1_MYSPE, filled with 0 ...
dotMat <- matrix(0, nrow = length(MBP1_SACCE), ncol = length(MBP1_MYSPE))
# ... then we loop over the sequences and store the scores in the matrix.
# seq_along() is used instead of 1:length(), because 1:length(x) counts
# backwards over c(1, 0) if x is empty, whereas seq_along(x) then simply
# gives zero iterations.
for (i in seq_along(MBP1_SACCE)) {
  for (j in seq_along(MBP1_MYSPE)) {
    # Row i / column j holds the BLOSUM62 score of the i-th SACCE residue
    # paired with the j-th MYSPE residue.
    dotMat[i, j] <- BLOSUM62[MBP1_SACCE[i], MBP1_MYSPE[j]]
  }
}
# Even though this is a large matrix, this does not take much time ...
# Let's have a look at a small block of the values:
dotMat[1:10, 1:10]
# Rows in this matrix correspond to an amino acid from MBP1_SACCE, columns in
# the matrix correspond to an amino acid from MBP1_MYSPE.
# To plot this, we use the image() function. Here, with default parameters.
# Note that image() draws the rows of the matrix along the x-axis and the
# columns along the y-axis.
image(dotMat)
# Be patient, this takes a few moments to render: more than 500,000 values.
# Nice.
# What do you expect?
# What would similar sequences look like?
# What do you see?
# You might notice a thin line of yellow along the diagonal, moving
# approximately from bottom left to top right, fading in and out of existence.
# This is the signature of extended sequence similarity.
# Let's magnify this a bit by looking at only the first 200 amino acids ...
image(dotMat[1:200, 1:200])
# ... and, according to our normal writing convention, we would like the
# diagonal to run from top-left to bottom-right since we write from left to
# right and from top to bottom. Without explicit x and y values image() uses
# a 0 to 1 scale, so we reverse the y-axis by giving its limits as 1, 0 ...
image(dotMat[1:200, 1:200], ylim = 1.0:0.0)
# ... and we would like the range of the x- and y- axis to correspond to the
# sequence position ...
image(x = 1:200, y = 1:200, dotMat[1:200, 1:200], ylim=c(200,1))
# ... and labels! Axis labels would be nice ...
image(x = 1:200, y = 1:200, dotMat[1:200, 1:200], ylim=c(200,1),
xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE" )
# ... and why don't we have axis-numbers on all four sides? Go, make that right
# too ...
len <- 200
image(x = 1:len, y = 1:len, dotMat[1:len, 1:len], ylim=c(len,1),
xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE", axes = FALSE)
box()
axis(1, at = c(1, seq(10, len, by=10)))
axis(2, at = c(1, seq(10, len, by=10)))
axis(3, at = c(1, seq(10, len, by=10)))
axis(4, at = c(1, seq(10, len, by=10)))
# ... you get the idea, we can infinitely customize our plot. However a good way
# to do this is to develop a particular view for, say, a report or publication
# in a script and then put it into a function. I have put a function into the
# utilities file and called it dotPlot2(). Why not dotPlot() ... that's because
# there already is a dotPlot() function in the seqinr package. Both are called
# here with the same two character vectors so the results can be compared:
seqinr::dotPlot(MBP1_SACCE, MBP1_MYSPE) # seqinr
dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE") # ours
# Which one do you prefer? You can probably see the block patterns that arise
# from segments of repetitive, low complexity sequence. But you probably have to
# look very closely to discern the faint diagonals that correspond to similar
# sequence.
# The faint diagonals can be made to stand out against the distributed noise
# by filtering the dot matrix: every value in the matrix is replaced by a
# weighted average of its neighborhood. The weights below form a 5 x 5 pattern
# that is 1 along the main diagonal and 0 everywhere else, which enhances
# diagonally repeated values. Here is this diagonal-filter, written row by row:
myFilter <- matrix(c(1, 0, 0, 0, 0,
                     0, 1, 0, 0, 0,
                     0, 0, 1, 0, 0,
                     0, 0, 0, 1, 0,
                     0, 0, 0, 0, 1),
                   nrow = 5, byrow = TRUE)
# I have added the option to read such filters (or others that you could define
# on your own) as a parameter of the function: the filter matrix is passed in
# the argument f.
dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE", f = myFilter)
# I think the result shows quite nicely how the two sequences are globally
# related and where the regions of sequence similarity are. Play with this a bit
# ... Can you come up with a better filter? If so, eMail us.
# = 2 Tasks ===============================================================
# [END]
# tocID <- "BIN-ALI-Dotplot.R"
#
#
# ==============================================================================
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-Dotplot unit.
#
# Version: 0.2
#
# Date: 2019 01 07
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 0.2 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 0.1 First code copied from 2016 material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------
#TOC> 1 ___Section___ 42
#TOC> 2 Tasks 190
#TOC>
#TOC> ==========================================================================
# = 1 ___Section___ =======================================================
# Install the packages this script calls, but only if they are not available
# yet. requireNamespace() only checks/loads the namespace; nothing is attached,
# so all functions are called with the <package>::<function>() idiom.
# BiocManager provides the installer used for Biostrings below.
if (!requireNamespace("BiocManager", quietly=TRUE)) {
install.packages("BiocManager")
}
# Biostrings is installed via BiocManager::install(), not install.packages().
if (!requireNamespace("Biostrings", quietly=TRUE)) {
BiocManager::install("Biostrings")
}
# Package information:
# library(help = Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets
# seqinr supplies s2c() (string to character vector) and dotPlot().
if (!requireNamespace("seqinr", quietly=TRUE)) {
install.packages("seqinr")
}
# Load the BLOSUM62 matrix from the Biostrings package
data("BLOSUM62", package = "Biostrings")

# A dotplot needs surprisingly little code. We build a matrix with one row per
# residue of the first sequence and one column per residue of the second, fill
# every cell with the pairscore of the amino acid pair found at that row and
# column position, and finally visualize the matrix in a plot.
#
# Step one: fetch both sequences from the database and split each of them into
# a vector of single characters.
sel <- myDB$protein$name == "MBP1_SACCE"
MBP1_SACCE <- seqinr::s2c(myDB$protein$sequence[sel])

sel <- myDB$protein$name == paste0("MBP1_", biCode(MYSPE))
MBP1_MYSPE <- seqinr::s2c(myDB$protein$sequence[sel])

# Both objects should now be character vectors of the expected length.
str(MBP1_SACCE)
str(MBP1_MYSPE)
# How do we get the pairscore values? Consider: a single pair of amino acids can
# be obtained from sequence SACCE and MYSPE e.g. from position 13 and 21 ...
MBP1_SACCE[13]
MBP1_MYSPE[21]

# ... using these one-letter codes as row and column subsetting expressions,
# we can pull the pairscore from the MDM
BLOSUM62[MBP1_SACCE[13], MBP1_MYSPE[21]]

# First we build an empty (all-zero) matrix that will hold all pairscores:
# one row per MBP1_SACCE residue, one column per MBP1_MYSPE residue ...
dotMat <- matrix(0, nrow = length(MBP1_SACCE), ncol = length(MBP1_MYSPE))

# ... then we loop over the sequences and store the scores in the matrix.
# seq_along() is used instead of 1:length(): for an empty sequence (e.g. a
# database lookup that matched nothing) 1:length() would iterate over c(1, 0),
# whereas seq_along() simply performs no iterations.
for (i in seq_along(MBP1_SACCE)) {
  for (j in seq_along(MBP1_MYSPE)) {
    dotMat[i, j] <- BLOSUM62[MBP1_SACCE[i], MBP1_MYSPE[j]]
  }
}
# Even though this is a large matrix, this does not take much time ...
# Let's have a look at a small block of the values:
dotMat[1:10, 1:10]

# Rows in this matrix correspond to an amino acid from MBP1_SACCE, columns in
# the matrix correspond to an amino acid from MBP1_MYSPE.

# To plot this, we use the image() function. Here, with default parameters.
image(dotMat)

# Be patient, this takes a few moments to render: more than 500,000 values.
# Nice.
# What do you expect?
# What would similar sequences look like?
# What do you see?

# You might notice a thin line of yellow along the diagonal, moving
# approximately from bottom left to top right, fading in and out of existence.
# This is the signature of extended sequence similarity.

# Let's magnify this a bit by looking at only the first 200 amino acids ...
image(dotMat[1:200, 1:200])

# ... and, according to our normal writing convention, we would like the
# diagonal to run from top-left to bottom-right since we write from left to
# right and from top to bottom...
image(dotMat[1:200, 1:200], ylim = c(1, 0))

# ... and we would like the range of the x- and y- axis to correspond to the
# sequence position ...
image(x = 1:200, y = 1:200, dotMat[1:200, 1:200], ylim = c(200, 1))

# ... and labels! Axis labels would be nice. Note that image() draws the ROWS
# of the matrix along the x-axis and the COLUMNS along the y-axis: the x-axis
# therefore shows MBP1_SACCE positions and the y-axis MBP1_MYSPE positions ...
image(x = 1:200, y = 1:200, dotMat[1:200, 1:200], ylim = c(200, 1),
      xlab = "MBP1_SACCE", ylab = "MBP1_MYSPE")

# ... and why don't we have axis-numbers on all four sides? Go, make that right
# too ...
len <- 200
ticks <- c(1, seq(10, len, by = 10))   # same tick positions on every side
image(x = 1:len, y = 1:len, dotMat[1:len, 1:len], ylim = c(len, 1),
      xlab = "MBP1_SACCE", ylab = "MBP1_MYSPE", axes = FALSE)
box()
for (side in 1:4) {                    # 1: bottom, 2: left, 3: top, 4: right
  axis(side, at = ticks)
}
# ... you get the idea, we can infinitely customize our plot. However a good way
# to do this is to develop a particular view for, say, a report or publication
# in a script and then put it into a function. I have put a function into the
# utilities file and called it dotPlot2(). Why not dotPlot() ... that's because
# there already is a dotPlot() function in the seqinr package. Both are called
# here with the same two character vectors so the results can be compared:
seqinr::dotPlot(MBP1_SACCE, MBP1_MYSPE) # seqinr
dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE") # ours
# Which one do you prefer? You can probably see the block patterns that arise
# from segments of repetitive, low complexity sequence. But you probably have to
# look very closely to discern the faint diagonals that correspond to similar
# sequence.
# The faint diagonals can be made to stand out against the distributed noise
# by filtering the dot matrix: every value in the matrix is replaced by a
# weighted average of its neighborhood. The weights below form a 5 x 5 pattern
# that is 1 along the main diagonal and 0 everywhere else, which enhances
# diagonally repeated values. Here is this diagonal-filter, written row by row:
myFilter <- matrix(c(1, 0, 0, 0, 0,
                     0, 1, 0, 0, 0,
                     0, 0, 1, 0, 0,
                     0, 0, 0, 1, 0,
                     0, 0, 0, 0, 1),
                   nrow = 5, byrow = TRUE)
# I have added the option to read such filters (or others that you could define
# on your own) as a parameter of the function: the filter matrix is passed in
# the argument f.
dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE", f = myFilter)
# I think the result shows quite nicely how the two sequences are globally
# related and where the regions of sequence similarity are. Play with this a bit
# ... Can you come up with a better filter? If so, eMail us.
# = 2 Tasks ===============================================================
# [END]

File diff suppressed because it is too large Load Diff

View File

@ -1,365 +1,365 @@
# tocID <- "BIN-ALI-Optimal_sequence_alignment.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-Optimal_sequence_alignment unit.
#
# ==============================================================================
# Version: 1.7.1
#
# Date: 2017-09 - 2020-10
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.7.1 add jsonlite:: to fromJSON() in code sample and ./myScripts/
# 1.7 2020 updates
# 1.6 Maintenance
# 1.5 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 1.4 Pull s2c() from seqinr package, rather than loading the
# entire library.
# 1.3 Updated confirmation task with correct logic
# 1.2 Added missing load of seqinr package
# 1.1 Update annotation file logic - it could already have been
# prepared in the BIN-FUNC-Annotation unit.
# 1.0.1 bugfix
# 1.0 First 2017 live version.
# 0.1 First code copied from 2016 material.
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------------------------------------------
#TOC> 1 Prepare 58
#TOC> 2 Biostrings Pairwise Alignment 75
#TOC> 2.1 Optimal global alignment 93
#TOC> 2.2 Optimal local alignment 156
#TOC> 3 APSES Domain annotation by alignment 180
#TOC> 4 Update your database script 261
#TOC> 4.1 Preparing an annotation file ... 267
#TOC> 4.1.1 If you HAVE NOT done the BIN-FUNC-Annotation unit 269
#TOC> 4.1.2 If you HAVE done the BIN-FUNC-Annotation unit 314
#TOC> 4.2 Execute and Validate 338
#TOC>
#TOC> ==========================================================================
# = 1 Prepare =============================================================
# Install seqinr if it is not available yet; the code below calls
# seqinr::s2c() to split strings into single characters.
if (! requireNamespace("seqinr", quietly=TRUE)) {
install.packages("seqinr")
}
# You can get package information with the following commands:
# library(help = seqinr) # basic information
# browseVignettes("seqinr") # available vignettes
# data(package = "seqinr") # available datasets
# You need to recreate the protein database that you have constructed in the
# BIN-Storing_data unit. The code below reads the database from the object
# myDB (presumably created by this script - verify if myDB is missing).
source("./myScripts/makeProteinDB.R")
# = 2 Biostrings Pairwise Alignment =======================================
# Install BiocManager and, through it, Biostrings - but only if they are not
# available yet. Nothing is attached: Biostrings functions are called with the
# Biostrings::<function>() idiom below.
if (!requireNamespace("BiocManager", quietly=TRUE)) {
install.packages("BiocManager")
}
if (!requireNamespace("Biostrings", quietly=TRUE)) {
BiocManager::install("Biostrings")
}
# Package information:
# library(help = Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets
# Biostrings stores sequences in "XString" objects. Once we have converted our
# target sequences to AAString objects, the alignment itself is straightforward.
# == 2.1 Optimal global alignment ==========================================
# The pairwiseAlignment() function was written to behave
# exactly like the functions you encountered on the EMBOSS server.
# First: make AAString objects ...
sel <- myDB$protein$name == "MBP1_SACCE"
aaMBP1_SACCE <- Biostrings::AAString(myDB$protein$sequence[sel])
sel <- myDB$protein$name == paste0("MBP1_", biCode(MYSPE))
aaMBP1_MYSPE <- Biostrings::AAString(myDB$protein$sequence[sel])

# The help page is requested with an explicit package prefix: this script
# only uses the <package>::<function>() idiom and does not attach Biostrings,
# so a bare ?pairwiseAlignment would not find the topic.
?Biostrings::pairwiseAlignment

# ... and align.
# Global optimal alignment with end-gap penalties is default.
ali1 <- Biostrings::pairwiseAlignment(
  aaMBP1_SACCE,
  aaMBP1_MYSPE,
  substitutionMatrix = "BLOSUM62",
  gapOpening = 10,
  gapExtension = 0.5)

str(ali1)  # ... it's complicated

# This is a Biostrings alignment object. But we can use Biostrings functions to
# tame it:
ali1
Biostrings::writePairwiseAlignments(ali1)  # That should look familiar

# And we can make the internal structure work for us (@ is for classes as
# $ is for lists ...)
str(ali1@pattern)
ali1@pattern
ali1@pattern@range
ali1@pattern@indel
ali1@pattern@mismatch

# or work with "normal" R functions
# the alignment length
nchar(as.character(ali1@pattern))

# the number of identities: split both aligned strings into single characters
# and count the positions at which they are equal
sum(seqinr::s2c(as.character(ali1@pattern)) ==
    seqinr::s2c(as.character(ali1@subject)))

# ... e.g. to calculate the percentage of identities
100 *
  sum(seqinr::s2c(as.character(ali1@pattern)) ==
      seqinr::s2c(as.character(ali1@subject))) /
  nchar(as.character(ali1@pattern))
# ... which should be the same as reported in the writePairwiseAlignments()
# output. Awkward to type? Then it calls for a function:
#
percentID <- function(al) {
  # Purpose:
  #   Compute the percent identity of a Biostrings alignment object.
  # Parameters:
  #   al   an alignment object whose @pattern and @subject slots are
  #        converted to the two aligned strings by as.character().
  # Value:
  #   100 * (number of positions at which the aligned pattern and the aligned
  #   subject carry the same character) / (number of characters of the
  #   aligned pattern).
  alignedPattern <- as.character(al@pattern)
  patternChars <- seqinr::s2c(alignedPattern)
  subjectChars <- seqinr::s2c(as.character(al@subject))
  100 * sum(patternChars == subjectChars) / nchar(alignedPattern)
}

percentID(ali1)
# == 2.2 Optimal local alignment ===========================================
# Compare with local optimal alignment (like EMBOSS Water)
# type = "local" requests a local alignment. The gap penalties (50 / 10) are
# much higher than those used for the global alignment ali1 (10 / 0.5).
ali2 <- Biostrings::pairwiseAlignment(
aaMBP1_SACCE,
aaMBP1_MYSPE,
type = "local",
substitutionMatrix = "BLOSUM62",
gapOpening = 50,
gapExtension = 10)
Biostrings::writePairwiseAlignments(ali2)
# This has probably only aligned the N-terminal DNA binding domain - but that
# one has quite high sequence identity:
percentID(ali2)
# == TASK: ==
# Compare the two alignments. I have weighted the local alignment heavily
# towards an ungapped alignment by setting very high gap penalties. Try changing
# the gap penalties and see what happens: how does the number of indels change,
# how does the length of indels change...
# = 3 APSES Domain annotation by alignment ================================
# In this section we define the MYSPE APSES sequence by performing a global,
# optimal sequence alignment of the yeast APSES domain with the full length
# protein sequence of the protein that was the most similar to the yeast APSES
# domain.
#
# I have annotated the yeast APSES domain as a feature in the
# database. To view the annotation, we can retrieve it via the proteinID and
# featureID. Here is the yeast protein ID:
(proID <- myDB$protein$ID[myDB$protein$name == "MBP1_SACCE"])

# ... and if you look at the feature table, you can identify the feature ID
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])

# ... and with the two IDs we can get the ID of the annotation that joins
# this protein with this feature from the annotation table
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
                             myDB$annotation$featureID == ftrID])

# Show the complete annotation record. The row must be selected through the
# proteinID and featureID columns (the same condition that defined fanID);
# comparing the annotation's own ID column against proID and ftrID would not
# select the intended record.
myDB$annotation[myDB$annotation$proteinID == proID &
                myDB$annotation$featureID == ftrID, ]

# The annotation record contains the start and end coordinates which we can use
# to define the APSES domain sequence with a substr() expression.
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
                 start,
                 end))
# Lots of code. But don't get lost. Let's recapitulate what we have done: we
# have selected from the sequence column of the protein table the sequence whose
# name is "MBP1_SACCE", and selected from the annotation table the start
# and end coordinates of the annotation that joins an "APSES fold" feature with
# the sequence, and used the start and end coordinates to extract a substring.
# Let's convert this to an AAString and assign it:
# NOTE(review): the object name reads "aaMB1_..." (not "aaMBP1_..."); it is
# used consistently in the alignment call below.
aaMB1_SACCE_APSES <- Biostrings::AAString(apses)
# Now let's align these two sequences of very different length without end-gap
# penalties using the "overlap" type. "overlap" turns the
# end-gap penalties off and that is crucially important since
# the sequences have very different length.
# The first argument (the yeast APSES domain) is the "pattern", the second
# argument (the full-length MYSPE protein) is the "subject" of the alignment.
aliApses <- Biostrings::pairwiseAlignment(
aaMB1_SACCE_APSES,
aaMBP1_MYSPE,
type = "overlap",
substitutionMatrix = "BLOSUM62",
gapOpening = 10,
gapExtension = 0.5)
# Inspect the result. The aligned sequences should be clearly
# homologous, and have (almost) no indels. The entire "pattern"
# sequence from QIYSAR ... to ... KPLFDF should be matched
# with the "subject". Is this correct?
Biostrings::writePairwiseAlignments(aliApses)
# If this is correct, you can extract the matched sequence from
# the alignment object. The syntax is a bit different from what
# you have seen before: this is an "S4 object", not a list. No
# worries: as.character() returns a normal string.
as.character(aliApses@subject)
# Now, what are the aligned start and end coordinates? You can read them from
# the output of writePairwiseAlignments(), or you can get them from the range of
# the match.
str(aliApses@subject@range)
# start is:
aliApses@subject@range@start
# ... and end is: (start + width - 1, i.e. the last aligned subject position)
aliApses@subject@range@start + aliApses@subject@range@width - 1
# = 4 Update your database script =========================================
# Since we have this feature defined now, we can create a feature annotation
# right away and store it in myDB.
# == 4.1 Preparing an annotation file ... ==================================
#
# === 4.1.1 If you HAVE NOT done the BIN-FUNC-Annotation unit
#
#
# You DON'T already have a file called "<MYSPE>-Annotations.json" in the
# ./myScripts/ directory:
#
# - Make a copy of the file "./data/refAnnotations.json" and put it in your
# myScripts/ directory.
#
# - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.
# if MYSPE is called "Cryptococcus neoformans", your file should be called
# "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is
# "MBP1_CRYNE").
#
# - Open the file in the RStudio editor and delete all blocks for
# the Mbp1 protein annotations except the first one.
#
# - From that block, delete all lines except for the line that says:
#
# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"},
#
# - Then delete the comma at the end of the line (your file will just have
# this one annotation).
#
# - Edit that annotation: change MBP1_SACCE to MBP1_<MYSPE> and change the
# "start" and "end" features to the coordinates you just discovered for the
# APSES domain in your sequence.
#
# - Save the file in your myScripts/ directory
#
# - Validate your file online at https://jsonlint.com/
#
# - Update your "./myScripts/makeProteinDB.R" script to load your new
# annotation when you recreate the database. Open the script in the
# RStudio editor, and add the following command at the end:
#
# myDB <- dbAddAnnotation(myDB,
# jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json"))
# ^^^^^^^
# edit this!
# - save and close the file.
#
# Then SKIP the next section.
#
#
# === 4.1.2 If you HAVE done the BIN-FUNC-Annotation unit
#
#
# You DO already have a file called "<MYSPE>-Annotations.json" in the
# ./myScripts/ directory:
#
# - Open the file in the RStudio editor.
#
# - Below the last feature lines (but before the closing "]") add the
# following feature line (without the "#")
#
# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"}
#
# - Edit that annotation: change MBP1_SACCE to MBP1_<MYSPE> and change the
# "start" and "end" features to the coordinates you just discovered for the
# APSES domain in your sequence.
#
# - Add a comma after the preceding feature line.
#
# - Save your file.
#
# - Validate your file online at https://jsonlint.com/
#
#
# == 4.2 Execute and Validate ==============================================
#
# - source() your database creation script:
#
# source("./myScripts/makeProteinDB.R")
#
# This should run without errors or warnings. If it doesn't work and you
# can't figure out quickly what's happening, ask on the mailing list for
# help.
#
# - Confirm
# The following commands should retrieve the correct start and end
# coordinates and sequence of the MBP1_MYSPE APSES domain:
# Row index of the MYSPE Mbp1 protein in the protein table, and its ID ...
sel <- which(myDB$protein$name == paste0("MBP1_", biCode(MYSPE)))
(proID <- myDB$protein$ID[sel])

# ... the ID of the "APSES fold" feature ...
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])

# ... and the ID of the annotation that joins this protein and this feature.
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
                             myDB$annotation$featureID == ftrID])

# The start and end coordinates stored in that annotation define the
# substring of the protein sequence that is the APSES domain.
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
                 start,
                 end))
# [END]
# tocID <- "BIN-ALI-Optimal_sequence_alignment.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-Optimal_sequence_alignment unit.
#
# ==============================================================================
# Version: 1.7.1
#
# Date: 2017-09 - 2020-10
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.7.1 add jsonlite:: to fromJSON() in code sample and ./myScripts/
# 1.7 2020 updates
# 1.6 Maintenance
# 1.5 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 1.4 Pull s2c() from seqinr package, rather than loading the
# entire library.
# 1.3 Updated confirmation task with correct logic
# 1.2 Added missing load of seqinr package
# 1.1 Update annotation file logic - it could already have been
# prepared in the BIN-FUNC-Annotation unit.
# 1.0.1 bugfix
# 1.0 First 2017 live version.
# 0.1 First code copied from 2016 material.
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------------------------------------------
#TOC> 1 Prepare 58
#TOC> 2 Biostrings Pairwise Alignment 75
#TOC> 2.1 Optimal global alignment 93
#TOC> 2.2 Optimal local alignment 156
#TOC> 3 APSES Domain annotation by alignment 180
#TOC> 4 Update your database script 261
#TOC> 4.1 Preparing an annotation file ... 267
#TOC> 4.1.1 If you HAVE NOT done the BIN-FUNC-Annotation unit 269
#TOC> 4.1.2 If you HAVE done the BIN-FUNC-Annotation unit 314
#TOC> 4.2 Execute and Validate 338
#TOC>
#TOC> ==========================================================================
# = 1 Prepare =============================================================
# Install seqinr if it is not available yet; the code below calls
# seqinr::s2c() to split strings into single characters.
if (! requireNamespace("seqinr", quietly=TRUE)) {
install.packages("seqinr")
}
# You can get package information with the following commands:
# library(help = seqinr) # basic information
# browseVignettes("seqinr") # available vignettes
# data(package = "seqinr") # available datasets
# You need to recreate the protein database that you have constructed in the
# BIN-Storing_data unit. The code below reads the database from the object
# myDB (presumably created by this script - verify if myDB is missing).
source("./myScripts/makeProteinDB.R")
# = 2 Biostrings Pairwise Alignment =======================================
# Install BiocManager and, through it, Biostrings - but only if they are not
# available yet. Nothing is attached: Biostrings functions are called with the
# Biostrings::<function>() idiom below.
if (!requireNamespace("BiocManager", quietly=TRUE)) {
install.packages("BiocManager")
}
if (!requireNamespace("Biostrings", quietly=TRUE)) {
BiocManager::install("Biostrings")
}
# Package information:
# library(help = Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets
# Biostrings stores sequences in "XString" objects. Once we have converted our
# target sequences to AAString objects, the alignment itself is straightforward.
# == 2.1 Optimal global alignment ==========================================
# The pairwiseAlignment() function was written to behave
# exactly like the functions you encountered on the EMBOSS server.
# First: make AAString objects ...
sel <- myDB$protein$name == "MBP1_SACCE"
aaMBP1_SACCE <- Biostrings::AAString(myDB$protein$sequence[sel])
sel <- myDB$protein$name == paste0("MBP1_", biCode(MYSPE))
aaMBP1_MYSPE <- Biostrings::AAString(myDB$protein$sequence[sel])

# The help page is requested with an explicit package prefix: this script
# only uses the <package>::<function>() idiom and does not attach Biostrings,
# so a bare ?pairwiseAlignment would not find the topic.
?Biostrings::pairwiseAlignment

# ... and align.
# Global optimal alignment with end-gap penalties is default.
ali1 <- Biostrings::pairwiseAlignment(
  aaMBP1_SACCE,
  aaMBP1_MYSPE,
  substitutionMatrix = "BLOSUM62",
  gapOpening = 10,
  gapExtension = 0.5)

str(ali1)  # ... it's complicated

# This is a Biostrings alignment object. But we can use Biostrings functions to
# tame it:
ali1
Biostrings::writePairwiseAlignments(ali1)  # That should look familiar

# And we can make the internal structure work for us (@ is for classes as
# $ is for lists ...)
str(ali1@pattern)
ali1@pattern
ali1@pattern@range
ali1@pattern@indel
ali1@pattern@mismatch

# or work with "normal" R functions
# the alignment length
nchar(as.character(ali1@pattern))

# the number of identities: split both aligned strings into single characters
# and count the positions at which they are equal
sum(seqinr::s2c(as.character(ali1@pattern)) ==
    seqinr::s2c(as.character(ali1@subject)))

# ... e.g. to calculate the percentage of identities
100 *
  sum(seqinr::s2c(as.character(ali1@pattern)) ==
      seqinr::s2c(as.character(ali1@subject))) /
  nchar(as.character(ali1@pattern))
# ... which should be the same as reported in the writePairwiseAlignments()
# output. Awkward to type? Then it calls for a function:
#
percentID <- function(al) {
  # Purpose:
  #   Compute the percent identity of a Biostrings alignment object.
  # Parameters:
  #   al   an alignment object whose @pattern and @subject slots are
  #        converted to the two aligned strings by as.character().
  # Value:
  #   100 * (number of positions at which the aligned pattern and the aligned
  #   subject carry the same character) / (number of characters of the
  #   aligned pattern).
  alignedPattern <- as.character(al@pattern)
  patternChars <- seqinr::s2c(alignedPattern)
  subjectChars <- seqinr::s2c(as.character(al@subject))
  100 * sum(patternChars == subjectChars) / nchar(alignedPattern)
}

percentID(ali1)
# == 2.2 Optimal local alignment ===========================================
# Compare with local optimal alignment (like EMBOSS Water)
# type = "local" requests a local alignment. The gap penalties (50 / 10) are
# much higher than those used for the global alignment ali1 (10 / 0.5).
ali2 <- Biostrings::pairwiseAlignment(
aaMBP1_SACCE,
aaMBP1_MYSPE,
type = "local",
substitutionMatrix = "BLOSUM62",
gapOpening = 50,
gapExtension = 10)
Biostrings::writePairwiseAlignments(ali2)
# This has probably only aligned the N-terminal DNA binding domain - but that
# one has quite high sequence identity:
percentID(ali2)
# == TASK: ==
# Compare the two alignments. I have weighted the local alignment heavily
# towards an ungapped alignment by setting very high gap penalties. Try changing
# the gap penalties and see what happens: how does the number of indels change,
# how does the length of indels change...
# = 3 APSES Domain annotation by alignment ================================
# In this section we define the MYSPE APSES sequence by performing a global,
# optimal sequence alignment of the yeast APSES domain with the full length
# protein sequence of the protein that was the most similar to the yeast APSES
# domain.
#
# I have annotated the yeast APSES domain as a feature in the
# database. To view the annotation, we can retrieve it via the proteinID and
# featureID. Here is the yeast protein ID:
(proID <- myDB$protein$ID[myDB$protein$name == "MBP1_SACCE"])

# ... and if you look at the feature table, you can identify the feature ID
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])

# ... and with the two IDs we can get the ID of the annotation that joins
# this protein with this feature from the annotation table
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
                             myDB$annotation$featureID == ftrID])

# Show the complete annotation record. The row must be selected through the
# proteinID and featureID columns (the same condition that defined fanID);
# comparing the annotation's own ID column against proID and ftrID would not
# select the intended record.
myDB$annotation[myDB$annotation$proteinID == proID &
                myDB$annotation$featureID == ftrID, ]

# The annotation record contains the start and end coordinates which we can use
# to define the APSES domain sequence with a substr() expression.
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
                 start,
                 end))
# Lots of code. But don't get lost. Let's recapitulate what we have done: we
# have selected from the sequence column of the protein table the sequence whose
# name is "MBP1_SACCE", and selected from the annotation table the start
# and end coordinates of the annotation that joins an "APSES fold" feature with
# the sequence, and used the start and end coordinates to extract a substring.
# Let's convert this to an AAstring and assign it:
aaMB1_SACCE_APSES <- Biostrings::AAString(apses)
# Now let's align these two sequences of very different length without end-gap
# penalties using the "overlap" type. "overlap" turns the
# end-gap penalties off and that is crucially important since
# the sequences have very different length.
aliApses <- Biostrings::pairwiseAlignment(
aaMB1_SACCE_APSES,
aaMBP1_MYSPE,
type = "overlap",
substitutionMatrix = "BLOSUM62",
gapOpening = 10,
gapExtension = 0.5)
# Inspect the result. The aligned sequences should be clearly
# homologous, and have (almost) no indels. The entire "pattern"
# sequence from QIYSAR ... to ... KPLFDF should be matched
# with the "subject". Is this correct?
Biostrings::writePairwiseAlignments(aliApses)
# If this is correct, you can extract the matched sequence from
# the alignment object. The syntax is a bit different from what
# you have seen before: this is an "S4 object", not a list. No
# worries: as.character() returns a normal string.
as.character(aliApses@subject)
# Now, what are the aligned start and end coordinates? You can read them from
# the output of writePairwiseAlignments(), or you can get them from the range of
# the match.
str(aliApses@subject@range)
# start is:
aliApses@subject@range@start
# ... and end is:
aliApses@subject@range@start + aliApses@subject@range@width - 1
# = 4 Update your database script =========================================
# Since we have this feature defined now, we can create a feature annotation
# right away and store it in myDB.
# == 4.1 Preparing an annotation file ... ==================================
#
# === 4.1.1 If you HAVE NOT done the BIN-FUNC-Annotation unit
#
#
# You DON'T already have a file called "<MYSPE>-Annotations.json" in the
# ./myScripts/ directory:
#
# - Make a copy of the file "./data/refAnnotations.json" and put it in your
# myScripts/ directory.
#
# - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.
# if MYSPE is called "Cryptococcus neoformans", your file should be called
# "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is
# "MBP1_CRYNE").
#
# - Open the file in the RStudio editor and delete all blocks for
# the Mbp1 protein annotations except the first one.
#
# - From that block, delete all lines except for the line that says:
#
# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"},
#
# - Then delete the comma at the end of the line (your file will just have
# this one annotation).
#
# - Edit that annotation: change MBP1_SACCE to MBP1_<MYSPE> and change the
# "start" and "end" features to the coordinates you just discovered for the
# APSES domain in your sequence.
#
# - Save the file in your myScripts/ directory
#
# - Validate your file online at https://jsonlint.com/
#
# - Update your "./myScripts/makeProteinDB.R" script to load your new
# annotation when you recreate the database. Open the script in the
# RStudio editor, and add the following command at the end:
#
# myDB <- dbAddAnnotation(myDB,
# jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json"))
# ^^^^^^^
# edit this!
# - save and close the file.
#
# Then SKIP the next section.
#
#
# === 4.1.2 If you HAVE done the BIN-FUNC-Annotation unit
#
#
# You DO already have a file called "<MYSPE>-Annotations.json" in the
# ./myScripts/ directory:
#
# - Open the file in the RStudio editor.
#
# - Below the last feature lines (but before the closing "]") add the
# following feature line (without the "#")
#
# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"}
#
# - Edit that annotation: change MBP1_SACCE to MBP1_<MYSPE> and change the
# "start" and "end" features to the coordinates you just discovered for the
# APSES domain in your sequence.
#
# - Add a comma after the preceding feature line.
#
# - Save your file.
#
# - Validate your file online at https://jsonlint.com/
#
#
# == 4.2 Execute and Validate ==============================================
#
# - source() your database creation script:
#
# source("./myScripts/makeProteinDB.R")
#
# This should run without errors or warnings. If it doesn't work and you
# can't figure out quickly what's happening, ask on the mailing list for
# help.
#
# - Confirm
# The following commands should retrieve the correct start and end
# coordinates and sequence of the MBP1_MYSPE APSES domain:
# Select the row of the MYSPE Mbp1 orthologue in the protein table. Its name is
# "MBP1_" followed by the value of biCode(MYSPE) (biCode() is defined
# elsewhere; presumably it returns the species code, like "CRYNE" in
# "MBP1_CRYNE" - confirm).
sel <- which(myDB$protein$name == paste0("MBP1_", biCode(MYSPE)))

# Resolve the protein ID and the ID of the "APSES fold" feature ...
(proID <- myDB$protein$ID[sel])
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])

# ... find the ID of the annotation that joins this protein and this feature ...
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
                             myDB$annotation$featureID == ftrID])

# ... and use the start and end coordinates of that annotation to extract the
# domain sequence from the full-length protein sequence.
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
                 start,
                 end))
# [END]

View File

@ -1,313 +1,313 @@
# tocID <- "BIN-ALI-Similarity.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-Similarity unit.
#
# Version: 1.2
#
# Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 2020 Updates
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 1.0 Refactored for 2017; add aaindex, ternary plot.
# 0.1 First code copied from 2016 material.
#
#
# TODO:
# Update ggtern:: ternary plot to use aacol dots under text
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ----------------------------------------------
#TOC> 1 Amino Acid Properties 43
#TOC> 2 Mutation Data matrix 189
#TOC> 3 Background score 230
#TOC>
#TOC> ==========================================================================
# = 1 Amino Acid Properties ===============================================
# A large collection of amino acid property tables is available via the seqinr
# package:
if (! requireNamespace("seqinr", quietly=TRUE)) {
install.packages("seqinr")
}
# Package information:
# library(help = seqinr) # basic information
# browseVignettes("seqinr") # available vignettes
# data(package = "seqinr") # available datasets
# A true Labor of Love has gone into the compilation of the seqinr "aaindex"
# data:
?aaindex
data(aaindex, package = "seqinr") # load the aaindex list from the package
length(aaindex)
# Here are all the index descriptions
# Print the description ($D) of every index, prefixed with its position in
# the aaindex list. seq_along() also behaves correctly for an empty list,
# where 1:length(x) would iterate over c(1, 0).
for (i in seq_along(aaindex)) {
  cat(paste0(i, ": ", aaindex[[i]]$D, "\n"))
}
# It's a bit cumbersome to search through the descriptions ... here is a
# function to make this easier:
searchAAindex <- function(patt) {
  # Purpose:
  #   Search the description ($D) of every entry of the list "aaindex" for a
  #   regular expression and print the hits.
  # Parameters:
  #   patt  chr  regular expression that is matched against each description
  # Value:
  #   NULL (invisible). The function is called for its side effect: for each
  #   hit one line is printed - the index number, a tab, and the description.
  # Note:
  #   aaindex is not a parameter. It is looked up in the calling workspace,
  #   so it must have been loaded before this function is used.

  # vapply() always returns a logical vector. sapply() returns an empty list
  # if aaindex is empty, and which() can not handle that.
  isHit <- vapply(aaindex,
                  function(x) { length(grep(patt, x$D)) > 0 },
                  logical(1))
  hits <- which(isHit)

  for (i in seq_along(hits)) {
    cat(sprintf("%3d\t%s\n", hits[i], aaindex[[ hits[i] ]]$D))
  }
}
searchAAindex("free energy") # Search for "free energy"
searchAAindex("(size)|(volume)") # Search for "size" or "volume":
# Let's examine ...
# ... a hydrophobicity index
(Y <- aaindex[[528]][c("D", "I")])
# ... a volume index
(V <- aaindex[[150]][c("D", "I")])
# ... and one of our own: side-chain pK values as reported by
# Pace et al. (2009) JBC 284:13285-13289. Side chains that are not ionizable
# are set to 7.4 (physiological pH). The values are stored as a named vector
# in the list element "I", keyed by three-letter amino acid code.
K <- list(I = c(Ala =  7.4, Arg = 12.3, Asn =  7.4, Asp =  3.9, Cys =  8.6,
                Gln =  7.4, Glu =  4.3, Gly =  7.4, His =  6.5, Ile =  7.4,
                Leu =  7.4, Lys = 10.4, Met =  7.4, Phe =  7.4, Pro =  7.4,
                Ser =  7.4, Thr =  7.4, Trp =  7.4, Tyr =  9.8, Val =  7.4))
# Given these biophysical indices, how similar are the amino acids? We have three-dimensions of measures here. Scatterplots can only display two dimensions ...
# pull the names from Y$I, convert them to single letter code, and reorder the
# AACOLS palette accordingly ...
aac <- AACOLS[toupper(seqinr::a(names(Y$I)))]
plot(Y$I, V$I,
xlab = "hydrophobicity", ylab = "volume",
pch = 21,
cex = 6,
col = aac,
bg = aac)
text(Y$I, V$I, names(Y$I), cex = 0.8)
plot(Y$I, K$I,
xlab = "hydrophobicity", ylab = "pK",
pch = 21,
cex = 6,
col = aac,
bg = aac)
text(Y$I, K$I, names(Y$I), cex = 0.8)
# ... but how do we plot 3D data? Plotting into a 3D cube is possible, but such
# plots are in general unintuitive and hard to interpret. One alternative is a
# so-called "ternary plot":
if (! requireNamespace("ggtern", quietly=TRUE)) {
install.packages("ggtern")
}
# Package information:
# library(help = ggtern) # basic information
# browseVignettes("ggtern") # available vignettes
# data(package = "ggtern") # available datasets
# collect into data frame, normalize to (0.05, 0.95)
myDat <- data.frame("phi" = 0.9*(((Y$I-min(Y$I))/(max(Y$I)-min(Y$I))))+0.05,
"vol" = 0.9*(((V$I-min(V$I))/(max(V$I)-min(V$I))))+0.05,
"pK" = 0.9*(((K$I-min(K$I))/(max(K$I)-min(K$I))))+0.05,
stringsAsFactors = FALSE)
rownames(myDat) <- names(Y$I)
ggtern::ggtern(data = myDat,
ggplot2::aes(x = vol,
y = phi,
z = pK,
label = rownames(myDat))) + ggplot2::geom_text()
# This results in a mapping of amino acids relative to each other that is
# similar to the Venn diagram you have seen in the notes.
# ... or we could use principal components analysis, to pull out the
# best projection of the three feature dimensions into two. (Done here without delving
# into the theory ...)
prc <- prcomp(myDat)
# Plot the first two principal components. The points are coloured with aac,
# the AACOLS palette that was ordered like names(Y$I) above - this is also the
# row order of myDat, so colours and points correspond.
plot(prc$x[,1], prc$x[,2], xlab="", ylab="", xaxt="n", yaxt="n",
     pch=19, cex=6, col=aac, cex.main=0.7,
     main="Principal Component Analysis of Amino Acid Features")
text(prc$x[,1], prc$x[,2], names(Y$I), cex = 0.8, col="#00000088")
# This matches the intuition rather well in that "similar" amino acids are close
# on the plot. But we can't interpret the distances in terms of just one of the
# parameters. Whatever - nature has a different way to define similarity:
# mutations to similar amino acids are less likely to break the protein.
# = 2 Mutation Data matrix ================================================
# A mutation data matrix encodes all amino acid pairscores in a matrix.
# The Biostrings package contains the most common mutation data matrices.
if (! requireNamespace("BiocManager", quietly=TRUE)) {
install.packages("BiocManager")
}
if (! requireNamespace("Biostrings", quietly=TRUE)) {
BiocManager::install("Biostrings")
}
# Package information:
# library(help=Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets
# Let's attach the BLOSUM62 mutation data matrix from the package
data(BLOSUM62, package = "Biostrings")
# ... and see what it contains. (You've seen this matrix before.)
BLOSUM62
# We can simply access values via the row/column names (one-letter codes).
# Identical amino acids have high scores ...
BLOSUM62["H", "H"] # Score for a pair of two histidines
BLOSUM62["S", "S"] # Score for a pair of two serines
# Similar amino acids have low positive scores ...
BLOSUM62["L", "I"] # Score for a leucine / isoleucine pair
BLOSUM62["F", "Y"] # Score for a phenylalanine / tyrosine pair
# Dissimilar amino acids have negative scores ...
BLOSUM62["L", "K"] # Score for a leucine / lysine pair
BLOSUM62["Q", "P"] # Score for a glutamine / proline pair
BLOSUM62["R", "W"] # the matrix is symmetric! Compare with the next line:
BLOSUM62["W", "R"]
# = 3 Background score ====================================================
# The mutation data matrix is designed to give high scores to homologous
# sequences, low scores to non-homologous sequences. What score on average
# should we expect for a random sequence?
# If we sample amino acid pairs at random, we will get a score that is the
# average of the individual pairscores in the matrix. Omitting the ambiguity
# codes and the gap character:
sum(BLOSUM62[1:20, 1:20])/400
# But that score could be higher for real sequences, for which the amino acid
# distribution is not random. For example membrane proteins have a large number
# of hydrophobic residues - an alignment of unrelated proteins might produce
# positive scores. And there are other proteins with biased amino acid
# compositions, in particular proteins that interact with multiple other
# proteins. Let's test how this impacts the background score by comparing a
# sequence with shuffled sequences. These have the same composition, but are
# obviously not homologous. The data directory contains the FASTA file for the
# PDB ID 3FG7 - a villin headpiece structure with a large amount of
# low-complexity amino acid sequence ...
aa3FG7 <- Biostrings::readAAStringSet("./data/3FG7.fa")[[1]]
# ... and the FASTA file for the E. coli OmpG outer membrane porin (PDB: 2F1C)
# with an exceptionally high percentage of hydrophobic residues.
aa2F1C <- Biostrings::readAAStringSet("./data/2F1C.fa")[[1]]
# Here is a function that takes two sequences and
# returns their average pairscore.
averagePairScore <- function(a, b, MDM = BLOSUM62) {
  # Purpose:
  #   Returns the average pairscore of two sequences that are compared
  #   position by position (residue i of a with residue i of b; no gaps).
  # Parameters:
  #   a, b  chr  amino acid sequence strings. Both must have the same
  #              number of characters.
  #   MDM   mutation data matrix whose row and column names are the
  #         characters that appear in a and b. Default is BLOSUM62.
  # Value:  num  average pairscore (sum of pairscores / sequence length).
  #              Two empty sequences give NaN (0 / 0).
  # Errors:
  #   stops if a and b differ in length, or if a character is not a
  #   row/column name of MDM (subscript out of bounds).

  a <- unlist(strsplit(a, ""))
  b <- unlist(strsplit(b, ""))

  # Without this check a longer b would be silently truncated to the length
  # of a, and a shorter b would fail with an unhelpful subscript error.
  if (length(a) != length(b)) {
    stop(sprintf("Sequences differ in length (%d vs. %d characters).",
                 length(a), length(b)))
  }

  # A two-column character matrix used as index picks one cell per row:
  # row i of cbind(a, b) selects MDM[ a[i], b[i] ].
  return(sum(MDM[cbind(a, b)]) / length(a))
}
orig3FG7 <- toString(aa3FG7)
orig2F1C <- toString(aa2F1C)
N <- 1000
scores3FG7 <- numeric(N)
scores2F1C <- numeric(N)
for (i in 1:N) {
scores3FG7[i] <- averagePairScore(orig3FG7, toString(sample(aa3FG7)))
scores2F1C[i] <- averagePairScore(orig2F1C, toString(sample(aa2F1C)))
}
# Plot the distributions
hist(scores3FG7,
col="#5599EE33",
breaks = seq(-1.5, 0, by=0.1),
main = "Pairscores for randomly shuffled sequences",
xlab = "Average pairscore from BLOSUM 62")
hist(scores2F1C,
col="#55EE9933",
breaks = seq(-1.5, 0, by=0.1),
add = TRUE)
abline(v = sum(BLOSUM62[1:20, 1:20])/400, col = "firebrick", lwd = 2)
legend('topright',
c("3FG7 (villin)", "2F1C (OmpG)"),
fill = c("#5599EE33", "#55EE9933"), bty = 'n',
inset = 0.1)
# This is an important result: even though we have shuffled significantly biased
# sequences, and the average scores trend above the average of the mutation data
# matrix, the average scores still remain comfortably below zero. This means
# that we can't (in general) improve a high-scoring alignment by simply
# extending it with randomly matched residues. We will only improve the score if
# the similarity of newly added residues is larger than what we expect to get by
# random chance!
# [END]
# tocID <- "BIN-ALI-Similarity.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-Similarity unit.
#
# Version: 1.2
#
# Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 2020 Updates
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 1.0 Refactored for 2017; add aaindex, ternary plot.
# 0.1 First code copied from 2016 material.
#
#
# TODO:
# Update ggtern:: ternary plot to use aacol dots under text
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ----------------------------------------------
#TOC> 1 Amino Acid Properties 43
#TOC> 2 Mutation Data matrix 189
#TOC> 3 Background score 230
#TOC>
#TOC> ==========================================================================
# = 1 Amino Acid Properties ===============================================
# A large collection of amino acid property tables is available via the seqinr
# package:
if (! requireNamespace("seqinr", quietly=TRUE)) {
install.packages("seqinr")
}
# Package information:
# library(help = seqinr) # basic information
# browseVignettes("seqinr") # available vignettes
# data(package = "seqinr") # available datasets
# A true Labor of Love has gone into the compilation of the seqinr "aaindex"
# data:
?aaindex
data(aaindex, package = "seqinr") # load the aaindex list from the package
length(aaindex)
# Here are all the index descriptions
# Print the description ($D) of every index, prefixed with its position in
# the aaindex list. seq_along() also behaves correctly for an empty list,
# where 1:length(x) would iterate over c(1, 0).
for (i in seq_along(aaindex)) {
  cat(paste0(i, ": ", aaindex[[i]]$D, "\n"))
}
# It's a bit cumbersome to search through the descriptions ... here is a
# function to make this easier:
searchAAindex <- function(patt) {
  # Purpose:
  #   Search the description ($D) of every entry of the list "aaindex" for a
  #   regular expression and print the hits.
  # Parameters:
  #   patt  chr  regular expression that is matched against each description
  # Value:
  #   NULL (invisible). The function is called for its side effect: for each
  #   hit one line is printed - the index number, a tab, and the description.
  # Note:
  #   aaindex is not a parameter. It is looked up in the calling workspace,
  #   so it must have been loaded before this function is used.

  # vapply() always returns a logical vector. sapply() returns an empty list
  # if aaindex is empty, and which() can not handle that.
  isHit <- vapply(aaindex,
                  function(x) { length(grep(patt, x$D)) > 0 },
                  logical(1))
  hits <- which(isHit)

  for (i in seq_along(hits)) {
    cat(sprintf("%3d\t%s\n", hits[i], aaindex[[ hits[i] ]]$D))
  }
}
searchAAindex("free energy") # Search for "free energy"
searchAAindex("(size)|(volume)") # Search for "size" or "volume":
# Let's examine ...
# ... a hydrophobicity index
(Y <- aaindex[[528]][c("D", "I")])
# ... a volume index
(V <- aaindex[[150]][c("D", "I")])
# ... and one of our own: side-chain pK values as reported by
# Pace et al. (2009) JBC 284:13285-13289. Side chains that are not ionizable
# are set to 7.4 (physiological pH). The values are stored as a named vector
# in the list element "I", keyed by three-letter amino acid code.
K <- list(I = c(Ala =  7.4, Arg = 12.3, Asn =  7.4, Asp =  3.9, Cys =  8.6,
                Gln =  7.4, Glu =  4.3, Gly =  7.4, His =  6.5, Ile =  7.4,
                Leu =  7.4, Lys = 10.4, Met =  7.4, Phe =  7.4, Pro =  7.4,
                Ser =  7.4, Thr =  7.4, Trp =  7.4, Tyr =  9.8, Val =  7.4))
# Given these biophysical indices, how similar are the amino acids? We have three-dimensions of measures here. Scatterplots can only display two dimensions ...
# pull the names from Y$I, convert them to single letter code, and reorder the
# AACOLS palette accordingly ...
aac <- AACOLS[toupper(seqinr::a(names(Y$I)))]
plot(Y$I, V$I,
xlab = "hydrophobicity", ylab = "volume",
pch = 21,
cex = 6,
col = aac,
bg = aac)
text(Y$I, V$I, names(Y$I), cex = 0.8)
plot(Y$I, K$I,
xlab = "hydrophobicity", ylab = "pK",
pch = 21,
cex = 6,
col = aac,
bg = aac)
text(Y$I, K$I, names(Y$I), cex = 0.8)
# ... but how do we plot 3D data? Plotting into a 3D cube is possible, but such
# plots are in general unintuitive and hard to interpret. One alternative is a
# so-called "ternary plot":
if (! requireNamespace("ggtern", quietly=TRUE)) {
install.packages("ggtern")
}
# Package information:
# library(help = ggtern) # basic information
# browseVignettes("ggtern") # available vignettes
# data(package = "ggtern") # available datasets
# collect into data frame, normalize to (0.05, 0.95)
myDat <- data.frame("phi" = 0.9*(((Y$I-min(Y$I))/(max(Y$I)-min(Y$I))))+0.05,
"vol" = 0.9*(((V$I-min(V$I))/(max(V$I)-min(V$I))))+0.05,
"pK" = 0.9*(((K$I-min(K$I))/(max(K$I)-min(K$I))))+0.05,
stringsAsFactors = FALSE)
rownames(myDat) <- names(Y$I)
ggtern::ggtern(data = myDat,
ggplot2::aes(x = vol,
y = phi,
z = pK,
label = rownames(myDat))) + ggplot2::geom_text()
# This results in a mapping of amino acids relative to each other that is
# similar to the Venn diagram you have seen in the notes.
# ... or we could use principal components analysis, to pull out the
# best projection of the three feature dimensions into two. (Done here without delving
# into the theory ...)
prc <- prcomp(myDat)
# Plot the first two principal components. The points are coloured with aac,
# the AACOLS palette that was ordered like names(Y$I) above - this is also the
# row order of myDat, so colours and points correspond.
plot(prc$x[,1], prc$x[,2], xlab="", ylab="", xaxt="n", yaxt="n",
     pch=19, cex=6, col=aac, cex.main=0.7,
     main="Principal Component Analysis of Amino Acid Features")
text(prc$x[,1], prc$x[,2], names(Y$I), cex = 0.8, col="#00000088")
# This matches the intuition rather well in that "similar" amino acids are close
# on the plot. But we can't interpret the distances in terms of just one of the
# parameters. Whatever - nature has a different way to define similarity:
# mutations to similar amino acids are less likely to break the protein.
# = 2 Mutation Data matrix ================================================
# A mutation data matrix encodes all amino acid pairscores in a matrix.
# The Biostrings package contains the most common mutation data matrices.
if (! requireNamespace("BiocManager", quietly=TRUE)) {
install.packages("BiocManager")
}
if (! requireNamespace("Biostrings", quietly=TRUE)) {
BiocManager::install("Biostrings")
}
# Package information:
# library(help=Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets
# Let's attach the BLOSUM62 mutation data matrix from the package
data(BLOSUM62, package = "Biostrings")
# ... and see what it contains. (You've seen this matrix before.)
BLOSUM62
# We can simply access values via the row/column names (one-letter codes).
# Identical amino acids have high scores ...
BLOSUM62["H", "H"] # Score for a pair of two histidines
BLOSUM62["S", "S"] # Score for a pair of two serines
# Similar amino acids have low positive scores ...
BLOSUM62["L", "I"] # Score for a leucine / isoleucine pair
BLOSUM62["F", "Y"] # Score for a phenylalanine / tyrosine pair
# Dissimilar amino acids have negative scores ...
BLOSUM62["L", "K"] # Score for a leucine / lysine pair
BLOSUM62["Q", "P"] # Score for a glutamine / proline pair
BLOSUM62["R", "W"] # the matrix is symmetric! Compare with the next line:
BLOSUM62["W", "R"]
# = 3 Background score ====================================================
# The mutation data matrix is designed to give high scores to homologous
# sequences, low scores to non-homologous sequences. What score on average
# should we expect for a random sequence?
# If we sample amino acid pairs at random, we will get a score that is the
# average of the individual pairscores in the matrix. Omitting the ambiguity
# codes and the gap character:
sum(BLOSUM62[1:20, 1:20])/400
# But that score could be higher for real sequences, for which the amino acid
# distribution is not random. For example membrane proteins have a large number
# of hydrophobic residues - an alignment of unrelated proteins might produce
# positive scores. And there are other proteins with biased amino acid
# compositions, in particular proteins that interact with multiple other
# proteins. Let's test how this impacts the background score by comparing a
# sequence with shuffled sequences. These have the same composition, but are
# obviously not homologous. The data directory contains the FASTA file for the
# PDB ID 3FG7 - a villin headpiece structure with a large amount of
# low-complexity amino acid sequence ...
aa3FG7 <- Biostrings::readAAStringSet("./data/3FG7.fa")[[1]]
# ... and the FASTA file for the E. coli OmpG outer membrane porin (PDB: 2F1C)
# with an exceptionally high percentage of hydrophobic residues.
aa2F1C <- Biostrings::readAAStringSet("./data/2F1C.fa")[[1]]
# Here is a function that takes two sequences and
# returns their average pairscore.
averagePairScore <- function(a, b, MDM = BLOSUM62) {
  # Purpose:
  #   Returns the average pairscore of two sequences that are compared
  #   position by position (residue i of a with residue i of b; no gaps).
  # Parameters:
  #   a, b  chr  amino acid sequence strings. Both must have the same
  #              number of characters.
  #   MDM   mutation data matrix whose row and column names are the
  #         characters that appear in a and b. Default is BLOSUM62.
  # Value:  num  average pairscore (sum of pairscores / sequence length).
  #              Two empty sequences give NaN (0 / 0).
  # Errors:
  #   stops if a and b differ in length, or if a character is not a
  #   row/column name of MDM (subscript out of bounds).

  a <- unlist(strsplit(a, ""))
  b <- unlist(strsplit(b, ""))

  # Without this check a longer b would be silently truncated to the length
  # of a, and a shorter b would fail with an unhelpful subscript error.
  if (length(a) != length(b)) {
    stop(sprintf("Sequences differ in length (%d vs. %d characters).",
                 length(a), length(b)))
  }

  # A two-column character matrix used as index picks one cell per row:
  # row i of cbind(a, b) selects MDM[ a[i], b[i] ].
  return(sum(MDM[cbind(a, b)]) / length(a))
}
orig3FG7 <- toString(aa3FG7)
orig2F1C <- toString(aa2F1C)
N <- 1000
scores3FG7 <- numeric(N)
scores2F1C <- numeric(N)
for (i in 1:N) {
scores3FG7[i] <- averagePairScore(orig3FG7, toString(sample(aa3FG7)))
scores2F1C[i] <- averagePairScore(orig2F1C, toString(sample(aa2F1C)))
}
# Plot the distributions
hist(scores3FG7,
col="#5599EE33",
breaks = seq(-1.5, 0, by=0.1),
main = "Pairscores for randomly shuffled sequences",
xlab = "Average pairscore from BLOSUM 62")
hist(scores2F1C,
col="#55EE9933",
breaks = seq(-1.5, 0, by=0.1),
add = TRUE)
abline(v = sum(BLOSUM62[1:20, 1:20])/400, col = "firebrick", lwd = 2)
legend('topright',
c("3FG7 (villin)", "2F1C (OmpG)"),
fill = c("#5599EE33", "#55EE9933"), bty = 'n',
inset = 0.1)
# This is an important result: even though we have shuffled significantly biased
# sequences, and the average scores trend above the average of the mutation data
# matrix, the average scores still remain comfortably below zero. This means
# that we can't (in general) improve a high-scoring alignment by simply
# extending it with randomly matched residues. We will only improve the score if
# the similarity of newly added residues is larger than what we expect to get by
# random chance!
# [END]

View File

@ -1,216 +1,216 @@
# tocID <- "BIN-Data_integration.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-Data_integration unit.
#
# Version: 1.2
#
# Date: 2018-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 2020 Maintenance and updates
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 1.0.1 Bugfix: UniProt ID Mapping service API change
# 1.0 First live version
#
#
# TODO:
# Develop a fungi-specific BioMart example.
# (cf.
# https://cran.r-project.org/web/packages/biomartr/vignettes/Functional_Annotation.html )
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -------------------------------------------------
#TOC> 1 Identifier mapping 42
#TOC> 2 Cross-referencing tables 165
#TOC>
#TOC> ==========================================================================
# = 1 Identifier mapping ==================================================
# UniProt provides a well-designed ID mapping tool that can be accessed
# online at http://www.uniprot.org/mapping/
#
# Here we will use the UniProt Web API for this tool to map identifiers. The
# UniProt ID mapping service supports a "RESTful API": responses can be obtained
# simply via a Web browser's request. Such requests are commonly sent via the
# GET or POST verbs that a Webserver responds to, when a client asks for data.
# GET requests are visible in the URL of the request; POST requests are not
# directly visible, they are commonly used to send the contents of forms, or
# when transmitting larger, complex data items. The UniProt ID mapping service
# can accept long lists of IDs, thus using the POST mechanism makes sense. GET()
# and POST() functions are part of the httr package.
# To begin, we load httr, which supports sending and receiving data via the
# http protocol, just like a Web browser.
if (! requireNamespace("httr", quietly=TRUE)) {
install.packages("httr")
}
# Package information:
# library(help = httr) # basic information
# browseVignettes("httr") # available vignettes
# data(package = "httr") # available datasets
# We will walk through the process with the refSeqID
# of yeast Mbp1 and Swi4, and we will also enter a dummy ID to check what
# happens if the ID can't be mapped:
myQueryIDs <- "NP_010227 NP_00000 NP_011036"
# The UniProt ID mapping service API is very straightforward to use: just define
# the URL of the server and send a list of items labelled as "query" in the body
# of the request. GET() and POST() are functions from httr.
# Note. A recent bug in the interaction between the server expectations and the
# curl client libraries requires the following initialization
httr::set_config(httr::config(http_version = 0))
# cf. https://stackoverflow.com/questions/44610845/stream-error-in-the-http-2-framing-layer-bigrquery-commands-error-in-r-studio-b
URL <- "https://www.uniprot.org/mapping/"
response <- httr::POST(URL,
body = list(from = "P_REFSEQ_AC", # Refseq Protein
to = "ACC", # UniProt ID
format = "tab",
query = myQueryIDs))
cat(httr::content(response))
# We need to check the status code - if it is not 200, an error occurred and we
# can't process the result:
httr::status_code(response)
# If the query is successful, tabbed text is returned. We can assign that to a
# data frame. Note that read.delim() can read data directly from a char
# object if it is passed as the "text" argument, in the place of a file name.
# read.delim() then creates the text connection itself. (If we would open a
# textConnection() ourselves and never close it, R would later complain about
# "closing unused connection".)
myMappedIDs <- read.delim(text = httr::content(response),
                          sep = "\t",
                          stringsAsFactors = FALSE)
myMappedIDs
# If this works as expected, you should see:
# From To
# 1 NP_010227 P39678
# 2 NP_011036 P25302
#
# ... and note that there are only two entries, because nothing was returned
# for the dummy "RefSeq ID" NP_00000
# If the query can't be fulfilled because of a problem with the server, a
# WebPage is returned. But the server status is also returned and we can check
# the status code. I have lately gotten many "503" status codes: Server Not
# Available...
# We wrap this into a function:
myIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
  # Use UniProt ID mapping service to map one or more IDs
  # Parameters:
  #    s  char  A string of separated IDs
  #    mapFrom  char  the database in which the IDs in s are valid. Default
  #                     is RefSeq protein
  #    mapTo    char  the database in which the target IDs are valid. Default
  #                     is UniProtKB
  # Value
  #    a data frame of mapped IDs, with column names From and To, or an
  #    empty data frame if the mapping was unsuccessful. No rows are returned
  #    for IDs that are not mapped.
  # Side effects
  #    Sends a POST request to the UniProt server. Changes the global httr
  #    configuration (http_version = 0) and raises a warning if the server
  #    status is not 200.

  # Initialize curl
  httr::set_config(httr::config(http_version = 0))

  URL <- "https://www.uniprot.org/uploadlists/"
  response <- httr::POST(URL,
                         body = list(from = mapFrom,
                                     to = mapTo,
                                     format = "tab",
                                     query = s))

  if (httr::status_code(response) == 200) { # 200: oK
    # Request the body explicitly as a character string and pass it as the
    # "text" argument: read.delim() then opens and closes the text connection
    # itself. A textConnection() that we create here would never be closed.
    myMap <- read.delim(text = httr::content(response,
                                             as = "text",
                                             encoding = "UTF-8"),
                        sep = "\t",
                        stringsAsFactors = FALSE)
    colnames(myMap) <- c("From", "To")
  } else {
    myMap <- data.frame()
    warning(paste("No uniProt ID mapping returned:",
                  "server sent status",
                  httr::status_code(response)))
  }

  return(myMap)
}
# Try it out ...
myIDmap("NP_010227 NP_011036 NP_012881 NP_013729 NP_012165")
# A function UniProtIDmap() is in the ABC-dbUtilities.R script and it is loaded
# into your workspace on startup.
# = 2 Cross-referencing tables ============================================
# Sometimes we get the IDs we need to map in a large table, e.g. from a list of
# genes in a model organism database such as SGD, or from the HUGO Gene
# Nomenclature Committee. How do we map one set of identifiers to another one?
# The function to use is match().
# Here is a tiny set of identifiers taken from a much larger table to
# illustrate the principle:
#
myIDs <- data.frame(uID = c("P38903", "P31383", "P47177", "P47096", "Q07747",
"Q08641", "P47129", "P52910", "P00330", "P81450"),
name = c("2A5D", "2AAA", "2NDP", "3HAO", "AAD4",
"AB140", "ACF4", "ACS2", "ADH1", "ATP18"),
refID = c("NP_014657", "NP_009386",
"NP_012683", "NP_012559",
"NP_010038", "NP_014882",
"NP_012616", "NP_013254",
"NP_014555", "NP_013629"))
myIDs
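# Say we want to map "NP_010038", "NP_999999", and "NP_013629", in that order,
# to their gene names. ("NP_999999" is a dummy ID that is not in the table; it
# shows what happens when an ID can not be mapped.)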
myQuery <- c("NP_010038", "NP_999999", "NP_013629")
# %in% will only tell us if these IDs are present in the table:
myQuery %in% myIDs$refID
# ... but not where they are located. But match() does what we need here:
match(myQuery, myIDs$refID)
# ... and we can use the result to subset the column that we want to map to:
myIDs$name[match(myQuery, myIDs$refID)]
# Note that the output preserves the NA - i.e. the length of the mapped
# values is exactly the same as the length of the query.
# task: map the three genes to their UniProt Identifier.
#
# Note: if you want to do very many queries in very large tables, use the
# fmatch() function in the "fastmatch" package for a considerable
# speedup.
# [END]
# tocID <- "BIN-Data_integration.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-Data_integration unit.
#
# Version: 1.2
#
# Date: 2018-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 2020 Maintenance and updates
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 1.0.1 Bugfix: UniProt ID Mapping service API change
# 1.0 First live version
#
#
# TODO:
# Develop a fungi-specific BioMart example.
# (cf.
# https://cran.r-project.org/web/packages/biomartr/vignettes/Functional_Annotation.html )
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -------------------------------------------------
#TOC> 1 Identifier mapping 42
#TOC> 2 Cross-referencing tables 165
#TOC>
#TOC> ==========================================================================
# = 1 Identifier mapping ==================================================
# UniProt provides a well-designed ID mapping tool that can be accessed
# online at http://www.uniprot.org/mapping/
#
# Here we will use the UniProt Web API for this tool to map identifiers. The
# UniProt ID mapping service supports a "RESTful API": responses can be obtained
# simply via a Web browser's request. Such requests are commonly sent via the
# GET or POST verbs that a Webserver responds to, when a client asks for data.
# GET requests are visible in the URL of the request; POST requests are not
# directly visible, they are commonly used to send the contents of forms, or
# when transmitting larger, complex data items. The UniProt ID mapping service
# can accept long lists of IDs, thus using the POST mechanism makes sense. GET()
# and POST() functions are part of the httr package.
# To begin, we load httr, which supports sending and receiving data via the
# http protocol, just like a Web browser.
if (! requireNamespace("httr", quietly=TRUE)) {
install.packages("httr")
}
# Package information:
# library(help = httr) # basic information
# browseVignettes("httr") # available vignettes
# data(package = "httr") # available datasets
# We will walk through the process with the refSeqID
# of yeast Mbp1 and Swi4, and we will also enter a dummy ID to check what
# happens if the ID can't be mapped:
myQueryIDs <- "NP_010227 NP_00000 NP_011036"
# The UniProt ID mapping service API is very straightforward to use: just define
# the URL of the server and send a list of items labelled as "query" in the body
# of the request. GET() and POST() are functions from httr.
# Note. A recent bug in the interaction between the server expectations and the
# curl client libraries requires the following initialization
httr::set_config(httr::config(http_version = 0))
# cf. https://stackoverflow.com/questions/44610845/stream-error-in-the-http-2-framing-layer-bigrquery-commands-error-in-r-studio-b
# Send the query to the UniProt ID mapping service with a POST request. The
# endpoint is the same one that the myIDmap() function below uses, so that the
# walk-through and the function talk to the same service.
# NOTE(review): UniProt has reworked its ID mapping API over the years -
# confirm that this endpoint still responds before relying on it.
URL <- "https://www.uniprot.org/uploadlists/"
response <- httr::POST(URL,
                       body = list(from = "P_REFSEQ_AC",   # Refseq Protein
                                   to = "ACC",             # UniProt ID
                                   format = "tab",
                                   query = myQueryIDs))
# Print the body of the response to the console
cat(httr::content(response))
# We need to check the status code - if it is not 200, an error occurred and we
# can't process the result:
httr::status_code(response)
# If the query is successful, tabbed text is returned. We can assign that to a
# data frame. Note that read.delim() can read data directly from a char object
# if we pass it as the text = argument in place of a file name: R then creates
# the text connection for us and closes it again once the data has been read
# (a connection that we open ourselves with textConnection() would stay open).
myMappedIDs <- read.delim(text = httr::content(response),
                          sep = "\t",
                          stringsAsFactors = FALSE)
myMappedIDs
# If this works as expected, you should see:
# From To
# 1 NP_010227 P39678
# 2 NP_011036 P25302
#
# ... and note that there are only two entries, because nothing was returned
# for the dummy "RefSeq ID" NP_00000
# If the query can't be fulfilled because of a problem with the server, a
# WebPage is returned. But the server status is also returned and we can check
# the status code. I have lately gotten many "503" status codes: Server Not
# Available...
# We wrap this into a function:
myIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
  # Use UniProt ID mapping service to map one or more IDs
  # Parameters:
  #    s        char  A string of separated IDs
  #    mapFrom  char  the database in which the IDs in s are valid. Default
  #                     is RefSeq protein
  #    mapTo    char  the database in which the target IDs are valid. Default
  #                     is UniProtKB
  # Value
  #    a data frame of mapped IDs, with column names From and To, or an
  #    empty data frame if the mapping was unsuccessful. No rows are returned
  #    for IDs that are not mapped. A warning is issued whenever the empty
  #    data frame is returned.

  # Initialize curl (this changes the global httr configuration)
  httr::set_config(httr::config(http_version = 0))

  URL <- "https://www.uniprot.org/uploadlists/"
  response <- httr::POST(URL,
                         body = list(from = mapFrom,
                                     to = mapTo,
                                     format = "tab",
                                     query = s))

  status <- httr::status_code(response)
  if (status != 200) {                       # anything but 200 (OK): give up
    warning(paste("No uniProt ID mapping returned:",
                  "server sent status",
                  status))
    return(data.frame())
  }

  # Ask for the body as text explicitly, rather than relying on httr to guess
  # the right representation from the Content-Type header.
  txt <- httr::content(response, as = "text", encoding = "UTF-8")
  if (length(txt) != 1 || is.na(txt) || ! nzchar(trimws(txt))) {
    # read.delim() would fail on empty input; honour the documented contract
    warning("No uniProt ID mapping returned: server sent an empty response")
    return(data.frame())
  }

  # text = lets read.delim() open AND close its own text connection; a
  # connection created with textConnection() would be left open.
  myMap <- read.delim(text = txt,
                      sep = "\t",
                      stringsAsFactors = FALSE)
  if (ncol(myMap) != 2) {
    # Not the expected two-column table (e.g. an error page was sent)
    warning("No uniProt ID mapping returned: unexpected response format")
    return(data.frame())
  }
  colnames(myMap) <- c("From", "To")

  return(myMap)
}
# Try it out ...
myIDmap("NP_010227 NP_011036 NP_012881 NP_013729 NP_012165")
# A function UniProtIDmap() is in the ABC-dbUtilities.R script and it is loaded
# into your workspace on startup.
# = 2 Cross-referencing tables ============================================
# Sometimes we get the IDs we need to map in a large table, e.g. from a list of
# genes in a model organism database such as SGD, or from the HUGO Gene
# Nomenclature Committee. How do we map one set of identifiers to another one?
# The function to use is match().
# Here is a tiny set of identifiers taken from a much larger table to
# illustrate the principle:
#
myIDs <- data.frame(uID = c("P38903", "P31383", "P47177", "P47096", "Q07747",
"Q08641", "P47129", "P52910", "P00330", "P81450"),
name = c("2A5D", "2AAA", "2NDP", "3HAO", "AAD4",
"AB140", "ACF4", "ACS2", "ADH1", "ATP18"),
refID = c("NP_014657", "NP_009386",
"NP_012683", "NP_012559",
"NP_010038", "NP_014882",
"NP_012616", "NP_013254",
"NP_014555", "NP_013629"))
myIDs
# Say we want to map "NP_010038", "NP_999999" (an ID that is deliberately not
# in our table), and "NP_013629", in that order to their gene names.
myQuery <- c("NP_010038", "NP_999999", "NP_013629")
# %in% will only tell us if these IDs are present in the table:
myQuery %in% myIDs$refID
# ... but not where they are located. But match() does what we need here:
match(myQuery, myIDs$refID)
# ... and we can use the result to subset the column that we want to map to:
myIDs$name[match(myQuery, myIDs$refID)]
# Note that the output preserves the NA - i.e. the length of the mapped
# values is exactly the same as the length of the query.
# task: map the three genes to their UniProt Identifier.
#
# Note: if you want to do very many queries in very large tables, use the
# fmatch() function in the "fastmatch" package for a considerable
# speedup.
# [END]

View File

@ -1,435 +1,435 @@
# tocID <- "BIN-FUNC-Domain_annotation.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-FUNC-Domain_annotation unit.
#
# ==============================================================================
# Version: 1.4
#
# Date: 2017-11 - 2020-10
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.4 Add code for shared data import from the Wiki
# 1.3 Add code for database export to JSON and instructions
# for uploading annotations to the Public Student Wiki page
# 1.2 Consistently: data in ./myScripts/ ;
# begin SHARING DATA section
# 1.1 2020 Updates
# 1.0 Live version 2017
# 0.1 First code copied from 2016 material.
#
# TODO:
# Put the domain plot into a function
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------------------------------
#TOC> 1 Update your database script 51
#TOC> 1.1 Preparing an annotation file ... 58
#TOC> 1.1.1 BEFORE "BIN-ALI-Optimal_sequence_alignment" 61
#TOC> 1.1.2 AFTER "BIN-ALI-Optimal_sequence_alignment" 109
#TOC> 1.2 Execute and Validate 136
#TOC> 2 Plot Annotations 161
#TOC> 3 SHARING DATA 287
#TOC> 3.1 Post MBP1_MYSPE as JSON data 303
#TOC> 3.2 Import shared MBP1_MYSPE from the Wiki 326
#TOC>
#TOC> ==========================================================================
# = 1 Update your database script =========================================
# Since you have recorded domain features at the SMART database, we can store
# the feature annotations in myDB ...
# == 1.1 Preparing an annotation file ... ==================================
# === 1.1.1 BEFORE "BIN-ALI-Optimal_sequence_alignment"
#
# IF YOU HAVE NOT YET COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT:
#
# You DON'T already have a file called "<MYSPE>-Annotations.json" in the
# ./myScripts/ directory:
#
# - Make a copy of the file "./data/refAnnotations.json" and put it in your
# myScripts/ directory.
#
# - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.
# if MYSPE is called "Cryptococcus neoformans", your file should be called
# "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is
# "MBP1_CRYNE").
#
# - Open the file in the RStudio editor and delete all blocks for
# the Mbp1 protein annotations except the first one.
#
# - From that block, delete all lines that have annotations you did not
# find in SMART for MBP1_MYSPE.
#
# - Make enough copies of the "Ankyrin fold" and "low complexity" region
# lines to have a line for each feature you found.
#
# - Then delete the comma at the end of the last line.
#
# - Edit the annotations: change MBP1_SACCE to MBP1_<MYSPE> everywhere
# and change the "start" and "end" features to the coordinates you
# recorded in the SMART database.
#
# - Save your file in the ./myScripts/ folder.
#
# - Validate your file online at https://jsonlint.com/
#
# - Update your "./myScripts/makeProteinDB.R" script to load your new
# annotation when you recreate the database. Open the script in the
# RStudio editor, and add the following command at the end:
#
# myDB <- dbAddAnnotation(myDB,
# jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json"))
# ^^^^^^^
# edit this!
#
# - save and close the file.
#
# Then SKIP the next section.
#
#
# === 1.1.2 AFTER "BIN-ALI-Optimal_sequence_alignment"
#
# IF YOU HAVE ALREADY COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT:
#
# You SHOULD have a file called "<MYSPE>-Annotations.json" in the
# ./myScripts/ directory:
#
# - Open the file in the RStudio editor.
#
# - Make as many copies of the "APSES fold" line as you have found
# features in SMART.
#
# - Add a comma after every line except for the last one
#
# - Edit the annotations but include only features that are in the
# myDB$feature table. Check which features are in the database by executing
#
# myDB$feature$name
#
# - Update the "start" and "end" coordinates for each feature to the
# values you found.
#
# - Save your file.
#
# - Validate your file online at https://jsonlint.com/
#
#
# == 1.2 Execute and Validate ==============================================
#
# - source() your database creation script:
#
# source("./myScripts/makeProteinDB.R")
#
# This should run without errors or warnings. If it doesn't work and you
# can't figure out quickly what's happening, ask for help on the
# Discussion Board.
#
# - Confirm
# The following commands should retrieve all of the features that have been
# annotated for MBP1_MYSPE
# Flag the row of the protein table whose name is "MBP1_<biCode(MYSPE)>"
sel <- myDB$protein$name == paste0("MBP1_", biCode(MYSPE))

# The outer parentheses make R print each value as it is assigned.
# ID of that protein ...
(proID <- myDB$protein$ID[sel])

# ... IDs of all annotations that refer to this protein ...
(fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID == proID])

# ... and the unique IDs of the features these annotations point to.
# NOTE(review): featureID is subset with annotation IDs here, which presumably
# relies on the IDs being usable as indices into the table - confirm.
(ftrIDs <- unique(myDB$annotation$featureID[fanIDs]))

# This should list ALL of your annotated features (once). If not, consider what
# could have gone wrong and ask on the list if you have difficulties fixing it.
myDB$feature$name[ftrIDs]
# = 2 Plot Annotations ====================================================
# In this section we will plot domain annotations as colored rectangles on a
# sequence, as an example of using the R plotting system for generic, data
# driven images.
# We need a small utility function that draws the annotation boxes on a
# representation of sequence. It should accept the start and end coordinates,
# the y value where it should be plotted and the color of the box, and plot a
# rectangle using R's rect() function.
drawBox <- function(xStart, xEnd, y, myCol, DELTA = 0.2) {
  # Add a black-bordered rectangle, filled with colour myCol, to the current
  # plot. It extends horizontally from xStart to xEnd and vertically from
  # DELTA below y to DELTA above y.
  yBottom <- y - DELTA
  yTop    <- y + DELTA
  rect(xleft = xStart, ybottom = yBottom,
       xright = xEnd,  ytop = yTop,
       col = myCol, border = "black")
}
# test this:
plot(c(-1.5, 1.5), c(0, 0), type = "l")
drawBox(-1, 1, 0.0, "peachpuff")
# Next, we define a function to plot annotations for one protein: the name of
# the protein, a horizontal grey line for its length, and all of its features.
plotProtein <- function(DB, name, y) {
  # Add the annotations of one protein to the current plot: its name, a
  # horizontal grey line that spans the length of its sequence, and one
  # coloured box for each of its feature annotations.
  #
  # Parameters:
  #   DB:   protein database. The tables DB$protein (ID, name, sequence),
  #         DB$feature (ID) and DB$annotation (proteinID, featureID, start,
  #         end) are used.
  #   name: the name of the protein in the database. It must match exactly
  #         one row of DB$protein.
  #   y:    height where to draw the plot
  # Value:
  #   NULL (invisible). The function is called for its drawing side effect
  #   and requires that a plot has already been set up.

  # find the row-index of the protein in the protein table of DB. We check
  # this before anything is drawn: without exactly one match the calls below
  # would fail with an error that does not tell what went wrong.
  iProtein <- which(DB$protein$name == name)
  if (length(iProtein) != 1) {
    stop(sprintf("Expected exactly one protein named \"%s\" in DB, found %d.",
                 paste(name, collapse = ", "), length(iProtein)),
         call. = FALSE)
  }

  # Define colors: we create a vector of color values, one for each row of
  # the feature table. Then we can get the color for an annotation by looking
  # up its feature ID in the feature table.
  # A: make a vector of color values. The syntax may appear unusual -
  #    colorRampPalette() returns a function, and we simply append
  #    the parameter (number-of-features) without assigning the function
  #    to its own variable name.
  ftrCol <- colorRampPalette(c("#f2003c", "#F0A200", "#f0ea00",
                               "#62C923", "#0A9A9B", "#1958C3",
                               "#8000D3", "#D0007F"),
                             space = "Lab",
                             interpolate = "linear")(nrow(DB$feature))
  # B: Features may overlap, so we make the colors transparent by setting
  #    their "alpha channel" to 1/3  (hex: 55)
  ftrCol <- paste0(ftrCol, "55")

  # write the name of the protein
  text(-30, y, adj = 1, labels = name, cex = 0.75)

  # draw a line from 0 to nchar(sequence-of-the-protein)
  lines(c(0, nchar(DB$protein$sequence[iProtein])), c(y, y),
        lwd = 3, col = "#999999")

  # get the rows of feature annotations for the protein
  iFtr <- which(DB$annotation$proteinID == DB$protein$ID[iProtein])

  # draw a colored box for each feature
  for (i in iFtr) {
    # match() finds the row of the feature table that holds this feature ID.
    # This works for character and for numeric IDs alike, whereas indexing
    # the colors with a numeric ID directly would silently select by position.
    iCol <- match(DB$annotation$featureID[i], DB$feature$ID)
    drawBox(DB$annotation$start[i],
            DB$annotation$end[i],
            y,
            ftrCol[iCol])
  }

  invisible(NULL)
}
# Plot each annotated protein:
# Get the rows of all unique annotated Mbp1 proteins in myDB
# Row indices of all proteins in myDB whose name begins with "MBP1_"
iRows <- grep("^MBP1_", myDB$protein$name)

# Size the plot-frame to accommodate all proteins: one unit of height per
# protein and the length of the longest sequence as width, both scaled by 1.1
yMax <- 1.1 * length(iRows)
xMax <- 1.1 * max(nchar(myDB$protein$sequence[iRows]))

# Decrease the margins. par() returns the previous settings; we keep them in
# oPar so that we can restore them when we are done.
oPar <- par(mar = c(4.2, 0.1, 3, 0.1))

# Plot an empty frame (type = "n"), without axes or box ...
plot(x = 1, y = 1,
     type = "n",
     axes = FALSE,
     bty = "n",
     xlim = c(-200, xMax + 100),
     ylim = c(0, yMax),
     main = "Mbp1 orthologue domain annotations",
     xlab = "sequence position",
     ylab = "",
     cex.axis = 0.8)
# ... and add only the horizontal axis, with a tick every 100 positions.
axis(side = 1, at = seq(from = 0, to = xMax, by = 100))

# One color per feature in the feature table, with "55" appended as the
# alpha channel to make the colors transparent
myCol <- paste0(colorRampPalette(c("#f2003c", "#F0A200",
                                   "#f0ea00", "#62C923",
                                   "#0A9A9B", "#1958C3",
                                   "#8000D3", "#D0007F"),
                                 space = "Lab",
                                 interpolate = "linear")(nrow(myDB$feature)),
                "55")
legend(x = xMax - 150, y = 7,
       legend = myDB$feature$name,
       fill = myCol,
       cex = 0.7,
       bty = "n")

# Finally, call plotProtein() for each protein, drawing the i-th one at
# height i
for (i in seq_along(iRows)) {
  plotProtein(DB = myDB, name = myDB$protein$name[iRows[i]], y = i)
}

# Restore the plot parameters that were in effect before
par(oPar)
# The plot shows what is variable and what is constant about the annotations in
# a group of related proteins. Your MBP1_MYSPE annotations should appear at the
# top.
# Task:
# Put a copy of the plot into your journal and interpret it with respect
# to MBP1_MYSPE, i.e. note what you learn about MBP1_MYSPE from the plot.
# Task:
# It would be better to align the motif borders, at least approximately (not
# all proteins have all motifs). How would you go about doing that?
# = 3 SHARING DATA ========================================================
# It's particularly interesting to compare such annotations across many
# homologous proteins. I have created a page on the Student Wiki
# (http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public) that
# you can edit, and then download the data from the entire class directly to
# your RStudio project.
#
# I have provided a function that extracts all information that refers to a
# single protein from the database, and prints it out as well-formatted JSON,
# suitable to be pasted into our shareable Wiki-page. There is a fair amount of
# bookkeeping involved, but the code is not otherwise very enlightening so I
# will spare you the details - it's in "./scripts/ABC-dbUtilities.R" if you
# would want to have a look.
# == 3.1 Post MBP1_MYSPE as JSON data ======================================
# Task:
# =====
# 1: Run the following code:
# Print a block of Wiki markup to the console. sep = "\n" puts every element
# on its own line, and the empty string at the end adds a final line break.
cat("{{Vspace}}",
"<!-- ==== BEGIN PROTEIN ==== -->",
# the class "protein-data" is what the import code in section 3.2 selects
"<pre class=\"protein-data\">",
# the data for "MBP1_<biCode(MYSPE)>", as produced by dbProt2JSON()
dbProt2JSON(sprintf("MBP1_%s", biCode(MYSPE))),
"</pre>",
"<!-- ===== END PROTEIN ====== -->",
"", sep = "\n"
)
# 2: Copy the entire output from the console.
# 3: Navigate to
# http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public
# ... edit the page, and paste your output at the top.
# 4: Save your edits.
# == 3.2 Import shared MBP1_MYSPE from the Wiki ============================
# Once we have collected a number of protein annotations, we can access the
# Wiki-page and import the data into our database. The Wiki page is an html
# document with lots of MediaWiki specific stuff - but the contents we are
# interested in is enclosed in <pre class="protein-data"> ... </pre> tags. These
# work like normal HTML <pre> tags, but we have defined a special class for them
# to make it easy to parse out the contents we want. The rvest:: package in
# combination with xml2:: provides us with all the tools we need for such
# "Webscraping" of data....
if (! requireNamespace("rvest", quietly=TRUE)) {
install.packages("rvest")
}
if (! requireNamespace("xml2", quietly=TRUE)) {
install.packages("xml2")
}
# Here's the process:
# The URL is an "open" page on the student Wiki. Users that are not logged in
# can view the contents, but you can only edit if you are logged in.
myURL <- "http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public"
# First thing is to retrieve the HTML from the url...
x <- xml2::read_html(myURL)
# This retrieves the page source, but that still needs to be parsed into its
# logical elements. HTML is closely related to XML: such documents are
# structured as trees that have "nodes" which are demarcated with "tags".
# rvest::html_nodes()
# parses out the document structure and then uses a so-called "xpath" expression
# to select nodes we are interested in. Now, xpath is one of those specialized
# languages of which there are a few more to learn than one would care for. You
# MUST know how to format sprintf() expressions, and you SHOULD be competent
# with regular expressions. But if you want to be really competent in your work,
# basic HTML and CSS is required ... and enough knowledge about xpath to be able
# to search on Stackoverflow for what you need for parsing data out of Web
# documents...
# The expression we use below is:
# - get any node anywhere in the tree ("//*") ...
# - that has a particular attribute("[@ ... ]").
# - The attribute we want is that the class of the node is "protein-data";
# that is the class we have defined for our <pre> tags.
# As a result of this selection, we get a list of pointers to the document tree.
y <- rvest::html_nodes(x, xpath ='//*[@class="protein-data"]')
# Next we fetch the actual payload - the text - from the tree:
# rvest::html_text() gets the text from the list of pointers. The result is an
# ordinary character vector, with one string for each selected node.
z <- rvest::html_text(y)
# Finally we can iterate over the list, and add all proteins we don't already
# have to our database. There may well be items that are rejected because they
# are already present in the database - for example, unless somebody has
# annotated new features, all of the features are already there. Don't worry -
# that is intended; we don't want duplicate entries.
for (thisJSON in z) {
  # The Wiki page can be edited by anyone who is logged in, so an entry may
  # not be valid JSON. We skip such an entry with a warning, instead of
  # letting one bad entry abort the import of all others.
  thisData <- tryCatch(
    jsonlite::fromJSON(thisJSON),
    error = function(e) {
      warning("Skipping an entry that is not valid JSON: ",
              conditionMessage(e), call. = FALSE)
      NULL
    })
  if (is.null(thisData)) {
    next
  }

  # We also need exactly one protein name to decide whether we already have
  # this protein; anything else does not have the structure we expect.
  if (! is.list(thisData) ||
      ! is.list(thisData$protein) ||
      length(thisData$protein$name) != 1) {
    warning("Skipping an entry without exactly one protein name.",
            call. = FALSE)
    next
  }

  # Add only proteins that are not yet in our database
  if (! thisData$protein$name %in% myDB$protein$name) {
    myDB <- dbAddProtein(myDB, thisData$protein)
    myDB <- dbAddTaxonomy(myDB, thisData$taxonomy)
    myDB <- dbAddFeature(myDB, thisData$feature)
    myDB <- dbAddAnnotation(myDB, thisData$annotation)
  }
}
# Finally, we can repeat our domain plot with the results - which now includes the shared proteins:
# Row indices of all proteins in myDB whose name begins with "MBP1_"
iRows <- grep("^MBP1_", myDB$protein$name)

# Frame size: one unit of height per protein and the length of the longest
# sequence as width, both scaled by 1.1
yMax <- 1.1 * length(iRows)
xMax <- 1.1 * max(nchar(myDB$protein$sequence[iRows]))

# Decrease the margins and keep the previous settings in oPar
oPar <- par(mar = c(4.2, 0.1, 3, 0.1))

# Plot an empty frame (type = "n"), without axes or box ...
plot(x = 1, y = 1,
     type = "n",
     axes = FALSE,
     bty = "n",
     xlim = c(-200, xMax + 100),
     ylim = c(0, yMax),
     main = "Mbp1 orthologue domain annotations",
     xlab = "sequence position",
     ylab = "",
     cex.axis = 0.8)
# ... and add only the horizontal axis, with a tick every 100 positions.
axis(side = 1, at = seq(from = 0, to = xMax, by = 100))

# One color per feature in the feature table, with "55" appended as the
# alpha channel to make the colors transparent
myCol <- paste0(colorRampPalette(c("#f2003c", "#F0A200",
                                   "#f0ea00", "#62C923",
                                   "#0A9A9B", "#1958C3",
                                   "#8000D3", "#D0007F"),
                                 space = "Lab",
                                 interpolate = "linear")(nrow(myDB$feature)),
                "55")
legend(x = xMax - 150, y = 7,
       legend = myDB$feature$name,
       fill = myCol,
       cex = 0.7,
       bty = "n")

# Call plotProtein() for each protein, drawing the i-th one at height i
for (i in seq_along(iRows)) {
  plotProtein(DB = myDB, name = myDB$protein$name[iRows[i]], y = i)
}

# Restore the plot parameters that were in effect before
par(oPar)
# ... the more proteins we can compare, the more we learn about the
# architectural principles of this family's domains.
# [END]
# tocID <- "BIN-FUNC-Domain_annotation.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-FUNC-Domain_annotation unit.
#
# ==============================================================================
# Version: 1.4
#
# Date: 2017-11 - 2020-10
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.4 Add code for shared data import from the Wiki
# 1.3 Add code for database export to JSON and instructions
# for uploading annotations to the Public Student Wiki page
# 1.2 Consistently: data in ./myScripts/ ;
# begin SHARING DATA section
# 1.1 2020 Updates
# 1.0 Live version 2017
# 0.1 First code copied from 2016 material.
#
# TODO:
# Put the domain plot into a function
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------------------------------
#TOC> 1 Update your database script 51
#TOC> 1.1 Preparing an annotation file ... 58
#TOC> 1.1.1 BEFORE "BIN-ALI-Optimal_sequence_alignment" 61
#TOC> 1.1.2 AFTER "BIN-ALI-Optimal_sequence_alignment" 109
#TOC> 1.2 Execute and Validate 136
#TOC> 2 Plot Annotations 161
#TOC> 3 SHARING DATA 287
#TOC> 3.1 Post MBP1_MYSPE as JSON data 303
#TOC> 3.2 Import shared MBP1_MYSPE from the Wiki 326
#TOC>
#TOC> ==========================================================================
# = 1 Update your database script =========================================
# Since you have recorded domain features at the SMART database, we can store
# the feature annotations in myDB ...
# == 1.1 Preparing an annotation file ... ==================================
# === 1.1.1 BEFORE "BIN-ALI-Optimal_sequence_alignment"
#
# IF YOU HAVE NOT YET COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT:
#
# You DON'T already have a file called "<MYSPE>-Annotations.json" in the
# ./myScripts/ directory:
#
# - Make a copy of the file "./data/refAnnotations.json" and put it in your
# myScripts/ directory.
#
# - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.
# if MYSPE is called "Cryptococcus neoformans", your file should be called
# "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is
# "MBP1_CRYNE").
#
# - Open the file in the RStudio editor and delete all blocks for
# the Mbp1 protein annotations except the first one.
#
# - From that block, delete all lines that have annotations you did not
# find in SMART for MBP1_MYSPE.
#
# - Make enough copies of the "Ankyrin fold" and "low complexity" region
# lines to have a line for each feature you found.
#
# - Then delete the comma at the end of the last line.
#
# - Edit the annotations: change MBP1_SACCE to MBP1_<MYSPE> everywhere
# and change the "start" and "end" features to the coordinates you
# recorded in the SMART database.
#
# - Save your file in the ./myScripts/ folder.
#
# - Validate your file online at https://jsonlint.com/
#
# - Update your "./myScripts/makeProteinDB.R" script to load your new
# annotation when you recreate the database. Open the script in the
# RStudio editor, and add the following command at the end:
#
# myDB <- dbAddAnnotation(myDB,
# jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json"))
# ^^^^^^^
# edit this!
#
# - save and close the file.
#
# Then SKIP the next section.
#
#
# === 1.1.2 AFTER "BIN-ALI-Optimal_sequence_alignment"
#
# IF YOU HAVE ALREADY COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT:
#
# You SHOULD have a file called "<MYSPE>-Annotations.json" in the
# ./myScripts/ directory:
#
# - Open the file in the RStudio editor.
#
# - Make as many copies of the "APSES fold" line as you have found
# features in SMART.
#
# - Add a comma after every line except for the last one
#
# - Edit the annotations but include only features that are in the
# myDB$feature table. Check which features are in the database by executing
#
# myDB$feature$name
#
# - Update the "start" and "end" coordinates for each feature to the
# values you found.
#
# - Save your file.
#
# - Validate your file online at https://jsonlint.com/
#
#
# == 1.2 Execute and Validate ==============================================
#
# - source() your database creation script:
#
# source("./myScripts/makeProteinDB.R")
#
# This should run without errors or warnings. If it doesn't work and you
# can't figure out quickly what's happening, ask for help on the
# Discussion Board.
#
# - Confirm
# The following commands should retrieve all of the features that have been
# annotated for MBP1_MYSPE
# Flag the row of the protein table whose name is "MBP1_<biCode(MYSPE)>"
sel <- myDB$protein$name == paste0("MBP1_", biCode(MYSPE))

# The outer parentheses make R print each value as it is assigned.
# ID of that protein ...
(proID <- myDB$protein$ID[sel])

# ... IDs of all annotations that refer to this protein ...
(fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID == proID])

# ... and the unique IDs of the features these annotations point to.
# NOTE(review): featureID is subset with annotation IDs here, which presumably
# relies on the IDs being usable as indices into the table - confirm.
(ftrIDs <- unique(myDB$annotation$featureID[fanIDs]))

# This should list ALL of your annotated features (once). If not, consider what
# could have gone wrong and ask on the list if you have difficulties fixing it.
myDB$feature$name[ftrIDs]
# = 2 Plot Annotations ====================================================
# In this section we will plot domain annotations as colored rectangles on a
# sequence, as an example of using the R plotting system for generic, data
# driven images.
# We need a small utility function that draws the annotation boxes on a
# representation of sequence. It should accept the start and end coordinates,
# the y value where it should be plotted and the color of the box, and plot a
# rectangle using R's rect() function.
drawBox <- function(xStart, xEnd, y, myCol, DELTA = 0.2) {
  # Add a black-bordered rectangle, filled with colour myCol, to the current
  # plot. It extends horizontally from xStart to xEnd and vertically from
  # DELTA below y to DELTA above y.
  yBottom <- y - DELTA
  yTop    <- y + DELTA
  rect(xleft = xStart, ybottom = yBottom,
       xright = xEnd,  ytop = yTop,
       col = myCol, border = "black")
}
# test this:
plot(c(-1.5, 1.5), c(0, 0), type = "l")
drawBox(-1, 1, 0.0, "peachpuff")
# Next, we define a function to plot annotations for one protein: the name of
# the protein, a horizontal grey line for its length, and all of its features.
plotProtein <- function(DB, name, y) {
  # Add the annotations of one protein to the current plot: its name, a
  # horizontal grey line that spans the length of its sequence, and one
  # coloured box for each of its feature annotations.
  #
  # Parameters:
  #   DB:   protein database. The tables DB$protein (ID, name, sequence),
  #         DB$feature (ID) and DB$annotation (proteinID, featureID, start,
  #         end) are used.
  #   name: the name of the protein in the database. It must match exactly
  #         one row of DB$protein.
  #   y:    height where to draw the plot
  # Value:
  #   NULL (invisible). The function is called for its drawing side effect
  #   and requires that a plot has already been set up.

  # find the row-index of the protein in the protein table of DB. We check
  # this before anything is drawn: without exactly one match the calls below
  # would fail with an error that does not tell what went wrong.
  iProtein <- which(DB$protein$name == name)
  if (length(iProtein) != 1) {
    stop(sprintf("Expected exactly one protein named \"%s\" in DB, found %d.",
                 paste(name, collapse = ", "), length(iProtein)),
         call. = FALSE)
  }

  # Define colors: we create a vector of color values, one for each row of
  # the feature table. Then we can get the color for an annotation by looking
  # up its feature ID in the feature table.
  # A: make a vector of color values. The syntax may appear unusual -
  #    colorRampPalette() returns a function, and we simply append
  #    the parameter (number-of-features) without assigning the function
  #    to its own variable name.
  ftrCol <- colorRampPalette(c("#f2003c", "#F0A200", "#f0ea00",
                               "#62C923", "#0A9A9B", "#1958C3",
                               "#8000D3", "#D0007F"),
                             space = "Lab",
                             interpolate = "linear")(nrow(DB$feature))
  # B: Features may overlap, so we make the colors transparent by setting
  #    their "alpha channel" to 1/3  (hex: 55)
  ftrCol <- paste0(ftrCol, "55")

  # write the name of the protein
  text(-30, y, adj = 1, labels = name, cex = 0.75)

  # draw a line from 0 to nchar(sequence-of-the-protein)
  lines(c(0, nchar(DB$protein$sequence[iProtein])), c(y, y),
        lwd = 3, col = "#999999")

  # get the rows of feature annotations for the protein
  iFtr <- which(DB$annotation$proteinID == DB$protein$ID[iProtein])

  # draw a colored box for each feature
  for (i in iFtr) {
    # match() finds the row of the feature table that holds this feature ID.
    # This works for character and for numeric IDs alike, whereas indexing
    # the colors with a numeric ID directly would silently select by position.
    iCol <- match(DB$annotation$featureID[i], DB$feature$ID)
    drawBox(DB$annotation$start[i],
            DB$annotation$end[i],
            y,
            ftrCol[iCol])
  }

  invisible(NULL)
}
# Plot each annotated protein:
# Get the rows of all unique annotated Mbp1 proteins in myDB
# Row indices of all proteins in myDB whose name begins with "MBP1_"
iRows <- grep("^MBP1_", myDB$protein$name)

# Size the plot-frame to accommodate all proteins: one unit of height per
# protein and the length of the longest sequence as width, both scaled by 1.1
yMax <- 1.1 * length(iRows)
xMax <- 1.1 * max(nchar(myDB$protein$sequence[iRows]))

# Decrease the margins. par() returns the previous settings; we keep them in
# oPar so that we can restore them when we are done.
oPar <- par(mar = c(4.2, 0.1, 3, 0.1))

# Plot an empty frame (type = "n"), without axes or box ...
plot(x = 1, y = 1,
     type = "n",
     axes = FALSE,
     bty = "n",
     xlim = c(-200, xMax + 100),
     ylim = c(0, yMax),
     main = "Mbp1 orthologue domain annotations",
     xlab = "sequence position",
     ylab = "",
     cex.axis = 0.8)
# ... and add only the horizontal axis, with a tick every 100 positions.
axis(side = 1, at = seq(from = 0, to = xMax, by = 100))

# One color per feature in the feature table, with "55" appended as the
# alpha channel to make the colors transparent
myCol <- paste0(colorRampPalette(c("#f2003c", "#F0A200",
                                   "#f0ea00", "#62C923",
                                   "#0A9A9B", "#1958C3",
                                   "#8000D3", "#D0007F"),
                                 space = "Lab",
                                 interpolate = "linear")(nrow(myDB$feature)),
                "55")
legend(x = xMax - 150, y = 7,
       legend = myDB$feature$name,
       fill = myCol,
       cex = 0.7,
       bty = "n")

# Finally, call plotProtein() for each protein, drawing the i-th one at
# height i
for (i in seq_along(iRows)) {
  plotProtein(DB = myDB, name = myDB$protein$name[iRows[i]], y = i)
}

# Restore the plot parameters that were in effect before
par(oPar)
# The plot shows what is variable and what is constant about the annotations in
# a group of related proteins. Your MBP1_MYSPE annotations should appear at the
# top.
# Task:
# Put a copy of the plot into your journal and interpret it with respect
# to MBP1_MYSPE, i.e. note what you learn about MBP1_MYSPE from the plot.
# Task:
# It would be better to align the motif borders, at least approximately (not
# all proteins have all motifs). How would you go about doing that?
# = 3 SHARING DATA ========================================================
# It's particularly interesting to compare such annotations across many
# homologous proteins. I have created a page on the Student Wiki () that you can
# edit, and then download the data from the entire class directly to your
# RStudio project.
#
# I have provided a function that extracts all information that refers to a
# single protein from the database, and prints it out as well-formatted JSON,
# suitable to be pasted into our shareable Wiki-page. There is a fair amount of
# bookkeeping involved, but the code is not otherwise very enlightening so I
# will spare you the details - it's in "./scripts/ABC-dbUtilities.R" if you
# would want to have a look.
# == 3.1 Post MBP1_MYSPE as JSON data ======================================
# Task:
# =====
# 1: Run the following code:
# Print the JSON record of MBP1_<BiCode of MYSPE>, wrapped in the markup
# lines used on the Wiki page; every element goes on its own line.
cat("{{Vspace}}",
    "<!-- ==== BEGIN PROTEIN ==== -->",
    "<pre class=\"protein-data\">",
    dbProt2JSON(sprintf("MBP1_%s", biCode(MYSPE))),
    "</pre>",
    "<!-- ===== END PROTEIN ====== -->",
    "",
    sep = "\n")
# 2: Copy the entire output from the console.
# 3: Navigate to
# http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public
# ... edit the page, and paste your output at the top.
# 4: Save your edits.
# == 3.2 Import shared MBP1_MYSPE from the Wiki ============================
# Once we have collected a number of protein annotations, we can access the
# Wiki-page and import the data into our database. The Wiki page is an html
# document with lots of MediaWiki specific stuff - but the contents we are
# interested in is enclosed in <pre class="protein-data"> ... </pre> tags. These
# work like normal HTML <pre> tags, but we have defined a special class for them
# to make it easy to parse out the contents we want. The rvest:: package in
# combination with xml2:: provides us with all the tools we need for such
# "Webscraping" of data....
# Install the web-scraping packages on first use. They are called through
# their rvest:: and xml2:: prefixes below, so they do not need to be attached.
if (!requireNamespace("rvest", quietly = TRUE)) {
  install.packages("rvest")
}
if (!requireNamespace("xml2", quietly = TRUE)) {
  install.packages("xml2")
}

# Here's the process:
# The URL is an "open" page on the student Wiki. Users that are not logged in
# can view the contents, but you can only edit if you are logged in.
myURL <- "http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public"

# First thing is to retrieve the HTML from the URL ...
x <- xml2::read_html(myURL)
# This retrieves the page source, but that still needs to be parsed into its
# logical elements. HTML is a subset of XML and such documents are structured as
# trees, that have "nodes" which are demarcated with "tags". rvest::html_nodes()
# parses out the document structure and then uses a so-called "xpath" expression
# to select nodes we are interested in. Now, xpath is one of those specialized
# languages of which there are a few more to learn than one would care for. You
# MUST know how to format sprintf() expressions, and you SHOULD be competent
# with regular expressions. But if you want to be really competent in your work,
# basic HTML and CSS is required ... and enough knowledge about xpath to be able
# to search on Stackoverflow for what you need for parsing data out of Web
# documents...
# The expression we use below is:
# - get any node anywhere in the tree ("//*") ...
# - that has a particular attribute("[@ ... ]").
# - The attribute we want is that the class of the node is "protein-data";
# that is the class we have defined for our <pre> tags.
# As a result of this selection, we get a list of pointers to the document tree.
# Select every node whose class attribute is "protein-data".
y <- rvest::html_nodes(x, xpath = '//*[@class="protein-data"]')

# Next we fetch the actual payload - the text - from the tree:
# rvest::html_text() gets the text from the selected nodes. The result is a
# normal character vector with one string per node.
z <- rvest::html_text(y)
# Finally we can iterate over the list, and add all proteins we don't already
# have to our database. There may well be items that are rejected because they
# are already present in the database - for example, unless somebody has
# annotated new features, all of the features are already there. Don't worry -
# that is intended; we don't want duplicate entries.
# Add every shared protein that is not yet in myDB.
# The Wiki page can be edited by every logged-in user, so a single broken
# entry must not abort the whole import: entries that fail to parse, or that
# do not contain exactly one protein name, are skipped with a warning.
for (thisJSON in z) {
  thisData <- tryCatch(
    jsonlite::fromJSON(thisJSON),
    error = function(e) {
      warning("Skipping a Wiki entry that is not valid JSON: ",
              conditionMessage(e), call. = FALSE)
      NULL
    })
  if (is.null(thisData)) {
    next
  }

  # Without this guard a missing name yields a zero-length condition in the
  # if () below, which is an error.
  if (! is.list(thisData) ||
      ! is.list(thisData$protein) ||
      length(thisData$protein$name) != 1) {
    warning("Skipping a Wiki entry without a single protein name.",
            call. = FALSE)
    next
  }

  # Only proteins we don't have yet are added, together with their taxonomy,
  # feature and annotation records.
  if (! thisData$protein$name %in% myDB$protein$name) {
    myDB <- dbAddProtein(myDB, thisData$protein)
    myDB <- dbAddTaxonomy(myDB, thisData$taxonomy)
    myDB <- dbAddFeature(myDB, thisData$feature)
    myDB <- dbAddAnnotation(myDB, thisData$annotation)
  }
}
# Finally, we can repeat our domain plot with the results - which now
# includes the shared proteins:
iRows <- grep("^MBP1_", myDB$protein$name)

# One row per protein and the longest sequence, each scaled by 1.1
yMax <- 1.1 * length(iRows)
xMax <- 1.1 * max(nchar(myDB$protein$sequence[iRows]))

# Save the current plot parameters and decrease the margins
oPar <- par(mar = c(4.2, 0.1, 3, 0.1))

# Plot an empty frame
plot(1, 1,
     type = "n",
     axes = FALSE,
     bty  = "n",
     xlim = c(-200, xMax + 100),
     ylim = c(0, yMax),
     main = "Mbp1 orthologue domain annotations",
     xlab = "sequence position",
     ylab = "",
     cex.axis = 0.8)
axis(1, at = seq(0, xMax, by = 100))

# Legend colours: one transparent colour (alpha "55") per feature
myCol <- paste0(colorRampPalette(c("#f2003c", "#F0A200",
                                   "#f0ea00", "#62C923",
                                   "#0A9A9B", "#1958C3",
                                   "#8000D3", "#D0007F"),
                                 space = "Lab",
                                 interpolate = "linear")(nrow(myDB$feature)),
                "55")
legend(x = xMax - 150,
       y = 7,
       legend = myDB$feature$name,
       fill = myCol,
       cex = 0.7,
       bty = "n")

# Draw each protein at the height given by its loop index
for (i in seq_along(iRows)) {
  plotProtein(myDB, myDB$protein$name[iRows[i]], i)
}

par(oPar)  # reset the plot parameters
# ... the more proteins we can compare, the more we learn about the
# architectural principles of this family's domains.
# [END]

View File

@ -1,169 +1,169 @@
# tocID <- "BIN-FUNC-Semantic_similarity.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-FUNC_Semantic_similarity unit.
#
# Version: 1.2
#
# Date: 2017-11 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 2020 Maintenance
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use BiocManager:: not biocLite()
# 1.0 New code.
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------------------------------------
#TOC> 1 Preparations: Packages, AnnotationDB, Setup 43
#TOC> 2 Fetch GO Annotations 100
#TOC> 3 Semantic Similarities 109
#TOC> 4 GO Term Enrichment in Gene Sets 127
#TOC>
#TOC> ==========================================================================
# = 1 Preparations: Packages, AnnotationDB, Setup =========================
# BiocManager is used below to install the Bioconductor packages.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

# GOSim is an R-package in the Bioconductor project.
if (!requireNamespace("GOSim", quietly = TRUE)) {
  BiocManager::install("GOSim")
}
# Package information:
#  library(help = GOSim)       # basic information
#  browseVignettes("GOSim")    # available vignettes
#  data(package = "GOSim")     # available datasets

# GOSim makes extensive assumptions about loaded packages, and many base
# methods are masked. We will thus use library(GOSim) to load it
# in its entirety and with all packages it depends on. We will still use
# the <package>::<function>() syntax in the code below, but this now serves
# more of a didactic purpose, rather than actual syntax requirements.
library(GOSim)

# GOSim loads human annotations in org.Hs.eg.db by default. We load yeast
# annotations instead...
if (!requireNamespace("org.Sc.sgd.db", quietly = TRUE)) {
  BiocManager::install("org.Sc.sgd.db")
}

# Bioconductor annotation packages won't work stably unless we actually load
# them:
library(org.Sc.sgd.db)
# org.Sc.sgd.db is a Bioconductor annotation database curated by SGD. Such
# databases exist for all model organisms. It's a kind of a fancy data frame
# from which we can get annotations by rows (genes) with the keys() function ...
AnnotationDbi::keys(org.Sc.sgd.db)[1500:1510]

# ... and the types of available annotations with the columns() function
AnnotationDbi::columns(org.Sc.sgd.db)

# Note that one of the columns is "GO" ... and we load that into the
# data structures used by GOSim:

# Choose GOterms to use
GOSim::setEvidenceLevel(evidences = "all",
                        organism  = org.Sc.sgdORGANISM,
                        gomap     = org.Sc.sgdGO)

# Use Biological Process ontology
GOSim::setOntology("BP", loadIC = FALSE)

# Confirm that we loaded the correct ontology
head(get("gomap", envir = GOSimEnv))
# = 2 Fetch GO Annotations ================================================
# All keys being used here are yeast systematic names.
# Get one set of annotations
GOSim::getGOInfo("YDL056W")   # Mbp1
# = 3 Semantic Similarities ===============================================
# Get semantic similarities between genes
?getGeneSim

# There are _many_ different metrics of term similarity implemented
# in this package.

# Mbp1 (YDL056W) and ...
GOSim::getGeneSim("YDL056W", "YLR182W", similarity = "OA")  # Swi6 - MCB complex
GOSim::getGeneSim("YDL056W", "YER111C", similarity = "OA")  # Swi4 - collaborators
GOSim::getGeneSim("YDL056W", "YBR160W", similarity = "OA")  # Cdc28 - mediator
GOSim::getGeneSim("YDL056W", "YGR108W", similarity = "OA")  # Clb1 - antagonist
GOSim::getGeneSim("YDL056W", "YLR079W", similarity = "OA")  # Sic1 - antagonist
GOSim::getGeneSim("YDL056W", "YJL130C", similarity = "OA")  # Pgk1 - Gluconeogenesis
# = 4 GO Term Enrichment in Gene Sets =====================================
# Calculating GO term enrichment in gene sets is done with the Bioconductor
# topGO package.
if (! requireNamespace("topGO", quietly = TRUE)) {
  BiocManager::install("topGO")
}
# Package information:
#  library(help = topGO)       # basic information
#  browseVignettes("topGO")    # available vignettes
#  data(package = "topGO")     # available datasets

# Once again - assumptions are made by GOSim that require us to load the
# topGO package wholesale:
library(topGO)

# Let's define a gene set: GOterm enrichment for G1/S switch activators:
mySet <- c("YFR028C",  # Cdc14
           "YDL056W",  # Mbp1
           "YLR182W",  # Swi6
           "YER111C",  # Swi4
           "YOR083W",  # Whi5
           "YBR160W",  # Cdc28
           "YMR199W",  # Cln1
           "YPL256C",  # Cln2
           "YAL040C")  # Cln3

# The context against which we define enrichment: all keys that begin
# with "Y".
allGenes <- AnnotationDbi::keys(org.Sc.sgd.db)
allGenes <- allGenes[grep("^Y", allGenes)]

# GOenrichment() is a GOSim function. It is written with its package prefix,
# like every other package call in this script, so that it is clear where it
# comes from.
myEnr <- GOSim::GOenrichment(mySet, allGenes)

sort(myEnr$p.values)  # Any significantly enriched terms? All of these are ...

# Most significantly enriched is GO:0071931. What is this?
annotate::getGOTerm("GO:0071931")  # ... makes sense.
# [END]
# tocID <- "BIN-FUNC-Semantic_similarity.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-FUNC_Semantic_similarity unit.
#
# Version: 1.2
#
# Date: 2017-11 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 2020 Maintenance
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use BiocManager:: not biocLite()
# 1.0 New code.
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------------------------------------
#TOC> 1 Preparations: Packages, AnnotationDB, Setup 43
#TOC> 2 Fetch GO Annotations 100
#TOC> 3 Semantic Similarities 109
#TOC> 4 GO Term Enrichment in Gene Sets 127
#TOC>
#TOC> ==========================================================================
# = 1 Preparations: Packages, AnnotationDB, Setup =========================
# BiocManager is used below to install the Bioconductor packages.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

# GOSim is an R-package in the Bioconductor project.
if (!requireNamespace("GOSim", quietly = TRUE)) {
  BiocManager::install("GOSim")
}
# Package information:
#  library(help = GOSim)       # basic information
#  browseVignettes("GOSim")    # available vignettes
#  data(package = "GOSim")     # available datasets

# GOSim makes extensive assumptions about loaded packages, and many base
# methods are masked. We will thus use library(GOSim) to load it
# in its entirety and with all packages it depends on. We will still use
# the <package>::<function>() syntax in the code below, but this now serves
# more of a didactic purpose, rather than actual syntax requirements.
library(GOSim)

# GOSim loads human annotations in org.Hs.eg.db by default. We load yeast
# annotations instead...
if (!requireNamespace("org.Sc.sgd.db", quietly = TRUE)) {
  BiocManager::install("org.Sc.sgd.db")
}

# Bioconductor annotation packages won't work stably unless we actually load
# them:
library(org.Sc.sgd.db)
# org.Sc.sgd.db is a Bioconductor annotation database curated by SGD. Such
# databases exist for all model organisms. It's a kind of a fancy data frame
# from which we can get annotations by rows (genes) with the keys() function ...
AnnotationDbi::keys(org.Sc.sgd.db)[1500:1510]

# ... and the types of available annotations with the columns() function
AnnotationDbi::columns(org.Sc.sgd.db)

# Note that one of the columns is "GO" ... and we load that into the
# data structures used by GOSim:

# Choose GOterms to use
GOSim::setEvidenceLevel(evidences = "all",
                        organism  = org.Sc.sgdORGANISM,
                        gomap     = org.Sc.sgdGO)

# Use Biological Process ontology
GOSim::setOntology("BP", loadIC = FALSE)

# Confirm that we loaded the correct ontology
head(get("gomap", envir = GOSimEnv))
# = 2 Fetch GO Annotations ================================================
# All keys being used here are yeast systematic names.
# Get one set of annotations
GOSim::getGOInfo("YDL056W")   # Mbp1
# = 3 Semantic Similarities ===============================================
# Get semantic similarities between genes
?getGeneSim

# There are _many_ different metrics of term similarity implemented
# in this package.

# Mbp1 (YDL056W) and ...
GOSim::getGeneSim("YDL056W", "YLR182W", similarity = "OA")  # Swi6 - MCB complex
GOSim::getGeneSim("YDL056W", "YER111C", similarity = "OA")  # Swi4 - collaborators
GOSim::getGeneSim("YDL056W", "YBR160W", similarity = "OA")  # Cdc28 - mediator
GOSim::getGeneSim("YDL056W", "YGR108W", similarity = "OA")  # Clb1 - antagonist
GOSim::getGeneSim("YDL056W", "YLR079W", similarity = "OA")  # Sic1 - antagonist
GOSim::getGeneSim("YDL056W", "YJL130C", similarity = "OA")  # Pgk1 - Gluconeogenesis
# = 4 GO Term Enrichment in Gene Sets =====================================
# Calculating GO term enrichment in gene sets is done with the Bioconductor
# topGO package.
if (! requireNamespace("topGO", quietly = TRUE)) {
  BiocManager::install("topGO")
}
# Package information:
#  library(help = topGO)       # basic information
#  browseVignettes("topGO")    # available vignettes
#  data(package = "topGO")     # available datasets

# Once again - assumptions are made by GOSim that require us to load the
# topGO package wholesale:
library(topGO)

# Let's define a gene set: GOterm enrichment for G1/S switch activators:
mySet <- c("YFR028C",  # Cdc14
           "YDL056W",  # Mbp1
           "YLR182W",  # Swi6
           "YER111C",  # Swi4
           "YOR083W",  # Whi5
           "YBR160W",  # Cdc28
           "YMR199W",  # Cln1
           "YPL256C",  # Cln2
           "YAL040C")  # Cln3

# The context against which we define enrichment: all keys that begin
# with "Y".
allGenes <- AnnotationDbi::keys(org.Sc.sgd.db)
allGenes <- allGenes[grep("^Y", allGenes)]

# GOenrichment() is a GOSim function. It is written with its package prefix,
# like every other package call in this script, so that it is clear where it
# comes from.
myEnr <- GOSim::GOenrichment(mySet, allGenes)

sort(myEnr$p.values)  # Any significantly enriched terms? All of these are ...

# Most significantly enriched is GO:0071931. What is this?
annotate::getGOTerm("GO:0071931")  # ... makes sense.
# [END]

View File

@ -1,351 +1,351 @@
# tocID <- "BIN-MYSPE.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-MYSPE unit
#
#
# Version: 1.4
#
# Date: 2017-09 - 2021-10
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# V 1.4 Add troubleshooting hints via errText[[...]]
# V 1.3 2021 update of MYSPE mechanics; fix a bug no one had complained about
# V 1.2 Reorganized proportional plot section into a "further reading"
# section, added nested-box, and sankey plot visualization of
# proportions. Introduced plotly.
# V 1.1 2020 Workflow changes
# V 1.0.1 Move ABC-makeMYSPElist.R to ./scripts directory
# V 1.0 Final code, after rewriting BLAST parser and updating MYSPElist
# V 0.1 First code copied from BCH441_A03_makeMYSPElist.R
#
# TODO: Sample solution for sankey plot function.
#
#
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
#
# DO NOT SIMPLY source() THESE FILES!
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -----------------------------------------------------------------
#TOC> 1 PREPARATIONS 52
#TOC> 2 SUITABLE MYSPE SPECIES 65
#TOC> 3 ADOPT "MYSPE" 89
#TOC> 4 FURTHER READING: PLOTTING PROPORTIONS 128
#TOC> 4.1 Percentages 146
#TOC> 4.2 Visualizing proportions: Pie chart 165
#TOC> 4.3 Visualizing proportions: Nested squares 243
#TOC> 4.4 Visualizing proportions: Sankey diagrams 280
#TOC>
#TOC> ==========================================================================
# = 1 PREPARATIONS ========================================================
#
# Execute the two conditionals below:
# Stop with a troubleshooting hint if the personal profile file is missing ...
if (!file.exists("./myScripts/.myProfile.R")) {
  stop(errText[["noProfileFile"]])      # message defined in .Rprofile
}

# ... or if it did not define the student number.
if (!exists("myStudentNumber")) {
  stop(errText[["noStudentNumber"]])    # message defined in .Rprofile
}
# = 2 SUITABLE MYSPE SPECIES ==============================================
# In this unit we will select one species from a list of genome sequenced fungi
# and write it into your personalized profile file. This species will be called
# "MYSPE" (My Species) for other learning units and exercises.
# A detailed description of the process of compiling the list of genome
# sequenced fungi with protein annotations and Mbp1 homologues is in the file
# ./scripts/ABC-makeMYSPElist.R In brief, data for genome-sequenced fungi
# was retrieved from https://fungi.ensembl.org; a search for homologues to
# yeast Mbp1 was performed with BLAST at the NCBI, and the data was merged.
# A representative organism at each genus-level was chosen from those hits
# that actually have a homologue. Finally, a mapping table was constructed to
# asymmetrically retrieve unique species: a student number will retrieve
# a species, but (public) knowledge of the species cannot reconstruct the
# student number.
# Task: Study ./scripts/ABC-makeMYSPElist.R, it implements a typical workflow
# of selecting and combining data from various data resources. Studying
# it will give you a better sense of how such workflows can be
# implemented in practice.
# = 3 ADOPT "MYSPE" =======================================================
# Execute (the outer parentheses print the assigned value):
(MYSPE <- getMYSPE(myStudentNumber))

# If this produced an error, this session has not been properly set up. You
# may not yet have run init() and edited .myProfile.R , or that file is not
# in your myScripts/ folder. Fix this, and execute:
#
#   source(".Rprofile") .

# If this produced NA, your Student Number may not be correct, or you are not in
# my class-list. Contact me. Otherwise, this should have printed a species name,
# and the taxonomy ID of its genome-sequenced strain. This is your unique
# species for this course. Note it in your journal ...
biCode(MYSPE)                 # and also note its "BiCode" ...
(myTaxID <- names(MYSPE))     # and its taxID
# Task:
# =====
# Note down the species name and its five letter BiCode on your Student
# Wiki user page. Use this species whenever this or future assignments refer
# to MYSPE. Whenever you start a session, it will automatically be loaded
# from myScripts/.myProfile.R and is available as MYSPE .
# Here is some more information about MYSPE, taken from the table of genome-
# sequenced fungi that is in your ./data folder.
fungiDat <- read.csv("data/Species.csv")

# Row(s) of the table whose taxon ID is that of MYSPE
iMs <- which(fungiDat$Taxon.ID == myTaxID)

(myOr <- fungiDat$Classification[iMs])   # Taxonomic order
(myGn <- gsub("\\s.*", "", MYSPE))       # Taxonomic genus: first word of the
                                         # species name
(mySt <- fungiDat$Name[iMs])             # Taxonomic strain
# That's all.
# = 4 FURTHER READING: PLOTTING PROPORTIONS ===============================
# The material below is an exploration of data-preparation and plotting
# techniques; you can treat this as additional practice and further reading and
# I expect that some of the code and plotting examples may be useful in a
# different context.
# A frequent task is to visualize the proportion of elements with given
# categories in a sample. For example, we might ask what proportion of the
# genome-sequenced fungi belongs to the same order as MYSPE. Let's first
# collect the numbers.
# Count the table rows at each taxonomic level; the outer parentheses print
# each result.
(nFungi   <- nrow(fungiDat))                             # sequenced fungi
(nOrder   <- sum(grepl(myOr, fungiDat$Classification)))  # same order as MYSPE
(nGenus   <- sum(grepl(myGn, fungiDat$Name)))            # same genus as MYSPE
(nSpecies <- sum(grepl(MYSPE, fungiDat$Name)))           # same species as MYSPE
# == 4.1 Percentages =======================================================
# The zeroth-order approach to visualization is simply to print percentages:
cat(sprintf("\n%s comprise %5.2f%% of fungi.",
            myOr,
            100 * nOrder / nFungi))

# ... or, adding the actual numbers:
cat(sprintf("\n%s comprise %5.2f%% of fungi (%d of %d).",
            myOr,
            100 * nOrder / nFungi,
            nOrder,
            nFungi))
# But that's hard to visualize for most of us, and anyway, we don't know how
# that relates to other orders.
# == 4.2 Visualizing proportions: Pie chart ================================
# Often, we will use a pie chart instead. Pie charts are rather informal types
# of plots, not well suited for analysis. But easy to do:
# Define four colors to identify the four categories
pCol <- c("#ed394e", "#ff9582", "#ffd5c4", "#f2f2f0")

oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))  # set margins to ~ 0 and remember
                                          # the previous setting

# The four counts are nested (species within genus within order within
# fungi). Each wedge must therefore show only what is NOT already counted in
# the next smaller category, so that the wedges add up to nFungi and none of
# them can become negative.
pie(c(nSpecies,            # MYSPE itself
      nGenus - nSpecies,   # rest of the genus
      nOrder - nGenus,     # rest of the order
      nFungi - nOrder),    # all other fungi
    labels = "",
    radius = 0.9,
    main = "MYSPE in genome-sequenced fungi",
    lty = 0,               # turn borders for wedges off
    col = pCol,
    clockwise = TRUE,
    init.angle = 90)

title(main = MYSPE, line = 0, cex.main = 0.7)  # add a title to the plot

legend(x = 0.95, y = 0.8,                      # place the legend here
       legend = c("Species", "Genus", "Order", "Fungi"),
       y.intersp = 2,                          # line spacing for labels
       cex = 0.8,                              # character size for labels
       bty = "n",                              # "no" box around the legend
       pt.cex = 2,                             # size of colour boxes
       pch = 15,                               # a filled square
       col = pCol)

par(oPar)  # reset graphics state
# Unless MYSPE is one of the frequently sequenced species, there will only be a
# very thin wedge visible. Pie charts are not well suited to visualize small
# proportions.
# It is a little more useful if we have non-nested proportions - like the
# number of species in the same order overall:
myTbl <- sort(table(fungiDat$Classification), decreasing = TRUE)
head(myTbl)

# pie() does a reasonable job out of the box to interpret table() data:
pie(myTbl)

# ... we can improve this quickly with a bit of tweaking:
N   <- length(myTbl)
sel <- names(myTbl) == myOr   # TRUE for the MYSPE order, FALSE elsewhere

myCol      <- rep(pCol[4], N) # N elements of the background colour pCol[4]
myCol[sel] <- pCol[1]         # replace this one colour
myLbl      <- rep("", N)      # N labels of ""
myLbl[sel] <- myOr            # replace this one label with the MYSPE order

oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))  # set margins to ~ 0
pie(myTbl,
    labels = myLbl,
    col = myCol,
    border = "#DDDDDD",
    radius = 0.9,
    main = "MYSPE order",
    clockwise = TRUE,
    init.angle = 90)
par(oPar)  # reset graphics state
# But the overall problem remains.
# == 4.3 Visualizing proportions: Nested squares ===========================
# A simple alternative is to draw such proportions as nested squares. The
# side of each square is the square root of its count, so that its area is
# proportional to the count.
x <- sqrt(nFungi)

# Set margins to ~ 0 and the plot type to square
oPar <- par(mar = c(0.1, 0.1, 0.1, 0.1), pty = "s")

# Empty, square plot
plot(c(0, x), c(0, x),
     xlim = c(0, x),
     ylim = c(0, x),
     type = "n",
     axes = FALSE,
     xlab = "",
     ylab = "")

# Basic square for all genomes
rect(0, 0, x, x, col = pCol[4])

# Grid: horizontal lines first, then vertical lines.
# Each square on this grid is one genome.
u <- 0:floor(x)
N <- length(u)
segments(rep(0, N), u, rep(x, N), u, col = "#0000FF18")
segments(u, rep(0, N), u, rep(x, N), col = "#0000FF18")

# Colored squares, from the largest to the smallest so none gets covered
rect(0, 0, sqrt(nOrder),   sqrt(nOrder),   col = pCol[3])
rect(0, 0, sqrt(nGenus),   sqrt(nGenus),   col = pCol[2])
rect(0, 0, sqrt(nSpecies), sqrt(nSpecies), col = pCol[1])

# Labels
text(x / 2,    x / 2,    "Fungi")
text(x * 0.08, x * 0.11, myOr,  pos = 4, cex = 0.9)
text(x * 0.08, x * 0.06, myGn,  pos = 4, cex = 0.8)
text(x * 0.08, x * 0.02, MYSPE, pos = 4, cex = 0.7)

par(oPar)  # reset graphics state
# == 4.4 Visualizing proportions: Sankey diagrams ==========================
# Sankey diagrams are an excellent way to visualize complicated nested
# proportions and their changes (see here for example:
# https://www.r-graph-gallery.com/sankey-diagram.html). Here is a very simple
# example with the MYSPE proportions, as an illustration of the plotting
# principle.
# Install plotly on first use. quietly = TRUE suppresses the message that
# requireNamespace() otherwise prints while loading the namespace.
if (! requireNamespace("plotly", quietly = TRUE)) {
  install.packages("plotly")
}
# Package information:
# library(help = plotly) # basic information
# browseVignettes("plotly") # available vignettes
# data(package = "plotly") # available datasets
# Here, we use the plotly package that wraps a very well developed javascript
# library with many options for interactive plots. I am producing this plot
# hard-coded for the sample organism "Sporothrix schenckii"; you would need
# to change the code to adapt it to your own MYSPE - or even build a function
# for this. Do try this if you have a bit of coding experience, sankey diagrams
# are a good way to show hierarchical data relations - and if you get this
# working for your own organism you can be proud that you have understood
# how preparing the data works.
# Nodes of the diagram. Node IDs are zero-based and follow the order of the
# labels; x and y give each node's position, color its fill.
myNodes <- list(
  label = c("Fungi (1014)",               # 0  <- node ID
            "Ophiostomatales (6)",        # 1
            "Other...",                   # 2
            "Sporothrix (4)",             # 3
            "Other...",                   # 4
            "Sporothrix schenckii (2)",   # 5
            "Other..."),                  # 6
  x = c(0.1, 0.4, 0.4, 0.7, 0.7, 1.0, 1.0),
  y = c(0.3, 0.1, 0.7, 0.2, 0.7, 0.3, 0.7),
  color = c("#f2f2f0",
            "#ffd5c4",
            "#CCCCCC",
            "#ff9582",
            "#CCCCCC",
            "#ed394e",
            "#CCCCCC"),
  pad = 15,
  thickness = 20,
  line = list(color = "black",
              width = 0.5)
)

# Links between nodes: element i connects source[i] to target[i] with weight
# value[i], i.e. there is a link of weight 6 between node 0 and node 1.
myLinks <- list(
  source = c(0,  0, 1, 1, 3, 3),
  target = c(1,  2, 3, 4, 5, 6),
  value  = c(6, 18, 4, 2, 2, 2)
)

# Setting up the actual plot ...
fig <- plotly::plot_ly(type = "sankey",
                       arrangement = "snap",
                       orientation = "h",
                       node = myNodes,
                       link = myLinks)

# Adding and adjusting a few layout parameters
fig <- plotly::layout(fig,
                      title = "Fungi Genomes - Classification",
                      font = list(size = 10))

fig  # plot the diagram
# Note that the plot appears in the Viewer window, not the Plot window, and that
# it is interactive: you can hover over nodes and links, and drag the nodes
# around.
# [END]
# tocID <- "BIN-MYSPE.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-MYSPE unit
#
#
# Version: 1.4
#
# Date: 2017-09 - 2021-10
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# V 1.4 Add troubleshooting hints via errText[[...]]
# V 1.3 2021 update of MYSPE mechanics; fix a bug no one had complained about
# V 1.2 Reorganized proportional plot section into a "further reading"
# section, added nested-box, and sankey plot visualization of
# proportions. Introduced plotly.
# V 1.1 2020 Workflow changes
# V 1.0.1 Move ABC-makeMYSPElist.R to ./scripts directory
# V 1.0 Final code, after rewriting BLAST parser and updating MYSPElist
# V 0.1 First code copied from BCH441_A03_makeMYSPElist.R
#
# TODO: Sample solution for sankey plot function.
#
#
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
#
# DO NOT SIMPLY source() THESE FILES!
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -----------------------------------------------------------------
#TOC> 1 PREPARATIONS 52
#TOC> 2 SUITABLE MYSPE SPECIES 65
#TOC> 3 ADOPT "MYSPE" 89
#TOC> 4 FURTHER READING: PLOTTING PROPORTIONS 128
#TOC> 4.1 Percentages 146
#TOC> 4.2 Visualizing proportions: Pie chart 165
#TOC> 4.3 Visualizing proportions: Nested squares 243
#TOC> 4.4 Visualizing proportions: Sankey diagrams 280
#TOC>
#TOC> ==========================================================================
# = 1 PREPARATIONS ========================================================
#
# Run both checks below before you continue. Each one stops with a
# troubleshooting message from errText if its prerequisite is missing.

# ... the personal profile file must exist in the myScripts folder
if (! file.exists("./myScripts/.myProfile.R")) {
  stop(errText[["noProfileFile"]])    # message defined in .Rprofile
}

# ... and the student number must have been loaded into the session
if (! exists("myStudentNumber")) {
  stop(errText[["noStudentNumber"]])  # message defined in .Rprofile
}
# = 2 SUITABLE MYSPE SPECIES ==============================================
# In this unit we will select one species from a list of genome sequenced fungi
# and write it into your personalized profile file. This species will be called
# "MYSPE" (My Species) for other learning units and exercises.
# A detailed description of the process of compiling the list of genome
# sequenced fungi with protein annotations and Mbp1 homologues is in the file
# ./scripts/ABC-makeMYSPElist.R In brief, data for genome-sequenced fungi
# was retrieved from https://fungi.ensembl.org; a search for homologues to
# yeast Mbp1 was performed with BLAST at the NCBI, and the data was merged.
# A representative organism at each genus-level was chosen from those hits
# that actually have a homologue. Finally, a mapping table was constructed to
# asymmetrically retrieve unique species: a student number will retrieve
# a species, but (public) knowledge of the species cannot reconstruct the
# student number.
# Task: Study ./scripts/ABC-makeMYSPElist.R, it implements a typical workflow
# of selecting and combining data from various data resources. Studying
# it will give you a better sense of how such workflows can be
# implemented in practice.
# = 3 ADOPT "MYSPE" =======================================================
# Execute:
# (the enclosing parentheses make R print the value that was just assigned)
( MYSPE <- getMYSPE(myStudentNumber) )
# If this produced an error, this session has not been properly set up. You
# may not yet have run init() and edited .myProfile.R , or that file is not
# in your myScripts/ folder. Fix this, and execute:
#
# source(".Rprofile") .
# If this produced NA, your Student Number may not be correct, or you are not in
# my class-list. Contact me. Otherwise, this should have printed a species name,
# and the taxonomy ID of its genome-sequenced strain. This is your unique
# species for this course. Note it in your journal ...
biCode(MYSPE) # and also note its "BiCode" ...
# the taxonomy ID is stored as the name of the MYSPE element
( myTaxID <- names(MYSPE) ) # and its taxID
# Task:
# =====
# Note down the species name and its five letter BiCode on your Student
# Wiki user page. Use this species whenever this or future assignments refer
# to MYSPE. Whenever you start a session, it will automatically be loaded
# from myScripts/.myProfile.R and is available as MYSPE .
# More details about MYSPE can be looked up in the table of genome-sequenced
# fungi that is stored in your ./data folder.
fungiDat <- read.csv("data/Species.csv")

# index of the row whose taxonomy ID matches the taxonomy ID of MYSPE
iMs <- which(fungiDat$Taxon.ID == myTaxID)

( myOr <- fungiDat[iMs, "Classification"] )  # Taxonomic order
( myGn <- gsub("\\s.*", "", MYSPE) )         # Taxonomic genus: everything from
                                             # the first whitespace on is removed
( mySt <- fungiDat[iMs, "Name"] )            # Taxonomic strain
# That's all.
# = 4 FURTHER READING: PLOTTING PROPORTIONS ===============================
# The material below is an exploration of data-preparation and plotting
# techniques; you can treat this as additional practice and further reading and
# I expect that some of the code and plotting examples may be useful in a
# different context.
# A frequent task is to visualize the proportion of elements with given
# categories in a sample. For example, we might ask: what proportion of the
# genome-sequenced fungi belongs to the same order as MYSPE? Let's first
# collect the numbers.
#
# The names are matched as literal substrings (fixed = TRUE), not as regular
# expressions: taxonomic names can contain characters that have a special
# meaning in a regular expression - for example the square brackets in names
# like "[Candida] glabrata" - and these would otherwise give wrong counts.

( nFungi <- nrow(fungiDat) )                      # sequenced fungi

( nOrder <- sum(grepl(myOr,                       # same order as MYSPE
                      fungiDat$Classification,
                      fixed = TRUE)) )

( nGenus <- sum(grepl(myGn,                       # same genus as MYSPE
                      fungiDat$Name,
                      fixed = TRUE)) )

( nSpecies <- sum(grepl(MYSPE,                    # same species as MYSPE
                        fungiDat$Name,
                        fixed = TRUE)) )
# == 4.1 Percentages =======================================================
# The zeroth-order approach to visualization is simply to print the percentage
# of fungi that are in the same order as MYSPE:
cat(sprintf("\n%s comprise %5.2f%% of fungi.",
            myOr,
            100 * nOrder / nFungi))

# ... or, with the actual counts added in parentheses:
cat(sprintf("\n%s comprise %5.2f%% of fungi (%d of %d).",
            myOr,
            100 * nOrder / nFungi,
            nOrder,
            nFungi))
# But that's hard to visualize for most of us, and anyway, we don't know how
# that relates to other orders.
# == 4.2 Visualizing proportions: Pie chart ================================
# Often, we will use a pie chart instead. Pie charts are rather informal types
# of plots, not well suited for analysis. But easy to do:
# Define four colors to identify the four categories
pCol <- c("#ed394e", "#ff9582", "#ffd5c4", "#f2f2f0")

oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))  # set margins to ~ 0 and remember
                                          # the previous setting

# The categories are nested: the species counts are contained in the genus
# counts, these are contained in the order counts, and these in turn are
# contained in the count of all fungi. To get wedges that do not overlap and
# that add up to nFungi, we subtract from each category ONLY the next smaller
# category that is contained in it.
pie(c(nSpecies,               # MYSPE itself
      nGenus - nSpecies,      # other species of the same genus
      nOrder - nGenus,        # other genera of the same order
      nFungi - nOrder),       # all other fungi
    labels = "",
    radius = 0.9,
    main = "MYSPE in genome-sequenced fungi",
    lty = 0,                  # turn borders for wedges off
    col = pCol,
    clockwise = TRUE,
    init.angle = 90)

title(main = MYSPE, line = 0, cex.main = 0.7)  # add a title to the plot

legend(x = 0.95, y = 0.8,     # place the legend here
       legend = c("Species", "Genus", "Order", "Fungi"),
       y.intersp = 2,         # line spacing for labels
       cex = 0.8,             # character size for labels
       bty = "n",             # "no" box around the legend
       pt.cex = 2,            # size of colour boxes
       pch = 15,              # a filled square
       col = pCol)

par(oPar)                     # reset graphics state
# Unless MYSPE is one of the frequently sequenced species, there will only be a
# very thin wedge visible. Pie charts are not well suited to visualize small
# proportions.
# It is a little more useful if we have non-nested proportions - like the
# number of species in the same order overall:
myTbl <- sort(table(fungiDat$Classification), decreasing = TRUE)
head(myTbl)

# pie() does a reasonable job out of the box to interpret table() data:
pie(myTbl)

# ... we can improve this quickly with a bit of tweaking:
N   <- length(myTbl)
sel <- names(myTbl) == myOr    # TRUE for the MYSPE order, FALSE elsewhere

myCol      <- rep(pCol[4], N)  # N elements of pCol[4] ...
myCol[sel] <- pCol[1]          # ... then replace this one color

myLbl      <- rep("", N)       # N labels of "" ...
myLbl[sel] <- myOr             # ... then replace this one label with the
                               # MYSPE order

oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))  # set margins to ~ 0
pie(myTbl,
    labels = myLbl,
    radius = 0.9,
    main = "MYSPE order",
    border = "#DDDDDD",
    col = myCol,
    clockwise = TRUE,
    init.angle = 90)
par(oPar)                      # reset graphics state
# But the overall problem remains.
# == 4.3 Visualizing proportions: Nested squares ===========================
# A simple alternative is to draw such proportions as nested squares. The side
# of each square is the square root of the count it represents.
x <- sqrt(nFungi)

# set margins to ~ 0 and plot type to square
oPar <- par(mar = c(0.1, 0.1, 0.1, 0.1), pty = "s")

# empty, square plot
plot(x = c(0, x), y = c(0, x),
     xlim = c(0, x), ylim = c(0, x),
     type = "n", axes = FALSE, xlab = "", ylab = "")

# basic square for all genomes
rect(xleft = 0, ybottom = 0, xright = x, ytop = x, col = pCol[4])

# grid: horizontal lines first, then vertical lines
u <- 0:floor(x)
N <- length(u)
segments(x0 = rep(0, N), y0 = u, x1 = rep(x, N), y1 = u, col = "#0000FF18")
segments(x0 = u, y0 = rep(0, N), x1 = u, y1 = rep(x, N), col = "#0000FF18")
# each square on this grid is one genome

# colored squares, drawn from the largest to the smallest
rect(xleft = 0, ybottom = 0,
     xright = sqrt(nOrder), ytop = sqrt(nOrder), col = pCol[3])
rect(xleft = 0, ybottom = 0,
     xright = sqrt(nGenus), ytop = sqrt(nGenus), col = pCol[2])
rect(xleft = 0, ybottom = 0,
     xright = sqrt(nSpecies), ytop = sqrt(nSpecies), col = pCol[1])

# labels
text(x = x / 2,    y = x / 2,    labels = "Fungi")
text(x = x * 0.08, y = x * 0.11, labels = myOr,  pos = 4, cex = 0.9)
text(x = x * 0.08, y = x * 0.06, labels = myGn,  pos = 4, cex = 0.8)
text(x = x * 0.08, y = x * 0.02, labels = MYSPE, pos = 4, cex = 0.7)

par(oPar)  # reset graphics state
# == 4.4 Visualizing proportions: Sankey diagrams ==========================
# Sankey diagrams are an excellent way to visualize complicated nested
# proportions and their changes (see here for example:
# https://www.r-graph-gallery.com/sankey-diagram.html). Here is a very simple
# example with the MYSPE proportions, as an illustration of the plotting
# principle.
# Install plotly if it is not yet available. quietly = TRUE suppresses the
# messages of requireNamespace() itself, in the same way as the package checks
# in the other units.
if (! requireNamespace("plotly", quietly = TRUE)) {
  install.packages("plotly")
}
# Package information:
# library(help = plotly) # basic information
# browseVignettes("plotly") # available vignettes
# data(package = "plotly") # available datasets
# Here, we use the plotly package that wraps a very well developed javascript
# library with many options for interactive plots. I am producing this plot
# hard-coded for the sample organism "Sporothrix schenckii"; you would need
# to change the code to adapt it to your own MYSPE - or even build a function
# for this. Do try this if you have a bit of coding experience, sankey diagrams
# are a good way to show hierarchical data relations - and if you get this
# working for your own organism you can be proud that you have understood
# how preparing the data works.
# Node definitions. Nodes are referred to by their position in these vectors,
# counting from zero (see the node IDs noted next to the labels).
myNodes <- list(
  label = c("Fungi (1014)",              # 0  <- node ID
            "Ophiostomatales (6)",       # 1
            "Other...",                  # 2
            "Sporothrix (4)",            # 3
            "Other...",                  # 4
            "Sporothrix schenckii (2)",  # 5
            "Other..."),                 # 6
  x = c(0.1, 0.4, 0.4, 0.7, 0.7, 1.0, 1.0),
  y = c(0.3, 0.1, 0.7, 0.2, 0.7, 0.3, 0.7),
  color = c("#f2f2f0",
            "#ffd5c4",
            "#CCCCCC",
            "#ff9582",
            "#CCCCCC",
            "#ed394e",
            "#CCCCCC"),
  pad = 15,
  thickness = 20,
  line = list(color = "black",
              width = 0.5)
)

# Link definitions: the i-th elements of source, target and value together
# describe one link, e.g. there is a link of weight 6 between node 0 and node 1.
myLinks <- list(
  source = c(0, 0, 1, 1, 3, 3),
  target = c(1, 2, 3, 4, 5, 6),
  value  = c(6, 18, 4, 2, 2, 2)
)
# Build the sankey plot from the node and link definitions ...
fig <- plotly::plot_ly(
  type = "sankey",
  arrangement = "snap",
  orientation = "h",
  node = myNodes,
  link = myLinks
)

# ... then add a title and set the font size
fig <- plotly::layout(
  fig,
  title = "Fungi Genomes - Classification",
  font = list(size = 10)
)

fig  # plot the diagram
# Note that the plot appears in the Viewer window, not the Plot window, and that
# it is interactive: you can hover over nodes and links, and drag the nodes
# around.
# [END]

View File

@ -1,234 +1,234 @@
# tocID <- "BIN-PHYLO-Data_preparation.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-PHYLO-Data_preparation unit.
#
# Version: 1.2
#
# Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 2020 Maintenance
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use Biocmanager:: not biocLite()
# 1.0 First 2017 version
# 0.1 First code copied from 2016 material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------------------
#TOC> 1 Preparations 45
#TOC> 2 Fetching sequences 77
#TOC> 3 Multiple Sequence Alignment 118
#TOC> 4 Reviewing and Editing Alignments 137
#TOC> 4.1 Masking workflow 153
#TOC>
#TOC> ==========================================================================
# = 1 Preparations ========================================================
# You need to reload your protein database, including changes that might have
# been made to the reference files. If you have worked with the prerequisite
# units, you should have a script named "makeProteinDB.R" that will create the
# myDB object with a protein and feature database. Ask for advice if not.
source("myScripts/makeProteinDB.R")

# Make sure the packages we need are installed. Biostrings and msa are
# installed through BiocManager, so BiocManager comes first.
if (! requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

if (! requireNamespace("Biostrings", quietly = TRUE)) {
  BiocManager::install("Biostrings")
}
# Package information:
#  library(help = Biostrings)       # basic information
#  browseVignettes("Biostrings")    # available vignettes
#  data(package = "Biostrings")     # available datasets

if (! requireNamespace("msa", quietly = TRUE)) {
  BiocManager::install("msa")
}
# Package information:
#  library(help = msa)       # basic information
#  browseVignettes("msa")    # available vignettes
#  data(package = "msa")     # available datasets
# = 2 Fetching sequences ==================================================
# myDB contains the ten Mbp1 orthologues from the reference species and the Mbp1
# RBM for MYSPE. We will construct a phylogenetic tree from the proteins' APSES
# domains. You have annotated their ranges as a feature. The following code
# retrieves the sequences from myDB. You have seen similar code in other units.
#
# Note that the variable "sel" is reused several times below, so the order of
# the statements matters.
# Step 1: indices of all proteins whose name begins with "MBP1_" ...
sel <- grep("^MBP1_", myDB$protein$name)
# ... their names and their IDs
(proNames <- myDB$protein$name[sel])
(proIDs <- myDB$protein$ID[sel])
# Step 2: sel now holds the ID of the feature named "APSES fold"
(sel <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
# Step 3: IDs of all annotations that belong to one of our proteins AND that
# annotate the "APSES fold" feature
(fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID %in% proIDs & # %in% !
myDB$annotation$featureID == sel]) # == !
# Why?
# Step 4: collect one sequence per annotation. The result vector is
# preallocated, then filled and named element by element.
APSI <- character(length(fanIDs))
for (i in seq_along(fanIDs)) {
sel <- myDB$annotation$ID == fanIDs[i] # get the feature row index
proID <- myDB$annotation$proteinID[sel] # get its protein ID
start <- myDB$annotation$start[sel] # get start ...
end <- myDB$annotation$end[sel] # ... and end
sel <- myDB$protein$ID == proID # get the protein row index ...
# ... and the sequence
APSI[i] <- substring(myDB$protein$sequence[sel], start, end)
# name the element with the name of the protein it was taken from
names(APSI)[i] <- (myDB$protein$name[sel])
}
head(APSI)
# Let's add the E.coli Kila-N domain sequence as an outgroup, for rooting our
# phylogenetic tree (see the unit's Wiki page for details on the sequence).
# The new element is appended to APSI and named "KILA_ESCCO" in one step.
APSI <- c(APSI,
          KILA_ESCCO = "IDGEIIHLRAKDGYINATSMCRTAGKLLSDYTRLKTTQEFFDELSRDMGIPISELIQSFKGGRPENQGTWVHPDIAINLAQ")

tail(APSI)
# = 3 Multiple Sequence Alignment =========================================
# This vector of sequences with named elements fulfills the requirements to be
# imported as a Biostrings object - an AAStringSet - which we need as input for
# the MSA algorithms in the msa package.
#
APSESSet <- Biostrings::AAStringSet(APSI)
# align the sequences with MUSCLE
APSESMsa <- msa::msaMuscle(APSESSet, order = "aligned")
# Nb. msaMuscle() sometimes fails - reproducibly, but I am not sure why. If
# that happens in your case, just use msaClustalOmega() instead.
# inspect the alignment.
writeALN(APSESMsa)
# What do you think? Is this a good alignment for phylogenetic inference?
# = 4 Reviewing and Editing Alignments ====================================
# Head back to the Wiki page for this unit and read up on the background
# first.
# Let's mask out all columns that have observations for
# less than 1/3 of the sequences in the dataset. This
# means they have more than round(nrow(APSESMsa) * (2/3))
# hyphens in a column.
#
# We take all sequences, split them into single
# characters, and put them into a matrix. Then we
# go through the matrix, column by column and decide
# whether we want to include that column.
# == 4.1 Masking workflow ==================================================
# get the length of the alignment (the width of the first aligned sequence)
(lenAli <- APSESMsa@unmasked@ranges@width[1])

# initialize a matrix that can hold all characters individually:
# one row per sequence, one column per alignment position
msaMatrix <- matrix(character(nrow(APSESMsa) * lenAli),
                    ncol = lenAli)

# assign the correct rownames
rownames(msaMatrix) <- APSESMsa@unmasked@ranges@NAMES

# split each aligned sequence into single characters and store them in the
# row of that sequence. We use seq_len() rather than 1:nrow(): for an object
# with zero rows, 1:0 would iterate over c(1, 0) instead of not iterating.
for (i in seq_len(nrow(APSESMsa))) {
  msaMatrix[i, ] <- unlist(strsplit(as.character(APSESMsa@unmasked[i]), ""))
}

# inspect the result
msaMatrix[1:7, 30:40]
# Now let's make a logical vector with an element for each column that selects
# which columns should be masked out.

# The number of hyphens in a column is easy to count. Consider:
msaMatrix[ , 20]               # column 20
msaMatrix[ , 20] == "-"        # TRUE for all gap characters
sum(msaMatrix[ , 20] == "-")   # adds 1 for each TRUE

# Thus filling our logical vector is simple:

# initialize a mask
colMask <- logical(ncol(msaMatrix))

# define the threshold for rejecting a column
limit <- round(nrow(APSESMsa) * (2/3))

# iterate over all columns, and write TRUE if there are less-or-equal to "limit"
# hyphens, FALSE if there are more - i.e. TRUE columns will be used for analysis
# and FALSE columns will be rejected.
# (seq_len() does not iterate at all if there are zero columns, whereas
# 1:ncol() would then iterate over c(1, 0).)
for (i in seq_len(ncol(msaMatrix))) {
  count <- sum(msaMatrix[ , i] == "-")
  colMask[i] <- count <= limit   # TRUE if less-or-equal to limit, FALSE if not
}

# Inspect the mask
colMask

# How many positions are being kept?
sum(colMask)
cat(sprintf("We are masking %4.2f %% of alignment columns.\n",
            100 * (1 - (sum(colMask) / length(colMask)))))
# Next, we use colMask to remove the masked columns from the matrix
# in one step. drop = FALSE makes sure that the result is still a matrix, even
# if only a single column is kept.
maskedMatrix <- msaMatrix[ , colMask, drop = FALSE]

# check:
ncol(maskedMatrix)

# ... then collapse each row of single characters back into a string. The
# result vector is preallocated with one element per sequence.
APSESphyloSet <- character(nrow(maskedMatrix))
for (i in seq_len(nrow(maskedMatrix))) {
  APSESphyloSet[i] <- paste(maskedMatrix[i, ], collapse = "")
}
names(APSESphyloSet) <- rownames(maskedMatrix)

# inspect ...
writeALN(APSESphyloSet)

# As you see, we have removed a three residue insertion from MBP1_NEUCR, and
# several indels from the KILA_ESCCO outgroup sequence.

# We save the aligned, masked domains to a file in the data/ directory,
# in multi-FASTA format.
writeMFA(APSESphyloSet, myCon = "data/APSESphyloSet.mfa")
# [END]
# tocID <- "BIN-PHYLO-Data_preparation.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-PHYLO-Data_preparation unit.
#
# Version: 1.2
#
# Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 2020 Maintenance
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use Biocmanager:: not biocLite()
# 1.0 First 2017 version
# 0.1 First code copied from 2016 material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------------------
#TOC> 1 Preparations 45
#TOC> 2 Fetching sequences 77
#TOC> 3 Multiple Sequence Alignment 118
#TOC> 4 Reviewing and Editing Alignments 137
#TOC> 4.1 Masking workflow 153
#TOC>
#TOC> ==========================================================================
# = 1 Preparations ========================================================
# You need to reload your protein database, including changes that might have
# been made to the reference files. If you have worked with the prerequisite
# units, you should have a script named "makeProteinDB.R" that will create the
# myDB object with a protein and feature database. Ask for advice if not.
source("myScripts/makeProteinDB.R")

# Make sure the packages we need are installed. Biostrings and msa are
# installed through BiocManager, so BiocManager comes first.
if (! requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

if (! requireNamespace("Biostrings", quietly = TRUE)) {
  BiocManager::install("Biostrings")
}
# Package information:
#  library(help = Biostrings)       # basic information
#  browseVignettes("Biostrings")    # available vignettes
#  data(package = "Biostrings")     # available datasets

if (! requireNamespace("msa", quietly = TRUE)) {
  BiocManager::install("msa")
}
# Package information:
#  library(help = msa)       # basic information
#  browseVignettes("msa")    # available vignettes
#  data(package = "msa")     # available datasets
# = 2 Fetching sequences ==================================================
# myDB contains the ten Mbp1 orthologues from the reference species and the Mbp1
# RBM for MYSPE. We will construct a phylogenetic tree from the proteins' APSES
# domains. You have annotated their ranges as a feature. The following code
# retrieves the sequences from myDB. You have seen similar code in other units.
#
# Note that the variable "sel" is reused several times below, so the order of
# the statements matters.
# Step 1: indices of all proteins whose name begins with "MBP1_" ...
sel <- grep("^MBP1_", myDB$protein$name)
# ... their names and their IDs
(proNames <- myDB$protein$name[sel])
(proIDs <- myDB$protein$ID[sel])
# Step 2: sel now holds the ID of the feature named "APSES fold"
(sel <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
# Step 3: IDs of all annotations that belong to one of our proteins AND that
# annotate the "APSES fold" feature
(fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID %in% proIDs & # %in% !
myDB$annotation$featureID == sel]) # == !
# Why?
# Step 4: collect one sequence per annotation. The result vector is
# preallocated, then filled and named element by element.
APSI <- character(length(fanIDs))
for (i in seq_along(fanIDs)) {
sel <- myDB$annotation$ID == fanIDs[i] # get the feature row index
proID <- myDB$annotation$proteinID[sel] # get its protein ID
start <- myDB$annotation$start[sel] # get start ...
end <- myDB$annotation$end[sel] # ... and end
sel <- myDB$protein$ID == proID # get the protein row index ...
# ... and the sequence
APSI[i] <- substring(myDB$protein$sequence[sel], start, end)
# name the element with the name of the protein it was taken from
names(APSI)[i] <- (myDB$protein$name[sel])
}
head(APSI)
# Let's add the E.coli Kila-N domain sequence as an outgroup, for rooting our
# phylogenetic tree (see the unit's Wiki page for details on the sequence).
# The new element is appended to APSI and named "KILA_ESCCO" in one step.
APSI <- c(APSI,
          KILA_ESCCO = "IDGEIIHLRAKDGYINATSMCRTAGKLLSDYTRLKTTQEFFDELSRDMGIPISELIQSFKGGRPENQGTWVHPDIAINLAQ")

tail(APSI)
# = 3 Multiple Sequence Alignment =========================================
# This vector of sequences with named elements fulfills the requirements to be
# imported as a Biostrings object - an AAStringSet - which we need as input for
# the MSA algorithms in the msa package.
#
APSESSet <- Biostrings::AAStringSet(APSI)
# align the sequences with MUSCLE
APSESMsa <- msa::msaMuscle(APSESSet, order = "aligned")
# Nb. msaMuscle() sometimes fails - reproducibly, but I am not sure why. If
# that happens in your case, just use msaClustalOmega() instead.
# inspect the alignment.
writeALN(APSESMsa)
# What do you think? Is this a good alignment for phylogenetic inference?
# = 4 Reviewing and Editing Alignments ====================================
# Head back to the Wiki page for this unit and read up on the background
# first.
# Let's mask out all columns that have observations for
# less than 1/3 of the sequences in the dataset. This
# means they have more than round(nrow(APSESMsa) * (2/3))
# hyphens in a column.
#
# We take all sequences, split them into single
# characters, and put them into a matrix. Then we
# go through the matrix, column by column and decide
# whether we want to include that column.
# == 4.1 Masking workflow ==================================================
# get the length of the alignment (the width of the first aligned sequence)
(lenAli <- APSESMsa@unmasked@ranges@width[1])

# initialize a matrix that can hold all characters individually:
# one row per sequence, one column per alignment position
msaMatrix <- matrix(character(nrow(APSESMsa) * lenAli),
                    ncol = lenAli)

# assign the correct rownames
rownames(msaMatrix) <- APSESMsa@unmasked@ranges@NAMES

# split each aligned sequence into single characters and store them in the
# row of that sequence. We use seq_len() rather than 1:nrow(): for an object
# with zero rows, 1:0 would iterate over c(1, 0) instead of not iterating.
for (i in seq_len(nrow(APSESMsa))) {
  msaMatrix[i, ] <- unlist(strsplit(as.character(APSESMsa@unmasked[i]), ""))
}

# inspect the result
msaMatrix[1:7, 30:40]
# Now let's make a logical vector with an element for each column that selects
# which columns should be masked out.

# The number of hyphens in a column is easy to count. Consider:
msaMatrix[ , 20]               # column 20
msaMatrix[ , 20] == "-"        # TRUE for all gap characters
sum(msaMatrix[ , 20] == "-")   # adds 1 for each TRUE

# Thus filling our logical vector is simple:

# initialize a mask
colMask <- logical(ncol(msaMatrix))

# define the threshold for rejecting a column
limit <- round(nrow(APSESMsa) * (2/3))

# iterate over all columns, and write TRUE if there are less-or-equal to "limit"
# hyphens, FALSE if there are more - i.e. TRUE columns will be used for analysis
# and FALSE columns will be rejected.
# (seq_len() does not iterate at all if there are zero columns, whereas
# 1:ncol() would then iterate over c(1, 0).)
for (i in seq_len(ncol(msaMatrix))) {
  count <- sum(msaMatrix[ , i] == "-")
  colMask[i] <- count <= limit   # TRUE if less-or-equal to limit, FALSE if not
}

# Inspect the mask
colMask

# How many positions are being kept?
sum(colMask)
cat(sprintf("We are masking %4.2f %% of alignment columns.\n",
            100 * (1 - (sum(colMask) / length(colMask)))))
# Next, we use colMask to remove the masked columns from the matrix
# in one step. drop = FALSE makes sure that the result is still a matrix, even
# if only a single column is kept.
maskedMatrix <- msaMatrix[ , colMask, drop = FALSE]

# check:
ncol(maskedMatrix)

# ... then collapse each row of single characters back into a string. The
# result vector is preallocated with one element per sequence.
APSESphyloSet <- character(nrow(maskedMatrix))
for (i in seq_len(nrow(maskedMatrix))) {
  APSESphyloSet[i] <- paste(maskedMatrix[i, ], collapse = "")
}
names(APSESphyloSet) <- rownames(maskedMatrix)

# inspect ...
writeALN(APSESphyloSet)

# As you see, we have removed a three residue insertion from MBP1_NEUCR, and
# several indels from the KILA_ESCCO outgroup sequence.

# We save the aligned, masked domains to a file in the data/ directory,
# in multi-FASTA format.
writeMFA(APSESphyloSet, myCon = "data/APSESphyloSet.mfa")
# [END]

View File

@ -1,406 +1,406 @@
# tocID <- "BIN-PHYLO-Tree_analysis.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-PHYLO-Tree_analysis unit.
#
# Version: 1.2
#
# Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 2020 updates. Deprecate iTol and use taxize:: instead.
# Rewrite of tip re-ordering. Better handling of
# messages. pBar() for randomization.
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use Biocmanager:: not biocLite()
# 1.0.2 Typo in variable name, style changes
# 1.0.1 Wrong section heading
# 1.0 First 2017 version
# 0.1 First code copied from 2016 material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------------------
#TOC> 1 Preparation and Tree Plot 50
#TOC> 2 SPECIES REFERENCE TREE 66
#TOC> 3 Tree Analysis 117
#TOC> 3.1 Rooting Trees 177
#TOC> 3.2 Rotating Clades 222
#TOC> 3.3 Computing tree distances 309
#TOC>
#TOC> ==========================================================================
# = 1 Preparation and Tree Plot ===========================================
if (! requireNamespace("ape", quietly = TRUE)) {
  install.packages("ape")
}
# Package information:
#  library(help = ape)       # basic information
#  browseVignettes("ape")    # available vignettes
#  data(package = "ape")     # available datasets

# We change the graphics parameters from time to time, let's define the
# default so we can recreate a sane state:

# Close the current graphics device so that par() below reports the settings
# of a fresh device. dev.off() throws an error if no device is open, thus we
# only call it if there is one.
if (! is.null(dev.list())) {
  dev.off()
}

# Store only those parameters that can actually be set (no.readonly = TRUE).
# The read-only parameters would trigger warnings whenever we restore the
# state later with par(PAR).
PAR <- par(no.readonly = TRUE)
# = 2 SPECIES REFERENCE TREE ==============================================
# Before we do any kind of phylogenetic analysis of genes from several species,
# we MUST have a reference tree of the taxonomic relationships in hand. This
# context is absolutely required for the interpretation of our tree.
# We have the tax-ids in our database, and the NCBI has the species tree - we
# just need some way to extract the subtree that corresponds to our taxa of
# interest. Here's how to use the taxize:: package.
if (! requireNamespace("taxize", quietly = TRUE)) {
  install.packages("taxize")
}
# Package information:
#  library(help = taxize)       # basic information
#  browseVignettes("taxize")    # available vignettes
#  data(package = "taxize")     # available datasets

# Species of interest: all taxonomy IDs in our database, plus one more ID.
# NOTE(review): "83333" is presumably the NCBI taxonomy ID of the E. coli
# outgroup - confirm.
( mySOI <- c(myDB$taxonomy$ID, "83333") )

# Fetch the classification for each of these IDs from the NCBI ...
myClass <- taxize::classification(mySOI, db = "ncbi")
str(myClass)
myClass[[1]]

# ... compute a tree from the classifications, and plot it
fungiTree <- taxize::class2tree(myClass, check = TRUE)
plot(fungiTree)
# The tree produced by taxize:: contains full length species names, but it
# would be more convenient if it had bicodes instead. Also, the actual tree is
# only one part of the list() that was returned, which will cause problems
# later:
str(fungiTree)

# we therefore simplify, and keep only the tree itself
fungiTree <- fungiTree[["phylo"]]
str(fungiTree)

# The species names are now in the vector fungiTree$tip.label.
# We can use biCode() to shorten them.
fungiTree$tip.label <- biCode(fungiTree$tip.label)

# Plot the tree
nSP <- length(fungiTree$tip.label)
plot(fungiTree, cex = 0.8, root.edge = TRUE, no.margin = TRUE)
text(x = -1, y = nSP - 0.5, labels = "Species Tree:\nFungi", pos = 4)
ape::nodelabels(text = fungiTree$node.label,
                cex  = 0.6,
                adj  = 0.2,
                bg   = "#D4F2DA")
# Note that you can use the arrow buttons in the menu above the plot pane to
# scroll back to plots you have created earlier - so you can reference back to
# this species tree in your later analysis.
# = 3 Tree Analysis =======================================================
# 1.1 Visualizing your tree
# The trees that are produced by Rphylip are stored as an object of class
# "phylo". This is a class for phylogenetic trees that is widely used in the
# community, practically all R phylogenetics packages have options to read and
# manipulate such trees. Outside of R, a popular interchange format is the
# Newick_format that you have seen above. It's easy to output your calculated
# trees in Newick format and visualize them elsewhere.
# The "phylo" class object is one of R's "S3" objects and methods to plot and
# print it have been defined with the Rphylip package, and in ape. You can
# simply call plot(<your-tree>) and R knows what to do with <your-tree> and how
# to plot it. The underlying function is plot.phylo(), and documentation for its
# many options can be found by typing:
?plot.phylo

# Load the APSES sequence tree that you produced in the
# BIN-PHYLO-Tree_building unit ...
apsTree <- readRDS(file = "data/APSEStreeRproml.rds")

# ... and plot it in different layouts
plot(apsTree)                      # default type is "phylogram"
plot(apsTree, type = "unrooted")
plot(apsTree, type = "fan", no.margin = TRUE)

# Rescale to show all of the labels:
# record the plot parameters by assigning them to a variable ...
(tmp <- plot(apsTree, type = "fan", no.margin = TRUE, plot = FALSE))

# ... and enlarge the plot limits by a factor of 1.8 for a new plot:
plot(apsTree,
     type = "fan",
     x.lim = tmp$x.lim * 1.8,
     y.lim = tmp$y.lim * 1.8,
     cex = 0.8,
     no.margin = TRUE)
# Inspect the tree object
str(apsTree)
apsTree$tip.label
apsTree$edge
apsTree$edge.length
# show the node / edge and tip labels on a plot
plot(apsTree)
ape::nodelabels()
ape::edgelabels()
ape::tiplabels()
# show the number of nodes, edges and tips
ape::Nnode(apsTree)
ape::Nedge(apsTree)
ape::Ntip(apsTree)
par(PAR) # reset graphics state
# Finally, write the tree to console in Newick format
ape::write.tree(apsTree)
# == 3.1 Rooting Trees =====================================================
# In order to analyse the tree, it is helpful to root it first and reorder its
# clades. Contrary to documentation, Rproml() returns an unrooted tree.
ape::is.rooted(apsTree)
# You can root the tree with the command root() from the "ape" package.
plot(apsTree)
# add labels for internal nodes and tips
ape::nodelabels(cex = 0.5, frame = "circle")
ape::tiplabels(cex = 0.5, frame = "rect")
# The outgroup of the tree (KILA ESCCO) is tip "11" in my sample tree, it may be a different
# number in yours. Substitute the correct node number below for "outgroup".
apsTree <- ape::root(apsTree, outgroup = 11, resolve.root = TRUE)
plot(apsTree)
ape::is.rooted(apsTree)
# This tree _looks_ unchanged, because when the root trifurcation was resolved,
# an edge of length zero was added to connect the MRCA (Most Recent Common
# Ancestor) of the ingroup.
# The edge lengths are stored in the phylo object:
apsTree$edge.length
# ... and you can assign a small arbitrary value to the edge
# to show how it connects to the tree without having an
# overlap.
apsTree$edge.length[1] <- 0.1
plot(apsTree, cex = 0.7)
ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866")
# This procedure does however not assign an actual length to a root edge, and
# therefore no root edge is visible on the plot. Why? , you might ask. I ask
# myself that too. We'll just add a length by hand.
apsTree$root.edge <- mean(apsTree$edge.length) * 1.5
plot(apsTree, cex = 0.7, root.edge = TRUE)
ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866")
# == 3.2 Rotating Clades ===================================================
# To interpret the tree, it is useful to rotate the clades so that they appear
# in the order expected from the cladogram of species.
# We can either rotate around individual internal nodes ...
layout(matrix(1:2, 1, 2))
plot(apsTree, no.margin = TRUE, root.edge = TRUE)
ape::nodelabels(node = 13, cex = 0.7, bg = "#ff8866")
plot(ape::rotate(apsTree, node = 13), no.margin = TRUE, root.edge = TRUE)
ape::nodelabels(node = 13, cex = 0.7, bg = "#88ff66")
# Note that the species at the bottom of the clade descending from node
# 17 is now plotted at the top.
par(PAR) # reset graphics state
# ... or we can rearrange the tree so it corresponds as well as possible to a
# predefined tip ordering. Here we use the ordering that taxize:: has inferred
# from the NCBI taxonomic classification.
nOrg <- length(apsTree$tip.label)
plot(fungiTree,
no.margin = FALSE, root.edge = TRUE)
ape::nodelabels(text = fungiTree$node.label,
cex = 0.5,
adj = 0.2,
bg = "#D4F2DA")
# These are the fungi tree tips ...
fungiTree$tip.label
# ... and their order is determined by the edge-list that is stored in
fungiTree$edge
# which edges join the tips?
ape::tiplabels(cex = 0.5, frame = "rect")
# as you can see, the tips (range [1:nOrg] ) are in column 2 and they are
# ordered from bottom to top.
# And each tip number is the index of the species in the tip.label vector. So we can take column 2, subset it, and use it to get a list of species in the order of the tree ...
sel <- fungiTree$edge[ , 2 ] <= nOrg
( oSp <- fungiTree$tip.label[fungiTree$edge[sel , 2 ]] )
# Now, here are the genes of the apsTree tips ...
apsTree$tip.label
# ... and the "constraint" we need for reordering, according to the help page
# of ape::rotateConstr(), is "a vector specifying the order of the tips as they
# should appear (from bottom to top)". Thus we need to add the "MBP1_" prefix to our vector
# Build the tip-order constraint for ape::rotateConstr(): every species code
# gets the "MBP1_" gene prefix so that it matches the apsTree tip labels ...
oSp <- gsub("^", "MBP1_", oSp)
# ... except for the E. coli outgroup, whose tip is labelled "KILA_ESCCO" in
# apsTree. The species code is "ESCCO" (not "ESSCO"); with the misspelled
# pattern nothing was replaced and the constraint contained a label that does
# not exist in apsTree.
( oSp <- gsub("MBP1_ESCCO", "KILA_ESCCO", oSp) )
# Then we can plot the two trees to compare: the fungi- tree
par(PAR) # reset graphics state
layout(matrix(1:2, 1, 2))
plot(fungiTree,
no.margin = TRUE,
root.edge = TRUE)
ape::nodelabels(text = fungiTree$node.label,
cex = 0.5,
adj = 0.2,
bg = "#D4F2DA")
# and the re-organized apsesTree ...
plot(ape::rotateConstr(apsTree, constraint = oSp[]),
no.margin = TRUE,
root.edge = TRUE)
par(PAR) # reset graphics state
# As you can see, the reordering is not perfect, since the topologies are
# different, mostly due to the unresolved nodes in the reference tree. One
# could play with that ...
# Task: Study the two trees and consider their similarities and differences.
# What do you expect? What do you find? Note that this is not a "mixed"
# gene tree yet, since it contains only a single gene for the species
# we considered. All of the branch points in this tree are speciation
# events. Thus the gene tree should have the same topology as the
# species tree. Does it? Are the differences important? How many
# branches would you need to remove and reinsert elsewhere to get the
# same topology as the species tree?
# In order to quantify how different these two trees are, we need to compute
# tree distances.
# == 3.3 Computing tree distances ==========================================
# Many superb phylogeny tools are contributed by the phangorn package.
if (! requireNamespace("phangorn", quietly = TRUE)) {
install.packages("phangorn")
}
# Package information:
# library(help = phangorn) # basic information
# browseVignettes("phangorn") # available vignettes
# data(package = "phangorn") # available datasets
# To compare two trees, they must have the same tip labels. We delete "MBP1_" or
# "KILA_" from the existing tip labels in a copy of our APSES domain tree.
apsTree2 <- apsTree
apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label)
# phangorn provides several functions to compute tree-differences (and there
# is a _whole_ lot of theory on how to compare trees). treedist() returns the
# "symmetric difference"
phangorn::treedist(fungiTree, apsTree2, check.labels = TRUE)
# Numbers. What do they mean? How much more similar is our apsTree to the
# (presumably) ground truth of fungiTree than a random tree would be?
# The ape package provides the function rtree()
# to compute random trees.
ape::rtree(n = length(apsTree2$tip.label), # number of tips
rooted = TRUE, # we rooted the tree above,
# and fungiTree is rooted anyway
tip.label = apsTree2$tip.label, # use the apsTree2 labels
br = NULL) # don't generate branch lengths since
# fungiTree has none, so we can't
# compare them anyway.
# (Note the warning message about non-binary trees; we'll suppress that later
# by wrapping the function call in suppressMessages(); we don't want to
# print it 10,000 times :-)
# Let's compute some random trees this way, calculate the distances to
# fungiTree, and then compare the values we get for apsTree2. The random
# trees are provided by ape::rtree().
N <- 10000 # takes about 15 seconds, and we'll use the pBar function,
# defined in .utilities.R to keep track of where we are at:
myTreeDistances <- matrix(numeric(N * 2), ncol = 2)
colnames(myTreeDistances) <- c("symm", "path")
set.seed(112358)
for (i in 1:N) {
pBar(i, N)
xTree <- ape::rtree(n = length(apsTree2$tip.label),
rooted = TRUE,
tip.label = apsTree2$tip.label,
br = NULL)
myTreeDistances[i, ] <- suppressMessages(phangorn::treedist(fungiTree, xTree))
}
set.seed(NULL) # reset the random number generator
table(myTreeDistances[, "symm"])
( symmObs <- phangorn::treedist(fungiTree, apsTree2)[1] )
# Random events less-or-equal to observation, divided by total number of
# events gives us the empirical p-value.
cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n",
(sum(myTreeDistances[ , "symm"] <= symmObs) + 1) / (N + 1)))
par(PAR) # reset graphics state
hist(myTreeDistances[, "path"],
col = "aliceblue",
main = "Distances of random Trees to fungiTree")
(pathObs <- phangorn::treedist(fungiTree, apsTree2)[2])
abline(v = pathObs, col = "chartreuse")
# Random events less-or-equal to observation, divided by total number of
# events gives us the empirical p-value.
# Empirical p-value for the path difference: count the random trees whose
# path distance to fungiTree is less-or-equal to the observed path distance
# (pathObs - not symmObs, which is the symmetric-difference observation and
# lives on a different scale). The "+ 1" in numerator and denominator counts
# the observation itself, so the p-value can never be exactly zero.
cat(sprintf("\nEmpirical p-value for path diff. of observed tree is %1.4f\n",
            (sum(myTreeDistances[ , "path"] <= pathObs) + 1) / (N + 1)))
# Indeed, our apsTree is _very_ much more similar to the species tree than
# we would expect by random chance.
# What do we gain from that analysis? Analyzing the tree we get from a single
# gene of orthologous sequences is a positive control in our computational
# experiment. If these genes are indeed orthologues, a correct tree-building
# program ought to give us a tree that exactly matches the species tree.
# Evaluating how far off we are from the known correct result gives us a way to
# validate our workflow and our algorithm. If we can't get that right, we can't
# expect to get "real" data right either. Employing such positive controls in
# every computational experiment is essential for research. Not doing so is
# Cargo Cult Bioinformatics.
# [END]
# tocID <- "BIN-PHYLO-Tree_analysis.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-PHYLO-Tree_analysis unit.
#
# Version: 1.2
#
# Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 2020 updates. Deprecate iTol and use taxize:: instead.
# Rewrite of tip re-ordering. Better handling of
# messages. pBar() for randomization.
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use Biocmanager:: not biocLite()
# 1.0.2 Typo in variable name, style changes
# 1.0.1 Wrong section heading
# 1.0 First 2017 version
# 0.1 First code copied from 2016 material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------------------
#TOC> 1 Preparation and Tree Plot 50
#TOC> 2 SPECIES REFERENCE TREE 66
#TOC> 3 Tree Analysis 117
#TOC> 3.1 Rooting Trees 177
#TOC> 3.2 Rotating Clades 222
#TOC> 3.3 Computing tree distances 309
#TOC>
#TOC> ==========================================================================
# = 1 Preparation and Tree Plot ===========================================
if (! requireNamespace("ape", quietly = TRUE)) {
install.packages("ape")
}
# Package information:
# library(help = ape) # basic information
# browseVignettes("ape") # available vignettes
# data(package = "ape") # available datasets
# We change the graphics parameters from time to time, let's define the
# default so we can recreate a sane state:
# Close the current graphics device, if there is one, so that the parameters
# we record below are those of a fresh device. dev.off() throws an error when
# only the null device (device 1) is present, thus we check first.
if (dev.cur() > 1L) {
  dev.off()
}
# Record only the parameters that can actually be set: a plain par() also
# returns read-only parameters, and every later par(PAR) would then emit
# a warning for each of them.
PAR <- par(no.readonly = TRUE)
# = 2 SPECIES REFERENCE TREE ==============================================
# Before we do any kind of phylogenetic analysis of genes from several species,
# we MUST have a reference tree of the taxonomic relationships in hand. This
# context is absolutely required for the interpretation of our tree.
# We have the tax-ids in our database, and the NCBI has the species tree - we just need some way to extract the subtree that corresponds to our taxons of interest. Here's how to use the taxize:: package.
if (! requireNamespace("taxize", quietly = TRUE)) {
install.packages("taxize")
}
# Package information:
# library(help = taxize) # basic information
# browseVignettes("taxize") # available vignettes
# data(package = "taxize") # available datasets
( mySOI <- c(myDB$taxonomy$ID, "83333") )
myClass <- taxize::classification(mySOI, db = "ncbi")
str(myClass)
myClass[[1]]
fungiTree <- taxize::class2tree(myClass, check = TRUE)
plot(fungiTree)
# The tree produced by taxize:: contains full length species names,
# but it would be more convenient if it had bicodes instead. Also, the actual
# tree is only part of the list(), which will cause problems later:
str(fungiTree)
# we therefore simplify
fungiTree <- fungiTree$phylo
str(fungiTree)
# The species names are in a vector $phylo$tip.label of this list.
# We can use biCode() to shorten them.
fungiTree$tip.label <- biCode(fungiTree$tip.label)
# Plot the tree
nSP <- length(fungiTree$tip.label)
plot(fungiTree, cex = 0.8, root.edge = TRUE, no.margin = TRUE)
text(-1, nSP - 0.5, "Species Tree:\nFungi", pos = 4)
ape::nodelabels(text = fungiTree$node.label,
cex = 0.6,
adj = 0.2,
bg = "#D4F2DA")
# Note that you can use the arrow buttons in the menu above the plot pane to
# scroll back to plots you have created earlier - so you can reference back to
# this species tree in your later analysis.
# = 3 Tree Analysis =======================================================
# 1.1 Visualizing your tree
# The trees that are produced by Rphylip are stored as an object of class
# "phylo". This is a class for phylogenetic trees that is widely used in the
# community, practically all R phylogenetics packages will have options to read and
# manipulate such trees. Outside of R, a popular interchange format is the
# Newick_format that you have seen above. It's easy to output your calculated
# trees in Newick format and visualize them elsewhere.
# The "phylo" class object is one of R's "S3" objects and methods to plot and
# print it have been defined with the Rphylip package, and in ape. You can
# simply call plot(<your-tree>) and R knows what to do with <your-tree> and how
# to plot it. The underlying function is plot.phylo(), and documentation for its
# many options can be found by typing:
?plot.phylo
# We load the APSES sequence tree that you produced in the
# BIN-PHYLO-Tree_building unit:
apsTree <- readRDS(file = "data/APSEStreeRproml.rds")
plot(apsTree) # default type is "phylogram"
plot(apsTree, type = "unrooted")
plot(apsTree, type = "fan", no.margin = TRUE)
# rescale to show all of the labels:
# record the current plot parameters by assigning them to a variable ...
(tmp <- plot(apsTree, type="fan", no.margin = TRUE, plot=FALSE))
# ... and adjust the plot limits for a new plot:
plot(apsTree,
type = "fan",
x.lim = tmp$x.lim * 1.8,
y.lim = tmp$y.lim * 1.8,
cex = 0.8,
no.margin = TRUE)
# Inspect the tree object
str(apsTree)
apsTree$tip.label
apsTree$edge
apsTree$edge.length
# show the node / edge and tip labels on a plot
plot(apsTree)
ape::nodelabels()
ape::edgelabels()
ape::tiplabels()
# show the number of nodes, edges and tips
ape::Nnode(apsTree)
ape::Nedge(apsTree)
ape::Ntip(apsTree)
par(PAR) # reset graphics state
# Finally, write the tree to console in Newick format
ape::write.tree(apsTree)
# == 3.1 Rooting Trees =====================================================
# In order to analyse the tree, it is helpful to root it first and reorder its
# clades. Contrary to documentation, Rproml() returns an unrooted tree.
ape::is.rooted(apsTree)
# You can root the tree with the command root() from the "ape" package.
plot(apsTree)
# add labels for internal nodes and tips
ape::nodelabels(cex = 0.5, frame = "circle")
ape::tiplabels(cex = 0.5, frame = "rect")
# The outgroup of the tree (KILA ESCCO) is tip "11" in my sample tree, it may be a different
# number in yours. Substitute the correct node number below for "outgroup".
apsTree <- ape::root(apsTree, outgroup = 11, resolve.root = TRUE)
plot(apsTree)
ape::is.rooted(apsTree)
# This tree _looks_ unchanged, because when the root trifurcation was resolved,
# an edge of length zero was added to connect the MRCA (Most Recent Common
# Ancestor) of the ingroup.
# The edge lengths are stored in the phylo object:
apsTree$edge.length
# ... and you can assign a small arbitrary value to the edge
# to show how it connects to the tree without having an
# overlap.
apsTree$edge.length[1] <- 0.1
plot(apsTree, cex = 0.7)
ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866")
# This procedure does however not assign an actual length to a root edge, and
# therefore no root edge is visible on the plot. Why? , you might ask. I ask
# myself that too. We'll just add a length by hand.
apsTree$root.edge <- mean(apsTree$edge.length) * 1.5
plot(apsTree, cex = 0.7, root.edge = TRUE)
ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866")
# == 3.2 Rotating Clades ===================================================
# To interpret the tree, it is useful to rotate the clades so that they appear
# in the order expected from the cladogram of species.
# We can either rotate around individual internal nodes ...
layout(matrix(1:2, 1, 2))
plot(apsTree, no.margin = TRUE, root.edge = TRUE)
ape::nodelabels(node = 13, cex = 0.7, bg = "#ff8866")
plot(ape::rotate(apsTree, node = 13), no.margin = TRUE, root.edge = TRUE)
ape::nodelabels(node = 13, cex = 0.7, bg = "#88ff66")
# Note that the species at the bottom of the clade descending from node
# 17 is now plotted at the top.
par(PAR) # reset graphics state
# ... or we can rearrange the tree so it corresponds as well as possible to a
# predefined tip ordering. Here we use the ordering that taxize:: has inferred
# from the NCBI taxonomic classification.
nOrg <- length(apsTree$tip.label)
plot(fungiTree,
no.margin = FALSE, root.edge = TRUE)
ape::nodelabels(text = fungiTree$node.label,
cex = 0.5,
adj = 0.2,
bg = "#D4F2DA")
# These are the fungi tree tips ...
fungiTree$tip.label
# ... and their order is determined by the edge-list that is stored in
fungiTree$edge
# which edges join the tips?
ape::tiplabels(cex = 0.5, frame = "rect")
# as you can see, the tips (range [1:nOrg] ) are in column 2 and they are
# ordered from bottom to top.
# And each tip number is the index of the species in the tip.label vector. So we can take column 2, subset it, and use it to get a list of species in the order of the tree ...
sel <- fungiTree$edge[ , 2 ] <= nOrg
( oSp <- fungiTree$tip.label[fungiTree$edge[sel , 2 ]] )
# Now, here are the genes of the apsTree tips ...
apsTree$tip.label
# ... and the "constraint" we need for reordering, according to the help page
# of ape::rotateConstr(), is "a vector specifying the order of the tips as they
# should appear (from bottom to top)". Thus we need to add the "MBP1_" prefix to our vector
# Build the tip-order constraint for ape::rotateConstr(): every species code
# gets the "MBP1_" gene prefix so that it matches the apsTree tip labels ...
oSp <- gsub("^", "MBP1_", oSp)
# ... except for the E. coli outgroup, whose tip is labelled "KILA_ESCCO" in
# apsTree. The species code is "ESCCO" (not "ESSCO"); with the misspelled
# pattern nothing was replaced and the constraint contained a label that does
# not exist in apsTree.
( oSp <- gsub("MBP1_ESCCO", "KILA_ESCCO", oSp) )
# Then we can plot the two trees to compare: the fungi- tree
par(PAR) # reset graphics state
layout(matrix(1:2, 1, 2))
plot(fungiTree,
no.margin = TRUE,
root.edge = TRUE)
ape::nodelabels(text = fungiTree$node.label,
cex = 0.5,
adj = 0.2,
bg = "#D4F2DA")
# and the re-organized apsesTree ...
plot(ape::rotateConstr(apsTree, constraint = oSp[]),
no.margin = TRUE,
root.edge = TRUE)
par(PAR) # reset graphics state
# As you can see, the reordering is not perfect, since the topologies are
# different, mostly due to the unresolved nodes in the reference tree. One
# could play with that ...
# Task: Study the two trees and consider their similarities and differences.
# What do you expect? What do you find? Note that this is not a "mixed"
# gene tree yet, since it contains only a single gene for the species
# we considered. All of the branch points in this tree are speciation
# events. Thus the gene tree should have the same topology as the
# species tree. Does it? Are the differences important? How many
# branches would you need to remove and reinsert elsewhere to get the
# same topology as the species tree?
# In order to quantify how different these two trees are, we need to compute
# tree distances.
# == 3.3 Computing tree distances ==========================================
# Many superb phylogeny tools are contributed by the phangorn package.
if (! requireNamespace("phangorn", quietly = TRUE)) {
install.packages("phangorn")
}
# Package information:
# library(help = phangorn) # basic information
# browseVignettes("phangorn") # available vignettes
# data(package = "phangorn") # available datasets
# To compare two trees, they must have the same tip labels. We delete "MBP1_" or
# "KILA_" from the existing tip labels in a copy of our APSES domain tree.
apsTree2 <- apsTree
apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label)
# phangorn provides several functions to compute tree-differences (and there
# is a _whole_ lot of theory on how to compare trees). treedist() returns the
# "symmetric difference"
phangorn::treedist(fungiTree, apsTree2, check.labels = TRUE)
# Numbers. What do they mean? How much more similar is our apsTree to the
# (presumably) ground truth of fungiTree than a random tree would be?
# The ape package provides the function rtree()
# to compute random trees.
ape::rtree(n = length(apsTree2$tip.label), # number of tips
rooted = TRUE, # we rooted the tree above,
# and fungiTree is rooted anyway
tip.label = apsTree2$tip.label, # use the apsTree2 labels
br = NULL) # don't generate branch lengths since
# fungiTree has none, so we can't
# compare them anyway.
# (Note the warning message about non-binary trees; we'll suppress that later
# by wrapping the function call in suppressMessages(); we don't want to
# print it 10,000 times :-)
# Let's compute some random trees this way, calculate the distances to
# fungiTree, and then compare the values we get for apsTree2. The random
# trees are provided by ape::rtree().
N <- 10000 # takes about 15 seconds, and we'll use the pBar function,
# defined in .utilities.R to keep track of where we are at:
myTreeDistances <- matrix(numeric(N * 2), ncol = 2)
colnames(myTreeDistances) <- c("symm", "path")
set.seed(112358)
for (i in 1:N) {
pBar(i, N)
xTree <- ape::rtree(n = length(apsTree2$tip.label),
rooted = TRUE,
tip.label = apsTree2$tip.label,
br = NULL)
myTreeDistances[i, ] <- suppressMessages(phangorn::treedist(fungiTree, xTree))
}
set.seed(NULL) # reset the random number generator
table(myTreeDistances[, "symm"])
( symmObs <- phangorn::treedist(fungiTree, apsTree2)[1] )
# Random events less-or-equal to observation, divided by total number of
# events gives us the empirical p-value.
cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n",
(sum(myTreeDistances[ , "symm"] <= symmObs) + 1) / (N + 1)))
par(PAR) # reset graphics state
hist(myTreeDistances[, "path"],
col = "aliceblue",
main = "Distances of random Trees to fungiTree")
(pathObs <- phangorn::treedist(fungiTree, apsTree2)[2])
abline(v = pathObs, col = "chartreuse")
# Random events less-or-equal to observation, divided by total number of
# events gives us the empirical p-value.
# Empirical p-value for the path difference: count the random trees whose
# path distance to fungiTree is less-or-equal to the observed path distance
# (pathObs - not symmObs, which is the symmetric-difference observation and
# lives on a different scale). The "+ 1" in numerator and denominator counts
# the observation itself, so the p-value can never be exactly zero.
cat(sprintf("\nEmpirical p-value for path diff. of observed tree is %1.4f\n",
            (sum(myTreeDistances[ , "path"] <= pathObs) + 1) / (N + 1)))
# Indeed, our apsTree is _very_ much more similar to the species tree than
# we would expect by random chance.
# What do we gain from that analysis? Analyzing the tree we get from a single
# gene of orthologous sequences is a positive control in our computational
# experiment. If these genes are indeed orthologues, a correct tree-building
# program ought to give us a tree that exactly matches the species tree.
# Evaluating how far off we are from the known correct result gives us a way to
# validate our workflow and our algorithm. If we can't get that right, we can't
# expect to get "real" data right either. Employing such positive controls in
# every computational experiment is essential for research. Not doing so is
# Cargo Cult Bioinformatics.
# [END]

View File

@ -1,168 +1,168 @@
# tocID <- "BIN-PHYLO-Tree_building.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-PHYLO-Tree_building unit.
#
# Version: 1.2
#
# Date: 2017-10 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 deprecate save()/load() for saveRDS()/readRDS(); Mac:
# instructions to authorize proml.app
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# 1.0 First 2017 version
# 0.1 First code copied from 2016 material.
#
#
# TODO:
# Add MrBayes
# https://cran.r-project.org/web/packages/phangorn/vignettes/IntertwiningTreesAndNetworks.html
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -----------------------------------------------------------
#TOC> 1 Calculating Trees 48
#TOC> 1.1 PROMLPATH ... 68
#TOC> 1.1.1 ... on the Mac 73
#TOC> 1.1.2 ... on Windows 101
#TOC> 1.1.3 ... on Linux 115
#TOC> 1.1.4 Confirming PROMLPATH 120
#TOC> 1.2 Building a maximum likelihood tree 134
#TOC>
#TOC> ==========================================================================
# = 1 Calculating Trees ===================================================
# Follow the instructions found at phylip's home on the Web to install. If you
# are on a Windows computer, take note of the installation directory.
# After you have installed Phylip on your computer, install the R package that
# provides an interface to the Phylip functions.
if (! requireNamespace("Rphylip", quietly = TRUE)) {
install.packages("Rphylip")
}
# Package information:
# library(help = Rphylip) # basic information
# browseVignettes("Rphylip") # available vignettes
# data(package = "Rphylip") # available datasets
# This will install RPhylip, as well as its dependency, the package "ape".
# == 1.1 PROMLPATH ... =====================================================
# The next part may be tricky. You will need to figure out where
# on your computer Phylip has been installed and define the path
# to the proml program that calculates a maximum-likelihood tree.
# === 1.1.1 ... on the Mac
# On the Mac, the standard installation places a phylip folder
# in the /Applications directory. That folder contains all the
# individual phylip programs as <name>.app files. These are not
# the actual executables, but "app" files are actually directories
# that contain the required resources for a program to run.
# The executable is in a subdirectory and you can point Rphylip
# directly to that subdirectory to find the program it needs:
# PROMLPATH <- "/Applications/phylip-3.695/exe/proml.app/Contents/MacOS"
# However, RPHYLIP will not be able to run PHYLIP applications immediately,
# because they have not been "signed" by the PHYLIP developers. The process
# will be terminated by your system, with a warning.
# - Navigate to the phylip folder in your ~/Applications directory
# - Descend into the "exe" folder and find proml.app
# - Ctrl-click proml.app and choose "Open". A dialogue will show that
# says: "macOS cannot verify the developer of “proml.app”.
# Are you sure you want to open it?"
# - Click open to continue. You may need to allow access to the terminal
# as well. When the proml terminal session opens, you can type
# Ctrl-c to abort the program and close the window.
#
# This adds proml.app to the list of known-good programs and you will not
# need to repeat this process.
#
# === 1.1.2 ... on Windows
# On Windows you need to know where the programs have been installed, and you
# need to specify a path that is correct for the Windows OS. Find the folder
# that is named "exe", and right-click to inspect its properties. The path
# should be listed among them.
# If the path looks like "C:\Users\Meng\Programs\phylip-3.695\exe", then your
# assignment has to be
# PROMLPATH <- "C:/Users/Meng/Programs/phylip-3.695/exe"
# (Note: "/", not "\")
# I have heard that your path must not contain spaces, and it is prudent to
# avoid other special characters as well.
# === 1.1.3 ... on Linux
# If you are running Linux I trust you know what to do. It's probably
# something like
# PROMLPATH <- "/usr/local/phylip-3.695/bin"
# === 1.1.4 Confirming PROMLPATH
# Confirm that the settings are right.
PROMLPATH # returns the path
list.dirs(PROMLPATH) # returns the directories in that path
list.files(PROMLPATH) # lists the files [1] "proml" "proml.command"
# If "proml" is NOT among the files that the last command returns, you
# can't continue. Ask on the mailing list for advice.
# If everything is good, you can add the line that defines PROMLPATH to
# myScripts/.myProfile.R - the path will then be automatically set when
# you quit RStudio and return.
# == 1.2 Building a maximum likelihood tree ================================
# Now read the mfa file you have saved in the BIN-PHYLO-Data_preparation unit,
# as a "proseq" object with the read.protein() function of the RPhylip package:
apsIn <- Rphylip::read.protein("data/APSESphyloSet.mfa")
str(apsIn)
# ... and you are ready to build a tree.
# There are many fast options in PHYLIP - we will use the most _accurate_ one
# that it has: proml, a maximum-likelihood tree building program for protein
# data.
# Building maximum-likelihood trees can eat as much computer time
# as you can throw at it. Calculating a tree of 48 APSES domains
# with default parameters of Rproml() runs for more than half a day
# on my computer. But we have only twelve sequences here, so the
# process will take us about 5 to 15 minutes. Run this, and enjoy a good cup
# of coffee while you are waiting.
apsTree <- Rphylip::Rproml(apsIn, path=PROMLPATH)
# A quick first look:
plot(apsTree)
# save your tree:
saveRDS(apsTree, file = "data/APSEStreeRproml.rds")
# If this did not work, ask for advice.
# [END]
# tocID <- "BIN-PHYLO-Tree_building.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-PHYLO-Tree_building unit.
#
# Version: 1.2
#
# Date: 2017-10 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 deprecate save()/load() for saveRDS()/readRDS(); Mac:
# instructions to authorize proml.app
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# 1.0 First 2017 version
# 0.1 First code copied from 2016 material.
#
#
# TODO:
# Add MrBayes
# https://cran.r-project.org/web/packages/phangorn/vignettes/IntertwiningTreesAndNetworks.html
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -----------------------------------------------------------
#TOC> 1 Calculating Trees 48
#TOC> 1.1 PROMLPATH ... 68
#TOC> 1.1.1 ... on the Mac 73
#TOC> 1.1.2 ... on Windows 101
#TOC> 1.1.3 ... on Linux 115
#TOC> 1.1.4 Confirming PROMLPATH 120
#TOC> 1.2 Building a maximum likelihood tree 134
#TOC>
#TOC> ==========================================================================
# = 1 Calculating Trees ===================================================
# Follow the instructions found at phylip's home on the Web to install. If you
# are on a Windows computer, take note of the installation directory.
# After you have installed Phylip on your computer, install the R package that
# provides an interface to the Phylip functions.
# Install Rphylip only if its namespace cannot be loaded yet.
if (!requireNamespace("Rphylip", quietly = TRUE)) {
  install.packages("Rphylip")
}
# Package information:
# library(help = Rphylip) # basic information
# browseVignettes("Rphylip") # available vignettes
# data(package = "Rphylip") # available datasets
# This will install RPhylip, as well as its dependency, the package "ape".
# == 1.1 PROMLPATH ... =====================================================
# The next part may be tricky. You will need to figure out where
# on your computer Phylip has been installed and define the path
# to the proml program that calculates a maximum-likelihood tree.
# === 1.1.1 ... on the Mac
# On the Mac, the standard installation places a phylip folder
# in the /Applications directory. That folder contains all the
# individual phylip programs as <name>.app files. These are not
# the actual executables, but "app" files are actually directories
# that contain the required resources for a program to run.
# The executable is in a subdirectory and you can point Rphylip
# directly to that subdirectory to find the program it needs:
# PROMLPATH <- "/Applications/phylip-3.695/exe/proml.app/Contents/MacOS"
# However, Rphylip will not be able to run PHYLIP applications immediately,
# because they have not been "signed" by the PHYLIP developers. The process
# will be terminated by your system, with a warning. To authorize proml.app:
# - Navigate to the phylip folder in your /Applications directory
# - Descend into the "exe" folder and find proml.app
# - Ctrl-click proml.app and choose "Open". A dialogue will show that
# says: "macOS cannot verify the developer of “proml.app”.
# Are you sure you want to open it?"
# - Click open to continue. You may need to allow access to the terminal
# as well. When the proml terminal session opens, you can type
# Ctrl-c to abort the program and close the window.
#
# This adds proml.app to the list of known-good programs and you will not
# need to repeat this process.
#
# === 1.1.2 ... on Windows
# On Windows you need to know where the programs have been installed, and you
# need to specify a path that is correct for the Windows OS. Find the folder
# that is named "exe", and right-click to inspect its properties. The path
# should be listed among them.
# If the path looks like "C:\Users\Meng\Programs\phylip-3.695\exe", then your
# assignment has to be
# PROMLPATH <- "C:/Users/Meng/Programs/phylip-3.695/exe"
# (Note: "/", not "\")
# I have heard that your path must not contain spaces, and it is prudent to
# avoid other special characters as well.
# === 1.1.3 ... on Linux
# If you are running Linux I trust you know what to do. It's probably
# something like
# PROMLPATH <- "/usr/local/phylip-3.695/bin"
# === 1.1.4 Confirming PROMLPATH
# Confirm that the settings are right.
# Run these lines one by one; each expression prints its value to the console.
PROMLPATH # returns the path
list.dirs(PROMLPATH) # returns the directories in that path
list.files(PROMLPATH) # lists the files [1] "proml" "proml.command"
# If "proml" is NOT among the files that the last command returns, you
# can't continue. Ask on the mailing list for advice.
# If everything is good, you can add the line that defines PROMLPATH to
# myScripts/.myProfile.R - the path will then be automatically set when
# you quit RStudio and return.
# == 1.2 Building a maximum likelihood tree ================================
# Now read the mfa file you have saved in the BIN-PHYLO-Data_preparation unit,
# as a "proseq" object with the read.protein() function of the Rphylip package:
apsIn <- Rphylip::read.protein("data/APSESphyloSet.mfa")
str(apsIn)
# ... and you are ready to build a tree.
# There are many fast options in PHYLIP - we will use the most _accurate_ one
# that it has: proml, a maximum-likelihood tree building program for protein
# data.
# Building maximum-likelihood trees can eat as much computer time
# as you can throw at it. Calculating a tree of 48 APSES domains
# with default parameters of Rproml() runs for more than half a day
# on my computer. But we have only twelve sequences here, so the
# process will take us about 5 to 15 minutes. Run this, and enjoy a good cup
# of coffee while you are waiting.
# Build the maximum-likelihood tree: Rproml() receives the sequences in apsIn
# and the directory that contains the proml executable (PROMLPATH).
apsTree <- Rphylip::Rproml(apsIn, path=PROMLPATH)
# A quick first look:
plot(apsTree)
# Save your tree as an RDS file so it does not need to be computed again:
saveRDS(apsTree, file = "data/APSEStreeRproml.rds")
# If this did not work, ask for advice.
# [END]

View File

@ -1,323 +1,323 @@
# tocID <- "BIN-PPI-Analysis.R"
#
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-PPI-Analysis unit.
#
# Version: 1.4
#
# Date: 2017-08 - 2020-10
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.4 Update vector ID's for betweenness centrality.
# 1.3 Bugfix: called the wrong function on ENSPsel in l. 220
# 1.2 2020 Updates; Rewrite for new STRING V11;
# Deprecate save()/load() for saveRDS()/readRDS()
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use BiocManager:: not biocLite()
# 1.0 First live version
# 0.1 First code copied from 2016 material.
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------------------------
#TOC> 1 Setup and data 50
#TOC> 2 Functional Edges in the Human Proteome 86
#TOC> 2.1 Cliques 129
#TOC> 2.2 Communities 170
#TOC> 2.3 Betweenness Centrality 184
#TOC> 3 biomaRt 231
#TOC> 4 Task for submission 302
#TOC>
#TOC> ==========================================================================
# = 1 Setup and data ======================================================
# Not surprisingly, the analysis of PPI networks needs igraph; install the
# package if its namespace cannot be loaded yet:
if (!requireNamespace("igraph", quietly = TRUE)) {
  install.packages("igraph")
}
# Package information:
# library(help = igraph) # basic information
# browseVignettes("igraph") # available vignettes
# data(package = "igraph") # available datasets
# In order for you to explore some real, biological networks, I give you a
# dataframe of functional relationships of human proteins that I have downloaded
# from the STRING database. The full table has 8.5 million records, here is a
# subset of records with combined confidence scores > 980
# The selected set of edges with a confidence of > 964 is a dataframe with about
# 50,000 edges and 8,400 unique proteins. Incidentally, that's about the size of
# a fungal proteome. You can load the saved dataframe here (To read more about
# what the scores mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).
STRINGedges <- readRDS("./data/STRINGedges.rds")
head(STRINGedges)
# Note that STRING has prefixed the tax-ID for Homo sapiens - 9606 - to the
# Ensembl protein identifiers that start with ENSP. We'll remove the prefix
# from both ID columns (the pattern is anchored to the start of the string, so
# it can match at most once per ID):
STRINGedges[["a"]] <- sub("^9606\\.", "", STRINGedges[["a"]])
STRINGedges[["b"]] <- sub("^9606\\.", "", STRINGedges[["b"]])
head(STRINGedges)
# = 2 Functional Edges in the Human Proteome ==============================
# There are many possibilities to explore interesting aspects of biological
# networks, we will keep with some very simple procedures here but you have
# to be aware that this is barely scratching the surface of possibilities.
# However, once the network exists in your computer, it is comparatively
# easy to find information online about the many, many options to analyze.
# Make an undirected graph from this dataframe (the help page explains how the
# columns of the dataframe are interpreted):
help("graph_from_data_frame", package = "igraph")
gSTR <- igraph::graph_from_data_frame(d = STRINGedges, directed = FALSE)
# CAUTION you DON'T want to plot a graph with 8,000 nodes and 50,000 edges -
# layout of such large graphs is possible, but requires specialized code. Google
# for <layout large graphs> if you are curious. Also, consider what one can
# really learn from plotting such a graph ...
# Of course simple computations on this graph are reasonably fast:
compSTR <- igraph::components(gSTR)
summary(compSTR) # our graph is fully connected!
# Histogram of the log-transformed node degrees:
hist(log(igraph::degree(gSTR)), col="#FEE0AF")
# this actually does look rather scale-free
# Tabulate the degrees: the names of freqRank are the observed degree values,
# its elements count how many nodes have that degree.
(freqRank <- table(igraph::degree(gSTR)))
# log-log plot: x is log10(degree + 1), y is log10(number of nodes)
plot(log10(as.numeric(names(freqRank)) + 1),
log10(as.numeric(freqRank)), type = "b",
pch = 21, bg = "#FEE0AF",
xlab = "log(Rank)", ylab = "log(frequency)",
main = "8,400 nodes from the human functional interaction network")
# This looks very scale-free indeed.
# Fit a linear regression to the same log-log data and add the line to the plot:
(regressionLine <- lm(log10(as.numeric(freqRank)) ~
log10(as.numeric(names(freqRank)) + 1)))
abline(regressionLine, col = "firebrick")
# Now explore some more:
# == 2.1 Cliques ===========================================================
# Let's find the largest cliques. Remember: a clique is a fully connected
# subgraph, i.e. a subgraph in which every node is connected to every other.
# Biological complexes often appear as cliques in interaction graphs.
# Find the largest clique, extract it as a subgraph and plot it in a circle
# layout with every edge colored like the node it originates from.
igraph::clique_num(gSTR)
# The largest clique has 81 members.
(C <- igraph::largest_cliques(gSTR)[[1]])
# Pick one of the proteins and find out what this fully connected cluster of 81
# proteins is (you can simply Google for any of the IDs). Is this expected?
# Plot this ...
R <- igraph::induced_subgraph(gSTR, C) # a graph from a selected set of vertices
# color the vertices along a color spectrum
vCol <- rainbow(igraph::gorder(R)) # "order" of a graph == number of nodes
# color the edges to have the same color as the originating node:
# igraph::ends() returns one row per edge with the indices of its two end
# nodes; the first column indexes the node at which the edge starts. Indexing
# vCol with it gives exactly one color per edge, in edge order. (Repeating each
# vertex color gorder(R) times does not match the number or order of edges.)
eCol <- vCol[igraph::ends(R, igraph::E(R), names = FALSE)[ , 1]]
oPar <- par(mar = rep(0, 4)) # Turn margins off
plot(R,
     layout = igraph::layout_in_circle(R),
     vertex.size = 3,
     vertex.color = vCol,
     edge.color = eCol,
     edge.width = 0.1,
     vertex.label = NA)
par(oPar) # restore the previous margins
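# ... well: remember: a clique means every node is connected to every other
# node. In a simple undirected graph, 81 nodes form 81 * 80 / 2 = 3,240 edges
# (81 * 81 = 6,561 is the size of the corresponding adjacency matrix). This is
# what a matrix model of PPI networks looks like for large complexes.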
# == 2.2 Communities =======================================================
# The seed is set immediately before the clustering call so that repeated runs
# give the same result; do not insert code that uses the RNG in between.
set.seed(112358) # set RNG seed for repeatable randomness
gSTRclusters <- igraph::cluster_infomap(gSTR)
set.seed(NULL) # reset the RNG
igraph::modularity(gSTRclusters) # ... measures how separated the different
# membership types are from each other
# tMem tabulates the community membership: one element per community, holding
# the number of nodes assigned to it.
tMem <- table(igraph::membership(gSTRclusters))
length(tMem) # About 700 communities identified
hist(tMem, breaks = 50, col = "skyblue") # most clusters are small ...
range(tMem) # ... but one has > 200 members
# == 2.3 Betweenness Centrality ============================================
# Let's find the nodes with the 10 - highest betweenness centralities.
#
# Compute the betweenness centrality of all nodes and identify the ten nodes
# with the highest values.
BC <- igraph::centr_betw(gSTR)
# remember: BC$res contains the results
head(BC$res)
BC$res[1] # betweenness centrality of node 1 in the graph ...
# ... which one is node 1?
igraph::V(gSTR)[1]
# to get the ten-highest nodes, we simply label the elements of BC$res with
# their index (seq_along() is also correct for an empty vector, 1:length()
# is not) ...
names(BC$res) <- as.character(seq_along(BC$res))
# ... and then we sort:
sBC <- sort(BC$res, decreasing = TRUE)
head(sBC)
# This ordered vector means: node 3 has the highest betweenness centrality,
# node 721 has the second highest, etc.
# head() returns at most ten labels and never pads the result with NA:
(BCsel <- as.numeric(head(names(sBC), 10)))
# We can use these labels to subset the nodes in gSTR and fetch the
# IDs...
(ENSPsel <- names(igraph::V(gSTR)[BCsel]))
# Task:
# =====
# IMPORTANT, IF YOU INTEND TO SUBMIT YOUR ANALYSIS FOR CREDIT
# We are going to use these IDs to produce some output for a submitted task:
# therefore I need you to execute the following line, note the "seal" that this
# returns, and not change myENSPsel later:
# NOTE(review): selectENSP() is not defined in this script - presumably it is
# loaded with the course utilities; confirm that it exists before running.
myENSPsel <- selectENSP(ENSPsel)
# Next, to find what these proteins are...
# We could now Google for all of these IDs to learn more about them. But really,
# googling for IDs one after the other, that would be lame. Let's instead use
# the very, very useful biomaRt package to translate these Ensembl IDs into
# gene symbols.
# = 3 biomaRt =============================================================
# IDs are just labels, but for _bio_informatics we need to learn more about the
# biological function of the genes or proteins that we retrieve via graph data
# mining. biomaRt is the tool of choice. It's a package distributed by the
# bioconductor project. This here is not a biomaRt tutorial (that's for another
# day), simply a few lines of sample code to get you started on the specific use
# case of retrieving descriptions for ensembl protein IDs.
# biomaRt is distributed via Bioconductor: make sure that the BiocManager
# installer is available first, then use it to install biomaRt if needed.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
if (!requireNamespace("biomaRt", quietly = TRUE)) {
  BiocManager::install("biomaRt")
}
# Package information:
# library(help = biomaRt) # basic information
# browseVignettes("biomaRt") # available vignettes
# data(package = "biomaRt") # available datasets
# Define which mart and dataset to use ... this takes a while for download
myMart <- biomaRt::useMart("ensembl", dataset = "hsapiens_gene_ensembl")
# Which filters are defined (i.e. what can we search by)?
(filters <- biomaRt::listFilters(myMart))
# ... and which attributes can we retrieve?
(attributes <- biomaRt::listAttributes(myMart))
# Soooo many options - let's look for the correct name of filters that are
# useful for ENSP IDs ...
filters[grepl("ENSP", filters$description), ]
# ... and the correct attribute names for gene symbols and descriptions ...
attributes[grepl("symbol", attributes$description, ignore.case = TRUE), ]
attributes[grepl("description", attributes$description, ignore.case = TRUE), ]
# ... so we can put this together: here is a syntax example that retrieves
# four attributes for a single Ensembl protein ID:
biomaRt::getBM(mart = myMart,
               filters = "ensembl_peptide_id",
               values = "ENSP00000000442",
               attributes = c("hgnc_symbol",
                              "wikigene_description",
                              "interpro_description",
                              "phenotype_description"))
# A simple loop will now get us the information for our 10 most central genes
# from the human subset of STRING.
# Since we don't know how many rows one of our queries will return, we collect
# the result dataframes in a list, with one element per queried ID, named by
# that ID.
CPdefs <- list()
for (ID in myENSPsel) {
  CPdefs[[ID]] <- biomaRt::getBM(mart = myMart,
                                 filters = "ensembl_peptide_id",
                                 values = ID,
                                 attributes = c("hgnc_symbol",
                                                "wikigene_description",
                                                "interpro_description",
                                                "phenotype_description"))
}
# So what are the proteins with the ten highest betweenness centralities?
# ... are you surprised? (I am! Really.)
# = 4 Task for submission =================================================
# Write a loop that will go through your personalized list of Ensembl IDs and
# for each ID:
# -- print the ID,
# -- print the first row's HGNC symbol,
# -- print the first row's wikigene description.
# -- print the first row's phenotype.
#
# Write your thoughts about this group of genes.
#
# (Hint, you can structure your loop in the same way as the loop that
# created CPdefs. )
# Submit the "seal" for your ENSP vector, the ENSP vector itself, the R code
# for this loop and its output into your report if you are submitting
# anything for credit for this unit. Please read the requirements carefully.
# [END]
# tocID <- "BIN-PPI-Analysis.R"
#
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-PPI-Analysis unit.
#
# Version: 1.4
#
# Date: 2017-08 - 2020-10
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.4 Update vector ID's for betweenness centrality.
# 1.3 Bugfix: called the wrong function on ENSPsel in l. 220
# 1.2 2020 Updates; Rewrite for new STRING V11;
# Deprecate save()/load() for saveRDS()/readRDS()
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use BiocManager:: not biocLite()
# 1.0 First live version
# 0.1 First code copied from 2016 material.
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------------------------
#TOC> 1 Setup and data 50
#TOC> 2 Functional Edges in the Human Proteome 86
#TOC> 2.1 Cliques 129
#TOC> 2.2 Communities 170
#TOC> 2.3 Betweenness Centrality 184
#TOC> 3 biomaRt 231
#TOC> 4 Task for submission 302
#TOC>
#TOC> ==========================================================================
# = 1 Setup and data ======================================================
# Not surprisingly, the analysis of PPI networks needs igraph; install the
# package if its namespace cannot be loaded yet:
if (!requireNamespace("igraph", quietly = TRUE)) {
  install.packages("igraph")
}
# Package information:
# library(help = igraph) # basic information
# browseVignettes("igraph") # available vignettes
# data(package = "igraph") # available datasets
# In order for you to explore some real, biological networks, I give you a
# dataframe of functional relationships of human proteins that I have downloaded
# from the STRING database. The full table has 8.5 million records, here is a
# subset of records with combined confidence scores > 980
# The selected set of edges with a confidence of > 964 is a dataframe with about
# 50,000 edges and 8,400 unique proteins. Incidentally, that's about the size of
# a fungal proteome. You can load the saved dataframe here (To read more about
# what the scores mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).
STRINGedges <- readRDS("./data/STRINGedges.rds")
head(STRINGedges)
# Note that STRING has prefixed the tax-ID for Homo sapiens - 9606 - to the
# Ensembl protein identifiers that start with ENSP. We'll remove the prefix
# from both ID columns (the pattern is anchored to the start of the string, so
# it can match at most once per ID):
STRINGedges[["a"]] <- sub("^9606\\.", "", STRINGedges[["a"]])
STRINGedges[["b"]] <- sub("^9606\\.", "", STRINGedges[["b"]])
head(STRINGedges)
# = 2 Functional Edges in the Human Proteome ==============================
# There are many possibilities to explore interesting aspects of biological
# networks, we will keep with some very simple procedures here but you have
# to be aware that this is barely scratching the surface of possibilities.
# However, once the network exists in your computer, it is comparatively
# easy to find information online about the many, many options to analyze.
# Make an undirected graph from this dataframe (the help page explains how the
# columns of the dataframe are interpreted):
help("graph_from_data_frame", package = "igraph")
gSTR <- igraph::graph_from_data_frame(d = STRINGedges, directed = FALSE)
# CAUTION you DON'T want to plot a graph with 8,000 nodes and 50,000 edges -
# layout of such large graphs is possible, but requires specialized code. Google
# for <layout large graphs> if you are curious. Also, consider what one can
# really learn from plotting such a graph ...
# Of course simple computations on this graph are reasonably fast:
compSTR <- igraph::components(gSTR)
summary(compSTR) # our graph is fully connected!
# Histogram of the log-transformed node degrees:
hist(log(igraph::degree(gSTR)), col="#FEE0AF")
# this actually does look rather scale-free
# Tabulate the degrees: the names of freqRank are the observed degree values,
# its elements count how many nodes have that degree.
(freqRank <- table(igraph::degree(gSTR)))
# log-log plot: x is log10(degree + 1), y is log10(number of nodes)
plot(log10(as.numeric(names(freqRank)) + 1),
log10(as.numeric(freqRank)), type = "b",
pch = 21, bg = "#FEE0AF",
xlab = "log(Rank)", ylab = "log(frequency)",
main = "8,400 nodes from the human functional interaction network")
# This looks very scale-free indeed.
# Fit a linear regression to the same log-log data and add the line to the plot:
(regressionLine <- lm(log10(as.numeric(freqRank)) ~
log10(as.numeric(names(freqRank)) + 1)))
abline(regressionLine, col = "firebrick")
# Now explore some more:
# == 2.1 Cliques ===========================================================
# Let's find the largest cliques. Remember: a clique is a fully connected
# subgraph, i.e. a subgraph in which every node is connected to every other.
# Biological complexes often appear as cliques in interaction graphs.
# Find the largest clique, extract it as a subgraph and plot it in a circle
# layout with every edge colored like the node it originates from.
igraph::clique_num(gSTR)
# The largest clique has 81 members.
(C <- igraph::largest_cliques(gSTR)[[1]])
# Pick one of the proteins and find out what this fully connected cluster of 81
# proteins is (you can simply Google for any of the IDs). Is this expected?
# Plot this ...
R <- igraph::induced_subgraph(gSTR, C) # a graph from a selected set of vertices
# color the vertices along a color spectrum
vCol <- rainbow(igraph::gorder(R)) # "order" of a graph == number of nodes
# color the edges to have the same color as the originating node:
# igraph::ends() returns one row per edge with the indices of its two end
# nodes; the first column indexes the node at which the edge starts. Indexing
# vCol with it gives exactly one color per edge, in edge order. (Repeating each
# vertex color gorder(R) times does not match the number or order of edges.)
eCol <- vCol[igraph::ends(R, igraph::E(R), names = FALSE)[ , 1]]
oPar <- par(mar = rep(0, 4)) # Turn margins off
plot(R,
     layout = igraph::layout_in_circle(R),
     vertex.size = 3,
     vertex.color = vCol,
     edge.color = eCol,
     edge.width = 0.1,
     vertex.label = NA)
par(oPar) # restore the previous margins
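# ... well: remember: a clique means every node is connected to every other
# node. In a simple undirected graph, 81 nodes form 81 * 80 / 2 = 3,240 edges
# (81 * 81 = 6,561 is the size of the corresponding adjacency matrix). This is
# what a matrix model of PPI networks looks like for large complexes.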
# == 2.2 Communities =======================================================
# The seed is set immediately before the clustering call so that repeated runs
# give the same result; do not insert code that uses the RNG in between.
set.seed(112358) # set RNG seed for repeatable randomness
gSTRclusters <- igraph::cluster_infomap(gSTR)
set.seed(NULL) # reset the RNG
igraph::modularity(gSTRclusters) # ... measures how separated the different
# membership types are from each other
# tMem tabulates the community membership: one element per community, holding
# the number of nodes assigned to it.
tMem <- table(igraph::membership(gSTRclusters))
length(tMem) # About 700 communities identified
hist(tMem, breaks = 50, col = "skyblue") # most clusters are small ...
range(tMem) # ... but one has > 200 members
# == 2.3 Betweenness Centrality ============================================
# Let's find the nodes with the 10 - highest betweenness centralities.
#
# Compute the betweenness centrality of all nodes and identify the ten nodes
# with the highest values.
BC <- igraph::centr_betw(gSTR)
# remember: BC$res contains the results
head(BC$res)
BC$res[1] # betweenness centrality of node 1 in the graph ...
# ... which one is node 1?
igraph::V(gSTR)[1]
# to get the ten-highest nodes, we simply label the elements of BC$res with
# their index (seq_along() is also correct for an empty vector, 1:length()
# is not) ...
names(BC$res) <- as.character(seq_along(BC$res))
# ... and then we sort:
sBC <- sort(BC$res, decreasing = TRUE)
head(sBC)
# This ordered vector means: node 3 has the highest betweenness centrality,
# node 721 has the second highest, etc.
# head() returns at most ten labels and never pads the result with NA:
(BCsel <- as.numeric(head(names(sBC), 10)))
# We can use these labels to subset the nodes in gSTR and fetch the
# IDs...
(ENSPsel <- names(igraph::V(gSTR)[BCsel]))
# Task:
# =====
# IMPORTANT, IF YOU INTEND TO SUBMIT YOUR ANALYSIS FOR CREDIT
# We are going to use these IDs to produce some output for a submitted task:
# therefore I need you to execute the following line, note the "seal" that this
# returns, and not change myENSPsel later:
# NOTE(review): selectENSP() is not defined in this script - presumably it is
# loaded with the course utilities; confirm that it exists before running.
myENSPsel <- selectENSP(ENSPsel)
# Next, to find what these proteins are...
# We could now Google for all of these IDs to learn more about them. But really,
# googling for IDs one after the other, that would be lame. Let's instead use
# the very, very useful biomaRt package to translate these Ensembl IDs into
# gene symbols.
# = 3 biomaRt =============================================================
# IDs are just labels, but for _bio_informatics we need to learn more about the
# biological function of the genes or proteins that we retrieve via graph data
# mining. biomaRt is the tool of choice. It's a package distributed by the
# bioconductor project. This here is not a biomaRt tutorial (that's for another
# day), simply a few lines of sample code to get you started on the specific use
# case of retrieving descriptions for ensembl protein IDs.
# biomaRt is distributed via Bioconductor: make sure that the BiocManager
# installer is available first, then use it to install biomaRt if needed.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
if (!requireNamespace("biomaRt", quietly = TRUE)) {
  BiocManager::install("biomaRt")
}
# Package information:
# library(help = biomaRt) # basic information
# browseVignettes("biomaRt") # available vignettes
# data(package = "biomaRt") # available datasets
# Define which mart and dataset to use ... this takes a while for download
myMart <- biomaRt::useMart("ensembl", dataset = "hsapiens_gene_ensembl")
# Which filters are defined (i.e. what can we search by)?
(filters <- biomaRt::listFilters(myMart))
# ... and which attributes can we retrieve?
(attributes <- biomaRt::listAttributes(myMart))
# Soooo many options - let's look for the correct name of filters that are
# useful for ENSP IDs ...
filters[grepl("ENSP", filters$description), ]
# ... and the correct attribute names for gene symbols and descriptions ...
attributes[grepl("symbol", attributes$description, ignore.case = TRUE), ]
attributes[grepl("description", attributes$description, ignore.case = TRUE), ]
# ... so we can put this together: here is a syntax example that retrieves
# four attributes for a single Ensembl protein ID:
biomaRt::getBM(mart = myMart,
               filters = "ensembl_peptide_id",
               values = "ENSP00000000442",
               attributes = c("hgnc_symbol",
                              "wikigene_description",
                              "interpro_description",
                              "phenotype_description"))
# A simple loop will now get us the information for our 10 most central genes
# from the human subset of STRING.
# Since we don't know how many rows one of our queries will return, we collect
# the result dataframes in a list, with one element per queried ID, named by
# that ID.
CPdefs <- list()
for (ID in myENSPsel) {
  CPdefs[[ID]] <- biomaRt::getBM(mart = myMart,
                                 filters = "ensembl_peptide_id",
                                 values = ID,
                                 attributes = c("hgnc_symbol",
                                                "wikigene_description",
                                                "interpro_description",
                                                "phenotype_description"))
}
# So what are the proteins with the ten highest betweenness centralities?
# ... are you surprised? (I am! Really.)
# = 4 Task for submission =================================================
# Write a loop that will go through your personalized list of Ensembl IDs and
# for each ID:
# -- print the ID,
# -- print the first row's HGNC symbol,
# -- print the first row's wikigene description.
# -- print the first row's phenotype.
#
# Write your thoughts about this group of genes.
#
# (Hint, you can structure your loop in the same way as the loop that
# created CPdefs. )
# Submit the "seal" for your ENSP vector, the ENSP vector itself, the R code
# for this loop and its output into your report if you are submitting
# anything for credit for this unit. Please read the requirements carefully.
# [END]

View File

@ -1,252 +1,252 @@
# tocID <- "BIN-SEQA-Composition.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-SEQA-Composition unit
#
# Version: 1.2
#
# Date: 2017-11 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 2020 Maintenance
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use BiocManager:: not biocLite()
# 1.0 First live version 2017
# 0.1 First code copied from BCH441_A03_makeYFOlist.R
#
# TODO:
#
#
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
#
# DO NOT SIMPLY source() THESE FILES!
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ----------------------------------------------------------
#TOC> 1 Preparation 48
#TOC> 2 Aggregate properties 69
#TOC> 3 Sequence Composition Enrichment 113
#TOC> 3.1 Barplot, and side-by-side barplot 136
#TOC> 3.2 Plotting ratios 171
#TOC> 3.3 Plotting log ratios 188
#TOC> 3.4 Sort by frequency 204
#TOC> 3.5 Color by amino acid type 221
#TOC>
#TOC> ==========================================================================
# = 1 Preparation =========================================================
# Install seqinr only if its namespace cannot be loaded yet.
if (!requireNamespace("seqinr", quietly = TRUE)) {
  install.packages("seqinr")
}
# Package information:
# library(help = seqinr) # basic information
# browseVignettes("seqinr") # available vignettes
# data(package = "seqinr") # available datasets
# Load a reference sequence to work with:
# If you have done the BIN-Storing_data unit:
source("makeProteinDB.R")
sel <- which(myDB$protein$name == sprintf("MBP1_%s", biCode(MYSPE)))
mySeq <- myDB$protein$sequence[sel]
# If not, use the yeast Mbp1 sequence. fromJSON() is called with its package
# prefix, like all other package functions in this script, so that it is found
# even if the package is not attached.
# NOTE(review): assumes that jsonlite is the JSON package used by the course
# (its fromJSON() accepts a file path as first argument) - confirm.
mySeq <- dbSanitizeSequence(
  jsonlite::fromJSON("./data/MBP1_SACCE.json")$sequence
)
# = 2 Aggregate properties ================================================
# Let's try a simple function from seqinr: computing the pI of the sequence
?seqinr::computePI
# This takes as input a vector of upper-case AA codes
# We can use the function strsplit() to split the string
# into single characters
(s <- strsplit(mySeq, "")) # splitting on the empty string
# splits into single characters
s <- unlist(s) # strsplit() returns a list! Why?
# (But we don't need a list now...)
# Alternatively, seqinr provides
# the function s2c() to convert strings into
# character vectors (and c2s to convert them back).
seqinr::s2c(mySeq)
seqinr::computePI(seqinr::s2c(mySeq)) # isoelectric point
seqinr::pmw(seqinr::s2c(mySeq)) # molecular weight
seqinr::AAstat(seqinr::s2c(mySeq)) # This also plots the distribution of
# values along the sequence
# A true Labor of Love has gone into the
# compilation of the "aaindex" data:
?seqinr::aaindex
data(aaindex, package = "seqinr") # "attach" the dataset - i.e. make it
                                  # accessible as an R object
length(aaindex) # no seqinr:: needed for the dataset since we just
                # "attached" it with data()
# Here are all the index descriptions, one line per index, prefixed with the
# position of the index in the list. seq_along() iterates zero times over an
# empty list, where 1:length() would iterate over c(1, 0).
for (i in seq_along(aaindex)) {
  cat(paste0(i, ": ", aaindex[[i]]$D, "\n"))
}
# = 3 Sequence Composition Enrichment =====================================
# Let's use one of the indices to calculate and plot amino-acid
# composition enrichment:
aaindex[[459]][["D"]]
#
# Let's construct an enrichment plot to compare average frequencies
# with the amino acid frequencies in our sequence.
(refData <- aaindex[[459]][["I"]]) # reference frequencies in %
# change the names to single-letter code using seqinr's "a()" function
names(refData) <- seqinr::a(names(refData))
sum(refData)
refData # ... in %
# tabulate the amino acids in mySeq ...
(obsData <- table(seqinr::s2c(mySeq))) # counts
# ... and convert the counts to percent of the total
(obsData <- 100 * prop.table(obsData)) # frequencies
# == 3.1 Barplot, and side-by-side barplot =================================
# Plot observed and reference frequencies separately; the horizontal line
# marks the frequency expected if all 20 amino acids were equally common.
barplot(height = obsData, cex.names = 0.7, col = "#CCCCCC")
abline(h = 100 / 20, col = "#BB0000")
barplot(height = refData, cex.names = 0.7, col = "#BB0000")
abline(h = 100 / 20, col = "#555555")
# Ok: first problem - the values in obsData are in alphabetical order of the
# one-letter code. But the values in refData are in alphabetical order of
# amino acid name: alanine, arginine, asparagine, aspartic acid ... A, R, N,
# D, E ... you will see this order a lot - one of the old biochemistry tropes
# in the field. So we need to re-order one of the vectors to match the other.
# That's easy though:
refData
(refData <- refData[names(obsData)])
barplot(height = refData, cex.names = 0.7, col = "#BB0000")
abline(h = 100 / 20, col = "#555555")
# To compare the values, we want to see them in a barplot, side-by-side ...
barplot(height = rbind(obsData, refData),
        beside = TRUE,
        ylim = c(0, 12),
        cex.names = 0.7,
        col = c("#CCCCCC", "#BB0000"))
abline(h = 100 / 20, col = "#00000044")
# ... and add a legend
legend(x = 1, y = 12,
       legend = c("mySeq", "Average composition"),
       fill = c("#CCCCCC", "#BB0000"),
       bty = "n",
       cex = 0.7)
# == 3.2 Plotting ratios ===================================================
# To better compare the values, we plot the ratio of obsData to refData. The
# solid line marks a ratio of 1, the dashed lines mark ratios of 1/2 and 2.
barplot(height = obsData / refData,
        ylim = c(0, 2.5),
        ylab = "Sequence / Average",
        cex.names = 0.7,
        col = "#CCCCCC")
abline(h = 1, col = "#BB0000")
abline(h = c(1 / 2, 2), col = "#BB000055", lty = 2)
# ... but ratios are not very good here, since the difference in height on the
# plot now depends on the order we compare in: ratios of 1/2 and 2 (dotted
# lines) are exactly the same fold-difference !
# == 3.3 Plotting log ratios ===============================================
# A better way to display this is to plot log(ratios): the solid line marks
# log(1), the dashed lines mark log(1/2) and log(2).
barplot(height = log(obsData / refData),
        ylim = log(c(1 / 3, 3)),
        ylab = "log(Sequence / Average)",
        cex.names = 0.7,
        col = "#CCCCCC")
abline(h = log(1), col = "#BB0000")
abline(h = log(c(1 / 2, 2)), col = "#BB000055", lty = 2)
# Note how the two-fold difference lines are now the same distance from the
# line of equal ratio.
# == 3.4 Sort by frequency =================================================
# Same plot, with the bars sorted from the largest to the smallest log ratio.
barplot(height = sort(log(obsData / refData), decreasing = TRUE),
        ylim = log(c(1 / 3, 3)),
        ylab = "log(Sequence / Average)",
        cex.names = 0.7,
        col = "#CCCCCC")
abline(h = log(1), col = "#BB0000")
abline(h = log(c(1 / 2, 2)), col = "#BB000055", lty = 2)
# Annotate the two ends of the plot with a small arrow and a label each:
yTxt <- log(0.9)
arrows(x0 = 4, y0 = yTxt, x1 = 0, y1 = yTxt, length = 0.07)
text(x = 5.5, y = yTxt, labels = "Enriched", cex = 0.7)
yTxt <- log(1.1)
arrows(x0 = 20, y0 = yTxt, x1 = 24, y1 = yTxt, length = 0.07)
text(x = 19.5, y = yTxt, labels = "Depleted", pos = 2, cex = 0.7)
# == 3.5 Color by amino acid type ==========================================
# Color the bars by amino acid type. Use AACOLS , defined in the .utilities.R
# script, or define your own. First, show the color that AACOLS holds for
# each of its names:
barplot(height = rep(1, 20),
        names.arg = names(AACOLS),
        cex.names = 0.5,
        col = AACOLS)
# Sort the log ratios, then pick each bar's color by its amino acid name:
lR <- sort(log(obsData / refData), decreasing = TRUE)
barplot(height = lR,
        ylim = log(c(1 / 3, 3)),
        ylab = "log(Sequence / Average)",
        cex.names = 0.7,
        col = AACOLS[names(lR)])
abline(h = log(1), col = "#00000055")
abline(h = log(c(1 / 2, 2)), col = "#00000033", lty = 2)
# Annotate the two ends of the plot with a small arrow and a label each:
yTxt <- log(0.9)
arrows(x0 = 4, y0 = yTxt, x1 = 0, y1 = yTxt, length = 0.07)
text(x = 5.5, y = yTxt, labels = "Enriched", cex = 0.7)
yTxt <- log(1.1)
arrows(x0 = 20, y0 = yTxt, x1 = 24, y1 = yTxt, length = 0.07)
text(x = 19.5, y = yTxt, labels = "Depleted", pos = 2, cex = 0.7)
# Task:
# Interpret this plot. (Can you?) Which types of amino acids are enriched?
# Depleted?
# [END]
# tocID <- "BIN-SEQA-Composition.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-SEQA-Composition unit
#
# Version: 1.2
#
# Date: 2017-11 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 2020 Maintenance
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use BiocManager:: not biocLite()
# 1.0 First live version 2017
# 0.1 First code copied from BCH441_A03_makeYFOlist.R
#
# TODO:
#
#
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
#
# DO NOT SIMPLY source() THESE FILES!
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ----------------------------------------------------------
#TOC> 1 Preparation 48
#TOC> 2 Aggregate properties 69
#TOC> 3 Sequence Composition Enrichment 113
#TOC> 3.1 Barplot, and side-by-side barplot 136
#TOC> 3.2 Plotting ratios 171
#TOC> 3.3 Plotting log ratios 188
#TOC> 3.4 Sort by frequency 204
#TOC> 3.5 Color by amino acid type 221
#TOC>
#TOC> ==========================================================================
# = 1 Preparation =========================================================
if (! requireNamespace("seqinr", quietly = TRUE)) {
install.packages("seqinr")
}
# Package information:
# library(help = seqinr) # basic information
# browseVignettes("seqinr") # available vignettes
# data(package = "seqinr") # available datasets
# Load a reference sequence to work with:
# If you have done the BIN-Storing_data unit:
source("makeProteinDB.R")
sel <- which(myDB$protein$name == sprintf("MBP1_%s", biCode(MYSPE)))
mySeq <- myDB$protein$sequence[sel]
# If not, use the yeast Mbp1 sequence instead. fromJSON() is called with an
# explicit package prefix (the <package>::<function>() idiom used throughout
# this file) so that the line does not depend on a package being attached.
# NOTE(review): assumes the JSON reader intended here is jsonlite - confirm.
mySeq <- dbSanitizeSequence(
  jsonlite::fromJSON("./data/MBP1_SACCE.json")$sequence)
# = 2 Aggregate properties ================================================
# Let's try a simple function from seqinr: computing the pI of the sequence
?seqinr::computePI
# This takes as input a vector of upper-case AA codes
# We can use the function strsplit() to split the string
# into single characters
(s <- strsplit(mySeq, "")) # splitting on the empty string
# splits into single characters
s <- unlist(s) # strsplit() returns a list! Why?
# (But we don't need a list now...)
# Alternatively, seqinr provides
# the function s2c() to convert strings into
# character vectors (and c2s to convert them back).
seqinr::s2c(mySeq)
seqinr::computePI(seqinr::s2c(mySeq)) # isoelectric point
seqinr::pmw(seqinr::s2c(mySeq)) # molecular weight
seqinr::AAstat(seqinr::s2c(mySeq)) # This also plots the distribution of
# values along the sequence
# A true Labor of Love has gone into the
# compilation of the "aaindex" data:
?seqinr::aaindex
data(aaindex, package = "seqinr") # "attach" the dataset - i.e. make it
# accessible as an R object
length(aaindex) # no seqinr:: needed for the dataset since we just
# "attached" it with data()
# Here are all the index descriptions: print one "<number>: <description>"
# line per entry of aaindex. seq_along() is used instead of 1:length() so the
# loop body is simply skipped if the list is empty.
for (i in seq_along(aaindex)) {
  cat(paste0(i, ": ", aaindex[[i]]$D, "\n"))
}
# = 3 Sequence Composition Enrichment =====================================
# Let's use one of the indices to calculate and plot amino-acid
# composition enrichment:
aaindex[[459]]$D
#
# Let's construct an enrichment plot to compare average frequencies
# with the amino acid counts in our sequence.
(refData <- aaindex[[459]]$I) # reference frequencies in %
names(refData) <- seqinr::a(names(refData)) # change names to single-letter
# code using seqinr's "a()" function
sum(refData)
refData # ... in %
# tabulate the amino acid counts in mySeq
(obsData <- table(seqinr::s2c(mySeq))) # counts
(obsData <- 100 * (obsData / sum(obsData))) # frequencies
# == 3.1 Barplot, and side-by-side barplot =================================
barplot(obsData, col = "#CCCCCC", cex.names = 0.7)
abline(h = 100/20, col="#BB0000")
barplot(refData, col = "#BB0000", cex.names = 0.7)
abline(h = 100/20, col="#555555")
# Ok: first problem - the values in obsData are in alphabetical order. But the
# values in refData are in alphabetical order of amino acid name: alanine,
# arginine, asparagine, aspartic acid ... A, R, N, D, E ... you will see this
# order a lot - one of the old biochemistry tropes in the field. So we need to
# re-order one of the vectors to match the other. That's easy though:
refData
(refData <- refData[names(obsData)])
barplot(refData, col = "#BB0000", cex.names = 0.7)
abline(h = 100/20, col="#555555")
# To compare the values, we want to see them in a barplot, side-by-side ...
barplot(rbind(obsData, refData),
ylim = c(0, 12),
beside = TRUE,
col = c("#CCCCCC", "#BB0000"),
cex.names = 0.7)
abline(h = 100/20, col="#00000044")
# ... and add a legend
legend (x = 1, y = 12,
legend = c("mySeq", "Average composition"),
fill = c("#CCCCCC", "#BB0000"),
cex = 0.7,
bty = "n")
# == 3.2 Plotting ratios ===================================================
# To better compare the values, we'll calculate ratios between
# obsData and refData
barplot(obsData / refData,
col = "#CCCCCC",
ylab = "Sequence / Average",
ylim = c(0, 2.5),
cex.names = 0.7)
abline(h = 1, col="#BB0000")
abline(h = c(1/2, 2), lty = 2, col="#BB000055")
# ... but ratios are not very good here, since the difference in height on the
# plot now depends on the order we compare in: ratios of 1/2 and 2 (dotted
# lines) are exactly the same fold-difference !
# == 3.3 Plotting log ratios ===============================================
# A better way to display this
# is to plot log(ratios).
barplot(log(obsData / refData),
col = "#CCCCCC",
ylab = "log(Sequence / Average)",
ylim = log(c(1/3, 3)),
cex.names = 0.7)
abline(h = log(1), col="#BB0000")
abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055")
# Note how the two-fold difference lines are now the same distance from the
# line of equal ratio.
# == 3.4 Sort by frequency =================================================
barplot(sort(log(obsData / refData), decreasing = TRUE),
ylim = log(c(1/3, 3)),
col = "#CCCCCC",
ylab = "log(Sequence / Average)",
cex.names = 0.7)
abline(h = log(1), col="#BB0000")
abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055")
yTxt <- log(0.9)
arrows(4, yTxt, 0, yTxt, length = 0.07)
text(5.5, yTxt, "Enriched", cex = 0.7)
yTxt <- log(1.1)
arrows(20, yTxt, 24, yTxt, length = 0.07)
text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7)
# == 3.5 Color by amino acid type ==========================================
# Color the bars by amino acid type. Use AACOLS , defined in the .utilities.R
# script, or define your own.
barplot(rep(1, 20), names.arg = names(AACOLS), col = AACOLS, cex.names = 0.5)
lR <- sort(log(obsData / refData), decreasing = TRUE)
barplot(lR,
ylim = log(c(1/3, 3)),
col = AACOLS[names(lR)],
ylab = "log(Sequence / Average)",
cex.names = 0.7)
abline(h = log(1), col="#00000055")
abline(h = log(c(1/2, 2)), lty = 2, col="#00000033")
yTxt <- log(0.9)
arrows(4, yTxt, 0, yTxt, length = 0.07)
text(5.5, yTxt, "Enriched", cex = 0.7)
yTxt <- log(1.1)
arrows(20, yTxt, 24, yTxt, length = 0.07)
text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7)
# Task:
# Interpret this plot. (Can you?) Which types of amino acids are enriched?
# Depleted?
# [END]

View File

@ -1,394 +1,394 @@
# tocID <- "BIN-Sequence.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-Sequence unit.
#
# Version: 1.5
#
# Date: 2017-09 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.5 2020 Updates
# 1.4 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use Biocmanager:: not biocLite()
# 1.3 Update set.seed() usage
# 1.2 Removed irrelevant task. How did that even get in there? smh
# 1.1 Add chartr()
# 1.0 First live version 2017.
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ----------------------------------------------------
#TOC> 1 Prepare 63
#TOC> 2 Storing Sequence 80
#TOC> 3 String properties 109
#TOC> 4 Substrings 116
#TOC> 5 Creating strings: sprintf() 137
#TOC> 6 Changing strings 172
#TOC> 6.1.1 Changing case 174
#TOC> 6.1.2 Reverse 179
#TOC> 6.1.3 Change characters 183
#TOC> 6.1.4 Substitute characters 211
#TOC> 6.2 stringi and stringr 231
#TOC> 6.3 dbSanitizeSequence() 241
#TOC> 7 Permuting and sampling 253
#TOC> 7.1 Permutations 260
#TOC> 7.2 Sampling 306
#TOC> 7.2.1 Equiprobable characters 308
#TOC> 7.2.2 Defined probability vector 350
#TOC>
#TOC> ==========================================================================
# = 1 Prepare =============================================================
# Much basic sequence handling is supported by the Bioconductor package
# Biostrings.
if (! requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
if (! requireNamespace("Biostrings", quietly = TRUE)) {
BiocManager::install("Biostrings")
}
# Package information:
# library(help = Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets
# = 2 Storing Sequence ====================================================
# Sequences can be represented and stored as vectors of single characters ...
(v <- c("D", "I", "V", "M", "T", "Q"))
# ... as strings ...
(s <- "DIVMTQ")
# ... or as more complex objects with rich metadata e.g. as a Biostrings
# DNAstring, RNAstring, AAString, etc.
(a <- Biostrings::AAString("DIVMTQ"))
# ... and all of these representations can be interconverted:
# string to vector ...
unlist(strsplit(s, ""))
# vector to string ...
paste(v, sep = "", collapse = "")
# ... and AAstring to plain string.
as.character(a)
# Since operations with character vectors trivially follow all other vector
# conventions and syntax, and we will look at Biostrings methods in more
# detail in a later unit, we will focus on basic strings in the following.
# = 3 String properties ===================================================
length(s) # why ???
nchar(s) # Aha!
# = 4 Substrings ==========================================================
# Use the substr() function
substr(s, 2, 4)
# or the similar substring()
substring(s, 2, 4)
# Note: both functions are vectorized (i.e. they operate on vectors
# of arguments, you don't need to loop over input)...
myBiCodes <- c("HOMSA", "MUSMU", "FUGRU", "XENLA")
substr( myBiCodes, 1, 3)
substring(myBiCodes, 1, 3)
# ... however only substring() will also use vectors for start and stop
s <- "gatattgtgatgacccagtaa" # a DNA sequence
(vI <- seq(1, nchar(s), by = 3)) # an index vector
substr( s, vI, vI+2) # ... returns only the first nucleotide triplet
substring(s, vI, vI+2) # ... returns all triplets
# = 5 Creating strings: sprintf() =========================================
# Sprintf is a very smart, very powerful function and has cognates in all
# other programming languages. It has a bit of a learning curve, but this is
# totally worth it:
# the function takes a format string, and a list of other arguments. It returns
# a formatted string. Here are some examples - watch carefully for sprintf()
# calls elsewhere in the code.
sprintf("Just a string.")
sprintf("A string and the number %d.", 5)
sprintf("More numbers: %d ate %d.", 7, 9) # Sorry
sprintf("Pi is ~ %1.2f ...", pi)
sprintf("or more accurately ~ %1.11f.", pi)
x <- "bottles of beer"
N <- 99
sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.",
N, x, N, x, "one down, and pass it around", N - 1, x)
# Note that in the last example, the value of the string was displayed with
# R's usual print-formatting function and therefore the line-break "\n" did
# not actually break the line. To have line breaks, tabs etc, you need to use
# cat() to display the string:
for (i in N:(N-4)) {
cat(sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.\n\n",
i, x, i, x, "one down, and pass it around", i - 1, x))
}
# sprintf() is vectorized: if one of its parameters is a vector, it
# will generate one output string for each of the vector's elements:
cat(sprintf("\n%s fish", c("one", "two", "red", "blue")))
# = 6 Changing strings ====================================================
# === 6.1.1 Changing case
tolower(s)
toupper(tolower(s))
# === 6.1.2 Reverse
# (This used to work in Biostrings, apparently it doesn't work anymore. Why?)
# Biostrings::str_rev(s)
# The following works, of course, but awkward:
s
paste0(rev(unlist(strsplit(s, ""))), collapse = "")
# reverse complement
COMP <- c("t", "g", "c", "a")
names(COMP) <- c("a", "c", "g", "t") # mapping the complement via names
s
paste0(COMP[rev(unlist(strsplit(s, "")))], collapse = "")
# === 6.1.3 Change characters
# chartr(old, new, x) maps all characters in x that appear in "old" to the
# corresponding character in "new." Kind of like the COMP vector above ...
chartr("aeio", "uuuu", "We hold these truths to be self-evident ...")
# One could implement toupper() and tolower() with this - remember that R has
# character vectors of uppercase and lowercase letters as language constants.
chartr(paste0(letters, collapse = ""),
paste0(LETTERS, collapse = ""),
"Twinkle, twinkle little star, how I wonder what you are.")
# chartr() also makes a neat toy substitution cypher: a shuffled copy of the
# alphabet is a one-to-one mapping, so swapping the "old" and "new" arguments
# reverses the encoding.
alBet <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .,;:?0123456789"
set.seed(112358)                       # fixed seed: reproducible shuffle
( myCypher <- paste(sample(strsplit(alBet, "")[[1]]), collapse = "") )
set.seed(NULL)                         # back to an unseeded RNG
# encode: every character of alBet is replaced by its partner in myCypher ...
(x <- chartr(old = alBet,
             new = myCypher,
             x   = "... seven for a secret, never to be told."))
# decode: the same call with the two alphabets swapped ...
chartr(old = myCypher, new = alBet, x = x)
# (Nb. substitution cyphers are easy to crack!)
# === 6.1.4 Substitute characters
# gsub can change lengths.
# Example: implementing the binary Fibonacci sequence:
# 0 -> 1; 1 -> 10 , in three nested gsub() statements
( s <- 1 )
( s <- gsub("2", "10", gsub("0", "1", gsub("1", "2", s))) )
# Iterate this line a few times ...
#
# cf. http://www.maths.surrey.ac.uk/hosted-sites/R.Knott/Fibonacci/fibrab.html
# for the features of the sequence.
# I use gsub() often to delete unwanted characters ...
# ... select something, and substitute the empty string for it.
(s <- gsub("-", "", s))
# For example: clean up a sequence
# copy/paste from UniProt
(s <- " 10 20 30 40 50
MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")
# remove numbers
(s <- gsub("[0-9]", "", s))
# remove "whitespace" (spaces, tabs, line breaks)...
(s <- gsub("\\s", "", s))
# == 6.2 stringi and stringr ===============================================
# But there are also specialized functions eg. to remove leading/trailing
# whitespace which may be important to sanitize user input etc. Have a look at
# the function descriptions for the stringr and the stringi package. stringr is
# part of the tidyverse, and for the most part a wrapper for stringi functions.
# https://github.com/tidyverse/stringr
# == 6.3 dbSanitizeSequence() ==============================================
# In our learning units, we use a function dbSanitizeSequence() to clean up
# sequences that may be copy/pasted from Web-sources
cat( s <- ">FASTA header will be removed
10 20 30 40 50
MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR " )
dbSanitizeSequence(s)
# = 7 Permuting and sampling ==============================================
# An important aspect of working with strings is generating random strings
# with given statistical properties: reference items to evaluate significance.
# == 7.1 Permutations ======================================================
# One way to produce such reference items is to permute a string. A permuted
# string has the same composition as the original, but all positional
# information is lost. The sample() function can be used to permute:
# This is the sequence of the ompA secretion signal
(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
(x <- sample(s, length(s))) # permuted
# Here's a small example how such permuted strings may be useful. As you look
# at the ompA sequence, you suspect that the two lysines near the +-charged
# N-terminus may not be accidental, but selected for a positively charged
# N-terminus. What is the chance that such a sequence has two lysines close to
# the N-terminus simply by chance? Or put differently: what is the average
# distance of two lysines in such a sequence to the N-terminus. First, we
# need an expression that measures the distance. A simple use of the which()
# function will do just fine.
which(s == "K") # shows they are in position 2 and 3, so ...
mean(which(s == "K")) # ... gives us the average, and ...
mean(which(x == "K")) # ... gives us the average of the permuted sequence.
# So what does the distribution look like? Let's do 10,000 trials: permute the
# sequence N times and record the mean position of the lysines each time.
(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
N <- 10000
d <- numeric(N)                 # preallocate the result vector
set.seed(112358)                # set RNG seed for repeatable randomness
for (i in seq_len(N)) {         # seq_len() is safe even if N were 0
  d[i] <- mean(which(sample(s, length(s)) == "K"))
}
set.seed(NULL)                  # reset the RNG
hist(d, breaks = 20)
abline(v = 2.5, lwd = 2, col = "firebrick")  # mean position observed in ompA
sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the
              # N-terminus or more. That's just below the significance
              # threshold of 5 %. It's a trend, but to be sure we are looking
              # at a biological effect we would need to see more
              # sequences.
# == 7.2 Sampling ==========================================================
# === 7.2.1 Equiprobable characters
# Assume you need a large random-nucleotide string for some statistical model.
# How to create such a string? sample() can easily create it:
nuc <- c("A", "C", "G", "T")
N <- 100
set.seed(16818) # set RNG seed for repeatable randomness
v <- sample(nuc, N, replace = TRUE)
set.seed(NULL) # reset the RNG
(mySeq <- paste(v, collapse = ""))
# What's the GC content?
table(v)
sum(table(v)[c("G", "C")]) # 51 is close to expected
# What's the number of CpG motifs? Easy to check with the stringi
# stri_match_all() function
if (! requireNamespace("stringi", quietly = TRUE)) {
install.packages("stringi")
}
# Package information:
# library(help = stringi) # basic information
# browseVignettes("stringi") # available vignettes
# data(package = "stringi") # available datasets
(x <- stringi::stri_match_all(mySeq, regex = "CG"))
# Count the motifs. When nothing matches, stri_match_all() reports an NA entry
# rather than an empty result, so length(unlist(x)) would claim one motif for
# a sequence that has none. Counting only the non-NA entries gives 0 then.
sum(! is.na(unlist(x)))
# Now you could compare that number with yeast DNA sequences, and determine
# whether there are more or less CpG motifs than expected by chance.
# (cf. https://en.wikipedia.org/wiki/CpG_site)
# But hold on: is that a fair comparison? sample() gives us all four nucleotides
# with the same probability. But the yeast genomic DNA GC content is only
# 38%. So you would expect fewer CpG motifs based on the statistical properties
# of the smaller number of Cs and Gs - before biology even comes into play. How
# do we account for that?
# === 7.2.2 Defined probability vector
# This is where we need to know how to create samples with specific probability
# distributions. A crude hack would be to create a sampling source vector with
# 19 C, 19 G, 31 A and 31 T
c(rep("C", 19), rep("G", 19), rep(c("A"), 31), rep(c("T"), 31))
# ... but that doesn't scale if the numeric accuracy needs to be higher.
#
# However sample() has an argument that takes care of that: you can explicitly
# specify the probabilities with which each element of the sampling vector
# should be chosen:
nuc <- c("A", "C", "G", "T")
N <- 100
myProb <- c(0.31, 0.19, 0.19, 0.31) # sampling probabilities
set.seed(16818) # set RNG seed for repeatable randomness
v <- sample(nuc, N, prob = myProb, replace = TRUE)
set.seed(NULL) # reset the RNG
(mySeq <- paste(v, collapse = ""))
# What's the GC content?
table(v)
sum(table(v)[c("G", "C")]) # Close to expected
# What's the number of CpG motifs?
(x <- stringi::stri_match_all(mySeq, regex = "CG"))
# ... not a single one in this case.
# [END]
# tocID <- "BIN-Sequence.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-Sequence unit.
#
# Version: 1.5
#
# Date: 2017-09 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.5 2020 Updates
# 1.4 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use Biocmanager:: not biocLite()
# 1.3 Update set.seed() usage
# 1.2 Removed irrelevant task. How did that even get in there? smh
# 1.1 Add chartr()
# 1.0 First live version 2017.
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ----------------------------------------------------
#TOC> 1 Prepare 63
#TOC> 2 Storing Sequence 80
#TOC> 3 String properties 109
#TOC> 4 Substrings 116
#TOC> 5 Creating strings: sprintf() 137
#TOC> 6 Changing strings 172
#TOC> 6.1.1 Changing case 174
#TOC> 6.1.2 Reverse 179
#TOC> 6.1.3 Change characters 183
#TOC> 6.1.4 Substitute characters 211
#TOC> 6.2 stringi and stringr 231
#TOC> 6.3 dbSanitizeSequence() 241
#TOC> 7 Permuting and sampling 253
#TOC> 7.1 Permutations 260
#TOC> 7.2 Sampling 306
#TOC> 7.2.1 Equiprobable characters 308
#TOC> 7.2.2 Defined probability vector 350
#TOC>
#TOC> ==========================================================================
# = 1 Prepare =============================================================
# Much basic sequence handling is supported by the Bioconductor package
# Biostrings.
if (! requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
if (! requireNamespace("Biostrings", quietly = TRUE)) {
BiocManager::install("Biostrings")
}
# Package information:
# library(help = Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets
# = 2 Storing Sequence ====================================================
# Sequences can be represented and stored as vectors of single characters ...
(v <- c("D", "I", "V", "M", "T", "Q"))
# ... as strings ...
(s <- "DIVMTQ")
# ... or as more complex objects with rich metadata e.g. as a Biostrings
# DNAstring, RNAstring, AAString, etc.
(a <- Biostrings::AAString("DIVMTQ"))
# ... and all of these representations can be interconverted:
# string to vector ...
unlist(strsplit(s, ""))
# vector to string ...
paste(v, sep = "", collapse = "")
# ... and AAstring to plain string.
as.character(a)
# Since operations with character vectors trivially follow all other vector
# conventions and syntax, and we will look at Biostrings methods in more
# detail in a later unit, we will focus on basic strings in the following.
# = 3 String properties ===================================================
length(s) # why ???
nchar(s) # Aha!
# = 4 Substrings ==========================================================
# Use the substr() function
substr(s, 2, 4)
# or the similar substring()
substring(s, 2, 4)
# Note: both functions are vectorized (i.e. they operate on vectors
# of arguments, you don't need to loop over input)...
myBiCodes <- c("HOMSA", "MUSMU", "FUGRU", "XENLA")
substr( myBiCodes, 1, 3)
substring(myBiCodes, 1, 3)
# ... however only substring() will also use vectors for start and stop
s <- "gatattgtgatgacccagtaa" # a DNA sequence
(vI <- seq(1, nchar(s), by = 3)) # an index vector
substr( s, vI, vI+2) # ... returns only the first nucleotide triplet
substring(s, vI, vI+2) # ... returns all triplets
# = 5 Creating strings: sprintf() =========================================
# Sprintf is a very smart, very powerful function and has cognates in all
# other programming languages. It has a bit of a learning curve, but this is
# totally worth it:
# the function takes a format string, and a list of other arguments. It returns
# a formatted string. Here are some examples - watch carefully for sprintf()
# calls elsewhere in the code.
sprintf("Just a string.")
sprintf("A string and the number %d.", 5)
sprintf("More numbers: %d ate %d.", 7, 9) # Sorry
sprintf("Pi is ~ %1.2f ...", pi)
sprintf("or more accurately ~ %1.11f.", pi)
x <- "bottles of beer"
N <- 99
sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.",
N, x, N, x, "one down, and pass it around", N - 1, x)
# Note that in the last example, the value of the string was displayed with
# R's usual print-formatting function and therefore the line-break "\n" did
# not actually break the line. To have line breaks, tabs etc, you need to use
# cat() to display the string:
for (i in N:(N-4)) {
cat(sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.\n\n",
i, x, i, x, "one down, and pass it around", i - 1, x))
}
# sprintf() is vectorized: if one of its parameters is a vector, it
# will generate one output string for each of the vector's elements:
cat(sprintf("\n%s fish", c("one", "two", "red", "blue")))
# = 6 Changing strings ====================================================
# === 6.1.1 Changing case
tolower(s)
toupper(tolower(s))
# === 6.1.2 Reverse
# (This used to work in Biostrings, apparently it doesn't work anymore. Why?)
# Biostrings::str_rev(s)
# The following works, of course, but awkward:
s
paste0(rev(unlist(strsplit(s, ""))), collapse = "")
# reverse complement
COMP <- c("t", "g", "c", "a")
names(COMP) <- c("a", "c", "g", "t") # mapping the complement via names
s
paste0(COMP[rev(unlist(strsplit(s, "")))], collapse = "")
# === 6.1.3 Change characters
# chartr(old, new, x) maps all characters in x that appear in "old" to the
# corresponding character in "new." Kind of like the COMP vector above ...
chartr("aeio", "uuuu", "We hold these truths to be self-evident ...")
# One could implement toupper() and tolower() with this - remember that R has
# character vectors of uppercase and lowercase letters as language constants.
chartr(paste0(letters, collapse = ""),
paste0(LETTERS, collapse = ""),
"Twinkle, twinkle little star, how I wonder what you are.")
# chartr() also makes a neat toy substitution cypher: a shuffled copy of the
# alphabet is a one-to-one mapping, so swapping the "old" and "new" arguments
# reverses the encoding.
alBet <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .,;:?0123456789"
set.seed(112358)                       # fixed seed: reproducible shuffle
( myCypher <- paste(sample(strsplit(alBet, "")[[1]]), collapse = "") )
set.seed(NULL)                         # back to an unseeded RNG
# encode: every character of alBet is replaced by its partner in myCypher ...
(x <- chartr(old = alBet,
             new = myCypher,
             x   = "... seven for a secret, never to be told."))
# decode: the same call with the two alphabets swapped ...
chartr(old = myCypher, new = alBet, x = x)
# (Nb. substitution cyphers are easy to crack!)
# === 6.1.4 Substitute characters
# gsub can change lengths.
# Example: implementing the binary Fibonacci sequence:
# 0 -> 1; 1 -> 10 , in three nested gsub() statements
( s <- 1 )
( s <- gsub("2", "10", gsub("0", "1", gsub("1", "2", s))) )
# Iterate this line a few times ...
#
# cf. http://www.maths.surrey.ac.uk/hosted-sites/R.Knott/Fibonacci/fibrab.html
# for the features of the sequence.
# I use gsub() often to delete unwanted characters ...
# ... select something, and substitute the empty string for it.
(s <- gsub("-", "", s))
# For example: clean up a sequence
# copy/paste from UniProt
(s <- " 10 20 30 40 50
MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")
# remove numbers
(s <- gsub("[0-9]", "", s))
# remove "whitespace" (spaces, tabs, line breaks)...
(s <- gsub("\\s", "", s))
# == 6.2 stringi and stringr ===============================================
# But there are also specialized functions eg. to remove leading/trailing
# whitespace which may be important to sanitize user input etc. Have a look at
# the function descriptions for the stringr and the stringi package. stringr is
# part of the tidyverse, and for the most part a wrapper for stringi functions.
# https://github.com/tidyverse/stringr
# == 6.3 dbSanitizeSequence() ==============================================
# In our learning units, we use a function dbSanitizeSequence() to clean up
# sequences that may be copy/pasted from Web-sources
cat( s <- ">FASTA header will be removed
10 20 30 40 50
MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR " )
dbSanitizeSequence(s)
# = 7 Permuting and sampling ==============================================
# An important aspect of working with strings is generating random strings
# with given statistical properties: reference items to evaluate significance.
# == 7.1 Permutations ======================================================
# One way to produce such reference items is to permute a string. A permuted
# string has the same composition as the original, but all positional
# information is lost. The sample() function can be used to permute:
# This is the sequence of the ompA secretion signal
(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
(x <- sample(s, length(s))) # permuted
# Here's a small example how such permuted strings may be useful. As you look
# at the ompA sequence, you suspect that the two lysines near the +-charged
# N-terminus may not be accidental, but selected for a positively charged
# N-terminus. What is the chance that such a sequence has two lysines close to
# the N-terminus simply by chance? Or put differently: what is the average
# distance of two lysines in such a sequence to the N-terminus. First, we
# need an expression that measures the distance. A simple use of the which()
# function will do just fine.
which(s == "K") # shows they are in position 2 and 3, so ...
mean(which(s == "K")) # ... gives us the average, and ...
mean(which(x == "K")) # ... gives us the average of the permuted sequence.
# So what does the distribution look like? Let's do 10,000 trials: permute the
# sequence N times and record the mean position of the lysines each time.
(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
N <- 10000
d <- numeric(N)                 # preallocate the result vector
set.seed(112358)                # set RNG seed for repeatable randomness
for (i in seq_len(N)) {         # seq_len() is safe even if N were 0
  d[i] <- mean(which(sample(s, length(s)) == "K"))
}
set.seed(NULL)                  # reset the RNG
hist(d, breaks = 20)
abline(v = 2.5, lwd = 2, col = "firebrick")  # mean position observed in ompA
sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the
              # N-terminus or more. That's just below the significance
              # threshold of 5 %. It's a trend, but to be sure we are looking
              # at a biological effect we would need to see more
              # sequences.
# == 7.2 Sampling ==========================================================
# === 7.2.1 Equiprobable characters
# Assume you need a large random-nucleotide string for some statistical model.
# How to create such a string? sample() can easily create it:
nuc <- c("A", "C", "G", "T")
N <- 100
set.seed(16818) # set RNG seed for repeatable randomness
v <- sample(nuc, N, replace = TRUE)
set.seed(NULL) # reset the RNG
(mySeq <- paste(v, collapse = ""))
# What's the GC content?
table(v)
sum(table(v)[c("G", "C")]) # 51 is close to expected
# What's the number of CpG motifs? Easy to check with the stringi
# stri_match_all() function
if (! requireNamespace("stringi", quietly = TRUE)) {
install.packages("stringi")
}
# Package information:
# library(help = stringi) # basic information
# browseVignettes("stringi") # available vignettes
# data(package = "stringi") # available datasets
(x <- stringi::stri_match_all(mySeq, regex = "CG"))
# Count the motifs. When nothing matches, stri_match_all() reports an NA entry
# rather than an empty result, so length(unlist(x)) would claim one motif for
# a sequence that has none. Counting only the non-NA entries gives 0 then.
sum(! is.na(unlist(x)))
# Now you could compare that number with yeast DNA sequences, and determine
# whether there are more or less CpG motifs than expected by chance.
# (cf. https://en.wikipedia.org/wiki/CpG_site)
# But hold on: is that a fair comparison? sample() gives us all four nucleotides
# with the same probability. But the yeast genomic DNA GC content is only
# 38%. So you would expect fewer CpG motifs based on the statistical properties
# of the smaller number of Cs and Gs - before biology even comes into play. How
# do we account for that?
# === 7.2.2 Defined probability vector
# This is where we need to know how to create samples with specific probability
# distributions. A crude hack would be to create a sampling source vector with
# 19 C, 19 G, 31 A and 31 T
c(rep("C", 19), rep("G", 19), rep(c("A"), 31), rep(c("T"), 31))
# ... but that doesn't scale if the numeric accuracy needs to be higher.
#
# However sample() has an argument that takes care of that: you can explicitly
# specify the probabilities with which each element of the sampling vector
# should be chosen:
nuc <- c("A", "C", "G", "T")
N <- 100
myProb <- c(0.31, 0.19, 0.19, 0.31) # sampling probabilities
set.seed(16818) # set RNG seed for repeatable randomness
v <- sample(nuc, N, prob = myProb, replace = TRUE)
set.seed(NULL) # reset the RNG
(mySeq <- paste(v, collapse = ""))
# What's the GC content?
table(v)
sum(table(v)[c("G", "C")]) # Close to expected
# What's the number of CpG motifs?
(x <- stringi::stri_match_all(mySeq, regex = "CG"))
# ... not a single one in this case.
# [END]

File diff suppressed because it is too large Load Diff

View File

@ -1,349 +1,349 @@
# tocID <- "FND-Genetic_code.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the FND-Genetic_code unit.
#
# Version: 1.2
#
# Date: 2017 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 2020 Maintenance
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use BiocManager:: not biocLite()
# 1.0.1 Comment on "incomplete final line" warning in FASTA
# 1.0 First live version
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ----------------------------------------------------------------
#TOC> 1 Storing the genetic code 45
#TOC> 1.1 Genetic code in Biostrings 63
#TOC> 2 Working with the genetic code 94
#TOC> 2.1 Translate a sequence. 129
#TOC> 3 An alternative representation: 3D array 212
#TOC> 3.1 Print a Genetic code table 246
#TOC> 4 Tasks 272
#TOC>
#TOC> ==========================================================================
# = 1 Storing the genetic code ============================================
# The genetic code maps trinucleotide codons to amino acids. To store it, we
# need some mechanism to associate the two representations. The most
# convenient way to do that is a "named vector" which holds the amino acid
# code and assigns the codons as names to its elements.
x <- c("M", "H", "H", "*", "*", "*")
names(x) <- c("ATG", "CAC", "CAT", "TAA", "TAG", "TGA")
x
# Then we can access the vector by the codon as name, and retrieve the
# amino acid ...
x["ATG"]
x["CAC"]
x["TAA"]
# ... or the names of elements, to retrieve the codon(s)
names(x)[x == "M"]
names(x)[x == "H"]
names(x)[x == "*"]
# == 1.1 Genetic code in Biostrings ========================================
# Conveniently, the standard genetic code as well as its alternatives are
# available in the Bioconductor "Biostrings" package:
if (! requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
if (! requireNamespace("Biostrings", quietly = TRUE)) {
BiocManager::install("Biostrings")
}
# Package information:
# library(help = Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets
# The standard genetic code vector
Biostrings::GENETIC_CODE
# The table of genetic codes. This information corresponds to this page
# at the NCBI:
# https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes
Biostrings::GENETIC_CODE_TABLE
# Most of the alternative codes are mitochondrial codes. The id of the
# Alternative Yeast Nuclear code is "12"
Biostrings::getGeneticCode("12") # Alternative Yeast Nuclear
# = 2 Working with the genetic code =======================================
# We'll use Biostrings::GENETIC_CODE a lot in this script, so we'll assign it
# to a "local" variable, rather than retrieving it from the package all the
# time.
GC <- Biostrings::GENETIC_CODE
# This is a named vector of characters ...
str(GC)
# ... which also stores the alternative initiation codons TTG and CTG in
# an attribute of the vector. (Alternative initiation codons are sometimes
# used instead of ATG to initiate translation. Even if translation is not
# initiated at ATG, these codons are still translated with fMet.)
attr(GC, "alt_init_codons")
# But the key to use this vector is in the "names" which we use for subsetting
# the list of amino acids in whatever way we need.
names(GC)
# The translation of "TGG" ...
GC["TGG"]
# All stop codons
names(GC)[GC == "*"]
# All start codons
names(GC)[GC == "M"] # ... or
c(names(GC)[GC == "M"],
attr(GC, "alt_init_codons"))
# == 2.1 Translate a sequence. =============================================
# I have provided a gene sequence in the data directory:
# S288C_YDL056W_MBP1_coding.fsa is the yeast Mbp1 FASTA sequence.
# read it
mbp1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
# You will notice that this generates a Warning message:
# Warning message:
# In readLines("./data/S288C_YDL056W_MBP1_coding.fsa") :
# incomplete final line found on './data/S288C_YDL056W_MBP1_coding.fsa'
# The reason for this is that the last character of the file is the letter "A"
# and not a "\n" line break. This file is exactly how it was sent from the
# NCBI server; I think good, defensive programming practice would have been to
# include some kind of an end-marker in the file, like a final "\n". This helps
# us recognize an incomplete transmission. Let's parse the actual sequence from
# the file, and then check for completeness.
head(mbp1)
# drop the first line (header)
mbp1 <- mbp1[-1]
head(mbp1)
# concatenate it all to a single string
mbp1 <- paste(mbp1, sep = "", collapse = "")
# how long is it?
nchar(mbp1)
# how many codons?
nchar(mbp1)/3
# That looks correct for the 833 aa sequence plus 1 stop codon. This gives us a
# first verification that the file we read is complete, the nucleotides of a
# complete ORF should be divisible by 3.
# Extract the codons. There are many ways to split a long string into chunks
# of three characters. Here we use the Biostrings codons() function. codons()
# requires an object of type DNAString - a special kind of string with
# attributes that are useful for Biostrings. Thus we convert the sequence first
# with DNAString(), then split it up, then convert it into a plain
# character vector.
mbp1Codons <- as.character(Biostrings::codons(Biostrings::DNAString(mbp1)))
head(mbp1Codons)
# now translate each codon
# Subsetting the named vector GC with the whole vector of codons looks up all
# translations in one step. The result has exactly one element per codon, so
# the sequence length does not need to be hard-coded. unname() drops the codon
# names, leaving a plain character vector of one-letter amino acid codes.
mbp1AA <- unname(GC[mbp1Codons])
head(mbp1Codons)
head(mbp1AA)
tail(mbp1Codons)
tail(mbp1AA) # Note the stop!
# The TAA "ochre" stop codon is our second verification that the nucleotide
# sequence is complete: a stop codon can't appear internally in an ORF.
# We can work with the mbp1AA vector, for example to tabulate the
# amino acid frequencies:
table(mbp1AA)
sort(table(mbp1AA), decreasing = TRUE)
# Or we can paste all elements together into a single string. But let's remove
# the stop, it's not actually a part of the sequence. To remove the last element
# of a vector, re-assign it with a vector minus the index of the last element:
mbp1AA <- mbp1AA[-(length(mbp1AA))]
tail(mbp1AA) # Note the stop is gone!
# paste it together, collapsing the elements using an empty string as the
# separation-character (i.e.: nothing)
(Mbp1 <- paste(mbp1AA, sep = "", collapse = ""))
# = 3 An alternative representation: 3D array =============================
# We don't use 3D arrays often - usually just 2D tables and data frames, so
# here is a good opportunity to review the syntax of 3D arrays with a
# genetic code cube:
# Set up an empty 4 x 4 x 4 character array: one dimension each for the first,
# second, and third codon position, with A G C T as the element names
cCube <- array(character(64),
               dim = rep(4, 3),
               dimnames = rep(list(c("A", "G", "C", "T")), 3))

# Visit every cell, build the codon from the dimension names of its three
# indices, and store the amino acid that GC holds for that codon
for (i in seq_len(4)) {
  for (j in seq_len(4)) {
    for (k in seq_len(4)) {
      myCodon <- paste0(dimnames(cCube)[[1]][i],
                        dimnames(cCube)[[2]][j],
                        dimnames(cCube)[[3]][k])
      cCube[i, j, k] <- GC[myCodon]
    }
  }
}
# confirm
cCube["A", "T", "G"] # methionine
cCube["T", "T", "T"] # phenylalanine
cCube["T", "A", "G"] # stop (amber)
# == 3.1 Print a Genetic code table ========================================
# The data structure of our cCube is well suited to print a table. In the
# "standard" way to print the genetic code, we write codons with the same
# second nucleotide in columns, and arrange rows in blocks of same
# first nucleotide, varying the third nucleotide fastest. This maximizes the
# similarity of adjacent amino acids in the table if we print the
# nucleotides in the order T C A G. It's immediately obvious that the code
# is not random: the universal genetic code is exceptionally error tolerant in
# the sense that mutations (or single-nucleotide translation errors) are likely
# to result in an amino acid with similar biophysical properties as the
# original.
# Print the genetic code as blocks of four lines with four codons each. The
# order of nuc determines the order of blocks, lines, and columns.
nuc <- c("T", "C", "A", "G")
# (calling variables f, s, t to indicate first, second, and third position ...)
for (f in nuc) { # first varies in blocks
for (t in nuc) { # third varies in rows: each value of t prints one line
for (s in nuc) { # second varies in columns along that line
cat(sprintf("%s%s%s: %s ", f, s, t, cCube[f, s, t]))
}
cat("\n")
}
cat("\n")
}
# = 4 Tasks ===============================================================
# Task: What do you need to change to print the table with U instead
# of T? Try it.
# Task: Point mutations are more often transitions (purine -> purine;
# pyrimidine -> pyrimidine) than transversions (purine -> pyrimidine;
# pyrimidine -> purine), even though twice as many transversions
# are possible in the code. This is most likely due to a deamination /
# tautomerization process that favours C -> T changes. If the code
# indeed minimizes the effect of mutations, you would expect that
# codons that differ by a transition code for more similar amino acids
# than codons that differ by a transversion. Is that true? List the set
# of all amino acid pairs that are encoded by codons with a C -> T
# transition. Then list the set of amino acid pairs with a C -> A
# transversion. Which set of pairs is more similar?
# Task: How many stop codons do the two mbp1-gene derived amino acid sequences
# have if you translate them in the 2. or the 3. frame?
# Task: How does the amino acid composition change if you translate the mbp1
# gene with the Alternative Yeast Nuclear code that is used by the
# "GTC clade" of fungi?
# (cf. https://en.wikipedia.org/wiki/Alternative_yeast_nuclear_code )
# Solution:
# Fetch the code
Biostrings::GENETIC_CODE_TABLE
Biostrings::GENETIC_CODE_TABLE$name[Biostrings::GENETIC_CODE_TABLE$id=="12"]
altYcode <- Biostrings::getGeneticCode("12")
# what's the difference?
(delta <- which(Biostrings::GENETIC_CODE != altYcode))
Biostrings::GENETIC_CODE[delta]
altYcode[delta]
# translate
# Look up all codons at once in the named vector altYcode. The result has one
# element per codon of mbp1Codons, so the sequence length is not hard-coded;
# unname() drops the codon names.
altYAA <- unname(altYcode[mbp1Codons])
# Note: altYAA includes the translated stop codon. If the stop was removed
# from mbp1AA earlier in this script, only the second table shows a "*".
table(mbp1AA)
table(altYAA)
# Task: The genetic code has significant redundancy, i.e. there are up to six
# codons that code for the same amino acid. Write code that lists how
# many amino acids are present how often i.e. it should tell you that
# two amino acids are encoded only with a single codon, three amino
# acids have six codons, etc. Solution below, but don't peek. There
# are many possible ways to do this.
#
#
# Solution:
( x <- table(table(Biostrings::GENETIC_CODE)) )
# confirm
sum(x * as.numeric(names(x)))
# [END]
# tocID <- "FND-Genetic_code.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the FND-Genetic_code unit.
#
# Version: 1.2
#
# Date: 2017 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 2020 Maintenance
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use BiocManager:: not biocLite()
# 1.0.1 Comment on "incomplete final line" warning in FASTA
# 1.0 First live version
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ----------------------------------------------------------------
#TOC> 1 Storing the genetic code 45
#TOC> 1.1 Genetic code in Biostrings 63
#TOC> 2 Working with the genetic code 94
#TOC> 2.1 Translate a sequence. 129
#TOC> 3 An alternative representation: 3D array 212
#TOC> 3.1 Print a Genetic code table 246
#TOC> 4 Tasks 272
#TOC>
#TOC> ==========================================================================
# = 1 Storing the genetic code ============================================
# The genetic code maps trinucleotide codons to amino acids. To store it, we
# need some mechanism to associate the two representations. The most
# convenient way to do that is a "named vector" which holds the amino acid
# code and assigns the codons as names to its elements.
x <- c("M", "H", "H", "*", "*", "*")
names(x) <- c("ATG", "CAC", "CAT", "TAA", "TAG", "TGA")
x
# Then we can access the vector by the codon as name, and retrieve the
# amino acid ...
x["ATG"]
x["CAC"]
x["TAA"]
# ... or the names of elements, to retrieve the codon(s)
names(x)[x == "M"]
names(x)[x == "H"]
names(x)[x == "*"]
# == 1.1 Genetic code in Biostrings ========================================
# Conveniently, the standard genetic code as well as its alternatives are
# available in the Bioconductor "Biostrings" package:
if (! requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
if (! requireNamespace("Biostrings", quietly = TRUE)) {
BiocManager::install("Biostrings")
}
# Package information:
# library(help = Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets
# The standard genetic code vector
Biostrings::GENETIC_CODE
# The table of genetic codes. This information corresponds to this page
# at the NCBI:
# https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes
Biostrings::GENETIC_CODE_TABLE
# Most of the alternative codes are mitochondrial codes. The id of the
# Alternative Yeast Nuclear code is "12"
Biostrings::getGeneticCode("12") # Alternative Yeast Nuclear
# = 2 Working with the genetic code =======================================
# We'll use Biostrings::GENETIC_CODE a lot in this script, so we'll assign it
# to a "local" variable, rather than retrieving it from the package all the
# time.
GC <- Biostrings::GENETIC_CODE
# This is a named vector of characters ...
str(GC)
# ... which also stores the alternative initiation codons TTG and CTG in
# an attribute of the vector. (Alternative initiation codons are sometimes
# used instead of ATG to initiate translation. Even if translation is not
# initiated at ATG, these codons are still translated with fMet.)
attr(GC, "alt_init_codons")
# But the key to use this vector is in the "names" which we use for subsetting
# the list of amino acids in whatever way we need.
names(GC)
# The translation of "TGG" ...
GC["TGG"]
# All stop codons
names(GC)[GC == "*"]
# All start codons
names(GC)[GC == "M"] # ... or
c(names(GC)[GC == "M"],
attr(GC, "alt_init_codons"))
# == 2.1 Translate a sequence. =============================================
# I have provided a gene sequence in the data directory:
# S288C_YDL056W_MBP1_coding.fsa is the yeast Mbp1 FASTA sequence.
# read it
mbp1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
# You will notice that this generates a Warning message:
# Warning message:
# In readLines("./data/S288C_YDL056W_MBP1_coding.fsa") :
# incomplete final line found on './data/S288C_YDL056W_MBP1_coding.fsa'
# The reason for this is that the last character of the file is the letter "A"
# and not a "\n" line break. This file is exactly how it was sent from the
# NCBI server; I think good, defensive programming practice would have been to
# include some kind of an end-marker in the file, like a final "\n". This helps
# us recognize an incomplete transmission. Let's parse the actual sequence from
# the file, and then check for completeness.
head(mbp1)
# drop the first line (header)
mbp1 <- mbp1[-1]
head(mbp1)
# concatenate it all to a single string
mbp1 <- paste(mbp1, sep = "", collapse = "")
# how long is it?
nchar(mbp1)
# how many codons?
nchar(mbp1)/3
# That looks correct for the 833 aa sequence plus 1 stop codon. This gives us a
# first verification that the file we read is complete, the nucleotides of a
# complete ORF should be divisible by 3.
# Extract the codons. There are many ways to split a long string into chunks
# of three characters. Here we use the Biostrings codons() function. codons()
# requires an object of type DNAString - a special kind of string with
# attributes that are useful for Biostrings. Thus we convert the sequence first
# with DNAString(), then split it up, then convert it into a plain
# character vector.
mbp1Codons <- as.character(Biostrings::codons(Biostrings::DNAString(mbp1)))
head(mbp1Codons)
# now translate each codon
# Subsetting the named vector GC with the whole vector of codons looks up all
# translations in one step. The result has exactly one element per codon, so
# the sequence length does not need to be hard-coded. unname() drops the codon
# names, leaving a plain character vector of one-letter amino acid codes.
mbp1AA <- unname(GC[mbp1Codons])
head(mbp1Codons)
head(mbp1AA)
tail(mbp1Codons)
tail(mbp1AA) # Note the stop!
# The TAA "ochre" stop codon is our second verification that the nucleotide
# sequence is complete: a stop codon can't appear internally in an ORF.
# We can work with the mbp1AA vector, for example to tabulate the
# amino acid frequencies:
table(mbp1AA)
sort(table(mbp1AA), decreasing = TRUE)
# Or we can paste all elements together into a single string. But let's remove
# the stop, it's not actually a part of the sequence. To remove the last element
# of a vector, re-assign it with a vector minus the index of the last element:
mbp1AA <- mbp1AA[-(length(mbp1AA))]
tail(mbp1AA) # Note the stop is gone!
# paste it together, collapsing the elements using an empty string as the
# separation-character (i.e.: nothing)
(Mbp1 <- paste(mbp1AA, sep = "", collapse = ""))
# = 3 An alternative representation: 3D array =============================
# We don't use 3D arrays often - usually just 2D tables and data frames, so
# here is a good opportunity to review the syntax of 3D arrays with a
# genetic code cube:
# Set up an empty 4 x 4 x 4 character array: one dimension each for the first,
# second, and third codon position, with A G C T as the element names
cCube <- array(character(64),
               dim = rep(4, 3),
               dimnames = rep(list(c("A", "G", "C", "T")), 3))

# Visit every cell, build the codon from the dimension names of its three
# indices, and store the amino acid that GC holds for that codon
for (i in seq_len(4)) {
  for (j in seq_len(4)) {
    for (k in seq_len(4)) {
      myCodon <- paste0(dimnames(cCube)[[1]][i],
                        dimnames(cCube)[[2]][j],
                        dimnames(cCube)[[3]][k])
      cCube[i, j, k] <- GC[myCodon]
    }
  }
}
# confirm
cCube["A", "T", "G"] # methionine
cCube["T", "T", "T"] # phenylalanine
cCube["T", "A", "G"] # stop (amber)
# == 3.1 Print a Genetic code table ========================================
# The data structure of our cCube is well suited to print a table. In the
# "standard" way to print the genetic code, we write codons with the same
# second nucleotide in columns, and arrange rows in blocks of same
# first nucleotide, varying the third nucleotide fastest. This maximizes the
# similarity of adjacent amino acids in the table if we print the
# nucleotides in the order T C A G. It's immediately obvious that the code
# is not random: the universal genetic code is exceptionally error tolerant in
# the sense that mutations (or single-nucleotide translation errors) are likely
# to result in an amino acid with similar biophysical properties as the
# original.
# Print the genetic code as blocks of four lines with four codons each. The
# order of nuc determines the order of blocks, lines, and columns.
nuc <- c("T", "C", "A", "G")
# (calling variables f, s, t to indicate first, second, and third position ...)
for (f in nuc) { # first varies in blocks
for (t in nuc) { # third varies in rows: each value of t prints one line
for (s in nuc) { # second varies in columns along that line
cat(sprintf("%s%s%s: %s ", f, s, t, cCube[f, s, t]))
}
cat("\n")
}
cat("\n")
}
# = 4 Tasks ===============================================================
# Task: What do you need to change to print the table with U instead
# of T? Try it.
# Task: Point mutations are more often transitions (purine -> purine;
# pyrimidine -> pyrimidine) than transversions (purine -> pyrimidine;
# pyrimidine -> purine), even though twice as many transversions
# are possible in the code. This is most likely due to a deamination /
# tautomerization process that favours C -> T changes. If the code
# indeed minimizes the effect of mutations, you would expect that
# codons that differ by a transition code for more similar amino acids
# than codons that differ by a transversion. Is that true? List the set
# of all amino acid pairs that are encoded by codons with a C -> T
# transition. Then list the set of amino acid pairs with a C -> A
# transversion. Which set of pairs is more similar?
# Task: How many stop codons do the two mbp1-gene derived amino acid sequences
# have if you translate them in the 2. or the 3. frame?
# Task: How does the amino acid composition change if you translate the mbp1
# gene with the Alternative Yeast Nuclear code that is used by the
# "GTC clade" of fungi?
# (cf. https://en.wikipedia.org/wiki/Alternative_yeast_nuclear_code )
# Solution:
# Fetch the code
Biostrings::GENETIC_CODE_TABLE
Biostrings::GENETIC_CODE_TABLE$name[Biostrings::GENETIC_CODE_TABLE$id=="12"]
altYcode <- Biostrings::getGeneticCode("12")
# what's the difference?
(delta <- which(Biostrings::GENETIC_CODE != altYcode))
Biostrings::GENETIC_CODE[delta]
altYcode[delta]
# translate
# Look up all codons at once in the named vector altYcode. The result has one
# element per codon of mbp1Codons, so the sequence length is not hard-coded;
# unname() drops the codon names.
altYAA <- unname(altYcode[mbp1Codons])
# Note: altYAA includes the translated stop codon. If the stop was removed
# from mbp1AA earlier in this script, only the second table shows a "*".
table(mbp1AA)
table(altYAA)
# Task: The genetic code has significant redundancy, i.e. there are up to six
# codons that code for the same amino acid. Write code that lists how
# many amino acids are present how often i.e. it should tell you that
# two amino acids are encoded only with a single codon, three amino
# acids have six codons, etc. Solution below, but don't peek. There
# are many possible ways to do this.
#
#
# Solution:
( x <- table(table(Biostrings::GENETIC_CODE)) )
# confirm
sum(x * as.numeric(names(x)))
# [END]

File diff suppressed because it is too large Load Diff

View File

@ -1,224 +1,224 @@
# tocID <- "FND-STA-Information_theory.R"
#
# ==============================================================================
#
# Purpose: A Bioinformatics Course:
# R code accompanying the FND-STA-Information_theory unit.
#
# Version: 0.2.1
#
# Date: 2017 - 2021
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 0.2.1 Maintenance
# 0.2 Under development
# 0.1 First code copied from 2016 material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------
#TOC> 1 ___Section___ 39
#TOC>
#TOC> ==========================================================================
# = 1 ___Section___ =======================================================
# What level of information is "significant"
# Assume the background distribution is the database frequencies of
# amino acids:
# Background distribution: a named numeric vector with one frequency per
# one-letter amino acid code.
# Uniprot frequencies October 2017, slightly adjusted to sum to 1.0
AAref <- c(A = 0.0904, C = 0.0123, D = 0.0545, E = 0.0617, F = 0.0394,
           G = 0.0724, H = 0.0221, I = 0.0573, K = 0.0504, L = 0.0986,
           M = 0.0240, N = 0.0392, P = 0.0486, Q = 0.0381, R = 0.0570,
           S = 0.0673, T = 0.0558, V = 0.0686, W = 0.0129, Y = 0.0294)
sum(AAref)
# Function to calculate Shannon entropy
H <- function(pmf) {
  # Calculate Shannon entropy
  # Parameters:
  #   pmf  (numeric)  probability mass function: a vector of states and
  #                   associated probabilities. Each element of
  #                   pmf must be in (0, 1] and sum(pmf) must be 1
  #                   (within the default tolerance of all.equal()).
  # Value:
  #   Shannon entropy in bits.
  # Errors:
  #   Stops if pmf is not numeric, is empty, contains NA, has elements
  #   outside (0, 1], or does not sum to 1.
  # Examples:
  #   H(c(A=0.25, C=0.25, G=0.25, T=0.25))  # 2 bits entropy in a random
  #                                         # nucleotide sequence
  #   H(1)   # If all elements are the same, entropy is zero
  #
  # all.equal() returns either TRUE or a character description of the
  # difference - never FALSE - so its result has to be tested with isTRUE().
  # The checks are chained with && so that the comparisons are only evaluated
  # on NA-free numeric input.
  isPMF <- is.numeric(pmf) &&
    length(pmf) > 0 &&
    ! anyNA(pmf) &&
    all(pmf > 0 & pmf <= 1) &&
    isTRUE(all.equal(1.0, sum(pmf)))
  if (! isPMF) {
    stop("Input is not a discrete probability distribution.")
  }
  return(-sum(pmf * log2(pmf)))
}
# Why use all.equal()? Exact comparisons with floating point numbers are
# brittle. Consider for example:
1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6 == 1
print(1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6, digits = 22) # 0.9999999999999998889777
# all.equal() tests for _near_ equality with tolerance of ~ 1.5e-8
# Entropy of the database frequencies (in bits):
(Href <- H(AAref))
# for comparison: entropy if all amino acids are equiprobable
H(rep(0.05, 20))
# Set up a simulation to estimate the distribution of Information values
# from random sequences drawn from AAref. This is the distribution for the
# statistical null hypothesis:
nObs <- 15        # number of observations (e.g aligned sequences)
# nObs <- 80
nTrials <- 10000  # number of simulated trials
IObs <- numeric(nTrials)  # one Information value per trial
# counts per amino acid, named like AAref; refilled in every trial
simCounts <- stats::setNames(numeric(20), names(AAref))
for (i in seq_len(nTrials)) {
  # draw nObs amino acid letters with the probabilities of AAref
  AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE)
  x <- table(AAobs)             # tabulate the simulated observations
  simCounts[] <- 0              # reset all counts, keeping the names
  simCounts[names(x)] <- x      # fill in the observed counts
  simCounts <- simCounts + 0.5  # add Jeffreys' pseudocounts
  Hobs <- H(simCounts / sum(simCounts))  # counts to frequencies, then entropy
  IObs[i] <- Href - Hobs        # Information: reference minus observed entropy
}
# evaluate
hist(IObs, col = "#C9F4E3", xlim = c(-0.2, 1.0), breaks = 25)
abline(v = quantile(IObs, c(0.05, 0.95)), col = "#AA00CC")
# The purple lines are drawn at the 5% and 95% quantiles of the IObs
# distribution - i.e. an actual observation that lies outside the purple lines
# is deemed "significant"(1)(2). Of course, this is only true to the degree that
# the database frequencies are a valid model for the null-hypothesis on the
# sequence position we are considering here.
# (1) If we use 5% quantiles, this means a value is significantly larger
# than expected, and we ignore cases when the value is < 0; if we
# consider both smaller and larger values, we need to use 2.5% quantiles,
# since 5% of all observations lie outside the 0.025 and 0.975
# quantiles.
#
# (2) For an actual observation of counts, we calculate its observed
# _empirical_p_Value_ as (nCounts + 1)/(nTotal + 1).
# You can probably now appreciate that information is a bit of a shortcut for
# biological sequences, and does not really take the different inherent
# frequencies based on the character of the amino acids into account. For
# example, L is the most frequent and C is the least frequent, but if we have an
# alignment of 1000 sequences and we see that the frequencies for L and C are
# swapped, that would be _very_ surprising - nevertheless, the information would
# be 0. In order to take that into account, we should actually compute
# Kullback-Leibler divergences.
# Swap C and L frequencies
p <- AAref
q <- AAref
q["L"] <- AAref["C"]
q["C"] <- AAref["L"]
H(p)
H(q)
KLdiv <- function(p, q) {
  # Kullback-Leibler divergence of two discrete probability distributions.
  # Parameters:
  #   p, q  (numeric)  pmfs over the same outcomes, element i of p
  #                    corresponding to element i of q. No element of
  #                    either vector may be 0.
  # Value:
  #   sum(p * log(p / q)), computed with the natural logarithm.
  # Errors:
  #   Stops if the vectors differ in length or if either contains a 0.
  sameLength <- length(p) == length(q)
  if (! sameLength) {
    stop("PANIC: input vector lengths differ!")
  }
  if (any(p == 0, q == 0)) {
    stop("PANIC: 0's found in input vectors!")
  }
  logRatio <- log(p / q)
  return(sum(p * logRatio))
}
KLdiv(p, p)
KLdiv(p, q)
# Simulate the null distribution of the Kullback-Leibler divergence between
# sampled amino acid frequencies and the background distribution AAref.
nObs <- 15        # number of observations (e.g aligned sequences)
# nObs <- 80
nTrials <- 10000  # number of trials
KLdivObs <- numeric(nTrials)      # vector to store the KL divergence per trial
simCounts <- numeric(20)          # vector to tabulate the simulated counts ...
names(simCounts) <- names(AAref)  # ... with the names of AAref
for (i in seq_len(nTrials)) {     # simulate ...
  # sample AAref letters, nObs times, with the probabilities of AAref:
  AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE)
  x <- table(AAobs)                        # table simulated observations
  simCounts[] <- 0                         # initialize simCounts to 0
  simCounts[names(x)] <- x                 # overwrite with observed counts
  simCounts <- simCounts + 0.5             # add Jeffreys' pseudocounts
  simCounts <- simCounts / sum(simCounts)  # counts to frequency
  # Use KLdiv() rather than repeating its formula here. The pseudocounts
  # guarantee that no frequency is 0, so its input checks always pass.
  KLdivObs[i] <- KLdiv(simCounts, AAref)   # store KLdiv
}
# evaluate
hist(KLdivObs, col = "#C9F4E3", breaks = 25)
abline(v = quantile(KLdivObs, c(0.05, 0.95)), col = "#AA00CC")
quantile(KLdivObs, 0.992)
# Running the simulation with KL does not give a fundamentally
# different behaviour - since we are just randomly sampling. But KL would be
# more sensitive in case there is biological selection, where the sampling is no
# longer random. If I run the same simulation, with nObs <- 80 but calculating
# KLdiv instead of information, I get a 5% quantile at 0.15 - but the C/L
# frequency swap gives me a KL divergence of 0.18 - this is significant at p =
# 0.008 - (remember, Information is 0 in this case). So that's actually quite a
# nice addition to the toolbox.
# [END]
# tocID <- "FND-STA-Information_theory.R"
#
# ==============================================================================
#
# Purpose: A Bioinformatics Course:
# R code accompanying the FND-STA-Information_theory unit.
#
# Version: 0.2.1
#
# Date: 2017 - 2021
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 0.2.1 Maintenance
# 0.2 Under development
# 0.1 First code copied from 2016 material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------
#TOC> 1 ___Section___ 39
#TOC>
#TOC> ==========================================================================
# = 1 ___Section___ =======================================================
# What level of information is "significant"
# Assume the background distribution is the database frequencies of
# amino acids:
# Background amino acid frequencies: Uniprot frequencies October 2017,
# slightly adjusted to sum to 1.0. Names are the one-letter amino acid codes.
AAref <- c(
  A = 0.0904, C = 0.0123, D = 0.0545, E = 0.0617, F = 0.0394,
  G = 0.0724, H = 0.0221, I = 0.0573, K = 0.0504, L = 0.0986,
  M = 0.0240, N = 0.0392, P = 0.0486, Q = 0.0381, R = 0.0570,
  S = 0.0673, T = 0.0558, V = 0.0686, W = 0.0129, Y = 0.0294
)
sum(AAref)
# Function to calculate Shannon entropy
H <- function(pmf) {
  # Calculate Shannon entropy
  # Parameters:
  #   pmf  (numeric)  probability mass function: a vector of states and
  #                   associated probabilities. Each element of
  #                   pmf must be in (0, 1] and sum(pmf) must be 1
  #                   (within the default tolerance of all.equal()).
  # Value:
  #   Shannon entropy in bits.
  # Errors:
  #   stop()s if any element is outside (0, 1] or if the elements do not
  #   sum to 1.
  # Examples:
  #   H(c(A=0.25, C=0.25, G=0.25, T=0.25))  # 2 bits entropy in a random
  #                                         # nucleotide sequence
  #   H(1)     # If all elements are the same, entropy is zero
  #
  # all.equal() returns either TRUE or a character string that describes the
  # difference - it never returns FALSE. The near-equality test therefore has
  # to be written as !isTRUE(...); isFALSE(...) would never trigger.
  if (any(pmf <= 0 | pmf > 1) || !isTRUE(all.equal(1.0, sum(pmf)))) {
    stop("Input is not a discrete probability distribution.")
  }
  # dividing the natural logarithm by log(2) converts nats to bits
  entropy <- -sum(pmf * (log(pmf) / log(2)))
  return(entropy)
}
# Why use all.equal()? Exact comparisons with floating point numbers are
# brittle. Consider for example:
1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6 == 1
print(1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6, digits = 22) # 0.9999999999999998889777
# all.equal() tests for _near_ equality with tolerance of ~ 1.5e-8
# Entropy of the database frequencies (in bits):
(Href <- H(AAref))
# for comparison: entropy if all amino acids are equiprobable
H(rep(0.05, 20))
# Set up a simulation to estimate the distribution of Information values
# from random sequences drawn from AAref. This is the distribution for the
# statistical null hypothesis:
nObs <- 15                          # number of observations
                                    # (e.g. aligned sequences)
# nObs <- 80
nTrials <- 10000                    # number of trials
IObs <- numeric(nTrials)            # vector to store Information in each trial
simCounts <- numeric(length(AAref)) # vector to tabulate our observations ...
names(simCounts) <- names(AAref)    # ... with the names of AAref
for (i in seq_len(nTrials)) {       # simulate ... (seq_len() is safe for 0)
  # sample AAref letters, nObs times, with the probabilities of AAref:
  AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE)
  x <- table(AAobs)                     # table simulated observations
  simCounts[] <- 0                      # reset all counts to 0, keep the names
  simCounts[names(x)] <- x              # overwrite with observed counts
  simCounts <- simCounts + 0.5          # add Jeffreys' pseudocounts
  Hobs <- H(simCounts / sum(simCounts)) # counts to frequency, calc. H
  IObs[i] <- Href - Hobs                # store information
}
# evaluate
hist(IObs, col = "#C9F4E3", xlim = c(-0.2, 1.0), breaks = 25)
abline(v = quantile(IObs, c(0.05, 0.95)), col = "#AA00CC")
# The purple lines are drawn at the 5% and 95% quantiles of the IObs
# distribution - i.e. an actual observation that lies outside the purple lines
# is deemed "significant"(1)(2). Of course, this is only true to the degree
# that the database frequencies are a valid model for the null-hypothesis on
# the sequence position we are considering here.
# (1) If we use 5% quantiles, this means a value is significantly larger
# than expected, and we ignore cases when the value is < 0; if we
# consider both smaller and larger values, we need to use 2.5% quantiles,
# since 5% of all observations lie outside the 0.025 and 0.975
# quantiles.
#
# (2) For an actual observation of counts, we calculate its observed
# _empirical_p_Value_ as (nCounts + 1)/(nTotal + 1).
# You can probably now appreciate that information is a bit of a shortcut for
# biological sequences, and does not really take the different inherent
# frequencies based on the character of the amino acids into account. For
# example, L is the most frequent and C is the least frequent, but if we have an
# alignment of 1000 sequences and we see that the frequencies for L and C are
# swapped, that would be _very_ surprising - nevertheless, the information would
# be 0. In order to take that into account, we should actually compute
# Kullback-Leibler divergences.
# Swap C and L frequencies
p <- AAref
q <- AAref
q["L"] <- AAref["C"]
q["C"] <- AAref["L"]
H(p)
H(q)
KLdiv <- function(p, q) {
  # Kullback-Leibler divergence of two discrete probability distributions.
  # Parameters:
  #   p, q  (numeric)  pmfs with the same outcomes; they are compared
  #                    element by element and must not contain any 0.
  # Value:
  #   sum(p * log(p / q)), computed with the natural logarithm.
  # Errors:
  #   stop()s if the two vectors differ in length or if either contains a 0.
  if (length(p) != length(q)) {
    stop("PANIC: input vector lengths differ!")
  }
  if (any(p == 0, q == 0)) {
    stop("PANIC: 0's found in input vectors!")
  }
  logRatio <- log(p / q)
  return(sum(p * logRatio))
}
KLdiv(p, p)
KLdiv(p, q)
nObs <- 15                          # number of observations
                                    # (e.g. aligned sequences)
# nObs <- 80
nTrials <- 10000                    # number of trials
KLdivObs <- numeric(nTrials)        # vector to store the KL divergence of
                                    # each trial
simCounts <- numeric(length(AAref)) # vector to tabulate our observations ...
names(simCounts) <- names(AAref)    # ... with the names of AAref
for (i in seq_len(nTrials)) {       # simulate ... (seq_len() is safe for 0)
  # sample AAref letters, nObs times, with the probabilities of AAref:
  AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE)
  x <- table(AAobs)                       # table simulated observations
  simCounts[] <- 0                        # reset all counts to 0, keep names
  simCounts[names(x)] <- x                # overwrite with observed counts
  simCounts <- simCounts + 0.5            # add Jeffreys' pseudocounts
  simCounts <- simCounts / sum(simCounts) # counts to frequency
  # use the KLdiv() function defined above rather than repeating its formula
  KLdivObs[i] <- KLdiv(simCounts, AAref)  # store KLdiv
}
# evaluate
hist(KLdivObs, col = "#C9F4E3", breaks = 25)
abline(v = quantile(KLdivObs, c(0.05, 0.95)), col = "#AA00CC")
quantile(KLdivObs, 0.992)
# Running the simulation with KL does not give a fundamentally
# different behaviour - since we are just randomly sampling. But KL would be
# more sensitive in case there is biological selection, where the sampling is no
# longer random. If I run the same simulation, with nObs <- 80 but calculating
# KLdiv instead of information, I get a 5% quantile at 0.15 - but the C/L
# frequency swap gives me a KL divergence of 0.18 - this is significant at p =
# 0.008 - (remember, Information is 0 in this case). So that's actually quite a
# nice addition to the toolbox.
# [END]

File diff suppressed because it is too large Load Diff

View File

@ -1,351 +1,351 @@
# tocID <- "FND-STA-Significance.R"
#
#
# Purpose: A Bioinformatics Course:
# R code accompanying the FND-STA-Significance unit.
#
# Version: 1.3
#
# Date: 2017-09 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.3 2020 Maintenance. Add sample solution.
# 1.2 Update set.seed() usage
# 1.1 Corrected treatment of empirical p-value
# 1.0 First contents
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ------------------------------------------------------------------
#TOC> 1 Significance and p-value 49
#TOC> 1.1 Significance levels 60
#TOC> 1.2 probability and p-value 77
#TOC> 1.2.1 p-value illustrated 109
#TOC> 2 One- or two-sided 165
#TOC> 3 Significance by integration 209
#TOC> 4 Significance by simulation or permutation 215
#TOC> 5 Final tasks 327
#TOC> 6 Sample solutions 336
#TOC> 6.1 338
#TOC> 6.2 342
#TOC> 6.3 346
#TOC>
#TOC> ==========================================================================
# = 1 Significance and p-value ============================================
# The idea of the probability of an event has a precise mathematical
# interpretation, but how is it useful to know the probability? Usually we are
# interested in whether we should accept or reject a hypothesis based on the
# observations we have. A rational way to do this is to say: if the probability
# of observing the data is very small under the null-hypothesis, then we will
# assume the observation is due to something other than the null-hypothesis. But
# what do we mean by the "probability of our observation"? And what is "very
# small"?
# == 1.1 Significance levels ===============================================
# A "very small" probability is purely a matter of convention - a cultural
# convention. In the biomedical field we usually call probabilities of less than
# 0.05 (5%) small enough to reject the null-hypothesis. Thus we call
# observations with a probability of less than 0.05 "significant" and if we want
# to highlight this in text or in a graph, we often mark them with an asterisk
# (*). Also we often call observations with a probability of less than 0.01
# "highly significant" and mark them with two asterisks (**). But there is no
# special significance in these numbers, the cutoff point for significance could
# also be 0.0498631, or 0.03, or 1/(pi^3). 0.05 is just the value that the
# British statistician Ronald Fisher happened to propose for this purpose in
# 1925. Incidentally, Fisher later recommended to use different cutoffs for
# different purposes (cf.
# https://en.wikipedia.org/wiki/Statistical_significance).
# == 1.2 probability and p-value ===========================================
# But what do we even mean by the probability of an observation?
# Assume I am drawing samples from a normal distribution with a mean of 0 and a
# standard deviation of 1. The sample I get is ...
set.seed(sqrt(5))
x <- rnorm(1)
set.seed(NULL)
print(x, digits = 22)
# [1] -0.8969145466249813791748
# So what's the probability of that number? Obviously, the probability of
# getting exactly this number is very, very, very small. But also obviously,
# this does not mean that observing this number is in any way significant - we
# always observe some number. That's not what we mean in this case. There are
# several implicit assumptions when we speak of the probability of an
# observation:
# 1: the observation can be compared to a probability distribution;
# 2: that distribution can be integrated between any specific value
# and its upper and lower bounds (or +- infinity).
# Then what we really mean by the probability of an observation in the context
# of that distribution is: the probability of observing that value, or a value
# more extreme than the one we have. We call this the p-value. Note that we are
# not talking about an individual number anymore, we are talking about the area
# under the curve between our observation and the upper (or lower) bound of the
# curve, as a fraction of the whole.
# === 1.2.1 p-value illustrated
# Let's illustrate. First we draw a million random values from our
# standard, normal distribution:
N <- 1e6 # one million
set.seed(112358) # set RNG seed for repeatable randomness
r <- rnorm(N) # N values from a normal distribution
set.seed(NULL) # reset the RNG
# Let's see what the distribution looks like:
(h <- hist(r))
# The histogram details are now available in the list h - e.g. h$counts
# Where is the value we have drawn previously?
abline(v = x, col = "#EE0000")
# How many values are smaller?
sum(r < x)
# Let's color the bars:
# first, make a vector of red and green colors for the bars with breaks
# smaller and larger than x, white for the bar that contains x ...
hCol <- rep("#EE000044", sum(h$breaks < x) - 1)
hCol <- c(hCol, "#FFFFFFFF")
hCol <- c(hCol, rep("#00EE0044", sum(h$breaks > x) - 1))
# ... then plot the histogram, with colored bars ...
hist(r, col = hCol)
# ... add two colored rectangles into the white bar ...
idx <- sum(h$breaks < x)
xMin <- h$breaks[idx]
xMax <- h$breaks[idx + 1]
y <- h$counts[idx]
rect(xMin, 0, x, y, col = "#EE000044", border = TRUE)
rect(x, 0, xMax, y, col = "#00EE0044", border = TRUE)
# ... and a red line for our observation.
abline(v = x, col = "#EE0000", lwd = 2)
# The p-value of our observation is the red area as a fraction of the
# whole histogram (red + green).
# Task:
# Explain how the expression sum(r < x) works to give us a count of values
# with the property we are looking for. E.g., examine -4:4 < x
# Task:
# Write an expression to estimate the probability that a value
# drawn from the vector r is less-or-equal to x. The result you get
# will depend on the exact values that went into the vector r but it should
# be close to 0.185 That expression is the p-value associated with x.
# (Sample solution 6.1)
# = 2 One- or two-sided ===================================================
# The shape of our histogram confirms that the rnorm() function has returned
# values that appear distributed according to a normal distribution. In a normal
# distribution, readily available tables tell us that 5% of the values (i.e. our
# significance level) lie 1.96 (or approximately 2) standard deviations away
# from the mean. Is this the case here? How many values in our vector r are
# larger than 1.96?
sum(r > 1.96)
# [1] 24589
# Wait - that's about 2.5% of 1,000,000, not 5% as expected. Why?
# The answer is: we have to be careful with two-sided distributions. 2 standard
# deviations away from the mean means either larger or smaller than 1.96 . This
# can give rise to errors. If we are simply interested in outliers, no
# matter larger or smaller, then the 1.96 SD cutoff for significance is correct.
# But if we are specifically interested in, say, larger values, because a
# smaller value is not meaningful, then the significance cutoff, expressed as
# standard deviations, is relaxed. We can use the quantile function to see what
# the cutoff values are:
quantile(r)
quantile(r, probs = c(0.025, 0.975)) # for the symmetric 2.5% boundaries
# close to ± 1.96, as expected
quantile(r, probs = 0.95) # for the single 5% boundary
# close to 1.64 . Check counts to confirm:
sum(r > quantile(r, probs = 0.95))
# [1] 50000
# which is 5%, as expected.
# Task:
# Use abline() to add the p = 0.05 boundary for smaller values to the histogram.
# (Sample solution 6.2)
# To summarize: when we evaluate the significance of an event, we divide a
# probability distribution into two parts at the point where the event was
# observed. We then ask whether the integral over the more extreme part is less
# or more than 5% of the whole. If it is less, we deem the event to be
# significant.
#
# = 3 Significance by integration =========================================
# If the underlying probability distribution can be analytically or numerically
# integrated, the significance of an observation can be directly computed.
# = 4 Significance by simulation or permutation ===========================
# But whether the integration is correct, or relies on assumptions that may not
# be warranted for biological data, can be a highly technical question.
# Fortunately, we can often simply run a simulation, a random resampling, or a
# permutation and then count the number of outcomes, just as we did with our
# rnorm() samples. We call this an empirical p-value. (Actually, the "empirical
# p-value" is defined as (Nobs + 1) / (N + 1). )
# Here is an example. Assume you have a protein sequence and
# you speculate that positively charged residues are close to negatively charged
# residues to balance charge locally. A statistic that would capture this is the
# mean minimum distance between all D,E residues and the closest R,K,H
# residue. Let's compute this for the sequence of yeast Mbp1.
MBP1 <- paste0("MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK",
"ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA",
"SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR",
"KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ",
"QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS",
"PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY",
"FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS",
"IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP",
"SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT",
"ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP",
"VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK",
"IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR",
"QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK",
"IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA")
# first we split this string into individual characters:
v <- unlist(strsplit(MBP1, ""))
# and find the positions of our charged residues
ED <- grep("[ED]", v)
RKH <- grep("[RKH]", v)
sep <- numeric(length(ED)) # this vector will hold the distances
for (i in seq_along(ED)) {
sep[i] <- min(abs(RKH - ED[i]))
}
# Task: read and explain this bit of code
# Now that sep is computed, what does it look like?
table(sep) # these are the minimum distances
# 24 of D,E residues are adjacent to R,K,H;
# the longest separation is 28 residues.
# What is the mean separation?
mean(sep)
# The value is 4.1 . Is this significant? Honestly, I would be hard pressed
# to solve this analytically. But by permutation it's soooo easy.
# First, we combine what we have done above into a function:
chSep <- function(v) {
  # Mean minimum separation of oppositely charged residues.
  # Parameter: v  (char)  a vector of amino acids in the one-letter code
  #                       (upper- or lowercase)
  # Value:     (numeric)  the mean, taken over all D/E positions, of the
  #                       distance to the closest R/K/H position
  acidic <- grep("[EDed]", v)    # indices of D and E residues
  basic <- grep("[RKHrkh]", v)   # indices of R, K and H residues
  # for every D/E index: distance to the nearest R/K/H index
  nearest <- vapply(acidic,
                    function(pos) min(abs(basic - pos)),
                    numeric(1))
  return(mean(nearest))
}
# Execute the function to define it.
# Confirm that the function gives the same result as the number we
# calculated above:
chSep(v)
# Now we can produce a random permutation of v, and recalculate
set.seed(pi) # set RNG seed for repeatable randomness
w <- sample(v, length(v)) # This shuffles the vector v. Memorize this
# code paradigm. It is very useful.
set.seed(NULL) # reset the RNG
chSep(w)
# 3.773 ... that's actually less than what we had before.
# Let's do this 10000 times and record the results (takes a few seconds):
N <- 10000
chs <- numeric(N)
for (i in 1:N) {
chs[i] <- chSep(sample(v, length(v))) # charge
}
hist(chs, breaks = 50)
abline(v = chSep(v), col = "#EE0000")
# Contrary to our expectations, the actual observed mean minimum charge
# separation seems to be larger than what we observe in randomly permuted
# sequences. But is this significant? Your task to find out.
# Task:
# Calculate the empirical p-value for chSep(v)
# (Sample solution 6.3)
# = 5 Final tasks =========================================================
# From chs, compute the empirical p-value of a mean minimum charge separation to
# be larger or equal to the value observed for the yeast MBP1 sequence. Note
# the result in your journal. Is it significant? Also note the result of
# the following expression for validation:
seal(sum(chs))
# = 6 Sample solutions ====================================================
# == 6.1 ==================================================================
#
sum(r <= x) / length(r)
# == 6.2 ==================================================================
#
abline(v = quantile(r, probs = c(0.05)))
# == 6.3 ==================================================================
#
( x <- (sum(chs >= chSep(v)) + 1) / (length(chs) + 1) )
# [END]
# tocID <- "FND-STA-Significance.R"
#
#
# Purpose: A Bioinformatics Course:
# R code accompanying the FND-STA-Significance unit.
#
# Version: 1.3
#
# Date: 2017-09 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.3 2020 Maintenance. Add sample solution.
# 1.2 Update set.seed() usage
# 1.1 Corrected treatment of empirical p-value
# 1.0 First contents
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ------------------------------------------------------------------
#TOC> 1 Significance and p-value 49
#TOC> 1.1 Significance levels 60
#TOC> 1.2 probability and p-value 77
#TOC> 1.2.1 p-value illustrated 109
#TOC> 2 One- or two-sided 165
#TOC> 3 Significance by integration 209
#TOC> 4 Significance by simulation or permutation 215
#TOC> 5 Final tasks 327
#TOC> 6 Sample solutions 336
#TOC> 6.1 338
#TOC> 6.2 342
#TOC> 6.3 346
#TOC>
#TOC> ==========================================================================
# = 1 Significance and p-value ============================================
# The idea of the probability of an event has a precise mathematical
# interpretation, but how is it useful to know the probability? Usually we are
# interested in whether we should accept or reject a hypothesis based on the
# observations we have. A rational way to do this is to say: if the probability
# of observing the data is very small under the null-hypothesis, then we will
# assume the observation is due to something other than the null-hypothesis. But
# what do we mean by the "probability of our observation"? And what is "very
# small"?
# == 1.1 Significance levels ===============================================
# A "very small" probability is purely a matter of convention - a cultural
# convention. In the biomedical field we usually call probabilities of less than
# 0.05 (5%) small enough to reject the null-hypothesis. Thus we call
# observations with a probability of less than 0.05 "significant" and if we want
# to highlight this in text or in a graph, we often mark them with an asterisk
# (*). Also we often call observations with a probability of less than 0.01
# "highly significant" and mark them with two asterisks (**). But there is no
# special significance in these numbers, the cutoff point for significance could
# also be 0.0498631, or 0.03, or 1/(pi^3). 0.05 is just the value that the
# British statistician Ronald Fisher happened to propose for this purpose in
# 1925. Incidentally, Fisher later recommended to use different cutoffs for
# different purposes (cf.
# https://en.wikipedia.org/wiki/Statistical_significance).
# == 1.2 probability and p-value ===========================================
# But what do we even mean by the probability of an observation?
# Assume I am drawing samples from a normal distribution with a mean of 0 and a
# standard deviation of 1. The sample I get is ...
set.seed(sqrt(5))
x <- rnorm(1)
set.seed(NULL)
print(x, digits = 22)
# [1] -0.8969145466249813791748
# So what's the probability of that number? Obviously, the probability of
# getting exactly this number is very, very, very small. But also obviously,
# this does not mean that observing this number is in any way significant - we
# always observe some number. That's not what we mean in this case. There are
# several implicit assumptions when we speak of the probability of an
# observation:
# 1: the observation can be compared to a probability distribution;
# 2: that distribution can be integrated between any specific value
# and its upper and lower bounds (or +- infinity).
# Then what we really mean by the probability of an observation in the context
# of that distribution is: the probability of observing that value, or a value
# more extreme than the one we have. We call this the p-value. Note that we are
# not talking about an individual number anymore, we are talking about the area
# under the curve between our observation and the upper (or lower) bound of the
# curve, as a fraction of the whole.
# === 1.2.1 p-value illustrated
# Let's illustrate. First we draw a million random values from our
# standard, normal distribution:
N <- 1e6 # one million
set.seed(112358) # set RNG seed for repeatable randomness
r <- rnorm(N) # N values from a normal distribution
set.seed(NULL) # reset the RNG
# Let's see what the distribution looks like:
(h <- hist(r))
# The histogram details are now available in the list h - e.g. h$counts
# Where is the value we have drawn previously?
abline(v = x, col = "#EE0000")
# How many values are smaller?
sum(r < x)
# Let's color the bars:
# first, make a vector of red and green colors for the bars with breaks
# smaller and larger than x, white for the bar that contains x ...
hCol <- rep("#EE000044", sum(h$breaks < x) - 1)
hCol <- c(hCol, "#FFFFFFFF")
hCol <- c(hCol, rep("#00EE0044", sum(h$breaks > x) - 1))
# ... then plot the histogram, with colored bars ...
hist(r, col = hCol)
# ... add two colored rectangles into the white bar ...
idx <- sum(h$breaks < x)
xMin <- h$breaks[idx]
xMax <- h$breaks[idx + 1]
y <- h$counts[idx]
rect(xMin, 0, x, y, col = "#EE000044", border = TRUE)
rect(x, 0, xMax, y, col = "#00EE0044", border = TRUE)
# ... and a red line for our observation.
abline(v = x, col = "#EE0000", lwd = 2)
# The p-value of our observation is the red area as a fraction of the
# whole histogram (red + green).
# Task:
# Explain how the expression sum(r < x) works to give us a count of values
# with the property we are looking for. E.g., examine -4:4 < x
# Task:
# Write an expression to estimate the probability that a value
# drawn from the vector r is less-or-equal to x. The result you get
# will depend on the exact values that went into the vector r but it should
# be close to 0.185 That expression is the p-value associated with x.
# (Sample solution 6.1)
# = 2 One- or two-sided ===================================================
# The shape of our histogram confirms that the rnorm() function has returned
# values that appear distributed according to a normal distribution. In a normal
# distribution, readily available tables tell us that 5% of the values (i.e. our
# significance level) lie 1.96 (or approximately 2) standard deviations away
# from the mean. Is this the case here? How many values in our vector r are
# larger than 1.96?
sum(r > 1.96)
# [1] 24589
# Wait - that's about 2.5% of 1,000,000, not 5% as expected. Why?
# The answer is: we have to be careful with two-sided distributions. 2 standard
# deviations away from the mean means either larger or smaller than 1.96 . This
# can give rise to errors. If we are simply interested in outliers, no
# matter larger or smaller, then the 1.96 SD cutoff for significance is correct.
# But if we are specifically interested in, say, larger values, because a
# smaller value is not meaningful, then the significance cutoff, expressed as
# standard deviations, is relaxed. We can use the quantile function to see what
# the cutoff values are:
quantile(r)
quantile(r, probs = c(0.025, 0.975)) # for the symmetric 2.5% boundaries
# close to ± 1.96, as expected
quantile(r, probs = 0.95) # for the single 5% boundary
# close to 1.64 . Check counts to confirm:
sum(r > quantile(r, probs = 0.95))
# [1] 50000
# which is 5%, as expected.
# Task:
# Use abline() to add the p = 0.05 boundary for smaller values to the histogram.
# (Sample solution 6.2)
# To summarize: when we evaluate the significance of an event, we divide a
# probability distribution into two parts at the point where the event was
# observed. We then ask whether the integral over the more extreme part is less
# or more than 5% of the whole. If it is less, we deem the event to be
# significant.
#
# = 3 Significance by integration =========================================
# If the underlying probability distribution can be analytically or numerically
# integrated, the significance of an observation can be directly computed.
# = 4 Significance by simulation or permutation ===========================
# But whether the integration is correct, or relies on assumptions that may not
# be warranted for biological data, can be a highly technical question.
# Fortunately, we can often simply run a simulation, a random resampling, or a
# permutation and then count the number of outcomes, just as we did with our
# rnorm() samples. We call this an empirical p-value. (Actually, the "empirical
# p-value" is defined as (Nobs + 1) / (N + 1). )
# Here is an example. Assume you have a protein sequence and
# you speculate that positively charged residues are close to negatively charged
# residues to balance charge locally. A statistic that would capture this is the
# mean minimum distance between all D,E residues and the closest R,K,H
# residue. Let's compute this for the sequence of yeast Mbp1.
MBP1 <- paste0("MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK",
"ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA",
"SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR",
"KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ",
"QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS",
"PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY",
"FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS",
"IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP",
"SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT",
"ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP",
"VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK",
"IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR",
"QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK",
"IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA")
# first we split this string into individual characters:
v <- unlist(strsplit(MBP1, ""))
# and find the positions of our charged residues
ED <- grep("[ED]", v)
RKH <- grep("[RKH]", v)
sep <- numeric(length(ED)) # this vector will hold the distances
for (i in seq_along(ED)) {
sep[i] <- min(abs(RKH - ED[i]))
}
# Task: read and explain this bit of code
# Now that sep is computed, what does it look like?
table(sep) # these are the minimum distances
# 24 of D,E residues are adjacent to R,K,H;
# the longest separation is 28 residues.
# What is the mean separation?
mean(sep)
# The value is 4.1 . Is this significant? Honestly, I would be hard pressed
# to solve this analytically. But by permutation it's soooo easy.
# First, we combine what we have done above into a function:
chSep <- function(v) {
  # Mean minimum separation of oppositely charged residues.
  # Parameter: v  (char)  a vector of amino acids in the one-letter code
  #                       (upper- or lowercase)
  # Value:     (numeric)  the mean, taken over all D/E positions, of the
  #                       distance to the closest R/K/H position
  acidic <- grep("[EDed]", v)    # indices of D and E residues
  basic <- grep("[RKHrkh]", v)   # indices of R, K and H residues
  # for every D/E index: distance to the nearest R/K/H index
  nearest <- vapply(acidic,
                    function(pos) min(abs(basic - pos)),
                    numeric(1))
  return(mean(nearest))
}
# Execute the function to define it.
# Confirm that the function gives the same result as the number we
# calculated above:
chSep(v)
# Now we can produce a random permutation of v, and recalculate
set.seed(pi) # set RNG seed for repeatable randomness
w <- sample(v, length(v)) # This shuffles the vector v. Memorize this
# code paradigm. It is very useful.
set.seed(NULL) # reset the RNG
chSep(w)
# 3.773 ... that's actually less than what we had before.
# Let's do this 10000 times and record the results (takes a few seconds):
N <- 10000
chs <- numeric(N)
for (i in 1:N) {
chs[i] <- chSep(sample(v, length(v))) # charge
}
hist(chs, breaks = 50)
abline(v = chSep(v), col = "#EE0000")
# Contrary to our expectations, the actual observed mean minimum charge
# separation seems to be larger than what we observe in randomly permuted
# sequences. But is this significant? Your task to find out.
# Task:
# Calculate the empirical p-value for chSep(v)
# (Sample solution 6.3)
# = 5 Final tasks =========================================================
# From chs, compute the empirical p-value of a mean minimum charge separation to
# be larger or equal to the value observed for the yeast MBP1 sequence. Note
# the result in your journal. Is it significant? Also note the result of
# the following expression for validation:
seal(sum(chs))
# = 6 Sample solutions ====================================================
# == 6.1 ==================================================================
#
sum(r <= x) / length(r)
# == 6.2 ==================================================================
#
abline(v = quantile(r, probs = c(0.05)))
# == 6.3 ==================================================================
#
( x <- (sum(chs >= chSep(v)) + 1) / (length(chs) + 1) )
# [END]

View File

@ -1,3 +1,3 @@
# BCH441-WORK-ABC-units
# BCH441-WORK-ABC-units
This is a fork of the project [ABC-units](https://github.com/hyginn/ABC-units) designed for BCH441. This setup allows changes to be committed here but updates pushed to the original repository can be fetched and pulled to keep up to date.

View File

@ -1,245 +1,245 @@
# tocID <- "RPR-Biostrings.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Biostrings unit.
#
# Version: 1.2
#
# Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 2020 Updates
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use BiocManager:: not biocLite()
# 1.0 2017 Revisions
# 0.1 First code copied from 2016 material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -----------------------------------------------------------------
#TOC> 1 The Biostrings:: Package 56
#TOC> 2 Getting Data into Biostrings:: Objects 88
#TOC> 3 Working with Biostrings:: Objects 110
#TOC> 3.1 Properties 127
#TOC> 3.2 Subsetting 168
#TOC> 3.3 Operators 180
#TOC> 3.4 Transformations 187
#TOC> 4 Getting Data out of Biostrings:: Objects 194
#TOC> 5 More 203
#TOC> 5.1 Views 205
#TOC> 5.2 Iranges 219
#TOC> 5.3 StringSets 225
#TOC>
#TOC> ==========================================================================
# This is a very brief introduction to the Biostrings:: package, other units will
# be using more of the Biostrings:: functions.
# = 1 The Biostrings:: Package ============================================
# First, we install and load the Biostrings:: package from bioconductor (if we
# haven't done so already).
if (! requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
if (! requireNamespace("Biostrings", quietly = TRUE)) {
BiocManager::install("Biostrings")
}
# Examine the package information:
library(help = Biostrings) # basic information
browseVignettes("Biostrings") # available vignettes
data(package = "Biostrings") # available datasets
# At its core, Biostrings:: objects are "classes" of type XString (you can think
# of a "class" in R as a special kind of list), that can take on particular
# flavours for RNA, DNA or amino acid sequence information.
class(Biostrings::RNAString("AUG"))
class(Biostrings::DNAString("ATG"))
class(Biostrings::AAString("M"))
# An essential property of Biostrings:: objects is that they only allow letters
# from the applicable IUPAC alphabet:
Biostrings::RNAString("AUG")
Biostrings::DNAString("AUG") # Error! No "U" in IUPAC DNA codes
# = 2 Getting Data into Biostrings:: Objects ==============================
# Example: read FASTA. Extract sequence. Convert to DNAString object.
rawSeq <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
rawSeq <- dbSanitizeSequence(rawSeq)
biosDNAseq <- Biostrings::DNAString(rawSeq) # converts the nucleotide sequence
# into an object of class DNAString
# Multi FASTA files can be read directly as an "XStringSet" ...
rawMFAfile <- "./data/S288C_YDL056W_MBP1_coding.fsa"
(biosDNASet <- Biostrings::readDNAStringSet(rawMFAfile))
# ... and if you subset one sequence from the set, you get an XString object
# back again.
(Xseq <- biosDNASet[[1]])
biosDNAseq == Xseq # the comparison evaluates to TRUE ...
identical(biosDNAseq, Xseq) # ... and indeed the objects are deemed identical.
# = 3 Working with Biostrings:: Objects ===================================
# Biostrings:: is a highly engineered package that is tightly integrated into
# the Bioconductor world - unfortunately that brings with it a somewhat
# undesirable level of computational overhead and dependencies. Using the
# package as we normally do - i.e. calling required functions with their
# explicit package prefix is therefore not advisable. There are generics
# that won't be properly dispatched. If you only need a small number of
# functions for a very specific context, you will probably get away with
# Biostrings::<function>() - but even in the demonstration code of this script
# not everything works out of the box. We'll therefore load the library,
# but we'll (redundantly) use the prefix anyway so as to emphasize where
# the functions come from.
library(Biostrings)
# == 3.1 Properties ========================================================
str(rawSeq)
str(biosDNAseq)
length(rawSeq) # ... is 1: one string only. To get the number of
# characters in a string, you need nchar().
length(biosDNAseq) # but the length of a "Bstring" is the number of elements
nchar(rawSeq)
nchar(biosDNAseq) # ... but nchar() works too.
(uL <- Biostrings::uniqueLetters(biosDNAseq))
# Count frequencies - with strings, you would strsplit() into a character
# vector and then use table(). Biostrings:: provides alphabetFrequency():
Biostrings::alphabetFrequency(biosDNAseq)
# letterFrequency() works with a defined alphabet - such as what uniqueLetters()
# returns.
Biostrings::letterFrequency(biosDNAseq, uL)
sum(Biostrings::letterFrequency(biosDNAseq, c("G", "C"))) /
length(biosDNAseq) # GC contents
Biostrings::dinucleotideFrequency(biosDNAseq)
barplot(sort(Biostrings::dinucleotideFrequency(biosDNAseq)), cex.names = 0.5)
(triNuc <- Biostrings::trinucleotideFrequency(biosDNAseq))
barplot(sort(triNuc), col="#4499EE33")
triNuc[triNuc == max(triNuc)]
triNuc[triNuc == min(triNuc)]
max(triNuc) / min(triNuc) # AAA is more than 13 times as frequent as CGT
# compare to a shuffled sequence:
(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))
barplot(sort(triNuc), col="#EEEE4433", add = TRUE)
max(triNuc)
# Interpret this plot.
(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))
barplot(sort(triNuc), col="#EEEE4433")
max(triNuc)
# == 3.2 Subsetting ========================================================
# Subsetting any XString object works as expected:
biosDNAseq[4:15]
# ... well - maybe not expected, because rawSeq[4:15] would not work.
# Alternatively to the "[" operator, use the subseq() function - especially for
# long sequences. This is far more efficient.
Biostrings::subseq(biosDNAseq, start = 1, end = 30)
# == 3.3 Operators =========================================================
# RNAString and DNAString objects compare U and T as equals!
Biostrings::RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") ==
Biostrings::DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT")
# == 3.4 Transformations ===================================================
biosDNAseq[4:15]
Biostrings::reverseComplement(biosDNAseq[4:15])
Biostrings::translate(biosDNAseq[4:15])
# = 4 Getting Data out of Biostrings:: Objects ============================
# If you need a character object, use toString():
Biostrings::toString(biosDNAseq[4:15])
# saveRDS() and readRDS() work just as they do for all other R objects.
# = 5 More ================================================================
# == 5.1 Views =============================================================
# Biostring "Views" are objects that store multiple substrings of one
# Biostring object.
(myView <- Biostrings::Views(biosDNAseq,
start = c(1, 19, 37),
end = c(15, 30, 45)))
# Views are convenient to store feature annotations
names(myView) <- c("Feature-A", "Feature-B", "Feature-C")
# Print one line per view: its name, its width, and its sequence. width() is
# called with the explicit package prefix, as everywhere else in this script.
cat(sprintf("\n%s\t(%d)\t%s",
            names(myView), Biostrings::width(myView), myView))
# == 5.2 Iranges ===========================================================
# Biostrings:: Iranges are like Views with a common start point. These can be
# useful for feature annotations. Instead of start/end you store start/width.
# == 5.3 StringSets ========================================================
# Biostring "StringSets" store multiple sequences.
#
ompA <- Biostrings::AAString("MKKTAIAIAVALAGFATVAQA")
sample(ompA) # sample can work directly on a Biostring object to shuffle it
x <- Biostrings::toString(ompA)
for (i in 2:10) {
x[i] <- Biostrings::toString(sample(ompA))
}
shuffledPeptideSet <- Biostrings::AAStringSet(x)
names(shuffledPeptideSet) <- c("ompA", paste("shuffle.", 1:9, sep=""))
shuffledPeptideSet
length(shuffledPeptideSet)
Biostrings::width(shuffledPeptideSet)
Biostrings::alphabetFrequency(shuffledPeptideSet)
# [END]
# tocID <- "RPR-Biostrings.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Biostrings unit.
#
# Version: 1.2
#
# Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 2020 Updates
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use BiocManager:: not biocLite()
# 1.0 2017 Revisions
# 0.1 First code copied from 2016 material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -----------------------------------------------------------------
#TOC> 1 The Biostrings:: Package 56
#TOC> 2 Getting Data into Biostrings:: Objects 88
#TOC> 3 Working with Biostrings:: Objects 110
#TOC> 3.1 Properties 127
#TOC> 3.2 Subsetting 168
#TOC> 3.3 Operators 180
#TOC> 3.4 Transformations 187
#TOC> 4 Getting Data out of Biostrings:: Objects 194
#TOC> 5 More 203
#TOC> 5.1 Views 205
#TOC> 5.2 Iranges 219
#TOC> 5.3 StringSets 225
#TOC>
#TOC> ==========================================================================
# This is a very brief introduction to the Biostrings:: package, other units will
# be using more of the Biostrings:: functions.
# = 1 The Biostrings:: Package ============================================
# First, we install and load the Biostrings:: package from bioconductor (if we
# haven't done so already).
if (! requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
if (! requireNamespace("Biostrings", quietly = TRUE)) {
BiocManager::install("Biostrings")
}
# Examine the package information:
library(help = Biostrings) # basic information
browseVignettes("Biostrings") # available vignettes
data(package = "Biostrings") # available datasets
# At its core, Biostrings:: objects are "classes" of type XString (you can think
# of a "class" in R as a special kind of list), that can take on particular
# flavours for RNA, DNA or amino acid sequence information.
class(Biostrings::RNAString("AUG"))
class(Biostrings::DNAString("ATG"))
class(Biostrings::AAString("M"))
# An essential property of Biostrings:: objects is that they only allow letters
# from the applicable IUPAC alphabet:
Biostrings::RNAString("AUG")
Biostrings::DNAString("AUG") # Error! No "U" in IUPAC DNA codes
# = 2 Getting Data into Biostrings:: Objects ==============================
# Example: read FASTA. Extract sequence. Convert to DNAString object.
rawSeq <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
rawSeq <- dbSanitizeSequence(rawSeq)
biosDNAseq <- Biostrings::DNAString(rawSeq) # converts the nucleotide sequence
# into an object of class DNAString
# Multi FASTA files can be read directly as an "XStringSet" ...
rawMFAfile <- "./data/S288C_YDL056W_MBP1_coding.fsa"
(biosDNASet <- Biostrings::readDNAStringSet(rawMFAfile))
# ... and if you subset one sequence from the set, you get an XString object
# back again.
(Xseq <- biosDNASet[[1]])
biosDNAseq == Xseq # the comparison evaluates to TRUE ...
identical(biosDNAseq, Xseq) # ... and indeed the objects are deemed identical.
# = 3 Working with Biostrings:: Objects ===================================
# Biostrings:: is a highly engineered package that is tightly integrated into
# the Bioconductor world - unfortunately that brings with it a somewhat
# undesirable level of computational overhead and dependencies. Using the
# package as we normally do - i.e. calling required functions with their
# explicit package prefix is therefore not advisable. There are generics
# that won't be properly dispatched. If you only need a small number of
# functions for a very specific context, you will probably get away with
# Biostrings::<function>() - but even in the demonstration code of this script
# not everything works out of the box. We'll therefore load the library,
# but we'll (redundantly) use the prefix anyway so as to emphasize where
# the functions come from.
library(Biostrings)
# == 3.1 Properties ========================================================
str(rawSeq)
str(biosDNAseq)
length(rawSeq) # ... is 1: one string only. To get the number of
# characters in a string, you need nchar().
length(biosDNAseq) # but the length of a "Bstring" is the number of elements
nchar(rawSeq)
nchar(biosDNAseq) # ... but nchar() works too.
(uL <- Biostrings::uniqueLetters(biosDNAseq))
# Count frequencies - with strings, you would strsplit() into a character
# vector and then use table(). Biostrings:: provides alphabetFrequency():
Biostrings::alphabetFrequency(biosDNAseq)
# letterFrequency() works with a defined alphabet - such as what uniqueLetters()
# returns.
Biostrings::letterFrequency(biosDNAseq, uL)
sum(Biostrings::letterFrequency(biosDNAseq, c("G", "C"))) /
length(biosDNAseq) # GC contents
Biostrings::dinucleotideFrequency(biosDNAseq)
barplot(sort(Biostrings::dinucleotideFrequency(biosDNAseq)), cex.names = 0.5)
(triNuc <- Biostrings::trinucleotideFrequency(biosDNAseq))
barplot(sort(triNuc), col="#4499EE33")
triNuc[triNuc == max(triNuc)]
triNuc[triNuc == min(triNuc)]
max(triNuc) / min(triNuc) # AAA is more than 13 times as frequent as CGT
# compare to a shuffled sequence:
(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))
barplot(sort(triNuc), col="#EEEE4433", add = TRUE)
max(triNuc)
# Interpret this plot.
(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))
barplot(sort(triNuc), col="#EEEE4433")
max(triNuc)
# == 3.2 Subsetting ========================================================
# Subsetting any XString object works as expected:
biosDNAseq[4:15]
# ... well - maybe not expected, because rawSeq[4:15] would not work.
# Alternatively to the "[" operator, use the subseq() function - especially for
# long sequences. This is far more efficient.
Biostrings::subseq(biosDNAseq, start = 1, end = 30)
# == 3.3 Operators =========================================================
# RNAString and DNAString objects compare U and T as equals!
Biostrings::RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") ==
Biostrings::DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT")
# == 3.4 Transformations ===================================================
biosDNAseq[4:15]
Biostrings::reverseComplement(biosDNAseq[4:15])
Biostrings::translate(biosDNAseq[4:15])
# = 4 Getting Data out of Biostrings:: Objects ============================
# If you need a character object, use toString():
Biostrings::toString(biosDNAseq[4:15])
# saveRDS() and readRDS() work just as they do for all other R objects.
# = 5 More ================================================================
# == 5.1 Views =============================================================
# Biostring "Views" are objects that store multiple substrings of one
# Biostring object.
(myView <- Biostrings::Views(biosDNAseq,
start = c(1, 19, 37),
end = c(15, 30, 45)))
# Views are convenient to store feature annotations
names(myView) <- c("Feature-A", "Feature-B", "Feature-C")
# Print one line per view: its name, its width, and its sequence. width() is
# called with the explicit package prefix, as everywhere else in this script.
cat(sprintf("\n%s\t(%d)\t%s",
            names(myView), Biostrings::width(myView), myView))
# == 5.2 Iranges ===========================================================
# Biostrings:: Iranges are like Views with a common start point. These can be
# useful for feature annotations. Instead of start/end you store start/width.
# == 5.3 StringSets ========================================================
# Biostring "StringSets" store multiple sequences.
#
ompA <- Biostrings::AAString("MKKTAIAIAVALAGFATVAQA")
sample(ompA) # sample can work directly on a Biostring object to shuffle it
x <- Biostrings::toString(ompA)
for (i in 2:10) {
x[i] <- Biostrings::toString(sample(ompA))
}
shuffledPeptideSet <- Biostrings::AAStringSet(x)
names(shuffledPeptideSet) <- c("ompA", paste("shuffle.", 1:9, sep=""))
shuffledPeptideSet
length(shuffledPeptideSet)
Biostrings::width(shuffledPeptideSet)
Biostrings::alphabetFrequency(shuffledPeptideSet)
# [END]

View File

@ -1,165 +1,165 @@
# tocID <- "RPR-ChimeraX_remote.R"
#
# Purpose: A Bioinformatics Course:
# R code demonstrating remote scripting of ChimeraX.
#
# Version: 1.0.1
#
# Date: 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.0.1 2021 Minimal updates
# 1.0 First ABC units version
#
#
# TODO:
# %-encode and escape quotes, or just pass-through?
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ------------------------------------------------------
#TOC> 1 ChimeraX REMOTE SCRIPTING 41
#TOC> 1.1 Defining a Port 59
#TOC> 1.2 Open ChimeraX 81
#TOC> 2 WORKED EXAMPLE: SUPERPOSITION 113
#TOC>
#TOC> ==========================================================================
# = 1 ChimeraX REMOTE SCRIPTING ===========================================
# One of the cool features of ChimeraX is that it can be driven by Python code,
# both within a running session and through Python scripts. What I find even
# cooler though is that ChimeraX can be driven from any programming language via
# its remote control function that can listen to commands sent from any other
# application. The interface that is used here is the standard REST (method) -
# the GET and POST verbs that ubiquitously underlie the communication of clients
# and servers on the Web.
# In order to establish the communication between this script and ChimeraX, all
# we need to do is:
# - open ChimeraX;
# - tell it to listen on a specific "port";
# - send commands to that port via httr::
# == 1.1 Defining a Port ===================================================
# The httr:: package needs to be available
if (! requireNamespace("httr", quietly = TRUE)) {
install.packages("httr")
}
# Package information:
# library(help = httr) # basic information
# browseVignettes("httr") # available vignettes
# data(package = "httr") # available datasets
# We need to think of a port. Any available port number between 49152-65535 is
# fine. We'll choose 61803 because that's the fractional part of the golden
# ratio. But one could choose another.
CXPORT <- 61803
# Check that our current version of R supports sockets (default since V 3.3)
capabilities("sockets") # MUST be TRUE. If not, don't continue.
# == 1.2 Open ChimeraX =====================================================
# - Open a fresh, new session of a recently updated version of ChimeraX
# - type:
#
# remotecontrol rest start port 61803
#
# ... or whatever the value of CXPORT is.
# Now watch what happens in ChimeraX when you execute the following line:
# The request URL is built from CXPORT so that it always matches the port
# defined above, instead of repeating the literal 61803 here.
( x <- httr::GET(sprintf("http://127.0.0.1:%d/run?command=open+1BM8",
                         CXPORT)) )
# The .utilities.R script includes the function CX(), based on this principle,
# through which you can send commands to ChimeraX
CX("camera sbs")
CX("lighting soft")
CX("color sequential #1 & protein target abc palette powderblue:orchid:white")
# The command echoes Chimera's response if the parameter "quietly" is
# FALSE (default), and we can silence output with quietly = TRUE :
CX("info models #1 attribute num_residues")
CX("info models #1 attribute num_residues", quietly = TRUE)
# Either way, the command also returns Chimera's responses "invisibly";
# i.e. we can use the results by assigning the output to a variable:
hBonds <- CX("hbonds #1 & protein makePseudobonds false log true", quietly=TRUE)
x <- read.table(file = textConnection(hBonds), skip = 9,
blank.lines.skip = TRUE, fill = TRUE)
hist(x[,13], main="H-bonds", xlab="D···A (Å)", ylab="counts", col="#c9dcff")
# = 2 WORKED EXAMPLE: SUPERPOSITION =======================================
# We superimpose the 1BM8 structure with the 1DUX crystal structure to be able
# to explore possible DNA binding regions in 1BM8
# The model for 1BM8 is already open as model 1 (#1)
CX("hide #1 cartoons") # hide model 1 cartoon representation
CX("open 1DUX") # assume this is opened as model #2
CX("hide #2") # hide everything ...
CX("select #2/C") # chain c (protein)
CX("show sel cartoons") # ... and show cartoons of chain c (protein)
CX("color sequential sel target c palette steelblue:darkmagenta")
CX("view #2/C") # re-center the display
CX("cofr #2/C:62@CA") # set pivot to an interface residue
CX("select #2/A,B & nucleic-acid") # chains A, B are the cognate DNA
CX("style sel stick")
CX("show sel target ab") # show atoms/bonds
CX("color sequential #2/A & nucleic-acid target ab palette teal:lightcyan")
CX("color sequential #2/B & nucleic-acid target ab palette teal:lightcyan")
CX("surface sel enclose sel") # compute joint accessible surface of both chains
CX("transparency 50")
CX("select clear")
# Now superimpose the 1BM8 chain onto 1DUX chain C
CX("show #1 cartoons")
CX("matchmaker #1/A to #2/C pairing ss") # the actual superposition
# study the general layout, and the position of the 1BM8 secondary structure
# elements relative to 1DUX
# Let's examine side chain orientations in more detail
CX("hide #2/C cartoons") # hide the 1DUX protein
# select all residues in 1BM8 that are within 3.5 A of the DNA chains (a, b)
CX("select zone #2/A,B 3.5 #1 & protein residues true")
CX("~select sel & H") # de-select H atoms
CX("show sel target ab")
CX("size stickRadius 0.4")
CX("select clear")
# The overall architecture of the Mbp1 APSES domain is a good match for the Elk
# transcription factor binding mode; the detailed conformations of side chains
# would need to change only to a minor degree. There is a very significant
# degree of structural similarity; remarkable, given that the DNA is not the
# target sequence of the Mbp1 transcription factor, AND the 1BM8 structure was
# determined without a DNA ligand.
CX("remotecontrol rest stop") # release the socket
# Done.
# [END]
# tocID <- "RPR-ChimeraX_remote.R"
#
# Purpose: A Bioinformatics Course:
# R code demonstrating remote scripting of ChimeraX.
#
# Version: 1.0.1
#
# Date: 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.0.1 2021 Minimal updates
# 1.0 First ABC units version
#
#
# TODO:
# %-encode and escape quotes, or just pass-through?
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ------------------------------------------------------
#TOC> 1 ChimeraX REMOTE SCRIPTING 41
#TOC> 1.1 Defining a Port 59
#TOC> 1.2 Open ChimeraX 81
#TOC> 2 WORKED EXAMPLE: SUPERPOSITION 113
#TOC>
#TOC> ==========================================================================
# = 1 ChimeraX REMOTE SCRIPTING ===========================================
# One of the cool features of ChimeraX is that it can be driven by Python code,
# both within a running session and through Python scripts. What I find even
# cooler though is that ChimeraX can be driven from any programming language via
# its remote control function that can listen to commands sent from any other
# application. The interface that is used here is the standard REST (method) -
# the GET and POST verbs that ubiquitously underlie the communication of clients
# and servers on the Web.
# In order to establish the communication between this script and ChimeraX, all
# we need to do is:
# - open ChimeraX;
# - tell it to listen on a specific "port";
# - send commands to that port via httr::
# == 1.1 Defining a Port ===================================================
# The httr:: package needs to be available
if (! requireNamespace("httr", quietly = TRUE)) {
install.packages("httr")
}
# Package information:
# library(help = httr) # basic information
# browseVignettes("httr") # available vignettes
# data(package = "httr") # available datasets
# We need to think of a port. Any available port number between 49152-65535 is
# fine. We'll choose 61803 because that's the fractional part of the golden
# ratio. But one could choose another.
CXPORT <- 61803
# Check that our current version of R supports sockets (default since V 3.3)
capabilities("sockets") # MUST be TRUE. If not, don't continue.
# == 1.2 Open ChimeraX =====================================================
# - Open a fresh, new session of a recently updated version of ChimeraX
# - type:
#
# remotecontrol rest start port 61803
#
# ... or whatever the value of CXPORT is.
# Now watch what happens in ChimeraX when you execute the following line:
# The request URL is built from CXPORT so that it always matches the port
# defined above, instead of repeating the literal 61803 here.
( x <- httr::GET(sprintf("http://127.0.0.1:%d/run?command=open+1BM8",
                         CXPORT)) )
# The .utilities.R script includes the function CX(), based on this principle,
# through which you can send commands to ChimeraX
CX("camera sbs")
CX("lighting soft")
CX("color sequential #1 & protein target abc palette powderblue:orchid:white")
# The command echoes Chimera's response if the parameter "quietly" is
# FALSE (default), and we can silence output with quietly = TRUE :
CX("info models #1 attribute num_residues")
CX("info models #1 attribute num_residues", quietly = TRUE)
# Either way, the command also returns Chimera's responses "invisibly";
# i.e. we can use the results by assigning the output to a variable:
hBonds <- CX("hbonds #1 & protein makePseudobonds false log true", quietly=TRUE)
x <- read.table(file = textConnection(hBonds), skip = 9,
blank.lines.skip = TRUE, fill = TRUE)
hist(x[,13], main="H-bonds", xlab="D···A (Å)", ylab="counts", col="#c9dcff")
# = 2 WORKED EXAMPLE: SUPERPOSITION =======================================
# We superimpose the 1BM8 structure with the 1DUX crystal structure to be able
# to explore possible DNA binding regions in 1BM8
# The model for 1BM8 is already open as model 1 (#1)
CX("hide #1 cartoons") # hide model 1 cartoon representation
CX("open 1DUX") # assume this is opened as model #2
CX("hide #2") # hide everything ...
CX("select #2/C") # chain c (protein)
CX("show sel cartoons") # ... and show cartoons of chain c (protein)
CX("color sequential sel target c palette steelblue:darkmagenta")
CX("view #2/C") # re-center the display
CX("cofr #2/C:62@CA") # set pivot to an interface residue
CX("select #2/A,B & nucleic-acid") # chains A, B are the cognate DNA
CX("style sel stick")
CX("show sel target ab") # show atoms/bonds
CX("color sequential #2/A & nucleic-acid target ab palette teal:lightcyan")
CX("color sequential #2/B & nucleic-acid target ab palette teal:lightcyan")
CX("surface sel enclose sel") # compute joint accessible surface of both chains
CX("transparency 50")
CX("select clear")
# Now superimpose the 1BM8 chain onto 1DUX chain C
CX("show #1 cartoons")
CX("matchmaker #1/A to #2/C pairing ss") # the actual superposition
# study the general layout, and the position of the 1BM8 secondary structure
# elements relative to 1DUX
# Let's examine side chain orientations in more detail
CX("hide #2/C cartoons") # hide the 1DUX protein
# select all residues in 1BM8 that are within 3.5 A of the DNA chains (a, b)
CX("select zone #2/A,B 3.5 #1 & protein residues true")
CX("~select sel & H") # de-select H atoms
CX("show sel target ab")
CX("size stickRadius 0.4")
CX("select clear")
# The overall architecture of the Mbp1 APSES domain is a good match for the Elk
# transcription factor binding mode; the detailed conformations of side chains
# would need to change only to a minor degree. There is a very significant
# degree of structural similarity; remarkable, given that the DNA is not the
# target sequence of the Mbp1 transcription factor, AND the 1BM8 structure was
# determined without a DNA ligand.
CX("remotecontrol rest stop") # release the socket
# Done.
# [END]

View File

@ -1,322 +1,322 @@
# tocID <- "RPR-FASTA.R"
#
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-FASTA unit.
#
# Version: 1.1.2
#
# Date: 2017-10 - 2021-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.1.2 style update
# 1.1.1 bugfix - wrong function name
# 1.1 2020 Maintenance. Rewrite validation logic. Add data
# to utilities. Define AACOLS
# 1.0 New unit.
#
#
# TODO: Make a simple solution first, then extend it to error checking, and
# to handle .mfa files.
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -----------------------------------------------------
#TOC> 1 Reading and validating FASTA 45
#TOC> 1.1 Validating FASTA 81
#TOC> 2 Parsing FASTA 227
#TOC> 3 Interpreting FASTA 247
#TOC> 4 Writing FASTA 274
#TOC>
#TOC> ==========================================================================
# = 1 Reading and validating FASTA ========================================
# FASTA is a text based format, structured in lines that are separated by
# line-feed or paragraph-break characters. Which one of these is used, depends
# on your operating system. But R's readLines() function knows how to handle
# these correctly, across platforms. Don't try to read such files "by hand".
# Here is the yeast Mbp1 gene, via SGD.
file.show("./data/S288C_YDL056W_MBP1_coding.fsa")
faMBP1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
# The warning is generated because the programmer at the NCBI who implemented
# the code to write this FASTA file neglected to place a line-break character
# after the last sequence character. While this is not technically incorrect,
# it is poor practice: the resulting file can't be distinguished from one that
# has been truncated in transmission.
head(faMBP1)
# Note that there are NO line-break characters ("\n") at the end of these
# strings, even though they were present in the original file. readLines()
# has "consumed" these characters while reading - but every single line is in
# a vector of its own.
tail(faMBP1)
# Also note that the last line has fewer characters - this means readLines()
# imported the whole line, despite it not being terminated by "\n".
# It's very straightforward to work with such data, for example by collapsing
# everything except the first line into a single string ...
# Keep the header as element 1 and join all remaining lines into element 2.
# paste0() with collapse = "" concatenates the elements of a single vector;
# a "sep" argument would only matter when several vectors are pasted together.
f <- c(faMBP1[1], paste0(faMBP1[-1], collapse = ""))
f[1]         # the header line
nchar(f[2])  # number of characters in the concatenated sequence
# == 1.1 Validating FASTA ==================================================
# The code above is making the assumption that everything from line 2 until
# the end IS sequence, the whole sequence and nothing but sequence.
# That assumption can break down in many ways:
#
# - there could be more than one header line. The specification says otherwise,
# but some older files use multiple, consecutive header lines. You don't
# want that to end up in your sequence.
# - this could be not a FASTA file at all. It could be raw sequence, a
# different sequence file format, or a wholly different file altogether.
# If you look at the file, you can immediately tell, but if you are
# reading the file in a complex workflow, you could easily import wrong
# data into your analysis.
# - there could be more than one sequence in the file. Such Multi-FASTA files
# occur commonly, as downloads of ORFs from genome regions or other
# sets of genes or proteins, or as the input / output for multiple
# sequence alignment programs.
#
# Data "from the wild" can (and usually does) have the most unexpected
# variations and it is really, really important to be clear about the
# assumptions that you are making. It is possible to "fix" things, according
# to the "Robustness Principle" :
# "Be conservative in what you send,
# be liberal in what you accept".
# (cf. https://en.wikipedia.org/wiki/Robustness_principle )
# ... but if you think about this, that's actually a really poor idea,
# which is much more likely to dilute standards, make unwarranted
# assumptions, and allow errors to pass silently and corrupt data.
#
# Let's discard this principle on the trash-heap of
# things-that-sound-like-a-good-idea-but-aren't. What we do instead is test,
# identify problems, and follow the principle: "crash early, crash often". Of
# course I can write code that would reformat any possible input as a FASTA
# file - but what good will it do me if it parses the file I receive
# from a server into FASTA format like:
#
# >404- Page Not Found</title</head>
# dyh-PagentfndhpThepageyreqesteddesnteistnthisserverCheckthe
# spellingrcntacttheadministratrsdyhtml
#
# Therefore, we write ourselves a FASTA checker that will enforce the following:
# (1) a FASTA file contains one or more sequences separated by zero or
# more empty lines
# (2) a sequence contains one header line followed by
# one or more sequence lines
# (3) a sequence line contains one or more uppercase or lowercase single
# letter amino acid codes, hyphens (gap character), or * (stop).
#
# Anything else should generate an error.
# (Case 1): Header(s) exist
# Demo input that contains no header line at all: running this snippet is
# expected to stop() with "no header lines in input."
fX <- c("ABC",
"defghi",
"klmnpq")
sel <- grepl("^>", fX) # "^>" is a regular expression that
# means: the exact character ">" at the
# beginning ("^") of the line.
# sel holds one TRUE/FALSE per line; if none is TRUE there is no header.
if ( ! any(sel) ) { stop("no header lines in input.") }
# (Case 2) No adjacent header lines
# Demo input with two consecutive header lines: running this snippet is
# expected to stop() with "adjacent header lines in input."
fX <- c(">ABC",
">123",
"defghi",
"klmnpq")
sel <- grepl("^>", fX)
# sel without its last element is lined up with sel without its first element,
# so position i is TRUE only if lines i and i + 1 are both headers.
sel <- sel[- length(sel)] & sel[-1] # comparing shifted vectors
if ( any(sel)) { stop("adjacent header lines in input.") }
# (Case 3.1) all sequence lines contain only valid characters
# (constants for valid characters AAVALID, NUCVALID, and NUCAMBIG
# are defined with the .utilities.R script)
AAVALID                                # show the string of valid characters
# Demo input whose first sequence line contains a space and punctuation.
# NOTE(review): this is expected to trigger the stop() below, assuming " ",
# ";" and ")" are not part of AAVALID - confirm against .utilities.R
fX <- c(">ABC",
        "def ;-) ghi",
        "klmnpq")
myRegex <- sprintf("[^%s]", AAVALID)   # NOT a valid character
sel <- ! grepl("^>", fX)               # NOT headers
# Only non-header lines are tested: header text is free-form.
if (any(grepl(myRegex, fX[sel]))) {
  stop("invalid character(s) outside of header lines.")
}
# (Case 3.2) all headers are followed directly by
# at least one letter of sequence
# Demo input in which the header ">ABC" is followed by an empty line: running
# this snippet is expected to stop() with "a header has no adjacent sequence."
fX <- c(">ABC",
"",
">123",
"defghi",
"klmnpq")
sel <- grep("^>", fX) + 1 # indexes of headers + 1
myRegex <- sprintf("[%s]+", AAVALID) # at least one valid character
# An index past the end of fX (a header on the very last line) yields NA, for
# which grepl() returns FALSE - so a trailing header is reported as well.
if (! all(grepl(myRegex, fX[sel]))) {
stop("a header has no adjacent sequence.")
}
# Ah, you might ask - couldn't we just have dropped all empty lines, and
# then caught this in Case 2? No - for two reasons: we would still miss headers
# at the end of file, and, we would have changed the line numbering - and
# ideally our "production" function will create information about where the
# error is to be found.
# Now combine this into a function ...
val <- function(fa) {
  # Validate a character vector of FASTA lines; stop() at the first violation.
  # Parameters:
  #    fa   chr   the lines of a FASTA file, e.g. as returned by readLines()
  # Value:  invisible NULL if all checks pass. The function is called for its
  #         side effect of raising an error on invalid input.
  # Note:   the constant AAVALID (a string of valid sequence characters) is
  #         not passed as an argument; it must be defined where this function
  #         can see it.

  isHeader <- grepl("^>", fa)       # computed once, reused by every check

  # (Case 1) at least one header line exists
  if (! any(isHeader)) {
    stop("no header lines in input.")
  }

  # (Case 2) no two header lines are adjacent: compare the mask with itself,
  # shifted by one position
  if (any(isHeader[-length(isHeader)] & isHeader[-1])) {
    stop("adjacent header lines in input.")
  }

  # (Case 3.1) non-header lines contain nothing but valid characters
  if (any(grepl(sprintf("[^%s]", AAVALID), fa[! isHeader]))) {
    stop("invalid character(s) outside of header lines.")
  }

  # (Case 3.2) the line after every header holds at least one sequence
  # character. For a header on the last line the index runs past the end of
  # fa, the element is NA, grepl() returns FALSE and the check fails.
  afterHeader <- which(isHeader) + 1
  if (! all(grepl(sprintf("[%s]+", AAVALID), fa[afterHeader]))) {
    stop("a header has no adjacent sequence.")
  }

  return(invisible(NULL))
}
# Here is an example
# FA holds three records: a two-line sequence followed by an empty spacer
# line, then a one-line sequence and another two-line sequence.
FA <- c(">head1",
"acdef",
"ghi",
"",
">head2",
"kl",
">head3",
"mn",
"pqrs")
val(FA) # ... should not create an error
# A somewhat more elaborate validateFA() function was loaded with the
# ./.utilities.R script. It needs a bit more bookkeeping, since NCBI multi-
# fasta files have space-characters in their spacer lines. Try it ...
validateFA(FA)
# = 2 Parsing FASTA =======================================================
# Once we have validated our assumptions about our input, it's quite
# painless to parse it. I have put this together as a function and the function
# gets loaded from ./.utilities.R
#
# Let's try this:
# - the first 3 elements of faMBP1:
readFASTA(faMBP1[1:3])
# - a multi FASTA file of aligned APSES domain sequences:
refAPSES <- readFASTA("./data/refAPSES.mfa")
# Subset the sequence with "P39678" in the header
refAPSES[grep("P39678", refAPSES$head) ,]
# = 3 Interpreting FASTA ==================================================
# FASTA files are straightforward to interpret - just one thing may be of note:
# when working with strings, we can use substr(<string>, <start>, <stop>) to
# extract substrings, but more often we expand the string into a vector of
# single characters with strsplit(<string>, ""). strsplit() returns a list,
# to accommodate that <string> could be a vector of many elements, therefore
# we usually unlist() the result if we use it only on a single string.
# Example: How many positive charged residues in "MBP1_SACCE"?
s <- unlist(strsplit(refAPSES$seq[grep("MBP1_SACCE", refAPSES$head)], ""))
s
sum(grepl("[HKR]", s)) # 20 (+) charged residues. grepl() returns TRUE and FALSE
# for the characters, sum() coerces to 1 and 0
# respectively, and that gives us the result.
100 * sum(grepl("[HKR]", s)) / length(s) # in percent: 20.2 %
# residue distribution
x <- factor(s, levels = names(AACOLS))
pie(table(x)[names(AACOLS)], col = AACOLS)
# = 4 Writing FASTA =======================================================
# Writing FASTA files is mostly just the reverse of reading, with one
# twist: we need to break the long sequence string into chunks of the desired
# width. The FASTA specification calls for a maximum of 120 characters per line,
# but writing out much less than that is common, since it allows to comfortably
# view lines on the console, or printing them on a sheet of paper (do we still
# do that actually?). How do we break a string into chunks? A combination of
# seq(<from>, <to>, <by>) with substring(<string>, <start>, <stop>) will work
# nicely. (Note that substring() is vectorized, whereas substr() is not!) As we
# loop through our FASTA object in memory, we can build the output by c()'ing
# blocks of header + sequence to each other. For VERY large objects this might
# be slow - in that case, we might want to precalculate the size of the output
# object. But that's more of a hypothetical consideration.
# Worked example: split one sequence into consecutive chunks of width w.
# (An assignment wrapped in parentheses also prints the assigned value.)
( s <- refAPSES$seq[2] )
nchar(s)
w <- 30 # width of chunk
(starts <- seq(1, nchar(s), by = w)) # starting index of chunk
# Every chunk ends one position before the next chunk starts; the last chunk
# ends at the final character of s.
(ends <- c((starts - 1)[-1], nchar(s))) # ending index of chunk
# Task: Is this safe? What happens if nchar(s) is shorter than w?
# What happens if nchar(s) is an exact multiple of w?
substring(s, starts, ends)
# confirm that the output contains the first and last residue, and both
# residues adjacent to the breaks
# As always, the function has been defined in ".utilities.R" for you to use
# any time... type writeFASTA to examine it.
# Let's try this...
writeFASTA(refAPSES, width = 40)
# roundtrip for validation: write refAPSES with a different format,
# read it back in - the new dataframe must be identical
# to the original dataframe.
fname <- tempfile()
writeFASTA(refAPSES, fn = fname, width = 30)
identical(refAPSES, readFASTA(fname))
# ...works for me :-)
# [END]
# tocID <- "RPR-FASTA.R"
#
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-FASTA unit.
#
# Version: 1.1.2
#
# Date: 2017-10 - 2021-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.1.2 style update
# 1.1.1 bugfix - wrong function name
# 1.1 2020 Maintenance. Rewrite validation logic. Add data
# to utilities. Define AACOLS
# 1.0 New unit.
#
#
# TODO: Make a simple solution first, then extend it to error checking, and
# to handle .mfa files.
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -----------------------------------------------------
#TOC> 1 Reading and validating FASTA 45
#TOC> 1.1 Validating FASTA 81
#TOC> 2 Parsing FASTA 227
#TOC> 3 Interpreting FASTA 247
#TOC> 4 Writing FASTA 274
#TOC>
#TOC> ==========================================================================
# = 1 Reading and validating FASTA ========================================
# FASTA is a text based format, structured in lines that are separated by
# line-feed or paragraph-break characters. Which one of these is used, depends
# on your operating system. But R's readLines() function knows how to handle
# these correctly, across platforms. Don't try to read such files "by hand".
# Here is the yeast Mbp1 gene, via SGD.
file.show("./data/S288C_YDL056W_MBP1_coding.fsa")
faMBP1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
# The warning is generated because the programmer at the NCBI who implemented
# the code to write this FASTA file neglected to place a line-break character
# after the last sequence character. While this is not technically incorrect,
# it is poor practice: the resulting file can't be distinguished from one that
# has been truncated in transmission.
head(faMBP1)
# Note that there are NO line-break characters ("\n") at the end of these
# strings, even though they were present in the original file. readLines()
# has "consumed" these characters while reading - but every single line is in
# a vector of its own.
tail(faMBP1)
# Also note that the last line has fewer characters - this means readLines()
# imported the whole line, despite it not being terminated by "\n".
# It's very straightforward to work with such data, for example by collapsing
# everything except the first line into a single string ...
# Keep the header as element 1 and join all remaining lines into element 2.
# paste0() with collapse = "" concatenates the elements of a single vector;
# a "sep" argument would only matter when several vectors are pasted together.
f <- c(faMBP1[1], paste0(faMBP1[-1], collapse = ""))
f[1]         # the header line
nchar(f[2])  # number of characters in the concatenated sequence
# == 1.1 Validating FASTA ==================================================
# The code above is making the assumption that everything from line 2 until
# the end IS sequence, the whole sequence and nothing but sequence.
# That assumption can break down in many ways:
#
# - there could be more than one header line. The specification says otherwise,
# but some older files use multiple, consecutive header lines. You don't
# want that to end up in your sequence.
# - this could be not a FASTA file at all. It could be raw sequence, a
# different sequence file format, or a wholly different file altogether.
# If you look at the file, you can immediately tell, but if you are
# reading the file in a complex workflow, you could easily import wrong
# data into your analysis.
# - there could be more than one sequence in the file. Such Multi-FASTA files
# occur commonly, as downloads of ORFs from genome regions or other
# sets of genes or proteins, or as the input / output for multiple
# sequence alignment programs.
#
# Data "from the wild" can (and usually does) have the most unexpected
# variations and it is really, really important to be clear about the
# assumptions that you are making. It is possible to "fix" things, according
# to the "Robustness Principle" :
# "Be conservative in what you send,
# be liberal in what you accept".
# (cf. https://en.wikipedia.org/wiki/Robustness_principle )
# ... but if you think about this, that's actually a really poor idea,
# which is much more likely to dilute standards, make unwarranted
# assumptions, and allow errors to pass silently and corrupt data.
#
# Let's discard this principle on the trash-heap of
# things-that-sound-like-a-good-idea-but-aren't. What we do instead is test,
# identify problems, and follow the principle: "crash early, crash often". Of
# course I can write code that would reformat any possible input as a FASTA
# file - but what good will it do me if it parses the file I receive
# from a server into FASTA format like:
#
# >404- Page Not Found</title</head>
# dyh-PagentfndhpThepageyreqesteddesnteistnthisserverCheckthe
# spellingrcntacttheadministratrsdyhtml
#
# Therefore, we write ourselves a FASTA checker that will enforce the following:
# (1) a FASTA file contains one or more sequences separated by zero or
# more empty lines
# (2) a sequence contains one header line followed by
# one or more sequence lines
# (3) a sequence line contains one or more uppercase or lowercase single
# letter amino acid codes, hyphens (gap character), or * (stop).
#
# Anything else should generate an error.
# (Case 1): Header(s) exist
# Demo input that contains no header line at all: running this snippet is
# expected to stop() with "no header lines in input."
fX <- c("ABC",
"defghi",
"klmnpq")
sel <- grepl("^>", fX) # "^>" is a regular expression that
# means: the exact character ">" at the
# beginning ("^") of the line.
# sel holds one TRUE/FALSE per line; if none is TRUE there is no header.
if ( ! any(sel) ) { stop("no header lines in input.") }
# (Case 2) No adjacent header lines
# Demo input with two consecutive header lines: running this snippet is
# expected to stop() with "adjacent header lines in input."
fX <- c(">ABC",
">123",
"defghi",
"klmnpq")
sel <- grepl("^>", fX)
# sel without its last element is lined up with sel without its first element,
# so position i is TRUE only if lines i and i + 1 are both headers.
sel <- sel[- length(sel)] & sel[-1] # comparing shifted vectors
if ( any(sel)) { stop("adjacent header lines in input.") }
# (Case 3.1) all sequence lines contain only valid characters
# (constants for valid characters AAVALID, NUCVALID, and NUCAMBIG
# are defined with the .utilities.R script)
AAVALID                                # show the string of valid characters
# Demo input whose first sequence line contains a space and punctuation.
# NOTE(review): this is expected to trigger the stop() below, assuming " ",
# ";" and ")" are not part of AAVALID - confirm against .utilities.R
fX <- c(">ABC",
        "def ;-) ghi",
        "klmnpq")
myRegex <- sprintf("[^%s]", AAVALID)   # NOT a valid character
sel <- ! grepl("^>", fX)               # NOT headers
# Only non-header lines are tested: header text is free-form.
if (any(grepl(myRegex, fX[sel]))) {
  stop("invalid character(s) outside of header lines.")
}
# (Case 3.2) all headers are followed directly by
# at least one letter of sequence
# Demo input in which the header ">ABC" is followed by an empty line: running
# this snippet is expected to stop() with "a header has no adjacent sequence."
fX <- c(">ABC",
"",
">123",
"defghi",
"klmnpq")
sel <- grep("^>", fX) + 1 # indexes of headers + 1
myRegex <- sprintf("[%s]+", AAVALID) # at least one valid character
# An index past the end of fX (a header on the very last line) yields NA, for
# which grepl() returns FALSE - so a trailing header is reported as well.
if (! all(grepl(myRegex, fX[sel]))) {
stop("a header has no adjacent sequence.")
}
# Ah, you might ask - couldn't we just have dropped all empty lines, and
# then caught this in Case 2? No - for two reasons: we would still miss headers
# at the end of file, and, we would have changed the line numbering - and
# ideally our "production" function will create information about where the
# error is to be found.
# Now combine this into a function ...
val <- function(fa) {
  # Validate a character vector of FASTA lines; stop() at the first violation.
  # Parameters:
  #    fa   chr   the lines of a FASTA file, e.g. as returned by readLines()
  # Value:  invisible NULL if all checks pass. The function is called for its
  #         side effect of raising an error on invalid input.
  # Note:   the constant AAVALID (a string of valid sequence characters) is
  #         not passed as an argument; it must be defined where this function
  #         can see it.

  isHeader <- grepl("^>", fa)       # computed once, reused by every check

  # (Case 1) at least one header line exists
  if (! any(isHeader)) {
    stop("no header lines in input.")
  }

  # (Case 2) no two header lines are adjacent: compare the mask with itself,
  # shifted by one position
  if (any(isHeader[-length(isHeader)] & isHeader[-1])) {
    stop("adjacent header lines in input.")
  }

  # (Case 3.1) non-header lines contain nothing but valid characters
  if (any(grepl(sprintf("[^%s]", AAVALID), fa[! isHeader]))) {
    stop("invalid character(s) outside of header lines.")
  }

  # (Case 3.2) the line after every header holds at least one sequence
  # character. For a header on the last line the index runs past the end of
  # fa, the element is NA, grepl() returns FALSE and the check fails.
  afterHeader <- which(isHeader) + 1
  if (! all(grepl(sprintf("[%s]+", AAVALID), fa[afterHeader]))) {
    stop("a header has no adjacent sequence.")
  }

  return(invisible(NULL))
}
# Here is an example
# FA holds three records: a two-line sequence followed by an empty spacer
# line, then a one-line sequence and another two-line sequence.
FA <- c(">head1",
"acdef",
"ghi",
"",
">head2",
"kl",
">head3",
"mn",
"pqrs")
val(FA) # ... should not create an error
# A somewhat more elaborate validateFA() function was loaded with the
# ./.utilities.R script. It needs a bit more bookkeeping, since NCBI multi-
# fasta files have space-characters in their spacer lines. Try it ...
validateFA(FA)
# = 2 Parsing FASTA =======================================================
# Once we have validated our assumptions about our input, it's quite
# painless to parse it. I have put this together as a function and the function
# gets loaded from ./.utilities.R
#
# Let's try this:
# - the first 3 elements of faMBP1:
readFASTA(faMBP1[1:3])
# - a multi FASTA file of aligned APSES domain sequences:
refAPSES <- readFASTA("./data/refAPSES.mfa")
# Subset the sequence with "P39678" in the header
refAPSES[grep("P39678", refAPSES$head) ,]
# = 3 Interpreting FASTA ==================================================
# FASTA files are straightforward to interpret - just one thing may be of note:
# when working with strings, we can use substr(<string>, <start>, <stop>) to
# extract substrings, but more often we expand the string into a vector of
# single characters with strsplit(<string>, ""). strsplit() returns a list,
# to accommodate that <string> could be a vector of many elements, therefore
# we usually unlist() the result if we use it only on a single string.
# Example: How many positive charged residues in "MBP1_SACCE"?
s <- unlist(strsplit(refAPSES$seq[grep("MBP1_SACCE", refAPSES$head)], ""))
s
sum(grepl("[HKR]", s)) # 20 (+) charged residues. grepl() returns TRUE and FALSE
# for the characters, sum() coerces to 1 and 0
# respectively, and that gives us the result.
100 * sum(grepl("[HKR]", s)) / length(s) # in percent: 20.2 %
# residue distribution
x <- factor(s, levels = names(AACOLS))
pie(table(x)[names(AACOLS)], col = AACOLS)
# = 4 Writing FASTA =======================================================
# Writing FASTA files is mostly just the reverse of reading, with one
# twist: we need to break the long sequence string into chunks of the desired
# width. The FASTA specification calls for a maximum of 120 characters per line,
# but writing out much less than that is common, since it allows to comfortably
# view lines on the console, or printing them on a sheet of paper (do we still
# do that actually?). How do we break a string into chunks? A combination of
# seq(<from>, <to>, <by>) with substring(<string>, <start>, <stop>) will work
# nicely. (Note that substring() is vectorized, whereas substr() is not!) As we
# loop through our FASTA object in memory, we can build the output by c()'ing
# blocks of header + sequence to each other. For VERY large objects this might
# be slow - in that case, we might want to precalculate the size of the output
# object. But that's more of a hypothetical consideration.
# Worked example: split one sequence into consecutive chunks of width w.
# (An assignment wrapped in parentheses also prints the assigned value.)
( s <- refAPSES$seq[2] )
nchar(s)
w <- 30 # width of chunk
(starts <- seq(1, nchar(s), by = w)) # starting index of chunk
# Every chunk ends one position before the next chunk starts; the last chunk
# ends at the final character of s.
(ends <- c((starts - 1)[-1], nchar(s))) # ending index of chunk
# Task: Is this safe? What happens if nchar(s) is shorter than w?
# What happens if nchar(s) is an exact multiple of w?
substring(s, starts, ends)
# confirm that the output contains the first and last residue, and both
# residues adjacent to the breaks
# As always, the function has been defined in ".utilities.R" for you to use
# any time... type writeFASTA to examine it.
# Let's try this...
writeFASTA(refAPSES, width = 40)
# roundtrip for validation: write refAPSES with a different format,
# read it back in - the new dataframe must be identical
# to the original dataframe.
fname <- tempfile()
writeFASTA(refAPSES, fn = fname, width = 30)
identical(refAPSES, readFASTA(fname))
# ...works for me :-)
# [END]

File diff suppressed because it is too large Load Diff

View File

@ -1,385 +1,385 @@
# tocID <- "RPR-Genetic_code_optimality.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Genetic_code_optimality unit.
#
# Version: 1.3
#
# Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.3 2020 Maintenance
# 1.2 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use Biocmanager:: not biocLite()
# 1.1 Update set.seed() usage
# 1.0.1 Fixed two bugs discovered by Suan Chin Yeo.
# 1.0 New material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------------------------------
#TOC> 1 Designing a computational experiment 58
#TOC> 2 Setting up the tools 74
#TOC> 2.1 Natural and alternative genetic codes 77
#TOC> 2.2 Effect of mutations 135
#TOC> 2.2.1 reverse-translate 146
#TOC> 2.2.2 Randomly mutate 171
#TOC> 2.2.3 Forward- translate 196
#TOC> 2.2.4 measure effect 213
#TOC> 3 Run the experiment 267
#TOC> 4 Task solutions 363
#TOC>
#TOC> ==========================================================================
# This unit demonstrates R code to simulate alternate genetic codes and evaluate
# their robustness to code changes. The approaches are quite simple and you
# will be able to come up with obvious refinements; the point of this code is to
# demonstrate some R programming techniques, in preparation for more
# sophisticated questions later.
# = 1 Designing a computational experiment ================================
# Computational experiments are conducted like wet-lab experiments. We begin
# with a hypothesis, then define the observables that relate to the hypothesis,
# then define the measures we apply to observations, and finally we interpret
# our observations. If we want to learn something about the evolution of the
# genetic code ...
# - we construct a hypothesis such as: the genetic code has evolved so as to
# minimize the effect of mutations;
# - we define the observables: the effect of mutations in
# sequences, given the natural and possible alternative codes;
# - we define the measures to quantify the effect of mutations;
# - then we compute alternatives and interpret the results.
# = 2 Setting up the tools ================================================
# == 2.1 Natural and alternative genetic codes =============================
# Load genetic code tables from the Biostrings package
# Install BiocManager with install.packages() only if it cannot be loaded yet.
if (! requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
# Then use BiocManager to install Biostrings if that is missing. Neither
# package is attached here; Biostrings is called with the Biostrings:: prefix
# further down in this script.
if (! requireNamespace("Biostrings", quietly = TRUE)) {
BiocManager::install("Biostrings")
}
# Package information:
# library(help = Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets
# There are many ways to generate alternative codes. The simplest way is to
# randomly assign amino acids to codons. A more sophisticated way is to keep the
# redundancy of codons intact, since it may reflect some form of symmetry
# breaking that ignores the third nucleotide of a codon for the most part;
# therefore we only replace the amino acids of the existing code with random
# others. Here are two functions that implement these two ideas about alternate
# codes.
randomGC <- function(GC) {
  # Return a genetic code with randomly assigned amino acids.
  # Parameters:
  #    GC   named chr  character vector of amino acid one-letter codes plus
  #                    "*" (stop), named with the codon triplets. The standard
  #                    code has 64 elements; any length is handled.
  # Value:  named chr  vector with the same length and names as GC, holding
  #                    random assignments in which every symbol that occurs in
  #                    GC is encoded at least once.

  aa <- unique(GC)            # the symbols used by the input code
  nCodons <- length(GC)

  repeat {
    # Sampling with replacement can miss a symbol entirely, so we draw again
    # until every symbol of the input code appears in the new code.
    GC[seq_len(nCodons)] <- sample(aa, nCodons, replace = TRUE)
    if (length(unique(GC)) >= length(aa)) {
      break
    }
  }

  return(GC)
}
swappedGC <- function(GC) {
  # Return a genetic code in which the amino acids have been randomly
  # reassigned as whole groups: all codons that encoded the same symbol in GC
  # again share one symbol in the result, so the redundancy pattern is kept.
  # Parameters:
  #    GC   named chr  character vector of amino acid one-letter codes plus
  #                    "*" (stop), named with the codon triplets. The standard
  #                    code has 64 elements; any length is handled.
  # Value:  named chr  vector with the same length and names as GC. Because
  #                    the new assignment is a random shuffle, a symbol may
  #                    happen to be mapped onto itself.

  aaOrig <- unique(GC)                       # the symbols in the input code
  aaSwap <- sample(aaOrig, length(aaOrig))   # the same symbols, shuffled
  names(aaSwap) <- aaOrig                    # lookup table: original -> new

  # Index the lookup table with the original symbols; assigning into the
  # existing elements keeps the codon names of GC.
  GC[seq_along(GC)] <- aaSwap[GC]

  return(GC)
}
# == 2.2 Effect of mutations ===============================================
# To evaluate the effects of mutations we will do the following:
# - we take an amino acid sequence (Mbp1 will do just nicely);
# - we reverse-translate it into a nucleotide sequence;
# - we mutate it randomly;
# - we translate it back to amino acids;
# - we count the number of mutations and evaluate their severity.
# === 2.2.1 reverse-translate
# To reverse-translate an amino acid vector, we randomly pick one of its
# codons from a genetic code, and assemble all codons to a sequence.
traRev <- function(s, GC) {
  # Reverse-translate an amino acid sequence into codons, choosing randomly
  # among synonymous codons.
  # Parameters:
  #    s    chr  a sequence vector, one amino acid symbol per element
  #    GC   chr  a genetic code: amino acid symbols named with their codons
  # Value:
  #    A reverse-translated vector of codons, one per element of s.
  # An error is raised if an element of s is not encoded by GC.

  codonNames <- names(GC)
  vC <- character(length(s))

  for (i in seq_along(s)) {
    codon <- codonNames[GC == s[i]]          # get all codons for this AA
    if (length(codon) == 0) {
      # Fail with a message that names the offending residue, rather than
      # with "replacement has length zero" at the assignment below.
      stop(sprintf("no codon encodes \"%s\" (position %d) in this genetic code.",
                   s[i], i))
    }
    if (length(codon) > 1) {                 # if there's more than one ...
      codon <- sample(codon, 1)              # pick one at random ...
    }                                        # (sample() is only called when
                                             #  there actually is a choice)
    vC[i] <- codon                           # store it
  }

  return(vC)
}
# === 2.2.2 Randomly mutate
# To mutate, we split a codon into its three nucleotides, then randomly replace
# one of the three with another nucleotide.
randMut <- function(vC) {
  # Introduce one random point mutation into every codon.
  # Parameter:
  #    vC   chr   a vector of codons
  # Value:  chr   a vector of codons, each carrying a single point mutation
  #               relative to the corresponding codon of vC

  bases <- c("A", "C", "G", "T")

  mutateOne <- function(codon) {
    letters3 <- strsplit(codon, "")[[1]]              # the three nucleotides
    pos <- sample(1:3, 1)                             # position to change
    alternatives <- bases[bases != letters3[pos]]     # anything but the
                                                      # current nucleotide
    letters3[pos] <- sample(alternatives, 1)
    paste0(letters3, collapse = "")                   # back to one codon
  }

  # Codons are processed in order, so the position and the replacement are
  # drawn alternately, codon by codon.
  vC[] <- vapply(vC, mutateOne, character(1), USE.NAMES = FALSE)
  return(vC)
}
# === 2.2.3 Forward- translate
traFor <- function(vC, GC) {
  # Translate a vector of codons into amino acids.
  # Parameters:
  #    vC   chr  a codon vector
  #    GC   chr  a genetic code: amino acid symbols named with their codons
  # Value:
  #    A vector of amino acids, one per codon, without names. Codons that are
  #    not among the names of GC translate to NA.

  # Indexing a named vector by name is vectorized: all codons are looked up
  # in one step. unname() drops the codon names that the lookup attaches.
  return(unname(GC[vC]))
}
# === 2.2.4 measure effect
# How do we evaluate the effect of the mutation? We'll take a simple ad hoc
# approach: we divide amino acids into hydrophobic, hydrophilic, and neutral
# categories, according to their free energy of transfer from water to octanol:
aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
aaNeutral <- c("A", "H", "T", "S", "V", "G")
# Then we will penalize as follows:
# Changes within one category: 0.1
# Changes from hydrophobic or hydrophilic to neutral or back: 0.3
# Changes from hydrophobic to hydrophilic or back: 1.0
# Changes to stop-codon: 3.0
evalMut <- function(nat, mut, stopPenalty = 3.0) {
  # Evaluate severity of mutations between amino acid sequence vectors nat and
  # mut in an ad hoc approach based on hydrophobicity changes.
  # Parameters:
  #    nat          chr  the original amino acid sequence, one per element
  #    mut          chr  the mutated sequence, same length as nat
  #    stopPenalty  num  penalty for a change to a stop codon ("*")
  # Value:  num  the sum of all per-position penalties:
  #                change within one category                          0.1
  #                change between hydrophobic/hydrophilic and neutral  0.3
  #                change between hydrophobic and hydrophilic          1.0
  #                change to a stop codon                      stopPenalty
  #              Unchanged positions score 0.
  # NOTE(review): this function previously never applied the stop-codon
  # penalty. Totals quoted in comments elsewhere in this script, and the axis
  # limits of the histograms, were presumably produced without it - regenerate
  # them, or call with stopPenalty = 0 to reproduce the earlier totals.

  aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
  aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
  aaNeutral <- c("A", "H", "T", "S", "V", "G")

  # category membership of every position, before and after mutation
  natPhobic  <- nat %in% aaHphobic
  natPhilic  <- nat %in% aaHphilic
  natNeutral <- nat %in% aaNeutral
  mutPhobic  <- mut %in% aaHphobic
  mutPhilic  <- mut %in% aaHphilic
  mutNeutral <- mut %in% aaNeutral

  lMut <- nat != mut   # logical TRUE for all mutated positions

  sameClass  <- (natPhobic & mutPhobic) |
                (natPhilic & mutPhilic) |
                (natNeutral & mutNeutral)
  viaNeutral <- (natNeutral & (mutPhobic | mutPhilic)) |
                ((natPhobic | natPhilic) & mutNeutral)
  polarFlip  <- (natPhobic & mutPhilic) | (natPhilic & mutPhobic)

  penalties <- numeric(length(nat))
  penalties[lMut & sameClass]  <- 0.1
  penalties[lMut & viaNeutral] <- 0.3
  penalties[lMut & polarFlip]  <- 1.0
  # "*" belongs to none of the three categories, so it needs its own rule.
  penalties[lMut & (mut %in% "*")] <- stopPenalty

  return(sum(penalties))
}
# A more sophisticated approach could take additional quantities into account,
# such as charge, size, or flexibility - and it could add heuristics, such as:
# proline is always bad in secondary structure, charged amino acids are terrible
# in the folded core of a protein, replacing a small by a large amino acid in
# the core is very disruptive ... etc.
#
# For our experiment, we should not use a mutation data matrix however:
# empirical mutation probabilities are superbly suited to estimate evolutionary
# relationships. Here however, as we are trying to evaluate effects of random
# mutations on genetic codes, our reasoning would be circular - we would
# discover that the natural genetic code is optimal ... because it is most
# similar to the natural genetic code. That would be Cargo Cult bioinformatics.
# = 3 Run the experiment ==================================================
# Fetch the standard Genetic code from Biostrings::
stdCode <- Biostrings::GENETIC_CODE
# Fetch the nucleotide sequence for MBP1:
# [-1] removes the first line of the file (the FASTA header line) ...
myDNA <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")[-1]
# ... and the remaining lines are joined into one long string.
myDNA <- paste0(myDNA, collapse = "")
# Split the string into codons and convert them to plain character strings.
# NOTE(review): presumably one triplet per element - confirm against the
# documentation of Biostrings::codons()
myDNA <- as.character(Biostrings::codons(Biostrings::DNAString(myDNA)))
myDNA <- myDNA[-length(myDNA)] # drop the stop codon
# The reference amino acid sequence under the standard code.
myAA <- traFor(myDNA, stdCode)
# Mutate and evaluate
set.seed(112358)
x <- randMut(myDNA)
set.seed(NULL)
x <- traFor(x, stdCode)
evalMut(myAA, x) # 166.4
# Try this 200 times, and see how the values are distributed.
N <- 200
# One total penalty per trial, allocated before the loop.
valSTDC <- numeric(N)
set.seed(112358) # set RNG seed for repeatable randomness
for (i in 1:N) { # this takes a few seconds ...
x <- randMut(myDNA) # mutate
x <- traFor(x, stdCode) # translate
valSTDC[i] <- evalMut(myAA, x) # evaluate
}
set.seed(NULL) # reset the RNG
# The axis limits are set explicitly; later sections of this script draw
# further histograms into the same plot with add = TRUE.
hist(valSTDC,
breaks = 15,
col = "palegoldenrod",
xlim = c(0, 400),
ylim = c(0, N/4),
main = "Standard vs. Synthetic Genetic Code",
xlab = "Mutation penalty")
# This looks like a normal distribution. Let's assume the effect of mutations
# under the standard genetic code is the mean of this distribution:
effectSTDC <- mean(valSTDC) # 178.1
# Now we can look at the effects of alternate genetic codes:
set.seed(112358)
# choose a new code
GC <- randomGC(stdCode)
# reverse translate hypothetical sequence according to the new code
x <- traRev(myAA, GC)
x <- randMut(x)       # randomly mutate hypothetical nucleotide sequence
# Reset the RNG only after the last random step: traRev() and randMut() draw
# random numbers too, so resetting right after randomGC() would make the
# result below differ from run to run.
set.seed(NULL)
x <- traFor(x, GC)    # translate back, with the new code
evalMut(myAA, x)      # evaluate mutation effects
# NOTE(review): 298.5 was quoted here while only randomGC() was seeded;
# re-run this block and record the value it now reproducibly yields.
# That seems a fair bit higher than what we saw as "effectSTDC"
# Let's try with different genetic codes. 200 trials - but this time every trial
# is with a different, synthetic genetic code.
N <- 200
# One total penalty per trial, allocated before the loop.
valXGC <- numeric(N)
set.seed(1414214) # set RNG seed for repeatable randomness
# All random steps of all trials run between set.seed(1414214) and
# set.seed(NULL), so the whole series is repeatable.
for (i in 1:N) {
GC <- randomGC(stdCode) # Choose code
x <- traRev(myAA, GC) # reverse translate
x <- randMut(x) # mutate
x <- traFor(x, GC) # translate
valXGC[i] <- evalMut(myAA, x) # evaluate
}
set.seed(NULL) # reset the RNG
# add = TRUE draws into the existing plot instead of starting a new one.
hist(valXGC,
col = "plum",
breaks = 15,
add = TRUE)
# These two distributions are very widely separated!
# Task: Perform the same experiment with the swapped genetic code.
# Compare the distributions. Interpret the result.
# These are simple experiments, under assumptions that can be refined in
# meaningful ways. Yet, even those simple computational experiments show
# that the Universal Genetic Code has features that one would predict if
# it has evolved under selective pressure to minimize the effects of mutations.
# Gradual change under mutation is beneficial to evolution, disruptive
# change is not.
# = 4 Task solutions ======================================================
# Same experiment as for the random codes, but every trial uses a code made
# by swappedGC(), which keeps the codon redundancy of the standard code.
N <- 200
valSGC <- numeric(N)
set.seed(2718282) # set RNG seed for repeatable randomness
for (i in 1:N) {
GC <- swappedGC(stdCode) # Choose code
x <- traRev(myAA, GC) # reverse translate
x <- randMut(x) # mutate
x <- traFor(x, GC) # translate
valSGC[i] <- evalMut(myAA, x) # evaluate
}
set.seed(NULL) # reset the RNG
# The colour carries an alpha value ("88"), and add = TRUE draws into the
# existing plot.
hist(valSGC,
col = "#6688FF88",
breaks = 15,
add = TRUE)
# [END]
# tocID <- "RPR-Genetic_code_optimality.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Genetic_code_optimality unit.
#
# Version: 1.3
#
# Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.3 2020 Maintenance
# 1.2 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use Biocmanager:: not biocLite()
# 1.1 Update set.seed() usage
# 1.0.1 Fixed two bugs discovered by Suan Chin Yeo.
# 1.0 New material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------------------------------
#TOC> 1 Designing a computational experiment 58
#TOC> 2 Setting up the tools 74
#TOC> 2.1 Natural and alternative genetic codes 77
#TOC> 2.2 Effect of mutations 135
#TOC> 2.2.1 reverse-translate 146
#TOC> 2.2.2 Randomly mutate 171
#TOC> 2.2.3 Forward- translate 196
#TOC> 2.2.4 measure effect 213
#TOC> 3 Run the experiment 267
#TOC> 4 Task solutions 363
#TOC>
#TOC> ==========================================================================
# This unit demonstrates R code to simulate alternate genetic codes and evaluate
# their robustness to code changes. The approaches are quite simple and you
# will be able to come up with obvious refinements; the point of this code is to
# demonstrate some R programming techniques, in preparation for more
# sophisticated questions later.
# = 1 Designing a computational experiment ================================
# Computational experiments are conducted like wet-lab experiments. We begin
# with a hypothesis, then define the observables that relate to the hypothesis,
# then define the measures we apply to observations, and finally we interpret
# our observations. If we want to learn something about the evolution of the
# genetic code ...
# - we construct a hypothesis such as: the genetic code has evolved so as to
# minimize the effect of mutations;
# - we define the observables: the effect of mutations in
# sequences, given the natural and possible alternative codes;
# - we define the measures to quantify the effect of mutations;
# - then we compute alternatives and interpret the results.
# = 2 Setting up the tools ================================================
# == 2.1 Natural and alternative genetic codes =============================
# Load genetic code tables from the Biostrings package
if (! requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
if (! requireNamespace("Biostrings", quietly = TRUE)) {
BiocManager::install("Biostrings")
}
# Package information:
# library(help = Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets
# There are many ways to generate alternative codes. The simplest way is to
# randomly assign amino acids to codons. A more sophisticated way is to keep the
# redundancy of codons intact, since it may reflect some form of symmetry
# breaking that ignores the third nucleotide of a codon for the most part;
# therefore we only replace the amino acids of the existing code with random
# others. Here are two functions that implement these two ideas about alternate
# codes.
randomGC <- function(GC) {
  # Build a synthetic genetic code in which every codon is assigned an amino
  # acid (or stop) drawn independently and at random.
  # Parameters:
  #    GC   named chr  length-64 character vector of 20 amino acid one-letter
  #                      codes plus "*" (stop), named with the codon triplet.
  # Value:  named chr  the same vector (same codon names) with random
  #                      assignments; drawing is repeated until every symbol
  #                      of the input code occurs at least once.
  residues  <- unique(GC)           # the symbols the new code must contain
  nRequired <- length(residues)
  repeat {
    GC[1:64] <- sample(residues, 64, replace = TRUE)   # draw a candidate code
    if (length(unique(GC)) >= nRequired) {             # complete? then done
      break
    }
  }
  return(GC)
}
swappedGC <- function(GC) {
  # Build a synthetic genetic code by permuting the amino acid labels of an
  # existing code. Codons that were synonymous in the input stay synonymous in
  # the output; only the identity of the encoded symbol changes.
  # Parameters:
  #    GC   named chr  length-64 character vector of 20 amino acid one-letter
  #                      codes plus "*" (stop), named with the codon triplet.
  # Value:  named chr  the same vector (same codon names) in which every
  #                      symbol has been replaced by its randomly chosen
  #                      partner.
  residues <- unique(GC)
  # A lookup vector: names are the original symbols, values their replacements.
  permuted <- structure(sample(residues, length(residues)), names = residues)
  GC[1:64] <- permuted[GC]          # translate every codon's symbol via lookup
  GC
}
# == 2.2 Effect of mutations ===============================================
# To evaluate the effects of mutations we will do the following:
# - we take an amino acid sequence (Mbp1 will do just nicely);
# - we reverse-translate it into a nucleotide sequence;
# - we mutate it randomly;
# - we translate it back to amino acids;
# - we count the number of mutations and evaluate their severity.
# === 2.2.1 reverse-translate
# To reverse-translate an amino acid vector, we randomly pick one of its
# codons from a genetic code, and assemble all codons to a sequence.
traRev <- function(s, GC) {
  # Reverse-translate an amino acid sequence: for every residue pick one of the
  # codons that encode it in GC (at random if there are several).
  # Parameters:
  #    s   chr  a sequence vector, one amino acid symbol per element
  #    GC  chr  a genetic code: amino acid symbols, named with their codons
  # Value:
  #    chr  A reverse-translated vector of codons, one per element of s
  # Errors:
  #    stops with an informative message if a residue of s is not encoded by
  #    any codon of GC (previously this surfaced as the cryptic
  #    "replacement has length zero").
  vC <- character(length(s))
  for (i in seq_along(s)) {
    codon <- names(GC)[GC == s[i]]            # get all codons for this AA
    if (length(codon) == 0) {                 # residue unknown to this code
      stop(sprintf("traRev(): no codon encodes \"%s\" (position %d) in GC.",
                   s[i], i),
           call. = FALSE)
    }
    if (length(codon) > 1) {                  # if there's more than one ...
      codon <- sample(codon, 1)               # pick one at random ...
    }
    vC[i] <- codon                            # store it
  }
  return(vC)
}
# === 2.2.2 Randomly mutate
# To mutate, we split a codon into its three nucleotides, then randomly replace
# one of the three with another nucleotide.
randMut <- function(vC) {
  # Introduce exactly one point mutation into every codon of vC.
  # Parameter:
  #    vC   chr   a vector of codons
  # Value:  chr   the vector vC in which, for every codon, one randomly chosen
  #               position has been replaced by a different nucleotide
  bases <- c("A", "C", "G", "T")
  for (k in seq_along(vC)) {
    letters3 <- strsplit(vC[k], "")[[1]]              # the three nucleotides
    pos <- sample.int(3, 1)                           # position to mutate
    alternatives <- bases[bases != letters3[pos]]     # anything but the original
    letters3[pos] <- sample(alternatives, 1)          # choose the replacement
    vC[k] <- paste(letters3, collapse = "")           # reassemble the codon
  }
  return(vC)
}
# === 2.2.3 Forward- translate
traFor <- function(vC, GC) {
  # Translate codons into amino acids by looking every codon up, by name, in
  # the genetic code.
  # Parameters:
  #    vC   chr   a codon vector
  #    GC   chr   a genetic code: amino acid symbols, named with their codons
  # Value:
  #    chr  A vector of amino acids, one per codon and without names; a codon
  #         that is not a name of GC yields NA
  # Indexing by the whole codon vector performs all lookups in one step;
  # as.character() drops the codon names that the lookup attaches.
  return(as.character(GC[vC]))
}
# === 2.2.4 measure effect
# How do we evaluate the effect of the mutation? We'll take a simple ad hoc
# approach: we divide amino acids into hydrophobic, hydrophilic, and neutral
# categories, according to their free energy of transfer from water to octanol:
# NOTE: evalMut() below defines its own local copies of these three vectors, so
# editing the global versions here does not change how evalMut() scores.
aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F") # hydrophobic
aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R") # hydrophilic
aaNeutral <- c("A", "H", "T", "S", "V", "G") # neutral
# Then we will penalize as follows:
# Changes within one category: 0.1
# Changes from hydrophobic or hydrophilic to neutral or back: 0.3
# Changes from hydrophobic to hydrophilic or back: 1.0
# Changes to stop-codon: 3.0
evalMut <- function(nat, mut, stopPenalty = 3.0) {
  # Evaluate severity of mutations between amino acid sequence vectors nat and
  # mut in an ad hoc approach based on hydrophobicity changes.
  # Parameters:
  #    nat          chr  the native (unmutated) amino acid sequence vector
  #    mut          chr  the mutated sequence vector, compared to nat position
  #                      by position (expected to have the same length);
  #                      "*" denotes a stop codon
  #    stopPenalty  num  penalty for a change to a stop codon. The default 3.0
  #                      is the value stated for this scoring scheme. Earlier
  #                      versions of this function did not apply it at all;
  #                      pass stopPenalty = 0 to reproduce their scores.
  # Value:
  #    num  the sum of all penalties. Unchanged positions contribute 0,
  #         changes within a category 0.1, changes between hydrophobic or
  #         hydrophilic and neutral 0.3, changes between hydrophobic and
  #         hydrophilic 1.0, changes to stop stopPenalty.

  aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
  aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
  aaNeutral <- c("A", "H", "T", "S", "V", "G")

  penalties <- numeric(length(nat))
  lMut <- nat != mut                  # logical TRUE for all mutated positions

  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaHphobic)] <- 0.1
  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaHphilic)] <- 1.0
  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaNeutral)] <- 0.3

  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaHphobic)] <- 1.0
  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaHphilic)] <- 0.1
  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaNeutral)] <- 0.3

  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaHphobic)] <- 0.3
  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaHphilic)] <- 0.3
  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaNeutral)] <- 0.1

  # "*" belongs to none of the three categories, so without this line a
  # nonsense mutation would silently score 0 - the same as no change at all.
  penalties[lMut & (mut == "*")] <- stopPenalty

  return(sum(penalties))
}
# A more sophisticated approach could take additional quantities into account,
# such as charge, size, or flexibility - and it could add heuristics, such as:
# proline is always bad in secondary structure, charged amino acids are terrible
# in the folded core of a protein, replacing a small by a large amino acid in
# the core is very disruptive ... etc.
#
# For our experiment, we should not use a mutation data matrix however:
# empirical mutation probabilities are superbly suited to estimate evolutionary
# relationships. Here however, as we are trying to evaluate effects of random
# mutations on genetic codes, our reasoning would be circular - we would
# discover that the natural genetic code is optimal ... because it is most
# similar to the natural genetic code. That would be Cargo Cult bioinformatics.
# = 3 Run the experiment ==================================================
# Fetch the standard Genetic code from Biostrings::
stdCode <- Biostrings::GENETIC_CODE
# Fetch the nucleotide sequence for MBP1:
# [-1] discards the first line of the file (presumably the FASTA header line -
# TODO confirm against the data file).
myDNA <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")[-1]
# join the remaining lines into one single string ...
myDNA <- paste0(myDNA, collapse = "")
# ... and cut it into a character vector with one element per codon
myDNA <- as.character(Biostrings::codons(Biostrings::DNAString(myDNA)))
myDNA <- myDNA[-length(myDNA)] # drop the stop codon
# the unmutated protein sequence: the reference for all evalMut() calls below
myAA <- traFor(myDNA, stdCode)
# Mutate and evaluate
# Only randMut() uses random numbers here, so the seed is set just around it.
set.seed(112358)
x <- randMut(myDNA)
set.seed(NULL)
x <- traFor(x, stdCode)
evalMut(myAA, x) # 166.4
# Try this 200 times, and see how the values are distributed.
# 200 independent trials under the standard code: every trial mutates each
# codon of myDNA once, translates the result, and records the summed penalty.
N <- 200
valSTDC <- numeric(N) # preallocated: one summed penalty per trial
set.seed(112358) # set RNG seed for repeatable randomness
for (i in 1:N) { # this takes a few seconds ...
x <- randMut(myDNA) # mutate
x <- traFor(x, stdCode) # translate
valSTDC[i] <- evalMut(myAA, x) # evaluate
}
set.seed(NULL) # reset the RNG
# This call opens the plot. The histograms for the alternative codes are later
# drawn onto it with add = TRUE, so the axis limits and the title chosen here
# apply to them as well.
hist(valSTDC,
breaks = 15,
col = "palegoldenrod",
xlim = c(0, 400),
ylim = c(0, N/4),
main = "Standard vs. Synthetic Genetic Code",
xlab = "Mutation penalty")
# This looks like a normal distribution. Let's assume the effect of mutations
# under the standard genetic code is the mean of this distribution:
effectSTDC <- mean(valSTDC) # 178.1
# Now we can look at the effects of alternate genetic codes:
# One trial with a single synthetic code. Three steps consume random numbers:
# randomGC(), traRev() and randMut(). The seed therefore stays in force until
# all three have run; resetting it directly after randomGC() would leave the
# codon choice and the mutations unseeded, and the result would be different
# in every run.
set.seed(112358)
# choose a new code
GC <- randomGC(stdCode)
# reverse translate hypothetical sequence according to the new code
x <- traRev(myAA, GC)
x <- randMut(x)      # randomly mutate hypothetical nucleotide sequence
set.seed(NULL)       # reset the RNG
x <- traFor(x, GC)   # translate back, with the new code
evalMut(myAA, x)     # evaluate mutation effects
                     # (298.5 in the author's original, partly unseeded run)
# That seems a fair bit higher than what we saw as "effectSTDC"
# Let's try with different genetic codes. 200 trials - but this time every trial
# is with a different, synthetic genetic code.
# Every trial draws its own random code, reverse-translates the native protein
# with it, mutates, translates back with the same code, and scores the result.
N <- 200
valXGC <- numeric(N) # preallocated: one summed penalty per trial
set.seed(1414214) # set RNG seed for repeatable randomness
for (i in 1:N) {
GC <- randomGC(stdCode) # Choose code
x <- traRev(myAA, GC) # reverse translate
x <- randMut(x) # mutate
x <- traFor(x, GC) # translate
valXGC[i] <- evalMut(myAA, x) # evaluate
}
set.seed(NULL) # reset the RNG
# add = TRUE overlays this histogram on the plot that is already open
hist(valXGC,
col = "plum",
breaks = 15,
add = TRUE)
# These two distributions are very widely separated!
# Task: Perform the same experiment with the swapped genetic code.
# Compare the distributions. Interpret the result.
# These are simple experiments, under assumptions that can be refined in
# meaningful ways. Yet, even those simple computational experiments show
# that the Universal Genetic Code has features that one would predict if
# it has evolved under selective pressure to minimize the effects of mutations.
# Gradual change under mutation is beneficial to evolution, disruptive
# change is not.
# = 4 Task solutions ======================================================
# Solution to the task: repeat the 200-trial experiment, but draw a fresh
# swappedGC() code in every trial, then overlay the resulting distribution.
N <- 200
valSGC <- numeric(N) # preallocated: one summed penalty per trial
set.seed(2718282) # set RNG seed for repeatable randomness
for (i in 1:N) {
GC <- swappedGC(stdCode) # Choose code
x <- traRev(myAA, GC) # reverse translate
x <- randMut(x) # mutate
x <- traFor(x, GC) # translate
valSGC[i] <- evalMut(myAA, x) # evaluate
}
set.seed(NULL) # reset the RNG
# add = TRUE draws onto the plot that is already open instead of starting a new
# one. The colour is written as "#RRGGBBAA", i.e. it carries an alpha
# (transparency) channel.
hist(valSGC,
col = "#6688FF88",
breaks = 15,
add = TRUE)
# [END]

View File

@ -1,50 +1,50 @@
# tocID <- "RPR-Introduction.R"
#
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Introduction unit
#
# Version: 1.0
#
# Date: 2020-09-18
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# V 1.0 Updated workflow; live
# V 0.1 First code
#
# TODO:
#
#
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
#
# DO NOT SIMPLY source() THESE FILES!
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
# === TASK: Local script
#
# - Open the file myScript.R
#
# - Create a section header with a date.
# - Enter an R-expression that will produce the first 11 powers of 2 (starting
# from 0). Not a loop - a single expression. The first number you get must
# be 1. The last number you get must be 1024.
#
# - Save the file in the myScripts folder, and close it.
#
# - Open the file again, select the expression and type Cmd+Enter (or Cmd+R)
# to execute it.
#
# - Done
# (This task is meant to make sure that writing R expressions, saving
# them in scripts, opening script files and executing code in the file works
# for you. If there is an issue, get in touch.)
# [END]
# tocID <- "RPR-Introduction.R"
#
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Introduction unit
#
# Version: 1.0
#
# Date: 2020-09-18
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# V 1.0 Updated workflow; live
# V 0.1 First code
#
# TODO:
#
#
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
#
# DO NOT SIMPLY source() THESE FILES!
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
# === TASK: Local script
#
# - Open the file myScript.R
#
# - Create a section header with a date.
# - Enter an R-expression that will produce the first 11 powers of 2 (starting
# from 0). Not a loop - a single expression. The first number you get must
# be 1. The last number you get must be 1024.
#
# - Save the file in the myScripts folder, and close it.
#
# - Open the file again, select the expression and type Cmd+Enter (or Cmd+R)
# to execute it.
#
# - Done
# (This task is meant to make sure that writing R expressions, saving
# them in scripts, opening script files and executing code in the file works
# for you. If there is an issue, get in touch.)
# [END]

View File

@ -1,168 +1,168 @@
# tocID <- "RPR-PROSITE_POST.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Scripting_data_downloads unit.
#
# Version: 1.2
#
# Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 2020 Maintenance
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# 1.0.1 Updates for slightly changed interfaces
# 1.0 First ABC units version
# 0.1 First code copied from 2016 material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------------------------------
#TOC> 1 Constructing a POST command from a Web query 43
#TOC> 1.1 Task - fetchPrositeFeatures() function 148
#TOC> 2 Task solutions 156
#TOC>
#TOC> ==========================================================================
# = 1 Constructing a POST command from a Web query ========================
if (! requireNamespace("httr", quietly = TRUE)) {
install.packages("httr")
}
# Package information:
# library(help = httr) # basic information
# browseVignettes("httr") # available vignettes
# data(package = "httr") # available datasets
# We have reverse engineered the Web form for a ScanProsite request, and can
# construct a valid POST request from knowing the required field names. The POST
# command is similar to GET(), but we need an explicit request body that
# contains a list of key/value pairs
UniProtID <- "P39678"
URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
response <- httr::POST(URL,
body = list(meta = "opt1",
meta1_protein = "opt1",
seq = UniProtID,
skip = "on",
output = "tabular"))
# Send off this request, and you should have a response in a few
# seconds. Let's check the status first:
httr::status_code(response) # If this is not 200, something went wrong and it
# makes no sense to continue. If this persists, ask
# on the Discussion Board what to do.
# The text content of the response is available with the
# content() function:
httr::content(response, "text")
# ... should show you the same as the page contents that you have seen in the
# browser. Now we need to extract the data from the page. For this simple
# example we can get away with using regular expressions, but in general we need
# a real XML parser to parse HTML. We'll cover that in a later unit. Here, we
# strsplit() the response into individual lines, since each of our data elements
# is on its own line, and then capture the contents. The way Prosite has
# formatted their HTML we can simply split on the "\\n" newline character - but
# they could write the same valid HTML without any newline-characters at all.
# Understand that we are working with a bit of a "hack" here: exploiting
# empirical assumptions rather than a formal specification. But sometimes quick
# and dirty is fine, because quick.
lines <- unlist(strsplit(httr::content(response, "text"), "\\n"))
head(lines)
# Now we define a query pattern for the lines we want:
# we can use the uID, bracketed by two "|" pipe
# characters:
patt <- sprintf("\\|%s\\|", UniProtID)
# ... and select only the lines that match this
# pattern:
( lines <- lines[grep(patt, lines)] )
# ... captures the three lines of output.
# Now we break the lines apart into tokens: this is another application of
# strsplit(), but this time we split either on "pipe" characters, "|" OR on tabs
# "\t". Look at the regex "\\t|\\|" in the strsplit() call:
unlist(strsplit(lines[1], "\\t|\\|"))
# Its parts are (\\t)=tab (|)=or (\\|)=pipe. Both "t" and "|" need to be escaped
# with a backslash. "t" has to be escaped because we want to match a tab (\t),
# not the literal character "t". And "|" has to be escaped because we mean the
# literal pipe character, not its metacharacter meaning OR. Thus sometimes the
# backslash turns a special meaning off, and sometimes it turns a special
# meaning on. Unfortunately there's no easy way to tell - you just need to
# remember the characters - or have a reference handy. The metacharacters are
# (){}[]^$?*+.|&- ... and some of them have different meanings depending on
# where in the regex they are.
# Let's put the tokens into named slots of a data frame
# Parse every matching line into a one-row data frame. The rows are collected
# in a list and bound together with a single rbind() call, rather than growing
# "features" with rbind() inside a loop, which copies the whole data frame
# again on every iteration.
featureRows <- lapply(unname(lines), function(line) {
  tokens <- unlist(strsplit(line, "\\t|\\|"))    # split on tab OR pipe
  data.frame(uID    = tokens[2],
             start  = as.numeric(tokens[4]),
             end    = as.numeric(tokens[5]),
             psID   = tokens[6],
             psName = tokens[7],
             psSeq  = tokens[11])
})
if (length(featureRows) > 0) {
  features <- do.call(rbind, featureRows)
} else {
  features <- data.frame()    # no matching lines: an empty data frame
}
features
# This forms the base of a function that collects the features automatically
# from a PrositeScan result. You can write this!
# == 1.1 Task - fetchPrositeFeatures() function ============================
# Task: write a function that takes as input a UniProt ID, fetches the
# features it contains from ScanProsite and returns a data frame as given above, or
# an empty data frame if there is an error.
# = 2 Task solutions ======================================================
# I have placed such a function into the ABC-dbUtilities.R script: look it up by
# clicking on dbFetchPrositeFeatures() in the Environment pane.
# Test:
dbFetchPrositeFeatures("Q5KMQ9")
# [END]
# tocID <- "RPR-PROSITE_POST.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Scripting_data_downloads unit.
#
# Version: 1.2
#
# Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 2020 Maintenance
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# 1.0.1 Updates for slightly changed interfaces
# 1.0 First ABC units version
# 0.1 First code copied from 2016 material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------------------------------
#TOC> 1 Constructing a POST command from a Web query 43
#TOC> 1.1 Task - fetchPrositeFeatures() function 148
#TOC> 2 Task solutions 156
#TOC>
#TOC> ==========================================================================
# = 1 Constructing a POST command from a Web query ========================
if (! requireNamespace("httr", quietly = TRUE)) {
install.packages("httr")
}
# Package information:
# library(help = httr) # basic information
# browseVignettes("httr") # available vignettes
# data(package = "httr") # available datasets
# We have reverse engineered the Web form for a ScanProsite request, and can
# construct a valid POST request from knowing the required field names. The POST
# command is similar to GET(), but we need an explicit request body that
# contains a list of key/value pairs
UniProtID <- "P39678"
URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
response <- httr::POST(URL,
body = list(meta = "opt1",
meta1_protein = "opt1",
seq = UniProtID,
skip = "on",
output = "tabular"))
# Send off this request, and you should have a response in a few
# seconds. Let's check the status first:
httr::status_code(response) # If this is not 200, something went wrong and it
# makes no sense to continue. If this persists, ask
# on the Discussion Board what to do.
# The text content of the response is available with the
# content() function:
httr::content(response, "text")
# ... should show you the same as the page contents that you have seen in the
# browser. Now we need to extract the data from the page. For this simple
# example we can get away with using regular expressions, but in general we need
# a real XML parser to parse HTML. We'll cover that in a later unit. Here, we
# strsplit() the response into individual lines, since each of our data elements
# is on its own line, and then capture the contents. The way Prosite has
# formatted their HTML we can simply split on the "\\n" newline character - but
# they could write the same valid HTML without any newline-characters at all.
# Understand that we are working with a bit of a "hack" here: exploiting
# empirical assumptions rather than a formal specification. But sometimes quick
# and dirty is fine, because quick.
lines <- unlist(strsplit(httr::content(response, "text"), "\\n"))
head(lines)
# Now we define a query pattern for the lines we want:
# we can use the uID, bracketed by two "|" pipe
# characters:
patt <- sprintf("\\|%s\\|", UniProtID)
# ... and select only the lines that match this
# pattern:
( lines <- lines[grep(patt, lines)] )
# ... captures the three lines of output.
# Now we break the lines apart into tokens: this is another application of
# strsplit(), but this time we split either on "pipe" characters, "|" OR on tabs
# "\t". Look at the regex "\\t|\\|" in the strsplit() call:
unlist(strsplit(lines[1], "\\t|\\|"))
# Its parts are (\\t)=tab (|)=or (\\|)=pipe. Both "t" and "|" need to be escaped
# with a backslash. "t" has to be escaped because we want to match a tab (\t),
# not the literal character "t". And "|" has to be escaped because we mean the
# literal pipe character, not its metacharacter meaning OR. Thus sometimes the
# backslash turns a special meaning off, and sometimes it turns a special
# meaning on. Unfortunately there's no easy way to tell - you just need to
# remember the characters - or have a reference handy. The metacharacters are
# (){}[]^$?*+.|&- ... and some of them have different meanings depending on
# where in the regex they are.
# Let's put the tokens into named slots of a data frame
# Parse every matching line into a one-row data frame. The rows are collected
# in a list and bound together with a single rbind() call, rather than growing
# "features" with rbind() inside a loop, which copies the whole data frame
# again on every iteration.
featureRows <- lapply(unname(lines), function(line) {
  tokens <- unlist(strsplit(line, "\\t|\\|"))    # split on tab OR pipe
  data.frame(uID    = tokens[2],
             start  = as.numeric(tokens[4]),
             end    = as.numeric(tokens[5]),
             psID   = tokens[6],
             psName = tokens[7],
             psSeq  = tokens[11])
})
if (length(featureRows) > 0) {
  features <- do.call(rbind, featureRows)
} else {
  features <- data.frame()    # no matching lines: an empty data frame
}
features
# This forms the base of a function that collects the features automatically
# from a PrositeScan result. You can write this!
# == 1.1 Task - fetchPrositeFeatures() function ============================
# Task: write a function that takes as input a UniProt ID, fetches the
# features it contains from ScanProsite and returns a data frame as given above, or
# an empty data frame if there is an error.
# = 2 Task solutions ======================================================
# I have placed such a function into the ABC-dbUtilities.R script: look it up by
# clicking on dbFetchPrositeFeatures() in the Environment pane.
# Test:
dbFetchPrositeFeatures("Q5KMQ9")
# [END]

View File

@ -1,135 +1,135 @@
# tocID <- "RPR-Pipe.R"
#
# Purpose: A Bioinformatics Course:
# Discussing pipe operators.
#
# Version: 1.0
#
# Date: 2021 10
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.0 New code
#
#
# TODO:
# - find more interesting examples
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ------------------------------------------------
#TOC> 1 Pipe Concept 41
#TOC> 2 Nested Expression 73
#TOC> 3 magrittr:: Pipe 78
#TOC> 4 Base R Pipe 93
#TOC> 5 Intermediate Assignment 108
#TOC> 6 Postscript 127
#TOC>
#TOC> ==========================================================================
# = 1 Pipe Concept =======================================================
# Pipes are actually an awesome idea for any code that implements a workflow -
# a sequence of operations, each of which transforms data in a specialized way.
#
# This principle is familiar from maths: chained functions. If I have a function
# y = f(x) and want to use those results as in z = g(y), I can just write
# z = g(f(x))
#
# On the unix command line, pipes were used from the very beginning, implemented
# with the "|" pipe character.
#
# In R, the magrittr package provided the %>% operator, and recently the |>
# operator has been introduced into base R.
#
# However there are alternatives: intermediate assignment, and nested functions
# that have always existed in base R anyway.
#
# Let us look at an example. In writing this, I found out that virtually
# ALL non-trivial examples I came up with don't translate well into this idiom
# at all. It is actually quite limited to simple filtering operations on
# data. A more interesting example might be added in the future, let me know if
# you have a good idea.
#
# A somewhat contrived example is to sort a list of files by the
# length of the file names:
myFiles <- list.files(pattern = "\\.R$")
# nchar() gives the number of characters in a string, order() produces indices
# that map an array to its sorted form.
#
# = 2 Nested Expression ===================================================
myFiles[order(nchar(myFiles))]
# = 3 magrittr:: Pipe =====================================================
if (! requireNamespace("magrittr", quietly = TRUE)) {
install.packages("magrittr")
}
# Package information:
# library(help = magrittr) # basic information
# browseVignettes("magrittr") # available vignettes
# data(package = "magrittr") # available datasets
library(magrittr)
myFiles %>% nchar %>% order %>% myFiles[.]
# = 4 Base R Pipe =========================================================
# Since version 4.1, base R now supports a pipe operator without the need
# to load a special package. Such an introduction of external functionality
# into the language is very rare.
#
# Unfortunately it won't (yet) work with the '[' function, so we need to write
# an intermediate function for this example
extract <- function(x, v) {
  # Helper that lets "[" take part in a base-R pipe: the piped-in value arrives
  # as the first argument x and is used as the index into v.
  # Parameters:
  #    x   an index vector (here: the output of order())
  #    v   the vector to subset
  # Value: the elements of v selected by x
  v[x]
}
myFiles |> nchar() |> order() |> extract(myFiles)
# = 5 Intermediate Assignment =============================================
# So what's the problem? As you can see, the piped code may be concise and
# expressive. But there is also a large amount of implicit assignment and
# processing going on and that is usually a bad idea because it makes code hard
# to maintain. I am NOT a big fan of the nested syntax, but I don't think that
# replacing it with the pipe makes things much better. My preferred idiom is
# to use intermediate assignments. Only then is it convenient to examine
# the code step by step and validate every single step. And that is the most
# important objective at all: no code is good if it does not compute
# correctly.
x <- nchar(myFiles)
x <- order(x)
myFiles[x]
# = 6 Postscript ==========================================================
# I tried to write an example that strips all comments from a list of files, and
# another example that finds all files that were not yet updated this year
# (according to the "# Date:" line in the header). Neither example can be well
# written without intermediate assignments, or at least sapply() functions
# that are not simpler at all than the intermediate assignment.
# [END]
# tocID <- "RPR-Pipe.R"
#
# Purpose: A Bioinformatics Course:
# Discussing pipe operators.
#
# Version: 1.0
#
# Date: 2021 10
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.0 New code
#
#
# TODO:
# - find more interesting examples
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ------------------------------------------------
#TOC> 1 Pipe Concept 41
#TOC> 2 Nested Expression 73
#TOC> 3 magrittr:: Pipe 78
#TOC> 4 Base R Pipe 93
#TOC> 5 Intermediate Assignment 108
#TOC> 6 Postscript 127
#TOC>
#TOC> ==========================================================================
# = 1 Pipe Concept =======================================================
# Pipes are actually an awesome idea for any code that implements a workflow -
# a sequence of operations, each of which transforms data in a specialized way.
#
# This principle is familiar from maths: chained functions. If I have a
# function y = f(x) and want to use those results as in z = g(y), I can just
# write
#   z = g(f(x))
#
# On the unix command line, pipes were used from the very beginning, implemented
# with the "|" pipe character.
#
# In R, the magrittr package provided the %>% operator, and recently the |>
# operator has been introduced into base R.
#
# However there are alternatives: intermediate assignment, and nested functions
# that have always existed in base R anyway.
#
# Let us look at an example. In writing this, I found out that virtually
# ALL non-trivial examples I came up with don't translate well into this idiom
# at all. It is actually quite limited to simple filtering operations on
# data. A more interesting example might be added in the future, let me know if
# you have a good idea.
#
# A somewhat contrived example is to sort a list of files by the
# length of the file names:
myFiles <- list.files(pattern = "\\.R$")
# nchar() gives the number of characters in a string, order() produces indices
# that map an array to its sorted form.
#
# = 2 Nested Expression ===================================================
myFiles[order(nchar(myFiles))]
# = 3 magrittr:: Pipe =====================================================
if (! requireNamespace("magrittr", quietly = TRUE)) {
install.packages("magrittr")
}
# Package information:
# library(help = magrittr) # basic information
# browseVignettes("magrittr") # available vignettes
# data(package = "magrittr") # available datasets
library(magrittr)
# Each %>% hands its left-hand side to the next function as the first argument.
# In the last step the "." placeholder stands for the piped value, i.e. the
# ordering indices computed so far are used to subset myFiles.
myFiles %>% nchar %>% order %>% myFiles[.]
# = 4 Base R Pipe =========================================================
# Since version 4.1, base R now supports a pipe operator without the need
# to load a special package. Such an introduction of external functionality
# into the language is very rare.
#
# Unfortunately it won't (yet) work with the '[' function, so we need to write
# an intermediate function for this example
# Helper for the base R pipe example: subset v by the index vector x.
# The piped value arrives as the first argument (x); the vector that is to be
# subset is passed explicitly as the second argument (v).
extract <- function(x, v) v[x]
myFiles |> nchar() |> order() |> extract(myFiles)
# = 5 Intermediate Assignment =============================================
# So what's the problem? As you can see, the piped code may be concise and
# expressive. But there is also a large amount of implicit assignment and
# processing going on and that is usually a bad idea because it makes code hard
# to maintain. I am NOT a big fan of the nested syntax, but I don't think that
# replacing it with the pipe makes things much better. My preferred idiom is
# to use intermediate assignments. Only then is it convenient to examine
# the code step by step and validate every single step. And that is the most
# important objective at all: no code is good if it does not compute
# correctly.
x <- nchar(myFiles)
x <- order(x)
myFiles[x]
# = 6 Postscript ==========================================================
# I tried to write an example that strips all comments from a list of files, and
# another example that finds all files that were not yet updated this year
# (according to the "# Date:" field in the header). Neither example can be
# written well without intermediate assignments, or at least sapply() functions
# that are not simpler at all than the intermediate assignment.
# [END]

View File

@ -1,180 +1,180 @@
# tocID <- "RPR-RegEx.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-RegEx unit
#
# Version: 1.0
#
# Date: 2017-08 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# V 1.0 Maintenance 2020
# V 0.1 First code
#
# TODO:
#
#
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
#
# DO NOT SIMPLY source() THESE FILES!
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ----------------------------------------------------
#TOC> 1 A regex example 41
#TOC> 2 Counting lines 108
#TOC> 2.1 Counting C-alpha atoms only 126
#TOC> 3 Code Solutions 142
#TOC> 3.1 Counting atoms 144
#TOC> 3.2 Counting C-alpha records 160
#TOC>
#TOC> ==========================================================================
# = 1 A regex example =====================================================
# The canonical FASTA version of yeast Mbp1 at Uniprot
s <- ">sp|P39678|MBP1_YEAST Transcription factor MBP1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) GN=MBP1 PE=1 SV=1
MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK
ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA
SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR
KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ
QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS
PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY
FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS
IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP
SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT
ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP
VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK
IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR
QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK
IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"
nchar(s)
# Must be 969
# Task: Fetch the Uniprot ID by retrieving the first string that appears between
# two vertical bars ("pipes") in the header record.
#
# Develop the regular expression:
# Just five characters returned, so we know we are using
patt <- "^>(.{5})" # the right functions
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
patt <- "^>(.*)|" # everything to the pipe character
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
# Ooops - "|" is a metacharacter - we must escape it
patt <- "^>(.*)\|" # using "\|"
# Ooops - that's not how we escape: must double the \ to send a literal
# "\" plus the character "|" to the regex engine.
patt <- "^>(.*)\\|" # using "\\|"
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
# Good. Now let's first match everything that is not a "|", then match a "|"
patt <- "^>([^|]*)\\|"
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
# the same thing again, but capture the second match. And insist that there
# must be at least one character captured
patt <- "^>[^|]*\\|([^|]+)\\|"
# Analyze this pattern:
# ^ anchor the match at the beginning of the line
# > ">" must be the first character
# [^|]* all-characters-except-a-vertical-bar, 0 or more times because
# we don't know what other versions of the string "sp"
# might appear. Note that within the brackets "|" is NOT a
# metacharacter.
# \\| "|" character: outside of square brackets "|" is a metacharacter
# and means "OR"; we need to escape it to match a literal "|".
# ( open parenthesis: capture what comes next ...
# [^|]+ all-characters-except-a-vertical-bar, 1 or more times
# ) close parenthesis: stop capturing here
# \\| second "|" character, escaped
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
# = 2 Counting lines ======================================================
# Task: Write a function that returns the number of atoms in a PDB file. Call it
# atomCount(). Sample data is here:
myPDB <- readLines("./data/0TST.pdb")
# Specification:
# Read a file from its path given as the only argument.
# Return the number of lines in that file that begin with "ATOM "
# or with "HETATM".
# Try this. Write a function. Solution code is at the end of this file.
# Don't peek.
atomCount("./data/0TST.pdb") # must return 6
# == 2.1 Counting C-alpha atoms only =======================================
# Task: write a function based on the previous one that matches only CA records,
# i.e. it can be used to count the number of amino acids. Don't get
# fooled by calcium atoms, or the string CA appearing elsewhere.
# cf. https://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM
# Specification:
# Read a file from its path given as the only argument.
# Return the number of lines in that file that have a C-alpha atom.
# Try this. Solution code is at the end of this file. Don't peek.
CAcount("./data/0TST.pdb") # must return 1
# = 3 Code Solutions ======================================================
# == 3.1 Counting atoms ====================================================
atomCount <- function(IN) {
  # Purpose:
  #    Report how many atom records a PDB formatted file contains.
  # Parameters:
  #    IN   chr   path of the file to read
  # Value:
  #    integer  number of lines that start with "ATOM " or with "HETATM"
  # Note: both alternatives of the pattern are anchored with "^". Without the
  #       anchor the record names could also be found in the middle of a
  #       line, e.g. in a comment.
  isAtomRecord <- grepl("(^ATOM )|(^HETATM)", readLines(IN))
  sum(isAtomRecord)
}
# == 3.2 Counting C-alpha records ==========================================
CAcount <- function(IN) {
  # count the number of C-alpha atoms in a PDB formatted file
  # Parameters:
  #    IN   chr   path of the file to read
  # Value:
  #    integer  number of lines that are ATOM records and have the atom
  #             name " CA " in columns 13 - 16.
  # Note: the regex MUST be aligned into the right position, otherwise it
  #       might match Calcium records! PDB records are fixed-width:
  #         columns  1 -  6   record name: "ATOM" padded with two blanks
  #         columns  7 - 12   atom serial number and one blank: skipped
  #         columns 13 - 16   atom name: " CA " for a C-alpha atom
  #       Explicit quantifiers are used for the padding ( " {2}" and ".{6}" )
  #       so that the alignment does not depend on counting blanks by eye.
  x <- readLines(IN)
  patt <- "^ATOM {2}.{6} CA "
  return(length(grep(patt, x)))
}
# [END]
# tocID <- "RPR-RegEx.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-RegEx unit
#
# Version: 1.0
#
# Date: 2017-08 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# V 1.0 Maintenance 2020
# V 0.1 First code
#
# TODO:
#
#
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
#
# DO NOT SIMPLY source() THESE FILES!
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ----------------------------------------------------
#TOC> 1 A regex example 41
#TOC> 2 Counting lines 108
#TOC> 2.1 Counting C-alpha atoms only 126
#TOC> 3 Code Solutions 142
#TOC> 3.1 Counting atoms 144
#TOC> 3.2 Counting C-alpha records 160
#TOC>
#TOC> ==========================================================================
# = 1 A regex example =====================================================
# The canonical FASTA version of yeast Mbp1 at Uniprot
s <- ">sp|P39678|MBP1_YEAST Transcription factor MBP1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) GN=MBP1 PE=1 SV=1
MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK
ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA
SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR
KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ
QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS
PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY
FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS
IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP
SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT
ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP
VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK
IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR
QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK
IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"
nchar(s)
# Must be 969
# Task: Fetch the Uniprot ID by retrieving the first string that appears between
# two vertical bars ("pipes") in the header record.
#
# Develop the regular expression:
# Just five characters returned, so we know we are using
patt <- "^>(.{5})" # the right functions
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
patt <- "^>(.*)|" # everything to the pipe character
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
# Ooops - "|" is a metacharacter - we must escape it
patt <- "^>(.*)\|" # using "\|"
# Ooops - that's not how we escape: must double the \ to send a literal
# "\" plus the character "|" to the regex engine.
patt <- "^>(.*)\\|" # using "\\|"
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
# Good. Now let's first match everything that is not a "|", then match a "|"
patt <- "^>([^|]*)\\|"
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
# the same thing again, but capture the second match. And insist that there
# must be at least one character captured
patt <- "^>[^|]*\\|([^|]+)\\|"
# Analyze this pattern:
# ^ anchor the match at the beginning of the line
# > ">" must be the first character
# [^|]* all-characters-except-a-vertical-bar, 0 or more times because
# we don't know what other versions of the string "sp"
# might appear. Note that within the brackets "|" is NOT a
# metacharacter.
# \\| "|" character: outside of square brackets "|" is a metacharacter
# and means "OR"; we need to escape it to match a literal "|".
# ( open parenthesis: capture what comes next ...
# [^|]+ all-characters-except-a-vertical-bar, 1 or more times
# ) close parenthesis: stop capturing here
# \\| second "|" character, escaped
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
# = 2 Counting lines ======================================================
# Task: Write a function that returns the number of atoms in a PDB file. Call it
# atomCount(). Sample data is here:
myPDB <- readLines("./data/0TST.pdb")
# Specification:
# Read a file from its path given as the only argument.
# Return the number of lines in that file that begin with "ATOM "
# or with "HETATM".
# Try this. Write a function. Solution code is at the end of this file.
# Don't peek.
atomCount("./data/0TST.pdb") # must return 6
# == 2.1 Counting C-alpha atoms only =======================================
# Task: write a function based on the previous one that matches only CA records,
# i.e. it can be used to count the number of amino acids. Don't get
# fooled by calcium atoms, or the string CA appearing elsewhere.
# cf. https://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM
# Specification:
# Read a file from its path given as the only argument.
# Return the number of lines in that file that have a C-alpha atom.
# Try this. Solution code is at the end of this file. Don't peek.
CAcount("./data/0TST.pdb") # must return 1
# = 3 Code Solutions ======================================================
# == 3.1 Counting atoms ====================================================
atomCount <- function(IN) {
  # Purpose:
  #    Report how many atom records a PDB formatted file contains.
  # Parameters:
  #    IN   chr   path of the file to read
  # Value:
  #    integer  number of lines that start with "ATOM " or with "HETATM"
  # Note: both alternatives of the pattern are anchored with "^". Without the
  #       anchor the record names could also be found in the middle of a
  #       line, e.g. in a comment.
  isAtomRecord <- grepl("(^ATOM )|(^HETATM)", readLines(IN))
  sum(isAtomRecord)
}
# == 3.2 Counting C-alpha records ==========================================
CAcount <- function(IN) {
  # count the number of C-alpha atoms in a PDB formatted file
  # Parameters:
  #    IN   chr   path of the file to read
  # Value:
  #    integer  number of lines that are ATOM records and have the atom
  #             name " CA " in columns 13 - 16.
  # Note: the regex MUST be aligned into the right position, otherwise it
  #       might match Calcium records! PDB records are fixed-width:
  #         columns  1 -  6   record name: "ATOM" padded with two blanks
  #         columns  7 - 12   atom serial number and one blank: skipped
  #         columns 13 - 16   atom name: " CA " for a C-alpha atom
  #       Explicit quantifiers are used for the padding ( " {2}" and ".{6}" )
  #       so that the alignment does not depend on counting blanks by eye.
  x <- readLines(IN)
  patt <- "^ATOM {2}.{6} CA "
  return(length(grep(patt, x)))
}
# [END]

File diff suppressed because it is too large Load Diff

View File

@ -1,135 +1,135 @@
# tocID <- "RPR-UniProt_GET.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Scripting_data_downloads unit.
#
# Version: 1.2
#
# Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 2020 Maintenance. Made dbFetchUniProtSeq() vector-safe and
# added FASTA headers as attribute
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 1.0 First ABC units version
# 0.1 First code copied from 2016 material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ----------------------------------------------------------
#TOC> 1 UniProt files via GET 43
#TOC> 1.1 Task - fetchUniProtSeq() function 105
#TOC> 2 Task solutions 118
#TOC>
#TOC> ==========================================================================
# = 1 UniProt files via GET ===============================================
# Perhaps the simplest example of scripted download is to retrieve a protein
# FASTA sequence from UniProt. All we need is to construct an URL with the
# correct UniProt ID.
# An interface between R scripts and Web servers is provided by the httr::
# package. This sends and receives information via the http protocol, just like
# a Web browser. Since this is a short and simple request, the GET verb is the
# right tool:
if (! requireNamespace("httr", quietly = TRUE)) {
install.packages("httr")
}
# Package information:
# library(help = httr) # basic information
# browseVignettes("httr") # available vignettes
# data(package = "httr") # available datasets
# The UniProt ID for Mbp1 is ...
UniProtID <- "P39678"
# and the base URL to retrieve data is ...
# http://www.uniprot.org/uniprot/ . We can construct a simple URL to
# retrieve a FASTA sequence:
(URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID))
# the GET() function from httr will get the data.
response <- httr::GET(URL)
str(response) # the response object is a bit complex ...
as.character(response) # ... but it is easy to pull out the data.
# to process ...
x <- as.character(response)
x <- strsplit(x, "\n")
dbSanitizeSequence(x)
# Simple.
# But what happens if there is an error, e.g. the uniprot ID does not exist?
response <- httr::GET("http://www.uniprot.org/uniprot/X000000.fasta")
as.character(response)
# this is a large HTML page that tells us the URL was not found. So we need to
# check for errors. The Right Way to do this is to evaluate the status code
# that every Web server returns for every transaction.
#
httr::status_code(response) # 404 == Page Not Found
# There are many possible codes, but the only code we will be happy with
# is 200 - oK.
# (cf. https://en.wikipedia.org/wiki/List_of_HTTP_status_codes )
URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID)
response <- httr::GET(URL)
httr::status_code(response)
# == 1.1 Task - fetchUniProtSeq() function =================================
# Task: write a function that
# - takes as input a vector of UniProt IDs,
# - fetches the FASTA sequence for each
# - returns a vector of the same length as the input, where an element is:
# - ... the sequence, if the query was successful
# - ... NA if there was an error
# - each element has the UniProt ID as the name()
# - bonus: the output has an attribute "headers" that is a vector of the
# FASTA headers ( cf. ?attr )
# = 2 Task solutions ======================================================
# I have placed such a function - dbFetchUniProtSeq() - into
# "./scripts/ABC-dbUtilities.R": look it up by clicking on dbFetchUniProtSeq()
# in the Environment pane.
# Test this:
( x <- dbFetchUniProtSeq("P39678") )
names(x)[1]
attr(x, "headers")[1]
x[1]
cat(writeFASTA(data.frame(head = attr(x, "headers")[1], seq =x[1]),
width = 40), sep = "\n")
# [END]
# tocID <- "RPR-UniProt_GET.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Scripting_data_downloads unit.
#
# Version: 1.2
#
# Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 2020 Maintenance. Made dbFetchUniProtSeq() vector-safe and
# added FASTA headers as attribute
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 1.0 First ABC units version
# 0.1 First code copied from 2016 material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ----------------------------------------------------------
#TOC> 1 UniProt files via GET 43
#TOC> 1.1 Task - fetchUniProtSeq() function 105
#TOC> 2 Task solutions 118
#TOC>
#TOC> ==========================================================================
# = 1 UniProt files via GET ===============================================
# Perhaps the simplest example of scripted download is to retrieve a protein
# FASTA sequence from UniProt. All we need is to construct an URL with the
# correct UniProt ID.
# An interface between R scripts and Web servers is provided by the httr::
# package. This sends and receives information via the http protocol, just like
# a Web browser. Since this is a short and simple request, the GET verb is the
# right tool:
if (! requireNamespace("httr", quietly = TRUE)) {
install.packages("httr")
}
# Package information:
# library(help = httr) # basic information
# browseVignettes("httr") # available vignettes
# data(package = "httr") # available datasets
# The UniProt ID for Mbp1 is ...
UniProtID <- "P39678"
# and the base URL to retrieve data is ...
# http://www.uniprot.org/uniprot/ . We can construct a simple URL to
# retrieve a FASTA sequence:
(URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID))
# the GET() function from httr will get the data.
response <- httr::GET(URL)
str(response) # the response object is a bit complex ...
as.character(response) # ... but it is easy to pull out the data.
# to process ...
x <- as.character(response)
x <- strsplit(x, "\n")
dbSanitizeSequence(x)
# Simple.
# But what happens if there is an error, e.g. the uniprot ID does not exist?
response <- httr::GET("http://www.uniprot.org/uniprot/X000000.fasta")
as.character(response)
# this is a large HTML page that tells us the URL was not found. So we need to
# check for errors. The Right Way to do this is to evaluate the status code
# that every Web server returns for every transaction.
#
httr::status_code(response) # 404 == Page Not Found
# There are many possible codes, but the only code we will be happy with
# is 200 - oK.
# (cf. https://en.wikipedia.org/wiki/List_of_HTTP_status_codes )
URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID)
response <- httr::GET(URL)
httr::status_code(response)
# == 1.1 Task - fetchUniProtSeq() function =================================
# Task: write a function that
# - takes as input a vector of UniProt IDs,
# - fetches the FASTA sequence for each
# - returns a vector of the same length as the input, where an element is:
# - ... the sequence, if the query was successful
# - ... NA if there was an error
# - each element has the UniProt ID as the name()
# - bonus: the output has an attribute "headers" that is a vector of the
# FASTA headers ( cf. ?attr )
# = 2 Task solutions ======================================================
# I have placed such a function - dbFetchUniProtSeq() - into
# "./scripts/ABC-dbUtilities.R": look it up by clicking on dbFetchUniProtSeq()
# in the Environment pane.
# Test this:
( x <- dbFetchUniProtSeq("P39678") )
names(x)[1]
attr(x, "headers")[1]
x[1]
cat(writeFASTA(data.frame(head = attr(x, "headers")[1], seq =x[1]),
width = 40), sep = "\n")
# [END]

View File

@ -1,234 +1,234 @@
# tocID <- "RPR-Unit_testing.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Unit_testing unit.
#
# Version: 1.2
#
# Date: 2017 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 2020 Updates. Discuss local tests.
# 1.1 Change from require() to requireNamespace()
# 1.0 New code
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -------------------------------------------------
#TOC> 1 Unit Tests with testthat 42
#TOC> 2 Organizing your tests 165
#TOC> 2.1 Testing scripts 189
#TOC> 2.2 Rethinking testing 202
#TOC> 3 Task solutions 220
#TOC>
#TOC> ==========================================================================
# = 1 Unit Tests with testthat ============================================
# The testthat package supports writing and executing unit tests in many ways.
if (! requireNamespace("testthat", quietly = TRUE)) {
install.packages("testthat")
}
# Package information:
# library(help = testthat) # basic information
# browseVignettes("testthat") # available vignettes
# data(package = "testthat") # available datasets
# testthat is one of those packages that we either use A LOT in a script,
# or not at all. Therefore it's more reasonable to depart from our usual
# <package>::<function>() idiom, and load the entire library. In fact, if
# we author packages, it is common practice to load testthat in the part
# of the package that automates testing.
library(testthat)
# An atomic test consists of an expectation about the behaviour of a function or
# the existence of an object. testthat provides a number of useful expectations:
# At the most basic level, you can use expect_true() and expect_false():
expect_true(file.exists("./data/S288C_YDL056W_MBP1_coding.fsa"))
expect_true(file.exists("NO-SUCH-FILE.txt"))
expect_false(is.integer(NA))
# More commonly, you will test for equality of an output with a given result.
# But you need to consider what it means for two numbers to be "equal" on a
# digital computer. Consider:
49*(1/49) == 1 # Surprised? Read FAQ 7.31
# https://cran.r-project.org/doc/FAQ/R-FAQ.html
49*(1/49) - 1 # NOT zero (but almost)
# This is really unpredictable ...
0.1 + 0.05 == 0.15
0.2 + 0.07 == 0.27
# It's easy to be caught on the wrong foot with numeric comparisons, therefore
# R uses the function all.equal() to test whether two numbers are equal for
# practical purposes up to machine precision.
49*(1/49) == 1
all.equal(49*(1/49), 1)
# The testthat function expect_equal() uses all.equal internally:
expect_equal(49*(1/49), 1)
# ... which is reasonable, or, if things MUST be exactly the same ...
expect_identical(49*(1/49), 1)
# ... but consider:
expect_identical(2, 2L) # one is typeof() "double", the other is "integer"
# Some very useful expectations are expect_warning(), and expect_error(), for
# constructing tests that check for erroneous output:
as.integer(c("1", "2", "three"))
expect_warning(as.integer(c("1", "2", "three"))) # Note that the warning is NOT
# printed.
1/"x"
expect_warning(1/"x")
expect_error(1/"x") # Again: note that the error is NOT printed, as well
# code execution will continue.
# Even better, you can check if the warning or error is what you expect it
# to be - because it could actually have occurred somewhere else in your code.
v <- c("1", "x")
log(v[1:2])
expect_error(log(v[1:2]), "non-numeric argument to mathematical function")
expect_error(log(v[1:2]), "non-numeric") # We can abbreviate the error message.
expect_error(log(v[1,2])) # This appears oK, but ...
expect_error(log(v[1,2]), "non-numeric") # ... it's actually a different error!
# Producing unit tests simply means: we define a function, and then we check
# whether all tests pass. Consider a function that is loaded on startup from
# the .utilities.R script:
biCode
# We could test it like so:
expect_equal(biCode(""), ".....")
expect_equal(biCode(" "), ".....")
expect_equal(biCode("123 12"), ".....")
expect_equal(biCode("h sapiens"), "H..SA")
expect_equal(biCode("homo sapiens"), "HOMSA")
expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA")
expect_equal(biCode(c("Phascolarctos cinereus", "Macropus rufus")),
c("PHACI", "MACRU"))
expect_error(biCode(), "argument \"s\" is missing, with no default")
# The test_that() function allows you to group related tests, include an
# informative message about which test is being executed, and run a number of
# tests that are passed to the function inside a code block - i.e. {...}
# test_that("<descriptive string>", {<code block>})
test_that("NA values are preserved", {
# biCode() respects vector length: input and output must have the same length.
# Therefore NA's can't be simply skipped, but must be properly passed
# into output:
expect_true(is.na((biCode(NA))))
expect_equal(biCode(c("first", NA, "last")),
c("FIRST", NA, "LAST."))
})
# Task: Write a function calcGC() that calculates GC content in a sequence.
# Hint: you could strsplit() the sequence into a vector, and count
# G's and C's; or you could use gsub("[AT]", "", <sequence>) to remove
# A's and T's, and use nchar() before and after to calculate the content
# from the length difference.
# Then write tests that:
# confirm that calcGC("AATT") is 0;
# confirm that calcGC("ATGC") is 0.5;
# confirm that calcGC("AC") is 0.5;
# confirm that calcGC("CGCG") is 1;
# = 2 Organizing your tests ===============================================
# Tests are only useful if they are actually executed and we need to make sure
# there are no barriers to do that. The testthat package supports automatic
# execution of tests:
# - put your tests into an R-script,
# - save your tests in a file called "test_<my-function-name>.R"
# - execute the test with test_file("test_<my-function-name>.R") ...
# ... or, if you are working on a project ...
# - place the file in a test-directory (e.g. the directory "tests" in this
# project),
# - execute all your tests with test_dir("<my-test-directory>")
# For example I have provided a "tests" directory with this project, and
# placed the file "test_biCode.R" inside.
file.show("./tests/test_biCode.R")
# Execute the file ...
test_file("./tests/test_biCode.R")
# .. or execute all the test files in the directory:
test_dir("./tests")
# == 2.1 Testing scripts ===================================================
# Scripts need special consideration since we do not necessarily source() them
# entirely. Therefore automated testing is not reasonable. What you can do
# instead is to place a conditional block at the end of your script, that
# never gets executed - then you can manually execute the code in the block
# whenever you wish to test your functions. For example:
if (FALSE) {
# ... your tests go here
}
# == 2.2 Rethinking testing ================================================
# However, it is important to keep in mind that different objectives lead to
# different ideas of what works best. There is never a "best" in and of itself,
# the question is always: "Best for what?" While automated unit testing is a
# great way to assure the integrity of packages and larger software artefacts as
# they are being developed, more loosely conceived aggregates of code - like the
# scripts for this course for example - have different objectives and in this
# case I find the testthat approach to actually be inferior. The reason is its
# tendency to physically separate code and tests. Keeping assets, and functions
# that operate on those assets separated is always poor design. I have found
# over time that a more stable approach is to move individual functions into
# their individual scripts, all in one folder, one function (and its helpers)
# per file, and examples, demos and tests in an if (FALSE) { ... } block, as
# explained above.
# = 3 Task solutions ======================================================
calcGC <- function(s) {
  # Purpose:
  #    Calculate the GC content of nucleotide sequence(s).
  # Parameters:
  #    s   chr   sequence string(s); upper and lower case are both accepted
  # Value:
  #    numeric  per element: the number of G and C characters divided by the
  #             number of A, C, G, and T characters. Other characters are
  #             ignored. NaN if an element contains no A, C, G, or T at all.
  nucleotides <- gsub("[^agctAGCT]", "", s)     # drop everything but ACGT
  gcOnly <- gsub("[atAT]", "", nucleotides)     # what remains is G and C
  nchar(gcOnly) / nchar(nucleotides)
}
expect_equal(calcGC("AATT"), 0)
expect_equal(calcGC("ATGC"), 0.5)
expect_equal(calcGC("AC"), 0.5)
expect_equal(calcGC("CGCG"), 1)
# [END]
# tocID <- "RPR-Unit_testing.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Unit_testing unit.
#
# Version: 1.2
#
# Date: 2017 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 2020 Updates. Discuss local tests.
# 1.1 Change from require() to requireNamespace()
# 1.0 New code
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -------------------------------------------------
#TOC> 1 Unit Tests with testthat 42
#TOC> 2 Organizing your tests 165
#TOC> 2.1 Testing scripts 189
#TOC> 2.2 Rethinking testing 202
#TOC> 3 Task solutions 220
#TOC>
#TOC> ==========================================================================
# = 1 Unit Tests with testthat ============================================
# The testthat package supports writing and executing unit tests in many ways.
if (! requireNamespace("testthat", quietly = TRUE)) {
install.packages("testthat")
}
# Package information:
# library(help = testthat) # basic information
# browseVignettes("testthat") # available vignettes
# data(package = "testthat") # available datasets
# testthat is one of those packages that we either use A LOT in a script,
# or not at all. Therefore it's more reasonable to depart from our usual
# <package>::<function>() idiom, and load the entire library. In fact, if
# we author packages, it is common practice to load testthat in the part
# of the package that automates testing.
library(testthat)
# An atomic test consists of an expectation about the behaviour of a function or
# the existence of an object. testthat provides a number of useful expectations:
# At the most basic level, you can use expect_true() and expect_false():
expect_true(file.exists("./data/S288C_YDL056W_MBP1_coding.fsa"))
expect_true(file.exists("NO-SUCH-FILE.txt"))
expect_false(is.integer(NA))
# More commonly, you will test for equality of an output with a given result.
# But you need to consider what it means for two numbers to be "equal" on a
# digital computer. Consider:
49*(1/49) == 1 # Surprised? Read FAQ 7.31
# https://cran.r-project.org/doc/FAQ/R-FAQ.html
49*(1/49) - 1 # NOT zero (but almost)
# This is really unpredictable ...
0.1 + 0.05 == 0.15
0.2 + 0.07 == 0.27
# It's easy to be caught on the wrong foot with numeric comparisons, therefore
# R uses the function all.equal() to test whether two numbers are equal for
# practical purposes up to machine precision.
49*(1/49) == 1
all.equal(49*(1/49), 1)
# The testthat function expect_equal() uses all.equal internally:
expect_equal(49*(1/49), 1)
# ... which is reasonable, or, if things MUST be exactly the same ...
expect_identical(49*(1/49), 1)
# ... but consider:
expect_identical(2, 2L) # one is typeof() "double", the other is integer"
# Some very useful expectations are expect_warning(), and expect_error(), for
# constructing tests that check for erroneous output:
as.integer(c("1", "2", "three"))
expect_warning(as.integer(c("1", "2", "three"))) # Note that the warning is NOT
# printed.
1/"x"
expect_warning(1/"x")
expect_error(1/"x") # Again: note that the error is NOT printed, as well
# code execution will continue.
# Even better, you can check if the warning or error is what you expect it
# to be - because it could actually have occurred somewhere else in your code.
v <- c("1", "x")
log(v[1:2])
expect_error(log(v[1:2]), "non-numeric argument to mathematical function")
expect_error(log(v[1:2]), "non-numeric") # We can abbreviate the error message.
expect_error(log(v[1,2])) # This appears oK, but ...
expect_error(log(v[1,2]), "non-numeric") # ... it's actually a different error!
# Producing unit tests simply means: we define a function, and then we check
# whether all tests pass. Consider a function that is loaded on startup from
# the .utilities.R script:
biCode
# We could test it like so:
expect_equal(biCode(""), ".....")
expect_equal(biCode(" "), ".....")
expect_equal(biCode("123 12"), ".....")
expect_equal(biCode("h sapiens"), "H..SA")
expect_equal(biCode("homo sapiens"), "HOMSA")
expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA")
expect_equal(biCode(c("Phascolarctos cinereus", "Macropus rufus")),
c("PHACI", "MACRU"))
expect_error(biCode(), "argument \"s\" is missing, with no default")
# The test_that() function allows to group related tests, include an informative
# message which test is being executed, and run a number of tests that are
# passed to the function inside a code block - i.e. {...}
# test_that("<descriptive string>", {<code block>})
test_that("NA values are preserved", {
# biCode() respects vector length: input and output must have the same length.
# Therefore NA's can't be simply skipped, but must be properly passed
# into output:
expect_true(is.na((biCode(NA))))
expect_equal(biCode(c("first", NA, "last")),
c("FIRST", NA, "LAST."))
})
# Task: Write a function calcGC() that calculates GC content in a sequence.
# Hint: you could strsplit() the sequence into a vector, and count
# G's and C's; or you could use gsub("[AT]", "", <sequence>) to remove
# A's and T's, and use nchar() before and after to calculate the content
# from the length difference.
# Then write tests that:
# confirm that calcGC("AATT") is 0;
# confirm that calcGC("ATGC") is 0.5;
# confirm that calcGC("AC") is 0.5;
# confirm that calcGC("CGCG") is 1;
# = 2 Organizing your tests ===============================================
# Tests are only useful if they are actually executed and we need to make sure
# there are no barriers to do that. The testthat package supports automatic
# execution of tests:
# - put your tests into an R-script,
# - save your tests in a file called "test_<my-function-name>.R"
# - execute the test with test_file("test_<my-function-name>.R") ...
# ... or, if you are working on a project ...
# - place the file in a test-directory (e.g. the directory "tests" in this
# project),
# - execute all your tests with test_dir("<my-test-directory>")
# For example I have provided a "tests" directory with this project, and
# placed the file "test_biCode.R" inside.
file.show("./tests/test_biCode.R")
# Execute the file ...
test_file("./tests/test_biCode.R")
# .. or execute all the test files in the directory:
test_dir("./tests")
# == 2.1 Testing scripts ===================================================
# Scripts need special consideration since we do not necessarily source() them
# entirely. Therefore automated testing is not reasonable. What you can do
# instead is to place a conditional block at the end of your script, that
# never gets executed - then you can manually execute the code in the block
# whenever you wish to test your functions. For example:
if (FALSE) {
# ... your tests go here
}
# == 2.2 Rethinking testing ================================================
# However, it is important to keep in mind that different objectives lead to
# different ideas of what works best. There is never a "best" in and of itself,
# the question is always: "Best for what?" While automated unit testing is a
# great way to assure the integrity of packages and larger software artefacts as
# they are being developed, more loosely conceived aggregates of code - like the
# scripts for this course for example - have different objectives and in this
# case I find the testthat approach to actually be inferior. The reason is its
# tendency to physically separate code and tests. Keeping assets, and functions
# that operate on those assets separated is always poor design. I have found
# over time that a more stable approach is to move individual functions into
# their individual scripts, all in one folder, one function (and its helpers)
# per file, and examples, demos and tests in an if (FALSE) { ... } block, as
# explained above.
# = 3 Task solutions ======================================================
calcGC <- function(s) {
  # Fraction of G and C among the unambiguous nucleotides of s.
  #
  # s  character vector of sequences. Every character that is not one of
  #    A, C, G, T (in either case) is discarded before counting.
  #
  # Returns one number per element of s: the count of G/C characters divided
  # by the count of A/C/G/T characters. The result is NaN for an element
  # that contains no nucleotide at all (0 / 0).
  nucleotides <- gsub("[^agctAGCT]", "", s)
  gcOnly <- gsub("[atAT]", "", nucleotides)
  nchar(gcOnly) / nchar(nucleotides)
}
expect_equal(calcGC("AATT"), 0)
expect_equal(calcGC("ATGC"), 0.5)
expect_equal(calcGC("AC"), 0.5)
expect_equal(calcGC("CGCG"), 1)
# [END]

View File

@ -1,166 +1,166 @@
# tocID <- "RPR-eUtils_XML.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Scripting_data_downloads unit.
#
# Version: 1.2.1
#
# Date: 2017-10 - 2021-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2.1 2021 Maintenance
# 1.2 2020 Updates
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 1.0 First ABC units version
# 0.1 First code copied from 2016 material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -----------------------------------------------------------
#TOC> 1 Working with NCBI eUtils 43
#TOC> 1.1 Task - fetchNCBItaxData() function 145
#TOC> 2 Task solutions 152
#TOC>
#TOC> ==========================================================================
# = 1 Working with NCBI eUtils ============================================
# To begin, we load the xml2 package that contains functions
# we need to receive and parse html data. NCBI's eUtils send information in
# XML format so we need to be able to parse XML.
if (! requireNamespace("xml2", quietly=TRUE)) {
install.packages("xml2")
}
# Package information:
# library(help = xml2) # basic information
# browseVignettes("xml2") # available vignettes
# data(package = "xml2") # available datasets
# We will walk through the process with the refSeqID
# of yeast Mbp1
refSeqID <- "NP_010227"
# First we build a query URL, starting from the base address of the
# eUtils programs ...
eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
# ... then we append a query for esearch, the program that finds an entry in
# an NCBI database. Searching the protein database for our refSeqID gets us
# the unique, NCBI-internal identifier of that entry.
URL <- paste0(eUtilsBase,
              "esearch.fcgi?",
              "db=protein",
              "&term=",
              refSeqID)
# Copy the URL and paste it into your browser to see
# what the response should look like.
URL
# To fetch a response in R, we use the function read_xml()
# with our URL as its argument.
( myXML <- xml2::read_xml(URL) )
# This is XML. We can take the response apart into
# its individual components with the as_list() function.
xml2::as_list(myXML)
# Note how the XML "tree" is represented as a list of
# lists of lists ...
# If we know exactly what element we are looking for,
# we can extract it from this structure:
xml2::as_list(myXML)[["eSearchResult"]][["IdList"]][["Id"]][[1]]
# But this is not very robust, it would break with the
# slightest change that the NCBI makes to their data format -
# and the NCBI changes things A LOT!
# Somewhat more robust is to specify the type of element
# we want - it's the text contained in an <Id>...</Id>
# element, and use the XPath XML parsing language to
# retrieve it.
xml2::xml_find_all(myXML, "//Id") # returns a "node set"
xml2::xml_text(xml2::xml_find_all(myXML, "//Id")) # returns the contents
# of the node set
# We will need to do this more than once, so we write a function
# for it...
node2text <- function(doc, tag) {
  # Extract the text content of all elements with a given tag name from an
  # XML response.
  #
  # doc  the XML document to search, as passed to xml2::xml_find_all().
  # tag  the element name, without angle brackets (e.g. "Id").
  #
  # Returns a character vector with one string per matching element.
  # The XPath expression "//<tag>" matches the element at any depth.
  xml2::xml_text(xml2::xml_find_all(doc, paste0("//", tag)))
}
# using node2text() ...
(GID <- node2text(myXML, "Id"))
# The GI is the pivot for data requests at the
# NCBI.
# Let's first get the associated data for this GI
URL <- paste0(eUtilsBase,
"esummary.fcgi?",
"db=protein",
"&id=",
GID,
"&version=2.0")
(myXML <- xml2::read_xml(URL))
(taxID <- node2text(myXML, "TaxId"))
(organism <- node2text(myXML, "Organism"))
# This forms the base of a function that gets taxonomy data
# from an Entrez result. You can write this!
# == 1.1 Task - fetchNCBItaxData() function ================================
# Task: write a function that takes as input a RefSeq ID, fetches the taxonomy
# information, returns a list with taxID and organism, if the operation is
# successful, or a list of length 0 if there is an error.
# = 2 Task solutions ======================================================
# I have placed such a function into the dbUtilities script: look it up by
# clicking on dbFetchNCBItaxData() in the Environment pane.
# Test:
dbFetchNCBItaxData("XP_001837394")
# Expected output:
# ----------------
# taxID organism
# 1 240176 Coprinopsis cinerea okayama7#130
# [END]
# tocID <- "RPR-eUtils_XML.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Scripting_data_downloads unit.
#
# Version: 1.2.1
#
# Date: 2017-10 - 2021-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2.1 2021 Maintenance
# 1.2 2020 Updates
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 1.0 First ABC units version
# 0.1 First code copied from 2016 material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -----------------------------------------------------------
#TOC> 1 Working with NCBI eUtils 43
#TOC> 1.1 Task - fetchNCBItaxData() function 145
#TOC> 2 Task solutions 152
#TOC>
#TOC> ==========================================================================
# = 1 Working with NCBI eUtils ============================================
# To begin, we load the xml2 package that contains functions
# we need to receive and parse html data. NCBI's eUtils send information in
# XML format so we need to be able to parse XML.
if (! requireNamespace("xml2", quietly=TRUE)) {
install.packages("xml2")
}
# Package information:
# library(help = xml2) # basic information
# browseVignettes("xml2") # available vignettes
# data(package = "xml2") # available datasets
# We will walk through the process with the refSeqID
# of yeast Mbp1
refSeqID <- "NP_010227"
# First we build a query URL, starting from the base address of the
# eUtils programs ...
eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
# ... then we append a query for esearch, the program that finds an entry in
# an NCBI database. Searching the protein database for our refSeqID gets us
# the unique, NCBI-internal identifier of that entry.
URL <- paste0(eUtilsBase,
              "esearch.fcgi?",
              "db=protein",
              "&term=",
              refSeqID)
# Copy the URL and paste it into your browser to see
# what the response should look like.
URL
# To fetch a response in R, we use the function read_xml()
# with our URL as its argument.
( myXML <- xml2::read_xml(URL) )
# This is XML. We can take the response apart into
# its individual components with the as_list() function.
xml2::as_list(myXML)
# Note how the XML "tree" is represented as a list of
# lists of lists ...
# If we know exactly what element we are looking for,
# we can extract it from this structure:
xml2::as_list(myXML)[["eSearchResult"]][["IdList"]][["Id"]][[1]]
# But this is not very robust, it would break with the
# slightest change that the NCBI makes to their data format -
# and the NCBI changes things A LOT!
# Somewhat more robust is to specify the type of element
# we want - it's the text contained in an <Id>...</Id>
# element, and use the XPath XML parsing language to
# retrieve it.
xml2::xml_find_all(myXML, "//Id") # returns a "node set"
xml2::xml_text(xml2::xml_find_all(myXML, "//Id")) # returns the contents
# of the node set
# We will need to do this more than once, so we write a function
# for it...
node2text <- function(doc, tag) {
  # Extract the text content of all elements with a given tag name from an
  # XML response.
  #
  # doc  the XML document to search, as passed to xml2::xml_find_all().
  # tag  the element name, without angle brackets (e.g. "Id").
  #
  # Returns a character vector with one string per matching element.
  # The XPath expression "//<tag>" matches the element at any depth.
  xml2::xml_text(xml2::xml_find_all(doc, paste0("//", tag)))
}
# using node2text() ...
(GID <- node2text(myXML, "Id"))
# The GI is the pivot for data requests at the
# NCBI.
# Let's first get the associated data for this GI
URL <- paste0(eUtilsBase,
"esummary.fcgi?",
"db=protein",
"&id=",
GID,
"&version=2.0")
(myXML <- xml2::read_xml(URL))
(taxID <- node2text(myXML, "TaxId"))
(organism <- node2text(myXML, "Organism"))
# This forms the base of a function that gets taxonomy data
# from an Entrez result. You can write this!
# == 1.1 Task - fetchNCBItaxData() function ================================
# Task: write a function that takes as input a RefSeq ID, fetches the taxonomy
# information, returns a list with taxID and organism, if the operation is
# successful, or a list of length 0 if there is an error.
# = 2 Task solutions ======================================================
# I have placed such a function into the dbUtilities script: look it up by
# clicking on dbFetchNCBItaxData() in the Environment pane.
# Test:
dbFetchNCBItaxData("XP_001837394")
# Expected output:
# ----------------
# taxID organism
# 1 240176 Coprinopsis cinerea okayama7#130
# [END]

View File

@ -1,10 +1,10 @@
HEADER TEST 0TST 0TST 1
REMARK A CATALOGUE OF ATOM AND HETATM RECORDS 0TST 2
ATOM 1 N GLY 1 -6.253 75.745 53.559 1.00 36.34 0TST 3
ATOM 2 CA GLY 1 -5.789 75.223 52.264 1.00 44.94 0TST 4
ATOM 3 C GLY 1 -5.592 73.702 52.294 1.00 32.28 0TST 5
ATOM 4 O GLY 1 -5.140 73.148 53.304 1.00 19.32 0TST 6
TER 5 GLY 1 0TST 7
HETATM 6 O HOH 1 -4.169 60.050 40.145 1.00 3.00 0TST 8
HETATM 7 CA CA 1 -1.258 -71.579 50.253 1.00 3.00 0TST 9
END 0TST 10
HEADER TEST 0TST 0TST 1
REMARK A CATALOGUE OF ATOM AND HETATM RECORDS 0TST 2
ATOM 1 N GLY 1 -6.253 75.745 53.559 1.00 36.34 0TST 3
ATOM 2 CA GLY 1 -5.789 75.223 52.264 1.00 44.94 0TST 4
ATOM 3 C GLY 1 -5.592 73.702 52.294 1.00 32.28 0TST 5
ATOM 4 O GLY 1 -5.140 73.148 53.304 1.00 19.32 0TST 6
TER 5 GLY 1 0TST 7
HETATM 6 O HOH 1 -4.169 60.050 40.145 1.00 3.00 0TST 8
HETATM 7 CA CA 1 -1.258 -71.579 50.253 1.00 3.00 0TST 9
END 0TST 10

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
>2F1C:X|PDBID|CHAIN|SEQUENCE
EERNDWHFNIGAMYEIENVEGYGEDMDGLAEPSVYFNAANGPWRIALAYYQEGPVDYSAGKRGTWFDRPELEVHYQFLEN
DDFSFGLTGGFRNYGYHYVDEPGKDTANMQRWKIAPDWDVKLTDDLRFNGWLSMYKFANDLNTTGYADTRVETETGLQYT
FNETVALRVNYYLERGFNMDDSRNNGEFSTQEIRAYLPLTLGNHSVTPYTRIGLDRWSNWDWQDDIEREGHDFNRVGLFY
>2F1C:X|PDBID|CHAIN|SEQUENCE
EERNDWHFNIGAMYEIENVEGYGEDMDGLAEPSVYFNAANGPWRIALAYYQEGPVDYSAGKRGTWFDRPELEVHYQFLEN
DDFSFGLTGGFRNYGYHYVDEPGKDTANMQRWKIAPDWDVKLTDDLRFNGWLSMYKFANDLNTTGYADTRVETETGLQYT
FNETVALRVNYYLERGFNMDDSRNNGEFSTQEIRAYLPLTLGNHSVTPYTRIGLDRWSNWDWQDDIEREGHDFNRVGLFY
GYDFQNGLSVSLEYAFEWQDHDEGDSDKFHYAGVGVNYSFHHHHHH

View File

@ -1,6 +1,6 @@
>3FG7:A|PDBID|CHAIN|SEQUENCE
MAEEHHHHHHHHLEVLFQGPGRPKTHTVGSVAKVEQVKFDATSMHVKPQVAAQQKMVDDGSGEVQVWRIENLELVPVDSK
WLGHFYGGDCYLLLYTYLIGEKQHYLLYVWQGSQASQDEITASAYQAVILDQKYNGEPVQIRVPMGKEPPHLMSIFKGRM
VVYQGGTSRTNNLETGPSTRLFQVQGTGANNTKAFEVPARANFLNSNDVFVLKTQSCCYLWCGKGCSGDEREMAKMVADT
ISRTEKQVVVEGQEPANFWMALGGKAPYANTKRLQEENLVITPRLFECSNKTGRFLATEIPDFNQDDLEEDDVFLLDVWD
QVFFWIGKHANEEEKKAAATTAQEYLKTHPSGRDPETPIIVVKQGHEPPTFTGWFLAWDPFKWSGIHVVPNLSPLSNN
>3FG7:A|PDBID|CHAIN|SEQUENCE
MAEEHHHHHHHHLEVLFQGPGRPKTHTVGSVAKVEQVKFDATSMHVKPQVAAQQKMVDDGSGEVQVWRIENLELVPVDSK
WLGHFYGGDCYLLLYTYLIGEKQHYLLYVWQGSQASQDEITASAYQAVILDQKYNGEPVQIRVPMGKEPPHLMSIFKGRM
VVYQGGTSRTNNLETGPSTRLFQVQGTGANNTKAFEVPARANFLNSNDVFVLKTQSCCYLWCGKGCSGDEREMAKMVADT
ISRTEKQVVVEGQEPANFWMALGGKAPYANTKRLQEENLVITPRLFECSNKTGRFLATEIPDFNQDDLEEDDVFLLDVWD
QVFFWIGKHANEEEKKAAATTAQEYLKTHPSGRDPETPIIVVKQGHEPPTFTGWFLAWDPFKWSGIHVVPNLSPLSNN

View File

@ -1,20 +1,20 @@
[
{ "name" : "MBP1_SACCE",
"RefSeqID" : "NP_010227",
"UniProtID" : "P39678",
"taxonomyID" : 559292,
"sequence" : [
"MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLKETHEKVQGGF",
"GKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHASKVDRKKAIRSASTSAIMET",
"KRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRRKLGVNLQRSQSDMGFPRPAIPNSSISTTQL",
"PSIRSTMGPQSPTLGILEEERHDSRQQQPQQNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQ",
"QSSLIQTQQTESMATSVSSSPSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKV",
"NKYLSKLVDYFISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS",
"IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTPSAVYYLDVVL",
"SKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTTISNKEGLTANEIMNQQYEQM",
"MIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSPVSPSDYITYPSQIATNISRNIPNVVNSMKQ",
"MASIYNDLHEQHDNEIKSLQKTLKSISKTKIQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTK",
"KLRKRLIRYKRLIKQKLEYRQTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSS",
"LVKKFEDNAKIHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"]
}
]
[
{ "name" : "MBP1_SACCE",
"RefSeqID" : "NP_010227",
"UniProtID" : "P39678",
"taxonomyID" : 559292,
"sequence" : [
"MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLKETHEKVQGGF",
"GKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHASKVDRKKAIRSASTSAIMET",
"KRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRRKLGVNLQRSQSDMGFPRPAIPNSSISTTQL",
"PSIRSTMGPQSPTLGILEEERHDSRQQQPQQNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQ",
"QSSLIQTQQTESMATSVSSSPSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKV",
"NKYLSKLVDYFISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS",
"IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTPSAVYYLDVVL",
"SKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTTISNKEGLTANEIMNQQYEQM",
"MIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSPVSPSDYITYPSQIATNISRNIPNVVNSMKQ",
"MASIYNDLHEQHDNEIKSLQKTLKSISKTKIQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTK",
"KLRKRLIRYKRLIKQKLEYRQTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSS",
"LVKKFEDNAKIHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"]
}
]

View File

@ -1,30 +1,30 @@
>PTPN5-201 cds:protein_coding (ENST00000358540.7)
ATGAATTATGAGGGAGCCAGGAGTGAGAGAGAGAACCACGCTGCTGATGACTCCGAGGGA
GGGGCCCTGGACATGTGCTGCAGTGAGAGGCTACCGGGTCTCCCCCAGCCGATAGTGATG
GAGGCACTGGACGAGGCTGAAGGGCTCCAGGACTCACAGAGAGAGATGCCGCCACCCCCT
CCTCCCTCGCCGCCCTCAGATCCAGCTCAGAAGCCACCACCTCGAGGCGCTGGGAGCCAC
TCCCTCACTGTCAGGAGCAGCCTGTGCCTGTTCGCTGCCTCACAGTTCCTGCTTGCCTGT
GGGGTGCTCTGGTTCAGCGGTTATGGCCACATCTGGTCACAGAACGCCACAAACCTCGTC
TCCTCTTTGCTGACGCTCCTGAAACAGCTGGAACCCACGGCCTGGCTTGACTCTGGGACG
TGGGGAGTCCCCAGTCTGCTGCTGGTCTTTCTGTCCGTGGGCCTGGTCCTCGTTACCACC
CTGGTGTGGCACCTCCTGAGGACACCCCCAGAGCCACCCACCCCACTGCCCCCTGAGGAC
AGGCGCCAGTCAGTGAGCCGCCAGCCCTCCTTCACCTACTCAGAGTGGATGGAGGAGAAG
ATCGAGGATGACTTCCTGGACCTCGACCCGGTGCCCGAGACTCCTGTGTTTGATTGTGTG
ATGGACATCAAGCCTGAGGCTGACCCCACCTCACTCACCGTCAAGTCCATGGGTCTGCAG
GAGAGGAGGGGTTCCAATGTCTCCCTGACCCTGGACATGTGCACTCCGGGCTGCAACGAG
GAGGGCTTTGGCTATCTCATGTCCCCACGTGAGGAGTCCGCCCGCGAGTACCTGCTCAGC
GCCTCCCGTGTCCTCCAAGCAGAAGAGCTTCATGAAAAGGCCCTGGACCCTTTCCTGCTG
CAGGCGGAATTCTTTGAAATCCCCATGAACTTTGTGGATCCGAAAGAGTACGACATCCCT
GGGCTGGTGCGGAAGAACCGGTACAAAACCATACTTCCCAACCCTCACAGCAGAGTGTGT
CTGACCTCACCAGACCCTGACGACCCTCTGAGTTCCTACATCAATGCCAACTACATCCGG
GGCTATGGTGGGGAGGAGAAGGTGTACATCGCCACTCAGGGACCCATCGTCAGCACGGTC
GCCGACTTCTGGCGCATGGTGTGGCAGGAGCACACGCCCATCATTGTCATGATCACCAAC
ATCGAGGAGATGAACGAGAAATGCACCGAGTATTGGCCGGAGGAGCAGGTGGCGTACGAC
GGTGTTGAGATCACTGTGCAGAAAGTCATTCACACGGAGGATTACCGGCTGCGACTCATC
TCCCTCAAGAGTGGGACTGAGGAGCGAGGCCTGAAGCATTACTGGTTCACATCCTGGCCC
GACCAGAAGACCCCAGACCGGGCCCCCCCACTCCTGCACCTGGTGCGGGAGGTGGAGGAG
GCAGCCCAGCAGGAGGGGCCCCACTGTGCCCCCATCATCGTCCACTGCAGTGCAGGGATT
GGGAGGACCGGCTGCTTCATTGCCACCAGCATCTGCTGCCAGCAGCTGCGGCAGGAGGGT
GTGGTGGACATCCTGAAGACCACGTGCCAGCTCCGTCAGGACAGGGGCGGCATGATCCAG
ACATGCGAGCAGTACCAGTTTGTGCACCACGTCATGAGCCTCTACGAAAAGCAGCTGTCC
CACCAGTCCCCAGAATGA
>PTPN5-201 cds:protein_coding (ENST00000358540.7)
ATGAATTATGAGGGAGCCAGGAGTGAGAGAGAGAACCACGCTGCTGATGACTCCGAGGGA
GGGGCCCTGGACATGTGCTGCAGTGAGAGGCTACCGGGTCTCCCCCAGCCGATAGTGATG
GAGGCACTGGACGAGGCTGAAGGGCTCCAGGACTCACAGAGAGAGATGCCGCCACCCCCT
CCTCCCTCGCCGCCCTCAGATCCAGCTCAGAAGCCACCACCTCGAGGCGCTGGGAGCCAC
TCCCTCACTGTCAGGAGCAGCCTGTGCCTGTTCGCTGCCTCACAGTTCCTGCTTGCCTGT
GGGGTGCTCTGGTTCAGCGGTTATGGCCACATCTGGTCACAGAACGCCACAAACCTCGTC
TCCTCTTTGCTGACGCTCCTGAAACAGCTGGAACCCACGGCCTGGCTTGACTCTGGGACG
TGGGGAGTCCCCAGTCTGCTGCTGGTCTTTCTGTCCGTGGGCCTGGTCCTCGTTACCACC
CTGGTGTGGCACCTCCTGAGGACACCCCCAGAGCCACCCACCCCACTGCCCCCTGAGGAC
AGGCGCCAGTCAGTGAGCCGCCAGCCCTCCTTCACCTACTCAGAGTGGATGGAGGAGAAG
ATCGAGGATGACTTCCTGGACCTCGACCCGGTGCCCGAGACTCCTGTGTTTGATTGTGTG
ATGGACATCAAGCCTGAGGCTGACCCCACCTCACTCACCGTCAAGTCCATGGGTCTGCAG
GAGAGGAGGGGTTCCAATGTCTCCCTGACCCTGGACATGTGCACTCCGGGCTGCAACGAG
GAGGGCTTTGGCTATCTCATGTCCCCACGTGAGGAGTCCGCCCGCGAGTACCTGCTCAGC
GCCTCCCGTGTCCTCCAAGCAGAAGAGCTTCATGAAAAGGCCCTGGACCCTTTCCTGCTG
CAGGCGGAATTCTTTGAAATCCCCATGAACTTTGTGGATCCGAAAGAGTACGACATCCCT
GGGCTGGTGCGGAAGAACCGGTACAAAACCATACTTCCCAACCCTCACAGCAGAGTGTGT
CTGACCTCACCAGACCCTGACGACCCTCTGAGTTCCTACATCAATGCCAACTACATCCGG
GGCTATGGTGGGGAGGAGAAGGTGTACATCGCCACTCAGGGACCCATCGTCAGCACGGTC
GCCGACTTCTGGCGCATGGTGTGGCAGGAGCACACGCCCATCATTGTCATGATCACCAAC
ATCGAGGAGATGAACGAGAAATGCACCGAGTATTGGCCGGAGGAGCAGGTGGCGTACGAC
GGTGTTGAGATCACTGTGCAGAAAGTCATTCACACGGAGGATTACCGGCTGCGACTCATC
TCCCTCAAGAGTGGGACTGAGGAGCGAGGCCTGAAGCATTACTGGTTCACATCCTGGCCC
GACCAGAAGACCCCAGACCGGGCCCCCCCACTCCTGCACCTGGTGCGGGAGGTGGAGGAG
GCAGCCCAGCAGGAGGGGCCCCACTGTGCCCCCATCATCGTCCACTGCAGTGCAGGGATT
GGGAGGACCGGCTGCTTCATTGCCACCAGCATCTGCTGCCAGCAGCTGCGGCAGGAGGGT
GTGGTGGACATCCTGAAGACCACGTGCCAGCTCCGTCAGGACAGGGGCGGCATGATCCAG
ACATGCGAGCAGTACCAGTTTGTGCACCACGTCATGAGCCTCTACGAAAAGCAGCTGTCC
CACCAGTCCCCAGAATGA

View File

@ -1,12 +1,12 @@
>RAB39B cds:protein_coding (ENST00000369454.4)
ATGGAGGCCATCTGGCTGTACCAGTTCCGGCTCATTGTCATCGGGGATTCCACAGTGGGC
AAGTCCTGCCTGATCCGCCGCTTCACCGAGGGTCGCTTTGCCCAGGTTTCTGACCCCACC
GTGGGGGTGGATTTTTTCTCCCGCTTGGTGGAGATCGAGCCAGGAAAACGCATCAAGCTC
CAGATCTGGGATACCGCGGGTCAAGAGAGGTTCAGATCCATCACTCGCGCCTACTACAGG
AACTCAGTAGGTGGTCTTCTCTTATTTGACATTACCAACCGCAGGTCCTTCCAGAATGTC
CATGAGTGGTTAGAAGAGACCAAAGTACACGTTCAGCCCTACCAAATTGTATTTGTTCTG
GTGGGTCACAAGTGTGACCTGGATACACAGAGGCAAGTGACTCGCCACGAGGCCGAGAAA
CTGGCTGCTGCATACGGCATGAAGTACATTGAAACGTCAGCCCGAGATGCCATTAATGTG
GAGAAAGCCTTCACAGACCTGACAAGAGACATATATGAGCTGGTTAAAAGGGGGGAGATT
ACAATCCAGGAGGGCTGGGAAGGGGTGAAGAGTGGATTTGTACCAAATGTGGTTCACTCT
TCAGAAGAGGTTGTCAAATCAGAGAGGAGATGTTTGTGCTAG
>RAB39B cds:protein_coding (ENST00000369454.4)
ATGGAGGCCATCTGGCTGTACCAGTTCCGGCTCATTGTCATCGGGGATTCCACAGTGGGC
AAGTCCTGCCTGATCCGCCGCTTCACCGAGGGTCGCTTTGCCCAGGTTTCTGACCCCACC
GTGGGGGTGGATTTTTTCTCCCGCTTGGTGGAGATCGAGCCAGGAAAACGCATCAAGCTC
CAGATCTGGGATACCGCGGGTCAAGAGAGGTTCAGATCCATCACTCGCGCCTACTACAGG
AACTCAGTAGGTGGTCTTCTCTTATTTGACATTACCAACCGCAGGTCCTTCCAGAATGTC
CATGAGTGGTTAGAAGAGACCAAAGTACACGTTCAGCCCTACCAAATTGTATTTGTTCTG
GTGGGTCACAAGTGTGACCTGGATACACAGAGGCAAGTGACTCGCCACGAGGCCGAGAAA
CTGGCTGCTGCATACGGCATGAAGTACATTGAAACGTCAGCCCGAGATGCCATTAATGTG
GAGAAAGCCTTCACAGACCTGACAAGAGACATATATGAGCTGGTTAAAAGGGGGGAGATT
ACAATCCAGGAGGGCTGGGAAGGGGTGAAGAGTGGATTTGTACCAAATGTGGTTCACTCT
TCAGAAGAGGTTGTCAAATCAGAGAGGAGATGTTTGTGCTAG

View File

@ -1,131 +1,131 @@
```{css, echo = FALSE}
.striped tr:nth-child(even) {
background: #eaf1ff;
}
.striped {
padding: 5px;
}
```
<small>Random Phobias - .Rmd sample code for BCH441 at the University of Toronto. (c) Boris Steipe 2020 -->
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
## Phobias! ##
We all have some, but we could always use more. How to know them all? With this code we access the [Wikipedia list of phobias](https://en.wikipedia.org/wiki/List_of_phobias), scrape the contents and assemble a dataframe. Then we write a function to retrieve a random phobia, which we can subsequently ponder on - either to delight in the fact that we don't have that fear, or to add to our daily quota of anxieties <small>(like our well-founded [fear of bad programming practice](http://xkcd.com/292/))</small>.
To load the list, we will "screenscrape" the contents of Wikipedia's [List of Phobias](https://en.wikipedia.org/wiki/List_of_phobias). First, we install the `rvest` library and the `xml2` library from CRAN, if we don't have them.
```{r packages}
if (! requireNamespace("rvest", quietly=TRUE)) {
install.packages("rvest")
}
if (! requireNamespace("xml2", quietly=TRUE)) {
install.packages("xml2")
}
```
As we customarily do, we avoid using the `library()` function to make the package contents accessible, but use the `package::` syntax instead. This makes our code more explicit and maintainable.
`xml2` handles reading and parsing of documents. The `rvest` package was designed for screenscraping and has functions to make our life very easy: it accesses the response of an `xml2` query, looks for all HTML formatted tables, parses them with an XPATH expression and returns them as lists from which we can get data frames.
```{r getPageData, cache=TRUE}
webPage <- xml2::read_html("https://en.wikipedia.org/wiki/List_of_phobias")
allTables <- rvest::html_table(webPage, fill = TRUE)
```
There are ```r length(allTables)``` tables in the list, but the ones we are interested in are data frames with two columns named `Phobia` and `Condition`.
```{r collateTables, cache=TRUE}
# Collect every scraped table whose columns are exactly "Phobia" and
# "Condition" into one data frame; all other tables on the page are ignored.
phobiaTable <- data.frame(Phobia = character(), Condition = character())
for (i in seq_along(allTables)) {
  df <- allTables[[i]]
  # identical() compares the complete name vector at once. An element-wise
  # "==" would recycle when a table has a different number of columns, which
  # gives a warning or even a false match (e.g. for four columns named
  # Phobia, Condition, Phobia, Condition).
  if (identical(colnames(df), c("Phobia", "Condition"))) {
    phobiaTable <- rbind(phobiaTable, df)
  }
}
Done, we collected ```r nrow(phobiaTable)``` phobias. Let's randomly select a few and print them.
<p>&nbsp;
<p>
```{r , ref.label="randRow", echo=FALSE}
```
**Table**: seven random phobias<br/>
```{r renderPhobiaTable, echo=FALSE, results='asis'}
sel <- sample(1:nrow(phobiaTable), 7)
knitr::kable(phobiaTable[sel, ], table.attr = "class=\"striped\"", format = "html")
```
<p>&nbsp;
<p>
To pick a single random phobia from the list, we take a (pseudo) random sample of size 1 from the number of rows in the `phobiaTable` object. Our function thus returns a random row from a matrix or dataframe, and it uses an optional argument: `seed`. This can either be Boolean `FALSE` (the default), or an integer that is used in R's `set.seed()` function.
```{r randRow}
randRow <- function(M, seed = FALSE) {
  # Return one randomly chosen row of the matrix or data frame M.
  #
  # M     matrix or data frame with at least one row.
  # seed  FALSE (the default) to draw from the current RNG stream, or a
  #       number that is passed to set.seed() to make the draw reproducible.
  #       Since the value is used as a condition, a seed of 0 is treated
  #       like FALSE.
  #
  # When a seed is given, the caller's RNG state is put back on exit, so
  # calling randRow() does not disturb any other random sequence.
  n <- nrow(M)
  if (is.null(n) || n < 1) {
    stop("randRow() needs a matrix or data frame with at least one row.",
         call. = FALSE)
  }
  if (seed) {
    # Play nice and save the RNG state. .Random.seed lives in the global
    # environment and does not exist before the RNG has been used for the
    # first time. A plain "<-" inside a function would only create a local
    # variable, therefore the state is read and written there explicitly.
    hadState <- exists(".Random.seed", envir = globalenv(), inherits = FALSE)
    if (hadState) {
      oldState <- get(".Random.seed", envir = globalenv(), inherits = FALSE)
    }
    # on.exit() restores the state even if the row selection below fails.
    on.exit({
      if (hadState) {
        assign(".Random.seed", oldState, envir = globalenv())
      } else {
        rm(".Random.seed", envir = globalenv())
      }
    }, add = TRUE)
    set.seed(as.integer(seed))
  }
  M[sample.int(n, 1), ]   # fetch one random row
}
```
<p>&nbsp;
<p>
With this useful tool we can ponder on our favourite phobia of the day. For today, let it be **`r randRow(phobiaTable, seed=1123581321)[2]`**, the `r randRow(phobiaTable, seed=1123581321)[1]`.
_`r randRow(phobiaTable, seed=1123581321)[1]`_! Really!!? Awful.
<p>&nbsp;
<p>
Finally: let's plot a histogram of phobia name lengths just to illustrate plots. A little preprocessing is required, since some names collate synonyms, like _"Hypnophobia, somniphobia"_. We'll break these up.
```{r preProcess}
# select only single-word phobias that end with "phobia"
sel <- ! grepl(" ", phobiaTable$Phobia) & grepl(".phobia$", phobiaTable$Phobia)
names <- phobiaTable$Phobia[sel]
# extract the ones we did _not_ select
x <- phobiaTable$Phobia[! sel]
# use strsplit() to split them apart and flatten the resulting list
x <- unlist(strsplit(x, ", "))
x <- unlist(strsplit(x, " "))
x <- unlist(strsplit(x, "/"))
# use the same selection as above, and append the result to our "names""
sel <- ! grepl(" ", x) & grepl(".phobia$", x)
names <- c(names, x[sel])
```
Done, we collected ```r length(names)``` names for phobias. Here is a histogram of their lengths.
```{r showHist}
x <- nchar(names)
pShort <- names[which(x == min(x))[1]] # pull out the shortest name ...
pLong <- names[which(x == max(x))[1]] # ... and the longest name too.
hist(x,
main = "Length of phobia-names",
sub = sprintf("Shortest: %s (%d), Longest: %s (%d)",
pShort, nchar(pShort), pLong, nchar(pLong)),
cex.sub = 0.8,
xlab = "name",
ylab = "counts",
col ="#aef5ee")
```
That's all.
<!-- [END] -->
```{css, echo = FALSE}
.striped tr:nth-child(even) {
background: #eaf1ff;
}
.striped {
padding: 5px;
}
```
<small>Random Phobias - .Rmd sample code for BCH441 at the University of Toronto. (c) Boris Steipe 2020 -->
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
## Phobias! ##
We all have some, but we could always use more. How to know them all? With this code we access the [Wikipedia list of phobias](https://en.wikipedia.org/wiki/List_of_phobias), scrape the contents and assemble a dataframe. Then we write a function to retrieve a random phobia, which we can subsequently ponder on - either to delight in the fact that we don't have that fear, or to add to our daily quota of anxieties <small>(like our well-founded [fear of bad programming practice](http://xkcd.com/292/))</small>.
To load the list, we will "screenscrape" the contents of Wikipedia's [List of Phobias](https://en.wikipedia.org/wiki/List_of_phobias). First, we install the `rvest` package and the `xml2` package from CRAN, if we don't have them.
```{r packages}
# Install each package this document relies on from CRAN, but only if it is
# not available yet. requireNamespace() checks availability without attaching.
invisible(lapply(c("rvest", "xml2"), function(pkg) {
  if (! requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}))
```
As we customarily do, we avoid using the `library()` function to make the package contents accessible, but use the `package::` syntax instead. This makes our code more explicit and maintainable.
`xml2` handles reading and parsing of documents. The `rvest` package was designed for screenscraping and has functions to make our life very easy: it accesses the response of an `xml2` query, looks for all HTML formatted tables, parses them with an XPATH expression and returns them as lists from which we can get data frames.
```{r getPageData, cache=TRUE}
# Fetch the Wikipedia page and parse it into an xml2 document. This needs
# network access; the chunk is cached (cache=TRUE) so it is not re-run on
# every knit.
webPage <- xml2::read_html("https://en.wikipedia.org/wiki/List_of_phobias")
# Extract the HTML tables of the page; the result is used as a list of tables
# further below.
# NOTE(review): `fill` is reported as deprecated in recent rvest releases -
# confirm against the installed version before removing it.
allTables <- rvest::html_table(webPage, fill = TRUE)
```
There are ```r length(allTables)``` tables in the list, but the ones we are interested in are data frames with two columns named `Phobia` and `Condition`.
```{r collateTables, cache=TRUE}
# Stack all scraped tables whose columns are exactly "Phobia" and "Condition"
# into one data frame.
#
# identical() is used for the column test because "==" recycles its operands:
# it would accept a table without columns, or one whose names merely repeat
# the pair, and it warns on tables with three columns.
#
# The empty two-column data frame comes first so that the result has the
# expected columns even if no table qualifies. All pieces are bound in a
# single rbind() call instead of growing the data frame inside a loop.
phobiaTable <- do.call(
  rbind,
  c(list(data.frame(Phobia = character(), Condition = character())),
    Filter(function(tbl) identical(colnames(tbl), c("Phobia", "Condition")),
           allTables))
)
```
Done, we collected ```r nrow(phobiaTable)``` phobias. Let's randomly select a few and print them.
<p>&nbsp;
<p>
```{r , ref.label="randRow", echo=FALSE}
```
**Table**: seven random phobias<br/>
```{r renderPhobiaTable, echo=FALSE, results='asis'}
# Draw (up to) seven distinct row indices. Capping the sample size at the
# number of rows keeps the draw from failing on a table with fewer than seven
# rows, and sample.int() avoids the 1:0 trap of "1:nrow()" on an empty table.
sel <- sample.int(nrow(phobiaTable), min(7L, nrow(phobiaTable)))
# Render the selected rows as an HTML table that carries the CSS class
# "striped".
knitr::kable(phobiaTable[sel, ], table.attr = "class=\"striped\"", format = "html")
```
<p>&nbsp;
<p>
To pick a single random phobia from the list, we take a (pseudo) random sample of size 1 from the number of rows in the `phobiaTable` object. Our function thus returns a random row from a matrix or dataframe, and it uses an optional argument: `seed`. This can either be Boolean `FALSE` (the default), or an integer that is used in R's `set.seed()` function.
```{r randRow}
randRow <- function(M, seed = FALSE) {
  # Return one randomly chosen row of a matrix or data frame.
  #
  # Parameters:
  #   M     matrix or data frame with at least one row.
  #   seed  FALSE (the default) to draw from the current RNG stream, or a
  #         number that is passed to set.seed() (as an integer), so that the
  #         same row is returned on every call.
  # Value:
  #   M[i, ] for one random row index i.
  #
  # If a seed is used, the state of the global RNG is put back the way it was
  # found, so a seeded call does not disturb the caller's random numbers.

  if (nrow(M) < 1) {
    stop("M has no rows to sample from.", call. = FALSE)
  }

  # identical() rather than a truthiness test, so that seed = 0 is honoured
  # as a seed instead of being mistaken for FALSE.
  if (! identical(seed, FALSE)) {
    # .Random.seed lives in the global environment and does not exist until
    # the RNG has been used. A plain "<-" inside this function would only
    # create a local variable and restore nothing, hence get()/assign().
    hadState <- exists(".Random.seed", envir = globalenv(), inherits = FALSE)
    if (hadState) {
      oldState <- get(".Random.seed", envir = globalenv(), inherits = FALSE)
    }
    # on.exit() restores the state even if sampling or subsetting fails.
    on.exit({
      if (hadState) {
        assign(".Random.seed", oldState, envir = globalenv())
      } else {
        rm(".Random.seed", envir = globalenv())
      }
    }, add = TRUE)
    set.seed(as.integer(seed))
  }

  M[sample(seq_len(nrow(M)), 1), ]  # fetch one random row
}
```
<p>&nbsp;
<p>
With this useful tool we can ponder on our favourite phobia of the day. For today, let it be **`r randRow(phobiaTable, seed=1123581321)[2]`**, the `r randRow(phobiaTable, seed=1123581321)[1]`.
_`r randRow(phobiaTable, seed=1123581321)[1]`_! Really!!? Awful.
<p>&nbsp;
<p>
Finally: let's plot a histogram of phobia name lengths just to illustrate plots. A little preprocessing is required, since some names collate synonyms, like _"Hypnophobia, somniphobia"_. We'll break these up.
```{r preProcess}
# Entries that are a single word ending in "phobia" can be used unchanged.
sel <- grepl(".phobia$", phobiaTable$Phobia) & ! grepl(" ", phobiaTable$Phobia)
names <- phobiaTable$Phobia[sel]
# All other entries bundle several names. Break them up at ", ", then at " ",
# then at "/", flattening the list that strsplit() returns after every pass.
x <- Reduce(function(parts, delim) unlist(strsplit(parts, delim)),
            c(", ", " ", "/"),
            phobiaTable$Phobia[! sel])
# Run the same single-word test on the fragments and append the survivors
# to "names".
sel <- grepl(".phobia$", x) & ! grepl(" ", x)
names <- c(names, x[sel])
```
Done, we collected ```r length(names)``` names for phobias. Here is a histogram of their lengths.
```{r showHist}
# Histogram of the lengths (in characters) of the collected phobia names.
# The subtitle reports the shortest and the longest name together with their
# lengths.
x <- nchar(names)
pShort <- names[which.min(x)]  # shortest name (the first one, if tied) ...
pLong  <- names[which.max(x)]  # ... and the longest name (first one, if tied).
hist(x,
     main = "Length of phobia-names",
     sub = sprintf("Shortest: %s (%d), Longest: %s (%d)",
                   pShort, nchar(pShort), pLong, nchar(pLong)),
     cex.sub = 0.8,
     xlab = "name length (characters)",  # the axis shows nchar(), not names
     ylab = "counts",
     col = "#aef5ee")
```
That's all.
<!-- [END] -->

View File

@ -1,43 +1,43 @@
>MBP1 YDL056W SGDID:S000002214
ATGTCTAACCAAATATACTCAGCGAGATATTCGGGGGTTGATGTTTATGAATTCATTCAT
TCTACAGGATCTATCATGAAAAGGAAAAAGGATGATTGGGTCAATGCTACACATATTTTA
AAGGCCGCCAATTTTGCCAAGGCTAAAAGAACAAGGATTCTAGAGAAGGAAGTACTTAAG
GAAACTCATGAAAAAGTTCAGGGTGGATTTGGTAAATATCAGGGTACATGGGTCCCACTG
AACATAGCGAAACAACTGGCAGAAAAATTTAGTGTCTACGATCAGCTGAAACCGTTGTTC
GACTTTACGCAAACAGATGGGTCTGCTTCTCCACCTCCTGCTCCAAAACATCACCATGCC
TCGAAGGTGGATAGGAAAAAGGCTATTAGAAGTGCAAGTACTTCCGCAATTATGGAAACA
AAAAGAAACAACAAGAAAGCCGAGGAAAATCAATTTCAAAGCAGCAAAATATTGGGAAAT
CCCACGGCTGCACCAAGGAAAAGAGGTAGACCGGTAGGATCTACGAGGGGAAGTAGGCGG
AAGTTAGGTGTCAATTTACAACGTTCTCAAAGTGATATGGGATTTCCTAGACCGGCGATA
CCGAATTCTTCAATATCGACAACGCAACTTCCCTCTATTAGATCCACCATGGGACCACAA
TCCCCTACATTGGGTATTCTGGAAGAAGAAAGGCACGATTCTCGACAGCAGCAGCCGCAA
CAAAATAATTCTGCACAGTTCAAAGAAATTGATCTTGAGGACGGCTTATCAAGCGATGTG
GAACCTTCACAACAATTACAACAAGTTTTTAATCAAAATACTGGATTTGTACCCCAACAA
CAATCTTCCTTGATACAGACACAGCAAACAGAATCAATGGCCACGTCCGTATCTTCCTCT
CCTTCATTACCTACGTCACCGGGCGATTTTGCCGATAGTAATCCATTTGAAGAGCGATTT
CCCGGTGGTGGAACATCTCCTATTATTTCCATGATCCCGCGTTATCCTGTAACTTCAAGG
CCTCAAACATCGGATATTAATGATAAAGTTAACAAATACCTTTCAAAATTGGTTGATTAT
TTTATTTCCAATGAAATGAAGTCAAATAAGTCCCTACCACAAGTGTTATTGCACCCACCT
CCACACAGCGCTCCCTATATAGATGCTCCAATCGATCCAGAATTACATACTGCCTTCCAT
TGGGCTTGTTCTATGGGTAATTTACCAATTGCTGAGGCGTTGTACGAAGCCGGAACAAGT
ATCAGATCGACAAATTCTCAAGGCCAAACTCCATTGATGAGAAGTTCCTTATTCCACAAT
TCATACACTAGAAGAACTTTCCCTAGAATTTTCCAGCTACTGCACGAGACCGTATTTGAT
ATCGATTCGCAATCACAAACAGTAATTCACCATATTGTGAAACGAAAATCAACAACACCT
TCTGCAGTTTATTATCTTGATGTTGTGCTATCTAAGATCAAGGATTTTTCCCCACAGTAT
AGAATTGAATTACTTTTAAACACACAAGACAAAAATGGCGATACCGCACTTCATATTGCT
TCTAAAAATGGAGATGTTGTTTTTTTTAATACACTGGTCAAAATGGGTGCATTAACTACT
ATTTCCAATAAGGAAGGATTAACCGCCAATGAAATAATGAATCAACAATATGAGCAAATG
ATGATACAAAATGGTACAAATCAACATGTCAATTCTTCAAACACGGACTTGAATATCCAC
GTTAATACAAACAACATTGAAACGAAAAATGATGTTAATTCAATGGTAATCATGTCGCCT
GTTTCTCCTTCGGATTACATAACCTATCCATCTCAAATTGCCACCAATATATCAAGAAAT
ATTCCAAATGTAGTGAATTCTATGAAGCAAATGGCTAGCATATACAACGATCTTCATGAA
CAGCATGACAACGAAATAAAAAGTTTGCAAAAAACTTTAAAAAGCATTTCTAAGACGAAA
ATACAGGTAAGCCTAAAAACTTTAGAGGTATTGAAAGAGAGCAGTAAAGATGAAAACGGC
GAAGCTCAGACTAATGATGACTTCGAAATTTTATCTCGTCTACAAGAACAAAATACTAAG
AAATTGAGAAAAAGGCTCATACGATACAAACGGTTGATAAAACAAAAGCTGGAATACAGG
CAAACGGTTTTATTGAACAAATTAATAGAAGATGAAACTCAGGCTACCACCAATAACACA
GTTGAGAAAGATAATAATACGCTGGAAAGGTTGGAATTGGCTCAAGAACTAACGATGTTG
CAATTACAAAGGAAAAACAAATTGAGTTCCTTGGTGAAGAAATTTGAAGACAATGCCAAG
ATTCATAAATATAGACGGATTATCAGGGAAGGTACGGAAATGAATATTGAAGAAGTAGAT
AGTTCGCTGGATGTAATACTACAGACATTGATAGCCAACAATAATAAAAATAAGGGCGCA
>MBP1 YDL056W SGDID:S000002214
ATGTCTAACCAAATATACTCAGCGAGATATTCGGGGGTTGATGTTTATGAATTCATTCAT
TCTACAGGATCTATCATGAAAAGGAAAAAGGATGATTGGGTCAATGCTACACATATTTTA
AAGGCCGCCAATTTTGCCAAGGCTAAAAGAACAAGGATTCTAGAGAAGGAAGTACTTAAG
GAAACTCATGAAAAAGTTCAGGGTGGATTTGGTAAATATCAGGGTACATGGGTCCCACTG
AACATAGCGAAACAACTGGCAGAAAAATTTAGTGTCTACGATCAGCTGAAACCGTTGTTC
GACTTTACGCAAACAGATGGGTCTGCTTCTCCACCTCCTGCTCCAAAACATCACCATGCC
TCGAAGGTGGATAGGAAAAAGGCTATTAGAAGTGCAAGTACTTCCGCAATTATGGAAACA
AAAAGAAACAACAAGAAAGCCGAGGAAAATCAATTTCAAAGCAGCAAAATATTGGGAAAT
CCCACGGCTGCACCAAGGAAAAGAGGTAGACCGGTAGGATCTACGAGGGGAAGTAGGCGG
AAGTTAGGTGTCAATTTACAACGTTCTCAAAGTGATATGGGATTTCCTAGACCGGCGATA
CCGAATTCTTCAATATCGACAACGCAACTTCCCTCTATTAGATCCACCATGGGACCACAA
TCCCCTACATTGGGTATTCTGGAAGAAGAAAGGCACGATTCTCGACAGCAGCAGCCGCAA
CAAAATAATTCTGCACAGTTCAAAGAAATTGATCTTGAGGACGGCTTATCAAGCGATGTG
GAACCTTCACAACAATTACAACAAGTTTTTAATCAAAATACTGGATTTGTACCCCAACAA
CAATCTTCCTTGATACAGACACAGCAAACAGAATCAATGGCCACGTCCGTATCTTCCTCT
CCTTCATTACCTACGTCACCGGGCGATTTTGCCGATAGTAATCCATTTGAAGAGCGATTT
CCCGGTGGTGGAACATCTCCTATTATTTCCATGATCCCGCGTTATCCTGTAACTTCAAGG
CCTCAAACATCGGATATTAATGATAAAGTTAACAAATACCTTTCAAAATTGGTTGATTAT
TTTATTTCCAATGAAATGAAGTCAAATAAGTCCCTACCACAAGTGTTATTGCACCCACCT
CCACACAGCGCTCCCTATATAGATGCTCCAATCGATCCAGAATTACATACTGCCTTCCAT
TGGGCTTGTTCTATGGGTAATTTACCAATTGCTGAGGCGTTGTACGAAGCCGGAACAAGT
ATCAGATCGACAAATTCTCAAGGCCAAACTCCATTGATGAGAAGTTCCTTATTCCACAAT
TCATACACTAGAAGAACTTTCCCTAGAATTTTCCAGCTACTGCACGAGACCGTATTTGAT
ATCGATTCGCAATCACAAACAGTAATTCACCATATTGTGAAACGAAAATCAACAACACCT
TCTGCAGTTTATTATCTTGATGTTGTGCTATCTAAGATCAAGGATTTTTCCCCACAGTAT
AGAATTGAATTACTTTTAAACACACAAGACAAAAATGGCGATACCGCACTTCATATTGCT
TCTAAAAATGGAGATGTTGTTTTTTTTAATACACTGGTCAAAATGGGTGCATTAACTACT
ATTTCCAATAAGGAAGGATTAACCGCCAATGAAATAATGAATCAACAATATGAGCAAATG
ATGATACAAAATGGTACAAATCAACATGTCAATTCTTCAAACACGGACTTGAATATCCAC
GTTAATACAAACAACATTGAAACGAAAAATGATGTTAATTCAATGGTAATCATGTCGCCT
GTTTCTCCTTCGGATTACATAACCTATCCATCTCAAATTGCCACCAATATATCAAGAAAT
ATTCCAAATGTAGTGAATTCTATGAAGCAAATGGCTAGCATATACAACGATCTTCATGAA
CAGCATGACAACGAAATAAAAAGTTTGCAAAAAACTTTAAAAAGCATTTCTAAGACGAAA
ATACAGGTAAGCCTAAAAACTTTAGAGGTATTGAAAGAGAGCAGTAAAGATGAAAACGGC
GAAGCTCAGACTAATGATGACTTCGAAATTTTATCTCGTCTACAAGAACAAAATACTAAG
AAATTGAGAAAAAGGCTCATACGATACAAACGGTTGATAAAACAAAAGCTGGAATACAGG
CAAACGGTTTTATTGAACAAATTAATAGAAGATGAAACTCAGGCTACCACCAATAACACA
GTTGAGAAAGATAATAATACGCTGGAAAGGTTGGAATTGGCTCAAGAACTAACGATGTTG
CAATTACAAAGGAAAAACAAATTGAGTTCCTTGGTGAAGAAATTTGAAGACAATGCCAAG
ATTCATAAATATAGACGGATTATCAGGGAAGGTACGGAAATGAATATTGAAGAAGTAGAT
AGTTCGCTGGATGTAATACTACAGACATTGATAGCCAACAATAATAAAAATAAGGGCGCA
GAACAGATCATCACAATCTCAAACGCGAATAGTCATGCATAA

View File

@ -1,47 +1,47 @@
SGD_features.tab
The latest version of the SGD_features.tab file is based on Genome Version R64-2-1.
The SGD_features.tab file is updated weekly (Saturday).
NOTE: On 4 September 2004, the SGD_features.tab file replaced the previously
used chromosomal_feature.tab file.
File contents:
1. Information on current chromosomal features in SGD, including Dubious ORFs.
Also contains coordinates of intron, exons, and other subfeatures that are located
within a chromosomal feature.
2. The relationship between subfeatures and the feature in which they
are located is identified by the feature name in column #7 (parent
feature). For example, the parent feature of the intron found in
ACT1/YFL039C will be YFL039C. The parent feature of YFL039C is
chromosome 6.
3. The coordinates of all features are in chromosomal coordinates.
Columns within SGD_features.tab:
1. Primary SGDID (mandatory)
2. Feature type (mandatory)
3. Feature qualifier (optional)
4. Feature name (optional)
5. Standard gene name (optional)
6. Alias (optional, multiples separated by |)
7. Parent feature name (optional)
8. Secondary SGDID (optional, multiples separated by |)
9. Chromosome (optional)
10. Start_coordinate (optional)
11. Stop_coordinate (optional)
12. Strand (optional)
13. Genetic position (optional)
14. Coordinate version (optional)
15. Sequence version (optional)
16. Description (optional)
Note that "chromosome 17" is the mitochondrial chromosome.
The SGD_features.tab file is complemented by GFF3 file saccharomyces_cerevisiae.gff
SGD_features.tab
The latest version of the SGD_features.tab file is based on Genome Version R64-2-1.
The SGD_features.tab file is updated weekly (Saturday).
NOTE: On 4 September 2004, the SGD_features.tab file replaced the previously
used chromosomal_feature.tab file.
File contents:
1. Information on current chromosomal features in SGD, including Dubious ORFs.
Also contains coordinates of intron, exons, and other subfeatures that are located
within a chromosomal feature.
2. The relationship between subfeatures and the feature in which they
are located is identified by the feature name in column #7 (parent
feature). For example, the parent feature of the intron found in
ACT1/YFL039C will be YFL039C. The parent feature of YFL039C is
chromosome 6.
3. The coordinates of all features are in chromosomal coordinates.
Columns within SGD_features.tab:
1. Primary SGDID (mandatory)
2. Feature type (mandatory)
3. Feature qualifier (optional)
4. Feature name (optional)
5. Standard gene name (optional)
6. Alias (optional, multiples separated by |)
7. Parent feature name (optional)
8. Secondary SGDID (optional, multiples separated by |)
9. Chromosome (optional)
10. Start_coordinate (optional)
11. Stop_coordinate (optional)
12. Strand (optional)
13. Genetic position (optional)
14. Coordinate version (optional)
15. Sequence version (optional)
16. Description (optional)
Note that "chromosome 17" is the mitochondrial chromosome.
The SGD_features.tab file is complemented by GFF3 file saccharomyces_cerevisiae.gff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,179 +1,179 @@
MUTS_PAM STRAND MOST_SEVERE START MUTS_PAM_SAMPLES REF MUTS_CS ALT AA_CHANGE CHR MUTS_CS_SAMPLES PROTEIN_POS GENE TRANSCRIPT
93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000311936
93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000557334
93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000256078
93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000556131
86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000311936
86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000557334
86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000556131
86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000256078
72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000556131
72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000256078
72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000557334
72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000311936
63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000557334
63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000556131
63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000256078
63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000311936
36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000311936
36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000256078
36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000556131
36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000557334
24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000256078
24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000311936
24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000557334
24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000556131
23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000556131
23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000311936
23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000557334
23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000256078
16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000556131
16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000311936
16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000557334
16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000256078
13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000311936
13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000556131
13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000557334
13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000256078
11 + missense_variant 25380275 11 T 11 G Q/H 12 11 61 ENSG00000133703 ENST00000311936
11 + missense_variant 25380275 11 T 11 G Q/H 12 11 61 ENSG00000133703 ENST00000256078
10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000557334
10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000311936
10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000556131
10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000256078
9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000557334
9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000556131
9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000311936
9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000256078
7 + missense_variant 25380276 7 T 7 A Q/L 12 7 61 ENSG00000133703 ENST00000256078
7 + missense_variant 25378562 7 C 7 T A/T 12 7 146 ENSG00000133703 ENST00000256078
7 + missense_variant 25378562 7 C 7 T A/T 12 7 146 ENSG00000133703 ENST00000311936
7 + missense_variant 25380276 7 T 7 A Q/L 12 7 61 ENSG00000133703 ENST00000311936
5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000311936
5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000256078
5 + missense_variant 25380276 5 T 5 C Q/R 12 5 61 ENSG00000133703 ENST00000311936
5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000557334
5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000556131
5 + missense_variant 25380276 5 T 5 C Q/R 12 5 61 ENSG00000133703 ENST00000256078
4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000256078
4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000557334
4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000311936
4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000556131
3 + missense_variant 25380277 3 G 3 T Q/K 12 3 61 ENSG00000133703 ENST00000256078
3 + missense_variant 25380275 3 T 3 A Q/H 12 3 61 ENSG00000133703 ENST00000256078
3 + missense_variant 25378647 3 T 3 G K/N 12 3 117 ENSG00000133703 ENST00000256078
3 + missense_variant 25380275 3 T 3 A Q/H 12 3 61 ENSG00000133703 ENST00000311936
3 + missense_variant 25378647 3 T 3 G K/N 12 3 117 ENSG00000133703 ENST00000311936
3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000256078
3 - missense_variant 25380275 3 A 3 C Q/H 12 3 61 ENSG00000133703 ENST00000256078
3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000256078
3 + missense_variant 25380277 3 G 3 T Q/K 12 3 61 ENSG00000133703 ENST00000311936
3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000311936
3 - missense_variant 25380275 3 A 3 C Q/H 12 3 61 ENSG00000133703 ENST00000311936
3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000311936
3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000556131
3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000557334
3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000557334
3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000556131
2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000556131
2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000311936
2 - missense_variant 25380276 2 A 2 G Q/R 12 2 61 ENSG00000133703 ENST00000311936
2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000557334
2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000556131
2 - missense_variant 25378562 2 G 2 A A/T 12 2 146 ENSG00000133703 ENST00000311936
2 - missense_variant 25378562 2 G 2 A A/T 12 2 146 ENSG00000133703 ENST00000256078
2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000256078
2 - missense_variant 25380276 2 A 2 G Q/R 12 2 61 ENSG00000133703 ENST00000256078
2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000311936
2 + missense_variant 25378561 2 G 2 A A/V 12 2 146 ENSG00000133703 ENST00000311936
2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000256078
2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000557334
2 + missense_variant 25378561 2 G 2 A A/V 12 2 146 ENSG00000133703 ENST00000256078
1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000557334
1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000557334
1 - missense_variant 25362743 1 A 1 T S/C 12 1 72 ENSG00000133703 ENST00000557334
1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000557334
1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000557334
1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000557334
0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000557334
1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000557334
0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000557334
0 + synonymous_variant 25380278 0 A 1 G - 12 1 60 ENSG00000133703 ENST00000311936
1 - missense_variant 25378647 1 A 1 T K/N 12 1 117 ENSG00000133703 ENST00000256078
1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000256078
1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000256078
1 + missense_variant 25362743 1 A 1 G C/R 12 1 185 ENSG00000133703 ENST00000311936
0 + inframe_deletion 25362744 0 CTTTGT 1 - - 12 1 183-184 ENSG00000133703 ENST00000311936
1 + missense_variant 25378557 1 C 1 G K/N 12 1 147 ENSG00000133703 ENST00000311936
1 + missense_variant 25378562 1 C 1 G A/P 12 1 146 ENSG00000133703 ENST00000311936
1 + missense_variant 25378562 1 C 1 T A/T 12 1 146.0 ENSG00000133703 ENST00000311936
1 + missense_variant 25378594 1 C 1 G R/T 12 1 135 ENSG00000133703 ENST00000311936
1 + missense_variant 25378645 1 C 1 G C/S 12 1 118 ENSG00000133703 ENST00000311936
1 + missense_variant 25380240 1 C 1 A R/M 12 1 73.0 ENSG00000133703 ENST00000311936
1 + missense_variant 25380254 1 C 1 A R/S 12 1 68 ENSG00000133703 ENST00000311936
1 + missense_variant 25380271 1 C 1 T E/K 12 1 63.0 ENSG00000133703 ENST00000311936
1 + missense_variant 25380274 1 C 1 T E/K 12 1 62 ENSG00000133703 ENST00000311936
1 + missense_variant 25380275 1 T 1 G Q/H 12 1 61.0 ENSG00000133703 ENST00000311936
1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000256078
1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000256078
1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000256078
1 + missense_variant 25380282 1 G 1 C A/G 12 1 59 ENSG00000133703 ENST00000256078
1 + missense_variant 25380271 1 C 1 T E/K 12 1 63.0 ENSG00000133703 ENST00000256078
1 + missense_variant 25380274 1 C 1 T E/K 12 1 62 ENSG00000133703 ENST00000256078
1 + missense_variant 25380275 1 T 1 G Q/H 12 1 61.0 ENSG00000133703 ENST00000256078
1 + missense_variant 25380277 1 GA 1 TT GQ/GK 12 1 60-61 ENSG00000133703 ENST00000256078
0 + synonymous_variant 25380278 0 A 1 G - 12 1 60 ENSG00000133703 ENST00000256078
0 + synonymous_variant 25380278 0 A 1 T - 12 1 60 ENSG00000133703 ENST00000256078
1 + missense_variant 25380282 1 G 1 T A/E 12 1 59 ENSG00000133703 ENST00000256078
0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000256078
1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000256078
1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000256078
0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000256078
1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000256078
0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000256078
1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000256078
1 + missense_variant 25380277 1 GA 1 TT GQ/GK 12 1 60-61 ENSG00000133703 ENST00000311936
0 + synonymous_variant 25380278 0 A 1 T - 12 1 60 ENSG00000133703 ENST00000311936
1 + missense_variant 25380240 1 C 1 A R/M 12 1 73.0 ENSG00000133703 ENST00000256078
1 + missense_variant 25380282 1 G 1 C A/G 12 1 59 ENSG00000133703 ENST00000311936
1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000556131
0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000556131
1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000556131
0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000556131
1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000556131
1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000556131
1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000556131
1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000556131
1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000556131
1 + missense_variant 25362743 1 A 1 G C/R 12 1 72 ENSG00000133703 ENST00000557334
0 + inframe_deletion 25362744 0 CTTTGT 1 - - 12 1 70-71 ENSG00000133703 ENST00000557334
1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000557334
1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000557334
0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000557334
1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000557334
0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000556131
1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000556131
1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000556131
1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000311936
1 + missense_variant 25380282 1 G 1 T A/E 12 1 59 ENSG00000133703 ENST00000311936
1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000311936
1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000311936
0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000311936
1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000311936
0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000311936
0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000311936
1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000311936
1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000311936
1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000311936
1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000311936
1 - missense_variant 25362743 1 A 1 T S/C 12 1 185 ENSG00000133703 ENST00000311936
1 - missense_variant 25378647 1 A 1 T K/N 12 1 117 ENSG00000133703 ENST00000311936
1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000311936
1 + missense_variant 25380254 1 C 1 A R/S 12 1 68 ENSG00000133703 ENST00000256078
1 + missense_variant 25378645 1 C 1 G C/S 12 1 118 ENSG00000133703 ENST00000256078
1 + missense_variant 25378594 1 C 1 G R/T 12 1 135 ENSG00000133703 ENST00000256078
1 + missense_variant 25368454 1 C 1 T R/Q 12 1 164 ENSG00000133703 ENST00000256078
1 + missense_variant 25368473 1 T 1 C T/A 12 1 158 ENSG00000133703 ENST00000256078
1 + missense_variant 25378557 1 C 1 G K/N 12 1 147 ENSG00000133703 ENST00000256078
1 + missense_variant 25378562 1 C 1 G A/P 12 1 146 ENSG00000133703 ENST00000256078
1 + missense_variant 25378562 1 C 1 T A/T 12 1 146.0 ENSG00000133703 ENST00000256078
MUTS_PAM STRAND MOST_SEVERE START MUTS_PAM_SAMPLES REF MUTS_CS ALT AA_CHANGE CHR MUTS_CS_SAMPLES PROTEIN_POS GENE TRANSCRIPT
93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000311936
93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000557334
93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000256078
93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000556131
86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000311936
86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000557334
86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000556131
86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000256078
72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000556131
72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000256078
72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000557334
72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000311936
63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000557334
63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000556131
63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000256078
63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000311936
36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000311936
36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000256078
36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000556131
36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000557334
24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000256078
24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000311936
24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000557334
24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000556131
23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000556131
23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000311936
23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000557334
23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000256078
16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000556131
16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000311936
16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000557334
16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000256078
13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000311936
13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000556131
13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000557334
13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000256078
11 + missense_variant 25380275 11 T 11 G Q/H 12 11 61 ENSG00000133703 ENST00000311936
11 + missense_variant 25380275 11 T 11 G Q/H 12 11 61 ENSG00000133703 ENST00000256078
10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000557334
10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000311936
10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000556131
10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000256078
9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000557334
9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000556131
9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000311936
9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000256078
7 + missense_variant 25380276 7 T 7 A Q/L 12 7 61 ENSG00000133703 ENST00000256078
7 + missense_variant 25378562 7 C 7 T A/T 12 7 146 ENSG00000133703 ENST00000256078
7 + missense_variant 25378562 7 C 7 T A/T 12 7 146 ENSG00000133703 ENST00000311936
7 + missense_variant 25380276 7 T 7 A Q/L 12 7 61 ENSG00000133703 ENST00000311936
5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000311936
5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000256078
5 + missense_variant 25380276 5 T 5 C Q/R 12 5 61 ENSG00000133703 ENST00000311936
5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000557334
5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000556131
5 + missense_variant 25380276 5 T 5 C Q/R 12 5 61 ENSG00000133703 ENST00000256078
4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000256078
4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000557334
4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000311936
4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000556131
3 + missense_variant 25380277 3 G 3 T Q/K 12 3 61 ENSG00000133703 ENST00000256078
3 + missense_variant 25380275 3 T 3 A Q/H 12 3 61 ENSG00000133703 ENST00000256078
3 + missense_variant 25378647 3 T 3 G K/N 12 3 117 ENSG00000133703 ENST00000256078
3 + missense_variant 25380275 3 T 3 A Q/H 12 3 61 ENSG00000133703 ENST00000311936
3 + missense_variant 25378647 3 T 3 G K/N 12 3 117 ENSG00000133703 ENST00000311936
3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000256078
3 - missense_variant 25380275 3 A 3 C Q/H 12 3 61 ENSG00000133703 ENST00000256078
3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000256078
3 + missense_variant 25380277 3 G 3 T Q/K 12 3 61 ENSG00000133703 ENST00000311936
3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000311936
3 - missense_variant 25380275 3 A 3 C Q/H 12 3 61 ENSG00000133703 ENST00000311936
3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000311936
3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000556131
3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000557334
3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000557334
3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000556131
2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000556131
2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000311936
2 - missense_variant 25380276 2 A 2 G Q/R 12 2 61 ENSG00000133703 ENST00000311936
2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000557334
2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000556131
2 - missense_variant 25378562 2 G 2 A A/T 12 2 146 ENSG00000133703 ENST00000311936
2 - missense_variant 25378562 2 G 2 A A/T 12 2 146 ENSG00000133703 ENST00000256078
2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000256078
2 - missense_variant 25380276 2 A 2 G Q/R 12 2 61 ENSG00000133703 ENST00000256078
2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000311936
2 + missense_variant 25378561 2 G 2 A A/V 12 2 146 ENSG00000133703 ENST00000311936
2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000256078
2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000557334
2 + missense_variant 25378561 2 G 2 A A/V 12 2 146 ENSG00000133703 ENST00000256078
1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000557334
1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000557334
1 - missense_variant 25362743 1 A 1 T S/C 12 1 72 ENSG00000133703 ENST00000557334
1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000557334
1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000557334
1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000557334
0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000557334
1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000557334
0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000557334
0 + synonymous_variant 25380278 0 A 1 G - 12 1 60 ENSG00000133703 ENST00000311936
1 - missense_variant 25378647 1 A 1 T K/N 12 1 117 ENSG00000133703 ENST00000256078
1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000256078
1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000256078
1 + missense_variant 25362743 1 A 1 G C/R 12 1 185 ENSG00000133703 ENST00000311936
0 + inframe_deletion 25362744 0 CTTTGT 1 - - 12 1 183-184 ENSG00000133703 ENST00000311936
1 + missense_variant 25378557 1 C 1 G K/N 12 1 147 ENSG00000133703 ENST00000311936
1 + missense_variant 25378562 1 C 1 G A/P 12 1 146 ENSG00000133703 ENST00000311936
1 + missense_variant 25378562 1 C 1 T A/T 12 1 146.0 ENSG00000133703 ENST00000311936
1 + missense_variant 25378594 1 C 1 G R/T 12 1 135 ENSG00000133703 ENST00000311936
1 + missense_variant 25378645 1 C 1 G C/S 12 1 118 ENSG00000133703 ENST00000311936
1 + missense_variant 25380240 1 C 1 A R/M 12 1 73.0 ENSG00000133703 ENST00000311936
1 + missense_variant 25380254 1 C 1 A R/S 12 1 68 ENSG00000133703 ENST00000311936
1 + missense_variant 25380271 1 C 1 T E/K 12 1 63.0 ENSG00000133703 ENST00000311936
1 + missense_variant 25380274 1 C 1 T E/K 12 1 62 ENSG00000133703 ENST00000311936
1 + missense_variant 25380275 1 T 1 G Q/H 12 1 61.0 ENSG00000133703 ENST00000311936
1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000256078
1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000256078
1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000256078
1 + missense_variant 25380282 1 G 1 C A/G 12 1 59 ENSG00000133703 ENST00000256078
1 + missense_variant 25380271 1 C 1 T E/K 12 1 63.0 ENSG00000133703 ENST00000256078
1 + missense_variant 25380274 1 C 1 T E/K 12 1 62 ENSG00000133703 ENST00000256078
1 + missense_variant 25380275 1 T 1 G Q/H 12 1 61.0 ENSG00000133703 ENST00000256078
1 + missense_variant 25380277 1 GA 1 TT GQ/GK 12 1 60-61 ENSG00000133703 ENST00000256078
0 + synonymous_variant 25380278 0 A 1 G - 12 1 60 ENSG00000133703 ENST00000256078
0 + synonymous_variant 25380278 0 A 1 T - 12 1 60 ENSG00000133703 ENST00000256078
1 + missense_variant 25380282 1 G 1 T A/E 12 1 59 ENSG00000133703 ENST00000256078
0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000256078
1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000256078
1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000256078
0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000256078
1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000256078
0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000256078
1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000256078
1 + missense_variant 25380277 1 GA 1 TT GQ/GK 12 1 60-61 ENSG00000133703 ENST00000311936
0 + synonymous_variant 25380278 0 A 1 T - 12 1 60 ENSG00000133703 ENST00000311936
1 + missense_variant 25380240 1 C 1 A R/M 12 1 73.0 ENSG00000133703 ENST00000256078
1 + missense_variant 25380282 1 G 1 C A/G 12 1 59 ENSG00000133703 ENST00000311936
1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000556131
0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000556131
1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000556131
0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000556131
1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000556131
1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000556131
1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000556131
1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000556131
1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000556131
1 + missense_variant 25362743 1 A 1 G C/R 12 1 72 ENSG00000133703 ENST00000557334
0 + inframe_deletion 25362744 0 CTTTGT 1 - - 12 1 70-71 ENSG00000133703 ENST00000557334
1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000557334
1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000557334
0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000557334
1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000557334
0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000556131
1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000556131
1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000556131
1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000311936
1 + missense_variant 25380282 1 G 1 T A/E 12 1 59 ENSG00000133703 ENST00000311936
1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000311936
1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000311936
0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000311936
1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000311936
0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000311936
0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000311936
1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000311936
1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000311936
1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000311936
1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000311936
1 - missense_variant 25362743 1 A 1 T S/C 12 1 185 ENSG00000133703 ENST00000311936
1 - missense_variant 25378647 1 A 1 T K/N 12 1 117 ENSG00000133703 ENST00000311936
1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000311936
1 + missense_variant 25380254 1 C 1 A R/S 12 1 68 ENSG00000133703 ENST00000256078
1 + missense_variant 25378645 1 C 1 G C/S 12 1 118 ENSG00000133703 ENST00000256078
1 + missense_variant 25378594 1 C 1 G R/T 12 1 135 ENSG00000133703 ENST00000256078
1 + missense_variant 25368454 1 C 1 T R/Q 12 1 164 ENSG00000133703 ENST00000256078
1 + missense_variant 25368473 1 T 1 C T/A 12 1 158 ENSG00000133703 ENST00000256078
1 + missense_variant 25378557 1 C 1 G K/N 12 1 147 ENSG00000133703 ENST00000256078
1 + missense_variant 25378562 1 C 1 G A/P 12 1 146 ENSG00000133703 ENST00000256078
1 + missense_variant 25378562 1 C 1 T A/T 12 1 146.0 ENSG00000133703 ENST00000256078

1 MUTS_PAM STRAND MOST_SEVERE START MUTS_PAM_SAMPLES REF MUTS_CS ALT AA_CHANGE CHR MUTS_CS_SAMPLES PROTEIN_POS GENE TRANSCRIPT
2 93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000311936
3 93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000557334
4 93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000256078
5 93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000556131
6 86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000311936
7 86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000557334
8 86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000556131
9 86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000256078
10 72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000556131
11 72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000256078
12 72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000557334
13 72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000311936
14 63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000557334
15 63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000556131
16 63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000256078
17 63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000311936
18 36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000311936
19 36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000256078
20 36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000556131
21 36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000557334
22 24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000256078
23 24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000311936
24 24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000557334
25 24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000556131
26 23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000556131
27 23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000311936
28 23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000557334
29 23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000256078
30 16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000556131
31 16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000311936
32 16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000557334
33 16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000256078
34 13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000311936
35 13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000556131
36 13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000557334
37 13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000256078
38 11 + missense_variant 25380275 11 T 11 G Q/H 12 11 61 ENSG00000133703 ENST00000311936
39 11 + missense_variant 25380275 11 T 11 G Q/H 12 11 61 ENSG00000133703 ENST00000256078
40 10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000557334
41 10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000311936
42 10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000556131
43 10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000256078
44 9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000557334
45 9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000556131
46 9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000311936
47 9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000256078
48 7 + missense_variant 25380276 7 T 7 A Q/L 12 7 61 ENSG00000133703 ENST00000256078
49 7 + missense_variant 25378562 7 C 7 T A/T 12 7 146 ENSG00000133703 ENST00000256078
50 7 + missense_variant 25378562 7 C 7 T A/T 12 7 146 ENSG00000133703 ENST00000311936
51 7 + missense_variant 25380276 7 T 7 A Q/L 12 7 61 ENSG00000133703 ENST00000311936
52 5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000311936
53 5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000256078
54 5 + missense_variant 25380276 5 T 5 C Q/R 12 5 61 ENSG00000133703 ENST00000311936
55 5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000557334
56 5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000556131
57 5 + missense_variant 25380276 5 T 5 C Q/R 12 5 61 ENSG00000133703 ENST00000256078
58 4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000256078
59 4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000557334
60 4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000311936
61 4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000556131
62 3 + missense_variant 25380277 3 G 3 T Q/K 12 3 61 ENSG00000133703 ENST00000256078
63 3 + missense_variant 25380275 3 T 3 A Q/H 12 3 61 ENSG00000133703 ENST00000256078
64 3 + missense_variant 25378647 3 T 3 G K/N 12 3 117 ENSG00000133703 ENST00000256078
65 3 + missense_variant 25380275 3 T 3 A Q/H 12 3 61 ENSG00000133703 ENST00000311936
66 3 + missense_variant 25378647 3 T 3 G K/N 12 3 117 ENSG00000133703 ENST00000311936
67 3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000256078
68 3 - missense_variant 25380275 3 A 3 C Q/H 12 3 61 ENSG00000133703 ENST00000256078
69 3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000256078
70 3 + missense_variant 25380277 3 G 3 T Q/K 12 3 61 ENSG00000133703 ENST00000311936
71 3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000311936
72 3 - missense_variant 25380275 3 A 3 C Q/H 12 3 61 ENSG00000133703 ENST00000311936
73 3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000311936
74 3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000556131
75 3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000557334
76 3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000557334
77 3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000556131
78 2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000556131
79 2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000311936
80 2 - missense_variant 25380276 2 A 2 G Q/R 12 2 61 ENSG00000133703 ENST00000311936
81 2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000557334
82 2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000556131
83 2 - missense_variant 25378562 2 G 2 A A/T 12 2 146 ENSG00000133703 ENST00000311936
84 2 - missense_variant 25378562 2 G 2 A A/T 12 2 146 ENSG00000133703 ENST00000256078
85 2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000256078
86 2 - missense_variant 25380276 2 A 2 G Q/R 12 2 61 ENSG00000133703 ENST00000256078
87 2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000311936
88 2 + missense_variant 25378561 2 G 2 A A/V 12 2 146 ENSG00000133703 ENST00000311936
89 2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000256078
90 2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000557334
91 2 + missense_variant 25378561 2 G 2 A A/V 12 2 146 ENSG00000133703 ENST00000256078
92 1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000557334
93 1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000557334
94 1 - missense_variant 25362743 1 A 1 T S/C 12 1 72 ENSG00000133703 ENST00000557334
95 1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000557334
96 1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000557334
97 1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000557334
98 0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000557334
99 1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000557334
100 0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000557334
101 0 + synonymous_variant 25380278 0 A 1 G - 12 1 60 ENSG00000133703 ENST00000311936
102 1 - missense_variant 25378647 1 A 1 T K/N 12 1 117 ENSG00000133703 ENST00000256078
103 1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000256078
104 1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000256078
105 1 + missense_variant 25362743 1 A 1 G C/R 12 1 185 ENSG00000133703 ENST00000311936
106 0 + inframe_deletion 25362744 0 CTTTGT 1 - - 12 1 183-184 ENSG00000133703 ENST00000311936
107 1 + missense_variant 25378557 1 C 1 G K/N 12 1 147 ENSG00000133703 ENST00000311936
108 1 + missense_variant 25378562 1 C 1 G A/P 12 1 146 ENSG00000133703 ENST00000311936
109 1 + missense_variant 25378562 1 C 1 T A/T 12 1 146.0 ENSG00000133703 ENST00000311936
110 1 + missense_variant 25378594 1 C 1 G R/T 12 1 135 ENSG00000133703 ENST00000311936
111 1 + missense_variant 25378645 1 C 1 G C/S 12 1 118 ENSG00000133703 ENST00000311936
112 1 + missense_variant 25380240 1 C 1 A R/M 12 1 73.0 ENSG00000133703 ENST00000311936
113 1 + missense_variant 25380254 1 C 1 A R/S 12 1 68 ENSG00000133703 ENST00000311936
114 1 + missense_variant 25380271 1 C 1 T E/K 12 1 63.0 ENSG00000133703 ENST00000311936
115 1 + missense_variant 25380274 1 C 1 T E/K 12 1 62 ENSG00000133703 ENST00000311936
116 1 + missense_variant 25380275 1 T 1 G Q/H 12 1 61.0 ENSG00000133703 ENST00000311936
117 1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000256078
118 1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000256078
119 1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000256078
120 1 + missense_variant 25380282 1 G 1 C A/G 12 1 59 ENSG00000133703 ENST00000256078
121 1 + missense_variant 25380271 1 C 1 T E/K 12 1 63.0 ENSG00000133703 ENST00000256078
122 1 + missense_variant 25380274 1 C 1 T E/K 12 1 62 ENSG00000133703 ENST00000256078
123 1 + missense_variant 25380275 1 T 1 G Q/H 12 1 61.0 ENSG00000133703 ENST00000256078
124 1 + missense_variant 25380277 1 GA 1 TT GQ/GK 12 1 60-61 ENSG00000133703 ENST00000256078
125 0 + synonymous_variant 25380278 0 A 1 G - 12 1 60 ENSG00000133703 ENST00000256078
126 0 + synonymous_variant 25380278 0 A 1 T - 12 1 60 ENSG00000133703 ENST00000256078
127 1 + missense_variant 25380282 1 G 1 T A/E 12 1 59 ENSG00000133703 ENST00000256078
128 0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000256078
129 1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000256078
130 1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000256078
131 0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000256078
132 1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000256078
133 0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000256078
134 1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000256078
135 1 + missense_variant 25380277 1 GA 1 TT GQ/GK 12 1 60-61 ENSG00000133703 ENST00000311936
136 0 + synonymous_variant 25380278 0 A 1 T - 12 1 60 ENSG00000133703 ENST00000311936
137 1 + missense_variant 25380240 1 C 1 A R/M 12 1 73.0 ENSG00000133703 ENST00000256078
138 1 + missense_variant 25380282 1 G 1 C A/G 12 1 59 ENSG00000133703 ENST00000311936
139 1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000556131
140 0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000556131
141 1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000556131
142 0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000556131
143 1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000556131
144 1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000556131
145 1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000556131
146 1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000556131
147 1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000556131
148 1 + missense_variant 25362743 1 A 1 G C/R 12 1 72 ENSG00000133703 ENST00000557334
149 0 + inframe_deletion 25362744 0 CTTTGT 1 - - 12 1 70-71 ENSG00000133703 ENST00000557334
150 1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000557334
151 1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000557334
152 0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000557334
153 1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000557334
154 0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000556131
155 1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000556131
156 1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000556131
157 1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000311936
158 1 + missense_variant 25380282 1 G 1 T A/E 12 1 59 ENSG00000133703 ENST00000311936
159 1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000311936
160 1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000311936
161 0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000311936
162 1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000311936
163 0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000311936
164 0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000311936
165 1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000311936
166 1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000311936
167 1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000311936
168 1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000311936
169 1 - missense_variant 25362743 1 A 1 T S/C 12 1 185 ENSG00000133703 ENST00000311936
170 1 - missense_variant 25378647 1 A 1 T K/N 12 1 117 ENSG00000133703 ENST00000311936
171 1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000311936
172 1 + missense_variant 25380254 1 C 1 A R/S 12 1 68 ENSG00000133703 ENST00000256078
173 1 + missense_variant 25378645 1 C 1 G C/S 12 1 118 ENSG00000133703 ENST00000256078
174 1 + missense_variant 25378594 1 C 1 G R/T 12 1 135 ENSG00000133703 ENST00000256078
175 1 + missense_variant 25368454 1 C 1 T R/Q 12 1 164 ENSG00000133703 ENST00000256078
176 1 + missense_variant 25368473 1 T 1 C T/A 12 1 158 ENSG00000133703 ENST00000256078
177 1 + missense_variant 25378557 1 C 1 G K/N 12 1 147 ENSG00000133703 ENST00000256078
178 1 + missense_variant 25378562 1 C 1 G A/P 12 1 146 ENSG00000133703 ENST00000256078
179 1 + missense_variant 25378562 1 C 1 T A/T 12 1 146.0 ENSG00000133703 ENST00000256078

View File

@ -1,49 +1,49 @@
MUTS_PAM STRAND MOST_SEVERE START MUTS_PAM_SAMPLES REF MUTS_CS ALT AA_CHANGE CHR MUTS_CS_SAMPLES PROTEIN_POS GENE TRANSCRIPT
2 + missense_variant 3119330 2 G 2 A R/Q 17 2 139 ENSG00000172146 ENST00000304094
2 + missense_variant 3119138 2 C 2 T S/L 17 2 75 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119772 0 C 2 T - 17 2 286 ENSG00000172146 ENST00000304094
1 + missense_variant 3119791 1 C 1 T R/W 17 1 293 ENSG00000172146 ENST00000304094
1 + missense_variant 3119799 1 G 1 A M/I 17 1 295 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119805 0 T 1 C - 17 1 297 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119823 0 C 1 T - 17 1 303 ENSG00000172146 ENST00000304094
1 + missense_variant 3119786 1 G 1 A R/K 17 1 291 ENSG00000172146 ENST00000304094
1 + missense_variant 3119744 1 C 1 G T/R 17 1 277 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119691 0 C 1 T - 17 1 259 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119589 0 C 1 T - 17 1 225 ENSG00000172146 ENST00000304094
1 + missense_variant 3119408 1 G 1 A S/N 17 1 165 ENSG00000172146 ENST00000304094
1 + missense_variant 3119431 1 G 1 A E/K 17 1 173 ENSG00000172146 ENST00000304094
1 + missense_variant 3119462 1 C 1 T P/L 17 1 183 ENSG00000172146 ENST00000304094
1 + stop_gained 3119514 1 C 1 G - 17 1 200 ENSG00000172146 ENST00000304094
1 + missense_variant 3119530 1 T 1 G F/V 17 1 206 ENSG00000172146 ENST00000304094
1 + missense_variant 3119581 1 A 1 G T/A 17 1 223 ENSG00000172146 ENST00000304094
1 + stop_gained 3119590 1 C 1 T - 17 1 226 ENSG00000172146 ENST00000304094
1 + missense_variant 3119679 1 G 1 T M/I 17 1 255 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119592 0 G 1 A - 17 1 226 ENSG00000172146 ENST00000304094
1 + missense_variant 3119596 1 C 1 T P/S 17 1 228 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119610 0 C 1 T - 17 1 232 ENSG00000172146 ENST00000304094
1 + missense_variant 3119627 1 C 1 T S/F 17 1 238 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119640 0 C 1 A - 17 1 242 ENSG00000172146 ENST00000304094
1 + missense_variant 3119672 1 C 1 T T/I 17 1 253 ENSG00000172146 ENST00000304094
1 + missense_variant 3119395 1 C 1 A L/M 17 1 161 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119403 0 A 1 G - 17 1 163 ENSG00000172146 ENST00000304094
1 + missense_variant 3119386 1 C 1 T P/S 17 1 158 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119289 0 C 1 A - 17 1 125 ENSG00000172146 ENST00000304094
1 + stop_gained 3118972 1 C 1 T - 17 1 20 ENSG00000172146 ENST00000304094
1 + missense_variant 3118978 1 G 1 A E/K 17 1 22 ENSG00000172146 ENST00000304094
1 + missense_variant 3118986 1 A 1 C E/D 17 1 24 ENSG00000172146 ENST00000304094
1 + missense_variant 3119002 1 C 1 T L/F 17 1 30 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119029 0 T 1 C - 17 1 39 ENSG00000172146 ENST00000304094
1 + missense_variant 3119074 1 C 1 T R/C 17 1 54 ENSG00000172146 ENST00000304094
1 + missense_variant 3119075 1 G 1 A R/H 17 1 54 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119076 0 C 1 T - 17 1 54 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119115 0 C 1 T - 17 1 67 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119139 0 G 1 A - 17 1 75 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119187 0 C 1 T - 17 1 91 ENSG00000172146 ENST00000304094
1 + missense_variant 3119210 1 C 1 T T/M 17 1 99 ENSG00000172146 ENST00000304094
1 + missense_variant 3119217 1 G 1 A M/I 17 1 101 ENSG00000172146 ENST00000304094
1 + missense_variant 3119264 1 C 1 T A/V 17 1 117 ENSG00000172146 ENST00000304094
1 + missense_variant 3119269 1 G 1 A A/T 17 1 119 ENSG00000172146 ENST00000304094
1 + missense_variant 3118961 1 G 1 A G/E 17 1 16 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3118956 0 C 1 A - 17 1 14 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3118944 0 G 1 A - 17 1 10 ENSG00000172146 ENST00000304094
1 + missense_variant 3118928 1 A 1 C N/T 17 1 5 ENSG00000172146 ENST00000304094
MUTS_PAM STRAND MOST_SEVERE START MUTS_PAM_SAMPLES REF MUTS_CS ALT AA_CHANGE CHR MUTS_CS_SAMPLES PROTEIN_POS GENE TRANSCRIPT
2 + missense_variant 3119330 2 G 2 A R/Q 17 2 139 ENSG00000172146 ENST00000304094
2 + missense_variant 3119138 2 C 2 T S/L 17 2 75 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119772 0 C 2 T - 17 2 286 ENSG00000172146 ENST00000304094
1 + missense_variant 3119791 1 C 1 T R/W 17 1 293 ENSG00000172146 ENST00000304094
1 + missense_variant 3119799 1 G 1 A M/I 17 1 295 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119805 0 T 1 C - 17 1 297 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119823 0 C 1 T - 17 1 303 ENSG00000172146 ENST00000304094
1 + missense_variant 3119786 1 G 1 A R/K 17 1 291 ENSG00000172146 ENST00000304094
1 + missense_variant 3119744 1 C 1 G T/R 17 1 277 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119691 0 C 1 T - 17 1 259 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119589 0 C 1 T - 17 1 225 ENSG00000172146 ENST00000304094
1 + missense_variant 3119408 1 G 1 A S/N 17 1 165 ENSG00000172146 ENST00000304094
1 + missense_variant 3119431 1 G 1 A E/K 17 1 173 ENSG00000172146 ENST00000304094
1 + missense_variant 3119462 1 C 1 T P/L 17 1 183 ENSG00000172146 ENST00000304094
1 + stop_gained 3119514 1 C 1 G - 17 1 200 ENSG00000172146 ENST00000304094
1 + missense_variant 3119530 1 T 1 G F/V 17 1 206 ENSG00000172146 ENST00000304094
1 + missense_variant 3119581 1 A 1 G T/A 17 1 223 ENSG00000172146 ENST00000304094
1 + stop_gained 3119590 1 C 1 T - 17 1 226 ENSG00000172146 ENST00000304094
1 + missense_variant 3119679 1 G 1 T M/I 17 1 255 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119592 0 G 1 A - 17 1 226 ENSG00000172146 ENST00000304094
1 + missense_variant 3119596 1 C 1 T P/S 17 1 228 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119610 0 C 1 T - 17 1 232 ENSG00000172146 ENST00000304094
1 + missense_variant 3119627 1 C 1 T S/F 17 1 238 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119640 0 C 1 A - 17 1 242 ENSG00000172146 ENST00000304094
1 + missense_variant 3119672 1 C 1 T T/I 17 1 253 ENSG00000172146 ENST00000304094
1 + missense_variant 3119395 1 C 1 A L/M 17 1 161 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119403 0 A 1 G - 17 1 163 ENSG00000172146 ENST00000304094
1 + missense_variant 3119386 1 C 1 T P/S 17 1 158 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119289 0 C 1 A - 17 1 125 ENSG00000172146 ENST00000304094
1 + stop_gained 3118972 1 C 1 T - 17 1 20 ENSG00000172146 ENST00000304094
1 + missense_variant 3118978 1 G 1 A E/K 17 1 22 ENSG00000172146 ENST00000304094
1 + missense_variant 3118986 1 A 1 C E/D 17 1 24 ENSG00000172146 ENST00000304094
1 + missense_variant 3119002 1 C 1 T L/F 17 1 30 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119029 0 T 1 C - 17 1 39 ENSG00000172146 ENST00000304094
1 + missense_variant 3119074 1 C 1 T R/C 17 1 54 ENSG00000172146 ENST00000304094
1 + missense_variant 3119075 1 G 1 A R/H 17 1 54 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119076 0 C 1 T - 17 1 54 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119115 0 C 1 T - 17 1 67 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119139 0 G 1 A - 17 1 75 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119187 0 C 1 T - 17 1 91 ENSG00000172146 ENST00000304094
1 + missense_variant 3119210 1 C 1 T T/M 17 1 99 ENSG00000172146 ENST00000304094
1 + missense_variant 3119217 1 G 1 A M/I 17 1 101 ENSG00000172146 ENST00000304094
1 + missense_variant 3119264 1 C 1 T A/V 17 1 117 ENSG00000172146 ENST00000304094
1 + missense_variant 3119269 1 G 1 A A/T 17 1 119 ENSG00000172146 ENST00000304094
1 + missense_variant 3118961 1 G 1 A G/E 17 1 16 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3118956 0 C 1 A - 17 1 14 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3118944 0 G 1 A - 17 1 10 ENSG00000172146 ENST00000304094
1 + missense_variant 3118928 1 A 1 C N/T 17 1 5 ENSG00000172146 ENST00000304094

1 MUTS_PAM STRAND MOST_SEVERE START MUTS_PAM_SAMPLES REF MUTS_CS ALT AA_CHANGE CHR MUTS_CS_SAMPLES PROTEIN_POS GENE TRANSCRIPT
2 2 + missense_variant 3119330 2 G 2 A R/Q 17 2 139 ENSG00000172146 ENST00000304094
3 2 + missense_variant 3119138 2 C 2 T S/L 17 2 75 ENSG00000172146 ENST00000304094
4 0 + synonymous_variant 3119772 0 C 2 T - 17 2 286 ENSG00000172146 ENST00000304094
5 1 + missense_variant 3119791 1 C 1 T R/W 17 1 293 ENSG00000172146 ENST00000304094
6 1 + missense_variant 3119799 1 G 1 A M/I 17 1 295 ENSG00000172146 ENST00000304094
7 0 + synonymous_variant 3119805 0 T 1 C - 17 1 297 ENSG00000172146 ENST00000304094
8 0 + synonymous_variant 3119823 0 C 1 T - 17 1 303 ENSG00000172146 ENST00000304094
9 1 + missense_variant 3119786 1 G 1 A R/K 17 1 291 ENSG00000172146 ENST00000304094
10 1 + missense_variant 3119744 1 C 1 G T/R 17 1 277 ENSG00000172146 ENST00000304094
11 0 + synonymous_variant 3119691 0 C 1 T - 17 1 259 ENSG00000172146 ENST00000304094
12 0 + synonymous_variant 3119589 0 C 1 T - 17 1 225 ENSG00000172146 ENST00000304094
13 1 + missense_variant 3119408 1 G 1 A S/N 17 1 165 ENSG00000172146 ENST00000304094
14 1 + missense_variant 3119431 1 G 1 A E/K 17 1 173 ENSG00000172146 ENST00000304094
15 1 + missense_variant 3119462 1 C 1 T P/L 17 1 183 ENSG00000172146 ENST00000304094
16 1 + stop_gained 3119514 1 C 1 G - 17 1 200 ENSG00000172146 ENST00000304094
17 1 + missense_variant 3119530 1 T 1 G F/V 17 1 206 ENSG00000172146 ENST00000304094
18 1 + missense_variant 3119581 1 A 1 G T/A 17 1 223 ENSG00000172146 ENST00000304094
19 1 + stop_gained 3119590 1 C 1 T - 17 1 226 ENSG00000172146 ENST00000304094
20 1 + missense_variant 3119679 1 G 1 T M/I 17 1 255 ENSG00000172146 ENST00000304094
21 0 + synonymous_variant 3119592 0 G 1 A - 17 1 226 ENSG00000172146 ENST00000304094
22 1 + missense_variant 3119596 1 C 1 T P/S 17 1 228 ENSG00000172146 ENST00000304094
23 0 + synonymous_variant 3119610 0 C 1 T - 17 1 232 ENSG00000172146 ENST00000304094
24 1 + missense_variant 3119627 1 C 1 T S/F 17 1 238 ENSG00000172146 ENST00000304094
25 0 + synonymous_variant 3119640 0 C 1 A - 17 1 242 ENSG00000172146 ENST00000304094
26 1 + missense_variant 3119672 1 C 1 T T/I 17 1 253 ENSG00000172146 ENST00000304094
27 1 + missense_variant 3119395 1 C 1 A L/M 17 1 161 ENSG00000172146 ENST00000304094
28 0 + synonymous_variant 3119403 0 A 1 G - 17 1 163 ENSG00000172146 ENST00000304094
29 1 + missense_variant 3119386 1 C 1 T P/S 17 1 158 ENSG00000172146 ENST00000304094
30 0 + synonymous_variant 3119289 0 C 1 A - 17 1 125 ENSG00000172146 ENST00000304094
31 1 + stop_gained 3118972 1 C 1 T - 17 1 20 ENSG00000172146 ENST00000304094
32 1 + missense_variant 3118978 1 G 1 A E/K 17 1 22 ENSG00000172146 ENST00000304094
33 1 + missense_variant 3118986 1 A 1 C E/D 17 1 24 ENSG00000172146 ENST00000304094
34 1 + missense_variant 3119002 1 C 1 T L/F 17 1 30 ENSG00000172146 ENST00000304094
35 0 + synonymous_variant 3119029 0 T 1 C - 17 1 39 ENSG00000172146 ENST00000304094
36 1 + missense_variant 3119074 1 C 1 T R/C 17 1 54 ENSG00000172146 ENST00000304094
37 1 + missense_variant 3119075 1 G 1 A R/H 17 1 54 ENSG00000172146 ENST00000304094
38 0 + synonymous_variant 3119076 0 C 1 T - 17 1 54 ENSG00000172146 ENST00000304094
39 0 + synonymous_variant 3119115 0 C 1 T - 17 1 67 ENSG00000172146 ENST00000304094
40 0 + synonymous_variant 3119139 0 G 1 A - 17 1 75 ENSG00000172146 ENST00000304094
41 0 + synonymous_variant 3119187 0 C 1 T - 17 1 91 ENSG00000172146 ENST00000304094
42 1 + missense_variant 3119210 1 C 1 T T/M 17 1 99 ENSG00000172146 ENST00000304094
43 1 + missense_variant 3119217 1 G 1 A M/I 17 1 101 ENSG00000172146 ENST00000304094
44 1 + missense_variant 3119264 1 C 1 T A/V 17 1 117 ENSG00000172146 ENST00000304094
45 1 + missense_variant 3119269 1 G 1 A A/T 17 1 119 ENSG00000172146 ENST00000304094
46 1 + missense_variant 3118961 1 G 1 A G/E 17 1 16 ENSG00000172146 ENST00000304094
47 0 + synonymous_variant 3118956 0 C 1 A - 17 1 14 ENSG00000172146 ENST00000304094
48 0 + synonymous_variant 3118944 0 G 1 A - 17 1 10 ENSG00000172146 ENST00000304094
49 1 + missense_variant 3118928 1 A 1 C N/T 17 1 5 ENSG00000172146 ENST00000304094

View File

@ -1,113 +1,113 @@
MUTS_PAM STRAND MOST_SEVERE START MUTS_PAM_SAMPLES REF MUTS_CS ALT AA_CHANGE CHR MUTS_CS_SAMPLES PROTEIN_POS GENE TRANSCRIPT
5 + missense_variant 112926888 5 G 5 T G/V 12 5 503 ENSG00000179295 ENST00000351677
4 + missense_variant 112926270 4 C 4 T T/M 12 4 468 ENSG00000179295 ENST00000351677
3 + missense_variant 112888198 3 G 3 A A/T 12 3 72 ENSG00000179295 ENST00000392597
3 + missense_variant 112888198 3 G 3 A A/T 12 3 72 ENSG00000179295 ENST00000351677
2 + missense_variant 112926910 2 G 2 C Q/H 12 2 510 ENSG00000179295 ENST00000351677
2 + missense_variant 112926909 2 A 2 T Q/L 12 2 510 ENSG00000179295 ENST00000351677
2 + missense_variant 112926900 2 C 2 A T/K 12 2 507 ENSG00000179295 ENST00000351677
2 + missense_variant 112891006 2 C 2 T H/Y 12 2 114 ENSG00000179295 ENST00000392597
2 + missense_variant 112888210 2 G 2 A E/K 12 2 76 ENSG00000179295 ENST00000392597
2 + missense_variant 112888199 2 C 2 T A/V 12 2 72 ENSG00000179295 ENST00000392597
2 + missense_variant 112888199 2 C 2 A A/D 12 2 72 ENSG00000179295 ENST00000392597
2 + missense_variant 112891006 2 C 2 T H/Y 12 2 114 ENSG00000179295 ENST00000351677
2 + missense_variant 112888210 2 G 2 A E/K 12 2 76 ENSG00000179295 ENST00000351677
2 + missense_variant 112888199 2 C 2 T A/V 12 2 72 ENSG00000179295 ENST00000351677
2 + missense_variant 112888199 2 C 2 A A/D 12 2 72 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112893822 0 T 1 C - 12 1 82 ENSG00000179295 ENST00000530818
1 + missense_variant 112910837 1 C 1 G I/M 12 1 282 ENSG00000179295 ENST00000392597
1 + missense_variant 112910844 1 T 1 G F/V 12 1 285.0 ENSG00000179295 ENST00000392597
0 + synonymous_variant 112915507 0 A 1 G - 12 1 302 ENSG00000179295 ENST00000392597
1 + missense_variant 112915523 1 A 1 G N/D 12 1 308 ENSG00000179295 ENST00000392597
1 + missense_variant 112915743 1 A 1 G N/S 12 1 339 ENSG00000179295 ENST00000392597
1 + missense_variant 112919908 1 T 1 G Y/D 12 1 375 ENSG00000179295 ENST00000392597
1 + frameshift_variant 112920002 1 - 1 T - 12 1 406 ENSG00000179295 ENST00000392597
1 + missense_variant 112924286 1 C 1 T T/M 12 1 411 ENSG00000179295 ENST00000392597
1 + stop_gained 112924308 1 C 1 A - 12 1 418 ENSG00000179295 ENST00000392597
1 + missense_variant 112924331 1 A 1 T H/L 12 1 426 ENSG00000179295 ENST00000392597
1 + missense_variant 112924336 1 G 1 A V/M 12 1 428 ENSG00000179295 ENST00000392597
1 + missense_variant 112892383 1 G 1 C V/L 12 1 26 ENSG00000179295 ENST00000530818
0 + synonymous_variant 112892409 0 T 1 C - 12 1 34 ENSG00000179295 ENST00000530818
1 + stop_gained 112893784 1 G 1 T - 12 1 70 ENSG00000179295 ENST00000530818
0 + synonymous_variant 112893798 0 A 1 G - 12 1 74 ENSG00000179295 ENST00000530818
1 + missense_variant 112910775 1 C 1 T L/F 12 1 262 ENSG00000179295 ENST00000392597
0 + synonymous_variant 112893822 0 T 1 C - 12 1 237 ENSG00000179295 ENST00000392597
0 + synonymous_variant 112893802 0 C 1 A - 12 1 231 ENSG00000179295 ENST00000392597
1 + missense_variant 112888211 1 A 1 C E/A 12 1 76 ENSG00000179295 ENST00000392597
1 + missense_variant 112888165 1 G 1 T D/Y 12 1 61 ENSG00000179295 ENST00000392597
1 + missense_variant 112888189 1 G 1 A E/K 12 1 69.0 ENSG00000179295 ENST00000392597
1 + missense_variant 112888189 1 G 1 A E/K 12 1 69 ENSG00000179295 ENST00000392597
1 + missense_variant 112888195 1 T 1 C F/L 12 1 71 ENSG00000179295 ENST00000392597
1 + missense_variant 112888197 1 T 1 A F/L 12 1 71 ENSG00000179295 ENST00000392597
1 + missense_variant 112888211 1 A 1 C E/A 12 1 76.0 ENSG00000179295 ENST00000392597
1 + missense_variant 112891015 1 C 1 T L/F 12 1 117 ENSG00000179295 ENST00000392597
0 + synonymous_variant 112893798 0 A 1 G - 12 1 229 ENSG00000179295 ENST00000392597
1 + missense_variant 112891073 1 T 1 A L/H 12 1 136 ENSG00000179295 ENST00000392597
0 + synonymous_variant 112891116 0 T 1 C - 12 1 150 ENSG00000179295 ENST00000392597
1 + missense_variant 112891129 1 G 1 T D/Y 12 1 155 ENSG00000179295 ENST00000392597
1 + missense_variant 112892383 1 G 1 C V/L 12 1 181 ENSG00000179295 ENST00000392597
0 + synonymous_variant 112892409 0 T 1 C - 12 1 189 ENSG00000179295 ENST00000392597
1 + stop_gained 112893784 1 G 1 T - 12 1 225 ENSG00000179295 ENST00000392597
0 + synonymous_variant 112893802 0 C 1 A - 12 1 76 ENSG00000179295 ENST00000530818
1 + missense_variant 112888163 1 G 1 T G/V 12 1 60 ENSG00000179295 ENST00000392597
1 + missense_variant 112888165 1 G 1 A D/N 12 1 61 ENSG00000179295 ENST00000392597
1 + missense_variant 112888162 1 G 1 C G/R 12 1 60 ENSG00000179295 ENST00000392597
0 + synonymous_variant 112893822 0 T 1 C - 12 1 237 ENSG00000179295 ENST00000351677
1 + missense_variant 112888165 1 G 1 T D/Y 12 1 61 ENSG00000179295 ENST00000351677
1 + missense_variant 112888189 1 G 1 A E/K 12 1 69.0 ENSG00000179295 ENST00000351677
1 + missense_variant 112888189 1 G 1 A E/K 12 1 69 ENSG00000179295 ENST00000351677
1 + missense_variant 112888195 1 T 1 C F/L 12 1 71 ENSG00000179295 ENST00000351677
1 + missense_variant 112888197 1 T 1 A F/L 12 1 71 ENSG00000179295 ENST00000351677
1 + missense_variant 112888211 1 A 1 C E/A 12 1 76.0 ENSG00000179295 ENST00000351677
1 + missense_variant 112888211 1 A 1 C E/A 12 1 76 ENSG00000179295 ENST00000351677
1 + missense_variant 112891015 1 C 1 T L/F 12 1 117 ENSG00000179295 ENST00000351677
1 + missense_variant 112891073 1 T 1 A L/H 12 1 136 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112891116 0 T 1 C - 12 1 150 ENSG00000179295 ENST00000351677
1 + missense_variant 112891129 1 G 1 T D/Y 12 1 155 ENSG00000179295 ENST00000351677
1 + missense_variant 112892383 1 G 1 C V/L 12 1 181 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112892409 0 T 1 C - 12 1 189 ENSG00000179295 ENST00000351677
1 + stop_gained 112893784 1 G 1 T - 12 1 225 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112893798 0 A 1 G - 12 1 229 ENSG00000179295 ENST00000351677
1 + missense_variant 112888165 1 G 1 A D/N 12 1 61 ENSG00000179295 ENST00000351677
1 + missense_variant 112888163 1 G 1 T G/V 12 1 60 ENSG00000179295 ENST00000351677
1 + missense_variant 112888162 1 G 1 C G/R 12 1 60 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112888161 0 T 1 C - 12 1 59 ENSG00000179295 ENST00000351677
1 + missense_variant 112884103 1 G 1 A G/D 12 1 13 ENSG00000179295 ENST00000351677
1 + missense_variant 112888139 1 C 1 G T/S 12 1 52 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112893802 0 C 1 A - 12 1 231 ENSG00000179295 ENST00000351677
1 + missense_variant 112910775 1 C 1 T L/F 12 1 262 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112888161 0 T 1 C - 12 1 59 ENSG00000179295 ENST00000392597
1 + missense_variant 112910837 1 C 1 G I/M 12 1 282 ENSG00000179295 ENST00000351677
1 + missense_variant 112926887 1 G 1 C G/R 12 1 503 ENSG00000179295 ENST00000351677
1 + missense_variant 112926908 1 C 1 G Q/E 12 1 510.0 ENSG00000179295 ENST00000351677
1 + missense_variant 112939963 1 G 1 C G/R 12 1 539 ENSG00000179295 ENST00000351677
1 + missense_variant 112939970 1 A 1 T E/V 12 1 541 ENSG00000179295 ENST00000351677
1 + missense_variant 112939981 1 A 1 C I/L 12 1 545 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112939993 0 C 1 T - 12 1 549 ENSG00000179295 ENST00000351677
1 + missense_variant 112939999 1 G 1 A D/N 12 1 551 ENSG00000179295 ENST00000351677
1 + missense_variant 112940012 1 G 1 A G/E 12 1 555 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112940025 0 T 1 C - 12 1 559 ENSG00000179295 ENST00000351677
1 + missense_variant 112940027 1 T 1 C L/P 12 1 560 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112940031 0 G 1 A - 12 1 561 ENSG00000179295 ENST00000351677
1 + missense_variant 112940036 1 G 1 T C/F 12 1 563 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112940052 0 C 1 T - 12 1 568 ENSG00000179295 ENST00000351677
1 + missense_variant 112884103 1 G 1 A G/D 12 1 13 ENSG00000179295 ENST00000392597
1 + missense_variant 112888139 1 C 1 G T/S 12 1 52 ENSG00000179295 ENST00000392597
1 + missense_variant 112926885 1 C 1 T S/L 12 1 502 ENSG00000179295 ENST00000351677
1 + missense_variant 112926884 1 T 1 C S/P 12 1 502 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112926862 0 C 1 T - 12 1 494 ENSG00000179295 ENST00000351677
1 + missense_variant 112924286 1 C 1 T T/M 12 1 411 ENSG00000179295 ENST00000351677
1 + missense_variant 112910844 1 T 1 G F/V 12 1 285.0 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112915507 0 A 1 G - 12 1 302 ENSG00000179295 ENST00000351677
1 + missense_variant 112915523 1 A 1 G N/D 12 1 308 ENSG00000179295 ENST00000351677
1 + missense_variant 112915743 1 A 1 G N/S 12 1 339 ENSG00000179295 ENST00000351677
1 + missense_variant 112919908 1 T 1 G Y/D 12 1 375 ENSG00000179295 ENST00000351677
1 + frameshift_variant 112920002 1 - 1 T - 12 1 406 ENSG00000179295 ENST00000351677
1 + stop_gained 112924308 1 C 1 A - 12 1 418 ENSG00000179295 ENST00000351677
1 + missense_variant 112926852 1 C 1 T P/L 12 1 491 ENSG00000179295 ENST00000351677
1 + missense_variant 112924331 1 A 1 T H/L 12 1 426 ENSG00000179295 ENST00000351677
1 + missense_variant 112924336 1 G 1 A V/M 12 1 428 ENSG00000179295 ENST00000351677
1 + missense_variant 112926248 1 G 1 A A/T 12 1 461 ENSG00000179295 ENST00000351677
1 + missense_variant 112926249 1 C 1 G A/G 12 1 461 ENSG00000179295 ENST00000351677
1 + missense_variant 112926291 1 TT 1 CA L/P 12 1 475 ENSG00000179295 ENST00000351677
1 + missense_variant 112926839 1 G 1 T D/Y 12 1 487 ENSG00000179295 ENST00000351677
MUTS_PAM STRAND MOST_SEVERE START MUTS_PAM_SAMPLES REF MUTS_CS ALT AA_CHANGE CHR MUTS_CS_SAMPLES PROTEIN_POS GENE TRANSCRIPT
5 + missense_variant 112926888 5 G 5 T G/V 12 5 503 ENSG00000179295 ENST00000351677
4 + missense_variant 112926270 4 C 4 T T/M 12 4 468 ENSG00000179295 ENST00000351677
3 + missense_variant 112888198 3 G 3 A A/T 12 3 72 ENSG00000179295 ENST00000392597
3 + missense_variant 112888198 3 G 3 A A/T 12 3 72 ENSG00000179295 ENST00000351677
2 + missense_variant 112926910 2 G 2 C Q/H 12 2 510 ENSG00000179295 ENST00000351677
2 + missense_variant 112926909 2 A 2 T Q/L 12 2 510 ENSG00000179295 ENST00000351677
2 + missense_variant 112926900 2 C 2 A T/K 12 2 507 ENSG00000179295 ENST00000351677
2 + missense_variant 112891006 2 C 2 T H/Y 12 2 114 ENSG00000179295 ENST00000392597
2 + missense_variant 112888210 2 G 2 A E/K 12 2 76 ENSG00000179295 ENST00000392597
2 + missense_variant 112888199 2 C 2 T A/V 12 2 72 ENSG00000179295 ENST00000392597
2 + missense_variant 112888199 2 C 2 A A/D 12 2 72 ENSG00000179295 ENST00000392597
2 + missense_variant 112891006 2 C 2 T H/Y 12 2 114 ENSG00000179295 ENST00000351677
2 + missense_variant 112888210 2 G 2 A E/K 12 2 76 ENSG00000179295 ENST00000351677
2 + missense_variant 112888199 2 C 2 T A/V 12 2 72 ENSG00000179295 ENST00000351677
2 + missense_variant 112888199 2 C 2 A A/D 12 2 72 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112893822 0 T 1 C - 12 1 82 ENSG00000179295 ENST00000530818
1 + missense_variant 112910837 1 C 1 G I/M 12 1 282 ENSG00000179295 ENST00000392597
1 + missense_variant 112910844 1 T 1 G F/V 12 1 285.0 ENSG00000179295 ENST00000392597
0 + synonymous_variant 112915507 0 A 1 G - 12 1 302 ENSG00000179295 ENST00000392597
1 + missense_variant 112915523 1 A 1 G N/D 12 1 308 ENSG00000179295 ENST00000392597
1 + missense_variant 112915743 1 A 1 G N/S 12 1 339 ENSG00000179295 ENST00000392597
1 + missense_variant 112919908 1 T 1 G Y/D 12 1 375 ENSG00000179295 ENST00000392597
1 + frameshift_variant 112920002 1 - 1 T - 12 1 406 ENSG00000179295 ENST00000392597
1 + missense_variant 112924286 1 C 1 T T/M 12 1 411 ENSG00000179295 ENST00000392597
1 + stop_gained 112924308 1 C 1 A - 12 1 418 ENSG00000179295 ENST00000392597
1 + missense_variant 112924331 1 A 1 T H/L 12 1 426 ENSG00000179295 ENST00000392597
1 + missense_variant 112924336 1 G 1 A V/M 12 1 428 ENSG00000179295 ENST00000392597
1 + missense_variant 112892383 1 G 1 C V/L 12 1 26 ENSG00000179295 ENST00000530818
0 + synonymous_variant 112892409 0 T 1 C - 12 1 34 ENSG00000179295 ENST00000530818
1 + stop_gained 112893784 1 G 1 T - 12 1 70 ENSG00000179295 ENST00000530818
0 + synonymous_variant 112893798 0 A 1 G - 12 1 74 ENSG00000179295 ENST00000530818
1 + missense_variant 112910775 1 C 1 T L/F 12 1 262 ENSG00000179295 ENST00000392597
0 + synonymous_variant 112893822 0 T 1 C - 12 1 237 ENSG00000179295 ENST00000392597
0 + synonymous_variant 112893802 0 C 1 A - 12 1 231 ENSG00000179295 ENST00000392597
1 + missense_variant 112888211 1 A 1 C E/A 12 1 76 ENSG00000179295 ENST00000392597
1 + missense_variant 112888165 1 G 1 T D/Y 12 1 61 ENSG00000179295 ENST00000392597
1 + missense_variant 112888189 1 G 1 A E/K 12 1 69.0 ENSG00000179295 ENST00000392597
1 + missense_variant 112888189 1 G 1 A E/K 12 1 69 ENSG00000179295 ENST00000392597
1 + missense_variant 112888195 1 T 1 C F/L 12 1 71 ENSG00000179295 ENST00000392597
1 + missense_variant 112888197 1 T 1 A F/L 12 1 71 ENSG00000179295 ENST00000392597
1 + missense_variant 112888211 1 A 1 C E/A 12 1 76.0 ENSG00000179295 ENST00000392597
1 + missense_variant 112891015 1 C 1 T L/F 12 1 117 ENSG00000179295 ENST00000392597
0 + synonymous_variant 112893798 0 A 1 G - 12 1 229 ENSG00000179295 ENST00000392597
1 + missense_variant 112891073 1 T 1 A L/H 12 1 136 ENSG00000179295 ENST00000392597
0 + synonymous_variant 112891116 0 T 1 C - 12 1 150 ENSG00000179295 ENST00000392597
1 + missense_variant 112891129 1 G 1 T D/Y 12 1 155 ENSG00000179295 ENST00000392597
1 + missense_variant 112892383 1 G 1 C V/L 12 1 181 ENSG00000179295 ENST00000392597
0 + synonymous_variant 112892409 0 T 1 C - 12 1 189 ENSG00000179295 ENST00000392597
1 + stop_gained 112893784 1 G 1 T - 12 1 225 ENSG00000179295 ENST00000392597
0 + synonymous_variant 112893802 0 C 1 A - 12 1 76 ENSG00000179295 ENST00000530818
1 + missense_variant 112888163 1 G 1 T G/V 12 1 60 ENSG00000179295 ENST00000392597
1 + missense_variant 112888165 1 G 1 A D/N 12 1 61 ENSG00000179295 ENST00000392597
1 + missense_variant 112888162 1 G 1 C G/R 12 1 60 ENSG00000179295 ENST00000392597
0 + synonymous_variant 112893822 0 T 1 C - 12 1 237 ENSG00000179295 ENST00000351677
1 + missense_variant 112888165 1 G 1 T D/Y 12 1 61 ENSG00000179295 ENST00000351677
1 + missense_variant 112888189 1 G 1 A E/K 12 1 69.0 ENSG00000179295 ENST00000351677
1 + missense_variant 112888189 1 G 1 A E/K 12 1 69 ENSG00000179295 ENST00000351677
1 + missense_variant 112888195 1 T 1 C F/L 12 1 71 ENSG00000179295 ENST00000351677
1 + missense_variant 112888197 1 T 1 A F/L 12 1 71 ENSG00000179295 ENST00000351677
1 + missense_variant 112888211 1 A 1 C E/A 12 1 76.0 ENSG00000179295 ENST00000351677
1 + missense_variant 112888211 1 A 1 C E/A 12 1 76 ENSG00000179295 ENST00000351677
1 + missense_variant 112891015 1 C 1 T L/F 12 1 117 ENSG00000179295 ENST00000351677
1 + missense_variant 112891073 1 T 1 A L/H 12 1 136 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112891116 0 T 1 C - 12 1 150 ENSG00000179295 ENST00000351677
1 + missense_variant 112891129 1 G 1 T D/Y 12 1 155 ENSG00000179295 ENST00000351677
1 + missense_variant 112892383 1 G 1 C V/L 12 1 181 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112892409 0 T 1 C - 12 1 189 ENSG00000179295 ENST00000351677
1 + stop_gained 112893784 1 G 1 T - 12 1 225 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112893798 0 A 1 G - 12 1 229 ENSG00000179295 ENST00000351677
1 + missense_variant 112888165 1 G 1 A D/N 12 1 61 ENSG00000179295 ENST00000351677
1 + missense_variant 112888163 1 G 1 T G/V 12 1 60 ENSG00000179295 ENST00000351677
1 + missense_variant 112888162 1 G 1 C G/R 12 1 60 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112888161 0 T 1 C - 12 1 59 ENSG00000179295 ENST00000351677
1 + missense_variant 112884103 1 G 1 A G/D 12 1 13 ENSG00000179295 ENST00000351677
1 + missense_variant 112888139 1 C 1 G T/S 12 1 52 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112893802 0 C 1 A - 12 1 231 ENSG00000179295 ENST00000351677
1 + missense_variant 112910775 1 C 1 T L/F 12 1 262 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112888161 0 T 1 C - 12 1 59 ENSG00000179295 ENST00000392597
1 + missense_variant 112910837 1 C 1 G I/M 12 1 282 ENSG00000179295 ENST00000351677
1 + missense_variant 112926887 1 G 1 C G/R 12 1 503 ENSG00000179295 ENST00000351677
1 + missense_variant 112926908 1 C 1 G Q/E 12 1 510.0 ENSG00000179295 ENST00000351677
1 + missense_variant 112939963 1 G 1 C G/R 12 1 539 ENSG00000179295 ENST00000351677
1 + missense_variant 112939970 1 A 1 T E/V 12 1 541 ENSG00000179295 ENST00000351677
1 + missense_variant 112939981 1 A 1 C I/L 12 1 545 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112939993 0 C 1 T - 12 1 549 ENSG00000179295 ENST00000351677
1 + missense_variant 112939999 1 G 1 A D/N 12 1 551 ENSG00000179295 ENST00000351677
1 + missense_variant 112940012 1 G 1 A G/E 12 1 555 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112940025 0 T 1 C - 12 1 559 ENSG00000179295 ENST00000351677
1 + missense_variant 112940027 1 T 1 C L/P 12 1 560 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112940031 0 G 1 A - 12 1 561 ENSG00000179295 ENST00000351677
1 + missense_variant 112940036 1 G 1 T C/F 12 1 563 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112940052 0 C 1 T - 12 1 568 ENSG00000179295 ENST00000351677
1 + missense_variant 112884103 1 G 1 A G/D 12 1 13 ENSG00000179295 ENST00000392597
1 + missense_variant 112888139 1 C 1 G T/S 12 1 52 ENSG00000179295 ENST00000392597
1 + missense_variant 112926885 1 C 1 T S/L 12 1 502 ENSG00000179295 ENST00000351677
1 + missense_variant 112926884 1 T 1 C S/P 12 1 502 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112926862 0 C 1 T - 12 1 494 ENSG00000179295 ENST00000351677
1 + missense_variant 112924286 1 C 1 T T/M 12 1 411 ENSG00000179295 ENST00000351677
1 + missense_variant 112910844 1 T 1 G F/V 12 1 285.0 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112915507 0 A 1 G - 12 1 302 ENSG00000179295 ENST00000351677
1 + missense_variant 112915523 1 A 1 G N/D 12 1 308 ENSG00000179295 ENST00000351677
1 + missense_variant 112915743 1 A 1 G N/S 12 1 339 ENSG00000179295 ENST00000351677
1 + missense_variant 112919908 1 T 1 G Y/D 12 1 375 ENSG00000179295 ENST00000351677
1 + frameshift_variant 112920002 1 - 1 T - 12 1 406 ENSG00000179295 ENST00000351677
1 + stop_gained 112924308 1 C 1 A - 12 1 418 ENSG00000179295 ENST00000351677
1 + missense_variant 112926852 1 C 1 T P/L 12 1 491 ENSG00000179295 ENST00000351677
1 + missense_variant 112924331 1 A 1 T H/L 12 1 426 ENSG00000179295 ENST00000351677
1 + missense_variant 112924336 1 G 1 A V/M 12 1 428 ENSG00000179295 ENST00000351677
1 + missense_variant 112926248 1 G 1 A A/T 12 1 461 ENSG00000179295 ENST00000351677
1 + missense_variant 112926249 1 C 1 G A/G 12 1 461 ENSG00000179295 ENST00000351677
1 + missense_variant 112926291 1 TT 1 CA L/P 12 1 475 ENSG00000179295 ENST00000351677
1 + missense_variant 112926839 1 G 1 T D/Y 12 1 487 ENSG00000179295 ENST00000351677

1 MUTS_PAM STRAND MOST_SEVERE START MUTS_PAM_SAMPLES REF MUTS_CS ALT AA_CHANGE CHR MUTS_CS_SAMPLES PROTEIN_POS GENE TRANSCRIPT
2 5 + missense_variant 112926888 5 G 5 T G/V 12 5 503 ENSG00000179295 ENST00000351677
3 4 + missense_variant 112926270 4 C 4 T T/M 12 4 468 ENSG00000179295 ENST00000351677
4 3 + missense_variant 112888198 3 G 3 A A/T 12 3 72 ENSG00000179295 ENST00000392597
5 3 + missense_variant 112888198 3 G 3 A A/T 12 3 72 ENSG00000179295 ENST00000351677
6 2 + missense_variant 112926910 2 G 2 C Q/H 12 2 510 ENSG00000179295 ENST00000351677
7 2 + missense_variant 112926909 2 A 2 T Q/L 12 2 510 ENSG00000179295 ENST00000351677
8 2 + missense_variant 112926900 2 C 2 A T/K 12 2 507 ENSG00000179295 ENST00000351677
9 2 + missense_variant 112891006 2 C 2 T H/Y 12 2 114 ENSG00000179295 ENST00000392597
10 2 + missense_variant 112888210 2 G 2 A E/K 12 2 76 ENSG00000179295 ENST00000392597
11 2 + missense_variant 112888199 2 C 2 T A/V 12 2 72 ENSG00000179295 ENST00000392597
12 2 + missense_variant 112888199 2 C 2 A A/D 12 2 72 ENSG00000179295 ENST00000392597
13 2 + missense_variant 112891006 2 C 2 T H/Y 12 2 114 ENSG00000179295 ENST00000351677
14 2 + missense_variant 112888210 2 G 2 A E/K 12 2 76 ENSG00000179295 ENST00000351677
15 2 + missense_variant 112888199 2 C 2 T A/V 12 2 72 ENSG00000179295 ENST00000351677
16 2 + missense_variant 112888199 2 C 2 A A/D 12 2 72 ENSG00000179295 ENST00000351677
17 0 + synonymous_variant 112893822 0 T 1 C - 12 1 82 ENSG00000179295 ENST00000530818
18 1 + missense_variant 112910837 1 C 1 G I/M 12 1 282 ENSG00000179295 ENST00000392597
19 1 + missense_variant 112910844 1 T 1 G F/V 12 1 285.0 ENSG00000179295 ENST00000392597
20 0 + synonymous_variant 112915507 0 A 1 G - 12 1 302 ENSG00000179295 ENST00000392597
21 1 + missense_variant 112915523 1 A 1 G N/D 12 1 308 ENSG00000179295 ENST00000392597
22 1 + missense_variant 112915743 1 A 1 G N/S 12 1 339 ENSG00000179295 ENST00000392597
23 1 + missense_variant 112919908 1 T 1 G Y/D 12 1 375 ENSG00000179295 ENST00000392597
24 1 + frameshift_variant 112920002 1 - 1 T - 12 1 406 ENSG00000179295 ENST00000392597
25 1 + missense_variant 112924286 1 C 1 T T/M 12 1 411 ENSG00000179295 ENST00000392597
26 1 + stop_gained 112924308 1 C 1 A - 12 1 418 ENSG00000179295 ENST00000392597
27 1 + missense_variant 112924331 1 A 1 T H/L 12 1 426 ENSG00000179295 ENST00000392597
28 1 + missense_variant 112924336 1 G 1 A V/M 12 1 428 ENSG00000179295 ENST00000392597
29 1 + missense_variant 112892383 1 G 1 C V/L 12 1 26 ENSG00000179295 ENST00000530818
30 0 + synonymous_variant 112892409 0 T 1 C - 12 1 34 ENSG00000179295 ENST00000530818
31 1 + stop_gained 112893784 1 G 1 T - 12 1 70 ENSG00000179295 ENST00000530818
32 0 + synonymous_variant 112893798 0 A 1 G - 12 1 74 ENSG00000179295 ENST00000530818
33 1 + missense_variant 112910775 1 C 1 T L/F 12 1 262 ENSG00000179295 ENST00000392597
34 0 + synonymous_variant 112893822 0 T 1 C - 12 1 237 ENSG00000179295 ENST00000392597
35 0 + synonymous_variant 112893802 0 C 1 A - 12 1 231 ENSG00000179295 ENST00000392597
36 1 + missense_variant 112888211 1 A 1 C E/A 12 1 76 ENSG00000179295 ENST00000392597
37 1 + missense_variant 112888165 1 G 1 T D/Y 12 1 61 ENSG00000179295 ENST00000392597
38 1 + missense_variant 112888189 1 G 1 A E/K 12 1 69.0 ENSG00000179295 ENST00000392597
39 1 + missense_variant 112888189 1 G 1 A E/K 12 1 69 ENSG00000179295 ENST00000392597
40 1 + missense_variant 112888195 1 T 1 C F/L 12 1 71 ENSG00000179295 ENST00000392597
41 1 + missense_variant 112888197 1 T 1 A F/L 12 1 71 ENSG00000179295 ENST00000392597
42 1 + missense_variant 112888211 1 A 1 C E/A 12 1 76.0 ENSG00000179295 ENST00000392597
43 1 + missense_variant 112891015 1 C 1 T L/F 12 1 117 ENSG00000179295 ENST00000392597
44 0 + synonymous_variant 112893798 0 A 1 G - 12 1 229 ENSG00000179295 ENST00000392597
45 1 + missense_variant 112891073 1 T 1 A L/H 12 1 136 ENSG00000179295 ENST00000392597
46 0 + synonymous_variant 112891116 0 T 1 C - 12 1 150 ENSG00000179295 ENST00000392597
47 1 + missense_variant 112891129 1 G 1 T D/Y 12 1 155 ENSG00000179295 ENST00000392597
48 1 + missense_variant 112892383 1 G 1 C V/L 12 1 181 ENSG00000179295 ENST00000392597
49 0 + synonymous_variant 112892409 0 T 1 C - 12 1 189 ENSG00000179295 ENST00000392597
50 1 + stop_gained 112893784 1 G 1 T - 12 1 225 ENSG00000179295 ENST00000392597
51 0 + synonymous_variant 112893802 0 C 1 A - 12 1 76 ENSG00000179295 ENST00000530818
52 1 + missense_variant 112888163 1 G 1 T G/V 12 1 60 ENSG00000179295 ENST00000392597
53 1 + missense_variant 112888165 1 G 1 A D/N 12 1 61 ENSG00000179295 ENST00000392597
54 1 + missense_variant 112888162 1 G 1 C G/R 12 1 60 ENSG00000179295 ENST00000392597
55 0 + synonymous_variant 112893822 0 T 1 C - 12 1 237 ENSG00000179295 ENST00000351677
56 1 + missense_variant 112888165 1 G 1 T D/Y 12 1 61 ENSG00000179295 ENST00000351677
57 1 + missense_variant 112888189 1 G 1 A E/K 12 1 69.0 ENSG00000179295 ENST00000351677
58 1 + missense_variant 112888189 1 G 1 A E/K 12 1 69 ENSG00000179295 ENST00000351677
59 1 + missense_variant 112888195 1 T 1 C F/L 12 1 71 ENSG00000179295 ENST00000351677
60 1 + missense_variant 112888197 1 T 1 A F/L 12 1 71 ENSG00000179295 ENST00000351677
61 1 + missense_variant 112888211 1 A 1 C E/A 12 1 76.0 ENSG00000179295 ENST00000351677
62 1 + missense_variant 112888211 1 A 1 C E/A 12 1 76 ENSG00000179295 ENST00000351677
63 1 + missense_variant 112891015 1 C 1 T L/F 12 1 117 ENSG00000179295 ENST00000351677
64 1 + missense_variant 112891073 1 T 1 A L/H 12 1 136 ENSG00000179295 ENST00000351677
65 0 + synonymous_variant 112891116 0 T 1 C - 12 1 150 ENSG00000179295 ENST00000351677
66 1 + missense_variant 112891129 1 G 1 T D/Y 12 1 155 ENSG00000179295 ENST00000351677
67 1 + missense_variant 112892383 1 G 1 C V/L 12 1 181 ENSG00000179295 ENST00000351677
68 0 + synonymous_variant 112892409 0 T 1 C - 12 1 189 ENSG00000179295 ENST00000351677
69 1 + stop_gained 112893784 1 G 1 T - 12 1 225 ENSG00000179295 ENST00000351677
70 0 + synonymous_variant 112893798 0 A 1 G - 12 1 229 ENSG00000179295 ENST00000351677
71 1 + missense_variant 112888165 1 G 1 A D/N 12 1 61 ENSG00000179295 ENST00000351677
72 1 + missense_variant 112888163 1 G 1 T G/V 12 1 60 ENSG00000179295 ENST00000351677
73 1 + missense_variant 112888162 1 G 1 C G/R 12 1 60 ENSG00000179295 ENST00000351677
74 0 + synonymous_variant 112888161 0 T 1 C - 12 1 59 ENSG00000179295 ENST00000351677
75 1 + missense_variant 112884103 1 G 1 A G/D 12 1 13 ENSG00000179295 ENST00000351677
76 1 + missense_variant 112888139 1 C 1 G T/S 12 1 52 ENSG00000179295 ENST00000351677
77 0 + synonymous_variant 112893802 0 C 1 A - 12 1 231 ENSG00000179295 ENST00000351677
78 1 + missense_variant 112910775 1 C 1 T L/F 12 1 262 ENSG00000179295 ENST00000351677
79 0 + synonymous_variant 112888161 0 T 1 C - 12 1 59 ENSG00000179295 ENST00000392597
80 1 + missense_variant 112910837 1 C 1 G I/M 12 1 282 ENSG00000179295 ENST00000351677
81 1 + missense_variant 112926887 1 G 1 C G/R 12 1 503 ENSG00000179295 ENST00000351677
82 1 + missense_variant 112926908 1 C 1 G Q/E 12 1 510.0 ENSG00000179295 ENST00000351677
83 1 + missense_variant 112939963 1 G 1 C G/R 12 1 539 ENSG00000179295 ENST00000351677
84 1 + missense_variant 112939970 1 A 1 T E/V 12 1 541 ENSG00000179295 ENST00000351677
85 1 + missense_variant 112939981 1 A 1 C I/L 12 1 545 ENSG00000179295 ENST00000351677
86 0 + synonymous_variant 112939993 0 C 1 T - 12 1 549 ENSG00000179295 ENST00000351677
87 1 + missense_variant 112939999 1 G 1 A D/N 12 1 551 ENSG00000179295 ENST00000351677
88 1 + missense_variant 112940012 1 G 1 A G/E 12 1 555 ENSG00000179295 ENST00000351677
89 0 + synonymous_variant 112940025 0 T 1 C - 12 1 559 ENSG00000179295 ENST00000351677
90 1 + missense_variant 112940027 1 T 1 C L/P 12 1 560 ENSG00000179295 ENST00000351677
91 0 + synonymous_variant 112940031 0 G 1 A - 12 1 561 ENSG00000179295 ENST00000351677
92 1 + missense_variant 112940036 1 G 1 T C/F 12 1 563 ENSG00000179295 ENST00000351677
93 0 + synonymous_variant 112940052 0 C 1 T - 12 1 568 ENSG00000179295 ENST00000351677
94 1 + missense_variant 112884103 1 G 1 A G/D 12 1 13 ENSG00000179295 ENST00000392597
95 1 + missense_variant 112888139 1 C 1 G T/S 12 1 52 ENSG00000179295 ENST00000392597
96 1 + missense_variant 112926885 1 C 1 T S/L 12 1 502 ENSG00000179295 ENST00000351677
97 1 + missense_variant 112926884 1 T 1 C S/P 12 1 502 ENSG00000179295 ENST00000351677
98 0 + synonymous_variant 112926862 0 C 1 T - 12 1 494 ENSG00000179295 ENST00000351677
99 1 + missense_variant 112924286 1 C 1 T T/M 12 1 411 ENSG00000179295 ENST00000351677
100 1 + missense_variant 112910844 1 T 1 G F/V 12 1 285.0 ENSG00000179295 ENST00000351677
101 0 + synonymous_variant 112915507 0 A 1 G - 12 1 302 ENSG00000179295 ENST00000351677
102 1 + missense_variant 112915523 1 A 1 G N/D 12 1 308 ENSG00000179295 ENST00000351677
103 1 + missense_variant 112915743 1 A 1 G N/S 12 1 339 ENSG00000179295 ENST00000351677
104 1 + missense_variant 112919908 1 T 1 G Y/D 12 1 375 ENSG00000179295 ENST00000351677
105 1 + frameshift_variant 112920002 1 - 1 T - 12 1 406 ENSG00000179295 ENST00000351677
106 1 + stop_gained 112924308 1 C 1 A - 12 1 418 ENSG00000179295 ENST00000351677
107 1 + missense_variant 112926852 1 C 1 T P/L 12 1 491 ENSG00000179295 ENST00000351677
108 1 + missense_variant 112924331 1 A 1 T H/L 12 1 426 ENSG00000179295 ENST00000351677
109 1 + missense_variant 112924336 1 G 1 A V/M 12 1 428 ENSG00000179295 ENST00000351677
110 1 + missense_variant 112926248 1 G 1 A A/T 12 1 461 ENSG00000179295 ENST00000351677
111 1 + missense_variant 112926249 1 C 1 G A/G 12 1 461 ENSG00000179295 ENST00000351677
112 1 + missense_variant 112926291 1 TT 1 CA L/P 12 1 475 ENSG00000179295 ENST00000351677
113 1 + missense_variant 112926839 1 G 1 T D/Y 12 1 487 ENSG00000179295 ENST00000351677

View File

@ -1,39 +1,39 @@
>MBP1_ASPNI AN3154 XP_660758 Q5B8H6
-VYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRI
LEREVQKGVHEKVQGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDY
>MBP1_BIPOR COCMIDRAFT_338 XP_007682304 W6ZM86
KIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRI
LEREVQKGVHEKVQGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDY
>MBP1_COPCI - XP_001837394 A8NYC6
QIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRV
LEREVQKGEHEKVQGGYGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEF
>MBP1_CRYNE - XP_569090 Q5KMQ9
DYVPTSVSPPPAPKHSVA--PPSKARRDKEKETGRTKATPSRTGPTSAAA
LQAQAQLN-RAKMHDSTPDADASFRSFEERVSLTEDDSSSDTPSPVASV
>MBP1_NEUCR Swi4 XP_955821 Q7RW59
-IYSATYSGIPVWEYQFGVDHVMRRRHDDWVNATHILKAAGFDKPARTRI
LEREVQKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEF
>MBP1_PUCGR PGTG_08863 XP_003327086 E3KED4
-IYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRV
LEREIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNF
>MBP1_SACCE Mbp1 NP_010227 P39678
QIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRI
LEKEVLKETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDF
>MBP1_SCHPO Res2 NP_593032 P41412
-VHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRV
LERQVQIGAHEKVQGGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILS-
>MBP1_USTMA UMAG_11222 XP_011392621 A0A0D1DP35
-IFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRV
LEREIQKGIHEKVQGGYGKYQGTWIPLDVAIELAERYNIQGLLQPITSY
>MBP1_WALME - XP_006957051 I4YGC0
-IYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRI
LEREVQKGTHEKVQGGYGKYQGTWIPMERSVELARQYRIELLLDPIINY
>MBP1_ASPNI AN3154 XP_660758 Q5B8H6
-VYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRI
LEREVQKGVHEKVQGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDY
>MBP1_BIPOR COCMIDRAFT_338 XP_007682304 W6ZM86
KIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRI
LEREVQKGVHEKVQGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDY
>MBP1_COPCI - XP_001837394 A8NYC6
QIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRV
LEREVQKGEHEKVQGGYGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEF
>MBP1_CRYNE - XP_569090 Q5KMQ9
DYVPTSVSPPPAPKHSVA--PPSKARRDKEKETGRTKATPSRTGPTSAAA
LQAQAQLN-RAKMHDSTPDADASFRSFEERVSLTEDDSSSDTPSPVASV
>MBP1_NEUCR Swi4 XP_955821 Q7RW59
-IYSATYSGIPVWEYQFGVDHVMRRRHDDWVNATHILKAAGFDKPARTRI
LEREVQKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEF
>MBP1_PUCGR PGTG_08863 XP_003327086 E3KED4
-IYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRV
LEREIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNF
>MBP1_SACCE Mbp1 NP_010227 P39678
QIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRI
LEKEVLKETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDF
>MBP1_SCHPO Res2 NP_593032 P41412
-VHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRV
LERQVQIGAHEKVQGGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILS-
>MBP1_USTMA UMAG_11222 XP_011392621 A0A0D1DP35
-IFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRV
LEREIQKGIHEKVQGGYGKYQGTWIPLDVAIELAERYNIQGLLQPITSY
>MBP1_WALME - XP_006957051 I4YGC0
-IYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRI
LEREVQKGTHEKVQGGYGKYQGTWIPMERSVELARQYRIELLLDPIINY

View File

@ -1,490 +1,490 @@
[
{ "name" : "68476_WALME",
"RefSeqID" : "XP_006957790",
"UniProtID" : "I4YDD8",
"taxonomyID" : "671144",
"sequence" : [
"MKEEKEKTPPNNITGPPTPAQNILHSTPAAFGTAGTVGQGAGGFGSQLYQSPYVDSQQSVIGSPVTPAPLPKKATLKTPQ",
"PRIYSAVYSGVGVYEAMIRGIAVMRRRADGYMNATQILKVAGVDKGRRTKILEREILAGLHEKIQGGYGKYQGTWIPFER",
"GRELALQYGCDHLLAPIFDFNPSVMQPSAGRSAKSPSKKRQNSIVLSPTQERHQSSIIALNTARASGIYVGGADDPNDDG",
"LSKKEKSPVKKSKYDEVPVNVSKRPYVPPPGTNAHILTRTQQSLTALFQQPTTNSDFIPEAVAILDTTSGALHPDLAIDE",
"LGHTALHWAASLGRISNVQQLIKKGADMKRGNIEGETPLERSVLVNDNYDKKTFAYLLQELGSSIRVVDRTGRSILHHIA",
"LIAAVNGRSMSAKYYMENVLEYIARYENGEFKSLVDLQDEHGDTALNISARVGNRNLVKMLVDAGANKTVVNKLGLKASD",
"FGVEHETLNSVTGDEMLSNLQPPPPLNVDSSASVLENIHNLLNGITQQYTDETSGKNALLFEIQAELKQHSHELADVRKE",
"IQYWQNKATQMAEVDQKIKNINEAIENEKVQTWSLLGEANADKMEGIETSSSSNTSEIKIPTGDNEESLKQLRKLSKWLE",
"GTQKLTEERVASIDGLSASKEVKYKSIVSVCTGVPVNEVEGMLAQLLEAMESDANADLNKVQEFLAREC"]
},
{ "name" : "00846_COPCI",
"RefSeqID" : "XP_001831299",
"UniProtID" : "A8N8X1",
"taxonomyID" : "240176",
"sequence" : [
"MQASTRPPGSNQPPVKIYNAVYSSVQVYECMVRGIAVMRRRNDSYVNATQILKVAGVDKGRRTKILEKEILPGKHEIVQG",
"GYGKYQGTWIPLERGRDIAAQYGVAPLLSPLFDFQPSTNSLGALPVSTPGGTASPRPLSASSSYSSMGVAGQYIPSSIPS",
"NLPPAPIMPGSALRLLNQGRAQGLFTPSTTSATLRPAGYHSPGPYGTSYAPSPQPQSSQTPPPGSGLKRNRSEAEVEGYH",
"SQPHDVQMADAPPPNTASQPNEDNPSPAKRLRTDGSITTEPASSQGQWQQQQPLPYASQQRSGPGLSQLSGHNGHGSSRP",
"PSSLSAPNGNRPAHTNPEDQTRKTRFSSKPSMPRGMDPHMPFKDARRSALIALICHRDDPTSVIDLLREISADHLNPPSF",
"DVDTVLDDQGHTALHLAASMARTQTVDMLIQTGADMHRGNHLGETPLIRACLATPNSDQQSFATLVNYLHDSIWTLDTSK",
"KSVVHHIVSLAGVKGRAVVARYYLDQIFYWIAQHEGGDFRSLVDLQDEHGDTAINIAARVGNRSLVRTLLDVGANRVLAN",
"KLGLRPGDFGVETEELSSGLRAEDLISSLRTGPPAPVQKSQDVIADMTSMIQSLSTEFQAEIKSKQDSLDVTQAHLRAAT",
"RELSEQRKQIQTWQARCGDLDQINQRVRNVEKAIAEEDMFDWTGRTELDGKDGKEKGGPAFAYRGSKSTMVGVGGSVDVS",
"FSVESEPPLPTTDTAASLVKLRRLKMWHQRMEELVKGRLKGLQGASAEKEYQCKKIVALCTGIPLDKVEEMLDNLVIAVE",
"SEAQVVDIGRVSGFMQKVRDGII"]
},
{ "name" : "8533_BIPOR",
"RefSeqID" : "XP_007691662",
"UniProtID" : "W6ZE71",
"taxonomyID" : "930090",
"sequence" : [
"MSTSHSFPAASPSHQQSALYANSPHGHALMAAPAALNRSFSDMSAFHHHAMDKPQIYTAVYSGVSVYEMEVNRVAVMRRR",
"SDGWLNATQILKVAGVDKGKRTKVLEKEILTGEHEKVQGGYGKYQGTWINYRRGREFCRQYGVEDVLRPLLDYDITLDGS",
"HAPGHAIETPTKEQAMAANRKRFYTQSIDGRTTTQNLTGTFFSNISSTATSALAAMNKVARLNSPAPRPSSSSQRRTSAT",
"RPSQSQPPLASQDSFRTSSQQSITSEPSFAGHNGQTDSAYATAVDESQEPPRKRIRASHDDSYSQPTAADMSIHPLSSPT",
"EPSESFDQHHPAQPITLADGDVPTALPPLPYPDTKQDEEKQAMLTDLFADQTRSDFTNHPAILHLSGPDLDMPIDNSSNT",
"ALHWAATLARVSLIRLLVSKGANMFRGNASGQTALMSAVSVNNSLDHSCFPETLEILAPLIELRDSQGRTILHHIAVTCA",
"IKGRAASSKYYLEALLEYLVRSNIGGGQPPPFHDTSNHSKPIGLMRFMQEMVNARDKAGNTALNLAARIGNRNIISQLME",
"VQADPTIPNHKGTRPMDFGVGTDLGDGQGIITATSPTKAKAPLSKAEETSREIQPLMSGILQSASLQFTQEARLKQDAID",
"QTNELITQLSSQQKQEQQKLQTLRARLRQRQDRAKRISNLKRWLEPQRHMLSVNDGAIDLHDKKRIGYADTQGAGLLIKE",
"DDLPYELRQAGDHLDRRASDGPIYLSTSVPLDPSTLSQVSHQPQCQNFLLQQLPAASVLRQRIETYTATNTALLKRSRML",
"KEKDGQLEMMYRKVVSLCTKVEENRIEECLEGLVAALDSEEGEGVEVGRVREFLRKVEGVD"]
},
{ "name" : "PGTG_02039",
"RefSeqID" : "XP_003320997",
"UniProtID" : "E3JX03",
"taxonomyID" : "418459",
"sequence" : [
"MAAHKTTNDIPVSSSHHINPESGTGTSSTQAFPIPNIKNNPHVYMAVYSSVPVYEMMVRGIGVMRRRSDSYMNATQILKV",
"AGLDKSKRTRILEREIIQGEHEKIQGGYGRYQGTWVPFTRAQELATQLNVAQLLAPLFDYRPEPNSEVNIRSTNTKPSSS",
"ASRANSHKTTLARQTSRQSLNEKRERSGDTTPLPHDPPEAGPSKRSRLNTPSRQSNGSANTPSSLIDHSHSAMDPDFIIP",
"HSQSQPTAASQCTTSTFAPIHGATVEYPAGPSHLRKSNSSSRSHLEVALKAERNIHTLMALFSNPPDGDELESETHHENP",
"NSVAEVNEVLEDPELEIDTPIDEHCHTALHWASSLARLGLVRAFLRSGADVNRGNDVGETPLMRSTLVTNNFERESFNQL",
"LELLHPSLWTLDNQDRTVLHHICLTASIKGRGESSRYYLECICEWIVNKHGAQFDSQLFDAVDLNGDTALNIAARVGNKH",
"LVRMLLDVGADMTIGNNLGLKPIDFGVGAGETSASYTDDMISAPLRRNPTASAPARSSRDIITSITSSVNSLSEDFENEI",
"RSKTDRLESVRAQLMVATRQLTTQRRQLESLKHDLDERALLELRLKKLRMAIAEEDGFDWTGRSDLDGRPAQAGKLFEQN",
"GIASTLAGLSASQIQLELEPDPFIPPENNQDSLVYLRRLEKWYVRVLSLLRERIGRMKGSNLEQEAKYLKVIGSFIGNTC",
"TNDLSSSGSSMTGRPANQTTSTTQEVPSRATQNVNPADIHDLESMDGHRRKVSTTDAVNKSHEFGRTRSELLKASMIDNK",
"LLKQLMAAIESDGPELDLNRVAGFMQRVQSGSL"]
},
{ "name" : "MBPA_ASPNI",
"RefSeqID" : "XP_664319",
"UniProtID" : "Q5AYB5",
"taxonomyID" : "227321",
"sequence" : [
"MTTSNHHQQRPSLSMSYSQGSIGSANGMSFSQSQMSSLNASQSVASTPRATPPPKSSQQSAMSFNYSNGLPNGARASFSG",
"FEDMNGYGTMIYHEEFKPQIYRAVYSNVSVYEMEVNGVAVMKRRSDGWLNATQILKVAGVVKARRTKTLEKEIAAGEHEK",
"VQGGYGKYQGTWVNYQRGVELCREYHVEELLRPLLEYDMNPNGTAASGQDSLDTPTKEQAMAAQRKRLYSGMENRSMSQP",
"QQGTFFQNISRTAATAVNAMSKARFESPAARGGDSRRLSVIRKPSQQMGSQDAQPPFGSQQSFYSAASDSGFASNIPTNG",
"RYAPQDAMSFEQEEPMEPPRKRIRSSQAFSLPIDGTSMSMSEPTPTEPNDSFYQDMEPLHHIDEGRHGLDPLPPATTPER",
"FQKMKLIMTLFLDKTTKDFSTHPALIQLSGEDLEVPLDEYRNNALHWAAMLARMPLVYALVKKGVNIARLNGAGETALQK",
"AVGTRNNLDYRSFPRLLQVLAPTIDMVDRSGRTILHHIAVMAATGHGGHVSAKHYLEALLEFIVRHGGTSLNQQSNGTAS",
"QPGMPLSNEVITLGRFISEIVNLRDDQGDTALNLAGRARSVLVPQLLEVGADPHIPNHTGLRPADYGVGVDMVDGSSQPA",
"GSRSDTFLAQLAKTRKEILEATTAQVTAIVQETLGTFDKELAASLTSKQEKFDHWHAKIRESAKARQIEQKQLDELKRRS",
"IDRTETSRRLKNLEKSSTDLLEAHKEILTNLGDTSKPVSLGDADQESGFEIAEFEALFPETFDPASGFSEAQIAYLRKLP",
"SAEILEQRVSCYRAFNKETLDEIDALRSKNVVLGQNYRRMVMACTGWSAEQVDEAAEGLTQCVKELNDNPVPEDEAIEIL",
"MRDRGQDW"]
},
{ "name" : "05520_CRYNE",
"RefSeqID" : "XP_570545",
"UniProtID" : "Q5KHS0",
"taxonomyID" : "214684",
"sequence" : [
"MEPPSNPIQPPVTPSHHSLLSAISPALSEQTPAPIHTLPPHLRPSIPQPHIAPPRPSSVQPTMEEQQRMHHIQQHQQQQH",
"FQQQQNDENVFGSVMGAPGHVPGHEAPMSTQPKVYASVYSGVPVFEAMIRGISVMRRASDSWVNATQILKVAGVHKSART",
"KILEKEVLNGIHEKIQGGYGKYQGTWVPLDRGRDLAEQYGVGSYLSSVFDFVPSASVIAALPVIRTGTPDRSGQQTPSGL",
"PGHPNQRVISPFANHGQTTPHMPPPQFIHQGNEQMMNLPPHPSSLAYPTQPKPYFSMPLQHTVGPQYDERHEGMTMTPTM",
"SMDGLAPPADIARMGFPYNPSDIYIDQYGQPHATYQASPYGKESGHPSKRQRSDAEGSYIESGAAVQQHVEQDEEADDGL",
"DNDSTASDDARDPPPLPSSMLLPHKPIRPKATPANGRIKSRLVQIFNVEGQVNLRSVFGLAPDQLPNFDIDMVIDDQGHS",
"ALHWACALARLSIVQQLIELGADIHRGNYAGETPLIRAVLTSNHAEAGSFTDLLHLLSPSIRTLDHAYRTVLHHIALVAG",
"VKGRVPAARTYMASVLEWVAREQQANNTHSITNPPNPADRNELAPINLRTLVDVQDVHGDTALNVAARVGNKGLVGLLLD",
"AGADKTRANKLGLRPENFGLEIEALKISNGEAVMANLKSEVSKPERKSRDVQKNIATIFESISSTFSSEMLAKQTKLNAT",
"EASVRHATRALADKRQHLHRAQEKLATMQLFEQRSENVRRIMDAIAAGTLLTPAEFTGRTQTMHEKSTGQLPPLAFRHVP",
"GLALDASSQSQLNGAPPSTPLSVEDQEDIALPERDDPECLVKLRRMALWEDRIAEVLEDKIRAMEGEGVDRAVKYRKLVS",
"VCAKVPVDKVDSMLDGLVAAVESEGQGLDFSRASNFVNRIKATKS"]
},
{ "name" : "RES1_SCHPO",
"RefSeqID" : "NP_595496",
"UniProtID" : "P33520",
"taxonomyID" : "284812",
"sequence" : [
"MYNDQIHKITYSGVEVFEYTINGFPLMKRCHDNWLNATQILKIAELDKPRRTRILEKFAQKGLHEKIQGGCGKYQGTWVP",
"SERAVELAHEYNVFDLIQPLIEYSGSAFMPMSTFTPQSNRKPTEAYRRNSPVKKSFSRPSHSLLYPYTSSNNMTSTSRMS",
"GIHDALSLQSDFTRSPDMPSDSFTGSLHDIKASPFSSNNYAQSLLDYFLLPNTTQPPDFVYDRPSDWDVNAGIDEDGHTA",
"LHWAAAMGNLEMMHALLQAGANVVAVNYLQQTSLMRCVMFTMNYDLQTFEVVSELLQSAICMNDSFGQTVFHHIALLASS",
"KSKMEAARYYMDILLQNLTATQSVDVAAQIINLQDDHGDTALLICARNGAKKCARLLLSFYASSSIPNNQGQYPTDFLSS",
"KDMSFPENDDSPLNSKIEDNLIDNLKYPQSLDDHLSSKKPISYFSNKLTHQTLPNVFTQLSELSKCHEASLAEKQLTYNL",
"AMEALEQTVRETETCQRLWNERTNNDENYLVNQREDLIHQCKKFLHTLKTARYYLETVQLHQLKKYVTYFSQIWSTDELA",
"DISETKNLVGHDTKTNRSSLSSKHEVDLFTAENEAAREKLVEQLCSLQAQRKQKINEILNLLSMGMYNTINTDQSGS"]
},
{ "name" : "CDC10_SCHPO",
"RefSeqID" : "NP_596132",
"UniProtID" : "P01129",
"taxonomyID" : "284812",
"sequence" : [
"MASANFIRQFELGNDSFSYQKRPEDEPSQPLSNRNINKLNDSSTLKDSSSRIFINSQVLRDGRPVELYAVECSGMKYMEL",
"SCGDNVALRRCPDSYFNISQILRLAGTSSSENAKELDDIIESGDYENVDSKHPQIDGVWVPYDRAISIAKRYGVYEILQP",
"LISFNLDLFPKFSKQQQIESSSISKNLNTSSFNTRSPLRNHNFSNPSKSSKNGVHTINNMQSSPSPSSSFLLPLTQIDSQ",
"NVKRSNNYLSTSPPILEQRLKRHRIDVSDEDLHPSSQLNDNEASSLFPDTPRLNHSLSFVSLVSSLPPLDQNIMQDYHTS",
"KDILTSIFLDVNFADSSALEAKLSDSLDLDVPIDELGHAALHWAAAVAKMPLLQALIHKGANPLRGNLTGETALMRSVLV",
"TNHLNQNSFGDLLDLLYASLPCTDRAGRTVVHHICLTAGIKGRGSASRYYLETLLNWAKKHASGNNGYMLKDFINYLNHQ",
"DKNGDTALNIAARIGNKNIVEVLMQAGASAYIPNRAGLSVANFGIFVENALKQPEDSKQTKVSLMSENLSSKEKTAVPPR",
"QKSRDIIASVTDVISSLDKDFQDEMAAKQSMIDSAYTQLRESTKKLSDLREQLHVSETQRTLFLELRQRCKNLMTSIEEQ",
"KSELSNLYESFDPNGIHDSLSLDADAPFTVNENNNKNLSIAELKFQVAAYERNEARLNELANKLWQRNSNIKSKCRRVVS",
"LCTGVDESRVDSLLESLLQAVESDGQQGEVDMGRVAGFLRVVKEHQA"]
},
{ "name" : "05338_USTMA",
"RefSeqID" : "XP_011392041",
"UniProtID" : "A0A0D1BWD8",
"taxonomyID" : "237631",
"sequence" : [
"MPLNYFANQDQTASDTYAHEASSFPAPSSILTDTSKPLQPVQEVAASSLVDGVSFTSPHASIIHASKQSPRAASSLSFTT",
"SALQRAGLLPANPNMSTTATSGTSAASESLQRVITQGTASAAAINGASTPAHSGPLTPAHLKNLTPAQANAALQNPVGNI",
"PTVYLATYSNVPVYEITVRGIAVMRRRGDGWLNATQILKIAGIEKTRRTKILEKSILTGEHEKIQGGYGKFQGTWIPLQR",
"AQQVAAEYNVSHLLQPILEFDPATADQIPKLYQRKKPAASARNSSASAINDARGSTPSKIYSPAPASLGGPSQQPRFLSL",
"RPPKETHEQEISSAIFMPPGTAGLLSNGTFVDDRAASALAYPGPPAIPPGSTPAEQAALRSYNVYGYTPQGVPLPSSAAA",
"DGNGTEAAATAASTGAGKREASETDQDGASAAKRSRLTSPQQQRRDDGLLLGPSPVKDLNALGPAGGSLRAASAPRGHRI",
"TVGPPDAAGRDGAVPRYADRALPPKPYDEGEKRMRDRLVSLFSDDGVLPGVSEATGAGASQSAADEDDDAYVAKLDSLLA",
"DLREKASLGGLGASGTDGPKATVDLITDDHGHTALHWASALCRVKLVRTLVARPPWQGGANIHAGNHAGETALHRSVLVT",
"NSYDASSFPTLLNLLSSSLNTRDFKKRTVLHHISLVAALKGRAASARYYLACVLEHISAEKNSKYKGLIDAQDEDGETAL",
"GIVARLGNASMVRMLLDVGARKDLANALGIRPSDWGIESSADGASLTPSQNDGTNTVASLPPLTAADLASQNPSDIISAL",
"TRPAQVPVMKSSDVRDQLSSTLDDLQSSFERELKEKQDAVSTVQSHLQAATRDLAARRKTVSAAQAKLAEKDEARQRVQN",
"LRRAIVAQLGLEEADADLSLEQLVEEAANAASAAPADKSADKMDIDGAEDVKPVRASNLETLIDDILSFDTIQSDLKAVG",
"TSAVTQEVVEQDELVRLRWLVSFYQSSCDELSSTISELEDSSAKKESQCQQVVAICANIPQDKVESMLDELLTAMESDGP",
"DVDLARVANFMQKVGKTRENGDQPGVGAQLSSSTSLSTAVSSGGTAASSVVPAVERDGEDAKPDA"]
},
{ "name" : "SWI4_SACCE",
"RefSeqID" : "NP_011036",
"UniProtID" : "P25302",
"taxonomyID" : "559292",
"sequence" : [
"MPFDVLISNQKDNTNHQNITPISKSVLLAPHSNHPVIEIATYSETDVYECYIRGFETKIVMRRTKDDWINITQVFKIAQF",
"SKTKRTKILEKESNDMQHEKVQGGYGRFQGTWIPLDSAKFLVNKYEIIDPVVNSILTFQFDPNNPPPKRSKNSILRKTSP",
"GTKITSPSSYNKTPRKKNSSSSTSATTTAANKKGKKNASINQPNPSPLQNLVFQTPQQFQVNSSMNIMNNNDNHTTMNFN",
"NDTRHNLINNISNNSNQSTIIQQQKSIHENSFNNNYSATQKPLQFFPIPTNLQNKNVALNNPNNNDSNSYSHNIDNVINS",
"SNNNNNGNNNNLIIVPDGPMQSQQQQQHHHEYLTNNFNHSMMDSITNGNSKKRRKKLNQSNEQQFYNQQEKIQRHFKLMK",
"QPLLWQSFQNPNDHHNEYCDSNGSNNNNNTVASNGSSIEVFSSNENDNSMNMSSRSMTPFSAGNTSSQNKLENKMTDQEY",
"KQTILTILSSERSSDVDQALLATLYPAPKNFNINFEIDDQGHTPLHWATAMANIPLIKMLITLNANALQCNKLGFNCITK",
"SIFYNNCYKENAFDEIISILKICLITPDVNGRLPFHYLIELSVNKSKNPMIIKSYMDSIILSLGQQDYNLLKICLNYQDN",
"IGNTPLHLSALNLNFEVYNRLVYLGASTDILNLDNESPASIMNKFNTPAGGSNSRNNNTKADRKLARNLPQKNYYQQQQQ",
"QQQPQNNVKIPKIIKTQHPDKEDSTADVNIAKTDSEVNESQYLHSNQPNSTNMNTIMEDLSNINSFVTSSVIKDIKSTPS",
"KILENSPILYRRRSQSISDEKEKAKDNENQVEKKKDPLNSVKTAMPSLESPSSLLPIQMSPLGKYSKPLSQQINKLNTKV",
"SSLQRIMGEEIKNLDNEVVETESSISNNKKRLITIAHQIEDAFDSVSNKTPINSISDLQSRIKETSSKLNSEKQNFIQSL",
"EKSQALKLATIVQDEESKVDMNTNSSSHPEKQEDEEPIPKSTSETSSPKNTKADAKFSNTVQESYDVNETLRLATELTIL",
"QFKRRMTTLKISEAKSKINSSVKLDKYRNLIGITIENIDSKLDDIEKDLRANA"]
},
{ "name" : "SWI6_NEUCR",
"RefSeqID" : "XP_962967",
"UniProtID" : "Q7SBG9",
"taxonomyID" : "367110",
"sequence" : [
"MQPPQLGGASQQSQPSSQQSFSMSQSSQSVYRQYTDPPNRLHNDHAVPTIYSATYSGVGVYEMEVNNVAVMRRQKDGWVN",
"ATQILKVANIDKGRRTKILEKEIQIGEHEKVQGGYGKYQGTWIPFERGLEVCRQYGVEELLSKLLTHNRGQEGETGNVDT",
"PTKEQAMAAQRKRMYNASSQENRGIGSTGTFFKNISSTASTAVAAISKARFDSPAPRNRSGPSRAPSFNRQSSMQDVADF",
"PNSQQSLVSTEYATQTQNADSGFGSQTTQPLAGDGLEQPPRKRQRVLTPARSFGGQTPGHQPLDPFNAGNIANGDSGSPT",
"EPSNSFNYDQVTANDGDASYALGPLRPLPYENNADAEAKRGMLMGLFMDANGPEEAIQAALCNVSPQELDSPIDTQSHTA",
"LHWAATLSRMPLLRALIHAGANPWRVNACGETALMRACTVTNSMENNTFPELLDLLGCTLDVTDDKGRTVLHHIAVTSAV",
"KGRHYASRYYLESLLEWVVRQGSAPSSQENGIGDRKGRRMGIARFMSEIVNAQDNSGDTALNVAARVGNRSIISQLLEVG",
"ADPTIPNRANLKPLDFGIGIADAETNDDPAQEKTGATTGSGHKSRETSDEVVRSITHLIGESASIFQNELKKKQESIDTL",
"HSQLRVTSSQVGDARRTLESLQEKLKAQQLAKQKIVNFNRACEEEEQILIELEQRHGRLDVASANAWEMELESALEIVKT",
"QSPKGLDPDSRPSLPSAAVLRARIKALRARSSKTRQAVAALQAQSKEKELKYRRLVSLCTRRPEIEVEALLDTLTRAVES",
"EKPELEIARVRRFLGGVEGVVH"]
},
{ "name" : "15042_USTMA",
"RefSeqID" : "XP_011388143",
"UniProtID" : "A0A0D1CVS5",
"taxonomyID" : "237631",
"sequence" : [
"MSTASPLHHGHGNGSYANSPAPTGVTGRDAGVAAAAVADSAVRSGSVPASASGSAPGSASGSMYGEAHTQHHTGHHHYSA",
"HHTHSHGALTSPVNGGHSSSWSPYGYPAAPVYGGSPSPYGHNAYSQYASGYGYANGTAHHVATAPTTPSATSTAYHTGVN",
"GMMMHHGQHAGYGYSSHHLGSHTPTHTHTHSSAYFMNGDGAHSHLNSSAHLTSPSYTTAPQYSTQLPLAGRHRVTTTLWE",
"DEGTLCFQVDARGVCVARRHDNNMINGTKLLNVCGMSRGKRDGILKNEKERIVVKVGAMHLKGVWISFARAKQLAEQNGI",
"ADALYPLFEPNIQSFLYHPDNYPRTAAVIAAAQERQAQRQRAPGGQPSPGANGTSQAPPLMRANTTPSNGDTSTFSSGLS",
"SLGSWTGSHDQGHASAPTTAQPSPSSMHNGATQMHMSLSNHGTASPTYAQSQQQQQQQQQQQQQQQQQQQQQQQQAYPMT",
"AAQQLARPSVGDRRQSAPISLNNSVGHAENPYGATNLGGAANGGLVNGARKVSGLKRSWNDADDLNGSAAASPTERDMQR",
"SGSGGSNGLKLDGDDLHSPDSSDDRLAKKTRGMPQRGGGATTAMPSMSTNMLMGVGNGSGIHHE"]
},
{ "name" : "04778_USTMA",
"RefSeqID" : "XP_011391646",
"UniProtID" : "A0A0D1DQM4",
"taxonomyID" : "237631",
"sequence" : [
"MNQAPLSATGVNFYISGPRPARLFPTPIHEFRKGKYATAGGESGFMTVFEYDVRGHTMMIDVDTSFVRFTSITQALGKNK",
"VNFGRLVKTCPALDPHITKLKGGYLSIQGTWLPFDLAKELSRRIAWEIRDHLVPLFGYDFPSTCLRPDSEGFGQLAIGMS",
"QKRARKRHNNGGPHQTSCYGPSLPISIELWQHSTDPLRDLGESSVVGGQAIEHVSAKNSAVQPCYGSSQPATFHYSKGYG",
"LESRPWYGQDYLESNSLESMWNSAQAGGGSVGLQVPISTCGATASPCLAAIGANGGSPILSSPPSSNASSSSNQSYTAAG",
"YGLMVPPTVPSHSVNSEAGANQAEGPTPIDGSRSYASLTAHGYATGYGDANASLSTWNDATHASTFTLHVHAHVHFQPPD",
"PESAQLFTIHDFGSDPFYAEQVERG"]
},
{ "name" : "STUA_ASPNI",
"RefSeqID" : "XP_663440",
"UniProtID" : "P36011",
"taxonomyID" : "227321",
"sequence" : [
"MASMNQPQPYMDVHSHLSSGQTYASHPATAGALTHYQYPQQPPVLQPTSTYGPASSYSQYPYPNSVASSQSVPPPTTSIS",
"SQVPAQLLPLPVTNHPVPTHGYGNNSGTPMQGYVYDPTGQMAPPGAKPRVTATLWEDEGSLCYQVEAKGVCVARREDNGM",
"INGTKLLNVAGMTRGRRDGILKSEKVRNVVKIGPMHLKGVWIPFDRALEFANKEKITDLLYPLFVQHISNLLYHPANQNQ",
"RNMTVPDSRRLEGPQPVVRTPQAQQPPSLHHHSLQTPVPSHMSQPGGRPSLDRAHTFPTPPARMNSSVPNTQPLSIDTSL",
"SNARSMPTTPATTPPGNNLQGMQSYQPQSGYDSKPYYSAAPSTHPQYAPQQPLPQQSMAQYGHSMPTSSYRDMAPPSSQR",
"GSVTEIESDVKTERYGQGTVAKTEPEQEQEYAQPDSGYNTGRGSYYTTNPSVGGLAHDHSQLTPDMTGSPQQNGSGRMTP",
"RTSNTAPQWAPGYTTPPRPAAASSLYNIVSDTRGTSGANGSTSDNYSVASNSGYSTGMNGSMGSNKRMRDDDDDRIVPPD",
"SRGEFDTKRRKTLTETPVGGPVGGVPLGLQPMKAGGSLISARR"]
},
{ "name" : "STUA_NEUCR",
"RefSeqID" : "XP_960837",
"UniProtID" : "Q1K6U0",
"taxonomyID" : "367110",
"sequence" : [
"MNPNTPADVYYGQMSQGSSMPVTTVPSHSHYASQQPPPLLQPGSTYAHQYGTPQYGYANALSSPASIPPSLPPSMNSMAG",
"QSVLPLPGSGSMNPAVYASGGFDTTGQVAPPGMKPRVTATLWEDEGSLCFQVEARGICVARREDNAMINGTKLLNVAGMT",
"RGRRDGILKSEKVRHVVKIGPMHLKGVWIPFERALDFANKEKITELLYPLFVHNIGALLYHPTNQSRTSQVMAAAEQRRK",
"DSHGQLRGPPGLPSLQQHHHHHSMLPGPPSLPSHPSMGRPALDRAHTFPTPPTSASSVMGPMGNSDGYQWSQQSMSGTQG",
"NSSLSLDTSLGSNARSMPSTPATTPPGSTIQSMQNYPPVSQSYESSRQMYQGQSAQQAQYQSQQHYSSQPQHQERPVYSQ",
"SSYIKNDMGPPSGRPTGQSNDASDSKPPTGMIHQGQGQSDPGTHAGSEEDDDANNEAEYTHDSGGYDANRGSYNYNTQAV",
"NSLPHDHGLAPEIGGSPHQAGSGRATPRTAAAPSSYYSAQGYHTPPRGQPSSSLYNVMSNERTGSNGTQGNEMYAGQADM",
"PSSLPNGYSAQPSVMNGSSGGLKRGRDDDDDGGRPTTSAPNLGPGMDMKRRKTMMDGGSLPSPTYTATIAQAAPSAIAAH",
"RRR"]
},
{ "name" : "PHD1_SACCE",
"RefSeqID" : "NP_012881",
"UniProtID" : "P36093",
"taxonomyID" : "559292",
"sequence" : [
"MYHVPEMRLHYPLVNTQSNAAITPTRSYDNTLPSFNELSHQSTINLPFVQRETPNAYANVAQLATSPTQAKSGYYCRYYA",
"VPFPTYPQQPQSPYQQAVLPYATIPNSNFQPSSFPVMAVMPPEVQFDGSFLNTLHPHTELPPIIQNTNDTSVARPNNLKS",
"IAAASPTVTATTRTPGVSSTSVLKPRVITTMWEDENTICYQVEANGISVVRRADNNMINGTKLLNVTKMTRGRRDGILRS",
"EKVREVVKIGSMHLKGVWIPFERAYILAQREQILDHLYPLFVKDIESIVDARKPSNKASLTPKSSPAPIKQEPSDNKHEI",
"ATEIKPKSIDALSNGASTQGAGELPHLKINHIDTEAQTSRAKNELS"]
},
{ "name" : "08099_COPCI",
"RefSeqID" : "XP_001836714",
"UniProtID" : "A8NVH3",
"taxonomyID" : "240176",
"sequence" : [
"MSTGMLQETLQTTSASTSGTRFRPYASPNHQVTKGRYITSNDPRGYIPVYEYPLNGQWIMMDIDDGYILWTGIWKALGNS",
"KADIVKMIDSQPDLAPLIRRVRGGYLKIQGTWMPYEVALKLSRRVAWPIRHDLVPLFGPTFPSTCLSPDQPGYGQVVASS",
"NVRRRARRNTQATAQPPREAHSNWTVMTPGPMVGLSFPHSQFSRPPLPPLAPTPARSPSDYAPSSHYGNQLDPQDARRYS",
"HSPYSPLASPPERKSSISSKALSLEIPPVRPSSSKAREDISLPPLKQPDGADPEMSPYALPPISALEDLRGVDTQDSAAV",
"LRRLRLDDDYPSSSRSSTSQDSIWGRRHSLSAHSPHPRSSDNSRFQPYLSSRSYQDSTLKRSRSPAESYADRRRASDFSQ",
"EDSTSAYSPISPATPNSSILSHSSFSDLKKLASSTDTRYNFPRISGRDWAPLKGDTDHIRSSYRSGPSPLELDSDSESSA",
"PHRPW"]
},
{ "name" : "68479_WALME",
"RefSeqID" : "XP_006957792",
"UniProtID" : "I4YDE0",
"taxonomyID" : "671144",
"sequence" : [
"MTNKVQELWWEENKTRVWQVEVDNGNYVARRQDNDQINGTKLLNITKITRGKRDGILKNEKSRQVVKTGTITLKGVWIPF",
"ERAIILARQFNIEQQLYPLFETNLGDYVENSIGSHQIKRKSLNNLMDSLTTNRELVSKRRSTVSTYNPATSAYVSPYGFS",
"PQHCYQTEFEDMNQHSGEIQSGRPRNTSSASDWMTNWSTSSSSPVIPATPNTFSPVMNTFQSLALHSPPIPIPNYYYDSS",
"SSYFPSYHQKQQQQQVQMQMQMHTTASIGGDRQSNEYIQR"]
},
{ "name" : "11943_PUCGR",
"RefSeqID" : "XP_003330006",
"UniProtID" : "E3KMR2",
"taxonomyID" : "418459",
"sequence" : [
"MAAAPTSSFLTSMSAQPPRTVQALVNEEVRAPPPVRLYPSQHRVSMTRYATSTDPRGYIPVFEYPLNGQYIMIDCETGMV",
"HFTGIWKALGHTKADVVKLVESDPTIAPYLRKVRGGYLKIQGTWLPFDTAQTLARRVAWQVRYDLVPLFGPDFPDTCLGP",
"GEPGFGQLLLSAPKPRGRRGAKKAAAAPTVAHERTASPQDNRSQSRPGPYPSQESFGNRCSGRVEAVGAMNGYSPMLSQA",
"RYSPYTRAPVHRITQLEPLPSLIQPNQSCPHPTADSMYSSHYHQSPRQSMMTSHGAGPYGQQHLTGSTASGMQSTAPLPS",
"MRPHQAHQSENNFFETYRGPDSFEALSNKWLAPEVANPSLNDSGLLHGEGGCLPPLQYSNNPVLRNGPSGSPTNQYNFPN",
"QIDSAHSSHHIDSNQTQHVHRHAGFPYESQHQSNFRHDLSTEEAAHHPASPSQQPPPSVTYDKAHNSEPQAGSQAANVTA",
"GCYAASGSNSTGNPAGSPGSHSSHVPKSPTPSSASTSTHMQNSHNPNSHRSPSNTLTNMSNNGGFNSNTQGEEAIQFSVL",
"TSPAHLETSGPSENSIPPAQSSDSDWNPAQNTTGLSPSQAPRQ"]
},
{ "name" : "03082_PUCGR",
"RefSeqID" : "XP_003321545",
"UniProtID" : "E3JYK1",
"taxonomyID" : "418459",
"sequence" : [
"MILISPTRTLPSPRPIDTDPILNYRHIQPAAAAAAVGPWLGQNQHHHHHHDTLAKSPNITTAPATHSPSELSASPAPSAV",
"STGSSLLDPQSVPHIKIPHSSSPPAIMLPQPSSDDDSSTAEEEQPSAQSSNATLNTPTPHTNAPHQLDSHASSVGLYDLP",
"PTSSSAPTTSSSSSPFPSNVPSHQQPSPYSSSPHPNQEHHPHHPHHGNQFYQQSPPALHSPLQSAHHPQQSFDARPHSSL",
"FAHQHYHSRPQSAPHSTSQFSLDPHVLAAAAANVEVKKWDEENTYYYQVAHKGVTVGRLKGSGLVNGTKLLNLAGISRGK",
"RDGILKNEKIRKVVKHGTMHLKGVWIAFDRAVFLAEQHSIADKIFPLLVVNLEHYVPIEPPLMAGGTKLGPGSLFHHHHP",
"RHPRLLPQPIKFPPSTISLAPASANSFSSTGGWPSGPSSALPSIGYNEPFSAPPIPRSAATADTSPSIYEQAQFQYLNSA",
"QANNPDLLERRHTLPNNSFHGYNSVPSFGSSQPPPPVSYSFHYNSTHVPGYPPRSSTAESATPNQFEYQSKNHNGNGNGD",
"AAGSYPATLYHSQPAARPVSSTTAQPSPALNSAPLLLGDLSPGSSTQIVDHGAGDFRLSTGTSNGQVKQEGDDESCNEKR",
"LIMEWNPSC"]
},
{ "name" : "SOK2_SACCE",
"RefSeqID" : "NP_013729",
"UniProtID" : "P53438",
"taxonomyID" : "559292",
"sequence" : [
"MPIGNPINTNDIKSNRMRQESNMSAVSNSESTIGQSTQQQQQQQQYLGQSVQPLMPVSYQYVVPEQWPYPQYYQQPQSQS",
"QQQLQSQPQMYQVQESFQSSGSDSNASNPPSTSVGVPSNATATALPNGSAITTKKSNNSTNISNNVPYYYYFPQMQAQQS",
"MAYSYPQAYYYYPANGDGTTNGATPSVTSNQVQNPNLEKTYSTFEQQQQHQQQQQLQAQTYPAQPPKIGNAFSKFSKSGP",
"PSDSSSGSMSPNSNRTSRNSNSISSLAQQPPMSNYPQPSTYQYPGFHKTSSIPNSHSPIPPRSLTTPTQGPTSQNGPLSY",
"NLPQVGLLPPQQQQQVSPLYDGNSITPPVKPSTDQETYLTANRHGVSDQQYDSMAKTMNSFQTTTIRHPMPLIATTNATG",
"SNTSGTSASIIRPRVTTTMWEDEKTLCYQVEANGISVVRRADNDMVNGTKLLNVTKMTRGRRDGILKAEKIRHVVKIGSM",
"HLKGVWIPFERALAIAQREKIADYLYPLFIRDIQSVLKQNNPSNDSSSSSSSTGIKSISPRTYYQPINNYQNPNGPSNIS",
"AAQLTYSSMNLNNKIIPNNSIPAVSTIAAGEKPLKKCTMPNSNQLEGHTITNLQTLSATMPMKQQLMGNIASPLSYPRNA",
"TMNSASTLGITPADSKPLTPSPTTTNTNQSSESNVGSIHTGITLPRVESESASHSKWSKEADSGNTVPDNQTLKEPRSSQ",
"LPISALTSTDTDKIKTSTSDEATQPNEPSEAEPVKESESSKSQVDGAGDVSNEEIAADDTKKQEK"]
},
{ "name" : "14426_COPCI",
"RefSeqID" : "XP_002911429",
"UniProtID" : "D6RMB0",
"taxonomyID" : "240176",
"sequence" : [
"MTARPPLPLRHANPSLRDGNATIPPVKYQILSCQGKDILVGRLKIDTTDGGHAFILRRFDTQAISLTTMFRAAFPTASEA",
"EEKDEINYVKANFDLFGNNGSSKEPHITRLAGTWVNRDTAGQLAHDYNMVDLINTMVEAEPDPNGQYRRSNKSAQNNNPP",
"TNAPEPTPATNVHATRSPAKQSPKPPSKTLPTPSPGSGDAQPPAPKRRREGSPATFTSGIPVASSPAVPKTPGPRRSTRT",
"KSPAPSRVPQPLTATKPRSRASVAPPSPKKRPVDLPKSSPIKAEEDTAVEDNVAGNELYAQDISEQKKLIADLKAAASSK",
"KPADTVKEDDDQQMEEEGQGPSKLKRIRQDEEKPLQFEFKEPEREERQIATNRRVGRFDMQPERKSLAWGIAAFAFGMTA",
"ITYLPNFL"]
},
{ "name" : "BQT4_SCHPO",
"RefSeqID" : "NP_596166",
"UniProtID" : "O60158",
"taxonomyID" : "284812",
"sequence" : [
"MTENEKSRSLPAERNPLYKDDTLDHTPLIPKCRAQVIEFPDGPATFVRLKCTNPESKVPHFLMRMAKDSSISATSMFRSA",
"FPKATQEEEDLEMRWIRDNLNPIEDKRVAGLWVPPADALALAKDYSMTPFINALLEASSTPSTYATPSRPTAQKSETSEG",
"EPESSTSATTTSVARRTRQRLAEHLENSKKTILQHDNKEEDKEIHSEENETKDEIKSEKKEPEIKKQEGGSSTEKVGQPS",
"SSDDKAKGSTSKDQPSEEEEKTSDIQDRKIKTPIKPSLLGKIRSSVNKGMTDVASQVNRGMTDVASQVNKGVNGVASQVN",
"KGMNGVANQVNKGVTGVASQVRKPVGKLEKKFENLEKSIGDTLKSSIRSSPKSKKRSREDFEENEDYNAMVPVKRSRITK",
"LESEVYYEKRKVRALGGIAIGLGVGAILPFLF"]
},
{ "name" : "PGTG_05590",
"RefSeqID" : "XP_003323688",
"UniProtID" : "E3K4V4",
"taxonomyID" : "418459",
"sequence" : [
"MPKSSSCCEPEQKQSIPTNANPISAGGAGLDIRLAGMRSAHATLRGCSFSPYMVTQHPPLRDSVNRNKQQPTNNSTNPYT",
"KKASRMSQTNLYKSNNPPNLPQDEFNQTLVNYQGKLRSIRIQDININGHTITIARIKIPSPEKLSSHLIKRFDTNAISAS",
"SFFRSAFPHSTEEEEAIQMRYLHQIYDTHTAGAVEFGSARKLTGVWVPIENAAELAEVYGLTRFAEPLLAFPNPKENPRS",
"PTGTKIGGEDESSTTQTPKASQQSKLTGQISVTRSSKRSRAGPLSFGNTSPSSFSLNSFNKPPTETNKSGTHDDSKSTND",
"ENDEKPASPTDRVAGRGARNSPSKKPTTVDENHEHTEHEDHQLIGTDELAQRAKQEALKLVSELKNSQPCTQSSLESPTN",
"TLETELTRTTSPAKSNKVTRKRSSDEVSFEGEEQGEDEDEERTADETATHRSFLPKLLWRKSAAQAHPNSKKHKRTQLGG",
"GGSSSSSSKSFVPLLTNSATPSVDDSSSTHNPNKRNLAIAGIVIAGAAA"]
},
{ "name" : "06560_NEUCR",
"RefSeqID" : "XP_962267",
"UniProtID" : "Q7S9H5",
"taxonomyID" : "367110",
"sequence" : [
"MAQVARHLPARRNPLMLEDVPSHTDLASRRRLGQTQLTPRMVTAVPGAEVDPSSLLAFDYAHLRAPLPKGIVSGIFKSSP",
"PSYFLMRRSQDGYISATGMFKATFPYASQEEEEAERKYIKSIPTTSSEETAGNVWIPPEQALILAEEYQITPWIRALLDP",
"SDIAVTATDSSAPKQIAPPPKFFGAQPPLVAPTPPTTRSTRSRPSSRRSSSPAKSTTTSKRGTTPRNTKRTVTTEASATT",
"VTTTATATAVPSAETPATSFADSQAPTLINGEIPTSTPINTVPVTKIQTTEAELKVESIEKEPVVVLEPIEEEPKIKVRV",
"DEDVKLDKDGEEVKHTKVELEVPLMAGEPPSKEEARKMIEEAKAMVEAAVKADAEAAAALVEASKAGAEDEKAEDEAKAE",
"TEATKEEEADSKGKRKAEKISVDEDEKAADEAEQPRQAKRVKTEAELRKDRIRKRAYLGLTATFAVGALGALLPIITPYV",
"ANVL"]
},
{ "name" : "81480_BIPOR",
"RefSeqID" : "XP_007682909",
"UniProtID" : "W6ZKJ4",
"taxonomyID" : "930090",
"sequence" : [
"MVVDRVLPERKNPLLEPTDSTSIEILIERRRLGQTNLGVKAGVSGIANATKPENMGTFDYAHLRVPLPKDLTGSGIFSRN",
"RMSAFPESYFLMRRSSDGYISATGMFKAAFPWASLQEEDLERKYQKTFPSAGDEEVAGSVWIAPEEALALSEEYSMRHWI",
"EALLDPAPIEKGGKDKSNAAIQMPPRFDVANAQPATLPTFGFRQTRARSARSVSPSKAMTPGRKYATPRKGRSTRSAMKP",
"DATHADDMFRPIEAVTPSTALQNSIARRIAPAETIASSIEGEVKEVEQEVKAALDAEKKPEPELEVQEGTVHIEVKQTVE",
"TNGDTEKTSTSVTVDVPHDHAALPEPEDPTAMIEEAKRMVAEAQKLEGGSPSVTRSSKRGIEEVLDEEDLADERLNKLAK",
"KAYTTEQKMTKEKVTRRALVGLGVMAAIGTAFQYFV"]
},
{ "name" : "01622_ASPNI",
"RefSeqID" : "XP_657766",
"UniProtID" : "Q5BH18",
"taxonomyID" : "227321",
"sequence" : [
"MVRSLPKKNNPFVTPDAAPPYEELLMRRRLGKTNLAVKPTQVGTSNATKPENLGPFEYAHLRAPLPKDLKGSEIFPSHSP",
"QQHPETYFLMRRSKDGYVSATGMFKIAFPWAKLEEERSEREYLKTRPETSEDEIAGNVWISPVLALELAAEYKMYDWVRA",
"LLDPTEIIQSPSSAKKQITPPPKFELPPIQAPEALVPSSRTRSRRSASPSKKAGTPRKPRQTKAQKEAAVAATNEANATL",
"QSALDDTVSNADGEINGDVLPSVEDKREPETSPVKGKKAAAKAKKQAVSEEDQEDKVKIEIKSDAAEGSDVQAAQTTISV",
"EMPISLPEAPSAEDTQEMIAKAKEMVKEAVKLQQEPAESSATAKKRGAEEAELGEEEEDEETKTLRTKRAKVLEEKLKRE",
"RVRNRALMGVTAAFALAKPALVLLEA"]
},
{ "name" : "05405_ASPNI",
"RefSeqID" : "XP_663009",
"UniProtID" : "Q5B225",
"taxonomyID" : "227321",
"sequence" : [
"MASIQFLLNPLPSLPSSDRCPLPTPSPTISSSTAMLRSPRQKKQKMAKDAPIFQRGKPRGEVRYPPYEDRDGKFSCQHQD",
"FRIHPLGNIADYPRHIPYNSDKKSFQERTGRESFEVFQYTFQLPGEEKQWTVMWDYNIGLVRTTHLFKCNDYSKTTPAKM",
"LNQNPGLRDICHSITGGALAAQGYWMPYEAAKAIAATFCWKIRFALTPLFGDNFPDLCIHPDDRARFGRMVIDPGIVRIA",
"TEKANLYRMLELRCSTTNSLRADYVLRPSSAPDIDRTDPNLERDRVALGRHILPKSHRHHHHRSKTSPSTNTSLVGYGSS",
"PEVEYYSCGTEPYCVSPESPIRSSFTPVNTPRSTDIYPSSSSTNFLRSPHELLASLSSSASIARARIERASKISGARVIP",
"SSVPSNVTSITTKGRDNTGHSALMEESDIDADAETDSGHEHDLDFELSSSDESSTSSTVSSSTSSASLGFAANSRNRPYR",
"DDDEPHRDTDEEMVDYRAPKRIATAGARDRRWGRGRRVIHQEHSDIETSRRARKHAQRSSNARLVCEMTAAHALISLLHD",
"ATGSDVDVDTHNRLECGRSPDGGVKNNLKGSYFGIRLNHNPSTESGQKRRRASA"]
},
{ "name" : "105954_BIPOR",
"RefSeqID" : "XP_007691967",
"UniProtID" : "W6Z1H5",
"taxonomyID" : "930090",
"sequence" : [
"MNIQDLLNPSCGDRHDHRRSESATPPSRPVAILPALRRQKIPKDAPIFSEGNRTVGIVNFAPHEAGNDEELLAQHCRFQI",
"YPLGEISRKGVRHIPYNSDKKDFLEKTGRDAFEMFQYTYKLPGEDKPYVVVWDYNVGLVRMTPFFKSCKYSKTIPAKTLR",
"ENPGLKDISYSITGGALVCQGYWIPYQAARAIAATFCYDIRWALTPVFGNDFPSICLTPDDPSFAKFVIDPAIVRYCTEE",
"TTKFRELGSAYEVHRPVAPTQVEAPTSRSDQPLSTSIVRQRRARPIDIESGYGTDTERNDRCLFSPEVSPRTRFTPINRP",
"RSPYSPRTAESSFVSSPVSIRAPPGLHTPTSTPYEHSGEVFRAKRSHSKVAFCEHPADEAVIRPPTAATVDSAHGCEMCV",
"GDDNHSHLDMDAAEMLLSLRTADSAMPPSKRTRRGS"]
},
{ "name" : "69819_WALME",
"RefSeqID" : "XP_006959479",
"UniProtID" : "I4Y911",
"taxonomyID" : "671144",
"sequence" : [
"MTSPGLPKDFNELLDKSEIPSPKWQQITRDDRPITIARLKLPHPREKHTFILRRYDCNGISFGSLFKAAYPYATDEEEKI",
"ESGFVKKNYDVTLVPTEEYQERKLAKLAGFWIPIAIAEELGQRYAMAEYVDALAKADTPDLTDFKKRSSNRQTSEDIKSS",
"PAKAQASLESPAKSASKIPTPTKNPAPRRSARHQSRSPSPSPLTHNLTPGKKKAKKAPKEAVIEESVEETIVVDKKESPL",
"KKALNDDQVLADIERAKDLVDDIKQSKNLSQSSPVKVVKEEVLETIQPSVSTESLEGEGKRKRELEDETGNEIKVVSFGQ",
"NPPANPEEIQQRPVVQRRGVAAAVGAFALGVGFAASNILPRFLF"]
},
{ "name" : "02840_CRYNE",
"RefSeqID" : "XP_568872",
"UniProtID" : "Q5KM59",
"taxonomyID" : "214684",
"sequence" : [
"MSHPAADAPPPYPGTTDDAQYDLTPLPHTANRPRLPEDKRNPHLNNLPEDTKIVKFQTIVRENKEIVVGRIKVPTENANG",
"THHAFILRRYDTNAISLTTMYKVAFPSATEEEEKREMDWVKSSFDTRGTNGGRDSEVVRLAGQWVSRNLAIHIAPAYNLV",
"QLVAALSRAVPDPNVAYRKSQRSQAAADELARTKAKQSQAPSSVPAISNVPVRKPQAAIPSMATEISSPASKRQRKDSVT",
"EASGSATQTITEAQPSADTSETDDTRHITIEATTTITSPSGANVDMDAEIEQAKQLVKDLRQEIQLRNEAGDSLEDQGVA",
"VADDVRGVKRGKHEDEAVVISGGAGGKDRVVRTNKRIPQTAGGDVGQRFGWGAFVFSIGLGASLTLFSQYASSLL"]
},
{ "name" : "11055_USTMA",
"RefSeqID" : "XP_011390537",
"UniProtID" : "A0A0D1DZM8",
"taxonomyID" : "237631",
"sequence" : [
"MPAAASARKSTPTRKSTPRRARSSSVTSNASTGVPASPSASPRKTKKQKEAAAAAAAAVAAAAATAEQVNDDESDLLRPK",
"LPTKRNPRLKEVDEAVVKLQIIKREGHNIIIGRVKLPTVNGQDHAFLLKRFDTNAMAASSMFRLAFPFADGTAEAAEMRF",
"LDTKYDTNRANGGYIVEEVKVPETPKKRGRTRKTAENSKKESTPDTESVSADKQIRVLPEGSTGVRLQGTWIPAEDAIEV",
"AEDYGIAKYALALIHATAEHAEDGGAPILTSEPVAEVKTPRKRQRVSAAAATASDTPDSPQLVQRVTRLENADGSISKVR",
"VESTLEAPSSNGVPVALSQAEIEEQIAQAKALAAGIQQSITAGSGSASTRGQKRRAVNDRPTAEIDPLADDEDYSESGRV",
"VRAFRRGTRVARRRPIATTAGAVAAAGAVGAGALAWVSGGNPEVAIQTLQASMQSIGLQNLQNLGLQNLQQIGTQLGAHL",
"ASILPW"]
},
{ "name" : "XBP1_NEUCR",
"RefSeqID" : "XP_962373",
"UniProtID" : "Q7S9W7",
"taxonomyID" : "367110",
"sequence" : [
"MLNQNPGLKDIAYSITGGAIKAQGYWMPYACAKAVCATFCYQIAGALIPLFGPDFPSECISPGEPRYGIMIIKPELISDT",
"MRKAQELYRRYGNWGGGCTSSSPARRPLRTASSGSQERHHHHPYPNQEHLDHQQQQQRTVCSRRCPAEENSCVDARPQLR",
"GISAPMPPAGEWTPPLLRSSAGRPRPVMPTSTHSSISYPERAPHRSAWTAVNHQPPNNSLDRYSLKRPLPSNEPDESVSH",
"SNWPSRSQAPNPWLTAIPRSPRKTSSSPWASQPGSASRSRAGSIDSMASQHPQGLPSPSLILSSPSSSMVSLSSSNSPSP",
"RPQLPPISQLCSLPVPSGRRRLPNGRPSRVGGDATSSHSRQDHSTCGAYQFSAGYQRALTPPSSTSAPMHWRSQRRPSLQ",
"DQHEHEHIEDTQPRRIAVEANMECGDDNESHLHLPLPLPRTSSSASIVADKNANDTTSDNSSSRNFNSASIGSGRDDGQT",
"SLAARKTAALTLLHLRQQEEEKEAAAAAAAAAAAAYSSTKRPESPSSSLSSPVSPPPTSGQPSPTLSAVVTATNLRRGTT",
"TATATAVIDTTEPLAPPPSPSSNYLGSPISTSIASSSSSFSPSTSCNGTRENSVVANEMTRYAGQEADAGGPRHCNGDAD",
"DEGDYEHEQQYRRKRRRLLLVGRAKSF"]
},
{ "name" : "XBP1_SACCE",
"RefSeqID" : "NP_012165",
"UniProtID" : "P40489",
"taxonomyID" : "559292",
"sequence" : [
"MKYPAFSINSDTVHLTDNPLDDYQRLYLVSVLDRDSPPASFSAGLNIRKVNYKSSIAAQFTHPNFIISARDAGNGEEAAA",
"QNVLNCFEYQFPNLQTIQSLVHEQTLLSQLASSATPHSALHLHDKNILMGKIILPSRSNKTPVSASPTKQEKKALSTASR",
"ENATSSLTKNQQFKLTKMDHNLINDKLINPNNCVIWSHDSGYVFMTGIWRLYQDVMKGLINLPRGDSVSTSQQQFFCKAE",
"FEKILSFCFYNHSSFTSEESSSVLLSSSTSSPPKRRTSTGSTFLDANASSSSTSSTQANNYIDFHWNNIKPELRDLICQS",
"YKDFLINELGPDQIDLPNLNPANFTKRIRGGYIKIQGTWLPMEISRLLCLRFCFPIRYFLVPIFGPDFPKDCESWYLAHQ",
"NVTFASSTTGAGAATAATAAANTSTNFTSTAVARPRQKPRPRPRQRSTSMSHSKAQKLVIEDALPSFDSFVENLGLSSND",
"KNFIKKNSKRQKSSTYTSQTSSPIGPRDPTVQILSNLASFYNTHGHRYSYPGNIYIPQQRYSLPPPNQLSSPQRQLNYTY",
"DHIHPVPSQYQSPRHYNVPSSPIAPAPPTFPQPYGDDHYHFLKYASEVYKQQNQRPAHNTNTNMDTSFSPRANNSLNNFK",
"FKTNSKQ"]
}
]
[
{ "name" : "68476_WALME",
"RefSeqID" : "XP_006957790",
"UniProtID" : "I4YDD8",
"taxonomyID" : "671144",
"sequence" : [
"MKEEKEKTPPNNITGPPTPAQNILHSTPAAFGTAGTVGQGAGGFGSQLYQSPYVDSQQSVIGSPVTPAPLPKKATLKTPQ",
"PRIYSAVYSGVGVYEAMIRGIAVMRRRADGYMNATQILKVAGVDKGRRTKILEREILAGLHEKIQGGYGKYQGTWIPFER",
"GRELALQYGCDHLLAPIFDFNPSVMQPSAGRSAKSPSKKRQNSIVLSPTQERHQSSIIALNTARASGIYVGGADDPNDDG",
"LSKKEKSPVKKSKYDEVPVNVSKRPYVPPPGTNAHILTRTQQSLTALFQQPTTNSDFIPEAVAILDTTSGALHPDLAIDE",
"LGHTALHWAASLGRISNVQQLIKKGADMKRGNIEGETPLERSVLVNDNYDKKTFAYLLQELGSSIRVVDRTGRSILHHIA",
"LIAAVNGRSMSAKYYMENVLEYIARYENGEFKSLVDLQDEHGDTALNISARVGNRNLVKMLVDAGANKTVVNKLGLKASD",
"FGVEHETLNSVTGDEMLSNLQPPPPLNVDSSASVLENIHNLLNGITQQYTDETSGKNALLFEIQAELKQHSHELADVRKE",
"IQYWQNKATQMAEVDQKIKNINEAIENEKVQTWSLLGEANADKMEGIETSSSSNTSEIKIPTGDNEESLKQLRKLSKWLE",
"GTQKLTEERVASIDGLSASKEVKYKSIVSVCTGVPVNEVEGMLAQLLEAMESDANADLNKVQEFLAREC"]
},
{ "name" : "00846_COPCI",
"RefSeqID" : "XP_001831299",
"UniProtID" : "A8N8X1",
"taxonomyID" : "240176",
"sequence" : [
"MQASTRPPGSNQPPVKIYNAVYSSVQVYECMVRGIAVMRRRNDSYVNATQILKVAGVDKGRRTKILEKEILPGKHEIVQG",
"GYGKYQGTWIPLERGRDIAAQYGVAPLLSPLFDFQPSTNSLGALPVSTPGGTASPRPLSASSSYSSMGVAGQYIPSSIPS",
"NLPPAPIMPGSALRLLNQGRAQGLFTPSTTSATLRPAGYHSPGPYGTSYAPSPQPQSSQTPPPGSGLKRNRSEAEVEGYH",
"SQPHDVQMADAPPPNTASQPNEDNPSPAKRLRTDGSITTEPASSQGQWQQQQPLPYASQQRSGPGLSQLSGHNGHGSSRP",
"PSSLSAPNGNRPAHTNPEDQTRKTRFSSKPSMPRGMDPHMPFKDARRSALIALICHRDDPTSVIDLLREISADHLNPPSF",
"DVDTVLDDQGHTALHLAASMARTQTVDMLIQTGADMHRGNHLGETPLIRACLATPNSDQQSFATLVNYLHDSIWTLDTSK",
"KSVVHHIVSLAGVKGRAVVARYYLDQIFYWIAQHEGGDFRSLVDLQDEHGDTAINIAARVGNRSLVRTLLDVGANRVLAN",
"KLGLRPGDFGVETEELSSGLRAEDLISSLRTGPPAPVQKSQDVIADMTSMIQSLSTEFQAEIKSKQDSLDVTQAHLRAAT",
"RELSEQRKQIQTWQARCGDLDQINQRVRNVEKAIAEEDMFDWTGRTELDGKDGKEKGGPAFAYRGSKSTMVGVGGSVDVS",
"FSVESEPPLPTTDTAASLVKLRRLKMWHQRMEELVKGRLKGLQGASAEKEYQCKKIVALCTGIPLDKVEEMLDNLVIAVE",
"SEAQVVDIGRVSGFMQKVRDGII"]
},
{ "name" : "8533_BIPOR",
"RefSeqID" : "XP_007691662",
"UniProtID" : "W6ZE71",
"taxonomyID" : "930090",
"sequence" : [
"MSTSHSFPAASPSHQQSALYANSPHGHALMAAPAALNRSFSDMSAFHHHAMDKPQIYTAVYSGVSVYEMEVNRVAVMRRR",
"SDGWLNATQILKVAGVDKGKRTKVLEKEILTGEHEKVQGGYGKYQGTWINYRRGREFCRQYGVEDVLRPLLDYDITLDGS",
"HAPGHAIETPTKEQAMAANRKRFYTQSIDGRTTTQNLTGTFFSNISSTATSALAAMNKVARLNSPAPRPSSSSQRRTSAT",
"RPSQSQPPLASQDSFRTSSQQSITSEPSFAGHNGQTDSAYATAVDESQEPPRKRIRASHDDSYSQPTAADMSIHPLSSPT",
"EPSESFDQHHPAQPITLADGDVPTALPPLPYPDTKQDEEKQAMLTDLFADQTRSDFTNHPAILHLSGPDLDMPIDNSSNT",
"ALHWAATLARVSLIRLLVSKGANMFRGNASGQTALMSAVSVNNSLDHSCFPETLEILAPLIELRDSQGRTILHHIAVTCA",
"IKGRAASSKYYLEALLEYLVRSNIGGGQPPPFHDTSNHSKPIGLMRFMQEMVNARDKAGNTALNLAARIGNRNIISQLME",
"VQADPTIPNHKGTRPMDFGVGTDLGDGQGIITATSPTKAKAPLSKAEETSREIQPLMSGILQSASLQFTQEARLKQDAID",
"QTNELITQLSSQQKQEQQKLQTLRARLRQRQDRAKRISNLKRWLEPQRHMLSVNDGAIDLHDKKRIGYADTQGAGLLIKE",
"DDLPYELRQAGDHLDRRASDGPIYLSTSVPLDPSTLSQVSHQPQCQNFLLQQLPAASVLRQRIETYTATNTALLKRSRML",
"KEKDGQLEMMYRKVVSLCTKVEENRIEECLEGLVAALDSEEGEGVEVGRVREFLRKVEGVD"]
},
{ "name" : "PGTG_02039",
"RefSeqID" : "XP_003320997",
"UniProtID" : "E3JX03",
"taxonomyID" : "418459",
"sequence" : [
"MAAHKTTNDIPVSSSHHINPESGTGTSSTQAFPIPNIKNNPHVYMAVYSSVPVYEMMVRGIGVMRRRSDSYMNATQILKV",
"AGLDKSKRTRILEREIIQGEHEKIQGGYGRYQGTWVPFTRAQELATQLNVAQLLAPLFDYRPEPNSEVNIRSTNTKPSSS",
"ASRANSHKTTLARQTSRQSLNEKRERSGDTTPLPHDPPEAGPSKRSRLNTPSRQSNGSANTPSSLIDHSHSAMDPDFIIP",
"HSQSQPTAASQCTTSTFAPIHGATVEYPAGPSHLRKSNSSSRSHLEVALKAERNIHTLMALFSNPPDGDELESETHHENP",
"NSVAEVNEVLEDPELEIDTPIDEHCHTALHWASSLARLGLVRAFLRSGADVNRGNDVGETPLMRSTLVTNNFERESFNQL",
"LELLHPSLWTLDNQDRTVLHHICLTASIKGRGESSRYYLECICEWIVNKHGAQFDSQLFDAVDLNGDTALNIAARVGNKH",
"LVRMLLDVGADMTIGNNLGLKPIDFGVGAGETSASYTDDMISAPLRRNPTASAPARSSRDIITSITSSVNSLSEDFENEI",
"RSKTDRLESVRAQLMVATRQLTTQRRQLESLKHDLDERALLELRLKKLRMAIAEEDGFDWTGRSDLDGRPAQAGKLFEQN",
"GIASTLAGLSASQIQLELEPDPFIPPENNQDSLVYLRRLEKWYVRVLSLLRERIGRMKGSNLEQEAKYLKVIGSFIGNTC",
"TNDLSSSGSSMTGRPANQTTSTTQEVPSRATQNVNPADIHDLESMDGHRRKVSTTDAVNKSHEFGRTRSELLKASMIDNK",
"LLKQLMAAIESDGPELDLNRVAGFMQRVQSGSL"]
},
{ "name" : "MBPA_ASPNI",
"RefSeqID" : "XP_664319",
"UniProtID" : "Q5AYB5",
"taxonomyID" : "227321",
"sequence" : [
"MTTSNHHQQRPSLSMSYSQGSIGSANGMSFSQSQMSSLNASQSVASTPRATPPPKSSQQSAMSFNYSNGLPNGARASFSG",
"FEDMNGYGTMIYHEEFKPQIYRAVYSNVSVYEMEVNGVAVMKRRSDGWLNATQILKVAGVVKARRTKTLEKEIAAGEHEK",
"VQGGYGKYQGTWVNYQRGVELCREYHVEELLRPLLEYDMNPNGTAASGQDSLDTPTKEQAMAAQRKRLYSGMENRSMSQP",
"QQGTFFQNISRTAATAVNAMSKARFESPAARGGDSRRLSVIRKPSQQMGSQDAQPPFGSQQSFYSAASDSGFASNIPTNG",
"RYAPQDAMSFEQEEPMEPPRKRIRSSQAFSLPIDGTSMSMSEPTPTEPNDSFYQDMEPLHHIDEGRHGLDPLPPATTPER",
"FQKMKLIMTLFLDKTTKDFSTHPALIQLSGEDLEVPLDEYRNNALHWAAMLARMPLVYALVKKGVNIARLNGAGETALQK",
"AVGTRNNLDYRSFPRLLQVLAPTIDMVDRSGRTILHHIAVMAATGHGGHVSAKHYLEALLEFIVRHGGTSLNQQSNGTAS",
"QPGMPLSNEVITLGRFISEIVNLRDDQGDTALNLAGRARSVLVPQLLEVGADPHIPNHTGLRPADYGVGVDMVDGSSQPA",
"GSRSDTFLAQLAKTRKEILEATTAQVTAIVQETLGTFDKELAASLTSKQEKFDHWHAKIRESAKARQIEQKQLDELKRRS",
"IDRTETSRRLKNLEKSSTDLLEAHKEILTNLGDTSKPVSLGDADQESGFEIAEFEALFPETFDPASGFSEAQIAYLRKLP",
"SAEILEQRVSCYRAFNKETLDEIDALRSKNVVLGQNYRRMVMACTGWSAEQVDEAAEGLTQCVKELNDNPVPEDEAIEIL",
"MRDRGQDW"]
},
{ "name" : "05520_CRYNE",
"RefSeqID" : "XP_570545",
"UniProtID" : "Q5KHS0",
"taxonomyID" : "214684",
"sequence" : [
"MEPPSNPIQPPVTPSHHSLLSAISPALSEQTPAPIHTLPPHLRPSIPQPHIAPPRPSSVQPTMEEQQRMHHIQQHQQQQH",
"FQQQQNDENVFGSVMGAPGHVPGHEAPMSTQPKVYASVYSGVPVFEAMIRGISVMRRASDSWVNATQILKVAGVHKSART",
"KILEKEVLNGIHEKIQGGYGKYQGTWVPLDRGRDLAEQYGVGSYLSSVFDFVPSASVIAALPVIRTGTPDRSGQQTPSGL",
"PGHPNQRVISPFANHGQTTPHMPPPQFIHQGNEQMMNLPPHPSSLAYPTQPKPYFSMPLQHTVGPQYDERHEGMTMTPTM",
"SMDGLAPPADIARMGFPYNPSDIYIDQYGQPHATYQASPYGKESGHPSKRQRSDAEGSYIESGAAVQQHVEQDEEADDGL",
"DNDSTASDDARDPPPLPSSMLLPHKPIRPKATPANGRIKSRLVQIFNVEGQVNLRSVFGLAPDQLPNFDIDMVIDDQGHS",
"ALHWACALARLSIVQQLIELGADIHRGNYAGETPLIRAVLTSNHAEAGSFTDLLHLLSPSIRTLDHAYRTVLHHIALVAG",
"VKGRVPAARTYMASVLEWVAREQQANNTHSITNPPNPADRNELAPINLRTLVDVQDVHGDTALNVAARVGNKGLVGLLLD",
"AGADKTRANKLGLRPENFGLEIEALKISNGEAVMANLKSEVSKPERKSRDVQKNIATIFESISSTFSSEMLAKQTKLNAT",
"EASVRHATRALADKRQHLHRAQEKLATMQLFEQRSENVRRIMDAIAAGTLLTPAEFTGRTQTMHEKSTGQLPPLAFRHVP",
"GLALDASSQSQLNGAPPSTPLSVEDQEDIALPERDDPECLVKLRRMALWEDRIAEVLEDKIRAMEGEGVDRAVKYRKLVS",
"VCAKVPVDKVDSMLDGLVAAVESEGQGLDFSRASNFVNRIKATKS"]
},
{ "name" : "RES1_SCHPO",
"RefSeqID" : "NP_595496",
"UniProtID" : "P33520",
"taxonomyID" : "284812",
"sequence" : [
"MYNDQIHKITYSGVEVFEYTINGFPLMKRCHDNWLNATQILKIAELDKPRRTRILEKFAQKGLHEKIQGGCGKYQGTWVP",
"SERAVELAHEYNVFDLIQPLIEYSGSAFMPMSTFTPQSNRKPTEAYRRNSPVKKSFSRPSHSLLYPYTSSNNMTSTSRMS",
"GIHDALSLQSDFTRSPDMPSDSFTGSLHDIKASPFSSNNYAQSLLDYFLLPNTTQPPDFVYDRPSDWDVNAGIDEDGHTA",
"LHWAAAMGNLEMMHALLQAGANVVAVNYLQQTSLMRCVMFTMNYDLQTFEVVSELLQSAICMNDSFGQTVFHHIALLASS",
"KSKMEAARYYMDILLQNLTATQSVDVAAQIINLQDDHGDTALLICARNGAKKCARLLLSFYASSSIPNNQGQYPTDFLSS",
"KDMSFPENDDSPLNSKIEDNLIDNLKYPQSLDDHLSSKKPISYFSNKLTHQTLPNVFTQLSELSKCHEASLAEKQLTYNL",
"AMEALEQTVRETETCQRLWNERTNNDENYLVNQREDLIHQCKKFLHTLKTARYYLETVQLHQLKKYVTYFSQIWSTDELA",
"DISETKNLVGHDTKTNRSSLSSKHEVDLFTAENEAAREKLVEQLCSLQAQRKQKINEILNLLSMGMYNTINTDQSGS"]
},
{ "name" : "CDC10_SCHPO",
"RefSeqID" : "NP_596132",
"UniProtID" : "P01129",
"taxonomyID" : "284812",
"sequence" : [
"MASANFIRQFELGNDSFSYQKRPEDEPSQPLSNRNINKLNDSSTLKDSSSRIFINSQVLRDGRPVELYAVECSGMKYMEL",
"SCGDNVALRRCPDSYFNISQILRLAGTSSSENAKELDDIIESGDYENVDSKHPQIDGVWVPYDRAISIAKRYGVYEILQP",
"LISFNLDLFPKFSKQQQIESSSISKNLNTSSFNTRSPLRNHNFSNPSKSSKNGVHTINNMQSSPSPSSSFLLPLTQIDSQ",
"NVKRSNNYLSTSPPILEQRLKRHRIDVSDEDLHPSSQLNDNEASSLFPDTPRLNHSLSFVSLVSSLPPLDQNIMQDYHTS",
"KDILTSIFLDVNFADSSALEAKLSDSLDLDVPIDELGHAALHWAAAVAKMPLLQALIHKGANPLRGNLTGETALMRSVLV",
"TNHLNQNSFGDLLDLLYASLPCTDRAGRTVVHHICLTAGIKGRGSASRYYLETLLNWAKKHASGNNGYMLKDFINYLNHQ",
"DKNGDTALNIAARIGNKNIVEVLMQAGASAYIPNRAGLSVANFGIFVENALKQPEDSKQTKVSLMSENLSSKEKTAVPPR",
"QKSRDIIASVTDVISSLDKDFQDEMAAKQSMIDSAYTQLRESTKKLSDLREQLHVSETQRTLFLELRQRCKNLMTSIEEQ",
"KSELSNLYESFDPNGIHDSLSLDADAPFTVNENNNKNLSIAELKFQVAAYERNEARLNELANKLWQRNSNIKSKCRRVVS",
"LCTGVDESRVDSLLESLLQAVESDGQQGEVDMGRVAGFLRVVKEHQA"]
},
{ "name" : "05338_USTMA",
"RefSeqID" : "XP_011392041",
"UniProtID" : "A0A0D1BWD8",
"taxonomyID" : "237631",
"sequence" : [
"MPLNYFANQDQTASDTYAHEASSFPAPSSILTDTSKPLQPVQEVAASSLVDGVSFTSPHASIIHASKQSPRAASSLSFTT",
"SALQRAGLLPANPNMSTTATSGTSAASESLQRVITQGTASAAAINGASTPAHSGPLTPAHLKNLTPAQANAALQNPVGNI",
"PTVYLATYSNVPVYEITVRGIAVMRRRGDGWLNATQILKIAGIEKTRRTKILEKSILTGEHEKIQGGYGKFQGTWIPLQR",
"AQQVAAEYNVSHLLQPILEFDPATADQIPKLYQRKKPAASARNSSASAINDARGSTPSKIYSPAPASLGGPSQQPRFLSL",
"RPPKETHEQEISSAIFMPPGTAGLLSNGTFVDDRAASALAYPGPPAIPPGSTPAEQAALRSYNVYGYTPQGVPLPSSAAA",
"DGNGTEAAATAASTGAGKREASETDQDGASAAKRSRLTSPQQQRRDDGLLLGPSPVKDLNALGPAGGSLRAASAPRGHRI",
"TVGPPDAAGRDGAVPRYADRALPPKPYDEGEKRMRDRLVSLFSDDGVLPGVSEATGAGASQSAADEDDDAYVAKLDSLLA",
"DLREKASLGGLGASGTDGPKATVDLITDDHGHTALHWASALCRVKLVRTLVARPPWQGGANIHAGNHAGETALHRSVLVT",
"NSYDASSFPTLLNLLSSSLNTRDFKKRTVLHHISLVAALKGRAASARYYLACVLEHISAEKNSKYKGLIDAQDEDGETAL",
"GIVARLGNASMVRMLLDVGARKDLANALGIRPSDWGIESSADGASLTPSQNDGTNTVASLPPLTAADLASQNPSDIISAL",
"TRPAQVPVMKSSDVRDQLSSTLDDLQSSFERELKEKQDAVSTVQSHLQAATRDLAARRKTVSAAQAKLAEKDEARQRVQN",
"LRRAIVAQLGLEEADADLSLEQLVEEAANAASAAPADKSADKMDIDGAEDVKPVRASNLETLIDDILSFDTIQSDLKAVG",
"TSAVTQEVVEQDELVRLRWLVSFYQSSCDELSSTISELEDSSAKKESQCQQVVAICANIPQDKVESMLDELLTAMESDGP",
"DVDLARVANFMQKVGKTRENGDQPGVGAQLSSSTSLSTAVSSGGTAASSVVPAVERDGEDAKPDA"]
},
{ "name" : "SWI4_SACCE",
"RefSeqID" : "NP_011036",
"UniProtID" : "P25302",
"taxonomyID" : "559292",
"sequence" : [
"MPFDVLISNQKDNTNHQNITPISKSVLLAPHSNHPVIEIATYSETDVYECYIRGFETKIVMRRTKDDWINITQVFKIAQF",
"SKTKRTKILEKESNDMQHEKVQGGYGRFQGTWIPLDSAKFLVNKYEIIDPVVNSILTFQFDPNNPPPKRSKNSILRKTSP",
"GTKITSPSSYNKTPRKKNSSSSTSATTTAANKKGKKNASINQPNPSPLQNLVFQTPQQFQVNSSMNIMNNNDNHTTMNFN",
"NDTRHNLINNISNNSNQSTIIQQQKSIHENSFNNNYSATQKPLQFFPIPTNLQNKNVALNNPNNNDSNSYSHNIDNVINS",
"SNNNNNGNNNNLIIVPDGPMQSQQQQQHHHEYLTNNFNHSMMDSITNGNSKKRRKKLNQSNEQQFYNQQEKIQRHFKLMK",
"QPLLWQSFQNPNDHHNEYCDSNGSNNNNNTVASNGSSIEVFSSNENDNSMNMSSRSMTPFSAGNTSSQNKLENKMTDQEY",
"KQTILTILSSERSSDVDQALLATLYPAPKNFNINFEIDDQGHTPLHWATAMANIPLIKMLITLNANALQCNKLGFNCITK",
"SIFYNNCYKENAFDEIISILKICLITPDVNGRLPFHYLIELSVNKSKNPMIIKSYMDSIILSLGQQDYNLLKICLNYQDN",
"IGNTPLHLSALNLNFEVYNRLVYLGASTDILNLDNESPASIMNKFNTPAGGSNSRNNNTKADRKLARNLPQKNYYQQQQQ",
"QQQPQNNVKIPKIIKTQHPDKEDSTADVNIAKTDSEVNESQYLHSNQPNSTNMNTIMEDLSNINSFVTSSVIKDIKSTPS",
"KILENSPILYRRRSQSISDEKEKAKDNENQVEKKKDPLNSVKTAMPSLESPSSLLPIQMSPLGKYSKPLSQQINKLNTKV",
"SSLQRIMGEEIKNLDNEVVETESSISNNKKRLITIAHQIEDAFDSVSNKTPINSISDLQSRIKETSSKLNSEKQNFIQSL",
"EKSQALKLATIVQDEESKVDMNTNSSSHPEKQEDEEPIPKSTSETSSPKNTKADAKFSNTVQESYDVNETLRLATELTIL",
"QFKRRMTTLKISEAKSKINSSVKLDKYRNLIGITIENIDSKLDDIEKDLRANA"]
},
{ "name" : "SWI6_NEUCR",
"RefSeqID" : "XP_962967",
"UniProtID" : "Q7SBG9",
"taxonomyID" : "367110",
"sequence" : [
"MQPPQLGGASQQSQPSSQQSFSMSQSSQSVYRQYTDPPNRLHNDHAVPTIYSATYSGVGVYEMEVNNVAVMRRQKDGWVN",
"ATQILKVANIDKGRRTKILEKEIQIGEHEKVQGGYGKYQGTWIPFERGLEVCRQYGVEELLSKLLTHNRGQEGETGNVDT",
"PTKEQAMAAQRKRMYNASSQENRGIGSTGTFFKNISSTASTAVAAISKARFDSPAPRNRSGPSRAPSFNRQSSMQDVADF",
"PNSQQSLVSTEYATQTQNADSGFGSQTTQPLAGDGLEQPPRKRQRVLTPARSFGGQTPGHQPLDPFNAGNIANGDSGSPT",
"EPSNSFNYDQVTANDGDASYALGPLRPLPYENNADAEAKRGMLMGLFMDANGPEEAIQAALCNVSPQELDSPIDTQSHTA",
"LHWAATLSRMPLLRALIHAGANPWRVNACGETALMRACTVTNSMENNTFPELLDLLGCTLDVTDDKGRTVLHHIAVTSAV",
"KGRHYASRYYLESLLEWVVRQGSAPSSQENGIGDRKGRRMGIARFMSEIVNAQDNSGDTALNVAARVGNRSIISQLLEVG",
"ADPTIPNRANLKPLDFGIGIADAETNDDPAQEKTGATTGSGHKSRETSDEVVRSITHLIGESASIFQNELKKKQESIDTL",
"HSQLRVTSSQVGDARRTLESLQEKLKAQQLAKQKIVNFNRACEEEEQILIELEQRHGRLDVASANAWEMELESALEIVKT",
"QSPKGLDPDSRPSLPSAAVLRARIKALRARSSKTRQAVAALQAQSKEKELKYRRLVSLCTRRPEIEVEALLDTLTRAVES",
"EKPELEIARVRRFLGGVEGVVH"]
},
{ "name" : "15042_USTMA",
"RefSeqID" : "XP_011388143",
"UniProtID" : "A0A0D1CVS5",
"taxonomyID" : "237631",
"sequence" : [
"MSTASPLHHGHGNGSYANSPAPTGVTGRDAGVAAAAVADSAVRSGSVPASASGSAPGSASGSMYGEAHTQHHTGHHHYSA",
"HHTHSHGALTSPVNGGHSSSWSPYGYPAAPVYGGSPSPYGHNAYSQYASGYGYANGTAHHVATAPTTPSATSTAYHTGVN",
"GMMMHHGQHAGYGYSSHHLGSHTPTHTHTHSSAYFMNGDGAHSHLNSSAHLTSPSYTTAPQYSTQLPLAGRHRVTTTLWE",
"DEGTLCFQVDARGVCVARRHDNNMINGTKLLNVCGMSRGKRDGILKNEKERIVVKVGAMHLKGVWISFARAKQLAEQNGI",
"ADALYPLFEPNIQSFLYHPDNYPRTAAVIAAAQERQAQRQRAPGGQPSPGANGTSQAPPLMRANTTPSNGDTSTFSSGLS",
"SLGSWTGSHDQGHASAPTTAQPSPSSMHNGATQMHMSLSNHGTASPTYAQSQQQQQQQQQQQQQQQQQQQQQQQQAYPMT",
"AAQQLARPSVGDRRQSAPISLNNSVGHAENPYGATNLGGAANGGLVNGARKVSGLKRSWNDADDLNGSAAASPTERDMQR",
"SGSGGSNGLKLDGDDLHSPDSSDDRLAKKTRGMPQRGGGATTAMPSMSTNMLMGVGNGSGIHHE"]
},
{ "name" : "04778_USTMA",
"RefSeqID" : "XP_011391646",
"UniProtID" : "A0A0D1DQM4",
"taxonomyID" : "237631",
"sequence" : [
"MNQAPLSATGVNFYISGPRPARLFPTPIHEFRKGKYATAGGESGFMTVFEYDVRGHTMMIDVDTSFVRFTSITQALGKNK",
"VNFGRLVKTCPALDPHITKLKGGYLSIQGTWLPFDLAKELSRRIAWEIRDHLVPLFGYDFPSTCLRPDSEGFGQLAIGMS",
"QKRARKRHNNGGPHQTSCYGPSLPISIELWQHSTDPLRDLGESSVVGGQAIEHVSAKNSAVQPCYGSSQPATFHYSKGYG",
"LESRPWYGQDYLESNSLESMWNSAQAGGGSVGLQVPISTCGATASPCLAAIGANGGSPILSSPPSSNASSSSNQSYTAAG",
"YGLMVPPTVPSHSVNSEAGANQAEGPTPIDGSRSYASLTAHGYATGYGDANASLSTWNDATHASTFTLHVHAHVHFQPPD",
"PESAQLFTIHDFGSDPFYAEQVERG"]
},
{ "name" : "STUA_ASPNI",
"RefSeqID" : "XP_663440",
"UniProtID" : "P36011",
"taxonomyID" : "227321",
"sequence" : [
"MASMNQPQPYMDVHSHLSSGQTYASHPATAGALTHYQYPQQPPVLQPTSTYGPASSYSQYPYPNSVASSQSVPPPTTSIS",
"SQVPAQLLPLPVTNHPVPTHGYGNNSGTPMQGYVYDPTGQMAPPGAKPRVTATLWEDEGSLCYQVEAKGVCVARREDNGM",
"INGTKLLNVAGMTRGRRDGILKSEKVRNVVKIGPMHLKGVWIPFDRALEFANKEKITDLLYPLFVQHISNLLYHPANQNQ",
"RNMTVPDSRRLEGPQPVVRTPQAQQPPSLHHHSLQTPVPSHMSQPGGRPSLDRAHTFPTPPARMNSSVPNTQPLSIDTSL",
"SNARSMPTTPATTPPGNNLQGMQSYQPQSGYDSKPYYSAAPSTHPQYAPQQPLPQQSMAQYGHSMPTSSYRDMAPPSSQR",
"GSVTEIESDVKTERYGQGTVAKTEPEQEQEYAQPDSGYNTGRGSYYTTNPSVGGLAHDHSQLTPDMTGSPQQNGSGRMTP",
"RTSNTAPQWAPGYTTPPRPAAASSLYNIVSDTRGTSGANGSTSDNYSVASNSGYSTGMNGSMGSNKRMRDDDDDRIVPPD",
"SRGEFDTKRRKTLTETPVGGPVGGVPLGLQPMKAGGSLISARR"]
},
{ "name" : "STUA_NEUCR",
"RefSeqID" : "XP_960837",
"UniProtID" : "Q1K6U0",
"taxonomyID" : "367110",
"sequence" : [
"MNPNTPADVYYGQMSQGSSMPVTTVPSHSHYASQQPPPLLQPGSTYAHQYGTPQYGYANALSSPASIPPSLPPSMNSMAG",
"QSVLPLPGSGSMNPAVYASGGFDTTGQVAPPGMKPRVTATLWEDEGSLCFQVEARGICVARREDNAMINGTKLLNVAGMT",
"RGRRDGILKSEKVRHVVKIGPMHLKGVWIPFERALDFANKEKITELLYPLFVHNIGALLYHPTNQSRTSQVMAAAEQRRK",
"DSHGQLRGPPGLPSLQQHHHHHSMLPGPPSLPSHPSMGRPALDRAHTFPTPPTSASSVMGPMGNSDGYQWSQQSMSGTQG",
"NSSLSLDTSLGSNARSMPSTPATTPPGSTIQSMQNYPPVSQSYESSRQMYQGQSAQQAQYQSQQHYSSQPQHQERPVYSQ",
"SSYIKNDMGPPSGRPTGQSNDASDSKPPTGMIHQGQGQSDPGTHAGSEEDDDANNEAEYTHDSGGYDANRGSYNYNTQAV",
"NSLPHDHGLAPEIGGSPHQAGSGRATPRTAAAPSSYYSAQGYHTPPRGQPSSSLYNVMSNERTGSNGTQGNEMYAGQADM",
"PSSLPNGYSAQPSVMNGSSGGLKRGRDDDDDGGRPTTSAPNLGPGMDMKRRKTMMDGGSLPSPTYTATIAQAAPSAIAAH",
"RRR"]
},
{ "name" : "PHD1_SACCE",
"RefSeqID" : "NP_012881",
"UniProtID" : "P36093",
"taxonomyID" : "559292",
"sequence" : [
"MYHVPEMRLHYPLVNTQSNAAITPTRSYDNTLPSFNELSHQSTINLPFVQRETPNAYANVAQLATSPTQAKSGYYCRYYA",
"VPFPTYPQQPQSPYQQAVLPYATIPNSNFQPSSFPVMAVMPPEVQFDGSFLNTLHPHTELPPIIQNTNDTSVARPNNLKS",
"IAAASPTVTATTRTPGVSSTSVLKPRVITTMWEDENTICYQVEANGISVVRRADNNMINGTKLLNVTKMTRGRRDGILRS",
"EKVREVVKIGSMHLKGVWIPFERAYILAQREQILDHLYPLFVKDIESIVDARKPSNKASLTPKSSPAPIKQEPSDNKHEI",
"ATEIKPKSIDALSNGASTQGAGELPHLKINHIDTEAQTSRAKNELS"]
},
{ "name" : "08099_COPCI",
"RefSeqID" : "XP_001836714",
"UniProtID" : "A8NVH3",
"taxonomyID" : "240176",
"sequence" : [
"MSTGMLQETLQTTSASTSGTRFRPYASPNHQVTKGRYITSNDPRGYIPVYEYPLNGQWIMMDIDDGYILWTGIWKALGNS",
"KADIVKMIDSQPDLAPLIRRVRGGYLKIQGTWMPYEVALKLSRRVAWPIRHDLVPLFGPTFPSTCLSPDQPGYGQVVASS",
"NVRRRARRNTQATAQPPREAHSNWTVMTPGPMVGLSFPHSQFSRPPLPPLAPTPARSPSDYAPSSHYGNQLDPQDARRYS",
"HSPYSPLASPPERKSSISSKALSLEIPPVRPSSSKAREDISLPPLKQPDGADPEMSPYALPPISALEDLRGVDTQDSAAV",
"LRRLRLDDDYPSSSRSSTSQDSIWGRRHSLSAHSPHPRSSDNSRFQPYLSSRSYQDSTLKRSRSPAESYADRRRASDFSQ",
"EDSTSAYSPISPATPNSSILSHSSFSDLKKLASSTDTRYNFPRISGRDWAPLKGDTDHIRSSYRSGPSPLELDSDSESSA",
"PHRPW"]
},
{ "name" : "68479_WALME",
"RefSeqID" : "XP_006957792",
"UniProtID" : "I4YDE0",
"taxonomyID" : "671144",
"sequence" : [
"MTNKVQELWWEENKTRVWQVEVDNGNYVARRQDNDQINGTKLLNITKITRGKRDGILKNEKSRQVVKTGTITLKGVWIPF",
"ERAIILARQFNIEQQLYPLFETNLGDYVENSIGSHQIKRKSLNNLMDSLTTNRELVSKRRSTVSTYNPATSAYVSPYGFS",
"PQHCYQTEFEDMNQHSGEIQSGRPRNTSSASDWMTNWSTSSSSPVIPATPNTFSPVMNTFQSLALHSPPIPIPNYYYDSS",
"SSYFPSYHQKQQQQQVQMQMQMHTTASIGGDRQSNEYIQR"]
},
{ "name" : "11943_PUCGR",
"RefSeqID" : "XP_003330006",
"UniProtID" : "E3KMR2",
"taxonomyID" : "418459",
"sequence" : [
"MAAAPTSSFLTSMSAQPPRTVQALVNEEVRAPPPVRLYPSQHRVSMTRYATSTDPRGYIPVFEYPLNGQYIMIDCETGMV",
"HFTGIWKALGHTKADVVKLVESDPTIAPYLRKVRGGYLKIQGTWLPFDTAQTLARRVAWQVRYDLVPLFGPDFPDTCLGP",
"GEPGFGQLLLSAPKPRGRRGAKKAAAAPTVAHERTASPQDNRSQSRPGPYPSQESFGNRCSGRVEAVGAMNGYSPMLSQA",
"RYSPYTRAPVHRITQLEPLPSLIQPNQSCPHPTADSMYSSHYHQSPRQSMMTSHGAGPYGQQHLTGSTASGMQSTAPLPS",
"MRPHQAHQSENNFFETYRGPDSFEALSNKWLAPEVANPSLNDSGLLHGEGGCLPPLQYSNNPVLRNGPSGSPTNQYNFPN",
"QIDSAHSSHHIDSNQTQHVHRHAGFPYESQHQSNFRHDLSTEEAAHHPASPSQQPPPSVTYDKAHNSEPQAGSQAANVTA",
"GCYAASGSNSTGNPAGSPGSHSSHVPKSPTPSSASTSTHMQNSHNPNSHRSPSNTLTNMSNNGGFNSNTQGEEAIQFSVL",
"TSPAHLETSGPSENSIPPAQSSDSDWNPAQNTTGLSPSQAPRQ"]
},
{ "name" : "03082_PUCGR",
"RefSeqID" : "XP_003321545",
"UniProtID" : "E3JYK1",
"taxonomyID" : "418459",
"sequence" : [
"MILISPTRTLPSPRPIDTDPILNYRHIQPAAAAAAVGPWLGQNQHHHHHHDTLAKSPNITTAPATHSPSELSASPAPSAV",
"STGSSLLDPQSVPHIKIPHSSSPPAIMLPQPSSDDDSSTAEEEQPSAQSSNATLNTPTPHTNAPHQLDSHASSVGLYDLP",
"PTSSSAPTTSSSSSPFPSNVPSHQQPSPYSSSPHPNQEHHPHHPHHGNQFYQQSPPALHSPLQSAHHPQQSFDARPHSSL",
"FAHQHYHSRPQSAPHSTSQFSLDPHVLAAAAANVEVKKWDEENTYYYQVAHKGVTVGRLKGSGLVNGTKLLNLAGISRGK",
"RDGILKNEKIRKVVKHGTMHLKGVWIAFDRAVFLAEQHSIADKIFPLLVVNLEHYVPIEPPLMAGGTKLGPGSLFHHHHP",
"RHPRLLPQPIKFPPSTISLAPASANSFSSTGGWPSGPSSALPSIGYNEPFSAPPIPRSAATADTSPSIYEQAQFQYLNSA",
"QANNPDLLERRHTLPNNSFHGYNSVPSFGSSQPPPPVSYSFHYNSTHVPGYPPRSSTAESATPNQFEYQSKNHNGNGNGD",
"AAGSYPATLYHSQPAARPVSSTTAQPSPALNSAPLLLGDLSPGSSTQIVDHGAGDFRLSTGTSNGQVKQEGDDESCNEKR",
"LIMEWNPSC"]
},
{ "name" : "SOK2_SACCE",
"RefSeqID" : "NP_013729",
"UniProtID" : "P53438",
"taxonomyID" : "559292",
"sequence" : [
"MPIGNPINTNDIKSNRMRQESNMSAVSNSESTIGQSTQQQQQQQQYLGQSVQPLMPVSYQYVVPEQWPYPQYYQQPQSQS",
"QQQLQSQPQMYQVQESFQSSGSDSNASNPPSTSVGVPSNATATALPNGSAITTKKSNNSTNISNNVPYYYYFPQMQAQQS",
"MAYSYPQAYYYYPANGDGTTNGATPSVTSNQVQNPNLEKTYSTFEQQQQHQQQQQLQAQTYPAQPPKIGNAFSKFSKSGP",
"PSDSSSGSMSPNSNRTSRNSNSISSLAQQPPMSNYPQPSTYQYPGFHKTSSIPNSHSPIPPRSLTTPTQGPTSQNGPLSY",
"NLPQVGLLPPQQQQQVSPLYDGNSITPPVKPSTDQETYLTANRHGVSDQQYDSMAKTMNSFQTTTIRHPMPLIATTNATG",
"SNTSGTSASIIRPRVTTTMWEDEKTLCYQVEANGISVVRRADNDMVNGTKLLNVTKMTRGRRDGILKAEKIRHVVKIGSM",
"HLKGVWIPFERALAIAQREKIADYLYPLFIRDIQSVLKQNNPSNDSSSSSSSTGIKSISPRTYYQPINNYQNPNGPSNIS",
"AAQLTYSSMNLNNKIIPNNSIPAVSTIAAGEKPLKKCTMPNSNQLEGHTITNLQTLSATMPMKQQLMGNIASPLSYPRNA",
"TMNSASTLGITPADSKPLTPSPTTTNTNQSSESNVGSIHTGITLPRVESESASHSKWSKEADSGNTVPDNQTLKEPRSSQ",
"LPISALTSTDTDKIKTSTSDEATQPNEPSEAEPVKESESSKSQVDGAGDVSNEEIAADDTKKQEK"]
},
{ "name" : "14426_COPCI",
"RefSeqID" : "XP_002911429",
"UniProtID" : "D6RMB0",
"taxonomyID" : "240176",
"sequence" : [
"MTARPPLPLRHANPSLRDGNATIPPVKYQILSCQGKDILVGRLKIDTTDGGHAFILRRFDTQAISLTTMFRAAFPTASEA",
"EEKDEINYVKANFDLFGNNGSSKEPHITRLAGTWVNRDTAGQLAHDYNMVDLINTMVEAEPDPNGQYRRSNKSAQNNNPP",
"TNAPEPTPATNVHATRSPAKQSPKPPSKTLPTPSPGSGDAQPPAPKRRREGSPATFTSGIPVASSPAVPKTPGPRRSTRT",
"KSPAPSRVPQPLTATKPRSRASVAPPSPKKRPVDLPKSSPIKAEEDTAVEDNVAGNELYAQDISEQKKLIADLKAAASSK",
"KPADTVKEDDDQQMEEEGQGPSKLKRIRQDEEKPLQFEFKEPEREERQIATNRRVGRFDMQPERKSLAWGIAAFAFGMTA",
"ITYLPNFL"]
},
{ "name" : "BQT4_SCHPO",
"RefSeqID" : "NP_596166",
"UniProtID" : "O60158",
"taxonomyID" : "284812",
"sequence" : [
"MTENEKSRSLPAERNPLYKDDTLDHTPLIPKCRAQVIEFPDGPATFVRLKCTNPESKVPHFLMRMAKDSSISATSMFRSA",
"FPKATQEEEDLEMRWIRDNLNPIEDKRVAGLWVPPADALALAKDYSMTPFINALLEASSTPSTYATPSRPTAQKSETSEG",
"EPESSTSATTTSVARRTRQRLAEHLENSKKTILQHDNKEEDKEIHSEENETKDEIKSEKKEPEIKKQEGGSSTEKVGQPS",
"SSDDKAKGSTSKDQPSEEEEKTSDIQDRKIKTPIKPSLLGKIRSSVNKGMTDVASQVNRGMTDVASQVNKGVNGVASQVN",
"KGMNGVANQVNKGVTGVASQVRKPVGKLEKKFENLEKSIGDTLKSSIRSSPKSKKRSREDFEENEDYNAMVPVKRSRITK",
"LESEVYYEKRKVRALGGIAIGLGVGAILPFLF"]
},
{ "name" : "PGTG_05590",
"RefSeqID" : "XP_003323688",
"UniProtID" : "E3K4V4",
"taxonomyID" : "418459",
"sequence" : [
"MPKSSSCCEPEQKQSIPTNANPISAGGAGLDIRLAGMRSAHATLRGCSFSPYMVTQHPPLRDSVNRNKQQPTNNSTNPYT",
"KKASRMSQTNLYKSNNPPNLPQDEFNQTLVNYQGKLRSIRIQDININGHTITIARIKIPSPEKLSSHLIKRFDTNAISAS",
"SFFRSAFPHSTEEEEAIQMRYLHQIYDTHTAGAVEFGSARKLTGVWVPIENAAELAEVYGLTRFAEPLLAFPNPKENPRS",
"PTGTKIGGEDESSTTQTPKASQQSKLTGQISVTRSSKRSRAGPLSFGNTSPSSFSLNSFNKPPTETNKSGTHDDSKSTND",
"ENDEKPASPTDRVAGRGARNSPSKKPTTVDENHEHTEHEDHQLIGTDELAQRAKQEALKLVSELKNSQPCTQSSLESPTN",
"TLETELTRTTSPAKSNKVTRKRSSDEVSFEGEEQGEDEDEERTADETATHRSFLPKLLWRKSAAQAHPNSKKHKRTQLGG",
"GGSSSSSSKSFVPLLTNSATPSVDDSSSTHNPNKRNLAIAGIVIAGAAA"]
},
{ "name" : "06560_NEUCR",
"RefSeqID" : "XP_962267",
"UniProtID" : "Q7S9H5",
"taxonomyID" : "367110",
"sequence" : [
"MAQVARHLPARRNPLMLEDVPSHTDLASRRRLGQTQLTPRMVTAVPGAEVDPSSLLAFDYAHLRAPLPKGIVSGIFKSSP",
"PSYFLMRRSQDGYISATGMFKATFPYASQEEEEAERKYIKSIPTTSSEETAGNVWIPPEQALILAEEYQITPWIRALLDP",
"SDIAVTATDSSAPKQIAPPPKFFGAQPPLVAPTPPTTRSTRSRPSSRRSSSPAKSTTTSKRGTTPRNTKRTVTTEASATT",
"VTTTATATAVPSAETPATSFADSQAPTLINGEIPTSTPINTVPVTKIQTTEAELKVESIEKEPVVVLEPIEEEPKIKVRV",
"DEDVKLDKDGEEVKHTKVELEVPLMAGEPPSKEEARKMIEEAKAMVEAAVKADAEAAAALVEASKAGAEDEKAEDEAKAE",
"TEATKEEEADSKGKRKAEKISVDEDEKAADEAEQPRQAKRVKTEAELRKDRIRKRAYLGLTATFAVGALGALLPIITPYV",
"ANVL"]
},
{ "name" : "81480_BIPOR",
"RefSeqID" : "XP_007682909",
"UniProtID" : "W6ZKJ4",
"taxonomyID" : "930090",
"sequence" : [
"MVVDRVLPERKNPLLEPTDSTSIEILIERRRLGQTNLGVKAGVSGIANATKPENMGTFDYAHLRVPLPKDLTGSGIFSRN",
"RMSAFPESYFLMRRSSDGYISATGMFKAAFPWASLQEEDLERKYQKTFPSAGDEEVAGSVWIAPEEALALSEEYSMRHWI",
"EALLDPAPIEKGGKDKSNAAIQMPPRFDVANAQPATLPTFGFRQTRARSARSVSPSKAMTPGRKYATPRKGRSTRSAMKP",
"DATHADDMFRPIEAVTPSTALQNSIARRIAPAETIASSIEGEVKEVEQEVKAALDAEKKPEPELEVQEGTVHIEVKQTVE",
"TNGDTEKTSTSVTVDVPHDHAALPEPEDPTAMIEEAKRMVAEAQKLEGGSPSVTRSSKRGIEEVLDEEDLADERLNKLAK",
"KAYTTEQKMTKEKVTRRALVGLGVMAAIGTAFQYFV"]
},
{ "name" : "01622_ASPNI",
"RefSeqID" : "XP_657766",
"UniProtID" : "Q5BH18",
"taxonomyID" : "227321",
"sequence" : [
"MVRSLPKKNNPFVTPDAAPPYEELLMRRRLGKTNLAVKPTQVGTSNATKPENLGPFEYAHLRAPLPKDLKGSEIFPSHSP",
"QQHPETYFLMRRSKDGYVSATGMFKIAFPWAKLEEERSEREYLKTRPETSEDEIAGNVWISPVLALELAAEYKMYDWVRA",
"LLDPTEIIQSPSSAKKQITPPPKFELPPIQAPEALVPSSRTRSRRSASPSKKAGTPRKPRQTKAQKEAAVAATNEANATL",
"QSALDDTVSNADGEINGDVLPSVEDKREPETSPVKGKKAAAKAKKQAVSEEDQEDKVKIEIKSDAAEGSDVQAAQTTISV",
"EMPISLPEAPSAEDTQEMIAKAKEMVKEAVKLQQEPAESSATAKKRGAEEAELGEEEEDEETKTLRTKRAKVLEEKLKRE",
"RVRNRALMGVTAAFALAKPALVLLEA"]
},
{ "name" : "05405_ASPNI",
"RefSeqID" : "XP_663009",
"UniProtID" : "Q5B225",
"taxonomyID" : "227321",
"sequence" : [
"MASIQFLLNPLPSLPSSDRCPLPTPSPTISSSTAMLRSPRQKKQKMAKDAPIFQRGKPRGEVRYPPYEDRDGKFSCQHQD",
"FRIHPLGNIADYPRHIPYNSDKKSFQERTGRESFEVFQYTFQLPGEEKQWTVMWDYNIGLVRTTHLFKCNDYSKTTPAKM",
"LNQNPGLRDICHSITGGALAAQGYWMPYEAAKAIAATFCWKIRFALTPLFGDNFPDLCIHPDDRARFGRMVIDPGIVRIA",
"TEKANLYRMLELRCSTTNSLRADYVLRPSSAPDIDRTDPNLERDRVALGRHILPKSHRHHHHRSKTSPSTNTSLVGYGSS",
"PEVEYYSCGTEPYCVSPESPIRSSFTPVNTPRSTDIYPSSSSTNFLRSPHELLASLSSSASIARARIERASKISGARVIP",
"SSVPSNVTSITTKGRDNTGHSALMEESDIDADAETDSGHEHDLDFELSSSDESSTSSTVSSSTSSASLGFAANSRNRPYR",
"DDDEPHRDTDEEMVDYRAPKRIATAGARDRRWGRGRRVIHQEHSDIETSRRARKHAQRSSNARLVCEMTAAHALISLLHD",
"ATGSDVDVDTHNRLECGRSPDGGVKNNLKGSYFGIRLNHNPSTESGQKRRRASA"]
},
{ "name" : "105954_BIPOR",
"RefSeqID" : "XP_007691967",
"UniProtID" : "W6Z1H5",
"taxonomyID" : "930090",
"sequence" : [
"MNIQDLLNPSCGDRHDHRRSESATPPSRPVAILPALRRQKIPKDAPIFSEGNRTVGIVNFAPHEAGNDEELLAQHCRFQI",
"YPLGEISRKGVRHIPYNSDKKDFLEKTGRDAFEMFQYTYKLPGEDKPYVVVWDYNVGLVRMTPFFKSCKYSKTIPAKTLR",
"ENPGLKDISYSITGGALVCQGYWIPYQAARAIAATFCYDIRWALTPVFGNDFPSICLTPDDPSFAKFVIDPAIVRYCTEE",
"TTKFRELGSAYEVHRPVAPTQVEAPTSRSDQPLSTSIVRQRRARPIDIESGYGTDTERNDRCLFSPEVSPRTRFTPINRP",
"RSPYSPRTAESSFVSSPVSIRAPPGLHTPTSTPYEHSGEVFRAKRSHSKVAFCEHPADEAVIRPPTAATVDSAHGCEMCV",
"GDDNHSHLDMDAAEMLLSLRTADSAMPPSKRTRRGS"]
},
{ "name" : "69819_WALME",
"RefSeqID" : "XP_006959479",
"UniProtID" : "I4Y911",
"taxonomyID" : "671144",
"sequence" : [
"MTSPGLPKDFNELLDKSEIPSPKWQQITRDDRPITIARLKLPHPREKHTFILRRYDCNGISFGSLFKAAYPYATDEEEKI",
"ESGFVKKNYDVTLVPTEEYQERKLAKLAGFWIPIAIAEELGQRYAMAEYVDALAKADTPDLTDFKKRSSNRQTSEDIKSS",
"PAKAQASLESPAKSASKIPTPTKNPAPRRSARHQSRSPSPSPLTHNLTPGKKKAKKAPKEAVIEESVEETIVVDKKESPL",
"KKALNDDQVLADIERAKDLVDDIKQSKNLSQSSPVKVVKEEVLETIQPSVSTESLEGEGKRKRELEDETGNEIKVVSFGQ",
"NPPANPEEIQQRPVVQRRGVAAAVGAFALGVGFAASNILPRFLF"]
},
{ "name" : "02840_CRYNE",
"RefSeqID" : "XP_568872",
"UniProtID" : "Q5KM59",
"taxonomyID" : "214684",
"sequence" : [
"MSHPAADAPPPYPGTTDDAQYDLTPLPHTANRPRLPEDKRNPHLNNLPEDTKIVKFQTIVRENKEIVVGRIKVPTENANG",
"THHAFILRRYDTNAISLTTMYKVAFPSATEEEEKREMDWVKSSFDTRGTNGGRDSEVVRLAGQWVSRNLAIHIAPAYNLV",
"QLVAALSRAVPDPNVAYRKSQRSQAAADELARTKAKQSQAPSSVPAISNVPVRKPQAAIPSMATEISSPASKRQRKDSVT",
"EASGSATQTITEAQPSADTSETDDTRHITIEATTTITSPSGANVDMDAEIEQAKQLVKDLRQEIQLRNEAGDSLEDQGVA",
"VADDVRGVKRGKHEDEAVVISGGAGGKDRVVRTNKRIPQTAGGDVGQRFGWGAFVFSIGLGASLTLFSQYASSLL"]
},
{ "name" : "11055_USTMA",
"RefSeqID" : "XP_011390537",
"UniProtID" : "A0A0D1DZM8",
"taxonomyID" : "237631",
"sequence" : [
"MPAAASARKSTPTRKSTPRRARSSSVTSNASTGVPASPSASPRKTKKQKEAAAAAAAAVAAAAATAEQVNDDESDLLRPK",
"LPTKRNPRLKEVDEAVVKLQIIKREGHNIIIGRVKLPTVNGQDHAFLLKRFDTNAMAASSMFRLAFPFADGTAEAAEMRF",
"LDTKYDTNRANGGYIVEEVKVPETPKKRGRTRKTAENSKKESTPDTESVSADKQIRVLPEGSTGVRLQGTWIPAEDAIEV",
"AEDYGIAKYALALIHATAEHAEDGGAPILTSEPVAEVKTPRKRQRVSAAAATASDTPDSPQLVQRVTRLENADGSISKVR",
"VESTLEAPSSNGVPVALSQAEIEEQIAQAKALAAGIQQSITAGSGSASTRGQKRRAVNDRPTAEIDPLADDEDYSESGRV",
"VRAFRRGTRVARRRPIATTAGAVAAAGAVGAGALAWVSGGNPEVAIQTLQASMQSIGLQNLQNLGLQNLQQIGTQLGAHL",
"ASILPW"]
},
{ "name" : "XBP1_NEUCR",
"RefSeqID" : "XP_962373",
"UniProtID" : "Q7S9W7",
"taxonomyID" : "367110",
"sequence" : [
"MLNQNPGLKDIAYSITGGAIKAQGYWMPYACAKAVCATFCYQIAGALIPLFGPDFPSECISPGEPRYGIMIIKPELISDT",
"MRKAQELYRRYGNWGGGCTSSSPARRPLRTASSGSQERHHHHPYPNQEHLDHQQQQQRTVCSRRCPAEENSCVDARPQLR",
"GISAPMPPAGEWTPPLLRSSAGRPRPVMPTSTHSSISYPERAPHRSAWTAVNHQPPNNSLDRYSLKRPLPSNEPDESVSH",
"SNWPSRSQAPNPWLTAIPRSPRKTSSSPWASQPGSASRSRAGSIDSMASQHPQGLPSPSLILSSPSSSMVSLSSSNSPSP",
"RPQLPPISQLCSLPVPSGRRRLPNGRPSRVGGDATSSHSRQDHSTCGAYQFSAGYQRALTPPSSTSAPMHWRSQRRPSLQ",
"DQHEHEHIEDTQPRRIAVEANMECGDDNESHLHLPLPLPRTSSSASIVADKNANDTTSDNSSSRNFNSASIGSGRDDGQT",
"SLAARKTAALTLLHLRQQEEEKEAAAAAAAAAAAAYSSTKRPESPSSSLSSPVSPPPTSGQPSPTLSAVVTATNLRRGTT",
"TATATAVIDTTEPLAPPPSPSSNYLGSPISTSIASSSSSFSPSTSCNGTRENSVVANEMTRYAGQEADAGGPRHCNGDAD",
"DEGDYEHEQQYRRKRRRLLLVGRAKSF"]
},
{ "name" : "XBP1_SACCE",
"RefSeqID" : "NP_012165",
"UniProtID" : "P40489",
"taxonomyID" : "559292",
"sequence" : [
"MKYPAFSINSDTVHLTDNPLDDYQRLYLVSVLDRDSPPASFSAGLNIRKVNYKSSIAAQFTHPNFIISARDAGNGEEAAA",
"QNVLNCFEYQFPNLQTIQSLVHEQTLLSQLASSATPHSALHLHDKNILMGKIILPSRSNKTPVSASPTKQEKKALSTASR",
"ENATSSLTKNQQFKLTKMDHNLINDKLINPNNCVIWSHDSGYVFMTGIWRLYQDVMKGLINLPRGDSVSTSQQQFFCKAE",
"FEKILSFCFYNHSSFTSEESSSVLLSSSTSSPPKRRTSTGSTFLDANASSSSTSSTQANNYIDFHWNNIKPELRDLICQS",
"YKDFLINELGPDQIDLPNLNPANFTKRIRGGYIKIQGTWLPMEISRLLCLRFCFPIRYFLVPIFGPDFPKDCESWYLAHQ",
"NVTFASSTTGAGAATAATAAANTSTNFTSTAVARPRQKPRPRPRQRSTSMSHSKAQKLVIEDALPSFDSFVENLGLSSND",
"KNFIKKNSKRQKSSTYTSQTSSPIGPRDPTVQILSNLASFYNTHGHRYSYPGNIYIPQQRYSLPPPNQLSSPQRQLNYTY",
"DHIHPVPSQYQSPRHYNVPSSPIAPAPPTFPQPYGDDHYHFLKYASEVYKQQNQRPAHNTNTNMDTSFSPRANNSLNNFK",
"FKTNSKQ"]
}
]

View File

@ -1,116 +1,116 @@
[
{"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"},
{"pName" : "MBP1_SACCE", "fName" : "KilA-N", "start" : "22", "end" : "105"},
{"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "108", "end" : "122"},
{"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "236", "end" : "241"},
{"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "279", "end" : "307"},
{"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "700", "end" : "717"},
{"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "394", "end" : "423"},
{"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "427", "end" : "463"},
{"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "512", "end" : "541"},
{"pName" : "MBP1_SACCE", "fName" : "Swi6 fold", "start" : "381", "end" : "547"},
{"pName" : "MBP1_SACCE", "fName" : "coiled coil", "start" : "633", "end" : "655"},
{"pName" : "MBP1_ASPNI", "fName" : "APSES fold", "start" : "9", "end" : "106"},
{"pName" : "MBP1_ASPNI", "fName" : "KilA-N", "start" : "26", "end" : "109"},
{"pName" : "MBP1_ASPNI", "fName" : "low complexity", "start" : "529", "end" : "534"},
{"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "260", "end" : "289"},
{"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "381", "end" : "413"},
{"pName" : "MBP1_ASPNI", "fName" : "Swi6 fold", "start" : "193", "end" : "402"},
{"pName" : "MBP1_ASPNI", "fName" : "coiled coil", "start" : "509", "end" : "572"},
{"pName" : "MBP1_BIPOR", "fName" : "APSES fold", "start" : "8", "end" : "106"},
{"pName" : "MBP1_BIPOR", "fName" : "KilA-N", "start" : "26", "end" : "109"},
{"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "134", "end" : "152"},
{"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "267", "end" : "278"},
{"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "670", "end" : "685"},
{"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "266", "end" : "295"},
{"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "387", "end" : "416"},
{"pName" : "MBP1_BIPOR", "fName" : "Swi6 fold", "start" : "253", "end" : "421"},
{"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "659", "end" : "681"},
{"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "500", "end" : "590"},
{"pName" : "MBP1_NEUCR", "fName" : "APSES fold", "start" : "14", "end" : "114"},
{"pName" : "MBP1_NEUCR", "fName" : "KilA-N", "start" : "34", "end" : "117"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "130", "end" : "141"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "253", "end" : "266"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "514", "end" : "525"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "554", "end" : "564"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "601", "end" : "618"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "620", "end" : "629"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "636", "end" : "652"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "658", "end" : "672"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "725", "end" : "735"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "752", "end" : "771"},
{"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "268", "end" : "297"},
{"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "390", "end" : "419"},
{"pName" : "MBP1_NEUCR", "fName" : "Swi6 fold", "start" : "270", "end" : "426"},
{"pName" : "MBP1_NEUCR", "fName" : "coiled coil", "start" : "500", "end" : "550"},
{"pName" : "MBP1_SCHPO", "fName" : "APSES fold", "start" : "8", "end" : "104"},
{"pName" : "MBP1_SCHPO", "fName" : "KilA-N", "start" : "25", "end" : "113"},
{"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "111", "end" : "125"},
{"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "136", "end" : "145"},
{"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "176", "end" : "191"},
{"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "422", "end" : "447"},
{"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "247", "end" : "276"},
{"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "368", "end" : "397"},
{"pName" : "MBP1_SCHPO", "fName" : "Swi6 fold", "start" : "234", "end" : "400"},
{"pName" : "MBP1_SCHPO", "fName" : "coiled coil", "start" : "457", "end" : "538"},
{"pName" : "MBP1_COPCI", "fName" : "APSES fold", "start" : "5", "end" : "103"},
{"pName" : "MBP1_COPCI", "fName" : "KilA-N", "start" : "23", "end" : "106"},
{"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "170", "end" : "191"},
{"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "435", "end" : "450"},
{"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "611", "end" : "626"},
{"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "270", "end" : "299"},
{"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "389", "end" : "418"},
{"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "474", "end" : "509"},
{"pName" : "MBP1_COPCI", "fName" : "Swi6 fold", "start" : "257", "end" : "429"},
{"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "500", "end" : "570"},
{"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "651", "end" : "678"},
{"pName" : "MBP1_CRYNE", "fName" : "APSES fold", "start" : "16", "end" : "114"},
{"pName" : "MBP1_CRYNE", "fName" : "KilA-N", "start" : "34", "end" : "117"},
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "66", "end" : "85"},
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "413", "end" : "423"},
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "633", "end" : "644"},
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "697", "end" : "709"},
{"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "477", "end" : "506"},
{"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "618", "end" : "647"},
{"pName" : "MBP1_CRYNE", "fName" : "Swi6 fold", "start" : "452", "end" : "663"},
{"pName" : "MBP1_PUCGR", "fName" : "APSES fold", "start" : "90", "end" : "187"},
{"pName" : "MBP1_PUCGR", "fName" : "KilA-N", "start" : "107", "end" : "190"},
{"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "208", "end" : "227"},
{"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "273", "end" : "291"},
{"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "442", "end" : "271"},
{"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "475", "end" : "509"},
{"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "561", "end" : "590"},
{"pName" : "MBP1_PUCGR", "fName" : "Swi6 fold", "start" : "429", "end" : "601"},
{"pName" : "MBP1_PUCGR", "fName" : "coiled coil", "start" : "827", "end" : "863"},
{"pName" : "MBP1_USTMA", "fName" : "APSES fold", "start" : "7", "end" : "104"},
{"pName" : "MBP1_USTMA", "fName" : "KilA-N", "start" : "24", "end" : "107"},
{"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "106", "end" : "116"},
{"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "161", "end" : "183"},
{"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "666", "end" : "681"},
{"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "688", "end" : "700"},
{"pName" : "MBP1_USTMA", "fName" : "AT hook", "start" : "134", "end" : "146"},
{"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "245", "end" : "274"},
{"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "278", "end" : "314"},
{"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "364", "end" : "393"},
{"pName" : "MBP1_USTMA", "fName" : "Swi6 fold", "start" : "232", "end" : "404"},
{"pName" : "MBP1_USTMA", "fName" : "coiled coil", "start" : "590", "end" : "618"},
{"pName" : "MBP1_WALME", "fName" : "APSES fold", "start" : "6", "end" : "103"},
{"pName" : "MBP1_WALME", "fName" : "KilA-N", "start" : "23", "end" : "106"},
{"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "149", "end" : "162"},
{"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "171", "end" : "188"},
{"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "618", "end" : "628"},
{"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "634", "end" : "660"},
{"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "250", "end" : "279"},
{"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "369", "end" : "398"},
{"pName" : "MBP1_WALME", "fName" : "Swi6 fold", "start" : "237", "end" : "409"},
{"pName" : "MBP1_WALME", "fName" : "coiled coil", "start" : "461", "end" : "585"}
]
[
{"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"},
{"pName" : "MBP1_SACCE", "fName" : "KilA-N", "start" : "22", "end" : "105"},
{"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "108", "end" : "122"},
{"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "236", "end" : "241"},
{"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "279", "end" : "307"},
{"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "700", "end" : "717"},
{"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "394", "end" : "423"},
{"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "427", "end" : "463"},
{"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "512", "end" : "541"},
{"pName" : "MBP1_SACCE", "fName" : "Swi6 fold", "start" : "381", "end" : "547"},
{"pName" : "MBP1_SACCE", "fName" : "coiled coil", "start" : "633", "end" : "655"},
{"pName" : "MBP1_ASPNI", "fName" : "APSES fold", "start" : "9", "end" : "106"},
{"pName" : "MBP1_ASPNI", "fName" : "KilA-N", "start" : "26", "end" : "109"},
{"pName" : "MBP1_ASPNI", "fName" : "low complexity", "start" : "529", "end" : "534"},
{"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "260", "end" : "289"},
{"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "381", "end" : "413"},
{"pName" : "MBP1_ASPNI", "fName" : "Swi6 fold", "start" : "193", "end" : "402"},
{"pName" : "MBP1_ASPNI", "fName" : "coiled coil", "start" : "509", "end" : "572"},
{"pName" : "MBP1_BIPOR", "fName" : "APSES fold", "start" : "8", "end" : "106"},
{"pName" : "MBP1_BIPOR", "fName" : "KilA-N", "start" : "26", "end" : "109"},
{"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "134", "end" : "152"},
{"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "267", "end" : "278"},
{"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "670", "end" : "685"},
{"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "266", "end" : "295"},
{"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "387", "end" : "416"},
{"pName" : "MBP1_BIPOR", "fName" : "Swi6 fold", "start" : "253", "end" : "421"},
{"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "659", "end" : "681"},
{"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "500", "end" : "590"},
{"pName" : "MBP1_NEUCR", "fName" : "APSES fold", "start" : "14", "end" : "114"},
{"pName" : "MBP1_NEUCR", "fName" : "KilA-N", "start" : "34", "end" : "117"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "130", "end" : "141"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "253", "end" : "266"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "514", "end" : "525"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "554", "end" : "564"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "601", "end" : "618"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "620", "end" : "629"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "636", "end" : "652"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "658", "end" : "672"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "725", "end" : "735"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "752", "end" : "771"},
{"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "268", "end" : "297"},
{"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "390", "end" : "419"},
{"pName" : "MBP1_NEUCR", "fName" : "Swi6 fold", "start" : "270", "end" : "426"},
{"pName" : "MBP1_NEUCR", "fName" : "coiled coil", "start" : "500", "end" : "550"},
{"pName" : "MBP1_SCHPO", "fName" : "APSES fold", "start" : "8", "end" : "104"},
{"pName" : "MBP1_SCHPO", "fName" : "KilA-N", "start" : "25", "end" : "113"},
{"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "111", "end" : "125"},
{"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "136", "end" : "145"},
{"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "176", "end" : "191"},
{"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "422", "end" : "447"},
{"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "247", "end" : "276"},
{"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "368", "end" : "397"},
{"pName" : "MBP1_SCHPO", "fName" : "Swi6 fold", "start" : "234", "end" : "400"},
{"pName" : "MBP1_SCHPO", "fName" : "coiled coil", "start" : "457", "end" : "538"},
{"pName" : "MBP1_COPCI", "fName" : "APSES fold", "start" : "5", "end" : "103"},
{"pName" : "MBP1_COPCI", "fName" : "KilA-N", "start" : "23", "end" : "106"},
{"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "170", "end" : "191"},
{"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "435", "end" : "450"},
{"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "611", "end" : "626"},
{"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "270", "end" : "299"},
{"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "389", "end" : "418"},
{"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "474", "end" : "509"},
{"pName" : "MBP1_COPCI", "fName" : "Swi6 fold", "start" : "257", "end" : "429"},
{"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "500", "end" : "570"},
{"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "651", "end" : "678"},
{"pName" : "MBP1_CRYNE", "fName" : "APSES fold", "start" : "16", "end" : "114"},
{"pName" : "MBP1_CRYNE", "fName" : "KilA-N", "start" : "34", "end" : "117"},
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "66", "end" : "85"},
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "413", "end" : "423"},
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "633", "end" : "644"},
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "697", "end" : "709"},
{"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "477", "end" : "506"},
{"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "618", "end" : "647"},
{"pName" : "MBP1_CRYNE", "fName" : "Swi6 fold", "start" : "452", "end" : "663"},
{"pName" : "MBP1_PUCGR", "fName" : "APSES fold", "start" : "90", "end" : "187"},
{"pName" : "MBP1_PUCGR", "fName" : "KilA-N", "start" : "107", "end" : "190"},
{"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "208", "end" : "227"},
{"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "273", "end" : "291"},
{"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "442", "end" : "271"},
{"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "475", "end" : "509"},
{"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "561", "end" : "590"},
{"pName" : "MBP1_PUCGR", "fName" : "Swi6 fold", "start" : "429", "end" : "601"},
{"pName" : "MBP1_PUCGR", "fName" : "coiled coil", "start" : "827", "end" : "863"},
{"pName" : "MBP1_USTMA", "fName" : "APSES fold", "start" : "7", "end" : "104"},
{"pName" : "MBP1_USTMA", "fName" : "KilA-N", "start" : "24", "end" : "107"},
{"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "106", "end" : "116"},
{"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "161", "end" : "183"},
{"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "666", "end" : "681"},
{"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "688", "end" : "700"},
{"pName" : "MBP1_USTMA", "fName" : "AT hook", "start" : "134", "end" : "146"},
{"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "245", "end" : "274"},
{"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "278", "end" : "314"},
{"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "364", "end" : "393"},
{"pName" : "MBP1_USTMA", "fName" : "Swi6 fold", "start" : "232", "end" : "404"},
{"pName" : "MBP1_USTMA", "fName" : "coiled coil", "start" : "590", "end" : "618"},
{"pName" : "MBP1_WALME", "fName" : "APSES fold", "start" : "6", "end" : "103"},
{"pName" : "MBP1_WALME", "fName" : "KilA-N", "start" : "23", "end" : "106"},
{"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "149", "end" : "162"},
{"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "171", "end" : "188"},
{"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "618", "end" : "628"},
{"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "634", "end" : "660"},
{"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "250", "end" : "279"},
{"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "369", "end" : "398"},
{"pName" : "MBP1_WALME", "fName" : "Swi6 fold", "start" : "237", "end" : "409"},
{"pName" : "MBP1_WALME", "fName" : "coiled coil", "start" : "461", "end" : "585"}
]

View File

@ -1,47 +1,47 @@
[
{ "name" : "APSES fold",
"description " : "DNA binding domain by similarity to structure",
"sourceDB" : "PDB",
"accession" : "1BM8_A_1_99"},
{ "name" : "KilA-N",
"description " : "DNA binding domain by Pfam annotation",
"sourceDB" : "Pfam",
"accession" : "PF04383"},
{ "name" : "AT hook",
"description " : "DNA interaction motif by SMART annotation",
"sourceDB" : "SMART",
"accession" : null},
{ "name" : "low complexity",
"description " : "SEG annotation by SMART",
"sourceDB" : "SMART",
"accession" : null},
{ "name" : "Ankyrin fold",
"description " : "Ankyrin domain by SMART annotation",
"sourceDB" : "SMART",
"accession" : "SM00248"},
{ "name" : "Swi6 fold",
"description " : "Swi6 fold by similarity to structure",
"sourceDB" : "PDB",
"accession" : "1SW6_B"},
{ "name" : "coiled coil",
"description " : "Coiled coil by SMART annotation",
"sourceDB" : "SMART",
"accession" : null},
{ "name" : "McInerny 2011",
"description " : "Yeast cell cycle review",
"sourceDB" : "PubMed",
"accession" : "21310294"}
]
[
{ "name" : "APSES fold",
"description " : "DNA binding domain by similarity to structure",
"sourceDB" : "PDB",
"accession" : "1BM8_A_1_99"},
{ "name" : "KilA-N",
"description " : "DNA binding domain by Pfam annotation",
"sourceDB" : "Pfam",
"accession" : "PF04383"},
{ "name" : "AT hook",
"description " : "DNA interaction motif by SMART annotation",
"sourceDB" : "SMART",
"accession" : null},
{ "name" : "low complexity",
"description " : "SEG annotation by SMART",
"sourceDB" : "SMART",
"accession" : null},
{ "name" : "Ankyrin fold",
"description " : "Ankyrin domain by SMART annotation",
"sourceDB" : "SMART",
"accession" : "SM00248"},
{ "name" : "Swi6 fold",
"description " : "Swi6 fold by similarity to structure",
"sourceDB" : "PDB",
"accession" : "1SW6_B"},
{ "name" : "coiled coil",
"description " : "Coiled coil by SMART annotation",
"sourceDB" : "SMART",
"accession" : null},
{ "name" : "McInerny 2011",
"description " : "Yeast cell cycle review",
"sourceDB" : "PubMed",
"accession" : "21310294"}
]

View File

@ -1,155 +1,155 @@
[
{ "name" : "MBP1_SCHPO",
"RefSeqID" : "NP_593032",
"UniProtID" : "P41412",
"taxonomyID" : 284812,
"sequence" : [
"MAPRSSAVHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRVLERQVQIGAHEKVQ",
"GGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILSLDIDEGKAIAPKKKQTKQKKPSVRGRRGRKPSSLSS",
"STLHSVNEKQPNSSISPTIESSMNKVNLPGAEEQVSATPLPASPNALLSPNDNTIKPVEELGMLEAPLDK",
"YEESLLDFFLHPEEGRIPSFLYSPPPDFQVNSVIDDDGHTSLHWACSMGHIEMIKLLLRANADIGVCNRL",
"SQTPLMRSVIFTNNYDCQTFGQVLELLQSTIYAVDTNGQSIFHHIVQSTSTPSKVAAAKYYLDCILEKLI",
"SIQPFENVVRLVNLQDSNGDTSLLIAARNGAMDCVNSLLSYNANPSIPNRQRRTASEYLLEADKKPHSLL",
"QSNSNASHSAFSFSGISPAIISPSCSSHAFVKAIPSISSKFSQLAEEYESQLREKEEDLIRANRLKQDTL",
"NEISRTYQELTFLQKNNPTYSQSMENLIREAQETYQQLSKRLLIWLEARQIFDLERSLKPHTSLSISFPS",
"DFLKKEDGLSLNNDFKKPACNNVTNSDEYEQLINKLTSLQASRKKDTLYIRKLYEELGIDDTVNSYRRLI",
"AMSCGINPEDLSLEILDAVEEALTREK"]
},
{ "name" : "MBP1_ASPNI",
"RefSeqID" : "XP_660758",
"UniProtID" : "Q5B8H6",
"taxonomyID" : 227321,
"sequence" : [
"MAAVDFSNVYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRILEREVQKGVHEKV",
"QGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDYVAGDRSPPPAPKHTSAASKPRAPKINKRVVKEDV",
"FSAVNHHRSMGPPSFHHEHYDVNTGLDEDESIEQATLESSSMIADEDMISMSQNGPYSSRKRKRGINEVA",
"AMSLSEQEHILYGDQLLDYFMTVGDAPEATRIPPPQPPANFQVDRPIDDSGNTALHWACAMGDLEIVKDL",
"LRRGADMKALSIHEETPLVRAVLFTNNYEKRTFPALLDLLLDTISFRDWFGATLFHHIAQTTKSKGKWKS",
"SRYYCEVALEKLRTTFSPEEVDLLLSCQDSVGDTAVLVAARNGVFRLVDLLLSRCPRAGDLVNKRGETAS",
"SIMQRAHLAERDIPPPPSSITMGNDHIDGEVGAPTSLEPQSVTLHHESSPATAQLLSQIGAIMAEASRKL",
"TSSYGAAKPSQKDSDDVANPEALYEQLEQDRQKIRRQYDALAAKEAAEESSDAQLGRYEQMRDNYESLLE",
"QIQRARLKERLASTPVPTQTAVIGSSSPEQDRLLTTFQLSRALCSEQKIRRAAVKELAQQRADAGVSTKF",
"DVHRKLVALATGLKEEELDPMAAELAETLEFDRMNGKGVGPESPEADHKDSASLPFPGPVVSVDA"]
},
{ "name" : "MBP1_BIPOR",
"RefSeqID" : "XP_007682304",
"UniProtID" : "W6ZM86",
"taxonomyID" : 930090,
"sequence" : [
"MPPAPDGKIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRILEREVQKGVHEKV",
"QGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDYVPGDRSPPPAPKHATAASNRMKPPRQTAAAVAAA",
"AVAAAAAAAAVANHNALMSNSRSQASEDPYENSQRSQIYREDTPDNETVISESMLGDADLMDMSQYSADG",
"NRKRKRGMDQMSLLDQQHQIWADQLLDYFMLLDHEAAVSWPEPPPSINLDRPIDEKGHAAMHWAAAMGDV",
"GVVKELIHRGARLDCLSNNLETPLMRAVMFTNNFDKETMPSMVKIFQQTVHRTDWFGSTVFHHIAATTSS",
"SNKYVCARWYLDCIINKLSETWIPEEVTRLLNAADQNGDTAIMIAARNGARKCVRSLLGRNVAVDIPNKK",
"GETADDLIRELNQRRRMHGRTRQASSSPFAPAPEHRLNGHVPHFDGGPLMSVPVPSMAVRESVQYRSQTA",
"SHLMTKVAPTLLEKCEELATAYEAELQEKEAEFFDAERVVKRRQAELEAVRKQVAELQSMSKGLHIDLND",
"EEAERQQEDELRLLVEEAESLLEIEQKAELRRLCSSMPQQNSDSSPVDITEKMRLALLLHRAQLERRELV",
"REVVGNLSVAGMSEKQGTYKKLIAKALGEREEDVESMLPEILQELEEAETQERAEGLDGSPV"]
},
{ "name" : "MBP1_NEUCR",
"RefSeqID" : "XP_955821",
"UniProtID" : "Q7RW59",
"taxonomyID" : 367110,
"sequence" : [
"MVKENVGGNPEPGIYSATYSGIPVWEYQFGVDLKEHVMRRRHDDWVNATHILKAAGFDKPARTRILEREV",
"QKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEFQPGNESPPPAPRHASKPKAPKVKPAV",
"PTWGSKSAKNANPPQPGTFLPPGRKGLPAQAPDYNDADTHMHDDDTPDNLTVASASYMAEDDRYDHSHFS",
"TGHRKRKRDELIEDMTEQQHAVYGDELLDYFLLSRNEQPAVRPDPPPNFKPDWPIDNERHTCLHWASAMG",
"DVDVMRQLKKFGASLDAQNVRGETPFMRAVNFTNCFEKQTFPQVMKELFSTIDCRDLSGCTVIHHAAVMK",
"IGRVNSQSCSRYYLDIILNRLQETHHPEFVQQLLDAQDNDGNTAVHLAAMRDARKCIRALLGRGASTDIP",
"NKQGIRAEELIKELNASISKSRSNLPQRSSSPFAPDTQRHDAFHEAISESMVTSRKNSQPNYSSDAANTV",
"QNRITPLVLQKLKDLTATYDSEFKEKDDAEKEARRILNKTQSELKALTASIDDYNSRLDTDDVAAKTAAE",
"MATARHKVLAFVTHQNRISVQEAVKQELAALDRANAVTNGTSTKSKSSSPSKKPKLSPIPDQKDKPPKDE",
"NETESEAEHPDPPAAQAHQQQPGPSSQDTEVEDQDREEEEDDYTHRLSLAAELRSILQEQRSAENDYVEA",
"RGMLGTGERIDKYKHLLMSCLPPDEQENLEENLEEMIKLMEQEDESVTDLPAGAVGGGGGGNAADGSGGG",
"GQPSNGRRESVLPALRGGNGDGEMSRRGSRTAAAAAAQVDGEREINGRAGAERTERIQEIAAV"]
},
{ "name" : "MBP1_COPCI",
"RefSeqID" : "XP_001837394",
"UniProtID" : "A8NYC6",
"taxonomyID" : 240176,
"sequence" : [
"MPEAQIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRVLEREVQKGEHEKVQGG",
"YGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEFTPAAKSPPLAPKHLVATAGNRPVRKPLTTDLSAAVIN",
"TRSTRKQVADGVGEESDHDTHSLRGSEDGSMTPSPSEASSSSRTPSPIHSPGTYHSNGLDGPSSGGRNRY",
"RQSNDRYDEDDDASRHNGMGDPRSYGDQILEYFISDTNQIPPILITPPPDFDPNMAIDDDGHTSLHWACA",
"MGRIRIVKLLLSAGADIFKVNKAGQTALMRSVMFANNYDVRKFPELYELLHRSTLNIDNSNRTVFHHVVD",
"VAMSKGKTHAARYYMETILTRLADYPKELADVINFQDEDGETALTMAARCRSKRLVKLLIDHGADPKINN",
"HDGKNAEDYILEDERFRSSPAPSSRVAAMSYRNAQVAYPPPGAPSTYSFAPANHDRPPLHYSAAAQKAST",
"RCVNDMASMLDSLAASFDQELRDKERDMAQAQALLTNIQAEILESQRTVLQLRQQAEGLSQAKQRLADLE",
"NALQDKMGRRYRLGFEKWIKDEETREKVIRDAANGDLVLTPATTSYTVDEDGDSDSGSNGDKNKGKRKAQ",
"VQQEEVSDLVELYSNIPTDPEELRKQCEALREEVSQSRKRRKAMFDELVTFQAEAGTSGRMSDYRRLIAA",
"GCGGLEPLEIDSVLGMLLETLEAEDPSSTSATWSGSKGQQTG"]
},
{ "name" : "MBP1_CRYNE",
"RefSeqID" : "XP_569090",
"UniProtID" : "Q5KMQ9",
"taxonomyID" : 214684,
"sequence" : [
"MGKKVIASGGDNGPNTIYKATYSGVPVYEMVCRDVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREV",
"QKGEHEKVQGGYGKYQGTWIPIERGLALAKQYGVEDILRPIIDYVPTSVSPPPAPKHSVAPPSKARRDKE",
"KETGRTKATPSRTGPTSAAALQAQAQLNRAKMHDSTPDADASFRSFEERVSLTPEDDSSSDTPSPVASVM",
"TDQDMEVDKMGMHMSMPNVTLSQNMEELGAGSRKRSAAMMMEDEDQFGQLRSIRGNSAVHTPHGTPRHLG",
"IGMPPEPIGPEQYTDIILNYFVSETSQIPSILVSPPHDFDPNAPIDDDGHTALHWACAMGRVRVVKLLLT",
"AGASIFAGNNAEQTPLMRSVMFSNNYDMRKFPELYELLHRSTLNIDKQNRTVFHHIANLALTKGKTHAAK",
"YYMETILARLADYPQELADVINFQDEEGETALTIAARARSRRLVKALLDHGANPKIKNRDSRSAEDYILE",
"DERFRSSPVPAPNGGIGKASTSAAAEKPLFAPQLYFSEAARLCGGQALTDITSHMQSLARSFDAELQGKE",
"RDILQAKALLTNIHTEVTENGRSITAITNQAAPLEEKRRELEALQASLKTRVKDALKKGYIGWLEGELVR",
"EQRWENGELEGNEEEKAAVQALRDVPTGGQEVVQAEEEKLRWEIEEKRKRRAMFVEKFVRAQTEAGTSEQ",
"IAKYRKLVSAGLGGVSTNEVDELMNQLLEGLEEENDNQVYNTTAGESGPSSWVQ"]
},
{ "name" : "MBP1_PUCGR",
"RefSeqID" : "XP_003327086",
"UniProtID" : "E3KED4",
"taxonomyID" : 418459,
"sequence" : [
"MAYGGSIQPLRPPSRESATLHLHQPDLTVTSPPLSLTHCPPCVYSHFTHTPTSLIVIQVSLHSLLDQETY",
"HLLPSRSPPTVSVRMGTTTIYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRVLE",
"REIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNFQPSSNESPPLAPKHVTALSTRVK",
"VSKVSAASAARAARAVVPSLPSTSGLGGRNTNNSWSNFDSDNEPGLPPAASSRESNGNWATQSKLARSSN",
"LARARANINNSHPEDLPVPAPDQLQASPLPSMQTADPENDNSLTPSELSLPSRTPSPIEDLPLTVNTASS",
"QSTRNKGKSRDLPDDEDLSRGQKRKYDTSLVEDTSYSDGADDQYINGNPSNAASAKYAKLILDYFVSESS",
"QIPNFLNDPPSDFDPNVVIDDDGHTALHWACAMGRIKIIKLLLTCGADIFRANNAGQTALMRAVMFTNNH",
"DLRTFPELFESFSGSVINIDRTDRTVFHYVIDIALTKGKVPAARYYLETILSQLSEYPKELIDILNFQDE",
"DGETALTLAARCRSKKLVKILLDHGANPKTANRDGKSAEDYILEDDKFRALSPTPCSSGPIRQLDQNSPG",
"GTSNRSDFVDLVDPVPIDSNLIPQRSPNASPPHYSETGQRVTKQLLPEVTSMIELLATTFDTELQDKERD",
"LDHAVGLLSNIEKEYLEGQRKILNYERMLSDFGEKKLALGDLEKELNDKLGKRYRFGWEKYVRDEEERAR",
"RITEQRSKYLQELSIEDRKLLDSSNLRFADPSKQEVLMKLQADERENSDLLNLIRTNSTDVESECDLLRE",
"SVQKLSEERERLFKEFINLSSENTGGENEEDDGANHTSANTSRLNNYRKLISLGCGGIGLDEVDEVIESL",
"NEGIDVNELNDNGFLTEQDEELGNHQNYHNIHTQGR"]
},
{ "name" : "MBP1_USTMA",
"RefSeqID" : "XP_011392621",
"UniProtID" : "A0A0D1DP35",
"taxonomyID" : 237631,
"sequence" : [
"MSGDKTIFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRVLEREIQKGIHEKVQG",
"GYGKYQGTWIPLDVAIELAERYNIQGLLQPITSYVPSAADSPPPAPKHTISTSNRSKKIIPADPGALGRS",
"RRATSIETESEVIGAAPNNVSEGSMSPSPSDISSSSRTPSPLPADRAHPLHANHALAGYNGRDANNHARY",
"ADIILDYFVTENTTVPSLLINPPPDFNPDMSIDDDEHTALHWACAMGRIRVVKLLLSAGADIFRVNSNQQ",
"TALMRATMFSNNYDLRKFPELFELLHRSILNIDRNDRTVFHHVVDLALSRGKPHAARYYMETMINRLADY",
"GDQLADILNFQDDEGETPLTMAARARSKRLVRLLLEHGADPKIRNKEGKNAEDYIIEDERFRSSPSRTGP",
"AGIELGADGLPVLPTSSLHTSEAGQRTAGRAVTLMSNLLHSLADSYDSEINTAEKKLTQAHGLLKQIQTE",
"IEDSAKVAEALHHEAQGVDEERKRVDSLQLALKHAINKRARDDLERRWSEGKQAIKRARLQAGLEPGALS",
"TSNATNAPATGDQKSKDDAKSLIEALPAGTNVKTAIAELRKQLSQVQANKTELVDKFVARAREQGTGRTM",
"AAYRRLIAAGCGGIAPDEVDAVVGVLCELLQESHTGARAGAGGERDDRARDVAMMLKGAGAAALAANAGA",
"P"]
},
{ "name" : "MBP1_WALME",
"RefSeqID" : "XP_006957051",
"UniProtID" : "I4YGC0",
"taxonomyID" : 671144,
"sequence" : [
"MSAPPIYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRILEREVQKGTHEKVQGG",
"YGKYQGTWIPMERSVELARQYRIELLLDPIINYLPGPQSPPLAPKHATNVGSRARKSTAPAAQTLPSTSK",
"VFHPLSSTKHPAKLAAATNAKAEISDGEDASIPSSPSFKSNSSRTPSPIRINARKRKLEDEATIPSSAID",
"GSISYEDIILDYFISESTQIPALLIHPPSDFNPNMSIDDEGHTAMHWACAMGKVRVVKLLLSAGADIFRV",
"NHSEQTALMRSVMFSNNYDIRKFPQLYELLHRSTLNLDKHDRTVLHHIVDLALTKSKTHAARYYMECVLS",
"KLANYPDELADVINFQDDEGESALTLAARARSKRLVKLLLEHGADSKLPNKDGKTAEDYILEDERFRQSP",
"LLNSNHLRLHPPDTSIYAPPAHLFNSETSQNIANTSMSSVANLLESLAQSYDKEITQKERDYQQAQVILR",
"NIKTDIVEAKSNIEKMTIDSSEFEHLKHKLRELEMKLEEHSNDVYNKGWEEYSRNVDDPAIDAPSDNVQE",
"ECASLRNKIKDLQEKRISSMQELIKRQKEVGTGKKMSEYRKLISVGCGIPTTEIDAVLEMLLESLESENA",
"NKKAALASGISGALSSTSSAPSQATTSAPTGVATPGAPVPASSEKAGLLPPAPVMQ"]
}
]
[
{ "name" : "MBP1_SCHPO",
"RefSeqID" : "NP_593032",
"UniProtID" : "P41412",
"taxonomyID" : 284812,
"sequence" : [
"MAPRSSAVHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRVLERQVQIGAHEKVQ",
"GGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILSLDIDEGKAIAPKKKQTKQKKPSVRGRRGRKPSSLSS",
"STLHSVNEKQPNSSISPTIESSMNKVNLPGAEEQVSATPLPASPNALLSPNDNTIKPVEELGMLEAPLDK",
"YEESLLDFFLHPEEGRIPSFLYSPPPDFQVNSVIDDDGHTSLHWACSMGHIEMIKLLLRANADIGVCNRL",
"SQTPLMRSVIFTNNYDCQTFGQVLELLQSTIYAVDTNGQSIFHHIVQSTSTPSKVAAAKYYLDCILEKLI",
"SIQPFENVVRLVNLQDSNGDTSLLIAARNGAMDCVNSLLSYNANPSIPNRQRRTASEYLLEADKKPHSLL",
"QSNSNASHSAFSFSGISPAIISPSCSSHAFVKAIPSISSKFSQLAEEYESQLREKEEDLIRANRLKQDTL",
"NEISRTYQELTFLQKNNPTYSQSMENLIREAQETYQQLSKRLLIWLEARQIFDLERSLKPHTSLSISFPS",
"DFLKKEDGLSLNNDFKKPACNNVTNSDEYEQLINKLTSLQASRKKDTLYIRKLYEELGIDDTVNSYRRLI",
"AMSCGINPEDLSLEILDAVEEALTREK"]
},
{ "name" : "MBP1_ASPNI",
"RefSeqID" : "XP_660758",
"UniProtID" : "Q5B8H6",
"taxonomyID" : 227321,
"sequence" : [
"MAAVDFSNVYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRILEREVQKGVHEKV",
"QGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDYVAGDRSPPPAPKHTSAASKPRAPKINKRVVKEDV",
"FSAVNHHRSMGPPSFHHEHYDVNTGLDEDESIEQATLESSSMIADEDMISMSQNGPYSSRKRKRGINEVA",
"AMSLSEQEHILYGDQLLDYFMTVGDAPEATRIPPPQPPANFQVDRPIDDSGNTALHWACAMGDLEIVKDL",
"LRRGADMKALSIHEETPLVRAVLFTNNYEKRTFPALLDLLLDTISFRDWFGATLFHHIAQTTKSKGKWKS",
"SRYYCEVALEKLRTTFSPEEVDLLLSCQDSVGDTAVLVAARNGVFRLVDLLLSRCPRAGDLVNKRGETAS",
"SIMQRAHLAERDIPPPPSSITMGNDHIDGEVGAPTSLEPQSVTLHHESSPATAQLLSQIGAIMAEASRKL",
"TSSYGAAKPSQKDSDDVANPEALYEQLEQDRQKIRRQYDALAAKEAAEESSDAQLGRYEQMRDNYESLLE",
"QIQRARLKERLASTPVPTQTAVIGSSSPEQDRLLTTFQLSRALCSEQKIRRAAVKELAQQRADAGVSTKF",
"DVHRKLVALATGLKEEELDPMAAELAETLEFDRMNGKGVGPESPEADHKDSASLPFPGPVVSVDA"]
},
{ "name" : "MBP1_BIPOR",
"RefSeqID" : "XP_007682304",
"UniProtID" : "W6ZM86",
"taxonomyID" : 930090,
"sequence" : [
"MPPAPDGKIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRILEREVQKGVHEKV",
"QGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDYVPGDRSPPPAPKHATAASNRMKPPRQTAAAVAAA",
"AVAAAAAAAAVANHNALMSNSRSQASEDPYENSQRSQIYREDTPDNETVISESMLGDADLMDMSQYSADG",
"NRKRKRGMDQMSLLDQQHQIWADQLLDYFMLLDHEAAVSWPEPPPSINLDRPIDEKGHAAMHWAAAMGDV",
"GVVKELIHRGARLDCLSNNLETPLMRAVMFTNNFDKETMPSMVKIFQQTVHRTDWFGSTVFHHIAATTSS",
"SNKYVCARWYLDCIINKLSETWIPEEVTRLLNAADQNGDTAIMIAARNGARKCVRSLLGRNVAVDIPNKK",
"GETADDLIRELNQRRRMHGRTRQASSSPFAPAPEHRLNGHVPHFDGGPLMSVPVPSMAVRESVQYRSQTA",
"SHLMTKVAPTLLEKCEELATAYEAELQEKEAEFFDAERVVKRRQAELEAVRKQVAELQSMSKGLHIDLND",
"EEAERQQEDELRLLVEEAESLLEIEQKAELRRLCSSMPQQNSDSSPVDITEKMRLALLLHRAQLERRELV",
"REVVGNLSVAGMSEKQGTYKKLIAKALGEREEDVESMLPEILQELEEAETQERAEGLDGSPV"]
},
{ "name" : "MBP1_NEUCR",
"RefSeqID" : "XP_955821",
"UniProtID" : "Q7RW59",
"taxonomyID" : 367110,
"sequence" : [
"MVKENVGGNPEPGIYSATYSGIPVWEYQFGVDLKEHVMRRRHDDWVNATHILKAAGFDKPARTRILEREV",
"QKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEFQPGNESPPPAPRHASKPKAPKVKPAV",
"PTWGSKSAKNANPPQPGTFLPPGRKGLPAQAPDYNDADTHMHDDDTPDNLTVASASYMAEDDRYDHSHFS",
"TGHRKRKRDELIEDMTEQQHAVYGDELLDYFLLSRNEQPAVRPDPPPNFKPDWPIDNERHTCLHWASAMG",
"DVDVMRQLKKFGASLDAQNVRGETPFMRAVNFTNCFEKQTFPQVMKELFSTIDCRDLSGCTVIHHAAVMK",
"IGRVNSQSCSRYYLDIILNRLQETHHPEFVQQLLDAQDNDGNTAVHLAAMRDARKCIRALLGRGASTDIP",
"NKQGIRAEELIKELNASISKSRSNLPQRSSSPFAPDTQRHDAFHEAISESMVTSRKNSQPNYSSDAANTV",
"QNRITPLVLQKLKDLTATYDSEFKEKDDAEKEARRILNKTQSELKALTASIDDYNSRLDTDDVAAKTAAE",
"MATARHKVLAFVTHQNRISVQEAVKQELAALDRANAVTNGTSTKSKSSSPSKKPKLSPIPDQKDKPPKDE",
"NETESEAEHPDPPAAQAHQQQPGPSSQDTEVEDQDREEEEDDYTHRLSLAAELRSILQEQRSAENDYVEA",
"RGMLGTGERIDKYKHLLMSCLPPDEQENLEENLEEMIKLMEQEDESVTDLPAGAVGGGGGGNAADGSGGG",
"GQPSNGRRESVLPALRGGNGDGEMSRRGSRTAAAAAAQVDGEREINGRAGAERTERIQEIAAV"]
},
{ "name" : "MBP1_COPCI",
"RefSeqID" : "XP_001837394",
"UniProtID" : "A8NYC6",
"taxonomyID" : 240176,
"sequence" : [
"MPEAQIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRVLEREVQKGEHEKVQGG",
"YGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEFTPAAKSPPLAPKHLVATAGNRPVRKPLTTDLSAAVIN",
"TRSTRKQVADGVGEESDHDTHSLRGSEDGSMTPSPSEASSSSRTPSPIHSPGTYHSNGLDGPSSGGRNRY",
"RQSNDRYDEDDDASRHNGMGDPRSYGDQILEYFISDTNQIPPILITPPPDFDPNMAIDDDGHTSLHWACA",
"MGRIRIVKLLLSAGADIFKVNKAGQTALMRSVMFANNYDVRKFPELYELLHRSTLNIDNSNRTVFHHVVD",
"VAMSKGKTHAARYYMETILTRLADYPKELADVINFQDEDGETALTMAARCRSKRLVKLLIDHGADPKINN",
"HDGKNAEDYILEDERFRSSPAPSSRVAAMSYRNAQVAYPPPGAPSTYSFAPANHDRPPLHYSAAAQKAST",
"RCVNDMASMLDSLAASFDQELRDKERDMAQAQALLTNIQAEILESQRTVLQLRQQAEGLSQAKQRLADLE",
"NALQDKMGRRYRLGFEKWIKDEETREKVIRDAANGDLVLTPATTSYTVDEDGDSDSGSNGDKNKGKRKAQ",
"VQQEEVSDLVELYSNIPTDPEELRKQCEALREEVSQSRKRRKAMFDELVTFQAEAGTSGRMSDYRRLIAA",
"GCGGLEPLEIDSVLGMLLETLEAEDPSSTSATWSGSKGQQTG"]
},
{ "name" : "MBP1_CRYNE",
"RefSeqID" : "XP_569090",
"UniProtID" : "Q5KMQ9",
"taxonomyID" : 214684,
"sequence" : [
"MGKKVIASGGDNGPNTIYKATYSGVPVYEMVCRDVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREV",
"QKGEHEKVQGGYGKYQGTWIPIERGLALAKQYGVEDILRPIIDYVPTSVSPPPAPKHSVAPPSKARRDKE",
"KETGRTKATPSRTGPTSAAALQAQAQLNRAKMHDSTPDADASFRSFEERVSLTPEDDSSSDTPSPVASVM",
"TDQDMEVDKMGMHMSMPNVTLSQNMEELGAGSRKRSAAMMMEDEDQFGQLRSIRGNSAVHTPHGTPRHLG",
"IGMPPEPIGPEQYTDIILNYFVSETSQIPSILVSPPHDFDPNAPIDDDGHTALHWACAMGRVRVVKLLLT",
"AGASIFAGNNAEQTPLMRSVMFSNNYDMRKFPELYELLHRSTLNIDKQNRTVFHHIANLALTKGKTHAAK",
"YYMETILARLADYPQELADVINFQDEEGETALTIAARARSRRLVKALLDHGANPKIKNRDSRSAEDYILE",
"DERFRSSPVPAPNGGIGKASTSAAAEKPLFAPQLYFSEAARLCGGQALTDITSHMQSLARSFDAELQGKE",
"RDILQAKALLTNIHTEVTENGRSITAITNQAAPLEEKRRELEALQASLKTRVKDALKKGYIGWLEGELVR",
"EQRWENGELEGNEEEKAAVQALRDVPTGGQEVVQAEEEKLRWEIEEKRKRRAMFVEKFVRAQTEAGTSEQ",
"IAKYRKLVSAGLGGVSTNEVDELMNQLLEGLEEENDNQVYNTTAGESGPSSWVQ"]
},
{ "name" : "MBP1_PUCGR",
"RefSeqID" : "XP_003327086",
"UniProtID" : "E3KED4",
"taxonomyID" : 418459,
"sequence" : [
"MAYGGSIQPLRPPSRESATLHLHQPDLTVTSPPLSLTHCPPCVYSHFTHTPTSLIVIQVSLHSLLDQETY",
"HLLPSRSPPTVSVRMGTTTIYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRVLE",
"REIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNFQPSSNESPPLAPKHVTALSTRVK",
"VSKVSAASAARAARAVVPSLPSTSGLGGRNTNNSWSNFDSDNEPGLPPAASSRESNGNWATQSKLARSSN",
"LARARANINNSHPEDLPVPAPDQLQASPLPSMQTADPENDNSLTPSELSLPSRTPSPIEDLPLTVNTASS",
"QSTRNKGKSRDLPDDEDLSRGQKRKYDTSLVEDTSYSDGADDQYINGNPSNAASAKYAKLILDYFVSESS",
"QIPNFLNDPPSDFDPNVVIDDDGHTALHWACAMGRIKIIKLLLTCGADIFRANNAGQTALMRAVMFTNNH",
"DLRTFPELFESFSGSVINIDRTDRTVFHYVIDIALTKGKVPAARYYLETILSQLSEYPKELIDILNFQDE",
"DGETALTLAARCRSKKLVKILLDHGANPKTANRDGKSAEDYILEDDKFRALSPTPCSSGPIRQLDQNSPG",
"GTSNRSDFVDLVDPVPIDSNLIPQRSPNASPPHYSETGQRVTKQLLPEVTSMIELLATTFDTELQDKERD",
"LDHAVGLLSNIEKEYLEGQRKILNYERMLSDFGEKKLALGDLEKELNDKLGKRYRFGWEKYVRDEEERAR",
"RITEQRSKYLQELSIEDRKLLDSSNLRFADPSKQEVLMKLQADERENSDLLNLIRTNSTDVESECDLLRE",
"SVQKLSEERERLFKEFINLSSENTGGENEEDDGANHTSANTSRLNNYRKLISLGCGGIGLDEVDEVIESL",
"NEGIDVNELNDNGFLTEQDEELGNHQNYHNIHTQGR"]
},
{ "name" : "MBP1_USTMA",
"RefSeqID" : "XP_011392621",
"UniProtID" : "A0A0D1DP35",
"taxonomyID" : 237631,
"sequence" : [
"MSGDKTIFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRVLEREIQKGIHEKVQG",
"GYGKYQGTWIPLDVAIELAERYNIQGLLQPITSYVPSAADSPPPAPKHTISTSNRSKKIIPADPGALGRS",
"RRATSIETESEVIGAAPNNVSEGSMSPSPSDISSSSRTPSPLPADRAHPLHANHALAGYNGRDANNHARY",
"ADIILDYFVTENTTVPSLLINPPPDFNPDMSIDDDEHTALHWACAMGRIRVVKLLLSAGADIFRVNSNQQ",
"TALMRATMFSNNYDLRKFPELFELLHRSILNIDRNDRTVFHHVVDLALSRGKPHAARYYMETMINRLADY",
"GDQLADILNFQDDEGETPLTMAARARSKRLVRLLLEHGADPKIRNKEGKNAEDYIIEDERFRSSPSRTGP",
"AGIELGADGLPVLPTSSLHTSEAGQRTAGRAVTLMSNLLHSLADSYDSEINTAEKKLTQAHGLLKQIQTE",
"IEDSAKVAEALHHEAQGVDEERKRVDSLQLALKHAINKRARDDLERRWSEGKQAIKRARLQAGLEPGALS",
"TSNATNAPATGDQKSKDDAKSLIEALPAGTNVKTAIAELRKQLSQVQANKTELVDKFVARAREQGTGRTM",
"AAYRRLIAAGCGGIAPDEVDAVVGVLCELLQESHTGARAGAGGERDDRARDVAMMLKGAGAAALAANAGA",
"P"]
},
{ "name" : "MBP1_WALME",
"RefSeqID" : "XP_006957051",
"UniProtID" : "I4YGC0",
"taxonomyID" : 671144,
"sequence" : [
"MSAPPIYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRILEREVQKGTHEKVQGG",
"YGKYQGTWIPMERSVELARQYRIELLLDPIINYLPGPQSPPLAPKHATNVGSRARKSTAPAAQTLPSTSK",
"VFHPLSSTKHPAKLAAATNAKAEISDGEDASIPSSPSFKSNSSRTPSPIRINARKRKLEDEATIPSSAID",
"GSISYEDIILDYFISESTQIPALLIHPPSDFNPNMSIDDEGHTAMHWACAMGKVRVVKLLLSAGADIFRV",
"NHSEQTALMRSVMFSNNYDIRKFPQLYELLHRSTLNLDKHDRTVLHHIVDLALTKSKTHAARYYMECVLS",
"KLANYPDELADVINFQDDEGESALTLAARARSKRLVKLLLEHGADSKLPNKDGKTAEDYILEDERFRQSP",
"LLNSNHLRLHPPDTSIYAPPAHLFNSETSQNIANTSMSSVANLLESLAQSYDKEITQKERDYQQAQVILR",
"NIKTDIVEAKSNIEKMTIDSSEFEHLKHKLRELEMKLEEHSNDVYNKGWEEYSRNVDDPAIDAPSDNVQE",
"ECASLRNKIKDLQEKRISSMQELIKRQKEVGTGKKMSEYRKLISVGCGIPTTEIDAVLEMLLESLESENA",
"NKKAALASGISGALSSTSSAPSQATTSAPTGVATPGAPVPASSEKAGLLPPAPVMQ"]
}
]

View File

@ -1,22 +1,22 @@
[
{ "ID" : 227321,
"species" : "Aspergillus nidulans FGSC A4"},
{ "ID" : 930090,
"species" : "Bipolaris oryzae ATCC 44560"},
{ "ID" : 240176,
"species" : "Coprinopsis cinerea okayama7#130"},
{ "ID" : 214684,
"species" : "Cryptococcus neoformans var. neoformans JEC21"},
{ "ID" : 367110,
"species" : "Neurospora crassa OR74A"},
{ "ID" : 418459,
"species" : "Puccinia graminis f. sp. tritici CRL 75-36-700-3"},
{ "ID" : 559292,
"species" : "Saccharomyces cerevisiae S288C"},
{ "ID" : 284812,
"species" : "Schizosaccharomyces pombe 972h-"},
{ "ID" : 237631,
"species" : "Ustilago maydis 521"},
{ "ID" : 671144,
"species" : "Wallemia mellicola CBS 633.66"}
]
[
{ "ID" : 227321,
"species" : "Aspergillus nidulans FGSC A4"},
{ "ID" : 930090,
"species" : "Bipolaris oryzae ATCC 44560"},
{ "ID" : 240176,
"species" : "Coprinopsis cinerea okayama7#130"},
{ "ID" : 214684,
"species" : "Cryptococcus neoformans var. neoformans JEC21"},
{ "ID" : 367110,
"species" : "Neurospora crassa OR74A"},
{ "ID" : 418459,
"species" : "Puccinia graminis f. sp. tritici CRL 75-36-700-3"},
{ "ID" : 559292,
"species" : "Saccharomyces cerevisiae S288C"},
{ "ID" : 284812,
"species" : "Schizosaccharomyces pombe 972h-"},
{ "ID" : 237631,
"species" : "Ustilago maydis 521"},
{ "ID" : 671144,
"species" : "Wallemia mellicola CBS 633.66"}
]

View File

@ -1,115 +1,115 @@
ID protein.ID feature.ID start end note
# MBP1_SACCE
NA ref_pro_4 ref_ftr_1 4 102 APSES fold
NA ref_pro_4 ref_ftr_2 22 105 KilA-N
NA ref_pro_4 ref_ftr_4 108 122 low complexity
NA ref_pro_4 ref_ftr_4 236 241 low complexity
NA ref_pro_4 ref_ftr_4 279 307 low complexity
NA ref_pro_4 ref_ftr_4 700 717 low complexity
NA ref_pro_4 ref_ftr_4 700 717 low complexity
NA ref_pro_4 ref_ftr_5 394 423 Ankyrin
NA ref_pro_4 ref_ftr_5 427 463 Ankyrin
NA ref_pro_4 ref_ftr_5 512 541 Ankyrin
NA ref_pro_4 ref_ftr_6 381 547 Swi6 fold
NA ref_pro_4 ref_ftr_7 633 655 coiled coil
# MBP1_ASPNI
NA ref_pro_1 ref_ftr_1 9 106 APSES fold
NA ref_pro_1 ref_ftr_2 26 109 KilA-N
NA ref_pro_1 ref_ftr_4 529 534 low complexity
NA ref_pro_1 ref_ftr_5 260 289 Ankyrin
NA ref_pro_1 ref_ftr_5 381 413 Ankyrin
NA ref_pro_1 ref_ftr_6 193 402 Swi6 fold
NA ref_pro_1 ref_ftr_7 509 572 coiled coil
# MBP1_BIPOR
NA ref_pro_2 ref_ftr_1 8 106 APSES fold
NA ref_pro_2 ref_ftr_2 26 109 KilA-N
NA ref_pro_2 ref_ftr_4 134 152 low complexity
NA ref_pro_2 ref_ftr_4 267 278 low complexity
NA ref_pro_2 ref_ftr_4 670 685 low complexity
NA ref_pro_2 ref_ftr_5 266 295 Ankyrin
NA ref_pro_2 ref_ftr_5 387 416 Ankyrin
NA ref_pro_2 ref_ftr_6 253 421 Swi6 fold
NA ref_pro_2 ref_ftr_7 659 681 coiled coil
NA ref_pro_2 ref_ftr_7 500 590 coiled coil
# MBP1_NEUCR
NA ref_pro_3 ref_ftr_1 14 114 APSES fold
NA ref_pro_3 ref_ftr_2 34 117 KilA-N
NA ref_pro_3 ref_ftr_4 130 141 low complexity
NA ref_pro_3 ref_ftr_4 253 266 low complexity
NA ref_pro_3 ref_ftr_4 514 525 low complexity
NA ref_pro_3 ref_ftr_4 554 564 low complexity
NA ref_pro_3 ref_ftr_4 601 618 low complexity
NA ref_pro_3 ref_ftr_4 620 629 low complexity
NA ref_pro_3 ref_ftr_4 636 652 low complexity
NA ref_pro_3 ref_ftr_4 658 672 low complexity
NA ref_pro_3 ref_ftr_4 725 735 low complexity
NA ref_pro_3 ref_ftr_4 752 771 low complexity
NA ref_pro_3 ref_ftr_5 268 297 Ankyrin
NA ref_pro_3 ref_ftr_5 390 419 Ankyrin
NA ref_pro_3 ref_ftr_6 270 426 Swi6 fold
NA ref_pro_3 ref_ftr_7 500 550 coiled coil
# MBP1_SCHPO
NA ref_pro_5 ref_ftr_1 8 104 APSES fold
NA ref_pro_5 ref_ftr_2 25 113 KilA-N
NA ref_pro_5 ref_ftr_4 111 125 low complexity
NA ref_pro_5 ref_ftr_4 136 145 low complexity
NA ref_pro_5 ref_ftr_4 176 191 low complexity
NA ref_pro_5 ref_ftr_4 422 447 low complexity
NA ref_pro_5 ref_ftr_5 247 276 Ankyrin
NA ref_pro_5 ref_ftr_5 368 397 Ankyrin
NA ref_pro_5 ref_ftr_6 234 400 Swi6 fold
NA ref_pro_5 ref_ftr_7 457 538 coiled coil
# MBP1_COPCI
NA ref_pro_6 ref_ftr_1 5 103 APSES fold
NA ref_pro_6 ref_ftr_2 23 106 KilA-N
NA ref_pro_6 ref_ftr_4 170 191 low complexity
NA ref_pro_6 ref_ftr_4 435 450 low complexity
NA ref_pro_6 ref_ftr_4 611 626 low complexity
NA ref_pro_6 ref_ftr_5 270 299 Ankyrin
NA ref_pro_6 ref_ftr_5 389 418 Ankyrin
NA ref_pro_6 ref_ftr_5 474 509 Ankyrin
NA ref_pro_6 ref_ftr_6 257 429 Swi6 fold
NA ref_pro_6 ref_ftr_7 500 570 coiled coil
NA ref_pro_6 ref_ftr_7 651 678 coiled coil
# MBP1_CRYNE
NA ref_pro_7 ref_ftr_1 113 211 APSES fold
NA ref_pro_7 ref_ftr_2 131 215 KilA-N
NA ref_pro_7 ref_ftr_4 66 85 low complexity
NA ref_pro_7 ref_ftr_4 413 423 low complexity
NA ref_pro_7 ref_ftr_4 633 644 low complexity
NA ref_pro_7 ref_ftr_4 697 709 low complexity
NA ref_pro_7 ref_ftr_5 477 506 Ankyrin
NA ref_pro_7 ref_ftr_5 618 647 Ankyrin
NA ref_pro_7 ref_ftr_6 452 663 Swi6 fold
# MBP1_PUCGR
NA ref_pro_8 ref_ftr_1 90 187 APSES fold
NA ref_pro_8 ref_ftr_2 107 190 KilA-N
NA ref_pro_8 ref_ftr_4 208 227 low complexity
NA ref_pro_8 ref_ftr_4 273 291 low complexity
NA ref_pro_8 ref_ftr_5 442 471 Ankyrin
NA ref_pro_8 ref_ftr_5 475 509 Ankyrin
NA ref_pro_8 ref_ftr_5 561 590 Ankyrin
NA ref_pro_8 ref_ftr_6 429 601 Swi6 fold
NA ref_pro_8 ref_ftr_7 827 863 coiled coil
# MBP1_USTMA
NA ref_pro_9 ref_ftr_1 7 104 APSES fold
NA ref_pro_9 ref_ftr_2 24 107 KilA-N
NA ref_pro_9 ref_ftr_4 106 116 low complexity
NA ref_pro_9 ref_ftr_4 161 183 low complexity
NA ref_pro_9 ref_ftr_4 657 672 low complexity
NA ref_pro_9 ref_ftr_4 776 796 low complexity
NA ref_pro_9 ref_ftr_5 245 274 Ankyrin
NA ref_pro_9 ref_ftr_5 355 384 Ankyrin
NA ref_pro_9 ref_ftr_6 232 395 Swi6 fold
NA ref_pro_9 ref_ftr_7 581 609 coiled coil
# MBP1_WALME
NA ref_pro_10 ref_ftr_1 6 103 APSES fold
NA ref_pro_10 ref_ftr_2 23 106 KilA-N
NA ref_pro_10 ref_ftr_4 149 162 low complexity
NA ref_pro_10 ref_ftr_4 171 188 low complexity
NA ref_pro_10 ref_ftr_4 618 628 low complexity
NA ref_pro_10 ref_ftr_4 634 660 low complexity
NA ref_pro_10 ref_ftr_5 250 279 Ankyrin
NA ref_pro_10 ref_ftr_5 369 398 Ankyrin
NA ref_pro_10 ref_ftr_6 237 409 Swi6 fold
NA ref_pro_10 ref_ftr_7 461 585 coiled coil
ID protein.ID feature.ID start end note
# MBP1_SACCE
NA ref_pro_4 ref_ftr_1 4 102 APSES fold
NA ref_pro_4 ref_ftr_2 22 105 KilA-N
NA ref_pro_4 ref_ftr_4 108 122 low complexity
NA ref_pro_4 ref_ftr_4 236 241 low complexity
NA ref_pro_4 ref_ftr_4 279 307 low complexity
NA ref_pro_4 ref_ftr_4 700 717 low complexity
NA ref_pro_4 ref_ftr_4 700 717 low complexity
NA ref_pro_4 ref_ftr_5 394 423 Ankyrin
NA ref_pro_4 ref_ftr_5 427 463 Ankyrin
NA ref_pro_4 ref_ftr_5 512 541 Ankyrin
NA ref_pro_4 ref_ftr_6 381 547 Swi6 fold
NA ref_pro_4 ref_ftr_7 633 655 coiled coil
# MBP1_ASPNI
NA ref_pro_1 ref_ftr_1 9 106 APSES fold
NA ref_pro_1 ref_ftr_2 26 109 KilA-N
NA ref_pro_1 ref_ftr_4 529 534 low complexity
NA ref_pro_1 ref_ftr_5 260 289 Ankyrin
NA ref_pro_1 ref_ftr_5 381 413 Ankyrin
NA ref_pro_1 ref_ftr_6 193 402 Swi6 fold
NA ref_pro_1 ref_ftr_7 509 572 coiled coil
# MBP1_BIPOR
NA ref_pro_2 ref_ftr_1 8 106 APSES fold
NA ref_pro_2 ref_ftr_2 26 109 KilA-N
NA ref_pro_2 ref_ftr_4 134 152 low complexity
NA ref_pro_2 ref_ftr_4 267 278 low complexity
NA ref_pro_2 ref_ftr_4 670 685 low complexity
NA ref_pro_2 ref_ftr_5 266 295 Ankyrin
NA ref_pro_2 ref_ftr_5 387 416 Ankyrin
NA ref_pro_2 ref_ftr_6 253 421 Swi6 fold
NA ref_pro_2 ref_ftr_7 659 681 coiled coil
NA ref_pro_2 ref_ftr_7 500 590 coiled coil
# MBP1_NEUCR
NA ref_pro_3 ref_ftr_1 14 114 APSES fold
NA ref_pro_3 ref_ftr_2 34 117 KilA-N
NA ref_pro_3 ref_ftr_4 130 141 low complexity
NA ref_pro_3 ref_ftr_4 253 266 low complexity
NA ref_pro_3 ref_ftr_4 514 525 low complexity
NA ref_pro_3 ref_ftr_4 554 564 low complexity
NA ref_pro_3 ref_ftr_4 601 618 low complexity
NA ref_pro_3 ref_ftr_4 620 629 low complexity
NA ref_pro_3 ref_ftr_4 636 652 low complexity
NA ref_pro_3 ref_ftr_4 658 672 low complexity
NA ref_pro_3 ref_ftr_4 725 735 low complexity
NA ref_pro_3 ref_ftr_4 752 771 low complexity
NA ref_pro_3 ref_ftr_5 268 297 Ankyrin
NA ref_pro_3 ref_ftr_5 390 419 Ankyrin
NA ref_pro_3 ref_ftr_6 270 426 Swi6 fold
NA ref_pro_3 ref_ftr_7 500 550 coiled coil
# MBP1_SCHPO
NA ref_pro_5 ref_ftr_1 8 104 APSES fold
NA ref_pro_5 ref_ftr_2 25 113 KilA-N
NA ref_pro_5 ref_ftr_4 111 125 low complexity
NA ref_pro_5 ref_ftr_4 136 145 low complexity
NA ref_pro_5 ref_ftr_4 176 191 low complexity
NA ref_pro_5 ref_ftr_4 422 447 low complexity
NA ref_pro_5 ref_ftr_5 247 276 Ankyrin
NA ref_pro_5 ref_ftr_5 368 397 Ankyrin
NA ref_pro_5 ref_ftr_6 234 400 Swi6 fold
NA ref_pro_5 ref_ftr_7 457 538 coiled coil
# MBP1_COPCI
NA ref_pro_6 ref_ftr_1 5 103 APSES fold
NA ref_pro_6 ref_ftr_2 23 106 KilA-N
NA ref_pro_6 ref_ftr_4 170 191 low complexity
NA ref_pro_6 ref_ftr_4 435 450 low complexity
NA ref_pro_6 ref_ftr_4 611 626 low complexity
NA ref_pro_6 ref_ftr_5 270 299 Ankyrin
NA ref_pro_6 ref_ftr_5 389 418 Ankyrin
NA ref_pro_6 ref_ftr_5 474 509 Ankyrin
NA ref_pro_6 ref_ftr_6 257 429 Swi6 fold
NA ref_pro_6 ref_ftr_7 500 570 coiled coil
NA ref_pro_6 ref_ftr_7 651 678 coiled coil
# MBP1_CRYNE
NA ref_pro_7 ref_ftr_1 113 211 APSES fold
NA ref_pro_7 ref_ftr_2 131 215 KilA-N
NA ref_pro_7 ref_ftr_4 66 85 low complexity
NA ref_pro_7 ref_ftr_4 413 423 low complexity
NA ref_pro_7 ref_ftr_4 633 644 low complexity
NA ref_pro_7 ref_ftr_4 697 709 low complexity
NA ref_pro_7 ref_ftr_5 477 506 Ankyrin
NA ref_pro_7 ref_ftr_5 618 647 Ankyrin
NA ref_pro_7 ref_ftr_6 452 663 Swi6 fold
# MBP1_PUCGR
NA ref_pro_8 ref_ftr_1 90 187 APSES fold
NA ref_pro_8 ref_ftr_2 107 190 KilA-N
NA ref_pro_8 ref_ftr_4 208 227 low complexity
NA ref_pro_8 ref_ftr_4 273 291 low complexity
NA ref_pro_8 ref_ftr_5 442 471 Ankyrin
NA ref_pro_8 ref_ftr_5 475 509 Ankyrin
NA ref_pro_8 ref_ftr_5 561 590 Ankyrin
NA ref_pro_8 ref_ftr_6 429 601 Swi6 fold
NA ref_pro_8 ref_ftr_7 827 863 coiled coil
# MBP1_USTMA
NA ref_pro_9 ref_ftr_1 7 104 APSES fold
NA ref_pro_9 ref_ftr_2 24 107 KilA-N
NA ref_pro_9 ref_ftr_4 106 116 low complexity
NA ref_pro_9 ref_ftr_4 161 183 low complexity
NA ref_pro_9 ref_ftr_4 657 672 low complexity
NA ref_pro_9 ref_ftr_4 776 796 low complexity
NA ref_pro_9 ref_ftr_5 245 274 Ankyrin
NA ref_pro_9 ref_ftr_5 355 384 Ankyrin
NA ref_pro_9 ref_ftr_6 232 395 Swi6 fold
NA ref_pro_9 ref_ftr_7 581 609 coiled coil
# MBP1_WALME
NA ref_pro_10 ref_ftr_1 6 103 APSES fold
NA ref_pro_10 ref_ftr_2 23 106 KilA-N
NA ref_pro_10 ref_ftr_4 149 162 low complexity
NA ref_pro_10 ref_ftr_4 171 188 low complexity
NA ref_pro_10 ref_ftr_4 618 628 low complexity
NA ref_pro_10 ref_ftr_4 634 660 low complexity
NA ref_pro_10 ref_ftr_5 250 279 Ankyrin
NA ref_pro_10 ref_ftr_5 369 398 Ankyrin
NA ref_pro_10 ref_ftr_6 237 409 Swi6 fold
NA ref_pro_10 ref_ftr_7 461 585 coiled coil

View File

@ -1,37 +1,37 @@
# functionTemplate.R
#
# Purpose: (General)
#
# ToDo:
# Notes:
#
# ==============================================================================
myFunction <- function(a, b=1) {
# Template skeleton: replace every "..." placeholder below with the actual
# description before using this function.
# Purpose:
# Describe ...
# Version:
# Date:
# Author:
#
# Parameters:
# a: ...
# b: ... (optional; defaults to 1)
# Value:
# result: ...
# Example: <example invocation>
# code ...
# NOTE: `result` is not assigned anywhere in this skeleton; the code that
# replaces "code ..." above must create it, otherwise this line fails.
return(result)
}
# ==== TESTS =================================================================
# Enter your function tests here...
if (FALSE) {
# test ...
}
# [END]
# functionTemplate.R
#
# Purpose: (General)
#
# ToDo:
# Notes:
#
# ==============================================================================
myFunction <- function(a, b=1) {
# Template skeleton: replace every "..." placeholder below with the actual
# description before using this function.
# Purpose:
# Describe ...
# Version:
# Date:
# Author:
#
# Parameters:
# a: ...
# b: ... (optional; defaults to 1)
# Value:
# result: ...
# Example: <example invocation>
# code ...
# NOTE: `result` is not assigned anywhere in this skeleton; the code that
# replaces "code ..." above must create it, otherwise this line fails.
return(result)
}
# ==== TESTS =================================================================
# Enter your function tests here...
if (FALSE) {
# test ...
}
# [END]

View File

@ -1,21 +1,21 @@
# .myProfile.R
# This contains information which the course framework needs from time to time
# to personalize assignments, validate submissions etc. Make sure that
# the information correctly matches our official records.
# myEMail char A string with your eMail address. Use your official
# UofT eMail address.
# myStudentNumber numeric Your UofT student number. Take care to have this
# correct.
#
# NOTE:
# After you have updated this script, move the file to your "myScripts" folder.
# Utility scripts will look for it on the path: "./myScripts/.myProfile.R"
#
# ==============================================================================
# options(stringsAsFactors = FALSE)
myEMail <- "yh.deng@mail.utoronto.ca" # e.g. "u.franklin@utoronto.ca"
myStudentNumber <- 1005845285 # e.g. 1003141592
MYSPE <- "Cutaneotrichosporon oleaginosum"
# [END]
# .myProfile.R
# This contains information which the course framework needs from time to time
# to personalize assignments, validate submissions etc. Make sure that
# the information correctly matches our official records.
# myEMail char A string with your eMail address. Use your official
# UofT eMail address.
# myStudentNumber numeric Your UofT student number. Take care to have this
# correct.
#
# NOTE:
# After you have updated this script, move the file to your "myScripts" folder.
# Utility scripts will look for it on the path: "./myScripts/.myProfile.R"
#
# ==============================================================================
# options(stringsAsFactors = FALSE)
myEMail <- "yh.deng@mail.utoronto.ca" # e.g. "u.franklin@utoronto.ca"
myStudentNumber <- 1005845285 # e.g. 1003141592
MYSPE <- "Cutaneotrichosporon oleaginosum"
# [END]

View File

@ -1,54 +1,51 @@
# Read the four coding-sequence FASTA files and stack the results in file
# order. Reduce() binds left to right, i.e. rbind(rbind(rbind(a, b), c), d).
myFA <- Reduce(
  rbind,
  lapply(
    c("data/RAB39B_HSa_coding.fa",
      "data/PTPN5_HSa_coding.fa",
      "data/PTPN11_HSa_coding.fa",
      "data/KRAS_HSa_coding.fa"),
    readFASTA
  )
)
# Label each row with its gene symbol so that rows can be selected by name.
rownames(myFA) <- c("RAB39B", "PTPN5", "PTPN11", "KRAS")
gen_mutations <- function(seq, N) {
  # Purpose:
  #   Simulate N independent random single-nucleotide mutations (insertion,
  #   deletion or substitution, chosen with equal probability) of a coding
  #   sequence and tally how each one changes the translated protein.
  # Parameters:
  #   seq: DNA coding sequence as a string; converted with
  #        Biostrings::DNAString().
  #   N:   number of mutations to simulate. N = 0 simulates none.
  # Value:
  #   A 3 x 1 numeric matrix (rows "silent", "missense", "nonsense";
  #   column "occurrences") with the count for each category.
  # Example:
  #   gen_mutations("ATGATGATGATGATGATG", 1200)
  stats <- matrix(0, nrow = 3, ncol = 1,
                  dimnames = list(c("silent", "missense", "nonsense"),
                                  "occurrences"))

  # Position of the first "*" in a translation; -1 if there is none.
  first_stop <- function(aa) {
    as.integer(regexpr("*", aa, fixed = TRUE))
  }

  # The reference sequence and its translation never change between
  # iterations, so they are computed once.
  original_seq <- Biostrings::DNAString(seq)
  seq_chars <- as.character(original_seq)
  seq_length <- length(original_seq)
  ref_aa <- as.character(
    Biostrings::translate(original_seq, no.init.codon = TRUE)
  )
  ref_stop <- first_stop(ref_aa)

  # Iterate N times (the loop used to be hard-coded to 217 and ignored N).
  for (i in seq_len(N)) {
    mut_action <- sample(c("ins", "del", "sub"), 1)

    if (mut_action == "sub") {
      # Replace one base with a different one.
      mut_index <- sample(seq_len(seq_length), 1)
      current_base <- as.character(original_seq[mut_index])
      alternatives <- setdiff(Biostrings::DNA_BASES, current_base)
      mut_seq <- Biostrings::replaceLetterAt(original_seq, mut_index,
                                             sample(alternatives, 1))
    } else if (mut_action == "ins") {
      # Insert one random base in front of position mut_index.
      # The former `1:length(x) - 2` parsed as `(1:length(x)) - 2` and could
      # yield the indices -1 and 0.
      # NOTE(review): 1..(length - 2) is assumed to be the intended range;
      # confirm.
      mut_index <- sample(seq_len(seq_length - 2), 1)
      mut_seq <- Biostrings::DNAString(paste0(
        substr(seq_chars, 1, mut_index - 1),
        sample(Biostrings::DNA_BASES, 1),
        substr(seq_chars, mut_index, seq_length)
      ))
    } else {
      # Delete one base.
      mut_index <- sample(seq_len(seq_length), 1)
      mut_seq <- original_seq[-mut_index]
    }

    # Indels change the length: keep only complete codons for translation.
    usable_length <- length(mut_seq) - (length(mut_seq) %% 3)
    mut_seq <- mut_seq[seq_len(usable_length)]
    mut_aa <- as.character(
      Biostrings::translate(mut_seq, no.init.codon = TRUE)
    )
    mut_stop <- first_stop(mut_aa)

    # Nonsense: the mutant has a stop codon where the reference has none, or
    # an earlier one than the reference. Otherwise silent if the protein is
    # unchanged, missense if it differs. Plain strings are compared so that
    # translations of different length are simply "not identical".
    if (mut_stop != -1 && (ref_stop == -1 || mut_stop < ref_stop)) {
      category <- "nonsense"
    } else if (identical(mut_aa, ref_aa)) {
      category <- "silent"
    } else {
      category <- "missense"
    }
    stats[category, "occurrences"] <- stats[category, "occurrences"] + 1
  }
  return(stats)
}
N_test <- 1200
gen_mutations("ATGATGATGATGATGATG", N_test)
gen_mutations("CCCCCCCCCCCCCCCCCC", N_test)
gen_mutations("TATTACTATTACTATTAC", N_test)
gen_mutations("TGGTGGTGGTGGTGGTGGTGGTGG", N_test)
gen_mutations("TGTTGTTGTTGTTGTTGTTGTTGT", N_test)
gen_mutations <- function(seq, N) {
  # Purpose:
  #   Simulate N independent, random single-nucleotide substitutions of a
  #   coding sequence and tally how each one changes the translated protein.
  # Parameters:
  #   seq: DNA coding sequence as a string; converted with
  #        Biostrings::DNAString() and translated with no.init.codon = TRUE.
  #   N:   number of point mutations to simulate. N = 0 simulates none.
  # Value:
  #   A 3 x 1 numeric matrix (rows "silent", "missense", "nonsense";
  #   column "occurrences") with the count for each category.
  # Example:
  #   gen_mutations("ATGATGATGATGATGATG", 1000)
  sealKey() # See: http://steipe.biochemistry.utoronto.ca/abc/index.php/BCH441_Code_submisson_instructions

  stats <- matrix(0, nrow = 3, ncol = 1,
                  dimnames = list(c("silent", "missense", "nonsense"),
                                  "occurrences"))

  # The reference sequence, its translation and the position of its first
  # stop codon are the same in every iteration, so compute them once.
  original_seq <- Biostrings::DNAString(seq)
  ref_aa <- as.character(
    Biostrings::translate(original_seq, no.init.codon = TRUE)
  )
  term_aa <- as.integer(regexpr("*", ref_aa, fixed = TRUE)) # -1: no stop

  # seq_len() rather than 1:N, so that N = 0 runs zero iterations
  # (1:0 would run twice).
  for (i in seq_len(N)) {
    # Pick a position and replace its base with one of the other bases.
    mut_index <- sample(seq_len(length(original_seq)), 1)
    current_base <- as.character(original_seq[mut_index])
    possible_mutations <- setdiff(Biostrings::DNA_BASES, current_base)
    mut_seq <- Biostrings::replaceLetterAt(original_seq, mut_index,
                                           sample(possible_mutations, 1))

    mut_aa <- as.character(
      Biostrings::translate(mut_seq, no.init.codon = TRUE)
    )
    term_mut_aa <- as.integer(regexpr("*", mut_aa, fixed = TRUE))

    # Nonsense: the mutant has a stop codon where the reference has none, or
    # an earlier one than the reference. Otherwise silent if the protein is
    # unchanged, missense if it differs.
    if (term_mut_aa != -1 && (term_aa == -1 || term_mut_aa < term_aa)) {
      category <- "nonsense"
    } else if (identical(mut_aa, ref_aa)) {
      category <- "silent"
    } else {
      category <- "missense"
    }
    stats[category, "occurrences"] <- stats[category, "occurrences"] + 1
  }

  sealKey()
  return(stats)
}
gen_mutations("ATGATGATGATGATGATG", 1000)
gen_mutations("CCCCCCCCCCCCCCCCCC", 500)
gen_mutations("TATTACTATTACTATTAC", 500)
gen_mutations("TGGTGGTGGTGGTGGTGGTGGTGG", 500)
gen_mutations("TGTTGTTGTTGTTGTTGTTGTTGT", 500)
gen_mutations("TGTTGTTGTTGTTGTTGTTGTTGA", 500)
# Read the four coding-sequence FASTA files and stack the results in file
# order. Reduce() binds left to right, i.e. rbind(rbind(rbind(a, b), c), d).
myFA <- Reduce(
  rbind,
  lapply(
    c("data/RAB39B_HSa_coding.fa",
      "data/PTPN5_HSa_coding.fa",
      "data/PTPN11_HSa_coding.fa",
      "data/KRAS_HSa_coding.fa"),
    readFASTA
  )
)
# Label each row with its gene symbol so that rows can be selected by name.
rownames(myFA) <- c("RAB39B", "PTPN5", "PTPN11", "KRAS")
gen_mutations(myFA["RAB39B", 2], 10000)
gen_mutations(myFA["PTPN5", 2], 10000)
gen_mutations(myFA["PTPN11", 2], 10000)
gen_mutations(myFA["KRAS", 2], 10000)

View File

@ -1,41 +1,41 @@
# == 1.3 Task: submit for credit (part 1/2) ================================
# == Submission - Code to add another philosopher to the datamodel:
# Register the philosopher under a freshly generated person ID.
pID <- autoincrement(philDB$person)
immanuelKant <- data.frame(
  id = pID,
  name = "Immanuel Kant",
  born = "1724",
  died = "1804",
  school = "Enlightenment Philosophy"
)
philDB$person <- rbind(philDB$person, immanuelKant)

# First book: add it to the books table, then link it to the author in the
# works table through the (personID, bookID) pair.
bID <- autoincrement(philDB$books)
immanuelKantWork <- data.frame(
  id = bID,
  title = "Critique of Pure Reason",
  published = "1781"
)
philDB$books <- rbind(philDB$books, immanuelKantWork)
philDB$works <- rbind(
  philDB$works,
  data.frame(id = autoincrement(philDB$works), personID = pID, bookID = bID)
)

# Second book: same two steps with a new book ID.
bID <- autoincrement(philDB$books)
immanuelKantWork <- data.frame(
  id = bID,
  title = "Critique of Judgement",
  published = "1790"
)
philDB$books <- rbind(philDB$books, immanuelKantWork)
philDB$works <- rbind(
  philDB$works,
  data.frame(id = autoincrement(philDB$works), personID = pID, bookID = bID)
)
# == Submission: Code to list the philosophical schools in alphabetical order as well as their respective books in alphabetical order.
schools <- unique(philDB$person$school)
schools <- sort(schools)
for (s in schools) {
cat(sprintf("%s\n", s))
authors = which(philDB$person$school == s)
for (author in authors) {
works = which(philDB$works$personID == author)
for (work in works) {
bookId = which(philDB$books$id == philDB$works$bookID[work])
cat(sprintf("\t%s - (%s)\n", philDB$books$title[bookId], philDB$books$published[bookId]))
}
}
# == 1.3 Task: submit for credit (part 1/2) ================================
# == Submission - Code to add another philosopher to the datamodel:
# Register the philosopher under a freshly generated person ID.
pID <- autoincrement(philDB$person)
immanuelKant <- data.frame(
  id = pID,
  name = "Immanuel Kant",
  born = "1724",
  died = "1804",
  school = "Enlightenment Philosophy"
)
philDB$person <- rbind(philDB$person, immanuelKant)

# First book: add it to the books table, then link it to the author in the
# works table through the (personID, bookID) pair.
bID <- autoincrement(philDB$books)
immanuelKantWork <- data.frame(
  id = bID,
  title = "Critique of Pure Reason",
  published = "1781"
)
philDB$books <- rbind(philDB$books, immanuelKantWork)
philDB$works <- rbind(
  philDB$works,
  data.frame(id = autoincrement(philDB$works), personID = pID, bookID = bID)
)

# Second book: same two steps with a new book ID.
bID <- autoincrement(philDB$books)
immanuelKantWork <- data.frame(
  id = bID,
  title = "Critique of Judgement",
  published = "1790"
)
philDB$books <- rbind(philDB$books, immanuelKantWork)
philDB$works <- rbind(
  philDB$works,
  data.frame(id = autoincrement(philDB$works), personID = pID, bookID = bID)
)
# == Submission: Code to list the philosophical schools in alphabetical order as well as their respective books in alphabetical order.
schools <- sort(unique(philDB$person$school))
for (s in schools) {
  cat(sprintf("%s\n", s))

  # Select the authors of this school by their person ID. The row positions
  # returned by which() only equal the IDs while the table is gap-free and
  # in ID order, so they must not be compared against works$personID.
  authorIDs <- philDB$person$id[which(philDB$person$school == s)]

  # All books linked to those authors through the works join table.
  bookIDs <- philDB$works$bookID[philDB$works$personID %in% authorIDs]
  schoolBooks <- philDB$books[philDB$books$id %in% bookIDs, , drop = FALSE]

  # Print the school's books sorted by title, as the task requires.
  titles <- as.character(schoolBooks$title)
  for (b in order(titles)) {
    cat(sprintf("\t%s - (%s)\n",
                titles[b],
                as.character(schoolBooks$published[b])))
  }
}

View File

@ -1,4 +1,4 @@
[{
"ID": 879819,
"species": "Cutaneotrichosporon oleaginosum"}
]
[{
"ID": 879819,
"species": "Cutaneotrichosporon oleaginosum"}
]

View File

@ -1,19 +1,19 @@
[
{ "name" : "MBP1_CUTOL",
"RefSeqID" : "XP_018278493.1",
"UniProtID" : "A0A0J0XLN0",
"taxonomyID" : 879819,
"sequence" : [
"MGKKAAAAGDGGPNTIYKATYSGVPVFEFICRNVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREVQ",
"KGEHEKVQGGYGKYQGTWVPIERGLALAKQYNVEDLLRPIIDFVPRESVSPPPAPKHAVAPPTKRNKEPK",
"PKEGLVPIKSAGVLSGTGRHQTPDSVGEDVESEVMDDMSESQTPSPLNGTSLLPAVDERSIDGMDIDGFS",
"MMNGGGHARKRSAAMMDDEDEYEQLKRARGNSAVHTPPPPGQSPRYGGMQHPLTQDEYNDIVLNYFVSEA",
"TQIPAVMTNPPYNWDPNGIIDDDHHTALHWAAAMGRTRVIKLLLSAGARIFDKNNLDQTPLMRSVMFTNN",
"YDLRKFPEVFELLHRSTLNIDKNNRTVFHHIANLALYKGKTHAARYYMEVILSRLADYPQELADVINFAD",
"EDGETALTLAARARSKRIVKALLDHGADPKLRNRDHKSAEDYILEDERFRSSPDVMLNRTQPSAAPRNPT",
"SLGAAVFSQGLPPQLYNSEAARLASGPHSSDILQQMQALARSFEAEKLNKERDVLEAKAMLTSIHTEVND",
"AGRTLHNLGEQMKPLEAKQGELDGLVERLQSKLQKDLARGARKWKAADEGRENRWKNGDDPSQAGEDYSD",
"LPELTAIPDNAEAEEERLRGEIEKMRARRGELVTRLVKAQTQTGTTDKMAQYRRLITAGCGGDINPGEID",
"DIVGQLLDMLENEAQSGRPAPPPQAAPSWVTS"]
}
]
[
{ "name" : "MBP1_CUTOL",
"RefSeqID" : "XP_018278493.1",
"UniProtID" : "A0A0J0XLN0",
"taxonomyID" : 879819,
"sequence" : [
"MGKKAAAAGDGGPNTIYKATYSGVPVFEFICRNVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREVQ",
"KGEHEKVQGGYGKYQGTWVPIERGLALAKQYNVEDLLRPIIDFVPRESVSPPPAPKHAVAPPTKRNKEPK",
"PKEGLVPIKSAGVLSGTGRHQTPDSVGEDVESEVMDDMSESQTPSPLNGTSLLPAVDERSIDGMDIDGFS",
"MMNGGGHARKRSAAMMDDEDEYEQLKRARGNSAVHTPPPPGQSPRYGGMQHPLTQDEYNDIVLNYFVSEA",
"TQIPAVMTNPPYNWDPNGIIDDDHHTALHWAAAMGRTRVIKLLLSAGARIFDKNNLDQTPLMRSVMFTNN",
"YDLRKFPEVFELLHRSTLNIDKNNRTVFHHIANLALYKGKTHAARYYMEVILSRLADYPQELADVINFAD",
"EDGETALTLAARARSKRIVKALLDHGADPKLRNRDHKSAEDYILEDERFRSSPDVMLNRTQPSAAPRNPT",
"SLGAAVFSQGLPPQLYNSEAARLASGPHSSDILQQMQALARSFEAEKLNKERDVLEAKAMLTSIHTEVND",
"AGRTLHNLGEQMKPLEAKQGELDGLVERLQSKLQKDLARGARKWKAADEGRENRWKNGDDPSQAGEDYSD",
"LPELTAIPDNAEAEEERLRGEIEKMRARRGELVTRLVKAQTQTGTTDKMAQYRRLITAGCGGDINPGEID",
"DIVGQLLDMLENEAQSGRPAPPPQAAPSWVTS"]
}
]

View File

@ -1,8 +1,8 @@
README - myScripts folder:
==========================
The "myScripts" folder is a place to keep your personal files
safe. No files will be submitted into this folder on the GitHub master
copy. Therefore everything you put into this folder is safe from being
inadvertently overwritten by a file with the same name that would be
downloaded in a GitHub "pull" request.
README - myScripts folder:
==========================
The "myScripts" folder is a place to keep your personal files
safe. No files will be submitted into this folder on the GitHub master
copy. Therefore everything you put into this folder is safe from being
inadvertently overwritten by a file with the same name that would be
downloaded in a GitHub "pull" request.

View File

@ -1,4 +1,4 @@
# Run the reference-database script; it assigns the database to `myDB`.
source("./scripts/ABC-createRefDB.R")
# Add the protein record parsed from ./myScripts/MBP1_CUTOL.json to myDB.
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./myScripts/MBP1_CUTOL.json"))
# Add the taxonomy record parsed from ./myScripts/CUTOLtaxonomy.json to myDB.
myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./myScripts/CUTOLtaxonomy.json"))
# Run the reference-database script; it assigns the database to `myDB`.
source("./scripts/ABC-createRefDB.R")
# Add the protein record parsed from ./myScripts/MBP1_CUTOL.json to myDB.
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./myScripts/MBP1_CUTOL.json"))
# Add the taxonomy record parsed from ./myScripts/CUTOLtaxonomy.json to myDB.
myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./myScripts/CUTOLtaxonomy.json"))

View File

@ -1,38 +1,38 @@
# myScript.R
#
# --- As you work with this file, you can delete the instructions below --------
# Write your notes and code experiments into this document. Save it
# from time to time - however I recommend that you do not _commit_
# your saved version.
#
# As long as you do not _commit_ this script to version control,
# you can _pull_ updated versions of the entire project from GitHub
# by using the RStudio version control interface. However, once
# you _commit_ any file in your local version, RStudio will require
# you to resolve conflicts before you can _pull_ updates.
# --- As you work with this file, you can delete the instructions above --------
#
## Purpose: <...>
#
# Version: <...>
#
# Date: <...>
# Author: <Name> (<namee@mail.utoronto.ca>)
#
# Versions:
#
# <number> <Features>
#
# TODO:
# <...>
#
# ====================================================================
# [END]
# myScript.R
#
# --- As you work with this file, you can delete the instructions below --------
# Write your notes and code experiments into this document. Save it
# from time to time - however I recommend that you do not _commit_
# your saved version.
#
# As long as you do not _commit_ this script to version control,
# you can _pull_ updated versions of the entire project from GitHub
# by using the RStudio version control interface. However, once
# you _commit_ any file in your local version, RStudio will require
# you to resolve conflicts before you can _pull_ updates.
# --- As you work with this file, you can delete the instructions above --------
#
## Purpose: <...>
#
# Version: <...>
#
# Date: <...>
# Author: <Name> (<namee@mail.utoronto.ca>)
#
# Versions:
#
# <number> <Features>
#
# TODO:
# <...>
#
# ====================================================================
# [END]

File diff suppressed because it is too large Load Diff

View File

@ -1,75 +1,75 @@
# scriptTemplate.R
#
# Purpose:
# Version:
# Date:
# Author:
#
# Input:
# Output:
# Dependencies:
#
# ToDo:
# Notes:
#
# ==============================================================================
setwd("<your/project/directory>")
# ==== PARAMETERS ============================================================
# Define and explain all parameters. No "magic numbers" in your code below.
# ==== PACKAGES ==============================================================
# Check that required packages have been installed. Install if needed.
if (! requireNamespace("seqinr", quietly=TRUE)) {
install.packages("seqinr")
}
# Package information:
# library(help = seqinr) # basic information
# browseVignettes("seqinr") # available vignettes
# data(package = "seqinr") # available datasets
# Note: use package functions with the :: operator - eg.
# seqinr::aaa("K")
# ==== FUNCTIONS =============================================================
# Define functions or source external files
source("<myUtilityFunctionsScript.R>")
myFunction <- function(a, b=1) {
# Template skeleton: replace every "..." placeholder below with the actual
# description before using this function.
# Purpose:
# Describe ...
# Parameters:
# a: ...
# b: ... (optional; defaults to 1)
# Value:
# result: ...
# code ...
# NOTE: `result` is not assigned anywhere in this skeleton; the code that
# replaces "code ..." above must create it, otherwise this line fails.
return(result)
}
# ==== PROCESS ===============================================================
# Enter the step-by-step process of your project here. Strive to write your
# code so that you can simply run this entire file and re-create all
# intermediate results.
# ==== TESTS =================================================================
# Enter your function tests here...
# [END]
# scriptTemplate.R
#
# Purpose:
# Version:
# Date:
# Author:
#
# Input:
# Output:
# Dependencies:
#
# ToDo:
# Notes:
#
# ==============================================================================
setwd("<your/project/directory>")
# ==== PARAMETERS ============================================================
# Define and explain all parameters. No "magic numbers" in your code below.
# ==== PACKAGES ==============================================================
# Check that required packages have been installed. Install if needed.
if (! requireNamespace("seqinr", quietly=TRUE)) {
install.packages("seqinr")
}
# Package information:
# library(help = seqinr) # basic information
# browseVignettes("seqinr") # available vignettes
# data(package = "seqinr") # available datasets
# Note: use package functions with the :: operator - eg.
# seqinr::aaa("K")
# ==== FUNCTIONS =============================================================
# Define functions or source external files
source("<myUtilityFunctionsScript.R>")
myFunction <- function(a, b=1) {
# Template skeleton: replace every "..." placeholder below with the actual
# description before using this function.
# Purpose:
# Describe ...
# Parameters:
# a: ...
# b: ... (optional; defaults to 1)
# Value:
# result: ...
# code ...
# NOTE: `result` is not assigned anywhere in this skeleton; the code that
# replaces "code ..." above must create it, otherwise this line fails.
return(result)
}
# ==== PROCESS ===============================================================
# Enter the step-by-step process of your project here. Strive to write your
# code so that you can simply run this entire file and re-create all
# intermediate results.
# ==== TESTS =================================================================
# Enter your function tests here...
# [END]

View File

@ -1,30 +1,30 @@
# ABC-createRefDB.R
#
# Create a reference protein database for Mbp1-like proteins
#
# Boris Steipe for ABC learning units
#
# For the species, see:
# http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
#
# For the data model, see
# https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0
# For the schema, see dbInit() in ./scripts/ABC-dbUtilities.R
#
# ==============================================================================
# Initialize the database object; its schema is defined by dbInit().
myDB <- dbInit()
# Add protein records parsed from three JSON files, one file per call.
# Each call takes the current myDB and its result is assigned back to myDB.
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/MBP1_SACCE.json"))
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refMBP1Proteins.json"))
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refAPSES_PSI-BLAST.json"))
# Add the taxonomy records from refTaxonomy.json.
myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./data/refTaxonomy.json"))
# Add the feature definitions from refFeatures.json.
myDB <- dbAddFeature(myDB, jsonlite::fromJSON("./data/refFeatures.json"))
# Add the annotations from refAnnotations.json.
# NOTE(review): presumably annotations refer to the proteins and features
# added above, so this call should stay last - confirm in dbAddAnnotation().
myDB <- dbAddAnnotation( myDB, jsonlite::fromJSON("./data/refAnnotations.json"))
# [END]
# ABC-createRefDB.R
#
# Create a reference protein database for Mbp1-like proteins
#
# Boris Steipe for ABC learning units
#
# For the species, see:
# http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
#
# For the data model, see
# https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0
# For the schema, see dbInit() in ./scripts/ABC-dbUtilities.R
#
# ==============================================================================
# Initialize the database object; its schema is defined by dbInit().
myDB <- dbInit()
# Add protein records parsed from three JSON files, one file per call.
# Each call takes the current myDB and its result is assigned back to myDB.
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/MBP1_SACCE.json"))
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refMBP1Proteins.json"))
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refAPSES_PSI-BLAST.json"))
# Add the taxonomy records from refTaxonomy.json.
myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./data/refTaxonomy.json"))
# Add the feature definitions from refFeatures.json.
myDB <- dbAddFeature(myDB, jsonlite::fromJSON("./data/refFeatures.json"))
# Add the annotations from refAnnotations.json.
# NOTE(review): presumably annotations refer to the proteins and features
# added above, so this call should stay last - confirm in dbAddAnnotation().
myDB <- dbAddAnnotation( myDB, jsonlite::fromJSON("./data/refAnnotations.json"))
# [END]

File diff suppressed because it is too large Load Diff

View File

@ -1,443 +1,443 @@
# tocID <- "scripts/ABC-makeMYSPElist.R"
#
# Purpose: Create a list of genome sequenced fungi with protein annotations and
# Mbp1 homologues.
#
# Version: 1.4
#
# Date: 2016 09 - 2021 09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions
# 1.4 New retrieval logic
# 1.3 Rewrite to change datasource. NCBI has not been updated
# since 2012. Use ensembl fungi as initial source.
# 1.2 Change from require() to requireNamespace()
# 1.1.2 Moved BLAST.R to ./scripts directory
# 1.1 Update 2017
# 1.0 First code 2016
#
# TODO:
#
# ==============================================================================
#
# DO NOT source() THIS FILE!
#
# This file is code I provide for your deeper understanding of a process and
# to provide you with useful sample code. It is not actually necessary for
# you to run this code, but I encourage you to read it carefully and discuss
# if there are parts you don't understand.
#
# Run the commands that interact with the NCBI servers only if you want to
# experiment specifically with the code and/or parameters. I have commented out
# those parts. If you only want to study the general workflow, just load()
# the respective intermediate results.
#
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------------------------
#TOC> 1 The strategy 55
#TOC> 2 PACKAGES AND INITIALIZATIONS 67
#TOC> 3 ENSEMBL FUNGI 75
#TOC> 3.1 Import 78
#TOC> 4 BLAST SEARCH 155
#TOC> 4.1 find homologous proteins 161
#TOC> 4.2 Identify species in "hits" 192
#TOC> 5 MERGE ENSEMBL AND BLAST RESULTS 282
#TOC> 6 STUDENT NUMBERS 375
#TOC>
#TOC> ==========================================================================
# = 1 The strategy ========================================================
# This script will create a list of "MYSPE" species and save it in an R object
# MYSPEspecies that is stored in the data subdirectory of this project from
# where it can be loaded. The strategy is as follows: we download a list of
# annotated fungal genomes from ensembl.fungi. All these are genome-sequenced
# species that have been annotated.
# Next we perform a BLAST search, to identify fungal species that have
# genes that are homologous to yeast MBP1.
#
# ...
# = 2 PACKAGES AND INITIALIZATIONS ========================================
# httr provides interfaces to Webservers on the Internet
if (! requireNamespace("httr", quietly = TRUE)) {
  install.packages("httr")
}
# digest provides the md5 hashes that this script uses as row names when it
# builds MYSPEdat (see the call to digest::digest() in the last section).
if (! requireNamespace("digest", quietly = TRUE)) {
  install.packages("digest")
}
# = 3 ENSEMBL FUNGI =======================================================
# == 3.1 Import ============================================================
# Navigate to https://fungi.ensembl.org and click on the link to the full
# list of all species: https://fungi.ensembl.org/species.html
# On the page, click on the spreadsheet symbol top right and choose
# "download whole table". The file will be named "Species.csv", in your
# usual downloads folder. Move it to the data folder, and read it.
sDat <- read.csv("./data/Species.csv")
str(sDat)
# The most obvious way to partition these is according to Classification ...
# (poking around a bit in the UniProt taxonomy database shows that the
# classification used here is the taxonomic rank of "order").
# how many classifications do we have?
length(unique(sDat$Classification)) # 66
# To have a good set for the class, we should have about 100.
# Let's see for which of these we can find Mbp1 homologues.
# First, we'll keep only the columns for name, classification, and taxID, and
# drop the rest ...
sDat <- sDat[ , c("Name", "Classification", "Taxon.ID")]
colnames(sDat) <- c("name", "order", "taxID")
# Next, we make an extra column: genus - the first part of the binomial name.
# We'll use the gsub() function, and for that we need a "regular expression"
# that matches to all characters from the first blank to the end of the string:
myPatt <- "\\s.*$" # one whitespace (\\s) ...
# followed by any character (.) 0..n times (*) ...
# until the end of the string
# using gsub() we substitute all matching characters with the empty string "" -
# this deletes the matching characters
# Test this:
gsub(myPatt, "", "Genus") # one word: unchanged
gsub(myPatt, "", "gEnus species") # two words: return only first
gsub(myPatt, "", "geNus species strain 123") # many words: return only first
# apply this to the "name" column and add the result as a separate column
# called "genus"
sDat$genus <- gsub(myPatt, "", sDat$name)
# what do we get?
c(head(unique(sDat$genus)),
tail(unique(sDat$genus))) # inspect the first and last few. Note that there
# is a problem that we have to keep in mind.
# (Always inspect your results!)
# Drop all rows for which the genus contains special characters -
# like "[Candida]"
sDat <- sDat[ ! grepl("[^a-zA-Z]", sDat$genus) , ]
length(table(sDat$genus)) # how many genus?
hist(table(sDat$genus), col = "#E9F4FF") # Distribution ...
# most genus have very few, but
# some have very many species.
sort(table(sDat$genus), decreasing = TRUE)[1:10] # Top ten...
# We should have at least one species from each taxonomic order, but we can
# add a few genus until we have about 100 validated species.
# Let's add a column for species, by changing our regular expression a bit,
# using ^ (start of string), \\S (NOT a whitespace),
# and + (one or more matches), capturing the match (...), and returning
# it as the substitution (\\1) ...
myPatt <- "^(\\S+\\s\\S+)\\s.*$"
sDat$species <- gsub(myPatt, "\\1", sDat$name)
# And we reorder the columns, just for aesthetics:
sDat <- sDat[ , c("name", "species", "genus", "order", "taxID")]
# Final check:
any(grepl("[^a-zA-Z -]", sDat$species)) # FALSE means no special characters
#
# Now we check which of these have Mbp1 homologues ...
# = 4 BLAST SEARCH ========================================================
# We run a BLAST search to find all proteins related to yeast Mbp1 in any
# fungus. With the results, we'll annotate our sDat table.
# == 4.1 find homologous proteins ==========================================
#
# Use BLAST to fetch proteins related to Mbp1 and identify the species that
# contain them.
# Scripting against NCBI APIs is not exactly enjoyable - there is usually a fair
# amount of error handling involved that is not supported by the API in a
# principled way but requires rather ad hoc solutions. The code I threw together
# to make a BLAST interface (demo-quality, not research-quality) is in the file
# ./scripts/BLAST.R Feel encouraged to study how this works. It's a pretty
# standard task of communicating with servers and parsing responses - everyday
# fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST
# parser in currently available packages.
#
# DON'T use this for BLAST searches unless you have read the NCBI policy
# for automated tasks. If you indiscriminately pound on the NCBI's BLAST
# server, they will blacklist your IP-address. See:
# https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
#
# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
# BLASThits <- BLAST("NP_010227", # Yeast Mbp1 RefSeq ID
# db = "refseq_protein", # database to search in
# nHits = 3000, # 945 hits in 2020
# E = 0.01, #
# limits = "txid4751[ORGN]") # = fungi
# saveRDS(BLASThits, file="data/BLASThits.rds")
#
# NO NEED TO ACTUALLY RUN THIS:you can load the results from the data directory
#
BLASThits <- readRDS(file = "data/BLASThits.rds")
# == 4.2 Identify species in "hits" ========================================
# This is a very big list that can't be usefully analyzed manually. Here
# we are only interested in the species names that it contains.
# How many hits in the list?
length(BLASThits$hits) # 1,134
# Let's look at a hit somewhere down the list. (Note that we spell out the
# full element name "hits": the $ operator would also accept the abbreviation
# "hit" through partial matching, but that silently returns NULL as soon as
# a second element name begins with the same letters.)
str(BLASThits$hits[[277]])
# A fair amount of parsing has gone into the BLAST.R code to prepare the results
# in a useful way. The species information is in the $species element of every
# hit.
# Extract all the species names into a vector. We subset ...
# BLASThits$hits ... the list of hits, from which we choose ...
# BLASThits$hits[[i]] ... the i-th hit, and get ...
# BLASThits$hits[[i]]$species ... the species element from that.
# Subsetting FTW.
# vapply() does this for every hit in turn. It returns a character vector of
# the correct length without growing it element by element, and it stops with
# an error if a hit does not contain exactly one species name.
BLASTspecies <- vapply(BLASThits$hits,
                       function(hit) { hit$species },
                       character(1),
                       USE.NAMES = FALSE)
# You can confirm that BLASTspecies has the expected size.
length(BLASTspecies)
# if we delete some of these later on, we still want to remember which hit
# they came from. Thus we name() the elements with their index, which is the
# same as the index of the hit in BLASThits. (seq_along() is also correct
# for an empty vector, where 1:length() would produce c(1, 0).)
names(BLASTspecies) <- seq_along(BLASTspecies)
# let's plot the distribution of E-values
# Collect the E-value of every hit. vapply() guarantees one number per hit and
# returns numeric(0) for an empty hit list.
eVals <- vapply(BLASThits$hits,
                function(hit) { hit$E },
                numeric(1),
                USE.NAMES = FALSE)
range(eVals)
sum(eVals == 0)
# let's plot the log of all values > 0 to see how they are distributed
# (log(0) is -Inf, that's why we exclude E-values of 0).
# plotting only one vector of numbers plots their index as x, and
# their value as y ...
plot(log(eVals[eVals > 0]), col = "#CC0000")
# This is very informative: I would suspect that the first ten or so are
# virtually identical to the yeast protein, then we have about 800 hits with
# decreasing similarity, and then about 200 more that may actually be false
# positives. Also - we plotted them by index, that means the table is SORTED:
# Lower E-values strictly come before higher E-values.
# Again, some species appear more than once, e.g. ...
sum(BLASTspecies == "Saccharomyces cerevisiae")
# ... corresponding to the five homologous gene sequences (paralogues) of yeast.
# Therefore we remove duplicates. Removing duplicates will leave the FIRST
# in a list alone, and only remove the SUBSEQUENT ones. Which means, from each
# species, we will retain only the protein that has the highest similarity
# to yeast Mbp1, not any of its more distant paralogues.
sel <- ! duplicated(BLASTspecies)
BLASTspecies <- BLASTspecies[sel]
length(BLASTspecies)
# i.e. we got rid of about two thirds of the hits.
tail(BLASTspecies) # see how the names are useful!
# again - there are some special characters ...
# what are they?
BLASTspecies[grep("[^a-zA-Z ]", BLASTspecies)]
# remove the brackets ... (gsub() keeps the names of the vector)
BLASTspecies <- gsub("\\[|\\]", "", BLASTspecies)
# drop any new duplicates ...
BLASTspecies <- BLASTspecies[ ! duplicated(BLASTspecies)]
# check the number again:
length(BLASTspecies)
# "sel" was computed BEFORE we removed the brackets and dropped the new
# duplicates, so it still marks hits that are no longer in BLASTspecies.
# We recompute it from the names of the remaining elements - these are the
# indices of their hits - so that it describes our final selection.
sel <- seq_along(eVals) %in% as.integer(names(BLASTspecies))
# Think a bit about this: what may be the biological reason to find that
# on average, in 388 fungi across the entire phylogenetic tree, we have
# three sequences that are homologous to yeast Mbp1?
# Let's look at the distribution of E-values in this selection (Subsetting FTW):
# we plot all values that are TRUE in the vector "sel" that we created above,
# AND greater than 0
plot(log(eVals[sel & eVals > 0]), col = "#00CC00")
# = 5 MERGE ENSEMBL AND BLAST RESULTS =====================================
# Next we add the blast result to our sDat dataframe. We'll store the index,
# the E-value, and the length of the aligned region, from which we can estimate
# how much of Mbp1 is actually covered by the hit. (True orthologues MUST align
# with Mbp1's N-terminal APSES domain.)
#
# First we pull the indices of the hits we kept from the names of BLASTspecies
# (each element was named with the index of its hit in BLASThits$hits):
iHits <- as.numeric(names(BLASTspecies))
length(iHits) # one index for each species that remains in BLASTspecies
# add columns to sDat. They are initialized with 0, thus an iHit of 0 marks a
# row that did not match any BLAST hit.
l <- nrow(sDat)
sDat$iHit <- numeric(l) # index of the hit in the BLAST results
sDat$eVal <- numeric(l) # E-value of the hit
sDat$lAli <- numeric(l) # length of the aligned region
# extract and merge: for every hit we kept, annotate all rows of sDat whose
# species name is identical to the species name of the hit.
# NOTE(review): thisSp is taken from the hit itself, i.e. it is the name
# BEFORE the brackets were removed from BLASTspecies. Presumably bracketed
# names are not meant to match any row of sDat - confirm.
for (iHit in iHits) {
thisSp <- BLASThits$hits[[iHit]]$species
sel <- sDat$species == thisSp
sDat$iHit[sel] <- iHit
sDat$eVal[sel] <- BLASThits$hits[[iHit]]$E
sDat$lAli[sel] <- BLASThits$hits[[iHit]]$lengthAli
}
# Are all reference species accounted for?
selA <- sDat$iHit != 0 # all rows which matched to a BLAST hit
REFspecies %in% sDat$species[selA] # yes, all there
selB <- sDat$species %in% REFspecies # all rows which have one of REF species
sum(selA & selB) # How many rows?
# sDat of course includes all duplicates. Some may be multiply sequenced, some
# may be different strains. We'll use the same strategy as before and keep
# only the best hit: order the rows by E-value, then drop all rows which
# are duplicated.
# drop all rows without BLAST hits ...
sDat <- sDat[ ! (sDat$iHit == 0) , ]
# order sDat by E-value ...
sDat <- sDat[order(sDat$eVal, decreasing = FALSE) , ]
# drop all rows with duplicated species ...
sDat <- sDat[ ! duplicated(sDat$species) , ]
# Lets look at the E-values ...
plot(log(sDat$eVal[sDat$eVal > 0]), col = "#00CC00")
# and alignment lengths ...
plot(sDat$lAli, col = "#00DDAA")
# How many ...
length(unique(sDat$name))
length(unique(sDat$species))
length(unique(sDat$genus))
length(unique(sDat$order))
# I need an extra species for admin purposes later on ...
# We select it with a logical vector, not with the indices that grep() returns:
# if the species were not found, grep() would return integer(0), and
# sDat[-integer(0), ] selects NO rows at all - i.e. it would silently empty
# sDat instead of leaving it unchanged.
sel <- grepl("Sporothrix schenckii", sDat$species)
SPOSCdat <- sDat[sel, ]
sDat <- sDat[ ! sel, ]
# To get the final dataset, we remove the reference species with their
# entire orders ...
REForders <- unique(sDat$order[sDat$species %in% REFspecies])
sel <- sDat$order %in% REForders
REFdat <- sDat[sel , ]
sDat <- sDat[ ! sel , ]
# REFdat should now contain only the REFspecies ...
( REFdat <- REFdat[REFdat$species %in% REFspecies , ] )
# ... but all of them
sum(REFspecies %in% REFdat$species)
# ... and we have enough left in sDat to prune sDat to unique genus
sDat <- sDat[ ! duplicated(sDat$genus) , ]
nrow(sDat) # 84
# I add back "Sporothrix schenckii" ...
sDat <- rbind(SPOSCdat, sDat)
# ... and save for future use.
# saveRDS(sDat, file = "data/sDat.rds")
# saveRDS(REFdat, file = "data/REFdat.rds")
# = 6 STUDENT NUMBERS =====================================================
#
# An asymmetric function to retrieve a MYSPE species
#
sDat <- readRDS(file = "data/sDat.rds")
students <- read.csv("../BCH441-2021-students.csv")
sN <- students$Integration.ID
sN <- sN[! is.na(sN)]
sN <- as.character(sN)
# The first key will map to "Sporothrix schenckii". unique() drops repeated
# student numbers: a repeated key would produce a repeated hash, and
# duplicated row names are not allowed in a data frame.
sN <- unique(c("1003141593", sN))
# Shuffle the species with a fixed seed, to make the order reproducible.
set.seed(112358)
theseSpecies <- sDat[sample(seq_len(nrow(sDat))), ]
all(sort(theseSpecies$name) == sort(sDat$name))
nrow((theseSpecies))
# Move "Sporothrix schenckii" to the first row, where it lines up with the
# first key. We stop if it is missing: theseSpecies[-integer(0), ] would
# select no rows at all and silently discard the entire table.
(iX <- grep("Sporothrix schenckii", theseSpecies$name))
if (length(iX) == 0) {
  stop("\"Sporothrix schenckii\" was not found in sDat$name.", call. = FALSE)
}
theseSpecies <- rbind(theseSpecies[iX, ], theseSpecies[-iX, ])
# Make N keys: random numbers from the interval rndMin:rndMax, none of which
# is an actual student number; then put the student numbers into the
# first positions.
rndMin <- 992000000
rndMax <- 1020000000
N <- 10000
keys <- as.character(sample(rndMin:rndMax, N + 1000))
keys <- keys[! (keys %in% sN)]
keys <- keys[seq_len(N)]
keys[seq_along(sN)] <- sN
# Make N rows by cycling through the rows of theseSpecies: row i gets species
# number ((i - 1) modulo nrow(theseSpecies)) + 1. Indexing with a recycled
# index vector does this in one step, instead of growing the data frame with
# rbind() in a loop.
MYSPEdat <- theseSpecies[rep_len(seq_len(nrow(theseSpecies)), N), ]
# The row name of row i is the md5 hash of key i. All row names are assigned
# at once: assigning them one by one rewrites the whole set of row names in
# every iteration.
rownames(MYSPEdat) <- vapply(keys[seq_len(N)],
                             digest::digest,
                             character(1),
                             algo = "md5",
                             USE.NAMES = FALSE)
# Re-initialize the random number generator, and shuffle the rows so that the
# order of rows does not give away which key they were made from.
set.seed(NULL)
MYSPEdat <- MYSPEdat[sample(seq_len(N)), ]
# saveRDS(MYSPEdat, file = "data/MYSPEdat.rds")
# === validate: every student number must retrieve exactly one species
# NOTE(review): getMYSPE() is defined elsewhere; this assumes that it returns
# a character string - confirm.
x <- vapply(sN,
            function(n) {
              sp <- getMYSPE(n)
              if (length(sp) != 1) {
                stop("getMYSPE(\"", n, "\") returned ", length(sp),
                     " values, expected exactly one.", call. = FALSE)
              }
              sp
            },
            character(1),
            USE.NAMES = FALSE)
# === species for late-comers
y <- unique(MYSPEdat$species)
print(y[!(y %in% x)])
# === validate: check whether any species was retrieved more than once
sp <- vapply(sN, getMYSPE, character(1), USE.NAMES = FALSE)
any(duplicated(sp))
length(unique(sp))
which(! sDat$species %in% sp) # these can be assigned to late-comers
# Done.
# [END]
# tocID <- "scripts/ABC-makeMYSPElist.R"
#
# Purpose: Create a list of genome sequenced fungi with protein annotations and
# Mbp1 homologues.
#
# Version: 1.4
#
# Date: 2016 09 - 2021 09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions
# 1.4 New retrieval logic
# 1.3 Rewrite to change datasource. NCBI has not been updated
# since 2012. Use ensembl fungi as initial source.
# 1.2 Change from require() to requireNamespace()
# 1.1.2 Moved BLAST.R to ./scripts directory
# 1.1 Update 2017
# 1.0 First code 2016
#
# TODO:
#
# ==============================================================================
#
# DO NOT source() THIS FILE!
#
# This file is code I provide for your deeper understanding of a process and
# to provide you with useful sample code. It is not actually necessary for
# you to run this code, but I encourage you to read it carefully and discuss
# if there are parts you don't understand.
#
# Run the commands that interact with the NCBI servers only if you want to
# experiment specifically with the code and/or parameters. I have commented out
# those parts. If you only want to study the general workflow, just load()
# the respective intermediate results.
#
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------------------------
#TOC> 1 The strategy 55
#TOC> 2 PACKAGES AND INITIALIZATIONS 67
#TOC> 3 ENSEMBL FUNGI 75
#TOC> 3.1 Import 78
#TOC> 4 BLAST SEARCH 155
#TOC> 4.1 find homologous proteins 161
#TOC> 4.2 Identify species in "hits" 192
#TOC> 5 MERGE ENSEMBL AND BLAST RESULTS 282
#TOC> 6 STUDENT NUMBERS 375
#TOC>
#TOC> ==========================================================================
# = 1 The strategy ========================================================
# This script will create a list of "MYSPE" species and save it in an R object
# MYSPEspecies that is stored in the data subdirectory of this project from
# where it can be loaded. The strategy is as follows: we download a list of
# annotated fungal genomes from ensembl.fungi. All these are genome-sequenced
# species that have been annotated.
# Next we perform a BLAST search, to identify fungal species that have
# genes that are homologous to yeast MBP1.
#
# ...
# = 2 PACKAGES AND INITIALIZATIONS ========================================
# httr provides interfaces to Webservers on the Internet
if (! requireNamespace("httr", quietly = TRUE)) {
  install.packages("httr")
}
# digest provides the md5 hashes that this script uses as row names when it
# builds MYSPEdat (see the call to digest::digest() in the last section).
if (! requireNamespace("digest", quietly = TRUE)) {
  install.packages("digest")
}
# = 3 ENSEMBL FUNGI =======================================================
# == 3.1 Import ============================================================
# Navigate to https://fungi.ensembl.org and click on the link to the full
# list of all species: https://fungi.ensembl.org/species.html
# On the page, click on the spreadsheet symbol top right and choose
# "download whole table". The file will be named "Species.csv", in your
# usual downloads folder. Move it to the data folder, and read it.
sDat <- read.csv("./data/Species.csv")
str(sDat)
# The most obvious way to partition these is according to Classification ...
# (poking around a bit in the UniProt taxonomy database shows that the
# classification used here is the taxonomic rank of "order").
# how many classifications do we have?
length(unique(sDat$Classification)) # 66
# To have a good set for the class, we should have about 100.
# Let's see for which of these we can find Mbp1 homologues.
# First, we'll keep only the columns for name, classification, and taxID, and
# drop the rest ...
sDat <- sDat[ , c("Name", "Classification", "Taxon.ID")]
colnames(sDat) <- c("name", "order", "taxID")
# Next, we make an extra column: genus - the first part of the binomial name.
# We'll use the gsub() function, and for that we need a "regular expression"
# that matches to all characters from the first blank to the end of the string:
myPatt <- "\\s.*$" # one whitespace (\\s) ...
# followed by any character (.) 0..n times (*) ...
# until the end of the string
# using gsub() we substitute all matching characters with the empty string "" -
# this deletes the matching characters
# Test this:
gsub(myPatt, "", "Genus") # one word: unchanged
gsub(myPatt, "", "gEnus species") # two words: return only first
gsub(myPatt, "", "geNus species strain 123") # many words: return only first
# apply this to the "name" column and add the result as a separate column
# called "genus"
sDat$genus <- gsub(myPatt, "", sDat$name)
# what do we get?
c(head(unique(sDat$genus)),
tail(unique(sDat$genus))) # inspect the first and last few. Note that there
# is a problem that we have to keep in mind.
# (Always inspect your results!)
# Drop all rows for which the genus contains special characters -
# like "[Candida]"
sDat <- sDat[ ! grepl("[^a-zA-Z]", sDat$genus) , ]
length(table(sDat$genus)) # how many genus?
hist(table(sDat$genus), col = "#E9F4FF") # Distribution ...
# most genus have very few, but
# some have very many species.
sort(table(sDat$genus), decreasing = TRUE)[1:10] # Top ten...
# We should have at least one species from each taxonomic order, but we can
# add a few genus until we have about 100 validated species.
# Let's add a column for species, by changing our regular expression a bit,
# using ^ (start of string), \\S (NOT a whitespace),
# and + (one or more matches), capturing the match (...), and returning
# it as the substitution (\\1) ...
myPatt <- "^(\\S+\\s\\S+)\\s.*$"
sDat$species <- gsub(myPatt, "\\1", sDat$name)
# And we reorder the columns, just for aesthetics:
sDat <- sDat[ , c("name", "species", "genus", "order", "taxID")]
# Final check:
any(grepl("[^a-zA-Z -]", sDat$species)) # FALSE means no special characters
#
# Now we check which of these have Mbp1 homologues ...
# = 4 BLAST SEARCH ========================================================
# We run a BLAST search to find all proteins related to yeast Mbp1 in any
# fungus. With the results, we'll annotate our sDat table.
# == 4.1 find homologous proteins ==========================================
#
# Use BLAST to fetch proteins related to Mbp1 and identify the species that
# contain them.
# Scripting against NCBI APIs is not exactly enjoyable - there is usually a fair
# amount of error handling involved that is not supported by the API in a
# principled way but requires rather ad hoc solutions. The code I threw together
# to make a BLAST interface (demo-quality, not research-quality) is in the file
# ./scripts/BLAST.R Feel encouraged to study how this works. It's a pretty
# standard task of communicating with servers and parsing responses - everyday
# fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST
# parser in currently available packages.
#
# DON'T use this for BLAST searches unless you have read the NCBI policy
# for automated tasks. If you indiscriminately pound on the NCBI's BLAST
# server, they will blacklist your IP-address. See:
# https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
#
# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
# BLASThits <- BLAST("NP_010227", # Yeast Mbp1 RefSeq ID
# db = "refseq_protein", # database to search in
# nHits = 3000, # 945 hits in 2020
# E = 0.01, #
# limits = "txid4751[ORGN]") # = fungi
# saveRDS(BLASThits, file="data/BLASThits.rds")
#
# NO NEED TO ACTUALLY RUN THIS:you can load the results from the data directory
#
BLASThits <- readRDS(file = "data/BLASThits.rds")
# == 4.2 Identify species in "hits" ========================================
# This is a very big list that can't be usefully analyzed manually. Here
# we are only interested in the species names that it contains.
# How many hits in the list?
length(BLASThits$hits) # 1,134
# Let's look at a hit somewhere down the list. (Note that we spell out the
# full element name "hits": the $ operator would also accept the abbreviation
# "hit" through partial matching, but that silently returns NULL as soon as
# a second element name begins with the same letters.)
str(BLASThits$hits[[277]])
# A fair amount of parsing has gone into the BLAST.R code to prepare the results
# in a useful way. The species information is in the $species element of every
# hit.
# Extract all the species names into a vector. We subset ...
# BLASThits$hits ... the list of hits, from which we choose ...
# BLASThits$hits[[i]] ... the i-th hit, and get ...
# BLASThits$hits[[i]]$species ... the species element from that.
# Subsetting FTW.
# vapply() does this for every hit in turn. It returns a character vector of
# the correct length without growing it element by element, and it stops with
# an error if a hit does not contain exactly one species name.
BLASTspecies <- vapply(BLASThits$hits,
                       function(hit) { hit$species },
                       character(1),
                       USE.NAMES = FALSE)
# You can confirm that BLASTspecies has the expected size.
length(BLASTspecies)
# if we delete some of these later on, we still want to remember which hit
# they came from. Thus we name() the elements with their index, which is the
# same as the index of the hit in BLASThits. (seq_along() is also correct
# for an empty vector, where 1:length() would produce c(1, 0).)
names(BLASTspecies) <- seq_along(BLASTspecies)
# let's plot the distribution of E-values
# Collect the E-value of every hit. vapply() guarantees one number per hit and
# returns numeric(0) for an empty hit list.
eVals <- vapply(BLASThits$hits,
                function(hit) { hit$E },
                numeric(1),
                USE.NAMES = FALSE)
range(eVals)
sum(eVals == 0)
# let's plot the log of all values > 0 to see how they are distributed
# (log(0) is -Inf, that's why we exclude E-values of 0).
# plotting only one vector of numbers plots their index as x, and
# their value as y ...
plot(log(eVals[eVals > 0]), col = "#CC0000")
# This is very informative: I would suspect that the first ten or so are
# virtually identical to the yeast protein, then we have about 800 hits with
# decreasing similarity, and then about 200 more that may actually be false
# positives. Also - we plotted them by index, that means the table is SORTED:
# Lower E-values strictly come before higher E-values.
# Again, some species appear more than once, e.g. ...
sum(BLASTspecies == "Saccharomyces cerevisiae")
# ... corresponding to the five homologous gene sequences (paralogues) of yeast.
# Therefore we remove duplicates. Removing duplicates will leave the FIRST
# in a list alone, and only remove the SUBSEQUENT ones. Which means, from each
# species, we will retain only the protein that has the highest similarity
# to yeast Mbp1, not any of its more distant paralogues.
sel <- ! duplicated(BLASTspecies)
BLASTspecies <- BLASTspecies[sel]
length(BLASTspecies)
# i.e. we got rid of about two thirds of the hits.
tail(BLASTspecies) # see how the names are useful!
# again - there are some special characters ...
# what are they?
BLASTspecies[grep("[^a-zA-Z ]", BLASTspecies)]
# remove the brackets ... (gsub() keeps the names of the vector)
BLASTspecies <- gsub("\\[|\\]", "", BLASTspecies)
# drop any new duplicates ...
BLASTspecies <- BLASTspecies[ ! duplicated(BLASTspecies)]
# check the number again:
length(BLASTspecies)
# "sel" was computed BEFORE we removed the brackets and dropped the new
# duplicates, so it still marks hits that are no longer in BLASTspecies.
# We recompute it from the names of the remaining elements - these are the
# indices of their hits - so that it describes our final selection.
sel <- seq_along(eVals) %in% as.integer(names(BLASTspecies))
# Think a bit about this: what may be the biological reason to find that
# on average, in 388 fungi across the entire phylogenetic tree, we have
# three sequences that are homologous to yeast Mbp1?
# Let's look at the distribution of E-values in this selection (Subsetting FTW):
# we plot all values that are TRUE in the vector "sel" that we created above,
# AND greater than 0
plot(log(eVals[sel & eVals > 0]), col = "#00CC00")
# = 5 MERGE ENSEMBL AND BLAST RESULTS =====================================
# Next we add the blast result to our sDat dataframe. We'll store the index,
# the E-value, and the length of the aligned region, from which we can estimate
# how much of Mbp1 is actually covered by the hit. (True orthologues MUST align
# with Mbp1's N-terminal APSES domain.)
#
# First we pull the indices of the hits we kept from the names of BLASTspecies
# (each element was named with the index of its hit in BLASThits$hits):
iHits <- as.numeric(names(BLASTspecies))
length(iHits) # one index for each species that remains in BLASTspecies
# add columns to sDat. They are initialized with 0, thus an iHit of 0 marks a
# row that did not match any BLAST hit.
l <- nrow(sDat)
sDat$iHit <- numeric(l) # index of the hit in the BLAST results
sDat$eVal <- numeric(l) # E-value of the hit
sDat$lAli <- numeric(l) # length of the aligned region
# extract and merge: for every hit we kept, annotate all rows of sDat whose
# species name is identical to the species name of the hit.
# NOTE(review): thisSp is taken from the hit itself, i.e. it is the name
# BEFORE the brackets were removed from BLASTspecies. Presumably bracketed
# names are not meant to match any row of sDat - confirm.
for (iHit in iHits) {
thisSp <- BLASThits$hits[[iHit]]$species
sel <- sDat$species == thisSp
sDat$iHit[sel] <- iHit
sDat$eVal[sel] <- BLASThits$hits[[iHit]]$E
sDat$lAli[sel] <- BLASThits$hits[[iHit]]$lengthAli
}
# Are all reference species accounted for?
selA <- sDat$iHit != 0 # all rows which matched to a BLAST hit
REFspecies %in% sDat$species[selA] # yes, all there
selB <- sDat$species %in% REFspecies # all rows which have one of REF species
sum(selA & selB) # How many rows?
# sDat of course includes all duplicates. Some may be multiply sequenced, some
# may be different strains. We'll use the same strategy as before and keep
# only the best hit: order the rows by E-value, then drop all rows which
# are duplicated.
# drop all rows without BLAST hits ...
sDat <- sDat[ ! (sDat$iHit == 0) , ]
# order sDat by E-value ...
sDat <- sDat[order(sDat$eVal, decreasing = FALSE) , ]
# drop all rows with duplicated species ...
sDat <- sDat[ ! duplicated(sDat$species) , ]
# Lets look at the E-values ...
plot(log(sDat$eVal[sDat$eVal > 0]), col = "#00CC00")
# and alignment lengths ...
plot(sDat$lAli, col = "#00DDAA")
# How many ...
length(unique(sDat$name))
length(unique(sDat$species))
length(unique(sDat$genus))
length(unique(sDat$order))
# I need an extra species for admin purposes later on ...
# We select it with a logical vector, not with the indices that grep() returns:
# if the species were not found, grep() would return integer(0), and
# sDat[-integer(0), ] selects NO rows at all - i.e. it would silently empty
# sDat instead of leaving it unchanged.
sel <- grepl("Sporothrix schenckii", sDat$species)
SPOSCdat <- sDat[sel, ]
sDat <- sDat[ ! sel, ]
# To get the final dataset, we remove the reference species with their
# entire orders ...
REForders <- unique(sDat$order[sDat$species %in% REFspecies])
sel <- sDat$order %in% REForders
REFdat <- sDat[sel , ]
sDat <- sDat[ ! sel , ]
# REFdat should now contain only the REFspecies ...
( REFdat <- REFdat[REFdat$species %in% REFspecies , ] )
# ... but all of them
sum(REFspecies %in% REFdat$species)
# ... and we have enough left in sDat to prune sDat to unique genus
sDat <- sDat[ ! duplicated(sDat$genus) , ]
nrow(sDat) # 84
# I add back "Sporothrix schenckii" ...
sDat <- rbind(SPOSCdat, sDat)
# ... and save for future use.
# saveRDS(sDat, file = "data/sDat.rds")
# saveRDS(REFdat, file = "data/REFdat.rds")
# = 6 STUDENT NUMBERS =====================================================
#
# An asymmetric function to retrieve a MYSPE species
#
sDat <- readRDS(file = "data/sDat.rds")
students <- read.csv("../BCH441-2021-students.csv")
sN <- students$Integration.ID
sN <- sN[! is.na(sN)]
sN <- as.character(sN)
# The first key will map to "Sporothrix schenckii". unique() drops repeated
# student numbers: a repeated key would produce a repeated hash, and
# duplicated row names are not allowed in a data frame.
sN <- unique(c("1003141593", sN))
# Shuffle the species with a fixed seed, to make the order reproducible.
set.seed(112358)
theseSpecies <- sDat[sample(seq_len(nrow(sDat))), ]
all(sort(theseSpecies$name) == sort(sDat$name))
nrow((theseSpecies))
# Move "Sporothrix schenckii" to the first row, where it lines up with the
# first key. We stop if it is missing: theseSpecies[-integer(0), ] would
# select no rows at all and silently discard the entire table.
(iX <- grep("Sporothrix schenckii", theseSpecies$name))
if (length(iX) == 0) {
  stop("\"Sporothrix schenckii\" was not found in sDat$name.", call. = FALSE)
}
theseSpecies <- rbind(theseSpecies[iX, ], theseSpecies[-iX, ])
# Make N keys: random numbers from the interval rndMin:rndMax, none of which
# is an actual student number; then put the student numbers into the
# first positions.
rndMin <- 992000000
rndMax <- 1020000000
N <- 10000
keys <- as.character(sample(rndMin:rndMax, N + 1000))
keys <- keys[! (keys %in% sN)]
keys <- keys[seq_len(N)]
keys[seq_along(sN)] <- sN
# Make N rows by cycling through the rows of theseSpecies: row i gets species
# number ((i - 1) modulo nrow(theseSpecies)) + 1. Indexing with a recycled
# index vector does this in one step, instead of growing the data frame with
# rbind() in a loop.
MYSPEdat <- theseSpecies[rep_len(seq_len(nrow(theseSpecies)), N), ]
# The row name of row i is the md5 hash of key i. All row names are assigned
# at once: assigning them one by one rewrites the whole set of row names in
# every iteration.
rownames(MYSPEdat) <- vapply(keys[seq_len(N)],
                             digest::digest,
                             character(1),
                             algo = "md5",
                             USE.NAMES = FALSE)
# Re-initialize the random number generator, and shuffle the rows so that the
# order of rows does not give away which key they were made from.
set.seed(NULL)
MYSPEdat <- MYSPEdat[sample(seq_len(N)), ]
# saveRDS(MYSPEdat, file = "data/MYSPEdat.rds")
# === validate: every student number must retrieve exactly one species
# NOTE(review): getMYSPE() is defined elsewhere; this assumes that it returns
# a character string - confirm.
x <- vapply(sN,
            function(n) {
              sp <- getMYSPE(n)
              if (length(sp) != 1) {
                stop("getMYSPE(\"", n, "\") returned ", length(sp),
                     " values, expected exactly one.", call. = FALSE)
              }
              sp
            },
            character(1),
            USE.NAMES = FALSE)
# === species for late-comers
y <- unique(MYSPEdat$species)
print(y[!(y %in% x)])
# === validate: check whether any species was retrieved more than once
sp <- vapply(sN, getMYSPE, character(1), USE.NAMES = FALSE)
any(duplicated(sp))
length(unique(sp))
which(! sDat$species %in% sp) # these can be assigned to late-comers
# Done.
# [END]

View File

@ -1,168 +1,168 @@
# tocID <- "scripts/ABC-makeSTRINGedges.R"
#
# Create a subnetwork of high-confidence human STRING edges.
#
# Notes:
#
# The large source datafile is NOT posted to GitHub. If you want to
# experiment with the original data, download it and place it into your
# local ./data directory.
#
# STRING data source:
# Download page:
# https://string-db.org/cgi/download.pl?species_text=Homo+sapiens
# Data: (127.6 Mb)
# https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz
#
# Version: 1.0
#
# Date: 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.0 Rewrite
#
# TODO:
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -------------------------------------------------
#TOC> 1 Initialize 44
#TOC> 2 Read STRING Data 51
#TOC> 3 Define cutoff and subset 63
#TOC> 4 Drop duplicates 103
#TOC> 5 Simple statistics 127
#TOC> 6 Write to file 160
#TOC>
#TOC> ==========================================================================
# = 1 Initialize ==========================================================
if (! requireNamespace("readr", quietly = TRUE)) {
install.packages("readr")
}
# = 2 Read STRING Data ====================================================
# Read STRING Data (needs to be downloaded from database, see URL in Notes)
# The .gz compressed version is 127.6MB, the uncompressed version is probably
# 848 Mb. Fortunately readr:: can read from compressed
# files, and does so automatically, based on the file extension.
( fn <- file.path("~", "9606.protein.links.full.v11.0.txt.gz") )
STR <- readr::read_delim(fn, delim = " ")
nrow(STR) # 11,759,454 rows
head(STR)
# = 3 Define cutoff and subset ============================================
# approximate distribution of combined_score
hist(sample(STR$combined_score, 10000), breaks = 50, col = "#6699FF")
# Let's table the counts >= 850 and plot them for better resolution.
myTb <- table(STR$combined_score[STR$combined_score >= 850])
is.unsorted(as.integer(names(myTb))) # Good - they are all in order
plot(myTb, type = "b", cex = 0.5, col = "#BB0000")
myTb[myTb == max(myTb)] # Apparently there is an algorithmic effect that
# frequently assigns a combined score of 0.900
# Let's plot these counts as cumulative sums, in reverse order, scaled
# as combined scores.
# The x-values are taken from the names of the table, i.e. from the scores
# that were actually counted. If we merely counted positions down from 1.0,
# every x-value below a score that does not occur in the data would be
# shifted, and no longer belong to its cumulative count.
myX <- rev(as.integer(names(myTb))) / 1000 # x-values, decreasing
plot(myX,
     cumsum(myTb[length(myTb):1]), # cumulative sum, decreasing
     xlim = c(1.0, 0.85), # reverse x-axis
     type = "l",
     main = "STRING interactions for 9606 (top 600,000)",
     xlab = "combined_score",
     ylab = "cumulative counts",
     col = "#CC0000")
abline(h = seq(50000, sum(myTb), by = 50000), lwd = 0.5, col = "#DDDDFF")
# What's the cutoff for 100,000 edges?
which(cumsum(myTb[length(myTb):1]) >= 100000)[1] # p = 0.964
# confirm
sum(STR$combined_score >= 964) # 101,348
abline(v = 0.964, lwd = 0.5, col = "#DDDDFF")
# subset the table, and use only the protein IDs and the combined_score
STR <- STR[STR$combined_score >= 964,
c("protein1", "protein2", "combined_score")]
colnames(STR) <- c("a", "b", "score")
# = 4 Drop duplicates ====================================================
# identify duplicate interactions by creating keys in a defined alphabetical
# sort order, then checking for duplicated().
# e.g if we have (X:U, U:X), we change U:X to X:U and now find that
# (X:U, X:U) has a duplicate.
AB <- STR$a < STR$b # logical vector: genes we need to swap
tmp <- STR$b # copy column b
STR$b[AB] <- STR$a[AB] # copy a's into b
STR$a[AB] <- tmp[AB] # copy tmp's into a
all(STR$a >= STR$b) # confirm: TRUE
# now, make combined keys, like this:
paste0(STR$a[1:10], ":", STR$b[1:10])
tmp <- paste0(STR$a, ":", STR$b)
sum(duplicated(tmp)) # That's half of them ... i.e. STRING reports
# both a:b and b:a !
# drop all duplicated interactions from tmp
STR <- STR[ ! duplicated(tmp), ] # 50,674 interactions remain
# = 5 Simple statistics ===================================================
# how many unique genes?
length(unique(c(STR$a, STR$b))) # 8,445
# how many self-edges?
sum(STR$a == STR$b) # none
# log(rank) / log(frequency)
myTbl <- table(c(STR$a, STR$b))
myTbl <- myTbl[order(myTbl, decreasing = TRUE)]
hist(myTbl, breaks = 40, col = "#FFEEBB")
# number of singletons
sum(myTbl == 1) # almost a quarter
# maximum?
myTbl[which(myTbl == max(myTbl))] # 9606.ENSP00000360532: 465
# Google: CDC5L
# Zipf-plot
plot(log(1:length(myTbl)), log(as.numeric(myTbl)),
type = "b", cex = 0.7,
main = "STRINGedges - degrees",
xlab = "log(rank)",
ylab = "log(frequency)",
col = "#FFBB88")
sprintf("Average number of interactions: %5.2f",
nrow(STR) / length(unique(c(STR$a, STR$b))))
# = 6 Write to file =======================================================
saveRDS(STR, file = "./data/STRINGedges.rds")
# STRINGedges <- readRDS("./data/STRINGedges.rds") # use this to restore the
# object when needed
# [END]
# tocID <- "scripts/ABC-makeSTRINGedges.R"
#
# Create a subnetwork of high-confidence human STRING edges.
#
# Notes:
#
# The large source- datafile is NOT posted to github. If you want to
# experiment with the original data, download it and place it into your
# local ./data directory.
#
# STRING data source:
# Download page:
# https://string-db.org/cgi/download.pl?species_text=Homo+sapiens
# Data: (127.6 Mb)
# https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz
#
# Version: 1.0
#
# Date: 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.0 Rewrite
#
# TODO:
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -------------------------------------------------
#TOC> 1 Initialize 44
#TOC> 2 Read STRING Data 51
#TOC> 3 Define cutoff and subset 63
#TOC> 4 Drop duplicates 103
#TOC> 5 Simple statistics 127
#TOC> 6 Write to file 160
#TOC>
#TOC> ==========================================================================
# = 1 Initialize ==========================================================
# readr:: is used below to read the (compressed) STRING data file.
if (! requireNamespace("readr", quietly = TRUE)) {
  install.packages("readr")
}
# = 2 Read STRING Data ====================================================
# Read STRING Data (needs to be downloaded from database, see URL in Notes)
# The .gz compressed version is 127.6MB, the uncompressed version is probably
# 848 Mb. Fortunately readr:: can read from compressed
# files, and does so automatically, based on the file extension.
# NOTE: fn points into the home directory ("~"). Adjust the path if your copy
# of the data file lives elsewhere.
( fn <- file.path("~", "9606.protein.links.full.v11.0.txt.gz") )
STR <- readr::read_delim(fn, delim = " ")
nrow(STR) # 11,759,454 rows
head(STR)
# = 3 Define cutoff and subset ============================================
# approximate distribution of combined_score
hist(sample(STR$combined_score, 10000), breaks = 50, col = "#6699FF")
# Let's table the counts >= 850 and plot them for better resolution.
myTb <- table(STR$combined_score[STR$combined_score >= 850])
is.unsorted(as.integer(names(myTb))) # FALSE: good - they are all in order
plot(myTb, type = "b", cex = 0.5, col = "#BB0000")
myTb[myTb == max(myTb)] # Apparently there is an algorithmic effect that
# frequently assigns a combined score of 0.900
# Let's plot these counts as cumulative sums, in reverse order, scaled
# as combined scores.
# The x-values are taken from the names of the table (i.e. the scores that
# actually occur) rather than from a running index, so the plot stays correct
# even if some score value between 850 and 999 is absent from the data.
myX <- rev(as.integer(names(myTb))) / 1000 # x-values, decreasing
myCum <- cumsum(rev(myTb)) # cumulative sum, from the highest score down
plot(myX,
     myCum,
     xlim = c(1.0, 0.85), # reverse x-axis
     type = "l",
     main = "STRING interactions for 9606 (top 600,000)",
     xlab = "combined_score",
     ylab = "cumulative counts",
     col = "#CC0000")
abline(h = seq(50000, sum(myTb), by = 50000), lwd = 0.5, col = "#DDDDFF")
# What's the cutoff for 100,000 edges? (The name of the returned element is
# the score at which the cumulative count first reaches 100,000.)
which(myCum >= 100000)[1] # p = 0.964
# confirm
sum(STR$combined_score >= 964) # 101,348
abline(v = 0.964, lwd = 0.5, col = "#DDDDFF")
# subset the table, and use only the protein IDs and the combined_score
STR <- STR[STR$combined_score >= 964,
           c("protein1", "protein2", "combined_score")]
colnames(STR) <- c("a", "b", "score")
# = 4 Drop duplicates ====================================================
# identify duplicate interactions by creating keys in a defined alphabetical
# sort order, then checking for duplicated().
# e.g if we have (X:U, U:X), we change U:X to X:U and now find that
# (X:U, X:U) has a duplicate.
AB <- STR$a < STR$b # logical vector: genes we need to swap
tmp <- STR$b # copy column b
STR$b[AB] <- STR$a[AB] # copy a's into b
STR$a[AB] <- tmp[AB] # copy tmp's into a
all(STR$a >= STR$b) # confirm: TRUE
# now, make combined keys, like this:
paste0(STR$a[1:10], ":", STR$b[1:10])
tmp <- paste0(STR$a, ":", STR$b)
sum(duplicated(tmp)) # That's half of them ... i.e. STRING reports
# both a:b and b:a !
# drop all duplicated interactions from tmp
STR <- STR[ ! duplicated(tmp), ] # 50,674 interactions remain
# = 5 Simple statistics ===================================================
# how many unique genes?
length(unique(c(STR$a, STR$b))) # 8,445
# how many self-edges?
sum(STR$a == STR$b) # none
# log(rank) / log(frequency)
# myTbl holds the degree of every gene: the number of edges it appears in.
myTbl <- table(c(STR$a, STR$b))
myTbl <- myTbl[order(myTbl, decreasing = TRUE)]
hist(myTbl, breaks = 40, col = "#FFEEBB")
# number of singletons
sum(myTbl == 1) # almost a quarter
# maximum?
myTbl[which(myTbl == max(myTbl))] # 9606.ENSP00000360532: 465
# Google: CDC5L
# Zipf-plot
plot(log(seq_along(myTbl)), log(as.numeric(myTbl)),
     type = "b", cex = 0.7,
     main = "STRINGedges - degrees",
     xlab = "log(rank)",
     ylab = "log(frequency)",
     col = "#FFBB88")
# Every edge contributes to the degree of TWO genes, so the average number of
# interactions per gene is the mean degree, i.e. 2 * nrow(STR) / (number of
# genes) - not nrow(STR) / (number of genes).
sprintf("Average number of interactions: %5.2f",
        mean(as.numeric(myTbl)))
# = 6 Write to file =======================================================
saveRDS(STR, file = "./data/STRINGedges.rds")
# STRINGedges <- readRDS("./data/STRINGedges.rds") # use this to restore the
# object when needed
# [END]

View File

@ -1,167 +1,167 @@
# tocID <- "scripts/ABC-makeScCCnet.R"
#
# Create a subnetwork of high-confidence yeast genes with a "mitotic cell cycle"
# GOSlim annotation.
#
# Boris Steipe for ABC learning units
#
# Notes:
#
# The large source- datafiles are NOT posted to github. If you want to
# experiment with your own code, download them and place them into your
# local ./data directory.
#
# STRING data source:
# Download page:
# https://string-db.org/cgi/download.pl?species_text=Saccharomyces+cerevisiae
# Data: (20.1 mb)
# https://stringdb-static.org/download/protein.links.full.v11.0/4932.protein.links.full.v11.0.txt.gz
#
# GOSlim data source: (Note: this has moved from GO to SGD)
# Info page: https://www.yeastgenome.org/downloads
# Info page: http://sgd-archive.yeastgenome.org/curation/literature/
# Data: (3 mb)
# http://sgd-archive.yeastgenome.org/curation/literature/go_slim_mapping.tab
#
#
# Version: 1.2
#
# Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 2020 Update. GO Slim Yeast now at SGD
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 1.0 First code copied from 2016 material.
#
# TODO:
#
# ==============================================================================
# SRCDIR <- "./instructor"
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------------------------
#TOC> 1 INITIALIZE 58
#TOC> 2 STRING FUNCTIONAL INTERACTION DATA 66
#TOC> 3 GOSlim FUNCTIONAL ANNOTATIONS 96
#TOC> 3.1 Intersect interactions and annotations 122
#TOC> 4 DEFINE THE CELL-CYCLE NETWORK 128
#TOC>
#TOC> ==========================================================================
# = 1 INITIALIZE ==========================================================
# SRCDIR is the directory that holds the downloaded source data files.
SRCDIR <- "./data"
if (! requireNamespace("readr", quietly = TRUE)) {
  install.packages("readr")
}
# = 2 STRING FUNCTIONAL INTERACTION DATA ==================================
# Read STRING Data (needs to be downloaded from database, see URL in Notes)
# The .gz compressed version is 20MB, the uncompressed version is 110MB -
# really not necessary to uncompress since readr:: can read from compressed
# files, and does so automatically, based on the file extension.
( fn <- file.path(SRCDIR, "4932.protein.links.full.v11.0.txt.gz") )
STR <- readr::read_delim(fn, delim = " ")
# Subset only IDs and combined_score column
STR <- STR[ , c("protein1", "protein2", "combined_score")]
# head(STR)
# sum(STR$combined_score > 909) # 100270 edges
# subset for 100,000 highest confidence edges
STR <- STR[(STR$combined_score > 909), ]
head(STR)
# IDs are formatted like 4932.YAL005C ... drop the "4932." prefix
STR$protein1 <- gsub("^4932\\.", "", STR$protein1)
STR$protein2 <- gsub("^4932\\.", "", STR$protein2)
head(STR)
# get a vector of gene names in this list
myIntxGenes <- unique(c(STR$protein1, STR$protein2)) # yeast systematic gene
# names
length(myIntxGenes)
sample(myIntxGenes, 10) # choose 10 at random (sanity check)
# = 3 GOSlim FUNCTIONAL ANNOTATIONS =======================================
#
# Read GOSlim data (needs to be downloaded from database, see URL in Notes)
# The file has no header row, thus column names are supplied here.
( fn <- file.path(SRCDIR, "go_slim_mapping.tab") )
Gsl <- readr::read_tsv(fn,
                       col_names = c("ID",
                                     "name",
                                     "SGDId",
                                     "Ontology",
                                     "termName",
                                     "termID",
                                     "status"))
head(Gsl)
# What cell cycle names does it contain?
myGslTermNames <- unique(Gsl$termName) # 169 unique terms
myGslTermNames[grep("cycle", myGslTermNames)]
# [1] "regulation of cell cycle" "mitotic cell cycle" "meiotic cell cycle"
# Choose "mitotic cell cycle" as the GOslim term to subset with
scCCgenes <- unique(Gsl$ID[Gsl$termName == "mitotic cell cycle"])
length(scCCgenes) # 324 genes annotated to that term
# == 3.1 Intersect interactions and annotations ============================
sum(scCCgenes %in% myIntxGenes) # 307 of these have high-confidence
# functional interactions
# = 4 DEFINE THE CELL-CYCLE NETWORK =======================================
#
# Define scCCnet ... the S. cerevisiae Cell Cycle network
# Subset all rows for which BOTH genes are in the GOslim cell cycle set
#
scCCnet <- STR[(STR$protein1 %in% scCCgenes) &
               (STR$protein2 %in% scCCgenes), ]
# How many genes are there?
length(unique(c(scCCnet$protein1, scCCnet$protein2))) # 283
# Each edge is listed twice - now remove duplicates.
# Step 1: make a vector of keys: put the two names of each edge into a defined
# order, so that the first one is alphabetically smaller than the
# second one. Then concatenate them with a "." - the resulting string
# is always the same, for any order. E.g. c("A", "B") gives "A.B"
# and c("B", "A") also gives "A.B". This identifies duplicates.
# pmin() / pmax() do this for all rows at once: they return the
# element-wise smaller / larger of the two name columns.
x <- paste(pmin(scCCnet$protein1, scCCnet$protein2),
           pmax(scCCnet$protein1, scCCnet$protein2),
           sep = ".")
head(x) # "YAL016W.YGR040W" "YAL016W.YOR014W" "YAL016W.YDL188C" ... etc.
sum(duplicated(x)) # 1453
# Step 2: drop all rows that contain duplicates in x
scCCnet <- scCCnet[! duplicated(x), ]
# Confirm we didn't lose genes
length(unique(c(scCCnet$protein1, scCCnet$protein2))) # 283, no change
nrow(scCCnet)
# Network has 283 nodes, 1453 edges
saveRDS(scCCnet, file = "./data/scCCnet.rds")
# scCCnet <- readRDS("./data/scCCnet.rds") # <<<- use this to restore the
# object when needed
# [END]
# tocID <- "scripts/ABC-makeScCCnet.R"
#
# Create a subnetwork of high-confidence yeast genes with a "mitotic cell cycle"
# GOSlim annotation.
#
# Boris Steipe for ABC learning units
#
# Notes:
#
# The large source- datafiles are NOT posted to github. If you want to
# experiment with your own code, download them and place them into your
# local ./data directory.
#
# STRING data source:
# Download page:
# https://string-db.org/cgi/download.pl?species_text=Saccharomyces+cerevisiae
# Data: (20.1 mb)
# https://stringdb-static.org/download/protein.links.full.v11.0/4932.protein.links.full.v11.0.txt.gz
#
# GOSlim data source: (Note: this has moved from GO to SGD)
# Info page: https://www.yeastgenome.org/downloads
# Info page: http://sgd-archive.yeastgenome.org/curation/literature/
# Data: (3 mb)
# http://sgd-archive.yeastgenome.org/curation/literature/go_slim_mapping.tab
#
#
# Version: 1.2
#
# Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 2020 Update. GO Slim Yeast now at SGD
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 1.0 First code copied from 2016 material.
#
# TODO:
#
# ==============================================================================
# SRCDIR <- "./instructor"
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------------------------
#TOC> 1 INITIALIZE 58
#TOC> 2 STRING FUNCTIONAL INTERACTION DATA 66
#TOC> 3 GOSlim FUNCTIONAL ANNOTATIONS 96
#TOC> 3.1 Intersect interactions and annotations 122
#TOC> 4 DEFINE THE CELL-CYCLE NETWORK 128
#TOC>
#TOC> ==========================================================================
# = 1 INITIALIZE ==========================================================
# SRCDIR is the directory that holds the downloaded source data files.
SRCDIR <- "./data"
if (! requireNamespace("readr", quietly = TRUE)) {
  install.packages("readr")
}
# = 2 STRING FUNCTIONAL INTERACTION DATA ==================================
# Read STRING Data (needs to be downloaded from database, see URL in Notes)
# The .gz compressed version is 20MB, the uncompressed version is 110MB -
# really not necessary to uncompress since readr:: can read from compressed
# files, and does so automatically, based on the file extension.
( fn <- file.path(SRCDIR, "4932.protein.links.full.v11.0.txt.gz") )
STR <- readr::read_delim(fn, delim = " ")
# Subset only IDs and combined_score column
STR <- STR[ , c("protein1", "protein2", "combined_score")]
# head(STR)
# sum(STR$combined_score > 909) # 100270 edges
# subset for 100,000 highest confidence edges
STR <- STR[(STR$combined_score > 909), ]
head(STR)
# IDs are formatted like 4932.YAL005C ... drop the "4932." prefix
STR$protein1 <- gsub("^4932\\.", "", STR$protein1)
STR$protein2 <- gsub("^4932\\.", "", STR$protein2)
head(STR)
# get a vector of gene names in this list
myIntxGenes <- unique(c(STR$protein1, STR$protein2)) # yeast systematic gene
# names
length(myIntxGenes)
sample(myIntxGenes, 10) # choose 10 at random (sanity check)
# = 3 GOSlim FUNCTIONAL ANNOTATIONS =======================================
#
# Read GOSlim data (needs to be downloaded from database, see URL in Notes)
# The file has no header row, thus column names are supplied here.
( fn <- file.path(SRCDIR, "go_slim_mapping.tab") )
Gsl <- readr::read_tsv(fn,
                       col_names = c("ID",
                                     "name",
                                     "SGDId",
                                     "Ontology",
                                     "termName",
                                     "termID",
                                     "status"))
head(Gsl)
# What cell cycle names does it contain?
myGslTermNames <- unique(Gsl$termName) # 169 unique terms
myGslTermNames[grep("cycle", myGslTermNames)]
# [1] "regulation of cell cycle" "mitotic cell cycle" "meiotic cell cycle"
# Choose "mitotic cell cycle" as the GOslim term to subset with
scCCgenes <- unique(Gsl$ID[Gsl$termName == "mitotic cell cycle"])
length(scCCgenes) # 324 genes annotated to that term
# == 3.1 Intersect interactions and annotations ============================
sum(scCCgenes %in% myIntxGenes) # 307 of these have high-confidence
# functional interactions
# = 4 DEFINE THE CELL-CYCLE NETWORK =======================================
#
# Define scCCnet ... the S. cerevisiae Cell Cycle network
# Subset all rows for which BOTH genes are in the GOslim cell cycle set
#
scCCnet <- STR[(STR$protein1 %in% scCCgenes) &
               (STR$protein2 %in% scCCgenes), ]
# How many genes are there?
length(unique(c(scCCnet$protein1, scCCnet$protein2))) # 283
# Each edge is listed twice - now remove duplicates.
# Step 1: make a vector of keys: put the two names of each edge into a defined
# order, so that the first one is alphabetically smaller than the
# second one. Then concatenate them with a "." - the resulting string
# is always the same, for any order. E.g. c("A", "B") gives "A.B"
# and c("B", "A") also gives "A.B". This identifies duplicates.
# pmin() / pmax() do this for all rows at once: they return the
# element-wise smaller / larger of the two name columns.
x <- paste(pmin(scCCnet$protein1, scCCnet$protein2),
           pmax(scCCnet$protein1, scCCnet$protein2),
           sep = ".")
head(x) # "YAL016W.YGR040W" "YAL016W.YOR014W" "YAL016W.YDL188C" ... etc.
sum(duplicated(x)) # 1453
# Step 2: drop all rows that contain duplicates in x
scCCnet <- scCCnet[! duplicated(x), ]
# Confirm we didn't lose genes
length(unique(c(scCCnet$protein1, scCCnet$protein2))) # 283, no change
nrow(scCCnet)
# Network has 283 nodes, 1453 edges
saveRDS(scCCnet, file = "./data/scCCnet.rds")
# scCCnet <- readRDS("./data/scCCnet.rds") # <<<- use this to restore the
# object when needed
# [END]

View File

@ -1,135 +1,135 @@
# tocID <- "scripts/ABC-writeALN.R"
#
# ToDo: calculate consensus line
# append sequence numbers
# Notes:
#
# ==============================================================================
writeALN <- function(ali,
                     range,
                     note = "",
                     myCon = stdout(),
                     blockWidth = 60) {
  # Purpose:
  #     Write an MsaAAMultipleAlignment or AAStringSet object (or a character
  #     vector of aligned sequences) to stdout() or a file, as blocks of
  #     name-prefixed sequence lines below a "CLUSTAL W format. " header line.
  # Version:  2.0
  # Date:     2017  10
  # Author:   Boris Steipe
  #
  # Parameters:
  #     ali         Msa(AA|DNA|RNA)MultipleAlignment or (AA|DNA|RNA)StringSet
  #                   or character vector. Unnamed character vectors get
  #                   the default names "seq1", "seq2", ...
  #     range  num  a two-integer vector of start and end positions if
  #                   only a range of the MSA should be written, e.g.
  #                   a domain. Defaults to the full alignment length.
  #                   Sequences shorter than range[2] are right-padded
  #                   with "-".
  #     note   chr  text that is appended to the "CLUSTAL W format. " header
  #                   line. A vector with more than one element is collapsed
  #                   into a single line, separated by blanks.
  #     myCon       a connection (cf. the con argument for writeLines).
  #                   Defaults to stdout()
  #     blockWidth  int  width of sequence block. Default 60 characters.
  # Value:
  #     NULL (invisible). The function is invoked for its side effect of
  #          printing an alignment to stdout() or file.

  blockWidth <- as.integer(blockWidth)
  if (length(blockWidth) != 1 || is.na(blockWidth)) {
    stop("PANIC: parameter \"blockWidth\" must be numeric.")
  }
  if (blockWidth < 1) {
    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
  }
  if (blockWidth > 60) {
    warning("Programs that read CLUSTAL format might not expect blockWidth > 60.")
  }

  # Extract the raw data from the objects depending on their respective class
  # and put it into a vector of strings. inherits() is used rather than
  # class(x) == "..." since class() may return more than one element, which
  # is not a valid condition for if().

  # Extract XStringSet from MsaXMultipleAlignment ...
  if (inherits(ali, c("MsaAAMultipleAlignment",
                      "MsaDNAMultipleAlignment",
                      "MsaRNAMultipleAlignment"))) {
    ali <- ali@unmasked
  }

  # Process XStringSet
  if (inherits(ali, c("AAStringSet", "DNAStringSet", "RNAStringSet"))) {
    sSet <- as.character(ali)  # we use as.character(), not toString() thus
                               # we don't _have_ to load Biostrings
  } else if (is.character(ali) && is.null(dim(ali))) {
    sSet <- ali
  } else {
    stop(paste("Input object of class",
               paste(class(ali), collapse = "/"),
               "can't be handled by this function."))
  }

  if (length(sSet) == 0) {
    stop("PANIC: input contains no sequences.")
  }

  if (missing(range)) {
    range <- c(1, max(nchar(sSet)))
  } else {
    range <- as.integer(range)
    if (length(range) != 2 ||
        any(is.na(range)) ||
        range[1] > range[2] ||
        range[1] < 1) {
      stop("PANIC: \"range\" parameter must contain valid start and end index.")
    }
  }

  # Fetch the names before the sequences are modified: paste0() drops names.
  sNames <- names(sSet)
  if (is.null(sNames)) {
    sNames <- paste0("seq", seq_along(sSet))
  }

  # Right-pad any sequence with "-" that is shorter than range[2]
  sSet <- paste0(sSet, strrep("-", pmax(range[2] - nchar(sSet), 0)))

  # Right-pad sequence names to a common width
  len <- max(nchar(sNames)) + 2  # longest name plus two spaces
  sNames <- paste0(sNames, strrep(" ", len - nchar(sNames)))

  # Assemble one block per stretch of blockWidth columns: one line per
  # sequence, followed by a blank consensus line and a blank separator line.
  iStarts <- seq(range[1], range[2], by = blockWidth)
  iEnds <- c((iStarts[-1] - 1), range[2])
  blocks <- lapply(seq_along(iStarts),
                   function(i) {
                     c(paste0(sNames, substring(sSet, iStarts[i], iEnds[i])),
                       "",   # blank consensus line
                       "")   # separator line
                   })

  txt <- c(paste0("CLUSTAL W format. ", paste(note, collapse = " ")),
           "",
           unlist(blocks, use.names = FALSE))

  writeLines(txt, con = myCon)
}
# ==== TESTS =================================================================
# Enter your function tests here...
if (FALSE) {
# test ...
}
# [END]
# tocID <- "scripts/ABC-writeALN.R"
#
# ToDo: calculate consensus line
# append sequence numbers
# Notes:
#
# ==============================================================================
writeALN <- function(ali,
                     range,
                     note = "",
                     myCon = stdout(),
                     blockWidth = 60) {
  # Purpose:
  #     Write an MsaAAMultipleAlignment or AAStringSet object (or a character
  #     vector of aligned sequences) to stdout() or a file, as blocks of
  #     name-prefixed sequence lines below a "CLUSTAL W format. " header line.
  # Version:  2.0
  # Date:     2017  10
  # Author:   Boris Steipe
  #
  # Parameters:
  #     ali         Msa(AA|DNA|RNA)MultipleAlignment or (AA|DNA|RNA)StringSet
  #                   or character vector. Unnamed character vectors get
  #                   the default names "seq1", "seq2", ...
  #     range  num  a two-integer vector of start and end positions if
  #                   only a range of the MSA should be written, e.g.
  #                   a domain. Defaults to the full alignment length.
  #                   Sequences shorter than range[2] are right-padded
  #                   with "-".
  #     note   chr  text that is appended to the "CLUSTAL W format. " header
  #                   line. A vector with more than one element is collapsed
  #                   into a single line, separated by blanks.
  #     myCon       a connection (cf. the con argument for writeLines).
  #                   Defaults to stdout()
  #     blockWidth  int  width of sequence block. Default 60 characters.
  # Value:
  #     NULL (invisible). The function is invoked for its side effect of
  #          printing an alignment to stdout() or file.

  blockWidth <- as.integer(blockWidth)
  if (length(blockWidth) != 1 || is.na(blockWidth)) {
    stop("PANIC: parameter \"blockWidth\" must be numeric.")
  }
  if (blockWidth < 1) {
    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
  }
  if (blockWidth > 60) {
    warning("Programs that read CLUSTAL format might not expect blockWidth > 60.")
  }

  # Extract the raw data from the objects depending on their respective class
  # and put it into a vector of strings. inherits() is used rather than
  # class(x) == "..." since class() may return more than one element, which
  # is not a valid condition for if().

  # Extract XStringSet from MsaXMultipleAlignment ...
  if (inherits(ali, c("MsaAAMultipleAlignment",
                      "MsaDNAMultipleAlignment",
                      "MsaRNAMultipleAlignment"))) {
    ali <- ali@unmasked
  }

  # Process XStringSet
  if (inherits(ali, c("AAStringSet", "DNAStringSet", "RNAStringSet"))) {
    sSet <- as.character(ali)  # we use as.character(), not toString() thus
                               # we don't _have_ to load Biostrings
  } else if (is.character(ali) && is.null(dim(ali))) {
    sSet <- ali
  } else {
    stop(paste("Input object of class",
               paste(class(ali), collapse = "/"),
               "can't be handled by this function."))
  }

  if (length(sSet) == 0) {
    stop("PANIC: input contains no sequences.")
  }

  if (missing(range)) {
    range <- c(1, max(nchar(sSet)))
  } else {
    range <- as.integer(range)
    if (length(range) != 2 ||
        any(is.na(range)) ||
        range[1] > range[2] ||
        range[1] < 1) {
      stop("PANIC: \"range\" parameter must contain valid start and end index.")
    }
  }

  # Fetch the names before the sequences are modified: paste0() drops names.
  sNames <- names(sSet)
  if (is.null(sNames)) {
    sNames <- paste0("seq", seq_along(sSet))
  }

  # Right-pad any sequence with "-" that is shorter than range[2]
  sSet <- paste0(sSet, strrep("-", pmax(range[2] - nchar(sSet), 0)))

  # Right-pad sequence names to a common width
  len <- max(nchar(sNames)) + 2  # longest name plus two spaces
  sNames <- paste0(sNames, strrep(" ", len - nchar(sNames)))

  # Assemble one block per stretch of blockWidth columns: one line per
  # sequence, followed by a blank consensus line and a blank separator line.
  iStarts <- seq(range[1], range[2], by = blockWidth)
  iEnds <- c((iStarts[-1] - 1), range[2])
  blocks <- lapply(seq_along(iStarts),
                   function(i) {
                     c(paste0(sNames, substring(sSet, iStarts[i], iEnds[i])),
                       "",   # blank consensus line
                       "")   # separator line
                   })

  txt <- c(paste0("CLUSTAL W format. ", paste(note, collapse = " ")),
           "",
           unlist(blocks, use.names = FALSE))

  writeLines(txt, con = myCon)
}
# ==== TESTS =================================================================
# Enter your function tests here...
if (FALSE) {
# test ...
}
# [END]

View File

@ -1,121 +1,121 @@
# ABC-writeMFA.R
#
# ToDo:
# Notes: 2.1 bugfix: empty notes caused superfluous blank after header.
#
#
# ==============================================================================
writeMFA <- function(ali,
                     range,
                     note = "",
                     myCon = stdout(),
                     blockWidth = 80) {
  # Purpose:
  #     Write an MsaAAMultipleAlignment or AAStringSet object (or a character
  #     vector of sequences) to stdout() or a file in multi-FASTA format.
  # Version:  2.1
  # Date:     2017  10
  # Author:   Boris Steipe
  #
  # Parameters:
  #     ali         Msa(AA|DNA|RNA)MultipleAlignment or (AA|DNA|RNA)StringSet
  #                   or character vector. Unnamed character vectors get
  #                   the default names "seq1", "seq2", ...
  #     range  num  a two-integer vector of start and end positions if
  #                   only a range of the MSA should be written, e.g.
  #                   a domain. Defaults to the full sequence length.
  #     note   chr  a vector of character that is appended to the name
  #                   of a sequence in the FASTA header. Recycling of
  #                   shorter vectors applies, thus a vector of length one
  #                   is added to all headers. Empty notes ("") add nothing.
  #     myCon       a connection (cf. the con argument for writeLines).
  #                   Defaults to stdout()
  #     blockWidth  int  width of sequence block. Default 80 characters.
  # Value:
  #     NULL (invisible). The function is invoked for its side effect of
  #          printing an alignment to stdout() or file.

  blockWidth <- as.integer(blockWidth)
  if (length(blockWidth) != 1 || is.na(blockWidth)) {
    stop("PANIC: parameter \"blockWidth\" must be numeric.")
  }
  if (! blockWidth > 0) {
    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
  }

  # Extract the raw data from the objects depending on their respective class
  # and put it into a vector of strings. inherits() is used rather than
  # class(x) == "..." since class() may return more than one element, which
  # is not a valid condition for if().

  # Extract XStringSet from MsaXMultipleAlignment ...
  if (inherits(ali, c("MsaAAMultipleAlignment",
                      "MsaDNAMultipleAlignment",
                      "MsaRNAMultipleAlignment"))) {
    ali <- ali@unmasked
  }

  # Process XStringSet
  if (inherits(ali, c("AAStringSet", "DNAStringSet", "RNAStringSet"))) {
    sSet <- as.character(ali)  # we use as.character(), not toString() thus
                               # we don't _have_ to load Biostrings
  } else if (is.character(ali) && is.null(dim(ali))) {
    sSet <- ali
  } else {
    stop(paste("Input object of class",
               paste(class(ali), collapse = "/"),
               "can't be handled by this function."))
  }

  if (length(sSet) == 0) {
    stop("PANIC: input contains no sequences.")
  }

  if (missing(range)) {
    range <- c(1, max(nchar(sSet)))
  } else {
    range <- as.integer(range)
    if (length(range) != 2 ||
        any(is.na(range)) ||
        range[1] > range[2] ||
        range[1] < 1) {
      stop("PANIC: \"range\" parameter must contain valid start and end index.")
    }
  }

  # Construct the header lines: one per sequence. The note is recycled to the
  # number of sequences; an empty note adds nothing, so that no superfluous
  # blank follows the name.
  sNames <- names(sSet)
  if (is.null(sNames)) {
    sNames <- paste0("seq", seq_along(sSet))
  }
  note <- as.character(note)
  if (length(note) == 0) {
    note <- ""
  }
  note <- rep_len(note, length(sSet))
  headers <- ifelse(nzchar(note), paste(sNames, note), sNames)

  # The block boundaries are the same for all sequences
  iStarts <- seq(range[1], range[2], by = blockWidth)
  iEnds <- c((iStarts[-1] - 1), range[2])

  # Process each sequence: FASTA header, then the sequence in blocks of
  # blockWidth per line, then an empty line for readability.
  records <- lapply(seq_along(sSet),
                    function(i) {
                      thisSeq <- substring(sSet[i], iStarts, iEnds)
                      thisSeq <- thisSeq[nzchar(thisSeq)]  # drop empty blocks
                      c(sprintf(">%s", headers[i]), thisSeq, "")
                    })

  writeLines(unlist(records, use.names = FALSE), con = myCon)
}
# ==== TESTS =================================================================
# Enter your function tests here...
if (FALSE) {
# test ...
}
# [END]
# ABC-writeMFA.R
#
# ToDo:
# Notes: 2.1 bugfix: empty notes caused superfluous blank after header.
#
#
# ==============================================================================
writeMFA <- function(ali,
                     range,
                     note = "",
                     myCon = stdout(),
                     blockWidth = 80) {
  # Purpose:
  #     Write an MsaAAMultipleAlignment or AAStringSet object (or a character
  #     vector of sequences) to stdout() or a file in multi-FASTA format.
  # Version:  2.1
  # Date:     2017  10
  # Author:   Boris Steipe
  #
  # Parameters:
  #     ali         Msa(AA|DNA|RNA)MultipleAlignment or (AA|DNA|RNA)StringSet
  #                   or character vector. Unnamed character vectors get
  #                   the default names "seq1", "seq2", ...
  #     range  num  a two-integer vector of start and end positions if
  #                   only a range of the MSA should be written, e.g.
  #                   a domain. Defaults to the full sequence length.
  #     note   chr  a vector of character that is appended to the name
  #                   of a sequence in the FASTA header. Recycling of
  #                   shorter vectors applies, thus a vector of length one
  #                   is added to all headers. Empty notes ("") add nothing.
  #     myCon       a connection (cf. the con argument for writeLines).
  #                   Defaults to stdout()
  #     blockWidth  int  width of sequence block. Default 80 characters.
  # Value:
  #     NULL (invisible). The function is invoked for its side effect of
  #          printing an alignment to stdout() or file.

  blockWidth <- as.integer(blockWidth)
  if (length(blockWidth) != 1 || is.na(blockWidth)) {
    stop("PANIC: parameter \"blockWidth\" must be numeric.")
  }
  if (! blockWidth > 0) {
    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
  }

  # Extract the raw data from the objects depending on their respective class
  # and put it into a vector of strings. inherits() is used rather than
  # class(x) == "..." since class() may return more than one element, which
  # is not a valid condition for if().

  # Extract XStringSet from MsaXMultipleAlignment ...
  if (inherits(ali, c("MsaAAMultipleAlignment",
                      "MsaDNAMultipleAlignment",
                      "MsaRNAMultipleAlignment"))) {
    ali <- ali@unmasked
  }

  # Process XStringSet
  if (inherits(ali, c("AAStringSet", "DNAStringSet", "RNAStringSet"))) {
    sSet <- as.character(ali)  # we use as.character(), not toString() thus
                               # we don't _have_ to load Biostrings
  } else if (is.character(ali) && is.null(dim(ali))) {
    sSet <- ali
  } else {
    stop(paste("Input object of class",
               paste(class(ali), collapse = "/"),
               "can't be handled by this function."))
  }

  if (length(sSet) == 0) {
    stop("PANIC: input contains no sequences.")
  }

  if (missing(range)) {
    range <- c(1, max(nchar(sSet)))
  } else {
    range <- as.integer(range)
    if (length(range) != 2 ||
        any(is.na(range)) ||
        range[1] > range[2] ||
        range[1] < 1) {
      stop("PANIC: \"range\" parameter must contain valid start and end index.")
    }
  }

  # Construct the header lines: one per sequence. The note is recycled to the
  # number of sequences; an empty note adds nothing, so that no superfluous
  # blank follows the name.
  sNames <- names(sSet)
  if (is.null(sNames)) {
    sNames <- paste0("seq", seq_along(sSet))
  }
  note <- as.character(note)
  if (length(note) == 0) {
    note <- ""
  }
  note <- rep_len(note, length(sSet))
  headers <- ifelse(nzchar(note), paste(sNames, note), sNames)

  # The block boundaries are the same for all sequences
  iStarts <- seq(range[1], range[2], by = blockWidth)
  iEnds <- c((iStarts[-1] - 1), range[2])

  # Process each sequence: FASTA header, then the sequence in blocks of
  # blockWidth per line, then an empty line for readability.
  records <- lapply(seq_along(sSet),
                    function(i) {
                      thisSeq <- substring(sSet[i], iStarts, iEnds)
                      thisSeq <- thisSeq[nzchar(thisSeq)]  # drop empty blocks
                      c(sprintf(">%s", headers[i]), thisSeq, "")
                    })

  writeLines(unlist(records, use.names = FALSE), con = myCon)
}
# ==== TESTS =================================================================
# Enter your function tests here...
if (FALSE) {
# test ...
}
# [END]

View File

@ -1,384 +1,384 @@
# BLAST.R
#
# Purpose: Send off one BLAST search and return parsed list of results
# This script uses the BLAST URL-API
# (Application Programming Interface) at the NCBI.
# Read about the constraints here:
# https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
#
#
# Version: 3.2
# Date: 2016 09 - 2020 09
# Author: Boris Steipe
#
# Versions:
# 3.2 2020 updates
# 3.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 3.0 parsing logic had not been fully implemented; Fixed.
# 2.1 bugfix in BLAST(), bug was blanking non-split deflines;
# refactored parseBLASTalignment() to handle lists with multiple hits.
# 2.0 Completely rewritten because the interface completely changed.
# Code adapted in part from NCBI Perl sample code:
# $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
# 1.0 first version posted for BCH441 2016, based on BLAST - API
#
# ToDo: Return the organism/strain name in the output, and propagate
# into MYSPE selection script.
#
# Notes: This is somewhat pedestrian, but apparently there are currently
# no R packages that contain such code.
#
# ==============================================================================
# Dependency check: requireNamespace() reports whether the httr package can be
# loaded (without attaching it to the search path). If it cannot, install it.
# The code in this file calls httr functions with the httr:: prefix.
if (! requireNamespace("httr", quietly = TRUE)) {
install.packages("httr")
}
BLAST <- function(Q,
                  db = "refseq_protein",
                  nHits = 30,
                  E = 0.1,
                  limits = "",
                  rid = "",
                  query = "",
                  quietly = FALSE,
                  myTimeout = 120) {
  # Purpose:
  #     Basic BLAST search: submit a blastp query to the NCBI BLAST URL-API
  #     (or pick up an earlier search via its request ID), poll the server
  #     until the search has finished, then download the text report and
  #     parse every alignment block with parseBLASTalignment().
  #
  # Parameters:
  #     Q:         query - either a valid ID or a sequence. Only used when a
  #                new search is spawned, i.e. when rid is "".
  #     db:        "refseq_protein" by default,
  #                other legal values include: "nr", "pdb", "swissprot" ...
  #     nHits:     number of hits to maximally return
  #     E:         E-value cutoff. Do not return hits whose score would be
  #                expected to occur E or more times in a database of random
  #                sequence.
  #     limits:    a valid ENTREZ filter
  #     rid:       a request ID - to retrieve earlier search results
  #     query:     the actual query string (needed when retrieving results
  #                with an rid); stored in the result as-is
  #     quietly:   if TRUE, wait silently with Sys.sleep(); if FALSE, print
  #                status messages and show the waitTimer() progress bar
  #     myTimeout: how much longer _after_ rtoe to wait for a result
  #                before giving up (seconds)
  # Value:
  #     result: list of process status or resulting hits, and some metadata:
  #        $query  the value of the query parameter, or the complete
  #                submission URL if a new search was spawned
  #        $rid    request ID of the search
  #        $rtoe   expected completion time reported by the server (0 if
  #                results are retrieved via rid)
  #        $hits   list of parsed hits; only present if the search finished
  #                with hits and the report could be retrieved

  EXTRAWAIT <- 10  # duration of extra wait cycles if BLAST search is not done
  BLASTURL  <- "https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi"

  getText <- function(url, task) {
    # GET() url and return the response body as one string. Stop with an
    # error that names the failed task if the HTTP status is not a success.
    response <- httr::GET(url)
    status <- httr::http_status(response)
    if (status$category != "Success") {
      stop(sprintf("PANIC: Can't %s. BLAST server status error: %s",
                   task,
                   status$message))
    }
    return(httr::content(response, "text", encoding = "UTF-8"))
  }

  results <- list()
  results$query <- query
  results$rid <- rid
  results$rtoe <- 0

  if (rid == "") {  # If no rid is available, spawn a search.
                    # Else, proceed directly to retrieval.

    # prepare query, GET(), and parse rid and rtoe from BLAST server response
    results$query <- paste0(BLASTURL,
                            "?",
                            "CMD=Put",
                            "&PROGRAM=", "blastp",
                            "&QUERY=", URLencode(Q),
                            "&DATABASE=", db,
                            "&MATRIX=", "BLOSUM62",
                            "&EXPECT=", as.character(E),
                            "&HITLIST_SIZE=", as.character(nHits),
                            "&ALIGNMENTS=", as.character(nHits),
                            "&FORMAT_TYPE=Text")

    if (limits != "") {
      results$query <- paste0(results$query,
                              "&ENTREZ_QUERY=", limits)
    }

    # send it off ...
    txt <- getText(results$query, "send query")

    patt <- "RID = (\\w+)"   # match the request id
    results$rid <- regmatches(txt, regexec(patt, txt))[[1]][2]

    patt <- "RTOE = (\\d+)"  # match the expected completion time
    results$rtoe <- as.numeric(regmatches(txt, regexec(patt, txt))[[1]][2])

    # Without a request ID there is nothing we could poll for: fail early
    # instead of sending requests for "RID=NA".
    if (is.na(results$rid)) {
      stop("PANIC: BLAST server response did not contain a request ID (RID).")
    }
    # A missing RTOE is not fatal: the polling loop below waits in
    # EXTRAWAIT steps anyway. (Sys.sleep(NA) would throw an error.)
    if (is.na(results$rtoe)) {
      results$rtoe <- 0
    }

    # Now we wait ...
    if (quietly) {
      Sys.sleep(results$rtoe)
    } else {
      cat(sprintf("BLAST is processing %s:\n", results$rid))
      waitTimer(results$rtoe)
    }

  } # done sending query and retrieving rid, rtoe

  # Enter a loop to check for result availability. Every branch either
  # waits and polls again, returns, or breaks out to retrieve the result.
  checkStatus <- paste0(BLASTURL,
                        "?",
                        "CMD=Get",
                        "&RID=", results$rid,
                        "&FORMAT_TYPE=Text",
                        "&FORMAT_OBJECT=SearchInfo")

  while (TRUE) {
    # Check whether the result is ready
    txt <- getText(checkStatus, "check status")

    if (grepl("Status=WAITING", txt, fixed = TRUE)) {
      myTimeout <- myTimeout - EXTRAWAIT

      if (myTimeout <= 0) { # abort
        cat("BLAST search not concluded before timeout. Aborting.\n")
        cat(sprintf("%s BLASThits <- BLAST(rid=\"%s\")\n",
                    "Trying checking back later with >",
                    results$rid))
        return(results)
      }

      if (quietly) {
        Sys.sleep(EXTRAWAIT)
      } else {
        cat(sprintf("Status: Waiting. Wait %d more seconds (max. %d more)",
                    EXTRAWAIT,
                    myTimeout))
        waitTimer(EXTRAWAIT)
      }
      # ... then poll again

    } else if (grepl("Status=FAILED", txt, fixed = TRUE)) {
      cat("BLAST search returned status \"FAILED\". Aborting.\n")
      return(results)

    } else if (grepl("Status=UNKNOWN", txt, fixed = TRUE)) {
      cat("BLAST search returned status \"UNKNOWN\".\n")
      cat("This probably means the rid has expired. Aborting.\n")
      return(results)

    } else if (grepl("Status=READY", txt, fixed = TRUE)) { # Done
      if (! grepl("ThereAreHits=yes", txt, fixed = TRUE)) { # No hits
        cat("BLAST search ready but no hits found. Aborting.\n")
        return(results)
      }
      break # done ... retrieve search result

    } else {
      # None of the known status strings is present. Polling again right
      # away would loop forever without any delay, so give up instead.
      cat("BLAST server returned an unrecognized status. Aborting.\n")
      return(results)
    }
  } # end result-check loop

  # retrieve results from BLAST server
  retrieve <- paste0(BLASTURL,
                     "?",
                     "CMD=Get",
                     "&RID=", results$rid,
                     "&FORMAT_TYPE=Text")
  txt <- getText(retrieve, "retrieve")

  # txt contains the whole set of results. Process:

  # First, we strsplit() on linebreaks:
  txt <- unlist(strsplit(txt, "\n"))

  # The alignments range from the first line that begins with ">" ...
  iFirst <- grep("^>", txt)[1]
  # ... to the last line that begins with "Sbjct"
  x <- grep("^Sbjct", txt)

  # Guard: without both markers there is no alignment section to parse.
  if (is.na(iFirst) || length(x) == 0 || x[length(x)] < iFirst) {
    cat("BLAST result contains no parseable alignments. Aborting.\n")
    return(results)
  }
  iLast <- x[length(x)]

  # Get the alignments block
  txt <- txt[iFirst:iLast]

  # Drop empty lines
  txt <- txt[nchar(txt) > 0]

  # A line that ends "]" but does not begin ">" seems to be a split
  # defline ... eg.
  #  [1] ">XP_013349208.1 AUEXF2481DRAFT_695809 [Aureobasidium subglaciale "
  #  [2] "EXF-2481]"
  # Merge these lines to the preceding lines and delete them.
  #
  x <- which(grepl("]$", txt) & ! grepl("^>", txt))
  if (length(x) > 0) {
    txt[x - 1] <- paste0(txt[x - 1], txt[x])
    txt <- txt[-x]
  }

  # Special case: there may be multiple deflines when the BLAST hit is to
  # redundant, identical sequences. Keep only the first instance, i.e. drop
  # every defline whose preceding line is a defline too. All other lines,
  # including deflines that stand alone, are kept.
  isDef <- grepl("^>", txt)
  isRedundant <- isDef & c(FALSE, isDef[-length(isDef)])
  txt <- txt[! isRedundant]

  # After this preprocessing the following should be true:
  #  - Every alignment block begins with a defline in which the
  #    first character is ">"
  #  - There is only one defline in each block.
  #  - Lines are not split.

  # First and last indices of the alignment blocks: each block runs from its
  # defline to the line before the next defline (or to the end of txt).
  iStarts <- grep("^>", txt)
  iEnds <- c(iStarts[-1] - 1, length(txt))

  # Build the hits list by parsing the blocks
  results$hits <- lapply(seq_along(iStarts),
                         function(i) {
                           parseBLASTalignment(txt[iStarts[i]:iEnds[i]])
                         })

  return(results)
}
parseBLASTalignment <- function(hit) {
  # Purpose:
  #     Parse data from a character vector containing a BLAST hit
  # Parameters:
  #     hit  char   one BLAST hit as char vector; hit[1] is taken to be the
  #                 defline
  # Value:
  #     list  $def          chr   defline
  #           $accession    chr   accession number
  #           $organism     chr   complete organism definition (the text in
  #                               square brackets in the defline), NA if the
  #                               defline contains none
  #           $species      chr   binomial species, NA if $organism is NA
  #           $E            num   E value
  #           $lengthAli    num   length of the alignment
  #           $nIdentities  num   number of identities
  #           $nGaps        num   number of gaps
  #           $Qbounds      num   2-element vector of query start-end
  #           $Sbounds      num   2-element vector of subject start-end
  #           $Qseq         chr   query sequence
  #           $midSeq       chr   midline string
  #           $Sseq         chr   subject sequence
  #     Tokens that can not be found are returned as NA.

  getToken <- function(patt, v) {
    # get the first token identified by pattern patt in character vector v:
    # the contents of the first capture group in the first matching element,
    # or NA if no element of v matches.
    v <- grep(patt, v, value = TRUE)
    if (length(v) == 0) {
      return(NA)
    }
    return(regmatches(v[1], regexec(patt, v[1]))[[1]][2])
  }

  h <- list()

  # FASTA defline
  h$def <- hit[1]

  # accession number (ID), use the first if there are several, separated
  # by "|"
  patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
  h$accession <- regmatches(h$def, regexec(patt, h$def))[[1]][2]

  # organism
  patt <- "\\[(.+)]"
  h$organism <- regmatches(h$def, regexec(patt, h$def))[[1]][2]

  # species
  # If no organism was found we must not split it: strsplit() of NA yields a
  # single NA element, which would be pasted into the string "NA sp.".
  if (is.na(h$organism)) {
    h$species <- NA_character_
  } else {
    x <- unlist(strsplit(h$organism, "\\s+"))
    if (length(x) >= 2) {
      h$species <- paste(x[1], x[2])
    } else if (length(x) == 1) {
      h$species <- paste(x[1], "sp.")
    } else {
      h$species <- NA_character_
    }
  }

  # E-value
  h$E <- as.numeric(getToken("Expect\\s*=(.+?), Method", hit))

  # length of alignment
  h$lengthAli <- as.numeric(getToken("^\\s*Length\\s*=(.+)$", hit))

  # number of identities
  h$nIdentities <- as.numeric(getToken("^\\s*Identities\\s*=(.+?)/", hit))

  # number of gaps
  h$nGaps <- as.numeric(getToken("\\s*Gaps\\s*=(.+?)/", hit))

  # split up alignment section: every "Query" line is followed by its
  # midline and its "Sbjct" line
  idx <- grep("^Query ", hit)
  Que <- hit[idx]
  Mid <- hit[idx + 1]
  Sbj <- hit[idx + 2]

  # first and last positions
  h$Qbounds <- c(start = 0, end = 0)
  h$Qbounds[1] <- as.numeric(getToken("^Query\\s*(\\d+)", Que[1]))
  h$Qbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Que[length(Que)]))

  h$Sbounds <- c(start = 0, end = 0)
  h$Sbounds[1] <- as.numeric(getToken("^Sbjct\\s*(\\d+)", Sbj[1]))
  h$Sbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Sbj[length(Sbj)]))

  # aligned sequences
  # The columns occupied by the aligned string are determined from the Query
  # line; the same columns are then cut from the midline and the Sbjct line.
  patt <- "^\\s*Query\\s*\\d+\\s*([A-Za-z-]+)"  # capture aligned string
  for (i in seq_along(Que)) {
    m <- regexec(patt, Que[i])[[1]]
    iFirst <- m[2]
    iLast <- iFirst + attr(m, which = "match.length")[2] - 1
    Que[i] <- substring(Que[i], iFirst, iLast)
    Mid[i] <- substring(Mid[i], iFirst, iLast)
    Sbj[i] <- substring(Sbj[i], iFirst, iLast)
  }

  h$Qseq   <- paste0(Que, collapse = "")
  h$midSeq <- paste0(Mid, collapse = "")
  h$Sseq   <- paste0(Sbj, collapse = "")

  return(h)
}
# ==== TESTS ===================================================================
if (FALSE) {
  # Nothing in this block runs when the file is sourced: execute the lines
  # by hand. Note that running them sends a live search to the NCBI server.

  # The query can be a sequence - here: the Mbp1 APSES domain ...
  q <- paste0("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI",
              "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
              "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP")

  # ... or a RefSeq ID (this assignment replaces the sequence above).
  q <- "NP_010227"

  # Search with up to 100 hits, restricted to fungi (txid4751).
  test <- BLAST(q,
                nHits = 100,
                E = 0.001,
                rid = "",
                limits = "txid4751[ORGN]")

  # Inspect the result
  str(test)
  length(test$hits)
}
# [END]
# BLAST.R
#
# Purpose: Send off one BLAST search and return parsed list of results
# This script uses the BLAST URL-API
# (Application Programming Interface) at the NCBI.
# Read about the constraints here:
# https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
#
#
# Version: 3.2
# Date: 2016 09 - 2020 09
# Author: Boris Steipe
#
# Versions:
# 3.2 2020 updates
# 3.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 3.0 parsing logic had not been fully implemented; Fixed.
# 2.1 bugfix in BLAST(), bug was blanking non-split deflines;
# refactored parseBLASTalignment() to handle lists with multiple hits.
# 2.0 Completely rewritten because the interface completely changed.
# Code adapted in part from NCBI Perl sample code:
# $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
# 1.0 first version posted for BCH441 2016, based on BLAST - API
#
# ToDo: Return the organism/strain name in the output, and propagate
# into MYSPE selection script.
#
# Notes: This is somewhat pedestrian, but apparently there are currently
# no R packages that contain such code.
#
# ==============================================================================
# Dependency check: requireNamespace() reports whether the httr package can be
# loaded (without attaching it to the search path). If it cannot, install it.
# The code in this file calls httr functions with the httr:: prefix.
if (! requireNamespace("httr", quietly = TRUE)) {
install.packages("httr")
}
BLAST <- function(Q,
                  db = "refseq_protein",
                  nHits = 30,
                  E = 0.1,
                  limits = "",
                  rid = "",
                  query = "",
                  quietly = FALSE,
                  myTimeout = 120) {
  # Purpose:
  #     Basic BLAST search: submit a blastp query to the NCBI BLAST URL-API
  #     (or pick up an earlier search via its request ID), poll the server
  #     until the search has finished, then download the text report and
  #     parse every alignment block with parseBLASTalignment().
  #
  # Parameters:
  #     Q:         query - either a valid ID or a sequence. Only used when a
  #                new search is spawned, i.e. when rid is "".
  #     db:        "refseq_protein" by default,
  #                other legal values include: "nr", "pdb", "swissprot" ...
  #     nHits:     number of hits to maximally return
  #     E:         E-value cutoff. Do not return hits whose score would be
  #                expected to occur E or more times in a database of random
  #                sequence.
  #     limits:    a valid ENTREZ filter
  #     rid:       a request ID - to retrieve earlier search results
  #     query:     the actual query string (needed when retrieving results
  #                with an rid); stored in the result as-is
  #     quietly:   if TRUE, wait silently with Sys.sleep(); if FALSE, print
  #                status messages and show the waitTimer() progress bar
  #     myTimeout: how much longer _after_ rtoe to wait for a result
  #                before giving up (seconds)
  # Value:
  #     result: list of process status or resulting hits, and some metadata:
  #        $query  the value of the query parameter, or the complete
  #                submission URL if a new search was spawned
  #        $rid    request ID of the search
  #        $rtoe   expected completion time reported by the server (0 if
  #                results are retrieved via rid)
  #        $hits   list of parsed hits; only present if the search finished
  #                with hits and the report could be retrieved

  EXTRAWAIT <- 10  # duration of extra wait cycles if BLAST search is not done
  BLASTURL  <- "https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi"

  getText <- function(url, task) {
    # GET() url and return the response body as one string. Stop with an
    # error that names the failed task if the HTTP status is not a success.
    response <- httr::GET(url)
    status <- httr::http_status(response)
    if (status$category != "Success") {
      stop(sprintf("PANIC: Can't %s. BLAST server status error: %s",
                   task,
                   status$message))
    }
    return(httr::content(response, "text", encoding = "UTF-8"))
  }

  results <- list()
  results$query <- query
  results$rid <- rid
  results$rtoe <- 0

  if (rid == "") {  # If no rid is available, spawn a search.
                    # Else, proceed directly to retrieval.

    # prepare query, GET(), and parse rid and rtoe from BLAST server response
    results$query <- paste0(BLASTURL,
                            "?",
                            "CMD=Put",
                            "&PROGRAM=", "blastp",
                            "&QUERY=", URLencode(Q),
                            "&DATABASE=", db,
                            "&MATRIX=", "BLOSUM62",
                            "&EXPECT=", as.character(E),
                            "&HITLIST_SIZE=", as.character(nHits),
                            "&ALIGNMENTS=", as.character(nHits),
                            "&FORMAT_TYPE=Text")

    if (limits != "") {
      results$query <- paste0(results$query,
                              "&ENTREZ_QUERY=", limits)
    }

    # send it off ...
    txt <- getText(results$query, "send query")

    patt <- "RID = (\\w+)"   # match the request id
    results$rid <- regmatches(txt, regexec(patt, txt))[[1]][2]

    patt <- "RTOE = (\\d+)"  # match the expected completion time
    results$rtoe <- as.numeric(regmatches(txt, regexec(patt, txt))[[1]][2])

    # Without a request ID there is nothing we could poll for: fail early
    # instead of sending requests for "RID=NA".
    if (is.na(results$rid)) {
      stop("PANIC: BLAST server response did not contain a request ID (RID).")
    }
    # A missing RTOE is not fatal: the polling loop below waits in
    # EXTRAWAIT steps anyway. (Sys.sleep(NA) would throw an error.)
    if (is.na(results$rtoe)) {
      results$rtoe <- 0
    }

    # Now we wait ...
    if (quietly) {
      Sys.sleep(results$rtoe)
    } else {
      cat(sprintf("BLAST is processing %s:\n", results$rid))
      waitTimer(results$rtoe)
    }

  } # done sending query and retrieving rid, rtoe

  # Enter a loop to check for result availability. Every branch either
  # waits and polls again, returns, or breaks out to retrieve the result.
  checkStatus <- paste0(BLASTURL,
                        "?",
                        "CMD=Get",
                        "&RID=", results$rid,
                        "&FORMAT_TYPE=Text",
                        "&FORMAT_OBJECT=SearchInfo")

  while (TRUE) {
    # Check whether the result is ready
    txt <- getText(checkStatus, "check status")

    if (grepl("Status=WAITING", txt, fixed = TRUE)) {
      myTimeout <- myTimeout - EXTRAWAIT

      if (myTimeout <= 0) { # abort
        cat("BLAST search not concluded before timeout. Aborting.\n")
        cat(sprintf("%s BLASThits <- BLAST(rid=\"%s\")\n",
                    "Trying checking back later with >",
                    results$rid))
        return(results)
      }

      if (quietly) {
        Sys.sleep(EXTRAWAIT)
      } else {
        cat(sprintf("Status: Waiting. Wait %d more seconds (max. %d more)",
                    EXTRAWAIT,
                    myTimeout))
        waitTimer(EXTRAWAIT)
      }
      # ... then poll again

    } else if (grepl("Status=FAILED", txt, fixed = TRUE)) {
      cat("BLAST search returned status \"FAILED\". Aborting.\n")
      return(results)

    } else if (grepl("Status=UNKNOWN", txt, fixed = TRUE)) {
      cat("BLAST search returned status \"UNKNOWN\".\n")
      cat("This probably means the rid has expired. Aborting.\n")
      return(results)

    } else if (grepl("Status=READY", txt, fixed = TRUE)) { # Done
      if (! grepl("ThereAreHits=yes", txt, fixed = TRUE)) { # No hits
        cat("BLAST search ready but no hits found. Aborting.\n")
        return(results)
      }
      break # done ... retrieve search result

    } else {
      # None of the known status strings is present. Polling again right
      # away would loop forever without any delay, so give up instead.
      cat("BLAST server returned an unrecognized status. Aborting.\n")
      return(results)
    }
  } # end result-check loop

  # retrieve results from BLAST server
  retrieve <- paste0(BLASTURL,
                     "?",
                     "CMD=Get",
                     "&RID=", results$rid,
                     "&FORMAT_TYPE=Text")
  txt <- getText(retrieve, "retrieve")

  # txt contains the whole set of results. Process:

  # First, we strsplit() on linebreaks:
  txt <- unlist(strsplit(txt, "\n"))

  # The alignments range from the first line that begins with ">" ...
  iFirst <- grep("^>", txt)[1]
  # ... to the last line that begins with "Sbjct"
  x <- grep("^Sbjct", txt)

  # Guard: without both markers there is no alignment section to parse.
  if (is.na(iFirst) || length(x) == 0 || x[length(x)] < iFirst) {
    cat("BLAST result contains no parseable alignments. Aborting.\n")
    return(results)
  }
  iLast <- x[length(x)]

  # Get the alignments block
  txt <- txt[iFirst:iLast]

  # Drop empty lines
  txt <- txt[nchar(txt) > 0]

  # A line that ends "]" but does not begin ">" seems to be a split
  # defline ... eg.
  #  [1] ">XP_013349208.1 AUEXF2481DRAFT_695809 [Aureobasidium subglaciale "
  #  [2] "EXF-2481]"
  # Merge these lines to the preceding lines and delete them.
  #
  x <- which(grepl("]$", txt) & ! grepl("^>", txt))
  if (length(x) > 0) {
    txt[x - 1] <- paste0(txt[x - 1], txt[x])
    txt <- txt[-x]
  }

  # Special case: there may be multiple deflines when the BLAST hit is to
  # redundant, identical sequences. Keep only the first instance, i.e. drop
  # every defline whose preceding line is a defline too. All other lines,
  # including deflines that stand alone, are kept.
  isDef <- grepl("^>", txt)
  isRedundant <- isDef & c(FALSE, isDef[-length(isDef)])
  txt <- txt[! isRedundant]

  # After this preprocessing the following should be true:
  #  - Every alignment block begins with a defline in which the
  #    first character is ">"
  #  - There is only one defline in each block.
  #  - Lines are not split.

  # First and last indices of the alignment blocks: each block runs from its
  # defline to the line before the next defline (or to the end of txt).
  iStarts <- grep("^>", txt)
  iEnds <- c(iStarts[-1] - 1, length(txt))

  # Build the hits list by parsing the blocks
  results$hits <- lapply(seq_along(iStarts),
                         function(i) {
                           parseBLASTalignment(txt[iStarts[i]:iEnds[i]])
                         })

  return(results)
}
parseBLASTalignment <- function(hit) {
  # Purpose:
  #     Parse data from a character vector containing a BLAST hit
  # Parameters:
  #     hit  char   one BLAST hit as char vector; hit[1] is taken to be the
  #                 defline
  # Value:
  #     list  $def          chr   defline
  #           $accession    chr   accession number
  #           $organism     chr   complete organism definition (the text in
  #                               square brackets in the defline), NA if the
  #                               defline contains none
  #           $species      chr   binomial species, NA if $organism is NA
  #           $E            num   E value
  #           $lengthAli    num   length of the alignment
  #           $nIdentities  num   number of identities
  #           $nGaps        num   number of gaps
  #           $Qbounds      num   2-element vector of query start-end
  #           $Sbounds      num   2-element vector of subject start-end
  #           $Qseq         chr   query sequence
  #           $midSeq       chr   midline string
  #           $Sseq         chr   subject sequence
  #     Tokens that can not be found are returned as NA.

  getToken <- function(patt, v) {
    # get the first token identified by pattern patt in character vector v:
    # the contents of the first capture group in the first matching element,
    # or NA if no element of v matches.
    v <- grep(patt, v, value = TRUE)
    if (length(v) == 0) {
      return(NA)
    }
    return(regmatches(v[1], regexec(patt, v[1]))[[1]][2])
  }

  h <- list()

  # FASTA defline
  h$def <- hit[1]

  # accession number (ID), use the first if there are several, separated
  # by "|"
  patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
  h$accession <- regmatches(h$def, regexec(patt, h$def))[[1]][2]

  # organism
  patt <- "\\[(.+)]"
  h$organism <- regmatches(h$def, regexec(patt, h$def))[[1]][2]

  # species
  # If no organism was found we must not split it: strsplit() of NA yields a
  # single NA element, which would be pasted into the string "NA sp.".
  if (is.na(h$organism)) {
    h$species <- NA_character_
  } else {
    x <- unlist(strsplit(h$organism, "\\s+"))
    if (length(x) >= 2) {
      h$species <- paste(x[1], x[2])
    } else if (length(x) == 1) {
      h$species <- paste(x[1], "sp.")
    } else {
      h$species <- NA_character_
    }
  }

  # E-value
  h$E <- as.numeric(getToken("Expect\\s*=(.+?), Method", hit))

  # length of alignment
  h$lengthAli <- as.numeric(getToken("^\\s*Length\\s*=(.+)$", hit))

  # number of identities
  h$nIdentities <- as.numeric(getToken("^\\s*Identities\\s*=(.+?)/", hit))

  # number of gaps
  h$nGaps <- as.numeric(getToken("\\s*Gaps\\s*=(.+?)/", hit))

  # split up alignment section: every "Query" line is followed by its
  # midline and its "Sbjct" line
  idx <- grep("^Query ", hit)
  Que <- hit[idx]
  Mid <- hit[idx + 1]
  Sbj <- hit[idx + 2]

  # first and last positions
  h$Qbounds <- c(start = 0, end = 0)
  h$Qbounds[1] <- as.numeric(getToken("^Query\\s*(\\d+)", Que[1]))
  h$Qbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Que[length(Que)]))

  h$Sbounds <- c(start = 0, end = 0)
  h$Sbounds[1] <- as.numeric(getToken("^Sbjct\\s*(\\d+)", Sbj[1]))
  h$Sbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Sbj[length(Sbj)]))

  # aligned sequences
  # The columns occupied by the aligned string are determined from the Query
  # line; the same columns are then cut from the midline and the Sbjct line.
  patt <- "^\\s*Query\\s*\\d+\\s*([A-Za-z-]+)"  # capture aligned string
  for (i in seq_along(Que)) {
    m <- regexec(patt, Que[i])[[1]]
    iFirst <- m[2]
    iLast <- iFirst + attr(m, which = "match.length")[2] - 1
    Que[i] <- substring(Que[i], iFirst, iLast)
    Mid[i] <- substring(Mid[i], iFirst, iLast)
    Sbj[i] <- substring(Sbj[i], iFirst, iLast)
  }

  h$Qseq   <- paste0(Que, collapse = "")
  h$midSeq <- paste0(Mid, collapse = "")
  h$Sseq   <- paste0(Sbj, collapse = "")

  return(h)
}
# ==== TESTS ===================================================================
if (FALSE) {
  # Nothing in this block runs when the file is sourced: execute the lines
  # by hand. Note that running them sends a live search to the NCBI server.

  # The query can be a sequence - here: the Mbp1 APSES domain ...
  q <- paste0("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI",
              "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
              "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP")

  # ... or a RefSeq ID (this assignment replaces the sequence above).
  q <- "NP_010227"

  # Search with up to 100 hits, restricted to fungi (txid4751).
  test <- BLAST(q,
                nHits = 100,
                E = 0.001,
                rid = "",
                limits = "txid4751[ORGN]")

  # Inspect the result
  str(test)
  length(test$hits)
}
# [END]

View File

@ -1,32 +1,32 @@
# test_biCode.R
#
# testthat test script for biCode(). The expectations below show the contract
# that is being tested: a character vector of names goes in, a vector of
# five-character upper-case codes comes out.
context("biCode() utility function tests") # A set of tests for some
# functionality
# Regular names: the expected codes combine the first three letters of the
# first word with the first two letters of the second word; square brackets
# and additional words do not change the expected code, and the input may be
# a vector.
test_that("expected input is processed correctly", { # Related expectations
expect_equal(biCode("homo sapiens"), "HOMSA")
expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA")
expect_equal(biCode(c("Phascolarctos cinereus", "Macropus rufus")),
c("PHACI", "MACRU"))
})
# Degenerate names: positions for which the input has no letters are
# expected to be filled with "."
test_that("unexpected input is managed", {
expect_equal(biCode(""), ".....")
expect_equal(biCode(" "), ".....")
expect_equal(biCode("123 12"), ".....")
expect_equal(biCode("h sapiens"), "H..SA")
})
# NA is expected to stay NA, on its own and as an element of a longer vector.
test_that("NA values are preserved", {
expect_true(is.na((biCode(NA))))
expect_equal(biCode(c("first", NA, "last")),
c("FIRST", NA, "LAST."))
})
# Calling biCode() without its argument is expected to raise R's standard
# missing-argument error; the message shows that the parameter is named "s".
test_that("Missing argument throws an error", {
expect_error(biCode(), "argument \"s\" is missing, with no default")
})
# [END]
# test_biCode.R
#
# testthat test script for biCode(). The expectations below show the contract
# that is being tested: a character vector of names goes in, a vector of
# five-character upper-case codes comes out.
context("biCode() utility function tests") # A set of tests for some
# functionality
# Regular names: the expected codes combine the first three letters of the
# first word with the first two letters of the second word; square brackets
# and additional words do not change the expected code, and the input may be
# a vector.
test_that("expected input is processed correctly", { # Related expectations
expect_equal(biCode("homo sapiens"), "HOMSA")
expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA")
expect_equal(biCode(c("Phascolarctos cinereus", "Macropus rufus")),
c("PHACI", "MACRU"))
})
# Degenerate names: positions for which the input has no letters are
# expected to be filled with "."
test_that("unexpected input is managed", {
expect_equal(biCode(""), ".....")
expect_equal(biCode(" "), ".....")
expect_equal(biCode("123 12"), ".....")
expect_equal(biCode("h sapiens"), "H..SA")
})
# NA is expected to stay NA, on its own and as an element of a longer vector.
test_that("NA values are preserved", {
expect_true(is.na((biCode(NA))))
expect_equal(biCode(c("first", NA, "last")),
c("FIRST", NA, "LAST."))
})
# Calling biCode() without its argument is expected to raise R's standard
# missing-argument error; the message shows that the parameter is named "s".
test_that("Missing argument throws an error", {
expect_error(biCode(), "argument \"s\" is missing, with no default")
})
# [END]