Line termination change and old code.

This commit is contained in:
Harrison Deng 2021-11-16 00:31:48 -05:00
parent b1e00f52f7
commit affe00f6fb
86 changed files with 37873 additions and 37876 deletions

258
.Rprofile
View File

@ -1,129 +1,129 @@
# .Rprofile # .Rprofile
# #
# This script is automatically executed on startup # This script is automatically executed on startup
# ============================================================================== # ==============================================================================
# init()
#
# First-run helper, called by the user from the console. Creates a personal
# scratch script from the template (if needed), prints the instructions for
# setting up the local profile, and opens the course's starting script.
# Returns NULL invisibly.
init <- function() {
  # Make a local copy of the template, but only if the user does not
  # already have a "myScript.R" and the template is actually present.
  needsCopy <- ! file.exists("myScript.R") && file.exists(".tmp.R")
  if (needsCopy) {
    file.copy(".tmp.R", "myScript.R")
    cat("A new file \"myScript.R\" was created. You can use it for\n",
        "notes and code experiments.\n\n",
        sep = "")
  }

  # Instructions for editing and moving the profile file.
  cat("\n\n",
      "Please open the file \".myProfile.R\" (click on the file-name in the\n",
      "\"files\" pane), edit it and save it.\n",
      "Then click the checkbox, and use the More -> Move... dialogue\n",
      "to move it into the \"myScripts\" folder.\n\n",
      sep = "")

  file.edit("ABC-units.R")
  invisible(NULL)
}
# Startup check: greet first-time users, otherwise load the utilities and the
# local profile and validate what the profile defined.
if (! file.exists("./myScripts/.myProfile.R")) {
  # No local profile yet: print the welcome banner and point to init().
  cat("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")
  cat(" =================")
  cat("\n\n")
  cat(" WELCOME !\n")
  cat("\n")
  cat(" Type 'init()' to begin\n\n")
  cat("\n")
  cat(" =================")
  cat("\n\n")
} else {  # local profile exists ... validate state:
  cat("\n\nLoading local functions ...")
  source(".utilities.R")  # local profile appears sane, source utilities
  source("./myScripts/.myProfile.R")

  if (! exists("myEMail")) {  # ... has eMail been defined?
    cat("ERROR !\n")
    cat("=======\n")
    cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
    cat("the variable \"myEMail\" was not loaded.\n")
    cat("Please contact your instructor to continue.\n\n")
  }

  if (! exists("myStudentNumber")) {  # ... has the Student Number been defined?
    cat("ERROR !\n")
    cat("=======\n")
    cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
    cat("the variable \"myStudentNumber\" was not loaded.\n")
    cat("Please contact your instructor to continue.\n\n")
  } else {
    # Everything below reads myStudentNumber, so it must only run if the
    # variable exists - otherwise the profile would abort with an error
    # instead of showing the message above.

    # Is the Student Number valid? The whole string must be "100" or "99"
    # followed by exactly seven more characters. The alternation is grouped
    # so that both anchors apply to both alternatives.
    if (! grepl("^(100.{7}|99.{7})$", as.character(myStudentNumber))) {
      cat("ERROR !\n")
      cat("=======\n")
      cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
      cat("your Student Number could not be validated.\n")
      cat("Please examine the file \"./myScripts/.myProfile.R\"\n")
      cat(" and fix the problem or contact your instructor to continue.\n\n")
    }

    if (! exists("MYSPE")) {  # if MYSPE has not yet been defined, define it now
                              # ... and write it into the profile.
      # getMYSPE() is not defined in this file; presumably it is provided by
      # ".utilities.R", which was sourced above.
      MYSPE <- getMYSPE(myStudentNumber)  # ... define it for immediate use

      prf <- readLines("./myScripts/.myProfile.R")
      iEmail <- grep("^\\s*myStudentNumber\\s*<-", prf)
      if (length(iEmail) == 0) {
        # No recognizable assignment line (e.g. "=" was used): append at the
        # end of the file.
        iEmail <- length(prf)
      } else {
        # Several matches: insert after the last one.
        iEmail <- iEmail[length(iEmail)]
      }
      # append() also works when the insertion point is the last line.
      out <- append(prf, sprintf("MYSPE <- \"%s\" ", MYSPE), after = iEmail)
      writeLines(out, "./myScripts/.myProfile.R")
      cat("\n")
      cat(sprintf("MYSPE (%s) was added to \"./myScripts/.myProfile.R\"\n\n",
                  MYSPE))
      rm(prf, iEmail, out)  # cleanup
    }
  }
  cat("... done.\n\n")
}
# Warn if data frames would convert strings to factors by default.
# default.stringsAsFactors() is deprecated in R 4.x and can no longer be
# called in recent releases. An unguarded call would abort the rest of this
# profile, so a deprecation warning is silenced and an error is treated as
# "not TRUE".
if (isTRUE(tryCatch(suppressWarnings(default.stringsAsFactors()),
                    error = function(e) FALSE))) {
  cat("WARNING.\n")
  cat("========\n")
  cat("Your default \"stringsAsFactors\" parameter is set to \"TRUE\".\n")
  cat("This will break some of the code.\n")
  cat("Please contact your instructor to troubleshoot and fix this issue.\n")
  cat("\n")
}
# errText: named list of multi-line help messages, keyed by problem type.
# Where these messages are displayed is not visible in this file.
errText <- list()

# Shown when "./myScripts/.myProfile.R" is missing.
errText[["noProfileFile"]] <- '
Your PROFILE FILE does not exist. This problem must be fixed to continue.
The code expects the file "./myScripts/.myProfile.R" to exist and to
contain your correct eMail address and student number. Detailed
instructions were given when you first ran the init() command.
Try running init() again and follow the instructions. Reload your RStudio
session and start over with this file.
If this does not fix the problem, ask for help.
'

# Shown when the profile was sourced but myStudentNumber is not defined.
errText[["noStudentNumber"]] <- '
Your STUDENT NUMBER has not been defined. This problem must be fixed to continue.
The code expects the file "./myScripts/.myProfile.R" to exist and to
contain your correct eMail address and student number. This file gets
sourced when you start a new R-session, but since you see this error
message there was a problem.
Perhaps you need to restart your R-session. Try closing the RStudio
project and reopening it from the File > Recent Projects menu.
Perhaps there was a syntax error in your file. Then not all the
instructions in the file are executed. Check the file: is your
email perhaps not defined? Or did you type it without quotation
marks?
Try fixing problems, and then restart R as described above.
If none of this fixes the problem, ask for help.
'
# [END] # [END]

88
.gitignore vendored
View File

@ -1,44 +1,44 @@
# Miscellaneous # Miscellaneous
.Ds_store .Ds_store
instructor/ instructor/
dev/ dev/
# myScripts/ # We don't want to ignore this so we can save our work to our own fork. # myScripts/ # We don't want to ignore this so we can save our work to our own fork.
# History files # History files
.Rhistory .Rhistory
.Rapp.history .Rapp.history
# Session Data files # Session Data files
# .RData # .RData
# Files produced in assignments # Files produced in assignments
data/APSESphyloSet.mfa data/APSESphyloSet.mfa
data/APSEStreeRproml.rds data/APSEStreeRproml.rds
# Example code in package build process # Example code in package build process
*-Ex.R *-Ex.R
# Output files from R CMD build # Output files from R CMD build
/*.tar.gz /*.tar.gz
# Output files from R CMD check # Output files from R CMD check
/*.Rcheck/ /*.Rcheck/
# RStudio files # RStudio files
.Rproj.user/ .Rproj.user/
# produced vignettes # produced vignettes
vignettes/*.html vignettes/*.html
vignettes/*.pdf vignettes/*.pdf
# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
.httr-oauth .httr-oauth
# knitr and R markdown default cache directories # knitr and R markdown default cache directories
/*_cache/ /*_cache/
/cache/ /cache/
# Temporary files created by R markdown # Temporary files created by R markdown
*.utf8.md *.utf8.md
*.knit.md *.knit.md
.Rproj.user .Rproj.user

76
.tmp.R
View File

@ -1,38 +1,38 @@
# myScript.R # myScript.R
# #
# --- As you work with this file, you can delete the instructions below -------- # --- As you work with this file, you can delete the instructions below --------
# Write your notes and code experiments into this document. Save it # Write your notes and code experiments into this document. Save it
# from time to time - however I recommend that you do not _commit_ # from time to time - however I recommend that you do not _commit_
# your saved version. # your saved version.
# #
# As long as you do not _commit_ this script to version control, # As long as you do not _commit_ this script to version control,
# you can _pull_ updated versions of the entire project from GitHub # you can _pull_ updated versions of the entire project from GitHub
# by using the RStudio version control interface. However, once # by using the RStudio version control interface. However, once
# you _commit_ any file in your local version, RStudio will require # you _commit_ any file in your local version, RStudio will require
# you to resolve conflicts before you can _pull_ updates. # you to resolve conflicts before you can _pull_ updates.
# --- As you work with this file, you can delete the instructions above -------- # --- As you work with this file, you can delete the instructions above --------
# #
## Purpose: <...> ## Purpose: <...>
# #
# Version: <...> # Version: <...>
# #
# Date: <...> # Date: <...>
# Author: <Name> (<namee@mail.utoronto.ca>) # Author: <Name> (<namee@mail.utoronto.ca>)
# #
# Versions: # Versions:
# #
# <number> <Features> # <number> <Features>
# #
# TODO: # TODO:
# <...> # <...>
# #
# ==================================================================== # ====================================================================
# [END] # [END]

File diff suppressed because it is too large Load Diff

View File

@ -1,257 +1,257 @@
# 2021-10-12_In-Class_exploration.R # 2021-10-12_In-Class_exploration.R
# #
# ===== T H E E V E N B E T T E R A M I N O A C I D ===== # ===== T H E E V E N B E T T E R A M I N O A C I D =====
# #
# Code and comments for BCH441 in-class exploration, Tuesday, 2021-10-12 # Code and comments for BCH441 in-class exploration, Tuesday, 2021-10-12
# Explorers: Jocelyn Nurtanto, Yuzi Li, and Jerry Gu # Explorers: Jocelyn Nurtanto, Yuzi Li, and Jerry Gu
# Scribe: boris.steipe@utoronto.ca # Scribe: boris.steipe@utoronto.ca
# #
# ============================================================================== # ==============================================================================
# #
# In our last session we explored some properties of amino acids and noted that # In our last session we explored some properties of amino acids and noted that
# we can arrange them in a scatter-plot according to some properties. But can # we can arrange them in a scatter-plot according to some properties. But can
# we also arrange them according to generic properties, i.e. taking all # we also arrange them according to generic properties, i.e. taking all
# published property scales into account? We will try to use all tables from # published property scales into account? We will try to use all tables from
# the seqinr package. # the seqinr package.
# First we load the package - this makes all datasets immediately available and # First we load the package - this makes all datasets immediately available and
# we don't have to load them one by one. # we don't have to load them one by one.
library(seqinr) library(seqinr)
# Determine what datasets are available # Determine what datasets are available
# #
# Using "find in topic" ... "amino acid" # Using "find in topic" ... "amino acid"
data(aacost) data(aacost)
data(aaindex) data(aaindex)
data(pK) data(pK)
# We note that datasets may be sorted in different ways: for example # We note that datasets may be sorted in different ways: for example
# alphabetically by one letter code (A, C, D, E, ...) or three-letter code (Ala, # alphabetically by one letter code (A, C, D, E, ...) or three-letter code (Ala,
# Arg, Asn, Asp, ...) - this means we need to ensure and validate that amino # Arg, Asn, Asp, ...) - this means we need to ensure and validate that amino
# acids are sorted in the same way. # acids are sorted in the same way.
# Build a datastructure ... # Build a datastructure ...
# rows: amino acids # rows: amino acids
# columns: properties # columns: properties
# Are all lists in aaindex organized in the same way? # Are all lists in aaindex organized in the same way?
refNames <- names(aaindex[[1]]$I)  # Take the names of the first list item's
                                   # index as a reference list

# Loop over each list in aaindex
for (i in seq_along(aaindex)) {
  # get the I-vector
  x <- aaindex[[i]]$I
  # get the names
  x <- names(x)
  # compare with the names of our reference list
  # the == and != operators are vectorized. Applying them to two vectors
  # gives TRUE or FALSE for each pair of elements. any() or all() can be
  # applied to logical vectors to analyse them and return a single result.
  # if (...) conditions evaluate only a single value and will complain if
  # there is more than one.
  # The lengths are compared first: vectors of different length would be
  # recycled by !=, and an index without names would otherwise pass silently.
  if (length(x) != length(refNames) || any(x != refNames)) {
    # There was at least one not-equal pair - so: complain
    print(sprintf("Problem in list %d: names don't match", i))
  }
}
# If we get here without identifying problems, it means all pairs of
# names match throughout the aaindex list.
# Next: what is the correct syntax to add one vector (the "I" vector of
# one of the list elements) to our dataframe?

# Start the dataframe with the first index ...
aaData <- as.data.frame(aaindex[[1]]$I)
# ... and add the second index as a new column
aaData[ , 2] <- aaindex[[2]]$I
# Confirm: we now have a two-column dataframe
str(aaData)

# Next: add the rest, one column per remaining index
for (i in seq(3, length(aaindex))) {
  aaData[ , i] <- aaindex[[i]]$I  # the I-vector becomes column i
}

# Sanity check: plot two arbitrary indices against each other
plot(aaData[ , 37], aaData[ , 544])
# Looks good.
# We finished building our data structure ... but let's add the aacost table

# aacost is ordered differently:
rownames(aaData)
aacost[ , 1]

# using order(), applied to aacost - ordering the column with column-name
# "aaa"
sel <- order(aacost[ , "aaa"])  # alphabetic ordering of three-letter codes
aacost[sel, "aaa"]              # applying the order vector sorts the column

# Is this the same order as refNames? Stop right here if it is not: adding
# the column anyway would silently attach costs to the wrong amino acids.
stopifnot(length(sel) == length(refNames),
          all(refNames == as.character(aacost[sel, "aaa"])))

# add the data from column "tot" (i.e. total metabolic cost) after the
# last column of aaData
aaData[ , length(aaindex) + 1] <- aacost[sel, "tot"]

# Done.
str(aaData)  # A dataframe with 20 rows and 545 columns
# To answer the question "Which amino acids are similar to each other?" we # To answer the question "Which amino acids are similar to each other?" we
# need to reduce this 545-dimensional dataset to fewer dimensions, otherwise # need to reduce this 545-dimensional dataset to fewer dimensions, otherwise
# we will succumb to the "Curse of Dimensionality": # we will succumb to the "Curse of Dimensionality":
# #
# "in high dimensional data, however, all objects appear # "in high dimensional data, however, all objects appear
# to be sparse and dissimilar in many ways..." # to be sparse and dissimilar in many ways..."
# https://en.wikipedia.org/wiki/Curse_of_dimensionality # https://en.wikipedia.org/wiki/Curse_of_dimensionality
# #
# A classic way to do this is Principal Component Analysis (PCA) ... # A classic way to do this is Principal Component Analysis (PCA) ...
# (Principal components analysis) # (Principal components analysis)
# #
# prcomp() treats the rows of its input as observations and the columns as
# variables. Here the amino acids are meant to be the variables (they then
# appear as the rows of the rotation matrix), therefore we need to transpose
# our dataset:
try(aaPCA <- prcomp(t(aaData)))
# This creates an error, because some of our indices contain NA values!
# (try() reports the error but lets a sourced script continue.)
# Which indices are these?

# We create a vector "sel" that holds one value for each column: FALSE if the
# column contains any NA, TRUE otherwise. We can then use this vector to
# subset our dataframe. vapply() applies anyNA() to each column and guarantees
# one logical value per column.
sel <- !vapply(aaData, anyNA, logical(1), USE.NAMES = FALSE)

# Done. sel now subsets only the NA-free columns
ncol(aaData) - sum(sel)  # 13 columns excluded
# Do the PCA ... use the prcomp() function
aaPCA <- prcomp(t(aaData[ , sel]))  # PCA of the transposed, selected data set
str(aaPCA)                          # structure of the result
plot(aaPCA)                         # plot the contributions of the
                                    # components to the variance

plot(aaPCA$rotation[ , 1],          # plot the first PC against the second PC
     aaPCA$rotation[ , 2],          # in a scatterplot, in an empty frame
     type = "n")                    # just to set up the coordinate system
text(aaPCA$rotation[ , 1],          # plot the names of the amino acids into
     aaPCA$rotation[ , 2],          # their respective (PC1, PC2) positions
     labels = rownames(aaPCA$rotation))

# PCA results are sensitive to the absolute numeric values that we are
# comparing. The prcomp() function has an option scale. = TRUE that rescales
# each variable - i.e. each column of its input - to a variance of 1.0, so
# that every variable is given approximately equal weight.
# NOTE(review): after t() the columns are the amino acids, so this
# standardizes each amino acid across the indices. If the intent is to give
# each property scale equal weight, the indices would have to be standardized
# before transposing - confirm.
aaPCA <- prcomp(t(aaData[ , sel]), scale. = TRUE)
plot(aaPCA)

plot(aaPCA$rotation[ , 1],
     aaPCA$rotation[ , 2],
     type = "n")
text(aaPCA$rotation[ , 1],
     aaPCA$rotation[ , 2],
     labels = rownames(aaPCA$rotation))
# Next we try to identify what the PCs correspond to. We see whether there are
# specific features that are highly correlated with the PCs

# ==== Rotation 1 ===================
#
(PC1 <- aaPCA$rotation[ , 1])  # Assign PC1 (and print it)

# The function cor() calculates Pearson coefficients of correlation
cor(PC1, aaData[ , 37])        # e.g. correlate PC1 against index 37

# Calculate the correlation of PC1 with every column of aaData. vapply()
# returns a numeric vector with one value per column; columns that contain
# NA values give NA.
cors <- vapply(seq_len(ncol(aaData)),
               function(i) cor(PC1, aaData[ , i]),
               numeric(1))
summary(cors)
#     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's
# -0.54072 -0.13703  0.05654  0.03729  0.21349  0.59589       13
#
# The max correlation is ~0.6. That is not very high. Which index is it?
which(cors == max(cors, na.rm = TRUE))
aaindex[[504]]                 # Linker propensity ???
cor(PC1, aaindex[[504]]$I)     # Did we get the right index?

# Plot this ...
plot(aaPCA$rotation[ , 1],
     aaindex[[504]]$I,
     type = "n")
text(aaPCA$rotation[ , 1],
     aaindex[[504]]$I,
     labels = rownames(aaPCA$rotation))
# This is essentially a random correlation but for Cysteine ...
# ==== Rotation 2 ===================
#
# same process
PC2 <- aaPCA$rotation[ , 2]

# Correlation of PC2 with every column of aaData; one value per column,
# NA for columns that contain NA values.
cors2 <- vapply(seq_len(ncol(aaData)),
                function(i) cor(PC2, aaData[ , i]),
                numeric(1))
summary(cors2)
#     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's
# -0.95214 -0.56067 -0.12817 -0.05787  0.43046  0.94346       13

# Here we have quite strong correlations
which(cors2 == max(cors2, na.rm = TRUE))
aaindex[[148]]
# this index itself is correlated with many other indices
cor(PC2, aaindex[[148]]$I)     # confirm that we have the right index

# Plot this too...
plot(aaPCA$rotation[ , 2],
     aaindex[[148]]$I,
     type = "n")
text(aaPCA$rotation[ , 2],
     aaindex[[148]]$I,
     labels = rownames(aaPCA$rotation))

# This correlates well with hydrophobicity measures. In this case the
# PC is to a certain degree interpretable - but this is not always the case
# with PCA (see the example of the first PC).
# [END] # [END]

View File

@ -1,161 +1,161 @@
# tocID <- "ABC-Install_all_packages.R" # tocID <- "ABC-Install_all_packages.R"
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# Installing all packages in this course # Installing all packages in this course
# #
# Version: 1.0 # Version: 1.0
# #
# Date: 2021 10 # Date: 2021 10
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.0 New code # 1.0 New code
# #
# #
# TODO: # TODO:
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> ---------------------------------------------- #TOC> ----------------------------------------------
#TOC> 1 Packages 33 #TOC> 1 Packages 33
#TOC> 2 CRAN packages 98 #TOC> 2 CRAN packages 98
#TOC> 3 Bioconductor packages 127 #TOC> 3 Bioconductor packages 127
#TOC> 4 Other package sources 142 #TOC> 4 Other package sources 142
#TOC> 5 Updating packages 148 #TOC> 5 Updating packages 148
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 Packages ============================================================ # = 1 Packages ============================================================
# Much of R's functionality is contributed in packages: bundles of R scripts # Much of R's functionality is contributed in packages: bundles of R scripts
# or code in other languages, pre-configured objects, and datasets. Making this # or code in other languages, pre-configured objects, and datasets. Making this
# functionality available is often done by issuing a library(<package-name>) # functionality available is often done by issuing a library(<package-name>)
# command, however this is not the preferred way, since it may override other # command, however this is not the preferred way, since it may override other
# R functions and it makes it harder to understand where the source code of # R functions and it makes it harder to understand where the source code of
# a particular function is located. In this course we call the function name # a particular function is located. In this course we call the function name
# prefixed with the package name and two colons: # prefixed with the package name and two colons:
# <package-name>::<function-name>() # <package-name>::<function-name>()
# This is the preferred way, since it is explicit. # This is the preferred way, since it is explicit.
# #
# Regardless of which idiom one uses to call the actual function, the package # Regardless of which idiom one uses to call the actual function, the package
# needs to be "installed" first, i.e. the code must have been downloaded # needs to be "installed" first, i.e. the code must have been downloaded
# from CRAN, or using the BiocManager::install() function. # from CRAN, or using the BiocManager::install() function.
# #
# This script contains download commands for all packages that are used in the # This script contains download commands for all packages that are used in the
# course. You can execute the script line by line (or even source the entire # course. You can execute the script line by line (or even source the entire
# script) to make sure all packages can be installed on your computer. Just # script) to make sure all packages can be installed on your computer. Just
# one reminder: if you are ever asked to install from source, the correct # one reminder: if you are ever asked to install from source, the correct
# answer is usually "no" - except if you really know what you are doing and why. # answer is usually "no" - except if you really know what you are doing and why.
# #
# Once packages are installed you can get additional information about # Once packages are installed you can get additional information about
# the contents of a package with the commands: # the contents of a package with the commands:
# library(help=<package-name>) # basic information # library(help=<package-name>) # basic information
# browseVignettes("<package-name>") # available vignettes # browseVignettes("<package-name>") # available vignettes
# data(package = "<package-name>") # available datasets # data(package = "<package-name>") # available datasets
# #
# ... and you can load data sets with: # ... and you can load data sets with:
# data(<data-set-name>, package = "<package-name>") # data(<data-set-name>, package = "<package-name>")
# #
# All packages here are installed only when they have not been installed # All packages here are installed only when they have not been installed
# before, using the following idiom: # before, using the following idiom:
# #
# if (! requireNamespace("<package-name>", quietly=TRUE)) { # if (! requireNamespace("<package-name>", quietly=TRUE)) {
# install.packages("<package-name>") # install.packages("<package-name>")
# } # }
# #
# ... or its BiocManager::install() equivalent: # ... or its BiocManager::install() equivalent:
# #
# if (! requireNamespace("<bioconductor-package-name>", quietly=TRUE)) { # if (! requireNamespace("<bioconductor-package-name>", quietly=TRUE)) {
# BiocManager::install("<bioconductor-package-name>") # BiocManager::install("<bioconductor-package-name>")
# } # }
# #
# If you want to _force_ a re-installation of the package, simply issue # If you want to _force_ a re-installation of the package, simply issue
# the install.packages("<package-name>") command on its own. For compactness # the install.packages("<package-name>") command on its own. For compactness
# we wrap the idiom into a function, which can also switch between CRAN # we wrap the idiom into a function, which can also switch between CRAN
# and Bioconductor sources: # and Bioconductor sources:
installIfNeeded <- function(package, s = "CRAN") {
  # Purpose:
  #     Install packages from CRAN or Bioconductor, but only those that are
  #     not installed yet.
  # Parameters:
  #     package: chr  one or more package names
  #     s:       chr  package source, either "CRAN" (default) or "BIO"
  # Value:
  #     NULL (invisibly) if nothing had to be installed, otherwise whatever
  #     the installer function returned. Stops with an error if s is not
  #     one of the known sources.

  # Reject unknown sources up front, regardless of whether any package is
  # actually missing. Checking the length first keeps a non-scalar s from
  # failing inside if().
  if (length(s) != 1L || ! s %in% c("CRAN", "BIO")) {
    stop(sprintf("Unknown source \"%s\".", s))
  }

  # Bioconductor installs are driven by BiocManager, which itself comes
  # from CRAN. Make sure it is available before anything else is checked.
  if (s == "BIO" && ! requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
  }

  # requireNamespace() only considers the first element of its argument,
  # so each name is tested on its own. Otherwise a vector of names would be
  # skipped entirely as soon as its first package is present.
  package <- as.character(package)
  isInstalled <- vapply(package,
                        function(p) requireNamespace(p, quietly = TRUE),
                        logical(1),
                        USE.NAMES = FALSE)
  needed <- package[! isInstalled]

  if (length(needed) == 0L) {
    return(invisible(NULL))
  }

  if (s == "CRAN") {
    install.packages(needed)
  } else {
    BiocManager::install(needed)
  }
}
# = 2 CRAN packages ======================================================= # = 2 CRAN packages =======================================================
installIfNeeded("ape") installIfNeeded("ape")
installIfNeeded("BiocManager") installIfNeeded("BiocManager")
installIfNeeded("bio3d") installIfNeeded("bio3d")
installIfNeeded("evd") installIfNeeded("evd")
installIfNeeded("ggseqlogo") installIfNeeded("ggseqlogo")
installIfNeeded("ggtern") installIfNeeded("ggtern")
installIfNeeded("hexbin") installIfNeeded("hexbin")
installIfNeeded("httr") installIfNeeded("httr")
installIfNeeded("igraph") installIfNeeded("igraph")
installIfNeeded("jsonlite") installIfNeeded("jsonlite")
installIfNeeded("magrittr") installIfNeeded("magrittr")
installIfNeeded("MASS") installIfNeeded("MASS")
installIfNeeded("microbenchmark") installIfNeeded("microbenchmark")
installIfNeeded("phangorn") installIfNeeded("phangorn")
installIfNeeded("plotly") installIfNeeded("plotly")
installIfNeeded("plotrix") installIfNeeded("plotrix")
installIfNeeded("profvis") installIfNeeded("profvis")
installIfNeeded("robustbase") installIfNeeded("robustbase")
installIfNeeded("RColorBrewer") installIfNeeded("RColorBrewer")
installIfNeeded("Rphylip") installIfNeeded("Rphylip")
installIfNeeded("rvest") installIfNeeded("rvest")
installIfNeeded("seqinr") installIfNeeded("seqinr")
installIfNeeded("stringi") installIfNeeded("stringi")
installIfNeeded("taxize") installIfNeeded("taxize")
installIfNeeded("testthat") installIfNeeded("testthat")
installIfNeeded("xml2") installIfNeeded("xml2")
# = 3 Bioconductor packages =============================================== # = 3 Bioconductor packages ===============================================
installIfNeeded("Biobase", s = "BIO") installIfNeeded("Biobase", s = "BIO")
installIfNeeded("biomaRt", s = "BIO") installIfNeeded("biomaRt", s = "BIO")
installIfNeeded("Biostrings", s = "BIO") installIfNeeded("Biostrings", s = "BIO")
installIfNeeded("DECIPHER", s = "BIO") installIfNeeded("DECIPHER", s = "BIO")
installIfNeeded("GEOquery", s = "BIO") installIfNeeded("GEOquery", s = "BIO")
installIfNeeded("GOSim", s = "BIO") installIfNeeded("GOSim", s = "BIO")
installIfNeeded("limma", s = "BIO") installIfNeeded("limma", s = "BIO")
installIfNeeded("msa", s = "BIO") installIfNeeded("msa", s = "BIO")
installIfNeeded("org.Sc.sgd.db", s = "BIO") installIfNeeded("org.Sc.sgd.db", s = "BIO")
installIfNeeded("prada", s = "BIO") installIfNeeded("prada", s = "BIO")
installIfNeeded("topGO", s = "BIO") installIfNeeded("topGO", s = "BIO")
# = 4 Other package sources =============================================== # = 4 Other package sources ===============================================
# Using sources other than CRAN or Bioconductor to download general-purpose # Using sources other than CRAN or Bioconductor to download general-purpose
# programs that run on your computer is not generally recommended. # programs that run on your computer is not generally recommended.
# = 5 Updating packages =================================================== # = 5 Updating packages ===================================================
# From time to time, update CRAN packages with the following command ... # From time to time, update CRAN packages with the following command ...
update.packages() update.packages()
# ... and also update Bioconductor packages as follows: # ... and also update Bioconductor packages as follows:
BiocManager::install() BiocManager::install()
# [END] # [END]

View File

@ -1,100 +1,100 @@
# addSACCE_APSESproteins.R # addSACCE_APSESproteins.R
# Adds the Saccharomyces cerevisiae APSES proteins to myDB # Adds the Saccharomyces cerevisiae APSES proteins to myDB
# #
myDB$protein <- myDB$protein <-
rbind(myDB$protein, rbind(myDB$protein,
data.frame( data.frame(
ID = dbAutoincrement(myDB$protein$ID, ns = "ref"), ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
name = "SWI4_SACCE", name = "SWI4_SACCE",
RefSeqID = "NP_011036", RefSeqID = "NP_011036",
UniProtID = "P25302", UniProtID = "P25302",
taxonomy.ID = as.integer(4932), taxonomy.ID = as.integer(4932),
sequence = dbSanitizeSequence(" sequence = dbSanitizeSequence("
1 mpfdvlisnq kdntnhqnit pisksvllap hsnhpvieia tysetdvyec yirgfetkiv 1 mpfdvlisnq kdntnhqnit pisksvllap hsnhpvieia tysetdvyec yirgfetkiv
61 mrrtkddwin itqvfkiaqf sktkrtkile kesndmqhek vqggygrfqg twipldsakf 61 mrrtkddwin itqvfkiaqf sktkrtkile kesndmqhek vqggygrfqg twipldsakf
121 lvnkyeiidp vvnsiltfqf dpnnpppkrs knsilrktsp gtkitspssy nktprkknss 121 lvnkyeiidp vvnsiltfqf dpnnpppkrs knsilrktsp gtkitspssy nktprkknss
181 sstsatttaa nkkgkknasi nqpnpsplqn lvfqtpqqfq vnssmnimnn ndnhttmnfn 181 sstsatttaa nkkgkknasi nqpnpsplqn lvfqtpqqfq vnssmnimnn ndnhttmnfn
241 ndtrhnlinn isnnsnqsti iqqqksihen sfnnnysatq kplqffpipt nlqnknvaln 241 ndtrhnlinn isnnsnqsti iqqqksihen sfnnnysatq kplqffpipt nlqnknvaln
301 npnnndsnsy shnidnvins snnnnngnnn nliivpdgpm qsqqqqqhhh eyltnnfnhs 301 npnnndsnsy shnidnvins snnnnngnnn nliivpdgpm qsqqqqqhhh eyltnnfnhs
361 mmdsitngns kkrrkklnqs neqqfynqqe kiqrhfklmk qpllwqsfqn pndhhneycd 361 mmdsitngns kkrrkklnqs neqqfynqqe kiqrhfklmk qpllwqsfqn pndhhneycd
421 sngsnnnnnt vasngssiev fssnendnsm nmssrsmtpf sagntssqnk lenkmtdqey 421 sngsnnnnnt vasngssiev fssnendnsm nmssrsmtpf sagntssqnk lenkmtdqey
481 kqtiltilss erssdvdqal latlypapkn fninfeiddq ghtplhwata maniplikml 481 kqtiltilss erssdvdqal latlypapkn fninfeiddq ghtplhwata maniplikml
541 itlnanalqc nklgfncitk sifynncyke nafdeiisil kiclitpdvn grlpfhylie 541 itlnanalqc nklgfncitk sifynncyke nafdeiisil kiclitpdvn grlpfhylie
601 lsvnksknpm iiksymdsii lslgqqdynl lkiclnyqdn igntplhlsa lnlnfevynr 601 lsvnksknpm iiksymdsii lslgqqdynl lkiclnyqdn igntplhlsa lnlnfevynr
661 lvylgastdi lnldnespas imnkfntpag gsnsrnnntk adrklarnlp qknyyqqqqq 661 lvylgastdi lnldnespas imnkfntpag gsnsrnnntk adrklarnlp qknyyqqqqq
721 qqqpqnnvki pkiiktqhpd kedstadvni aktdsevnes qylhsnqpns tnmntimedl 721 qqqpqnnvki pkiiktqhpd kedstadvni aktdsevnes qylhsnqpns tnmntimedl
781 sninsfvtss vikdikstps kilenspily rrrsqsisde kekakdnenq vekkkdplns 781 sninsfvtss vikdikstps kilenspily rrrsqsisde kekakdnenq vekkkdplns
841 vktampsles pssllpiqms plgkyskpls qqinklntkv sslqrimgee iknldnevve 841 vktampsles pssllpiqms plgkyskpls qqinklntkv sslqrimgee iknldnevve
901 tessisnnkk rlitiahqie dafdsvsnkt pinsisdlqs riketsskln sekqnfiqsl 901 tessisnnkk rlitiahqie dafdsvsnkt pinsisdlqs riketsskln sekqnfiqsl
961 eksqalklat ivqdeeskvd mntnssshpe kqedeepipk stsetsspkn tkadakfsnt 961 eksqalklat ivqdeeskvd mntnssshpe kqedeepipk stsetsspkn tkadakfsnt
1021 vqesydvnet lrlateltil qfkrrmttlk iseakskins svkldkyrnl igitienids 1021 vqesydvnet lrlateltil qfkrrmttlk iseakskins svkldkyrnl igitienids
1081 klddiekdlr ana"), 1081 klddiekdlr ana"),
stringsAsFactors = FALSE)) stringsAsFactors = FALSE))
myDB$protein <- myDB$protein <-
rbind(myDB$protein, rbind(myDB$protein,
data.frame( data.frame(
ID = dbAutoincrement(myDB$protein$ID, ns = "ref"), ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
name = "PHD1_SACCE", name = "PHD1_SACCE",
RefSeqID = "NP_012881", RefSeqID = "NP_012881",
UniProtID = "P36093", UniProtID = "P36093",
taxonomy.ID = as.integer(4932), taxonomy.ID = as.integer(4932),
sequence = dbSanitizeSequence(" sequence = dbSanitizeSequence("
1 myhvpemrlh yplvntqsna aitptrsydn tlpsfnelsh qstinlpfvq retpnayanv 1 myhvpemrlh yplvntqsna aitptrsydn tlpsfnelsh qstinlpfvq retpnayanv
61 aqlatsptqa ksgyycryya vpfptypqqp qspyqqavlp yatipnsnfq pssfpvmavm 61 aqlatsptqa ksgyycryya vpfptypqqp qspyqqavlp yatipnsnfq pssfpvmavm
121 ppevqfdgsf lntlhphtel ppiiqntndt svarpnnlks iaaasptvta ttrtpgvsst 121 ppevqfdgsf lntlhphtel ppiiqntndt svarpnnlks iaaasptvta ttrtpgvsst
181 svlkprvitt mwedenticy qveangisvv rradnnming tkllnvtkmt rgrrdgilrs 181 svlkprvitt mwedenticy qveangisvv rradnnming tkllnvtkmt rgrrdgilrs
241 ekvrevvkig smhlkgvwip ferayilaqr eqildhlypl fvkdiesivd arkpsnkasl 241 ekvrevvkig smhlkgvwip ferayilaqr eqildhlypl fvkdiesivd arkpsnkasl
301 tpksspapik qepsdnkhei ateikpksid alsngastqg agelphlkin hidteaqtsr 301 tpksspapik qepsdnkhei ateikpksid alsngastqg agelphlkin hidteaqtsr
361 aknels"), 361 aknels"),
stringsAsFactors = FALSE)) stringsAsFactors = FALSE))
myDB$protein <- myDB$protein <-
rbind(myDB$protein, rbind(myDB$protein,
data.frame( data.frame(
ID = dbAutoincrement(myDB$protein$ID, ns = "ref"), ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
name = "SOK2_SACCE", name = "SOK2_SACCE",
RefSeqID = "NP_013729", RefSeqID = "NP_013729",
UniProtID = "P53438", UniProtID = "P53438",
taxonomy.ID = as.integer(4932), taxonomy.ID = as.integer(4932),
sequence = dbSanitizeSequence(" sequence = dbSanitizeSequence("
1 mpignpintn diksnrmrqe snmsavsnse stigqstqqq qqqqqylgqs vqplmpvsyq 1 mpignpintn diksnrmrqe snmsavsnse stigqstqqq qqqqqylgqs vqplmpvsyq
61 yvvpeqwpyp qyyqqpqsqs qqqlqsqpqm yqvqesfqss gsdsnasnpp stsvgvpsna 61 yvvpeqwpyp qyyqqpqsqs qqqlqsqpqm yqvqesfqss gsdsnasnpp stsvgvpsna
121 tatalpngsa ittkksnnst nisnnvpyyy yfpqmqaqqs maysypqayy yypangdgtt 121 tatalpngsa ittkksnnst nisnnvpyyy yfpqmqaqqs maysypqayy yypangdgtt
181 ngatpsvtsn qvqnpnlekt ystfeqqqqh qqqqqlqaqt ypaqppkign afskfsksgp 181 ngatpsvtsn qvqnpnlekt ystfeqqqqh qqqqqlqaqt ypaqppkign afskfsksgp
241 psdsssgsms pnsnrtsrns nsisslaqqp pmsnypqpst yqypgfhkts sipnshspip 241 psdsssgsms pnsnrtsrns nsisslaqqp pmsnypqpst yqypgfhkts sipnshspip
301 prslttptqg ptsqngplsy nlpqvgllpp qqqqqvsply dgnsitppvk pstdqetylt 301 prslttptqg ptsqngplsy nlpqvgllpp qqqqqvsply dgnsitppvk pstdqetylt
361 anrhgvsdqq ydsmaktmns fqtttirhpm pliattnatg sntsgtsasi irprvtttmw 361 anrhgvsdqq ydsmaktmns fqtttirhpm pliattnatg sntsgtsasi irprvtttmw
421 edektlcyqv eangisvvrr adndmvngtk llnvtkmtrg rrdgilkaek irhvvkigsm 421 edektlcyqv eangisvvrr adndmvngtk llnvtkmtrg rrdgilkaek irhvvkigsm
481 hlkgvwipfe ralaiaqrek iadylyplfi rdiqsvlkqn npsndsssss sstgiksisp 481 hlkgvwipfe ralaiaqrek iadylyplfi rdiqsvlkqn npsndsssss sstgiksisp
541 rtyyqpinny qnpngpsnis aaqltyssmn lnnkiipnns ipavstiaag ekplkkctmp 541 rtyyqpinny qnpngpsnis aaqltyssmn lnnkiipnns ipavstiaag ekplkkctmp
601 nsnqleghti tnlqtlsatm pmkqqlmgni asplsyprna tmnsastlgi tpadskpltp 601 nsnqleghti tnlqtlsatm pmkqqlmgni asplsyprna tmnsastlgi tpadskpltp
661 sptttntnqs sesnvgsiht gitlprvese sashskwske adsgntvpdn qtlkeprssq 661 sptttntnqs sesnvgsiht gitlprvese sashskwske adsgntvpdn qtlkeprssq
721 lpisaltstd tdkiktstsd eatqpnepse aepvkesess ksqvdgagdv sneeiaaddt 721 lpisaltstd tdkiktstsd eatqpnepse aepvkesess ksqvdgagdv sneeiaaddt
781 kkqek"), 781 kkqek"),
stringsAsFactors = FALSE)) stringsAsFactors = FALSE))
myDB$protein <- myDB$protein <-
rbind(myDB$protein, rbind(myDB$protein,
data.frame( data.frame(
ID = dbAutoincrement(myDB$protein$ID, ns = "ref"), ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
name = "XBP1_SACCE", name = "XBP1_SACCE",
RefSeqID = "NP_012165", RefSeqID = "NP_012165",
UniProtID = "P40489", UniProtID = "P40489",
taxonomy.ID = as.integer(4932), taxonomy.ID = as.integer(4932),
sequence = dbSanitizeSequence(" sequence = dbSanitizeSequence("
1 mkypafsins dtvhltdnpl ddyqrlylvs vldrdsppas fsaglnirkv nykssiaaqf 1 mkypafsins dtvhltdnpl ddyqrlylvs vldrdsppas fsaglnirkv nykssiaaqf
61 thpnfiisar dagngeeaaa qnvlncfeyq fpnlqtiqsl vheqtllsql assatphsal 61 thpnfiisar dagngeeaaa qnvlncfeyq fpnlqtiqsl vheqtllsql assatphsal
121 hlhdknilmg kiilpsrsnk tpvsasptkq ekkalstasr enatssltkn qqfkltkmdh 121 hlhdknilmg kiilpsrsnk tpvsasptkq ekkalstasr enatssltkn qqfkltkmdh
181 nlindklinp nncviwshds gyvfmtgiwr lyqdvmkgli nlprgdsvst sqqqffckae 181 nlindklinp nncviwshds gyvfmtgiwr lyqdvmkgli nlprgdsvst sqqqffckae
241 fekilsfcfy nhssftsees ssvllsssts sppkrrtstg stfldanass sstsstqann 241 fekilsfcfy nhssftsees ssvllsssts sppkrrtstg stfldanass sstsstqann
301 yidfhwnnik pelrdlicqs ykdflinelg pdqidlpnln panftkrirg gyikiqgtwl 301 yidfhwnnik pelrdlicqs ykdflinelg pdqidlpnln panftkrirg gyikiqgtwl
361 pmeisrllcl rfcfpiryfl vpifgpdfpk dceswylahq nvtfassttg agaataataa 361 pmeisrllcl rfcfpiryfl vpifgpdfpk dceswylahq nvtfassttg agaataataa
421 antstnftst avarprqkpr prprqrstsm shskaqklvi edalpsfdsf venlglssnd 421 antstnftst avarprqkpr prprqrstsm shskaqklvi edalpsfdsf venlglssnd
481 knfikknskr qksstytsqt sspigprdpt vqilsnlasf ynthghrysy pgniyipqqr 481 knfikknskr qksstytsqt sspigprdpt vqilsnlasf ynthghrysy pgniyipqqr
541 yslpppnqls spqrqlnyty dhihpvpsqy qsprhynvps spiapapptf pqpygddhyh 541 yslpppnqls spqrqlnyty dhihpvpsqy qsprhynvps spiapapptf pqpygddhyh
601 flkyasevyk qqnqrpahnt ntnmdtsfsp rannslnnfk fktnskq"), 601 flkyasevyk qqnqrpahnt ntnmdtsfsp rannslnnfk fktnskq"),
stringsAsFactors = FALSE)) stringsAsFactors = FALSE))
# [END] # [END]

View File

@ -1,69 +1,69 @@
# ABC-units.R # ABC-units.R
# #
# Purpose: A Bioinformatics Course: R code for learning units # Purpose: A Bioinformatics Course: R code for learning units
# #
# Version: 4.0 # Version: 4.0
# #
# Date: 2020 09 16 # Date: 2020 09 16
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# V 4.0 2020 version # V 4.0 2020 version
# V 3.0 2019 version # V 3.0 2019 version
# V 2.0 2018 version # V 2.0 2018 version
# V 1.0 2017 version # V 1.0 2017 version
# V 0.1 First code # V 0.1 First code
# #
# TODO: # TODO:
# #
# #
# == HOW TO WORK WITH LEARNING UNIT FILES ====================================== # == HOW TO WORK WITH LEARNING UNIT FILES ======================================
# #
# The R-scripts and datasets in this project will be continuously updated, # The R-scripts and datasets in this project will be continuously updated,
# and updates will be posted on GitHub. To bring your version into the latest # and updates will be posted on GitHub. To bring your version into the latest
# state use the Git-pane (top left) and "pull" (blue downward arrow) from the # state use the Git-pane (top left) and "pull" (blue downward arrow) from the
# repository. However, this will overwrite locally edited versions of files. # repository. However, this will overwrite locally edited versions of files.
# To edit code and experiment with it, for example to add your own comments and # To edit code and experiment with it, for example to add your own comments and
# examples, save your edited version into the "myScripts" folder. Otherwise you # examples, save your edited version into the "myScripts" folder. Otherwise you
# may have problems with git when you update the project to a new version. It's # may have problems with git when you update the project to a new version. It's
# good practice to change the filename, for example by prepending your initials. # good practice to change the filename, for example by prepending your initials.
# This helps distinguish the files you are working with e.g. in a list of # This helps distinguish the files you are working with e.g. in a list of
# recent files. For example if your name is Honjo Tasuku, your edited # recent files. For example if your name is Honjo Tasuku, your edited
# BIN-Sequence.R might be named HT-BIN-Sequence.R # BIN-Sequence.R might be named HT-BIN-Sequence.R
# If you pull from github and get the following type of error ... # If you pull from github and get the following type of error ...
# --------------- # ---------------
# error: Your local changes to the following files would be # error: Your local changes to the following files would be
# overwritten by merge # overwritten by merge
# ... # ...
# Please commit your changes or stash them before you can merge. # Please commit your changes or stash them before you can merge.
# --------------- # ---------------
# ... then, you need to bring the offending file into its original state. # ... then, you need to bring the offending file into its original state.
# Open the Commit window, select the file, and click on the Revert button. # Open the Commit window, select the file, and click on the Revert button.
# #
# When working with these scripts DO NOT SIMPLY source() THESE FILES! # When working with these scripts DO NOT SIMPLY source() THESE FILES!
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# #
# ============================================================================== # ==============================================================================
# Once you have typed and executed the function init(), you will find a file # Once you have typed and executed the function init(), you will find a file
# called myScript.R in the project directory. # called myScript.R in the project directory.
# #
# Open it, you can place all of your code-experiments and notes into that # Open it, you can place all of your code-experiments and notes into that
# file. This will complement your "Course Journal". If you keep all contents in # file. This will complement your "Course Journal". If you keep all contents in
# this one file, you can find everything by using the <cmd>-F find function. To # this one file, you can find everything by using the <cmd>-F find function. To
# cross-reference code in your journal, create section headings. # cross-reference code in your journal, create section headings.
# #
# ============================================================================== # ==============================================================================
# The individual learning units' files can be opened by simply clicking on them # The individual learning units' files can be opened by simply clicking on them
# in the File pane. # in the File pane.
# [END] # [END]

View File

@ -1,16 +1,16 @@
Version: 1.0 Version: 1.0
RestoreWorkspace: No RestoreWorkspace: No
SaveWorkspace: No SaveWorkspace: No
AlwaysSaveHistory: No AlwaysSaveHistory: No
EnableCodeIndexing: Yes EnableCodeIndexing: Yes
UseSpacesForTab: Yes UseSpacesForTab: Yes
NumSpacesForTab: 2 NumSpacesForTab: 2
Encoding: UTF-8 Encoding: UTF-8
RnwWeave: knitr RnwWeave: knitr
LaTeX: XeLaTeX LaTeX: XeLaTeX
AutoAppendNewline: Yes AutoAppendNewline: Yes
StripTrailingWhitespace: Yes StripTrailingWhitespace: Yes

View File

@ -1,111 +1,111 @@
# tocID <- "BIN-ALI-BLAST.R" # tocID <- "BIN-ALI-BLAST.R"
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-BLAST unit. # R code accompanying the BIN-ALI-BLAST unit.
# #
# ============================================================================== # ==============================================================================
# #
# Version: 1.3 # Version: 1.3
# #
# Date: 2017-10 - 2020-09 # Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.3 2020 Maintenance # 1.3 2020 Maintenance
# 1.2 Change from require() to requireNamespace(), # 1.2 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout # use <package>::<function>() idiom throughout
# 1.1 Fixed parsing logic. # 1.1 Fixed parsing logic.
# 1.0 First live version 2017. # 1.0 First live version 2017.
# 0.1 First code copied from 2016 material. # 0.1 First code copied from 2016 material.
# #
# #
# TODO: # TODO:
# #
# #
# == DO NOT SIMPLY source() THIS FILE! ======================================= # == DO NOT SIMPLY source() THIS FILE! =======================================
# #
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> --------------------------------------------------- #TOC> ---------------------------------------------------
#TOC> 1 Defining the APSES domain 45 #TOC> 1 Defining the APSES domain 45
#TOC> 2 Executing the BLAST search 75 #TOC> 2 Executing the BLAST search 75
#TOC> 3 Analysing results 97 #TOC> 3 Analysing results 97
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 Defining the APSES domain =========================================== # = 1 Defining the APSES domain ===========================================
# Load your protein database # Load your protein database
source("makeProteinDB.R") source("makeProteinDB.R")
# Get the APSES domain sequence via your MBP1_MYSPE feature annotation. (You # Get the APSES domain sequence via your MBP1_MYSPE feature annotation. (You
# have entered this data into your database in the # have entered this data into your database in the
# BIN-ALI-Optimal_sequence_alignment unit.) # BIN-ALI-Optimal_sequence_alignment unit.)
( myOrth <- sprintf("MBP1_%s", biCode(MYSPE)) ) # If this is not the correct ( myOrth <- sprintf("MBP1_%s", biCode(MYSPE)) ) # If this is not the correct
# name of the Mbp1 orthologue # name of the Mbp1 orthologue
# of Mbp1 in your protein # of Mbp1 in your protein
# database, DON'T continue. We # database, DON'T continue. We
# need to fix this problem. # need to fix this problem.
# Get in touch. # Get in touch.
(proID <- myDB$protein$ID[myDB$protein$name == myOrth]) (proID <- myDB$protein$ID[myDB$protein$name == myOrth])
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"]) (ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID & (fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
myDB$annotation$featureID == ftrID]) myDB$annotation$featureID == ftrID])
(start <- myDB$annotation$start[myDB$annotation$ID == fanID]) (start <- myDB$annotation$start[myDB$annotation$ID == fanID])
(end <- myDB$annotation$end[myDB$annotation$ID == fanID]) (end <- myDB$annotation$end[myDB$annotation$ID == fanID])
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID], (apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
start, start,
end)) end))
# The MYSPE "apses" sequence is the sequence that we will use for our reverse # The MYSPE "apses" sequence is the sequence that we will use for our reverse
# BLAST search. # BLAST search.
# = 2 Executing the BLAST search ========================================== # = 2 Executing the BLAST search ==========================================
# The ./scripts/BLAST.R code defines two functions to access the BLAST interface # The ./scripts/BLAST.R code defines two functions to access the BLAST interface
# through its Web API, and to parse results. Have a look at the script, then # through its Web API, and to parse results. Have a look at the script, then
# source it: # source it:
source("./scripts/BLAST.R") source("./scripts/BLAST.R")
# Use BLAST() to find the best match to the MYSPE APSES domain in Saccharomyces # Use BLAST() to find the best match to the MYSPE APSES domain in Saccharomyces
# cerevisiae: # cerevisiae:
BLASTresults <- BLAST(apses, # MYSPE APSES domain sequence BLASTresults <- BLAST(apses, # MYSPE APSES domain sequence
db = "refseq_protein", # database to search in db = "refseq_protein", # database to search in
nHits = 10, # nHits = 10, #
E = 0.01, # E = 0.01, #
limits = "txid559292[ORGN]") # S. cerevisiae S288c limits = "txid559292[ORGN]") # S. cerevisiae S288c
length(BLASTresults$hits) # There should be at least one hit there. Ask for length(BLASTresults$hits) # There should be at least one hit there. Ask for
# advice in case this step fails. # advice in case this step fails.
# = 3 Analysing results =================================================== # = 3 Analysing results ===================================================
(topHit <- BLASTresults$hits[[1]]) # Get the top hit (topHit <- BLASTresults$hits[[1]]) # Get the top hit
# What is the refseq ID of the top hit # What is the refseq ID of the top hit
topHit$accession topHit$accession
# If this is "NP_010227.1" you have confirmed the RBM of the MYSPE apses # If this is "NP_010227.1" you have confirmed the RBM of the MYSPE apses
# domain. If it is not, ask me for advice. # domain. If it is not, ask me for advice.
# [END] # [END]

View File

@ -1,195 +1,195 @@
# tocID <- "BIN-ALI-Dotplot.R" # tocID <- "BIN-ALI-Dotplot.R"
# #
# #
# ============================================================================== # ==============================================================================
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-Dotplot unit. # R code accompanying the BIN-ALI-Dotplot unit.
# #
# Version: 0.2 # Version: 0.2
# #
# Date: 2019 01 07 # Date: 2019 01 07
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 0.2 Change from require() to requireNamespace(), # 0.2 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout # use <package>::<function>() idiom throughout
# 0.1 First code copied from 2016 material. # 0.1 First code copied from 2016 material.
# #
# #
# TODO: # TODO:
# #
# #
# == DO NOT SIMPLY source() THIS FILE! ======================================= # == DO NOT SIMPLY source() THIS FILE! =======================================
# #
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> -------------------------------------- #TOC> --------------------------------------
#TOC> 1 ___Section___ 42 #TOC> 1 ___Section___ 42
#TOC> 2 Tasks 190 #TOC> 2 Tasks 190
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 ___Section___ ======================================================= # = 1 ___Section___ =======================================================
if (!requireNamespace("BiocManager", quietly=TRUE)) { if (!requireNamespace("BiocManager", quietly=TRUE)) {
install.packages("BiocManager") install.packages("BiocManager")
} }
if (!requireNamespace("Biostrings", quietly=TRUE)) { if (!requireNamespace("Biostrings", quietly=TRUE)) {
BiocManager::install("Biostrings") BiocManager::install("Biostrings")
} }
# Package information: # Package information:
# library(help = Biostrings) # basic information # library(help = Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes # browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets # data(package = "Biostrings") # available datasets
if (!requireNamespace("seqinr", quietly=TRUE)) { if (!requireNamespace("seqinr", quietly=TRUE)) {
install.packages("seqinr") install.packages("seqinr")
} }
# Let's load BLOSUM62 # Let's load BLOSUM62
data(BLOSUM62, package = "Biostrings") data(BLOSUM62, package = "Biostrings")
# Now let's craft code for a dotplot. That's surprisingly simple. We build a # Now let's craft code for a dotplot. That's surprisingly simple. We build a
# matrix that has as many rows as one sequence, as many columns as another. Then # matrix that has as many rows as one sequence, as many columns as another. Then
# we go through every cell of the matrix and enter the pairscore we encounter # we go through every cell of the matrix and enter the pairscore we encounter
# for the amino acid pair whose position corresponds to the row and column # for the amino acid pair whose position corresponds to the row and column
# index. Finally we visualize the matrix in a plot. # index. Finally we visualize the matrix in a plot.
# #
# First we fetch our sequences and split them into single characters. # First we fetch our sequences and split them into single characters.
sel <- myDB$protein$name == "MBP1_SACCE" sel <- myDB$protein$name == "MBP1_SACCE"
MBP1_SACCE <- seqinr::s2c(myDB$protein$sequence[sel]) MBP1_SACCE <- seqinr::s2c(myDB$protein$sequence[sel])
sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "") sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
MBP1_MYSPE <- seqinr::s2c(myDB$protein$sequence[sel]) MBP1_MYSPE <- seqinr::s2c(myDB$protein$sequence[sel])
# Check that we have two character vectors of the expected length. # Check that we have two character vectors of the expected length.
str(MBP1_SACCE) str(MBP1_SACCE)
str(MBP1_MYSPE) str(MBP1_MYSPE)
# How do we get the pairscore values? Consider: a single pair of amino acids can # How do we get the pairscore values? Consider: a single pair of amino acids can
# be obtained from sequence SACCE and MYSPE eg. from position 13 and 21 ... # be obtained from sequence SACCE and MYSPE eg. from position 13 and 21 ...
MBP1_SACCE[13] MBP1_SACCE[13]
MBP1_MYSPE[21] MBP1_MYSPE[21]
# ... using these as subsetting expressions, we can pull the pairscore # ... using these as subsetting expressions, we can pull the pairscore
# from the MDM # from the MDM
BLOSUM62[MBP1_SACCE[13], MBP1_MYSPE[21]] BLOSUM62[MBP1_SACCE[13], MBP1_MYSPE[21]]
# First we build an empty matrix that will hold all pairscores ... # First we build an empty matrix that will hold all pairscores ...
# One row per residue of MBP1_SACCE, one column per residue of MBP1_MYSPE,
# every cell initialized to zero.
dotMat <- matrix(0,
                 nrow = length(MBP1_SACCE),
                 ncol = length(MBP1_MYSPE))

# ... then we loop over the sequences and store the scores in the matrix.
# seq_along() is used rather than 1:length(): for an empty vector 1:length()
# counts down through c(1, 0), whereas seq_along() gives zero iterations.
for (i in seq_along(MBP1_SACCE)) {
  for (j in seq_along(MBP1_MYSPE)) {
    # Row/column names of BLOSUM62 are amino acid letters, so the two residues
    # index the pairscore directly.
    dotMat[i, j] <- BLOSUM62[MBP1_SACCE[i], MBP1_MYSPE[j]]
  }
}
# Even though this is a large matrix, this does not take much time ... # Even though this is a large matrix, this does not take much time ...
# Let's have a look at a small block of the values: # Let's have a look at a small block of the values:
dotMat[1:10, 1:10] dotMat[1:10, 1:10]
# Rows in this matrix correspond to an amino acid from MBP1_SACCE, columns in # Rows in this matrix correspond to an amino acid from MBP1_SACCE, columns in
# the matrix correspond to an amino acid from MBP1_MYSPE. # the matrix correspond to an amino acid from MBP1_MYSPE.
# To plot this, we use the image() function. Here, with default parameters. # To plot this, we use the image() function. Here, with default parameters.
image(dotMat) image(dotMat)
# Be patient, this takes a few moments to render: more than 500,000 values. # Be patient, this takes a few moments to render: more than 500,000 values.
# Nice. # Nice.
# What do you expect? # What do you expect?
# What would similar sequences look like? # What would similar sequences look like?
# What do you see? # What do you see?
# You might notice a thin line of yellow along the diagonal, moving approximately # You might notice a thin line of yellow along the diagonal, moving approximately
# from bottom left to top right, fading in and out of existence. This is the # from bottom left to top right, fading in and out of existence. This is the
# signature of extended sequence similarity. # signature of extended sequence similarity.
# Let's magnify this a bit by looking at only the first 200 amino acids ... # Let's magnify this a bit by looking at only the first 200 amino acids ...
image(dotMat[1:200, 1:200]) image(dotMat[1:200, 1:200])
# ... and, according to our normal writing convention, we would like the # ... and, according to our normal writing convention, we would like the
# diagonal to run from top-left to bottom-right since we write from left to # diagonal to run from top-left to bottom-right since we write from left to
# right and from top to bottom... # right and from top to bottom...
image(dotMat[1:200, 1:200], ylim = 1.0:0.0) image(dotMat[1:200, 1:200], ylim = 1.0:0.0)
# ... and we would like the range of the x- and y- axis to correspond to the # ... and we would like the range of the x- and y- axis to correspond to the
# sequence position ... # sequence position ...
image(x = 1:200, y = 1:200, dotMat[1:200, 1:200], ylim=c(200,1)) image(x = 1:200, y = 1:200, dotMat[1:200, 1:200], ylim=c(200,1))
# ... and labels! Axis labels would be nice ... # ... and labels! Axis labels would be nice ...
# image() draws the rows of the matrix along the x-axis and the columns along
# the y-axis. Rows of dotMat are MBP1_SACCE residues and columns are MBP1_MYSPE
# residues, so the x-axis is SACCE and the y-axis is MYSPE.
image(x = 1:200, y = 1:200, z = dotMat[1:200, 1:200], ylim = c(200, 1),
      xlab = "MBP1_SACCE", ylab = "MBP1_MYSPE")
# ... and why don't we have axis-numbers on all four sides? Go, make that right # ... and why don't we have axis-numbers on all four sides? Go, make that right
# too ... # too ...
# Plot the first `len` residues of both sequences with tick marks on all four
# sides of the plot.
len <- 200
ticks <- c(1, seq(10, len, by = 10))   # 1, 10, 20, ..., len

# image() puts matrix rows (MBP1_SACCE) on the x-axis and matrix columns
# (MBP1_MYSPE) on the y-axis; the reversed ylim makes the y-axis run from
# top to bottom.
image(x = seq_len(len), y = seq_len(len), z = dotMat[seq_len(len), seq_len(len)],
      ylim = c(len, 1),
      xlab = "MBP1_SACCE", ylab = "MBP1_MYSPE",
      axes = FALSE)
box()

# Sides are numbered 1 = bottom, 2 = left, 3 = top, 4 = right.
for (side in 1:4) {
  axis(side, at = ticks)
}
# ... you get the idea, we can infinitely customize our plot. However a good way # ... you get the idea, we can infinitely customize our plot. However a good way
# to do this is to develop a particular view for, say, a report or publication # to do this is to develop a particular view for, say, a report or publication
# in a script and then put it into a function. I have put a function into the # in a script and then put it into a function. I have put a function into the
# utilities file and called it dotPlot2(). Why not dotPlot() ... that's because # utilities file and called it dotPlot2(). Why not dotPlot() ... that's because
# there already is a dotplot function in the seqinr package: # there already is a dotplot function in the seqinr package:
seqinr::dotPlot(MBP1_SACCE, MBP1_MYSPE) # seqinr seqinr::dotPlot(MBP1_SACCE, MBP1_MYSPE) # seqinr
dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE") # Our's dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE") # Our's
# Which one do you prefer? You can probably see the block patterns that arise # Which one do you prefer? You can probably see the block patterns that arise
# from segments of repetitive, low complexity sequence. But you probably have to # from segments of repetitive, low complexity sequence. But you probably have to
# look very closely to discern the faint diagonals that correspond to similar # look very closely to discern the faint diagonals that correspond to similar
# sequence. # sequence.
# Let's see if we can enhance the contrast between distributed noise and the # Let's see if we can enhance the contrast between distributed noise and the
# actual alignment of conserved residues. We can filter the dot matrix with a # actual alignment of conserved residues. We can filter the dot matrix with a
# pattern that enhances diagonally repeated values. Every value in the matrix # pattern that enhances diagonally repeated values. Every value in the matrix
# will be replaced by a weighted average of its neighborhood. Here is a # will be replaced by a weighted average of its neighborhood. Here is a
# diagonal-filter: # diagonal-filter:
# 5 x 5 filter with weight 1 on the main diagonal and 0 elsewhere. The values
# are written out row by row so that individual weights are easy to edit when
# experimenting with other filters.
myFilter <- matrix(c(1, 0, 0, 0, 0,
                     0, 1, 0, 0, 0,
                     0, 0, 1, 0, 0,
                     0, 0, 0, 1, 0,
                     0, 0, 0, 0, 1),
                   nrow = 5, byrow = TRUE)
# I have added the option to read such filters (or others that you could define on your own) as a parameter of the function. # I have added the option to read such filters (or others that you could define on your own) as a parameter of the function.
dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE", f = myFilter) dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE", f = myFilter)
# I think the result shows quite nicely how the two sequences are globally # I think the result shows quite nicely how the two sequences are globally
# related and where the regions of sequence similarity are. Play with this a bit # related and where the regions of sequence similarity are. Play with this a bit
# ... Can you come up with a better filter? If so, eMail us. # ... Can you come up with a better filter? If so, eMail us.
# = 2 Tasks =============================================================== # = 2 Tasks ===============================================================
# [END] # [END]

File diff suppressed because it is too large Load Diff

View File

@ -1,365 +1,365 @@
# tocID <- "BIN-ALI-Optimal_sequence_alignment.R" # tocID <- "BIN-ALI-Optimal_sequence_alignment.R"
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-Optimal_sequence_alignment unit. # R code accompanying the BIN-ALI-Optimal_sequence_alignment unit.
# #
# ============================================================================== # ==============================================================================
# Version: 1.7.1 # Version: 1.7.1
# #
# Date: 2017-09 - 2020-10 # Date: 2017-09 - 2020-10
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.7.1 add jsonlite:: to fromJSON() in code sample and ./myScripts/ # 1.7.1 add jsonlite:: to fromJSON() in code sample and ./myScripts/
# 1.7 2020 updates # 1.7 2020 updates
# 1.6 Maintenance # 1.6 Maintenance
# 1.5 Change from require() to requireNamespace(), # 1.5 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout # use <package>::<function>() idiom throughout
# 1.4 Pull s2c() from seqinr package, rather than loading the # 1.4 Pull s2c() from seqinr package, rather than loading the
# entire library. # entire library.
# 1.3 Updated confirmation task with correct logic # 1.3 Updated confirmation task with correct logic
# 1.2 Added missing load of seqinr package # 1.2 Added missing load of seqinr package
# 1.1 Update annotation file logic - it could already have been # 1.1 Update annotation file logic - it could already have been
# prepared in the BIN-FUNC-Annotation unit. # prepared in the BIN-FUNC-Annotation unit.
# 1.0.1 bugfix # 1.0.1 bugfix
# 1.0 First 2017 live version. # 1.0 First 2017 live version.
# 0.1 First code copied from 2016 material. # 0.1 First code copied from 2016 material.
# #
# TODO: # TODO:
# #
# #
# == DO NOT SIMPLY source() THIS FILE! ======================================= # == DO NOT SIMPLY source() THIS FILE! =======================================
# #
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> -------------------------------------------------------------------------- #TOC> --------------------------------------------------------------------------
#TOC> 1 Prepare 58 #TOC> 1 Prepare 58
#TOC> 2 Biostrings Pairwise Alignment 75 #TOC> 2 Biostrings Pairwise Alignment 75
#TOC> 2.1 Optimal global alignment 93 #TOC> 2.1 Optimal global alignment 93
#TOC> 2.2 Optimal local alignment 156 #TOC> 2.2 Optimal local alignment 156
#TOC> 3 APSES Domain annotation by alignment 180 #TOC> 3 APSES Domain annotation by alignment 180
#TOC> 4 Update your database script 261 #TOC> 4 Update your database script 261
#TOC> 4.1 Preparing an annotation file ... 267 #TOC> 4.1 Preparing an annotation file ... 267
#TOC> 4.1.1 If you HAVE NOT done the BIN-FUNC-Annotation unit 269 #TOC> 4.1.1 If you HAVE NOT done the BIN-FUNC-Annotation unit 269
#TOC> 4.1.2 If you HAVE done the BIN-FUNC-Annotation unit 314 #TOC> 4.1.2 If you HAVE done the BIN-FUNC-Annotation unit 314
#TOC> 4.2 Execute and Validate 338 #TOC> 4.2 Execute and Validate 338
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 Prepare ============================================================= # = 1 Prepare =============================================================
if (! requireNamespace("seqinr", quietly=TRUE)) { if (! requireNamespace("seqinr", quietly=TRUE)) {
install.packages("seqinr") install.packages("seqinr")
} }
# You can get package information with the following commands: # You can get package information with the following commands:
# library(help = seqinr) # basic information # library(help = seqinr) # basic information
# browseVignettes("seqinr") # available vignettes # browseVignettes("seqinr") # available vignettes
# data(package = "seqinr") # available datasets # data(package = "seqinr") # available datasets
# You need to recreate the protein database that you have constructed in the # You need to recreate the protein database that you have constructed in the
# BIN-Storing_data unit. # BIN-Storing_data unit.
source("./myScripts/makeProteinDB.R") source("./myScripts/makeProteinDB.R")
# = 2 Biostrings Pairwise Alignment ======================================= # = 2 Biostrings Pairwise Alignment =======================================
if (!requireNamespace("BiocManager", quietly=TRUE)) { if (!requireNamespace("BiocManager", quietly=TRUE)) {
install.packages("BiocManager") install.packages("BiocManager")
} }
if (!requireNamespace("Biostrings", quietly=TRUE)) { if (!requireNamespace("Biostrings", quietly=TRUE)) {
BiocManager::install("Biostrings") BiocManager::install("Biostrings")
} }
# Package information: # Package information:
# library(help = Biostrings) # basic information # library(help = Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes # browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets # data(package = "Biostrings") # available datasets
# Biostrings stores sequences in "XString" objects. Once we have converted our # Biostrings stores sequences in "XString" objects. Once we have converted our
# target sequences to AAString objects, the alignment itself is straightforward. # target sequences to AAString objects, the alignment itself is straightforward.
# == 2.1 Optimal global alignment ========================================== # == 2.1 Optimal global alignment ==========================================
# The pairwiseAlignment() function was written to behave # The pairwiseAlignment() function was written to behave
# exactly like the functions you encountered on the EMBOSS server. # exactly like the functions you encountered on the EMBOSS server.
# First: make AAString objects ... # First: make AAString objects ...
sel <- myDB$protein$name == "MBP1_SACCE" sel <- myDB$protein$name == "MBP1_SACCE"
aaMBP1_SACCE <- Biostrings::AAString(myDB$protein$sequence[sel]) aaMBP1_SACCE <- Biostrings::AAString(myDB$protein$sequence[sel])
sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "") sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
aaMBP1_MYSPE <- Biostrings::AAString(myDB$protein$sequence[sel]) aaMBP1_MYSPE <- Biostrings::AAString(myDB$protein$sequence[sel])
?pairwiseAlignment ?pairwiseAlignment
# ... and align. # ... and align.
# Global optimal alignment with end-gap penalties is default. # Global optimal alignment with end-gap penalties is default.
ali1 <- Biostrings::pairwiseAlignment( ali1 <- Biostrings::pairwiseAlignment(
aaMBP1_SACCE, aaMBP1_SACCE,
aaMBP1_MYSPE, aaMBP1_MYSPE,
substitutionMatrix = "BLOSUM62", substitutionMatrix = "BLOSUM62",
gapOpening = 10, gapOpening = 10,
gapExtension = 0.5) gapExtension = 0.5)
str(ali1) # ... it's complicated str(ali1) # ... it's complicated
# This is a Biostrings alignment object. But we can use Biostrings functions to # This is a Biostrings alignment object. But we can use Biostrings functions to
# tame it: # tame it:
ali1 ali1
Biostrings::writePairwiseAlignments(ali1) # That should look familiar Biostrings::writePairwiseAlignments(ali1) # That should look familiar
# And we can make the internal structure work for us (@ is for classes as # And we can make the internal structure work for us (@ is for classes as
# $ is for lists ...) # $ is for lists ...)
str(ali1@pattern) str(ali1@pattern)
ali1@pattern ali1@pattern
ali1@pattern@range ali1@pattern@range
ali1@pattern@indel ali1@pattern@indel
ali1@pattern@mismatch ali1@pattern@mismatch
# or work with "normal" R functions # or work with "normal" R functions
# the alignment length # the alignment length
nchar(as.character(ali1@pattern)) nchar(as.character(ali1@pattern))
# the number of identities # the number of identities
sum(seqinr::s2c(as.character(ali1@pattern)) == sum(seqinr::s2c(as.character(ali1@pattern)) ==
seqinr::s2c(as.character(ali1@subject))) seqinr::s2c(as.character(ali1@subject)))
# ... e.g. to calculate the percentage of identities # ... e.g. to calculate the percentage of identities
100 * 100 *
sum(seqinr::s2c(as.character(ali1@pattern)) == sum(seqinr::s2c(as.character(ali1@pattern)) ==
seqinr::s2c(as.character(ali1@subject))) / seqinr::s2c(as.character(ali1@subject))) /
nchar(as.character(ali1@pattern)) nchar(as.character(ali1@pattern))
# ... which should be the same as reported in the writePairwiseAlignments() # ... which should be the same as reported in the writePairwiseAlignments()
# output. Awkward to type? Then it calls for a function: # output. Awkward to type? Then it calls for a function:
# #
percentID <- function(al) {
  # Returns the percent-identity of a Biostrings alignment object.
  #
  # al: a pairwise alignment object with @pattern and @subject slots.
  #
  # The value is the number of positions at which the aligned pattern and
  # subject strings carry the same character, divided by the length of the
  # aligned pattern string, times 100.
  patStr <- as.character(al@pattern)
  subStr <- as.character(al@subject)

  # Compare the two aligned strings character by character.
  nIdentical <- sum(seqinr::s2c(patStr) == seqinr::s2c(subStr))

  return(100 * nIdentical / nchar(patStr))
}
percentID(ali1) percentID(ali1)
# == 2.2 Optimal local alignment =========================================== # == 2.2 Optimal local alignment ===========================================
# Compare with local optimal alignment (like EMBOSS Water) # Compare with local optimal alignment (like EMBOSS Water)
ali2 <- Biostrings::pairwiseAlignment( ali2 <- Biostrings::pairwiseAlignment(
aaMBP1_SACCE, aaMBP1_SACCE,
aaMBP1_MYSPE, aaMBP1_MYSPE,
type = "local", type = "local",
substitutionMatrix = "BLOSUM62", substitutionMatrix = "BLOSUM62",
gapOpening = 50, gapOpening = 50,
gapExtension = 10) gapExtension = 10)
Biostrings::writePairwiseAlignments(ali2) Biostrings::writePairwiseAlignments(ali2)
# This has probably only aligned the N-terminal DNA binding domain - but that # This has probably only aligned the N-terminal DNA binding domain - but that
# one has quite high sequence identity: # one has quite high sequence identity:
percentID(ali2) percentID(ali2)
# == TASK: == # == TASK: ==
# Compare the two alignments. I have weighted the local alignment heavily # Compare the two alignments. I have weighted the local alignment heavily
# towards an ungapped alignment by setting very high gap penalties. Try changing # towards an ungapped alignment by setting very high gap penalties. Try changing
# the gap penalties and see what happens: how does the number of indels change, # the gap penalties and see what happens: how does the number of indels change,
# how does the length of indels change... # how does the length of indels change...
# = 3 APSES Domain annotation by alignment ================================ # = 3 APSES Domain annotation by alignment ================================
# In this section we define the MYSPE APSES sequence by performing a global, # In this section we define the MYSPE APSES sequence by performing a global,
# optimal sequence alignment of the yeast APSES domain with the full length # optimal sequence alignment of the yeast APSES domain with the full length
# protein sequence of the protein that was the most similar to the yeast APSES # protein sequence of the protein that was the most similar to the yeast APSES
# domain. # domain.
# #
# I have annotated the yeast APSES domain as a feature in the # I have annotated the yeast APSES domain as a feature in the
# database. To view the annotation, we can retrieve it via the proteinID and # database. To view the annotation, we can retrieve it via the proteinID and
# featureID. Here is the yeast protein ID: # featureID. Here is the yeast protein ID:
(proID <- myDB$protein$ID[myDB$protein$name == "MBP1_SACCE"]) (proID <- myDB$protein$ID[myDB$protein$name == "MBP1_SACCE"])
# ... and if you look at the feature table, you can identify the feature ID # ... and if you look at the feature table, you can identify the feature ID
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"]) (ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
# ... and with the two annotations we can get the corresponding ID from the # ... and with the two annotations we can get the corresponding ID from the
# annotation table # annotation table
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID & (fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
myDB$annotation$featureID == ftrID]) myDB$annotation$featureID == ftrID])
# Show the complete annotation record that joins this protein and feature.
# The annotation table's own ID column holds the annotation key, so the row
# must be selected through the proteinID and featureID columns - comparing
# annotation$ID against a protein or feature ID would match nothing.
myDB$annotation[myDB$annotation$proteinID == proID &
                  myDB$annotation$featureID == ftrID, ]
# The annotation record contains the start and end coordinates which we can use # The annotation record contains the start and end coordinates which we can use
# to define the APSES domain sequence with a substr() expression. # to define the APSES domain sequence with a substr() expression.
(start <- myDB$annotation$start[myDB$annotation$ID == fanID]) (start <- myDB$annotation$start[myDB$annotation$ID == fanID])
(end <- myDB$annotation$end[myDB$annotation$ID == fanID]) (end <- myDB$annotation$end[myDB$annotation$ID == fanID])
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID], (apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
start, start,
end)) end))
# Lots of code. But don't get lost. Let's recapitulate what we have done: we # Lots of code. But don't get lost. Let's recapitulate what we have done: we
# have selected from the sequence column of the protein table the sequence whose # have selected from the sequence column of the protein table the sequence whose
# name is "MBP1_SACCE", and selected from the annotation table the start # name is "MBP1_SACCE", and selected from the annotation table the start
# and end coordinates of the annotation that joins an "APSES fold" feature with # and end coordinates of the annotation that joins an "APSES fold" feature with
# the sequence, and used the start and end coordinates to extract a substring. # the sequence, and used the start and end coordinates to extract a substring.
# Let's convert this to an AAString object and assign it: # Let's convert this to an AAString object and assign it:
aaMB1_SACCE_APSES <- Biostrings::AAString(apses) aaMB1_SACCE_APSES <- Biostrings::AAString(apses)
# Now let's align these two sequences of very different length without end-gap # Now let's align these two sequences of very different length without end-gap
# penalties using the "overlap" type. "overlap" turns the # penalties using the "overlap" type. "overlap" turns the
# end-gap penalties off and that is crucially important since # end-gap penalties off and that is crucially important since
# the sequences have very different length. # the sequences have very different length.
aliApses <- Biostrings::pairwiseAlignment( aliApses <- Biostrings::pairwiseAlignment(
aaMB1_SACCE_APSES, aaMB1_SACCE_APSES,
aaMBP1_MYSPE, aaMBP1_MYSPE,
type = "overlap", type = "overlap",
substitutionMatrix = "BLOSUM62", substitutionMatrix = "BLOSUM62",
gapOpening = 10, gapOpening = 10,
gapExtension = 0.5) gapExtension = 0.5)
# Inspect the result. The aligned sequences should be clearly # Inspect the result. The aligned sequences should be clearly
# homologous, and have (almost) no indels. The entire "pattern" # homologous, and have (almost) no indels. The entire "pattern"
# sequence from QIYSAR ... to ... KPLFDF should be matched # sequence from QIYSAR ... to ... KPLFDF should be matched
# with the "subject". Is this correct? # with the "subject". Is this correct?
Biostrings::writePairwiseAlignments(aliApses) Biostrings::writePairwiseAlignments(aliApses)
# If this is correct, you can extract the matched sequence from # If this is correct, you can extract the matched sequence from
# the alignment object. The syntax is a bit different from what # the alignment object. The syntax is a bit different from what
# you have seen before: this is an "S4 object", not a list. No # you have seen before: this is an "S4 object", not a list. No
# worries: as.character() returns a normal string. # worries: as.character() returns a normal string.
as.character(aliApses@subject) as.character(aliApses@subject)
# Now, what are the aligned start and end coordinates? You can read them from # Now, what are the aligned start and end coordinates? You can read them from
# the output of writePairwiseAlignments(), or you can get them from the range of # the output of writePairwiseAlignments(), or you can get them from the range of
# the match. # the match.
str(aliApses@subject@range) str(aliApses@subject@range)
# start is: # start is:
aliApses@subject@range@start aliApses@subject@range@start
# ... and end is: # ... and end is:
aliApses@subject@range@start + aliApses@subject@range@width - 1 aliApses@subject@range@start + aliApses@subject@range@width - 1
# = 4 Update your database script ========================================= # = 4 Update your database script =========================================
# Since we have this feature defined now, we can create a feature annotation # Since we have this feature defined now, we can create a feature annotation
# right away and store it in myDB. # right away and store it in myDB.
# == 4.1 Preparing an annotation file ... ================================== # == 4.1 Preparing an annotation file ... ==================================
# #
# === 4.1.1 If you HAVE NOT done the BIN-FUNC-Annotation unit # === 4.1.1 If you HAVE NOT done the BIN-FUNC-Annotation unit
# #
# #
# You DON'T already have a file called "<MYSPE>-Annotations.json" in the # You DON'T already have a file called "<MYSPE>-Annotations.json" in the
# ./myScripts/ directory: # ./myScripts/ directory:
# #
# - Make a copy of the file "./data/refAnnotations.json" and put it in your # - Make a copy of the file "./data/refAnnotations.json" and put it in your
# myScripts/ directory. # myScripts/ directory.
# #
# - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g. # - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.
# if MYSPE is called "Cryptococcus neoformans", your file should be called # if MYSPE is called "Cryptococcus neoformans", your file should be called
# "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is # "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is
# "MBP1_CRYNE"). # "MBP1_CRYNE").
# #
# - Open the file in the RStudio editor and delete all blocks for # - Open the file in the RStudio editor and delete all blocks for
# the Mbp1 protein annotations except the first one. # the Mbp1 protein annotations except the first one.
# #
# - From that block, delete all lines except for the line that says: # - From that block, delete all lines except for the line that says:
# #
# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"}, # {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"},
# #
# - Then delete the comma at the end of the line (your file will just have # - Then delete the comma at the end of the line (your file will just have
# this one annotation). # this one annotation).
# #
# - Edit that annotation: change MBP1_SACCE to MBP1_<MYSPE> and change the # - Edit that annotation: change MBP1_SACCE to MBP1_<MYSPE> and change the
# "start" and "end" features to the coordinates you just discovered for the # "start" and "end" features to the coordinates you just discovered for the
# APSES domain in your sequence. # APSES domain in your sequence.
# #
# - Save the file in your myScripts/ directory # - Save the file in your myScripts/ directory
# #
# - Validate your file online at https://jsonlint.com/ # - Validate your file online at https://jsonlint.com/
# #
# - Update your "./myScripts/makeProteinDB.R" script to load your new # - Update your "./myScripts/makeProteinDB.R" script to load your new
# annotation when you recreate the database. Open the script in the # annotation when you recreate the database. Open the script in the
# RStudio editor, and add the following command at the end: # RStudio editor, and add the following command at the end:
# #
# myDB <- dbAddAnnotation(myDB, # myDB <- dbAddAnnotation(myDB,
# jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json")) # jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json"))
# ^^^^^^^ # ^^^^^^^
# edit this! # edit this!
# - save and close the file. # - save and close the file.
# #
# Then SKIP the next section. # Then SKIP the next section.
# #
# #
# === 4.1.2 If you HAVE done the BIN-FUNC-Annotation unit # === 4.1.2 If you HAVE done the BIN-FUNC-Annotation unit
# #
# #
# You DO already have a file called "<MYSPE>-Annotations.json" in the # You DO already have a file called "<MYSPE>-Annotations.json" in the
# ./myScripts/ directory: # ./myScripts/ directory:
# #
# - Open the file in the RStudio editor. # - Open the file in the RStudio editor.
# #
# - Below the last feature lines (but before the closing "]") add the # - Below the last feature lines (but before the closing "]") add the
# following feature line (without the "#") # following feature line (without the "#")
# #
# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"} # {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"}
# #
# - Edit that annotation: change MBP1_SACCE to MBP1_<MYSPE> and change the # - Edit that annotation: change MBP1_SACCE to MBP1_<MYSPE> and change the
# "start" and "end" features to the coordinates you just discovered for the # "start" and "end" features to the coordinates you just discovered for the
# APSES domain in your sequence. # APSES domain in your sequence.
# #
# - Add a comma after the preceding feature line. # - Add a comma after the preceding feature line.
# #
# - Save your file. # - Save your file.
# #
# - Validate your file online at https://jsonlint.com/ # - Validate your file online at https://jsonlint.com/
# #
# #
# == 4.2 Execute and Validate ============================================== # == 4.2 Execute and Validate ==============================================
# #
# - source() your database creation script: # - source() your database creation script:
# #
# source("./myScripts/makeProteinDB.R") # source("./myScripts/makeProteinDB.R")
# #
# This should run without errors or warnings. If it doesn't work and you # This should run without errors or warnings. If it doesn't work and you
# can't figure out quickly what's happening, ask on the mailing list for # can't figure out quickly what's happening, ask on the mailing list for
# help. # help.
# #
# - Confirm # - Confirm
# The following commands should retrieve the correct start and end # The following commands should retrieve the correct start and end
# coordinates and sequence of the MBP1_MYSPE APSES domain: # coordinates and sequence of the MBP1_MYSPE APSES domain:
sel <- which(myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")) sel <- which(myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = ""))
(proID <- myDB$protein$ID[sel]) (proID <- myDB$protein$ID[sel])
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"]) (ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID & (fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
myDB$annotation$featureID == ftrID]) myDB$annotation$featureID == ftrID])
(start <- myDB$annotation$start[myDB$annotation$ID == fanID]) (start <- myDB$annotation$start[myDB$annotation$ID == fanID])
(end <- myDB$annotation$end[myDB$annotation$ID == fanID]) (end <- myDB$annotation$end[myDB$annotation$ID == fanID])
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID], (apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
start, start,
end)) end))
# [END] # [END]

View File

@ -1,313 +1,313 @@
# tocID <- "BIN-ALI-Similarity.R" # tocID <- "BIN-ALI-Similarity.R"
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-Similarity unit. # R code accompanying the BIN-ALI-Similarity unit.
# #
# Version: 1.2 # Version: 1.2
# #
# Date: 2017-10 - 2020-09 # Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.2 2020 Updates # 1.2 2020 Updates
# 1.1 Change from require() to requireNamespace(), # 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout # use <package>::<function>() idiom throughout
# 1.0 Refactored for 2017; add aaindex, ternary plot. # 1.0 Refactored for 2017; add aaindex, ternary plot.
# 0.1 First code copied from 2016 material. # 0.1 First code copied from 2016 material.
# #
# #
# TODO: # TODO:
# Update ggtern:: ternary plot to use aacol dots under text # Update ggtern:: ternary plot to use aacol dots under text
# #
# #
# == DO NOT SIMPLY source() THIS FILE! ======================================= # == DO NOT SIMPLY source() THIS FILE! =======================================
# #
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> ---------------------------------------------- #TOC> ----------------------------------------------
#TOC> 1 Amino Acid Properties 43 #TOC> 1 Amino Acid Properties 43
#TOC> 2 Mutation Data matrix 189 #TOC> 2 Mutation Data matrix 189
#TOC> 3 Background score 230 #TOC> 3 Background score 230
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 Amino Acid Properties =============================================== # = 1 Amino Acid Properties ===============================================
# A large collection of amino acid property tables is available via the seqinr # A large collection of amino acid property tables is available via the seqinr
# package: # package:
if (! requireNamespace("seqinr", quietly=TRUE)) { if (! requireNamespace("seqinr", quietly=TRUE)) {
install.packages("seqinr") install.packages("seqinr")
} }
# Package information: # Package information:
# library(help = seqinr) # basic information # library(help = seqinr) # basic information
# browseVignettes("seqinr") # available vignettes # browseVignettes("seqinr") # available vignettes
# data(package = "seqinr") # available datasets # data(package = "seqinr") # available datasets
# A true Labor of Love has gone into the compilation of the seqinr "aaindex" # A true Labor of Love has gone into the compilation of the seqinr "aaindex"
# data: # data:
?aaindex ?aaindex
data(aaindex, package = "seqinr") # load the aaindex list from the package data(aaindex, package = "seqinr") # load the aaindex list from the package
length(aaindex) length(aaindex)
# Here are all the index descriptions # Here are all the index descriptions
# Print one line per aaindex entry: its position in the list, followed by the
# entry's description (element "D"). seq_along() is used rather than
# 1:length() so that an empty list prints nothing instead of failing.
for (i in seq_along(aaindex)) {
  cat(paste0(i, ": ", aaindex[[i]]$D, "\n"))
}
# It's a bit cumbersome to search through the descriptions ... here is a # It's a bit cumbersome to search through the descriptions ... here is a
# function to make this easier: # function to make this easier:
searchAAindex <- function(patt) {
  # Searches the descriptions (element "D") of the entries of the global
  # "aaindex" list for the regular expression "patt" and prints the index
  # number and description of every matching entry.
  # Parameters:
  #    patt   chr   regular expression, passed to grepl()
  # Value:    NULL (invisible). The function is called for its printed output.

  # vapply() always returns a logical vector - even if aaindex is empty -
  # whereas sapply() would return an empty list that which() cannot handle.
  isHit <- vapply(aaindex,
                  function(x) { any(grepl(patt, x$D)) },
                  logical(1))

  for (hit in which(isHit)) {
    cat(sprintf("%3d\t%s\n", hit, aaindex[[hit]]$D))
  }

  return(invisible(NULL))
}
searchAAindex("free energy") # Search for "free energy" searchAAindex("free energy") # Search for "free energy"
searchAAindex("(size)|(volume)") # Search for "size" or "volume": searchAAindex("(size)|(volume)") # Search for "size" or "volume":
# Let's examine ... # Let's examine ...
# ... a hydrophobicity index # ... a hydrophobicity index
(Y <- aaindex[[528]][c("D", "I")]) (Y <- aaindex[[528]][c("D", "I")])
# ... a volume index # ... a volume index
(V <- aaindex[[150]][c("D", "I")]) (V <- aaindex[[150]][c("D", "I")])
# ... and one of our own: side-chain pK values as reported by # ... and one of our own: side-chain pK values as reported by
# Pace et al. (2009) JBC 284:13285-13289, with non-ionizable pKs set # Pace et al. (2009) JBC 284:13285-13289, with non-ionizable pKs set
# to 7.4 (physiological pH) # to 7.4 (physiological pH)
# K mirrors the structure used for the aaindex entries in this script: a list
# whose element "I" is a numeric vector with one value per amino acid, named
# by three-letter code. The value 7.4 is the placeholder for side chains that
# have no ionizable group.
K <- list(
  I = c(Ala =  7.4,
        Arg = 12.3,
        Asn =  7.4,
        Asp =  3.9,
        Cys =  8.6,
        Gln =  7.4,
        Glu =  4.3,
        Gly =  7.4,
        His =  6.5,
        Ile =  7.4,
        Leu =  7.4,
        Lys = 10.4,
        Met =  7.4,
        Phe =  7.4,
        Pro =  7.4,
        Ser =  7.4,
        Thr =  7.4,
        Trp =  7.4,
        Tyr =  9.8,
        Val =  7.4)
)
# Given these biophysical indices, how similar are the amino acids? We have three dimensions of measures here. Scatterplots can only display two dimensions ... # Given these biophysical indices, how similar are the amino acids? We have three dimensions of measures here. Scatterplots can only display two dimensions ...
# pull the names from Y$I, convert them to single letter code, and reorder the # pull the names from Y$I, convert them to single letter code, and reorder the
# AACOLS palette accordingly ... # AACOLS palette accordingly ...
aac <- AACOLS[toupper(seqinr::a(names(Y$I)))] aac <- AACOLS[toupper(seqinr::a(names(Y$I)))]
plot(Y$I, V$I, plot(Y$I, V$I,
xlab = "hydrophobicity", ylab = "volume", xlab = "hydrophobicity", ylab = "volume",
pch = 21, pch = 21,
cex = 6, cex = 6,
col = aac, col = aac,
bg = aac) bg = aac)
text(Y$I, V$I, names(Y$I), cex = 0.8) text(Y$I, V$I, names(Y$I), cex = 0.8)
plot(Y$I, K$I, plot(Y$I, K$I,
xlab = "hydrophobicity", ylab = "pK", xlab = "hydrophobicity", ylab = "pK",
pch = 21, pch = 21,
cex = 6, cex = 6,
col = aac, col = aac,
bg = aac) bg = aac)
text(Y$I, K$I, names(Y$I), cex = 0.8) text(Y$I, K$I, names(Y$I), cex = 0.8)
# ... but how do we plot 3D data? Plotting into a 3D cube is possible, but such # ... but how do we plot 3D data? Plotting into a 3D cube is possible, but such
# plots are in general unintuitive and hard to interpret. One alternative is a # plots are in general unintuitive and hard to interpret. One alternative is a
# so-called "ternary plot": # so-called "ternary plot":
if (! requireNamespace("ggtern", quietly=TRUE)) { if (! requireNamespace("ggtern", quietly=TRUE)) {
install.packages("ggtern") install.packages("ggtern")
} }
# Package information: # Package information:
# library(help = ggtern) # basic information # library(help = ggtern) # basic information
# browseVignettes("ggtern") # available vignettes # browseVignettes("ggtern") # available vignettes
# data(package = "ggtern") # available datasets # data(package = "ggtern") # available datasets
# collect into data frame, normalize to (0.05, 0.95) # collect into data frame, normalize to (0.05, 0.95)
myDat <- data.frame("phi" = 0.9*(((Y$I-min(Y$I))/(max(Y$I)-min(Y$I))))+0.05, myDat <- data.frame("phi" = 0.9*(((Y$I-min(Y$I))/(max(Y$I)-min(Y$I))))+0.05,
"vol" = 0.9*(((V$I-min(V$I))/(max(V$I)-min(V$I))))+0.05, "vol" = 0.9*(((V$I-min(V$I))/(max(V$I)-min(V$I))))+0.05,
"pK" = 0.9*(((K$I-min(K$I))/(max(K$I)-min(K$I))))+0.05, "pK" = 0.9*(((K$I-min(K$I))/(max(K$I)-min(K$I))))+0.05,
stringsAsFactors = FALSE) stringsAsFactors = FALSE)
rownames(myDat) <- names(Y$I) rownames(myDat) <- names(Y$I)
ggtern::ggtern(data = myDat, ggtern::ggtern(data = myDat,
ggplot2::aes(x = vol, ggplot2::aes(x = vol,
y = phi, y = phi,
z = pK, z = pK,
label = rownames(myDat))) + ggplot2::geom_text() label = rownames(myDat))) + ggplot2::geom_text()
# This results in a mapping of amino acids relative to each other that is # This results in a mapping of amino acids relative to each other that is
# similar to the Venn diagram you have seen in the notes. # similar to the Venn diagram you have seen in the notes.
# ... or we could use principal components analysis, to pull out the # ... or we could use principal components analysis, to pull out the
# best projection of the three feature dimensions into two. (Done here without delving # best projection of the three feature dimensions into two. (Done here without delving
# into the theory ...) # into the theory ...)
# Principal components of the three normalized feature columns of myDat.
prc <- prcomp(myDat)

# Plot the first against the second principal component, without axes. The
# dots are coloured with "aac", the amino acid palette that was reordered to
# match names(Y$I) for the scatterplots ("aad" is not defined anywhere).
plot(prc$x[, 1], prc$x[, 2], xlab = "", ylab = "", xaxt = "n", yaxt = "n",
     pch = 19, cex = 6, col = aac, cex.main = 0.7,
     main = "Principal Component Analysis of Amino Acid Features")

# Label each dot with its amino acid name in semi-transparent black.
text(prc$x[, 1], prc$x[, 2], names(Y$I), cex = 0.8, col = "#00000088")
# This matches the intuition rather well in that "similar" amino acids are close # This matches the intuition rather well in that "similar" amino acids are close
# on the plot. But we can't interpret the distances in terms of just one of the # on the plot. But we can't interpret the distances in terms of just one of the
# parameters. Whatever - nature has a different way to define similarity: # parameters. Whatever - nature has a different way to define similarity:
# mutations to similar amino acids are less likely to break the protein. # mutations to similar amino acids are less likely to break the protein.
# = 2 Mutation Data matrix ================================================ # = 2 Mutation Data matrix ================================================
# A mutation data matrix encodes all amino acid pairscores in a matrix. # A mutation data matrix encodes all amino acid pairscores in a matrix.
# The Biostrings package contains the most common mutation data matrices. # The Biostrings package contains the most common mutation data matrices.
if (! requireNamespace("BiocManager", quietly=TRUE)) { if (! requireNamespace("BiocManager", quietly=TRUE)) {
install.packages("BiocManager") install.packages("BiocManager")
} }
if (! requireNamespace("Biostrings", quietly=TRUE)) { if (! requireNamespace("Biostrings", quietly=TRUE)) {
BiocManager::install("Biostrings") BiocManager::install("Biostrings")
} }
# Package information: # Package information:
# library(help=Biostrings) # basic information # library(help=Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes # browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets # data(package = "Biostrings") # available datasets
# Let's attach the BLOSUM62 mutation data matrix from the package # Let's attach the BLOSUM62 mutation data matrix from the package
data(BLOSUM62, package = "Biostrings") data(BLOSUM62, package = "Biostrings")
# ... and see what it contains. (You've seen this matrix before.) # ... and see what it contains. (You've seen this matrix before.)
BLOSUM62 BLOSUM62
# We can simply access values via the row/column names. # We can simply access values via the row/column names.
# Identical amino acids have high scores ... # Identical amino acids have high scores ...
BLOSUM62["H", "H"] # Score for a pair of two histidines BLOSUM62["H", "H"] # Score for a pair of two histidines
BLOSUM62["S", "S"] # Score for a pair of two serines BLOSUM62["S", "S"] # Score for a pair of two serines
# Similar amino acids have low positive scores ... # Similar amino acids have low positive scores ...
BLOSUM62["L", "I"] # Score for a leucine / isoleucine pair BLOSUM62["L", "I"] # Score for a leucine / isoleucine pair
BLOSUM62["F", "Y"] # etc. BLOSUM62["F", "Y"] # etc.
# Dissimilar amino acids have negative scores ... # Dissimilar amino acids have negative scores ...
BLOSUM62["L", "K"] # Score for a leucine / lysine pair BLOSUM62["L", "K"] # Score for a leucine / lysine pair
BLOSUM62["Q", "P"] # etc. BLOSUM62["Q", "P"] # etc.
BLOSUM62["R", "W"] # the matrix is symmetric! BLOSUM62["R", "W"] # the matrix is symmetric!
BLOSUM62["W", "R"] BLOSUM62["W", "R"]
# = 3 Background score ==================================================== # = 3 Background score ====================================================
# The mutation data matrix is designed to give high scores to homologous # The mutation data matrix is designed to give high scores to homologous
# sequences, low scores to non-homologous sequences. What score on average # sequences, low scores to non-homologous sequences. What score on average
# should we expect for a random sequence? # should we expect for a random sequence?
# If we sample amino acid pairs at random, we will get a score that is the # If we sample amino acid pairs at random, we will get a score that is the
# average of the individual pairscores in the matrix. Omitting the ambiguity # average of the individual pairscores in the matrix. Omitting the ambiguity
# codes and the gap character: # codes and the gap character:
sum(BLOSUM62[1:20, 1:20])/400 sum(BLOSUM62[1:20, 1:20])/400
# But that score could be higher for real sequences, for which the amino acid # But that score could be higher for real sequences, for which the amino acid
# distribution is not random. For example membrane proteins have a large number # distribution is not random. For example membrane proteins have a large number
# of hydrophobic residues - an alignment of unrelated proteins might produce # of hydrophobic residues - an alignment of unrelated proteins might produce
# positive scores. And there are other proteins with biased amino acid # positive scores. And there are other proteins with biased amino acid
# compositions, in particular proteins that interact with multiple other # compositions, in particular proteins that interact with multiple other
# proteins. Let's test how this impacts the background score by comparing a # proteins. Let's test how this impacts the background score by comparing a
# sequence with shuffled sequences. These have the same composition, but are # sequence with shuffled sequences. These have the same composition, but are
# obviously not homologous. The data directory contains the FASTA file for the # obviously not homologous. The data directory contains the FASTA file for the
# PDB ID 3FG7 - a villin headpiece structure with a large amount of # PDB ID 3FG7 - a villin headpiece structure with a large amount of
# low-complexity amino acid sequence ... # low-complexity amino acid sequence ...
aa3FG7 <- Biostrings::readAAStringSet("./data/3FG7.fa")[[1]] aa3FG7 <- Biostrings::readAAStringSet("./data/3FG7.fa")[[1]]
# ... and the FASTA file for the E. coli OmpG outer membrane porin (PDB: 2F1C) # ... and the FASTA file for the E. coli OmpG outer membrane porin (PDB: 2F1C)
# with an exceptionally high percentage of hydrophobic residues. # with an exceptionally high percentage of hydrophobic residues.
aa2F1C <- Biostrings::readAAStringSet("./data/2F1C.fa")[[1]] aa2F1C <- Biostrings::readAAStringSet("./data/2F1C.fa")[[1]]
# Here is a function that takes two sequences and # Here is a function that takes two sequences and
# returns their average pairscore. # returns their average pairscore.
averagePairScore <- function(a, b, MDM = BLOSUM62) {
  # Returns the average pairscore of two sequences of equal length. Residues
  # are paired by position: the first residue of a with the first of b, etc.
  # Parameters:
  #    a, b   chr   amino acid sequence strings of the same length
  #    MDM          mutation data matrix whose row and column names are the
  #                 one-letter residue codes. Default is BLOSUM62.
  # Value:    num   sum of the pairscores divided by the number of residues.
  #                 Stops with an error if the sequence lengths differ.

  a <- unlist(strsplit(a, ""))
  b <- unlist(strsplit(b, ""))

  # A longer b would otherwise be silently truncated to the length of a.
  if (length(a) != length(b)) {
    stop("Sequences must have the same length, but have ",
         length(a), " and ", length(b), " residues.",
         call. = FALSE)
  }

  # A two-column character matrix indexes MDM by (row name, column name), one
  # pair per row, so that all pairscores are looked up in a single step.
  return(sum(MDM[cbind(a, b)]) / length(a))
}
orig3FG7 <- toString(aa3FG7) orig3FG7 <- toString(aa3FG7)
orig2F1C <- toString(aa2F1C) orig2F1C <- toString(aa2F1C)
N <- 1000 N <- 1000
scores3FG7 <- numeric(N) scores3FG7 <- numeric(N)
scores2F1C <- numeric(N) scores2F1C <- numeric(N)
for (i in 1:N) { for (i in 1:N) {
scores3FG7[i] <- averagePairScore(orig3FG7, toString(sample(aa3FG7))) scores3FG7[i] <- averagePairScore(orig3FG7, toString(sample(aa3FG7)))
scores2F1C[i] <- averagePairScore(orig2F1C, toString(sample(aa2F1C))) scores2F1C[i] <- averagePairScore(orig2F1C, toString(sample(aa2F1C)))
} }
# Plot the distributions # Plot the distributions
hist(scores3FG7, hist(scores3FG7,
col="#5599EE33", col="#5599EE33",
breaks = seq(-1.5, 0, by=0.1), breaks = seq(-1.5, 0, by=0.1),
main = "Pairscores for randomly shuffled sequences", main = "Pairscores for randomly shuffled sequences",
xlab = "Average pairscore from BLOSUM 62") xlab = "Average pairscore from BLOSUM 62")
hist(scores2F1C, hist(scores2F1C,
col="#55EE9933", col="#55EE9933",
breaks = seq(-1.5, 0, by=0.1), breaks = seq(-1.5, 0, by=0.1),
add = TRUE) add = TRUE)
abline(v = sum(BLOSUM62[1:20, 1:20])/400, col = "firebrick", lwd = 2) abline(v = sum(BLOSUM62[1:20, 1:20])/400, col = "firebrick", lwd = 2)
legend('topright', legend('topright',
c("3FG7 (villin)", "2F1C (OmpG)"), c("3FG7 (villin)", "2F1C (OmpG)"),
fill = c("#5599EE33", "#55EE9933"), bty = 'n', fill = c("#5599EE33", "#55EE9933"), bty = 'n',
inset = 0.1) inset = 0.1)
# This is an important result: even though we have shuffled significantly biased # This is an important result: even though we have shuffled significantly biased
# sequences, and the average scores trend above the average of the mutation data # sequences, and the average scores trend above the average of the mutation data
# matrix, the average scores still remain comfortably below zero. This means # matrix, the average scores still remain comfortably below zero. This means
# that we can't (in general) improve a high-scoring alignment by simply # that we can't (in general) improve a high-scoring alignment by simply
# extending it with randomly matched residues. We will only improve the score if # extending it with randomly matched residues. We will only improve the score if
# the similarity of newly added residues is larger than what we expect to get by # the similarity of newly added residues is larger than what we expect to get by
# random chance! # random chance!
# [END] # [END]

View File

@ -1,216 +1,216 @@
# tocID <- "BIN-Data_integration.R" # tocID <- "BIN-Data_integration.R"
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the BIN-Data_integration unit. # R code accompanying the BIN-Data_integration unit.
# #
# Version: 1.2 # Version: 1.2
# #
# Date: 2018-10 - 2020-09 # Date: 2018-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.2 2020 Maintenance and updates # 1.2 2020 Maintenance and updates
# 1.1 Change from require() to requireNamespace(), # 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout # use <package>::<function>() idiom throughout
# 1.0.1 Bugfix: UniProt ID Mapping service API change # 1.0.1 Bugfix: UniProt ID Mapping service API change
# 1.0 First live version # 1.0 First live version
# #
# #
# TODO: # TODO:
# Develop a fungi-specific BioMart example. # Develop a fungi-specific BioMart example.
# (cf. # (cf.
# https://cran.r-project.org/web/packages/biomartr/vignettes/Functional_Annotation.html ) # https://cran.r-project.org/web/packages/biomartr/vignettes/Functional_Annotation.html )
# #
# == DO NOT SIMPLY source() THIS FILE! ======================================= # == DO NOT SIMPLY source() THIS FILE! =======================================
# #
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> ------------------------------------------------- #TOC> -------------------------------------------------
#TOC> 1 Identifier mapping 42 #TOC> 1 Identifier mapping 42
#TOC> 2 Cross-referencing tables 165 #TOC> 2 Cross-referencing tables 165
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 Identifier mapping ================================================== # = 1 Identifier mapping ==================================================
# UniProt provides a well-designed ID mapping tool that can be accessed # UniProt provides a well-designed ID mapping tool that can be accessed
# online at http://www.uniprot.org/mapping/ # online at http://www.uniprot.org/mapping/
# #
# Here we will use the UniProt Web API for this tool to map identifiers. The # Here we will use the UniProt Web API for this tool to map identifiers. The
# UniProt ID mapping service supports a "RESTful API": responses can be obtained # UniProt ID mapping service supports a "RESTful API": responses can be obtained
# simply via a Web browser's request. Such requests are commonly sent via the # simply via a Web browser's request. Such requests are commonly sent via the
# GET or POST verbs that a Webserver responds to, when a client asks for data. # GET or POST verbs that a Webserver responds to, when a client asks for data.
# GET requests are visible in the URL of the request; POST requests are not # GET requests are visible in the URL of the request; POST requests are not
# directly visible, they are commonly used to send the contents of forms, or # directly visible, they are commonly used to send the contents of forms, or
# when transmitting larger, complex data items. The UniProt ID mapping service # when transmitting larger, complex data items. The UniProt ID mapping service
# can accept long lists of IDs, thus using the POST mechanism makes sense. GET() # can accept long lists of IDs, thus using the POST mechanism makes sense. GET()
# and POST() functions are part of the httr package. # and POST() functions are part of the httr package.
# To begin, we load httr, which supports sending and receiving data via the # To begin, we load httr, which supports sending and receiving data via the
# http protocol, just like a Web browser. # http protocol, just like a Web browser.
# Install httr only when it cannot be loaded; requireNamespace() reports
# availability without attaching the package to the search path.
if (isFALSE(requireNamespace("httr", quietly = TRUE))) {
  install.packages(pkgs = "httr")
}
# Package information: # Package information:
# library(help = httr) # basic information # library(help = httr) # basic information
# browseVignettes("httr") # available vignettes # browseVignettes("httr") # available vignettes
# data(package = "httr") # available datasets # data(package = "httr") # available datasets
# We will walk through the process with the refSeqID # We will walk through the process with the refSeqID
# of yeast Mbp1 and Swi4, and we will also enter a dummy ID to check what # of yeast Mbp1 and Swi4, and we will also enter a dummy ID to check what
# happens if the ID can't be mapped: # happens if the ID can't be mapped:
myQueryIDs <- "NP_010227 NP_00000 NP_011036"

# The UniProt ID mapping service API is very straightforward to use: just define
# the URL of the server and send a list of items labelled as "query" in the body
# of the request. GET() and POST() are functions from httr.

# Note. A recent bug in the interaction between the server expectations and the
# curl client libraries requires the following initialization
httr::set_config(httr::config(http_version = 0))
# cf. https://stackoverflow.com/questions/44610845/stream-error-in-the-http-2-framing-layer-bigrquery-commands-error-in-r-studio-b

# NOTE(review): myIDmap() further down posts to ".../uploadlists/" while this
# walkthrough posts to ".../mapping/" - confirm which endpoint is current.
URL <- "https://www.uniprot.org/mapping/"
response <- httr::POST(URL,
                       body = list(from = "P_REFSEQ_AC",   # Refseq Protein
                                   to = "ACC",             # UniProt ID
                                   format = "tab",
                                   query = myQueryIDs))

cat(httr::content(response))

# We need to check the status code - if it is not 200, an error occurred and we
# can't process the result:
httr::status_code(response)

# If the query is successful, tabbed text is returned. We can assign that to a
# data frame. Note that we hand the text to read.delim() through its "text"
# argument: this reads data directly from a char object instead of a file.
# read.delim() then creates the text connection itself and closes it again
# when it is done, so no open connection is left behind (which is what happens
# with read.delim(file = textConnection(...)) ).
myMappedIDs <- read.delim(text = httr::content(response),
                          sep = "\t",
                          stringsAsFactors = FALSE)
myMappedIDs
# If this works as expected, you should see: # If this works as expected, you should see:
# From To # From To
# 1 NP_010227 P39678 # 1 NP_010227 P39678
# 2 NP_011036 P25302 # 2 NP_011036 P25302
# #
# ... and note that there are only two entries, because nothing was returned # ... and note that there are only two entries, because nothing was returned
# for the dummy "RefSeq ID" NP_00000 # for the dummy "RefSeq ID" NP_00000
# If the query can't be fulfilled because of a problem with the server, a # If the query can't be fulfilled because of a problem with the server, a
# WebPage is returned. But the server status is also returned and we can check # WebPage is returned. But the server status is also returned and we can check
# the status code. I have lately gotten many "503" status codes: Server Not # the status code. I have lately gotten many "503" status codes: Server Not
# Available... # Available...
# We wrap this into a function: # We wrap this into a function:
myIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
  # Use UniProt ID mapping service to map one or more IDs
  # Parameters:
  #    s        char  A string of separated IDs
  #    mapFrom  char  the database in which the IDs in s are valid. Default
  #                     is RefSeq protein
  #    mapTo    char  the database in which the target IDs are valid. Default
  #                     is UniProtKB
  # Value
  #    a data frame of mapped IDs, with column names From and To, or an
  #    empty data frame (together with a warning) if the mapping was
  #    unsuccessful, i.e. the server did not answer with status 200 or
  #    sent an empty body. No rows are returned for IDs that are not mapped.

  URL <- "https://www.uniprot.org/uploadlists/"

  # The curl http_version setting is passed with this request only, rather
  # than through httr::set_config(), so that calling this function does not
  # change the httr configuration of the rest of the session.
  response <- httr::POST(URL,
                         config = httr::config(http_version = 0),
                         body = list(from = mapFrom,
                                     to = mapTo,
                                     format = "tab",
                                     query = s))

  status <- httr::status_code(response)
  if (status != 200) {                               # anything but 200: not ok
    warning(paste("No uniProt ID mapping returned:",
                  "server sent status",
                  status))
    return(data.frame())
  }

  # Ask for the body as text explicitly: we parse the tab-separated text
  # ourselves and must not depend on how httr would auto-parse the content
  # type that the server declares.
  mapText <- httr::content(response, as = "text", encoding = "UTF-8")
  if (length(mapText) != 1 || is.na(mapText) || ! nzchar(trimws(mapText))) {
    warning("No uniProt ID mapping returned: server sent an empty response")
    return(data.frame())
  }

  # read.delim(text = ...) opens and closes its own text connection; passing
  # textConnection(...) as "file" would leave the connection open.
  myMap <- read.delim(text = mapText,
                      sep = "\t",
                      stringsAsFactors = FALSE)
  colnames(myMap) <- c("From", "To")

  return(myMap)
}
# Try it out ... # Try it out ...
# Five space-separated RefSeq protein IDs, mapped with the default databases.
myIDmap(s = "NP_010227 NP_011036 NP_012881 NP_013729 NP_012165")
# A function UniProtIDmap() is in the ABC-dbUtilities.R script and it is loaded # A function UniProtIDmap() is in the ABC-dbUtilities.R script and it is loaded
# into your workspace on startup. # into your workspace on startup.
# = 2 Cross-referencing tables ============================================ # = 2 Cross-referencing tables ============================================
# Sometimes we get the IDs we need to map in a large table, e.g. from a list of # Sometimes we get the IDs we need to map in a large table, e.g. from a list of
# genes in a model organism database such as SGD, or from the Human Gene # genes in a model organism database such as SGD, or from the Human Gene
# Nomenclature commission. How do we map one set of identifiers to another one? # Nomenclature commission. How do we map one set of identifiers to another one?
# The function to use is match(). # The function to use is match().
# Here is a tiny set of identifiers taken from a much larger table to # Here is a tiny set of identifiers taken from a much larger table to
# illustrate the principle: # illustrate the principle:
# #
# One row per gene: UniProt ID, gene name and RefSeq protein ID.
myIDs <- data.frame(
  uID   = c("P38903", "P31383", "P47177", "P47096", "Q07747",
            "Q08641", "P47129", "P52910", "P00330", "P81450"),
  name  = c("2A5D", "2AAA", "2NDP", "3HAO", "AAD4",
            "AB140", "ACF4", "ACS2", "ADH1", "ATP18"),
  refID = c("NP_014657", "NP_009386", "NP_012683", "NP_012559", "NP_010038",
            "NP_014882", "NP_012616", "NP_013254", "NP_014555", "NP_013629")
)
myIDs

# Say we want to map a few RefSeq IDs, in a given order, to their gene names.
# The query below holds two IDs that are in the table ("NP_010038" and
# "NP_013629") and, between them, one that is not ("NP_999999").
myQuery <- c("NP_010038", "NP_999999", "NP_013629")

# %in% will only tell us if these IDs are present in the table:
myQuery %in% myIDs$refID

# ... but not where they are located. But match() does what we need here:
match(myQuery, myIDs$refID)

# ... and we can use the result as a row index, to pull the matching values
# out of the column that we want to map to:
myIDs[match(myQuery, myIDs$refID), "name"]
# Note that the output preserves the NA - i.e. the length of the mapped # Note that the output preserves the NA - i.e. the length of the mapped
# values is exactly the same as the length of the query. # values is exactly the same as the length of the query.
# task: map the three genes to their UniProt Identifier. # task: map the three genes to their UniProt Identifier.
# #
# Note: if you want to do very many queries in very large tables, use the # Note: if you want to do very many queries in very large tables, use the
# fmatch() function in the "fastmatch" package for a considerable # fmatch() function in the "fastmatch" package for a considerable
# speedup. # speedup.
# [END] # [END]

View File

@ -1,435 +1,435 @@
# tocID <- "BIN-FUNC-Domain_annotation.R" # tocID <- "BIN-FUNC-Domain_annotation.R"
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the BIN-FUNC-Domain_annotation unit. # R code accompanying the BIN-FUNC-Domain_annotation unit.
# #
# ============================================================================== # ==============================================================================
# Version: 1.4 # Version: 1.4
# #
# Date: 2017-11 - 2020-10 # Date: 2017-11 - 2020-10
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.4 Add code for shared data import from the Wiki # 1.4 Add code for shared data import from the Wiki
# 1.3 Add code for database export to JSON and instructions # 1.3 Add code for database export to JSON and instructions
# for uploading annotations to the Public Student Wiki page # for uploading annotations to the Public Student Wiki page
# 1.2 Consistently: data in ./myScripts/ ; # 1.2 Consistently: data in ./myScripts/ ;
# begin SHARING DATA section # begin SHARING DATA section
# 1.1 2020 Updates # 1.1 2020 Updates
# 1.0 Live version 2017 # 1.0 Live version 2017
# 0.1 First code copied from 2016 material. # 0.1 First code copied from 2016 material.
# #
# TODO: # TODO:
# Put the domain plot into a function # Put the domain plot into a function
# #
# == DO NOT SIMPLY source() THIS FILE! ======================================= # == DO NOT SIMPLY source() THIS FILE! =======================================
# #
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> --------------------------------------------------------------------- #TOC> ---------------------------------------------------------------------
#TOC> 1 Update your database script 51 #TOC> 1 Update your database script 51
#TOC> 1.1 Preparing an annotation file ... 58 #TOC> 1.1 Preparing an annotation file ... 58
#TOC> 1.1.1 BEFORE "BIN-ALI-Optimal_sequence_alignment" 61 #TOC> 1.1.1 BEFORE "BIN-ALI-Optimal_sequence_alignment" 61
#TOC> 1.1.2 AFTER "BIN-ALI-Optimal_sequence_alignment" 109 #TOC> 1.1.2 AFTER "BIN-ALI-Optimal_sequence_alignment" 109
#TOC> 1.2 Execute and Validate 136 #TOC> 1.2 Execute and Validate 136
#TOC> 2 Plot Annotations 161 #TOC> 2 Plot Annotations 161
#TOC> 3 SHARING DATA 287 #TOC> 3 SHARING DATA 287
#TOC> 3.1 Post MBP1_MYSPE as JSON data 303 #TOC> 3.1 Post MBP1_MYSPE as JSON data 303
#TOC> 3.2 Import shared MBP1_MYSPE from the Wiki 326 #TOC> 3.2 Import shared MBP1_MYSPE from the Wiki 326
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 Update your database script ========================================= # = 1 Update your database script =========================================
# Since you have recorded domain features at the SMART database, we can store # Since you have recorded domain features at the SMART database, we can store
# the feature annotations in myDB ... # the feature annotations in myDB ...
# == 1.1 Preparing an annotation file ... ================================== # == 1.1 Preparing an annotation file ... ==================================
# === 1.1.1 BEFORE "BIN-ALI-Optimal_sequence_alignment" # === 1.1.1 BEFORE "BIN-ALI-Optimal_sequence_alignment"
# #
# IF YOU HAVE NOT YET COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT: # IF YOU HAVE NOT YET COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT:
# #
# You DON'T already have a file called "<MYSPE>-Annotations.json" in the # You DON'T already have a file called "<MYSPE>-Annotations.json" in the
# ./myScripts/ directory: # ./myScripts/ directory:
# #
# - Make a copy of the file "./data/refAnnotations.json" and put it in your # - Make a copy of the file "./data/refAnnotations.json" and put it in your
# myScripts/ directory. # myScripts/ directory.
# #
# - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g. # - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.
# if MYSPE is called "Cryptococcus neoformans", your file should be called # if MYSPE is called "Cryptococcus neoformans", your file should be called
# "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is # "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is
# "MBP1_CRYNE"). # "MBP1_CRYNE").
# #
# - Open the file in the RStudio editor and delete all blocks for # - Open the file in the RStudio editor and delete all blocks for
# the Mbp1 protein annotations except the first one. # the Mbp1 protein annotations except the first one.
# #
# - From that block, delete all lines that have annotations you did not # - From that block, delete all lines that have annotations you did not
# find in SMART for MBP1_MYSPE. # find in SMART for MBP1_MYSPE.
# #
# - Make enough copies of the "Ankyrin fold" and "low complexity" region # - Make enough copies of the "Ankyrin fold" and "low complexity" region
# lines to have a line for each feature you found. # lines to have a line for each feature you found.
# #
# - Then delete the comma at the end of the last line. # - Then delete the comma at the end of the last line.
# #
# - Edit the annotations: change MBP1_SACCE to MBP1_<MYSPE> everywhere # - Edit the annotations: change MBP1_SACCE to MBP1_<MYSPE> everywhere
# and change the "start" and "end" features to the coordinates you # and change the "start" and "end" features to the coordinates you
# recorded in the SMART database. # recorded in the SMART database.
# #
# - Save your file in the ./myScripts/ folder. # - Save your file in the ./myScripts/ folder.
# #
# - Validate your file online at https://jsonlint.com/ # - Validate your file online at https://jsonlint.com/
# #
# - Update your "./myScripts/makeProteinDB.R" script to load your new # - Update your "./myScripts/makeProteinDB.R" script to load your new
# annotation when you recreate the database. Open the script in the # annotation when you recreate the database. Open the script in the
# RStudio editor, and add the following command at the end: # RStudio editor, and add the following command at the end:
# #
# myDB <- dbAddAnnotation(myDB, # myDB <- dbAddAnnotation(myDB,
# jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json")) # jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json"))
# ^^^^^^^ # ^^^^^^^
# edit this! # edit this!
# #
# - save and close the file. # - save and close the file.
# #
# Then SKIP the next section. # Then SKIP the next section.
# #
# #
# === 1.1.2 AFTER "BIN-ALI-Optimal_sequence_alignment" # === 1.1.2 AFTER "BIN-ALI-Optimal_sequence_alignment"
# #
# IF YOU HAVE ALREADY COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT: # IF YOU HAVE ALREADY COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT:
# #
# You SHOULD have a file called "<MYSPE>-Annotations.json" in the # You SHOULD have a file called "<MYSPE>-Annotations.json" in the
# ./myScripts/ directory: # ./myScripts/ directory:
# #
# - Open the file in the RStudio editor. # - Open the file in the RStudio editor.
# #
# - Make as many copies of the "APSES fold" line as you have found # - Make as many copies of the "APSES fold" line as you have found
# features in SMART. # features in SMART.
# #
# - Add a comma after every line except for the last one # - Add a comma after every line except for the last one
# #
# - Edit the annotations but include only features that are in the # - Edit the annotations but include only features that are in the
# myDB$feature table. Check which features are in the database by executing # myDB$feature table. Check which features are in the database by executing
# #
# myDB$feature$name # myDB$feature$name
# #
# - Update the "start" and "end" coordinates for each feature to the # - Update the "start" and "end" coordinates for each feature to the
# values you found. # values you found.
# #
# - Save your file. # - Save your file.
# #
# - Validate your file online at https://jsonlint.com/ # - Validate your file online at https://jsonlint.com/
# #
# #
# == 1.2 Execute and Validate ============================================== # == 1.2 Execute and Validate ==============================================
# #
# - source() your database creation script: # - source() your database creation script:
# #
# source("./myScripts/makeProteinDB.R") # source("./myScripts/makeProteinDB.R")
# #
# This should run without errors or warnings. If it doesn't work and you # This should run without errors or warnings. If it doesn't work and you
# can't figure out quickly what's happening, ask for help on the # can't figure out quickly what's happening, ask for help on the
# Discussion Board. # Discussion Board.
# #
# - Confirm # - Confirm
# The following commands should retrieve all of the features that have been # The following commands should retrieve all of the features that have been
# annotated for MBP1_MYSPE # annotated for MBP1_MYSPE
# Logical selector for the protein named "MBP1_<code>", where <code> is
# whatever biCode(MYSPE) returns.
sel <- myDB$protein$name == paste0("MBP1_", biCode(MYSPE))

# ID of that protein ...
proID <- myDB$protein$ID[sel]
proID

# ... the IDs of all annotations that refer to it ...
fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID == proID]
fanIDs

# ... and the distinct feature IDs of those annotations.
# NOTE(review): the annotation IDs are used as an index into the featureID
# column here (and the feature IDs as an index into feature$name below). That
# is only correct if IDs coincide with row positions - confirm against the
# database schema.
ftrIDs <- unique(myDB$annotation$featureID[fanIDs])
ftrIDs

myDB$feature$name[ftrIDs]  # This should list ALL of your annotated features
                           # (once). If not, consider what could have gone wrong
                           # and ask on the list if you have difficulties fixing
                           # it.
# = 2 Plot Annotations ==================================================== # = 2 Plot Annotations ====================================================
# In this section we will plot domain annotations as colored rectangles on a # In this section we will plot domain annotations as colored rectangles on a
# sequence, as an example of using the R plotting system for generic, data # sequence, as an example of using the R plotting system for generic, data
# driven images. # driven images.
# We need a small utility function that draws the annotation boxes on a # We need a small utility function that draws the annotation boxes on a
# representation of sequence. It should accept the start and end coordinates, # representation of sequence. It should accept the start and end coordinates,
# the y value where it should be plotted and the color of the box, and plot a # the y value where it should be plotted and the color of the box, and plot a
# rectangle using R's rect() function. # rectangle using R's rect() function.
drawBox <- function(xStart, xEnd, y, myCol, DELTA = 0.2) {
  # Add a filled rectangle with a black border to the current plot.
  # Parameters:
  #    xStart  num   left edge of the box
  #    xEnd    num   right edge of the box
  #    y       num   vertical centre of the box
  #    myCol   char  fill colour
  #    DELTA   num   half-height: the box spans y - DELTA to y + DELTA
  # Value
  #    whatever rect() returns; called for the side effect of drawing.
  yBottom <- y - DELTA
  yTop    <- y + DELTA
  rect(xleft = xStart, ybottom = yBottom,
       xright = xEnd, ytop = yTop,
       col = myCol, border = "black")
}
# test this: # test this:
# A horizontal line from -1.5 to 1.5 at height 0, with one box centred on it.
plot(x = c(-1.5, 1.5), y = c(0, 0), type = "l")
drawBox(xStart = -1, xEnd = 1, y = 0.0, myCol = "peachpuff")
# Next, we define a function to plot annotations for one protein: the name of # Next, we define a function to plot annotations for one protein: the name of
# the protein, a horizontal grey line for its length, and all of its features. # the protein, a horizontal grey line for its length, and all of its features.
plotProtein <- function(DB, name, y) {
  # Add the annotations of one protein to the current plot: its name, a
  # horizontal grey line for its length, and one box per annotated feature.
  # Parameters:
  #    DB:   protein database. Read here: DB$protein (name, ID, sequence),
  #          DB$feature (ID), DB$annotation (proteinID, featureID, start, end)
  #    name: the name of the protein in the database. Must match exactly one
  #          row of DB$protein, otherwise an error is raised before anything
  #          is drawn.
  #    y:    height where to draw the plot
  # Value
  #    NULL, invisibly. Called for its side effect of drawing.

  # find the row-index of the protein ID in the protein table of DB. We check
  # this first, so that a wrong name fails with a clear message instead of a
  # length mismatch inside lines() after the label has already been drawn.
  iProtein <- which(DB$protein$name == name)
  if (length(iProtein) != 1) {
    stop(sprintf("Expected exactly one protein named \"%s\" in DB, found %d.",
                 name, length(iProtein)),
         call. = FALSE)
  }

  # Define colors: we create a vector of color values, one for
  # each feature, and we give it names of the feature ID. Then we
  # can easily get the color value from the feature ID.
  # A: make a vector of color values. The syntax may appear unusual -
  #    colorRampPalette() returns a function, and we simply append
  #    the parameter (number-of-features) without assigning the function
  #    to its own variable name.
  ftrCol <- colorRampPalette(c("#f2003c", "#F0A200", "#f0ea00",
                               "#62C923", "#0A9A9B", "#1958C3",
                               "#8000D3", "#D0007F"),
                             space = "Lab",
                             interpolate = "linear")(nrow(DB$feature))
  # B: Features may overlap, so we make the colors transparent by setting
  #    their "alpha channel" to 1/3  (hex: 55)
  ftrCol <- paste0(ftrCol, "55")
  # C: we assign names
  names(ftrCol) <- DB$feature$ID
  # E.g. color for the third feature: ftrCol[ as.character(DB$feature$ID[3]) ]

  # write the name of the protein
  text(-30, y, adj = 1, labels = name, cex = 0.75)

  # draw a line from 0 to nchar(sequence-of-the-protein)
  lines(c(0, nchar(DB$protein$sequence[iProtein])), c(y, y),
        lwd = 3, col = "#999999")

  # get the rows of feature annotations for the protein
  iFtr <- which(DB$annotation$proteinID == DB$protein$ID[iProtein])

  # draw a colored box for each feature. The color is looked up by NAME:
  # as.character() makes sure that a numeric feature ID is not taken as a
  # position in ftrCol, which would pick the wrong color (or NA) whenever
  # the feature IDs are not exactly 1, 2, 3, ...
  for (i in iFtr) {
    drawBox(DB$annotation$start[i],
            DB$annotation$end[i],
            y,
            ftrCol[ as.character(DB$annotation$featureID[i]) ])
  }

  invisible(NULL)
}
# Plot each annotated protein: # Plot each annotated protein:
# Get the rows of all unique annotated Mbp1 proteins in myDB # Get the rows of all unique annotated Mbp1 proteins in myDB
iRows <- grep(pattern = "^MBP1_", x = myDB$protein$name)

# define the size of the plot-frame to accommodate all proteins: one unit of
# height per protein and the length of the longest sequence, each with a 10 %
# margin
yMax <- 1.1 * length(iRows)
xMax <- 1.1 * max(nchar(myDB$protein$sequence[iRows]))   # longest sequence

# save the current plot parameters and decrease margins ...
oPar <- par(mar = c(4.2, 0.1, 3, 0.1))

# ... then plot an empty frame; the x-axis is added separately below
plot(x = 1, y = 1,
     type = "n",
     axes = FALSE,
     bty = "n",
     xlim = c(-200, xMax + 100),
     ylim = c(0, yMax),
     main = "Mbp1 orthologue domain annotations",
     xlab = "sequence position",
     ylab = "",
     cex.axis = 0.8)
axis(side = 1, at = seq(from = 0, to = xMax, by = 100))

# Legend colors: the same palette and the same "55" alpha suffix that
# plotProtein() uses, one color per row of myDB$feature.
myCol <- paste0(colorRampPalette(c("#f2003c", "#F0A200",
                                   "#f0ea00", "#62C923",
                                   "#0A9A9B", "#1958C3",
                                   "#8000D3", "#D0007F"),
                                 space = "Lab",
                                 interpolate = "linear")(nrow(myDB$feature)),
                "55")

# NOTE(review): the legend is anchored at the fixed height y = 7, independent
# of yMax - confirm that this is intended when fewer proteins are plotted.
legend(x = xMax - 150, y = 7,
       legend = myDB$feature$name,
       fill = myCol,
       cex = 0.7,
       bty = "n")

# Finally, iterate over all proteins and call plotProtein(); protein number i
# is drawn at height i
for (i in seq_along(iRows)) {
  plotProtein(myDB, myDB$protein$name[iRows[i]], i)
}

par(oPar)  # reset the plot parameters
# The plot shows what is variable and what is constant about the annotations in # The plot shows what is variable and what is constant about the annotations in
# a group of related proteins. Your MBP1_MYSPE annotations should appear at the # a group of related proteins. Your MBP1_MYSPE annotations should appear at the
# top. # top.
# Task: # Task:
# Put a copy of the plot into your journal and interpret it with respect
# to MBP1_MYSPE, i.e. note what you learn about MBP1_MYSPE from the plot.
# Task: # Task:
# It would be better to align the motif borders, at least approximately (not # It would be better to align the motif borders, at least approximately (not
# all proteins have all motifs). How would you go about doing that? # all proteins have all motifs). How would you go about doing that?
# = 3 SHARING DATA ======================================================== # = 3 SHARING DATA ========================================================
# It's particularly interesting to compare such annotations across many
# homologous proteins. I have created a page on the Student Wiki
# (http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public)
# that you can edit, and then download the data from the entire class
# directly to your RStudio project.
# #
# I have provided a function that extracts all information that refers to a # I have provided a function that extracts all information that refers to a
# single protein from the database, and prints it out as well-formatted JSON, # single protein from the database, and prints it out as well-formatted JSON,
# suitable to be pasted into our shareable Wiki-page. There is a fair amount of # suitable to be pasted into our shareable Wiki-page. There is a fair amount of
# bookkeeping involved, but the code is not otherwise very enlightening so I # bookkeeping involved, but the code is not otherwise very enlightening so I
# will spare you the details - it's in "./scripts/ABC-dbUtilities.R" if you # will spare you the details - it's in "./scripts/ABC-dbUtilities.R" if you
# would want to have a look. # would want to have a look.
# == 3.1 Post MBP1_MYSPE as JSON data ====================================== # == 3.1 Post MBP1_MYSPE as JSON data ======================================
# Task: # Task:
# ===== # =====
# 1: Run the following code: # 1: Run the following code:
# Print the MBP1_<MYSPE code> record as JSON, wrapped in the Wiki markup that
# marks a protein entry. Every element goes on its own line; the empty
# last element makes the output end with a line break.
cat(sep = "\n",
    "{{Vspace}}",
    "<!-- ==== BEGIN PROTEIN ==== -->",
    "<pre class=\"protein-data\">",
    dbProt2JSON(sprintf("MBP1_%s", biCode(MYSPE))),
    "</pre>",
    "<!-- ===== END PROTEIN ====== -->",
    "")
# 2: Copy the entire output from the console. # 2: Copy the entire output from the console.
# 3: Navigate to # 3: Navigate to
# http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public # http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public
# ... edit the page, and paste your output at the top. # ... edit the page, and paste your output at the top.
# 4: Save your edits. # 4: Save your edits.
# == 3.2 Import shared MBP1_MYSPE from the Wiki ============================ # == 3.2 Import shared MBP1_MYSPE from the Wiki ============================
# Once we have collected a number of protein annotations, we can access the # Once we have collected a number of protein annotations, we can access the
# Wiki-page and import the data into our database. The Wiki page is an html # Wiki-page and import the data into our database. The Wiki page is an html
# document with lots of MediaWiki specific stuff - but the content we are
# interested in is enclosed in <pre class="protein-data"> ... </pre> tags. These
# work like normal HTML <pre> tags, but we have defined a special class for them # work like normal HTML <pre> tags, but we have defined a special class for them
# to make it easy to parse out the contents we want. The rvest:: package in # to make it easy to parse out the contents we want. The rvest:: package in
# combination with xml2:: provides us with all the tools we need for such # combination with xml2:: provides us with all the tools we need for such
# "Webscraping" of data.... # "Webscraping" of data....
# Install the two scraping packages only if they are not yet available
if (! requireNamespace("rvest", quietly = TRUE)) install.packages("rvest")
if (! requireNamespace("xml2", quietly = TRUE)) install.packages("xml2")
# Here's the process: # Here's the process:
# The URL is an "open" page on the student Wiki. Users that are not logged in # The URL is an "open" page on the student Wiki. Users that are not logged in
# can view the contents, but you can only edit if you are logged in. # can view the contents, but you can only edit if you are logged in.
myURL <- "http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public"

# First thing is to retrieve the HTML from the URL ...
x <- xml2::read_html(myURL)

# This retrieves the page source, but that still needs to be parsed into its
# logical elements. HTML, like XML, is a markup language whose documents are
# structured as trees that have "nodes", which are demarcated with "tags".
# rvest::html_nodes() parses out the document structure and then uses a
# so-called "xpath" expression to select the nodes we are interested in. Now,
# xpath is one of those specialized languages of which there are a few more to
# learn than one would care for. You MUST know how to format sprintf()
# expressions, and you SHOULD be competent with regular expressions. But if you
# want to be really competent in your work, basic HTML and CSS is required ...
# and enough knowledge about xpath to be able to search on Stackoverflow for
# what you need for parsing data out of Web documents ...
#
# The expression we use below is:
#   - get any node anywhere in the tree ("//*") ...
#   - that has a particular attribute ("[@ ... ]").
#   - The attribute we want is that the class of the node is "protein-data";
#     that is the class we have defined for our <pre> tags.
# As a result of this selection, we get a set of pointers into the document
# tree.
y <- rvest::html_nodes(x, xpath = "//*[@class=\"protein-data\"]")

# Next we fetch the actual payload - the text - from the tree:
# rvest::html_text() gets the text from the selected nodes. The result is a
# normal character vector with one string per node.
z <- rvest::html_text(y)
# Finally we can iterate over the entries, and add all proteins we don't
# already have to our database. There may well be items that are rejected
# because they are already present in the database - for example, unless
# somebody has annotated new features, all of the features are already there.
# Don't worry - that is intended; we don't want duplicate entries.
#
# The Wiki page is edited by hand, so any single entry may be malformed. An
# entry that cannot be parsed, or that does not describe exactly one named
# protein, is reported with a warning and skipped, so that it does not abort
# the import of all the other entries.
#
# NOTE(review): jsonlite::fromJSON() also accepts a URL or a file name in place
# of JSON text. Since the entries come from a publicly editable page, consider
# a string-only parser such as jsonlite::parse_json() - confirm before changing.
for (thisJSON in z) {
  thisData <- tryCatch(
    jsonlite::fromJSON(thisJSON),
    error = function(e) {
      warning("Skipping a Wiki entry that is not valid JSON: ",
              conditionMessage(e), call. = FALSE)
      NULL
    }
  )
  if (is.null(thisData)) {
    next   # parse failure (reported above) or an empty entry: nothing to add
  }

  # The checks are ordered so that `$` is never applied to an atomic value
  if (! is.list(thisData) ||
      ! is.list(thisData$protein) ||
      length(thisData$protein$name) != 1) {
    warning("Skipping a Wiki entry that does not describe exactly one ",
            "named protein.", call. = FALSE)
    next
  }

  if (! thisData$protein$name %in% myDB$protein$name) {
    myDB <- dbAddProtein(myDB, thisData$protein)
    myDB <- dbAddTaxonomy(myDB, thisData$taxonomy)
    myDB <- dbAddFeature(myDB, thisData$feature)
    myDB <- dbAddAnnotation(myDB, thisData$annotation)
  }
}
# Finally, we can repeat our domain plot with the results - which now include
# the shared proteins:
iRows <- grep("^MBP1_", myDB$protein$name)

# Frame size: one unit of height per protein and the length of the longest
# sequence, each times 1.1
yMax <- 1.1 * length(iRows)
xMax <- 1.1 * max(nchar(myDB$protein$sequence[iRows]))

# Keep the previous plot parameters in oPar while the margins are decreased
oPar <- par(mar = c(4.2, 0.1, 3, 0.1))

# Plot an empty frame, then add only the x-axis (sequence positions)
plot(1, 1,
     type = "n",
     xlim = c(-200, xMax + 100),
     ylim = c(0, yMax),
     main = "Mbp1 orthologue domain annotations",
     xlab = "sequence position",
     ylab = "",
     axes = FALSE,
     bty = "n",
     cex.axis = 0.8)
axis(1, at = seq(0, xMax, by = 100))

# One semi-transparent colour (alpha "55") per row of myDB$feature
myCol <- paste0(colorRampPalette(c("#f2003c", "#F0A200",
                                   "#f0ea00", "#62C923",
                                   "#0A9A9B", "#1958C3",
                                   "#8000D3", "#D0007F"),
                                 space = "Lab",
                                 interpolate = "linear")(nrow(myDB$feature)),
                "55")
legend(x = xMax - 150,
       y = 7,
       legend = myDB$feature$name,
       fill = myCol,
       cex = 0.7,
       bty = "n")

# Draw every protein; the loop index is also the height at which it is drawn
for (i in seq_along(iRows)) {
  plotProtein(myDB, myDB$protein$name[iRows[i]], i)
}

par(oPar)  # reset the plot parameters
# ... the more proteins we can compare, the more we learn about the # ... the more proteins we can compare, the more we learn about the
# architectural principles of this family's domains. # architectural principles of this family's domains.
# [END] # [END]

View File

@ -1,169 +1,169 @@
# tocID <- "BIN-FUNC-Semantic_similarity.R" # tocID <- "BIN-FUNC-Semantic_similarity.R"
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the BIN-FUNC_Semantic_similarity unit. # R code accompanying the BIN-FUNC_Semantic_similarity unit.
# #
# Version: 1.2 # Version: 1.2
# #
# Date: 2017-11 - 2020-09 # Date: 2017-11 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.2 2020 Maintenance # 1.2 2020 Maintenance
# 1.1 Change from require() to requireNamespace(), # 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout, # use <package>::<function>() idiom throughout,
#                  use BiocManager:: not biocLite()
# 1.0 New code. # 1.0 New code.
# #
# #
# TODO: # TODO:
# #
# #
# == DO NOT SIMPLY source() THIS FILE! ======================================= # == DO NOT SIMPLY source() THIS FILE! =======================================
# #
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> -------------------------------------------------------------------- #TOC> --------------------------------------------------------------------
#TOC> 1 Preparations: Packages, AnnotationDB, Setup 43 #TOC> 1 Preparations: Packages, AnnotationDB, Setup 43
#TOC> 2 Fetch GO Annotations 100 #TOC> 2 Fetch GO Annotations 100
#TOC> 3 Semantic Similarities 109 #TOC> 3 Semantic Similarities 109
#TOC> 4 GO Term Enrichment in Gene Sets 127 #TOC> 4 GO Term Enrichment in Gene Sets 127
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 Preparations: Packages, AnnotationDB, Setup ========================= # = 1 Preparations: Packages, AnnotationDB, Setup =========================
# BiocManager is the installer for Bioconductor packages.
if (! requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

# GOSim is an R-package in the Bioconductor project.
if (! requireNamespace("GOSim", quietly = TRUE)) {
  BiocManager::install("GOSim")
}
# Package information:
#  library(help = GOSim)       # basic information
#  browseVignettes("GOSim")    # available vignettes
#  data(package = "GOSim")     # available datasets

# GOSim makes extensive assumptions about loaded packages, and many base
# methods are masked. We will thus use library(GOSim) to load it
# in its entirety and with all packages it depends on. We will still use
# the <package>::<function>() syntax in the code below, but this now serves
# more of a didactic purpose, rather than actual syntax requirements.
library(GOSim)

# GOSim loads human annotations in org.Hs.eg.db by default. We load yeast
# annotations instead ...
if (! requireNamespace("org.Sc.sgd.db", quietly = TRUE)) {
  BiocManager::install("org.Sc.sgd.db")
}

# Bioconductor annotation packages won't work stably unless we actually load
# them:
library(org.Sc.sgd.db)

# org.Sc.sgd.db is a Bioconductor annotation database curated by SGD. Such
# databases exist for all model organisms. It's a kind of a fancy data frame
# from which we can get annotations by rows (genes) with the keys() function ...
AnnotationDbi::keys(org.Sc.sgd.db)[seq(1500, 1510)]

# ... and the types of available annotations with the columns() function
AnnotationDbi::columns(org.Sc.sgd.db)

# Note that one of the columns is "GO" ... and we load that into the
# datastructures used by GOSim:

# Choose GOterms to use
GOSim::setEvidenceLevel(
  evidences = "all",
  organism = org.Sc.sgdORGANISM,
  gomap = org.Sc.sgdGO
)

# Use Biological Process ontology
GOSim::setOntology("BP", loadIC = FALSE)

# confirm that we loaded the correct ontology
utils::head(get("gomap", envir = GOSimEnv))
# = 2 Fetch GO Annotations ================================================ # = 2 Fetch GO Annotations ================================================
# All keys being used here are yeast systematic names.

# Get one set of annotations: the GO information for a single key
GOSim::getGOInfo("YDL056W")  # Mbp1
# = 3 Semantic Similarities =============================================== # = 3 Semantic Similarities ===============================================
# Get semantic similarities between genes
help("getGeneSim")

# There are _many_ different metrics of term similarity implemented
# in this package.

# Mbp1 (YDL056W) and ...
GOSim::getGeneSim("YDL056W", "YLR182W", similarity = "OA")  # Swi6 - MCB complex
GOSim::getGeneSim("YDL056W", "YER111C", similarity = "OA")  # Swi4 - collaborators
GOSim::getGeneSim("YDL056W", "YBR160W", similarity = "OA")  # Cdc28 - mediator
GOSim::getGeneSim("YDL056W", "YGR108W", similarity = "OA")  # Clb1 - antagonist
GOSim::getGeneSim("YDL056W", "YLR079W", similarity = "OA")  # Sic1 - antagonist
# Labelled "Pgk1 - Gluconeogenesis".
# NOTE(review): SGD appears to list YJL130C as URA2, and PGK1 as YCR012W -
# confirm which gene is intended here.
GOSim::getGeneSim("YDL056W", "YJL130C", similarity = "OA")
# = 4 GO Term Enrichment in Gene Sets ===================================== # = 4 GO Term Enrichment in Gene Sets =====================================
# Calculating GO term enrichment in gene sets is done with the Bioconductor
# topGO package.
if (! requireNamespace("topGO", quietly = TRUE)) {
  BiocManager::install("topGO")
}
# Package information:
#  library(help = topGO)       # basic information
#  browseVignettes("topGO")    # available vignettes
#  data(package = "topGO")     # available datasets

# Once again - assumptions are made by GOSim that require us to load the
# topGO package wholesale:
library(topGO)

# Let's define a gene set: GOterm enrichment for G1/S switch activators:
mySet <- c("YFR028C",  # Cdc14
           "YDL056W",  # Mbp1
           "YLR182W",  # Swi6
           "YER111C",  # Swi4
           "YOR083W",  # Whi5
           "YBR160W",  # Cdc28
           "YMR199W",  # Cln1
           "YPL256C",  # Cln2
           "YAL040C")  # Cln3

# This is the context against which we define enrichment: all keys of the
# annotation database that start with "Y".
allGenes <- AnnotationDbi::keys(org.Sc.sgd.db)
allGenes <- allGenes[grep("^Y", allGenes)]

# GOenrichment() is a GOSim function. It is called with the
# <package>::<function>() idiom, like all other package functions in this
# script, so that it is obvious where it comes from.
myEnr <- GOSim::GOenrichment(mySet, allGenes)

sort(myEnr$p.values)  # Any significantly enriched terms? All of these are ...

# Most significantly enriched is GO:0071931. What is this?
annotate::getGOTerm("GO:0071931")  # ... makes sense.
# [END] # [END]

View File

@ -1,351 +1,351 @@
# tocID <- "BIN-MYSPE.R" # tocID <- "BIN-MYSPE.R"
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the BIN-MYSPE unit # R code accompanying the BIN-MYSPE unit
# #
# #
# Version: 1.4 # Version: 1.4
# #
# Date: 2017-09 - 2021-10 # Date: 2017-09 - 2021-10
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# V 1.4 Add troubleshooting hints via errText[[...]] # V 1.4 Add troubleshooting hints via errText[[...]]
# V 1.3 2021 update of MYSPE mechanics; fix a bug no one had complained about # V 1.3 2021 update of MYSPE mechanics; fix a bug no one had complained about
# V 1.2 Reorganized proportional plot section into a "further reading" # V 1.2 Reorganized proportional plot section into a "further reading"
# section, added nested-box, and sankey plot visualization of # section, added nested-box, and sankey plot visualization of
# proportions. Introduced plotly. # proportions. Introduced plotly.
# V 1.1 2020 Workflow changes # V 1.1 2020 Workflow changes
# V 1.0.1 Move ABC-makeMYSPElist.R to ./scripts directory # V 1.0.1 Move ABC-makeMYSPElist.R to ./scripts directory
# V 1.0 Final code, after rewriting BLAST parser and updating MYSPElist # V 1.0 Final code, after rewriting BLAST parser and updating MYSPElist
# V 0.1 First code copied from BCH441_A03_makeMYSPElist.R # V 0.1 First code copied from BCH441_A03_makeMYSPElist.R
# #
# TODO: Sample solution for sankey plot function. # TODO: Sample solution for sankey plot function.
# #
# #
# == HOW TO WORK WITH LEARNING UNIT FILES ====================================== # == HOW TO WORK WITH LEARNING UNIT FILES ======================================
# #
# DO NOT SIMPLY source() THESE FILES! # DO NOT SIMPLY source() THESE FILES!
# #
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> ----------------------------------------------------------------- #TOC> -----------------------------------------------------------------
#TOC> 1 PREPARATIONS 52 #TOC> 1 PREPARATIONS 52
#TOC> 2 SUITABLE MYSPE SPECIES 65 #TOC> 2 SUITABLE MYSPE SPECIES 65
#TOC> 3 ADOPT "MYSPE" 89 #TOC> 3 ADOPT "MYSPE" 89
#TOC> 4 FURTHER READING: PLOTTING PROPORTIONS 128 #TOC> 4 FURTHER READING: PLOTTING PROPORTIONS 128
#TOC> 4.1 Percentages 146 #TOC> 4.1 Percentages 146
#TOC> 4.2 Visualizing proportions: Pie chart 165 #TOC> 4.2 Visualizing proportions: Pie chart 165
#TOC> 4.3 Visualizing proportions: Nested squares 243 #TOC> 4.3 Visualizing proportions: Nested squares 243
#TOC> 4.4 Visualizing proportions: Sankey diagrams 280 #TOC> 4.4 Visualizing proportions: Sankey diagrams 280
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 PREPARATIONS ======================================================== # = 1 PREPARATIONS ========================================================
# #
# Execute the two conditionals below: # Execute the two conditionals below:
# Guard 1: stop unless the personalized profile file is in myScripts/.
if (! file.exists("./myScripts/.myProfile.R")) {
  stop(errText[["noProfileFile"]])  # message defined in .Rprofile
}
# Guard 2: stop unless a student number has been defined in this session.
if (! exists("myStudentNumber")) {
  stop(errText[["noStudentNumber"]])  # message defined in .Rprofile
}
# = 2 SUITABLE MYSPE SPECIES ============================================== # = 2 SUITABLE MYSPE SPECIES ==============================================
# In this unit we will select one species from a list of genome sequenced fungi # In this unit we will select one species from a list of genome sequenced fungi
# and write it into your personalized profile file. This species will be called # and write it into your personalized profile file. This species will be called
# "MYSPE" (My Species) for other learning units and exercises. # "MYSPE" (My Species) for other learning units and exercises.
# A detailed description of the process of compiling the list of genome
# sequenced fungi with protein annotations and Mbp1 homologues is in the file
# ./scripts/ABC-makeMYSPElist.R. In brief, data for genome-sequenced fungi
# was retrieved from https://fungi.ensembl.org; a search for homologues to
# yeast Mbp1 was performed with BLAST at the NCBI, and the data was merged.
# A representative organism at each genus-level was chosen from those hits
# that actually have a homologue. Finally, a mapping table was constructed to
# asymmetrically retrieve unique species: a student number will retrieve
# a species, but (public) knowledge of the species cannot reconstruct the
# student number.
# Task: Study ./scripts/ABC-makeMYSPElist.R, it implements a typical workflow # Task: Study ./scripts/ABC-makeMYSPElist.R, it implements a typical workflow
# of selecting and combining data from various data resources. Studying # of selecting and combining data from various data resources. Studying
# it will give you a better sense of how such workflows can be # it will give you a better sense of how such workflows can be
# implemented in practice. # implemented in practice.
# = 3 ADOPT "MYSPE" ======================================================= # = 3 ADOPT "MYSPE" =======================================================
# Execute: # Execute:
( MYSPE <- getMYSPE(myStudentNumber) )
# If this produced an error, this session has not been properly set up. You
# may not yet have run init() and edited .myProfile.R , or that file is not
# in your myScripts/ folder. Fix this, and execute:
#
#     source(".Rprofile") .
# If this produced NA, your Student Number may not be correct, or you are not in
# my class-list. Contact me. Otherwise, this should have printed a species name,
# and the taxonomy ID of its genome-sequenced strain. This is your unique
# species for this course. Note it in your journal ...
biCode(MYSPE)                  # and also note its "BiCode" ...
( myTaxID <- names(MYSPE) )    # and its taxID (stored as the name of MYSPE)
# Task:
# =====
#   Note down the species name and its five letter BiCode on your Student
#   Wiki user page. Use this species whenever this or future assignments refer
#   to MYSPE. Whenever you start a session, it will automatically be loaded
#   from myScripts/.myProfile.R and is available as MYSPE .
# Here is some more information about MYSPE, taken from the table of genome-
# sequenced fungi that is in your ./data folder.
fungiDat <- read.csv("data/Species.csv")
# Row index of MYSPE in the table, looked up via its taxonomy ID
iMs <- which(fungiDat$Taxon.ID == myTaxID)
( myOr <- fungiDat$Classification[iMs] )      # Taxonomic order
# Genus: everything in MYSPE before the first whitespace character
( myGn <- gsub("\\s.*", "", MYSPE))           # Taxonomic genus
( mySt <- fungiDat$Name[iMs] )                # Taxonomic strain
# That's all. # That's all.
# = 4 FURTHER READING: PLOTTING PROPORTIONS =============================== # = 4 FURTHER READING: PLOTTING PROPORTIONS ===============================
# The material below is an exploration of data-preparation and plotting # The material below is an exploration of data-preparation and plotting
# techniques; you can treat this as additional practice and further reading and # techniques; you can treat this as additional practice and further reading and
# I expect that some of the code and plotting examples may be useful in a # I expect that some of the code and plotting examples may be useful in a
# different context. # different context.
# A frequent task is to visualize the proportion of elements with given
# categories in a sample. For example, we might ask: what proportion of all
# genome-sequenced fungi belongs to the same order as MYSPE? Let's first
# collect the numbers.
# Count the nested categories. The names are matched as literal substrings
# (fixed = TRUE): taxon names are plain text, and a name containing regular
# expression metacharacters (e.g. brackets or a period) would otherwise be
# interpreted as a pattern and could match the wrong rows.
( nFungi   <- nrow(fungiDat) )                               # sequenced fungi
( nOrder   <- sum(grepl(myOr,  fungiDat$Classification,
                        fixed = TRUE)) )                     # same order as MYSPE
( nGenus   <- sum(grepl(myGn,  fungiDat$Name,
                        fixed = TRUE)) )                     # same genus as MYSPE
( nSpecies <- sum(grepl(MYSPE, fungiDat$Name,
                        fixed = TRUE)) )                     # same species as MYSPE
# ==   4.1  Percentages  =======================================================
# The zeroth-order approach to visualization is simply to print percentages:
cat(sprintf("\n%s comprise %5.2f%% of fungi.",
            myOr,
            (nOrder * 100) / nFungi))
# ... or, adding the actual numbers:
cat(sprintf("\n%s comprise %5.2f%% of fungi (%d of %d).",
            myOr,
            (nOrder * 100) / nFungi,
            nOrder,
            nFungi))
# But that's hard to visualize for most of us, and anyway, we don't know how # But that's hard to visualize for most of us, and anyway, we don't know how
# that relates to other orders. # that relates to other orders.
# == 4.2 Visualizing proportions: Pie chart ================================ # == 4.2 Visualizing proportions: Pie chart ================================
# Often, we will use a pie chart instead. Pie charts are rather informal types # Often, we will use a pie chart instead. Pie charts are rather informal types
# of plots, not well suited for analysis. But easy to do: # of plots, not well suited for analysis. But easy to do:
# Define four colors to identify the four categories # Define four colors to identify the four categories
pCol <- c("#ed394e", "#ff9582", "#ffd5c4", "#f2f2f0")
oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))  # set margins to ~ 0
                                          # and remember the
                                          # previous setting
# The categories are nested (species within genus within order within fungi),
# so each wedge is the count of one level minus the count of the level
# directly below it - and only that level. The four wedges then sum to nFungi.
pie(c(nSpecies,                           # MYSPE itself
      nGenus - nSpecies,                  # rest of the genus
      nOrder - nGenus,                    # rest of the order
      nFungi - nOrder),                   # all other fungi
    labels = "",
    radius = 0.9,
    main = "MYSPE in genome-sequenced fungi",
    lty = 0,                              # turn borders for wedges off
    col = pCol,
    clockwise = TRUE,
    init.angle = 90)
title(main=MYSPE, line=0, cex.main=0.7)   # add a title to the plot
legend(x = 0.95, y = 0.8,                 # place the legend here
       legend = c("Species", "Genus", "Order", "Fungi"),
       y.intersp = 2,                     # line spacing for labels
       cex = 0.8,                         # character size for labels
       bty = "n",                         # "no" box around the legend
       pt.cex = 2,                        # size of colour boxes
       pch = 15,                          # a filled square
       col = pCol)
par(oPar)                                 # reset graphics state
# Unless MYSPE is one of the frequently sequenced species, there will only be a # Unless MYSPE is one of the frequently sequenced species, there will only be a
# very thin wedge visible. Pie charts are not well suited to visualize small # very thin wedge visible. Pie charts are not well suited to visualize small
# proportions. # proportions.
# It is a little more useful if we have non-nested proportions - like the # It is a little more useful if we have non-nested proportions - like the
# number of species in the same order overall: # number of species in the same order overall:
# Number of genomes per order, largest first
myTbl <- sort(table(fungiDat$Classification), decreasing = TRUE)
head(myTbl)
# pie() does a reasonable job out of the box to interpret table() data:
pie(myTbl)
# ... we can improve this quickly with a bit of tweaking:
N <- length(myTbl)
sel <- myOr == names(myTbl)  # TRUE for the MYSPE order, FALSE elsewhere
myCol <- rep(pCol[4], N)     # N elements of pCol[4] (the background colour)
myCol[sel] <- pCol[1]        # replace this one color with the highlight
myLbl <- rep("", N)          # N labels of ""
myLbl[sel] <- myOr           # replace this one label with the MYSPE order
oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))  # set margins to ~ 0
pie(myTbl,
    labels = myLbl,
    radius = 0.9,
    main = "MYSPE order",
    border = "#DDDDDD",
    col = myCol,
    clockwise = TRUE,
    init.angle = 90)
par(oPar)  # reset graphics state
# But the overall problem remains. # But the overall problem remains.
# == 4.3 Visualizing proportions: Nested squares =========================== # == 4.3 Visualizing proportions: Nested squares ===========================
# A simple alternative is to draw such proportions as nested squares: # A simple alternative is to draw such proportions as nested squares:
# Side length of a square whose area equals the number of genomes
x <- sqrt(nFungi)
# set margins to ~ 0 and type to square
oPar <- par(mar = c(0.1, 0.1, 0.1, 0.1), pty = "s")
# empty, square plot
plot(c(0, x), c(0, x), xlim = c(0, x), ylim = c(0, x),
     type="n", axes=FALSE, xlab="", ylab="")
# basic square for all genomes
rect(0, 0, x, x, col = pCol[4])
# grid
u <- 0:floor(x)
N <- length(u)
segments(rep(0, N), u, rep(x, N), u, col = "#0000FF18")  # horizontal lines
segments(u, rep(0, N), u, rep(x, N), col = "#0000FF18")  # vertical lines
# each square on this grid is one genome
# colored squares: areas are proportional to the counts, all anchored at (0, 0)
rect(0, 0, sqrt(nOrder),   sqrt(nOrder),   col = pCol[3])
rect(0, 0, sqrt(nGenus),   sqrt(nGenus),   col = pCol[2])
rect(0, 0, sqrt(nSpecies), sqrt(nSpecies), col = pCol[1])
# labels
text(x/2, x/2, "Fungi")
text(x * 0.08, x * 0.11, myOr, pos = 4, cex = 0.9)
text(x * 0.08, x * 0.06, myGn, pos = 4, cex = 0.8)
text(x * 0.08, x * 0.02, MYSPE, pos = 4, cex = 0.7)
par(oPar)  # reset graphics state
# == 4.4 Visualizing proportions: Sankey diagrams ========================== # == 4.4 Visualizing proportions: Sankey diagrams ==========================
# Sankey diagrams are an excellent way to visualize complicated nested # Sankey diagrams are an excellent way to visualize complicated nested
# proportions and their changes (see here for example: # proportions and their changes (see here for example:
# https://www.r-graph-gallery.com/sankey-diagram.html). Here is a very simple # https://www.r-graph-gallery.com/sankey-diagram.html). Here is a very simple
# example with the MYSPE proportions, as an illustration of the plotting # example with the MYSPE proportions, as an illustration of the plotting
# principle. # principle.
# Install plotly only if it is not yet available; quietly = TRUE suppresses
# the message requireNamespace() would otherwise print on failure.
if (! requireNamespace("plotly", quietly = TRUE)) {
  install.packages("plotly")
}
# Package information: # Package information:
# library(help = plotly) # basic information # library(help = plotly) # basic information
# browseVignettes("plotly") # available vignettes # browseVignettes("plotly") # available vignettes
# data(package = "plotly") # available datasets # data(package = "plotly") # available datasets
# Here, we use the plotly package that wraps a very well developed javascript
# library with many options for interactive plots. I am producing this plot
# hard-coded for the sample organism "Sporothrix schenckii"; you would need
# to change the code to adapt it to your own MYSPE - or even build a function
# for this. Do try this if you have a bit of coding experience: Sankey diagrams
# are a good way to show hierarchical data relations - and if you get this
# working for your own organism you can be proud that you have understood
# how preparing the data works.
# Node definitions. Node IDs are zero-based and follow the order of the labels;
# x, y and color are given in that same order.
myNodes <- list(label = c("Fungi (1014)",               # 0 <- node ID
                          "Ophiostomatales (6)",        # 1
                          "Other...",                   # 2
                          "Sporothrix (4)",             # 3
                          "Other...",                   # 4
                          "Sporothrix schenckii (2)",   # 5
                          "Other..."                    # 6
                          ),
                x = c(0.1, 0.4, 0.4, 0.7, 0.7, 1.0, 1.0),
                y = c(0.3, 0.1, 0.7, 0.2, 0.7, 0.3, 0.7),
                color = c("#f2f2f0",
                          "#ffd5c4",
                          "#CCCCCC",
                          "#ff9582",
                          "#CCCCCC",
                          "#ed394e",
                          "#CCCCCC"
                          ),
                pad = 15,
                thickness = 20,
                line = list(color = "black",
                            width = 0.5))
# Link definitions: element i describes one link from source[i] to target[i]
# with weight value[i].
# NOTE(review): the weight 18 for "Fungi" -> "Other..." does not equal
# 1014 - 6; presumably it was scaled down to keep the small links visible -
# confirm before reusing these numbers.
myLinks <- list(source = c(0, 0, 1, 1, 3, 3),   # i.e. there is a link of
                target = c(1, 2, 3, 4, 5, 6),   # weight 6 between node 0
                value =  c(6, 18, 4, 2, 2, 2))  # and node 1
# Setting up the actual plot ...
fig <- plotly::plot_ly(type = "sankey",
                       arrangement = "snap",
                       orientation = "h",
                       node = myNodes,
                       link = myLinks)
# Adding and adjusting a few layout parameters
fig <- plotly::layout(fig,
                      title = "Fungi Genomes - Classification",
                      font = list(size = 10))
fig  # plot the diagram
# Note that the plot appears in the Viewer window, not the Plot window, and that # Note that the plot appears in the Viewer window, not the Plot window, and that
# it is interactive: you can hover over nodes and links, and drag the nodes # it is interactive: you can hover over nodes and links, and drag the nodes
# around. # around.
# [END] # [END]

View File

@ -1,234 +1,234 @@
# tocID <- "BIN-PHYLO-Data_preparation.R" # tocID <- "BIN-PHYLO-Data_preparation.R"
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the BIN-PHYLO-Data_preparation unit. # R code accompanying the BIN-PHYLO-Data_preparation unit.
# #
# Version: 1.2 # Version: 1.2
# #
# Date: 2017-10 - 2020-09 # Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.2 2020 Maintenance # 1.2 2020 Maintenance
# 1.1 Change from require() to requireNamespace(), # 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout, # use <package>::<function>() idiom throughout,
# use Biocmanager:: not biocLite() # use Biocmanager:: not biocLite()
# 1.0 First 2017 version # 1.0 First 2017 version
# 0.1 First code copied from 2016 material. # 0.1 First code copied from 2016 material.
# #
# #
# TODO: # TODO:
# #
# #
# == DO NOT SIMPLY source() THIS FILE! ======================================= # == DO NOT SIMPLY source() THIS FILE! =======================================
# #
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> --------------------------------------------------------- #TOC> ---------------------------------------------------------
#TOC> 1 Preparations 45 #TOC> 1 Preparations 45
#TOC> 2 Fetching sequences 77 #TOC> 2 Fetching sequences 77
#TOC> 3 Multiple Sequence Alignment 118 #TOC> 3 Multiple Sequence Alignment 118
#TOC> 4 Reviewing and Editing Alignments 137 #TOC> 4 Reviewing and Editing Alignments 137
#TOC> 4.1 Masking workflow 153 #TOC> 4.1 Masking workflow 153
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 Preparations ======================================================== # = 1 Preparations ========================================================
# You need to reload your protein database, including changes that might have
# been made to the reference files. If you have worked with the prerequisite
# units, you should have a script named "makeProteinDB.R" that will create the
# myDB object with a protein and feature database. Ask for advice if not.
# (Re)create the myDB database object
source("myScripts/makeProteinDB.R")
# Load packages we need
# BiocManager comes from CRAN and is used to install Bioconductor packages.
if (! requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
if (! requireNamespace("Biostrings", quietly = TRUE)) {
  BiocManager::install("Biostrings")
}
# Package information:
#  library(help = Biostrings)       # basic information
#  browseVignettes("Biostrings")    # available vignettes
#  data(package = "Biostrings")     # available datasets
if (! requireNamespace("msa", quietly = TRUE)) {
  BiocManager::install("msa")
}
# Package information: # Package information:
# library(help = msa) # basic information # library(help = msa) # basic information
# browseVignettes("msa") # available vignettes # browseVignettes("msa") # available vignettes
# data(package = "msa") # available datasets # data(package = "msa") # available datasets
# = 2 Fetching sequences ================================================== # = 2 Fetching sequences ==================================================
# myDB contains the ten Mbp1 orthologues from the reference species and the Mbp1 # myDB contains the ten Mbp1 orthologues from the reference species and the Mbp1
# RBM for MYSPE. We will construct a phylogenetic tree from the proteins' APSES # RBM for MYSPE. We will construct a phylogenetic tree from the proteins' APSES
# domains. You have annotated their ranges as a feature. The following code # domains. You have annotated their ranges as a feature. The following code
# retrieves the sequences from myDB. You have seen similar code in other units. # retrieves the sequences from myDB. You have seen similar code in other units.
# Rows of the protein table whose name starts with "MBP1_"
sel <- grep("^MBP1_", myDB$protein$name)
(proNames <- myDB$protein$name[sel])
(proIDs <- myDB$protein$ID[sel])
# ID of the "APSES fold" feature
(sel <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
# Annotation IDs that belong to one of our proteins AND carry that feature
(fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID %in% proIDs & # %in% !
                              myDB$annotation$featureID == sel])      # ==   !
# Why?
# One (named) sequence per annotation; preallocated to its final length
APSI <- character(length(fanIDs))
for (i in seq_along(fanIDs)) {
  sel <- myDB$annotation$ID == fanIDs[i]   # get the feature row index
  proID <- myDB$annotation$proteinID[sel]  # get its protein ID
  start <- myDB$annotation$start[sel]      # get start ...
  end   <- myDB$annotation$end[sel]        # ... and end
  sel <- myDB$protein$ID == proID          # get the protein row index ...
  # ... and the sequence
  APSI[i] <- substring(myDB$protein$sequence[sel], start, end)
  names(APSI)[i] <- (myDB$protein$name[sel])
}
head(APSI)
# Let's add the E.coli Kila-N domain sequence as an outgroup, for rooting our
# phylogenetic tree (see the unit's Wiki page for details on the sequence).
APSI <- c(APSI,
"IDGEIIHLRAKDGYINATSMCRTAGKLLSDYTRLKTTQEFFDELSRDMGIPISELIQSFKGGRPENQGTWVHPDIAINLAQ")
names(APSI)[length(APSI)] <- "KILA_ESCCO"
tail(APSI)
# = 3 Multiple Sequence Alignment ========================================= # = 3 Multiple Sequence Alignment =========================================
# This vector of sequences with named elements fulfills the requirements to be
# imported as a Biostrings object - an AAStringSet - which we need as input for
# the MSA algorithms in the msa package.
#
# Convert the named character vector to an AAStringSet and align it
APSESSet <- Biostrings::AAStringSet(APSI)
APSESMsa <- msa::msaMuscle(APSESSet, order = "aligned")
# Nb. msaMuscle() sometimes fails - reproducibly, but I am not sure why. If
# that happens in your case, just use msaClustalOmega() instead.
# inspect the alignment.
writeALN(APSESMsa)
# What do you think? Is this a good alignment for phylogenetic inference? # What do you think? Is this a good alignment for phylogenetic inference?
# = 4 Reviewing and Editing Alignments ==================================== # = 4 Reviewing and Editing Alignments ====================================
# Head back to the Wiki page for this unit and read up on the background # Head back to the Wiki page for this unit and read up on the background
# first. # first.
# Let's mask out all columns that have observations for
# less than 1/3 of the sequences in the dataset. This
# means they have more than round(nrow(APSESMsa) * (2/3))
# hyphens in a column.
#
# We take all sequences, split them into single
# characters, and put them into a matrix. Then we
# go through the matrix, column by column and decide
# whether we want to include that column.
# == 4.1 Masking workflow ================================================== # == 4.1 Masking workflow ==================================================
# get the length of the alignment # get the length of the alignment
# Alignment length: width of the first aligned sequence
(lenAli <- APSESMsa@unmasked@ranges@width[1])
# initialize a matrix that can hold all characters
# individually
msaMatrix <- matrix(character(nrow(APSESMsa) * lenAli),
                    ncol = lenAli)
# assign the correct rownames
rownames(msaMatrix) <- APSESMsa@unmasked@ranges@NAMES
# Split each aligned sequence into single characters, one matrix row per
# sequence. seq_len() is safe even if the alignment has zero rows.
for (i in seq_len(nrow(APSESMsa))) {
  msaMatrix[i, ] <- unlist(strsplit(as.character(APSESMsa@unmasked[i]), ""))
}
# inspect the result
msaMatrix[1:7, 30:40]
# Now let's make a logical vector with an element for each column that selects # Now let's make a logical vector with an element for each column that selects
# which columns should be masked out. # which columns should be masked out.
# The number of hyphens in a column is easy to count. Consider: # The number of hyphens in a column is easy to count. Consider:
msaMatrix[ , 20]               # column 20
msaMatrix[ , 20] == "-"        # TRUE for all gap characters
sum(msaMatrix[ , 20] == "-")   # adds 1 for each TRUE
# Thus filling our logical vector is simple:
# initialize a mask
colMask <- logical(ncol(msaMatrix))
# define the threshold for rejecting a column
limit <- round(nrow(APSESMsa) * (2/3))
# iterate over all columns, and write TRUE if there are less-or-equal to "limit"
# hyphens, FALSE if there are more - i.e. TRUE columns will be used for analysis
# and FALSE columns will be rejected.
for (i in seq_len(ncol(msaMatrix))) {
  count <- sum(msaMatrix[ , i] == "-")
  colMask[i] <- count <= limit # TRUE if less-or-equal to limit, FALSE if not
}
# Inspect the mask
colMask
# How many positions are being kept?
sum(colMask)
cat(sprintf("We are masking %4.2f %% of alignment columns.\n",
            100 * (1 - (sum(colMask) / length(colMask)))))
# Next, we use colMask to remove the masked columns from the matrix # Next, we use colMask to remove the masked columns from the matrix
# in one step: # in one step:
# drop = FALSE keeps the result a matrix even if only one column is retained;
# without it the subset collapses to a vector and nrow() below returns NULL.
maskedMatrix <- msaMatrix[ , colMask, drop = FALSE]
# check:
ncol(maskedMatrix)
# ... then collapse each row of single characters back into a string ...
# (the result vector is preallocated rather than grown inside the loop)
APSESphyloSet <- character(nrow(maskedMatrix))
for (i in seq_len(nrow(maskedMatrix))) {
  APSESphyloSet[i] <- paste(maskedMatrix[i, ], collapse="")
}
names(APSESphyloSet) <- rownames(maskedMatrix)
# inspect ...
writeALN(APSESphyloSet)
# As you see, we have removed a three residue insertion from MBP1_NEUCR, and
# several indels from the KILA_ESCCO outgroup sequence.
# We save the aligned, masked domains to a file in the data/ directory,
# in multi-FASTA format.
writeMFA(APSESphyloSet, myCon = "data/APSESphyloSet.mfa")
# [END] # [END]

View File

@ -1,406 +1,406 @@
# tocID <- "BIN-PHYLO-Tree_analysis.R" # tocID <- "BIN-PHYLO-Tree_analysis.R"
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the BIN-PHYLO-Tree_analysis unit. # R code accompanying the BIN-PHYLO-Tree_analysis unit.
# #
# Version: 1.2 # Version: 1.2
# #
# Date: 2017-10 - 2020-09 # Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.2 2020 updates. Deprecate iTol and use taxize:: instead. # 1.2 2020 updates. Deprecate iTol and use taxize:: instead.
# Rewrite of tip re-ordering. Better handling of # Rewrite of tip re-ordering. Better handling of
# messages. pBar() for randomization. # messages. pBar() for randomization.
# 1.1 Change from require() to requireNamespace(), # 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout, # use <package>::<function>() idiom throughout,
# use BiocManager:: not biocLite() # use BiocManager:: not biocLite()
# 1.0.2 Typo in variable name, style changes # 1.0.2 Typo in variable name, style changes
# 1.0.1 Wrong section heading # 1.0.1 Wrong section heading
# 1.0 First 2017 version # 1.0 First 2017 version
# 0.1 First code copied from 2016 material. # 0.1 First code copied from 2016 material.
# #
# #
# TODO: # TODO:
# #
# #
# == DO NOT SIMPLY source() THIS FILE! ======================================= # == DO NOT SIMPLY source() THIS FILE! =======================================
# #
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> -------------------------------------------------- #TOC> --------------------------------------------------
#TOC> 1 Preparation and Tree Plot 50 #TOC> 1 Preparation and Tree Plot 50
#TOC> 2 SPECIES REFERENCE TREE 66 #TOC> 2 SPECIES REFERENCE TREE 66
#TOC> 3 Tree Analysis 117 #TOC> 3 Tree Analysis 117
#TOC> 3.1 Rooting Trees 177 #TOC> 3.1 Rooting Trees 177
#TOC> 3.2 Rotating Clades 222 #TOC> 3.2 Rotating Clades 222
#TOC> 3.3 Computing tree distances 309 #TOC> 3.3 Computing tree distances 309
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 Preparation and Tree Plot =========================================== # = 1 Preparation and Tree Plot ===========================================
if (! requireNamespace("ape", quietly = TRUE)) { if (! requireNamespace("ape", quietly = TRUE)) {
install.packages("ape") install.packages("ape")
} }
# Package information: # Package information:
# library(help = ape) # basic information # library(help = ape) # basic information
# browseVignettes("ape") # available vignettes # browseVignettes("ape") # available vignettes
# data(package = "ape") # available datasets # data(package = "ape") # available datasets
# We change the graphics parameters from time to time, let's define the
# default so we can recreate a sane state.
# dev.off() throws an error if no graphics device is open, so we only close
# a device if there is one.
if (! is.null(dev.list())) {
  dev.off()
}
# Store only the parameters that can actually be set again: restoring the
# read-only ones with par(PAR) would produce a warning for each of them.
PAR <- par(no.readonly = TRUE)
# = 2 SPECIES REFERENCE TREE ============================================== # = 2 SPECIES REFERENCE TREE ==============================================
# Before we do any kind of phylogenetic analysis of genes from several species, # Before we do any kind of phylogenetic analysis of genes from several species,
# we MUST have a reference tree of the taxonomic relationships in hand. This # we MUST have a reference tree of the taxonomic relationships in hand. This
# context is absolutely required for the interpretation of our tree. # context is absolutely required for the interpretation of our tree.
# We have the tax-ids in our database, and the NCBI has the species tree - we just need some way to extract the subtree that corresponds to our taxons of interest. Here's how to use the taxize:: package. # We have the tax-ids in our database, and the NCBI has the species tree - we just need some way to extract the subtree that corresponds to our taxons of interest. Here's how to use the taxize:: package.
if (! requireNamespace("taxize", quietly = TRUE)) { if (! requireNamespace("taxize", quietly = TRUE)) {
install.packages("taxize") install.packages("taxize")
} }
# Package information: # Package information:
# library(help = taxize) # basic information # library(help = taxize) # basic information
# browseVignettes("taxize") # available vignettes # browseVignettes("taxize") # available vignettes
# data(package = "taxize") # available datasets # data(package = "taxize") # available datasets
( mySOI <- c(myDB$taxonomy$ID, "83333") ) ( mySOI <- c(myDB$taxonomy$ID, "83333") )
myClass <- taxize::classification(mySOI, db = "ncbi") myClass <- taxize::classification(mySOI, db = "ncbi")
str(myClass) str(myClass)
myClass[[1]] myClass[[1]]
fungiTree <- taxize::class2tree(myClass, check = TRUE) fungiTree <- taxize::class2tree(myClass, check = TRUE)
plot(fungiTree) plot(fungiTree)
# The tree produced by taxize:: contains full length species names, # The tree produced by taxize:: contains full length species names,
# but it would be more convenient if it had bicodes instead. Also, the actual # but it would be more convenient if it had bicodes instead. Also, the actual
# tree is only part of the list(), which will cause problems later: # tree is only part of the list(), which will cause problems later:
str(fungiTree) str(fungiTree)
# we therefore simplify # we therefore simplify
fungiTree <- fungiTree$phylo fungiTree <- fungiTree$phylo
str(fungiTree) str(fungiTree)
# The species names are in a vector $phylo$tip.label of this list. # The species names are in a vector $phylo$tip.label of this list.
# We can use biCode() to shorten them. # We can use biCode() to shorten them.
fungiTree$tip.label <- biCode(fungiTree$tip.label) fungiTree$tip.label <- biCode(fungiTree$tip.label)
# Plot the tree # Plot the tree
nSP <- length(fungiTree$tip.label) nSP <- length(fungiTree$tip.label)
plot(fungiTree, cex = 0.8, root.edge = TRUE, no.margin = TRUE) plot(fungiTree, cex = 0.8, root.edge = TRUE, no.margin = TRUE)
text(-1, nSP - 0.5, "Species Tree:\nFungi", pos = 4) text(-1, nSP - 0.5, "Species Tree:\nFungi", pos = 4)
ape::nodelabels(text = fungiTree$node.label, ape::nodelabels(text = fungiTree$node.label,
cex = 0.6, cex = 0.6,
adj = 0.2, adj = 0.2,
bg = "#D4F2DA") bg = "#D4F2DA")
# Note that you can use the arrow buttons in the menu above the plot pane to # Note that you can use the arrow buttons in the menu above the plot pane to
# scroll back to plots you have created earlier - so you can reference back to # scroll back to plots you have created earlier - so you can reference back to
# this species tree in your later analysis. # this species tree in your later analysis.
# = 3 Tree Analysis ======================================================= # = 3 Tree Analysis =======================================================
# 1.1 Visualizing your tree # 1.1 Visualizing your tree
# The trees that are produced by Rphylip are stored as an object of class # The trees that are produced by Rphylip are stored as an object of class
# "phylo". This is a class for phylogenetic trees that is widely used in the # "phylo". This is a class for phylogenetic trees that is widely used in the
# community, practically all R phylogenetics packages will have options to read and # community, practically all R phylogenetics packages will have options to read and
# manipulate such trees. Outside of R, a popular interchange format is the # manipulate such trees. Outside of R, a popular interchange format is the
# Newick_format that you have seen above. It's easy to output your calculated # Newick_format that you have seen above. It's easy to output your calculated
# trees in Newick format and visualize them elsewhere. # trees in Newick format and visualize them elsewhere.
# The "phylo" class object is one of R's "S3" objects and methods to plot and # The "phylo" class object is one of R's "S3" objects and methods to plot and
# print it have been defined with the Rphylip package, and in ape. You can # print it have been defined with the Rphylip package, and in ape. You can
# simply call plot(<your-tree>) and R knows what to do with <your-tree> and how # simply call plot(<your-tree>) and R knows what to do with <your-tree> and how
# to plot it. The underlying function is plot.phylo(), and documentation for its # to plot it. The underlying function is plot.phylo(), and documentation for its
# many options can be found by typing: # many options can be found by typing:
?plot.phylo ?plot.phylo
# We load the APSES sequence tree that you produced in the # We load the APSES sequence tree that you produced in the
# BIN-PHYLO-Tree_building unit: # BIN-PHYLO-Tree_building unit:
apsTree <- readRDS(file = "data/APSEStreeRproml.rds") apsTree <- readRDS(file = "data/APSEStreeRproml.rds")
plot(apsTree) # default type is "phylogram" plot(apsTree) # default type is "phylogram"
plot(apsTree, type = "unrooted") plot(apsTree, type = "unrooted")
plot(apsTree, type = "fan", no.margin = TRUE) plot(apsTree, type = "fan", no.margin = TRUE)
# rescale to show all of the labels: # rescale to show all of the labels:
# record the current plot parameters by assigning them to a variable ... # record the current plot parameters by assigning them to a variable ...
(tmp <- plot(apsTree, type="fan", no.margin = TRUE, plot=FALSE)) (tmp <- plot(apsTree, type="fan", no.margin = TRUE, plot=FALSE))
# ... and adjust the plot limits for a new plot: # ... and adjust the plot limits for a new plot:
plot(apsTree, plot(apsTree,
type = "fan", type = "fan",
x.lim = tmp$x.lim * 1.8, x.lim = tmp$x.lim * 1.8,
y.lim = tmp$y.lim * 1.8, y.lim = tmp$y.lim * 1.8,
cex = 0.8, cex = 0.8,
no.margin = TRUE) no.margin = TRUE)
# Inspect the tree object # Inspect the tree object
str(apsTree) str(apsTree)
apsTree$tip.label apsTree$tip.label
apsTree$edge apsTree$edge
apsTree$edge.length apsTree$edge.length
# show the node / edge and tip labels on a plot # show the node / edge and tip labels on a plot
plot(apsTree) plot(apsTree)
ape::nodelabels() ape::nodelabels()
ape::edgelabels() ape::edgelabels()
ape::tiplabels() ape::tiplabels()
# show the number of nodes, edges and tips # show the number of nodes, edges and tips
ape::Nnode(apsTree) ape::Nnode(apsTree)
ape::Nedge(apsTree) ape::Nedge(apsTree)
ape::Ntip(apsTree) ape::Ntip(apsTree)
par(PAR) # reset graphics state par(PAR) # reset graphics state
# Finally, write the tree to console in Newick format # Finally, write the tree to console in Newick format
ape::write.tree(apsTree) ape::write.tree(apsTree)
# == 3.1 Rooting Trees ===================================================== # == 3.1 Rooting Trees =====================================================
# In order to analyse the tree, it is helpful to root it first and reorder its # In order to analyse the tree, it is helpful to root it first and reorder its
# clades. Contrary to documentation, Rproml() returns an unrooted tree. # clades. Contrary to documentation, Rproml() returns an unrooted tree.
ape::is.rooted(apsTree) ape::is.rooted(apsTree)
# You can root the tree with the command root() from the "ape" package. # You can root the tree with the command root() from the "ape" package.
plot(apsTree) plot(apsTree)
# add labels for internal nodes and tips # add labels for internal nodes and tips
ape::nodelabels(cex = 0.5, frame = "circle") ape::nodelabels(cex = 0.5, frame = "circle")
ape::tiplabels(cex = 0.5, frame = "rect") ape::tiplabels(cex = 0.5, frame = "rect")
# The outgroup of the tree (KILA ESCCO) is tip "11" in my sample tree, it may be a different # The outgroup of the tree (KILA ESCCO) is tip "11" in my sample tree, it may be a different
# number in yours. Substitute the correct node number below for "outgroup". # number in yours. Substitute the correct node number below for "outgroup".
apsTree <- ape::root(apsTree, outgroup = 11, resolve.root = TRUE) apsTree <- ape::root(apsTree, outgroup = 11, resolve.root = TRUE)
plot(apsTree) plot(apsTree)
ape::is.rooted(apsTree) ape::is.rooted(apsTree)
# This tree _looks_ unchanged, because when the root trifurcation was resolved, # This tree _looks_ unchanged, because when the root trifurcation was resolved,
# an edge of length zero was added to connect the MRCA (Most Recent Common # an edge of length zero was added to connect the MRCA (Most Recent Common
# Ancestor) of the ingroup. # Ancestor) of the ingroup.
# The edge lengths are stored in the phylo object: # The edge lengths are stored in the phylo object:
apsTree$edge.length apsTree$edge.length
# ... and you can assign a small arbitrary value to the edge # ... and you can assign a small arbitrary value to the edge
# to show how it connects to the tree without having an # to show how it connects to the tree without having an
# overlap. # overlap.
apsTree$edge.length[1] <- 0.1 apsTree$edge.length[1] <- 0.1
plot(apsTree, cex = 0.7) plot(apsTree, cex = 0.7)
ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866") ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866")
# This procedure does however not assign an actual length to a root edge, and # This procedure does however not assign an actual length to a root edge, and
# therefore no root edge is visible on the plot. Why? , you might ask. I ask # therefore no root edge is visible on the plot. Why? , you might ask. I ask
# myself that too. We'll just add a length by hand. # myself that too. We'll just add a length by hand.
apsTree$root.edge <- mean(apsTree$edge.length) * 1.5 apsTree$root.edge <- mean(apsTree$edge.length) * 1.5
plot(apsTree, cex = 0.7, root.edge = TRUE) plot(apsTree, cex = 0.7, root.edge = TRUE)
ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866") ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866")
# == 3.2 Rotating Clades =================================================== # == 3.2 Rotating Clades ===================================================
# To interpret the tree, it is useful to rotate the clades so that they appear # To interpret the tree, it is useful to rotate the clades so that they appear
# in the order expected from the cladogram of species. # in the order expected from the cladogram of species.
# We can either rotate around individual internal nodes ... # We can either rotate around individual internal nodes ...
layout(matrix(1:2, 1, 2)) layout(matrix(1:2, 1, 2))
plot(apsTree, no.margin = TRUE, root.edge = TRUE) plot(apsTree, no.margin = TRUE, root.edge = TRUE)
ape::nodelabels(node = 13, cex = 0.7, bg = "#ff8866") ape::nodelabels(node = 13, cex = 0.7, bg = "#ff8866")
plot(ape::rotate(apsTree, node = 13), no.margin = TRUE, root.edge = TRUE) plot(ape::rotate(apsTree, node = 13), no.margin = TRUE, root.edge = TRUE)
ape::nodelabels(node = 13, cex = 0.7, bg = "#88ff66") ape::nodelabels(node = 13, cex = 0.7, bg = "#88ff66")
# Note that the species at the bottom of the clade descending from node # Note that the species at the bottom of the clade descending from node
# 13 is now plotted at the top. # 13 is now plotted at the top.
par(PAR) # reset graphics state par(PAR) # reset graphics state
# ... or we can rearrange the tree so it corresponds as well as possible to a # ... or we can rearrange the tree so it corresponds as well as possible to a
# predefined tip ordering. Here we use the ordering that taxize:: has inferred # predefined tip ordering. Here we use the ordering that taxize:: has inferred
# from the NCBI taxonomic classification. # from the NCBI taxonomic classification.
nOrg <- length(apsTree$tip.label) nOrg <- length(apsTree$tip.label)
plot(fungiTree, plot(fungiTree,
no.margin = FALSE, root.edge = TRUE) no.margin = FALSE, root.edge = TRUE)
ape::nodelabels(text = fungiTree$node.label, ape::nodelabels(text = fungiTree$node.label,
cex = 0.5, cex = 0.5,
adj = 0.2, adj = 0.2,
bg = "#D4F2DA") bg = "#D4F2DA")
# These are the fungi tree tips ... # These are the fungi tree tips ...
fungiTree$tip.label fungiTree$tip.label
# ... and their order is determined by the edge-list that is stored in # ... and their order is determined by the edge-list that is stored in
fungiTree$edge fungiTree$edge
# which edges join the tips? # which edges join the tips?
ape::tiplabels(cex = 0.5, frame = "rect") ape::tiplabels(cex = 0.5, frame = "rect")
# as you can see, the tips (range [1:nOrg] ) are in column 2 and they are # as you can see, the tips (range [1:nOrg] ) are in column 2 and they are
# ordered from bottom to top. # ordered from bottom to top.
# And each tip number is the index of the species in the tip.label vector. So
# we can take column 2, subset it, and use it to get a list of species in the
# order of the tree ...
# (Tips are the nodes numbered 1 to the number of tips of fungiTree itself, so
# we count the tips of that tree rather than those of apsTree.)
sel <- fungiTree$edge[ , 2] <= ape::Ntip(fungiTree)
( oSp <- fungiTree$tip.label[fungiTree$edge[sel, 2]] )

# Now, here are the genes of the apsTree tips ...
apsTree$tip.label

# ... and the "constraint" we need for reordering, according to the help page
# of ape::rotateConstr(), is "a vector specifying the order of the tips as they
# should appear (from bottom to top)". Thus we need to add the "MBP1_" prefix
# to our vector ...
oSp <- paste0("MBP1_", oSp)

# ... and rename the outgroup, which is labelled KILA_ESCCO in apsTree. The
# pattern has to be spelled "ESCCO" - the same code that appears in
# "KILA_ESCCO" - a misspelled pattern would silently replace nothing.
# NOTE(review): presumably biCode() yields "ESCCO" for the outgroup species;
# confirm against the oSp values printed above.
( oSp <- sub("MBP1_ESCCO", "KILA_ESCCO", oSp, fixed = TRUE) )
# Then we can plot the two trees to compare: the fungi- tree # Then we can plot the two trees to compare: the fungi- tree
par(PAR) # reset graphics state par(PAR) # reset graphics state
layout(matrix(1:2, 1, 2)) layout(matrix(1:2, 1, 2))
plot(fungiTree, plot(fungiTree,
no.margin = TRUE, no.margin = TRUE,
root.edge = TRUE) root.edge = TRUE)
ape::nodelabels(text = fungiTree$node.label, ape::nodelabels(text = fungiTree$node.label,
cex = 0.5, cex = 0.5,
adj = 0.2, adj = 0.2,
bg = "#D4F2DA") bg = "#D4F2DA")
# and the re-organized apsesTree ... # and the re-organized apsesTree ...
plot(ape::rotateConstr(apsTree, constraint = oSp[]), plot(ape::rotateConstr(apsTree, constraint = oSp[]),
no.margin = TRUE, no.margin = TRUE,
root.edge = TRUE) root.edge = TRUE)
par(PAR) # reset graphics state par(PAR) # reset graphics state
# As you can see, the reordering is not perfect, since the topologies are # As you can see, the reordering is not perfect, since the topologies are
# different, mostly due to the unresolved nodes in the reference tree. One # different, mostly due to the unresolved nodes in the reference tree. One
# could play with that ... # could play with that ...
# Task: Study the two trees and consider their similarities and differences. # Task: Study the two trees and consider their similarities and differences.
# What do you expect? What do you find? Note that this is not a "mixed" # What do you expect? What do you find? Note that this is not a "mixed"
# gene tree yet, since it contains only a single gene for the species # gene tree yet, since it contains only a single gene for the species
# we considered. All of the branch points in this tree are speciation # we considered. All of the branch points in this tree are speciation
# events. Thus the gene tree should have the same topology as the # events. Thus the gene tree should have the same topology as the
# species tree. Does it? Are the differences important? How many # species tree. Does it? Are the differences important? How many
# branches would you need to remove and reinsert elsewhere to get the # branches would you need to remove and reinsert elsewhere to get the
# same topology as the species tree? # same topology as the species tree?
# In order to quantify how different these two trees are, we need to compute # In order to quantify how different these two trees are, we need to compute
# tree distances. # tree distances.
# == 3.3 Computing tree distances ========================================== # == 3.3 Computing tree distances ==========================================
# Many superb phylogeny tools are contributed by the phangorn package. # Many superb phylogeny tools are contributed by the phangorn package.
if (! requireNamespace("phangorn", quietly = TRUE)) { if (! requireNamespace("phangorn", quietly = TRUE)) {
install.packages("phangorn") install.packages("phangorn")
} }
# Package information: # Package information:
# library(help = phangorn) # basic information # library(help = phangorn) # basic information
# browseVignettes("phangorn") # available vignettes # browseVignettes("phangorn") # available vignettes
# data(package = "phangorn") # available datasets # data(package = "phangorn") # available datasets
# To compare two trees, they must have the same tip labels. We delete "MBP1_" or # To compare two trees, they must have the same tip labels. We delete "MBP1_" or
# "KILA_" from the existing tip labels in a copy of our APSES domain tree. # "KILA_" from the existing tip labels in a copy of our APSES domain tree.
apsTree2 <- apsTree apsTree2 <- apsTree
apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label) apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label)
# phangorn provides several functions to compute tree-differences (and there # phangorn provides several functions to compute tree-differences (and there
# is a _whole_ lot of theory on how to compare trees). treedist() returns the # is a _whole_ lot of theory on how to compare trees). treedist() returns the
# "symmetric difference" # "symmetric difference"
phangorn::treedist(fungiTree, apsTree2, check.labels = TRUE) phangorn::treedist(fungiTree, apsTree2, check.labels = TRUE)
# Numbers. What do they mean? How much more similar is our apsTree to the # Numbers. What do they mean? How much more similar is our apsTree to the
# (presumably) ground truth of fungiTree than a random tree would be? # (presumably) ground truth of fungiTree than a random tree would be?
# The ape package provides the function rtree() # The ape package provides the function rtree()
# to compute random trees. # to compute random trees.
ape::rtree(n = length(apsTree2$tip.label), # number of tips ape::rtree(n = length(apsTree2$tip.label), # number of tips
rooted = TRUE, # we rooted the tree above, rooted = TRUE, # we rooted the tree above,
# and fungiTree is rooted anyway # and fungiTree is rooted anyway
tip.label = apsTree2$tip.label, # use the apsTree2 labels tip.label = apsTree2$tip.label, # use the apsTree2 labels
br = NULL) # don't generate branch lengths since br = NULL) # don't generate branch lengths since
# fungiTree has none, so we can't # fungiTree has none, so we can't
# compare them anyway. # compare them anyway.
# (Note the warning message about non-binary trees; we'll suppress that later # (Note the warning message about non-binary trees; we'll suppress that later
# by wrapping the function call in suppressMessages(); we don't want to # by wrapping the function call in suppressMessages(); we don't want to
# print it 10,000 times :-) # print it 10,000 times :-)
# Let's compute some random trees this way, calculate the distances to # Let's compute some random trees this way, calculate the distances to
# fungiTree, and then compare the values we get for apsTree2. The random # fungiTree, and then compare the values we get for apsTree2. The random
# trees are provided by ape::rtree(). # trees are provided by ape::rtree().
N <- 10000 # takes about 15 seconds, and we'll use the pBar function, N <- 10000 # takes about 15 seconds, and we'll use the pBar function,
# defined in .utilities.R to keep track of where we are at: # defined in .utilities.R to keep track of where we are at:
myTreeDistances <- matrix(numeric(N * 2), ncol = 2) myTreeDistances <- matrix(numeric(N * 2), ncol = 2)
colnames(myTreeDistances) <- c("symm", "path") colnames(myTreeDistances) <- c("symm", "path")
set.seed(112358) set.seed(112358)
for (i in 1:N) { for (i in 1:N) {
pBar(i, N) pBar(i, N)
xTree <- ape::rtree(n = length(apsTree2$tip.label), xTree <- ape::rtree(n = length(apsTree2$tip.label),
rooted = TRUE, rooted = TRUE,
tip.label = apsTree2$tip.label, tip.label = apsTree2$tip.label,
br = NULL) br = NULL)
myTreeDistances[i, ] <- suppressMessages(phangorn::treedist(fungiTree, xTree)) myTreeDistances[i, ] <- suppressMessages(phangorn::treedist(fungiTree, xTree))
} }
set.seed(NULL) # reset the random number generator set.seed(NULL) # reset the random number generator
table(myTreeDistances[, "symm"]) table(myTreeDistances[, "symm"])
( symmObs <- phangorn::treedist(fungiTree, apsTree2)[1] ) ( symmObs <- phangorn::treedist(fungiTree, apsTree2)[1] )
# Random events less-or-equal to observation, divided by total number of # Random events less-or-equal to observation, divided by total number of
# events gives us the empirical p-value. # events gives us the empirical p-value.
cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n", cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n",
(sum(myTreeDistances[ , "symm"] <= symmObs) + 1) / (N + 1))) (sum(myTreeDistances[ , "symm"] <= symmObs) + 1) / (N + 1)))
par(PAR) # reset graphics state

# Distribution of the path distances between the random trees and fungiTree
hist(myTreeDistances[, "path"],
     col = "aliceblue",
     main = "Distances of random Trees to fungiTree")

# The second element of the treedist() result is stored in the "path" column
# of myTreeDistances above; mark the observed value on the histogram.
( pathObs <- phangorn::treedist(fungiTree, apsTree2)[2] )
abline(v = pathObs, col = "chartreuse")

# Random events less-or-equal to observation, divided by total number of
# events gives us the empirical p-value. The random path distances must be
# compared to the observed path distance (pathObs), not to the observed
# symmetric difference (symmObs).
cat(sprintf("\nEmpirical p-value for path diff. of observed tree is %1.4f\n",
            (sum(myTreeDistances[ , "path"] <= pathObs) + 1) / (N + 1)))
# Indeed, our apsTree is _very_ much more similar to the species tree than # Indeed, our apsTree is _very_ much more similar to the species tree than
# we would expect by random chance. # we would expect by random chance.
# What do we gain from that analysis? Analyzing the tree we get from a single # What do we gain from that analysis? Analyzing the tree we get from a single
# gene of orthologous sequences is a positive control in our computational # gene of orthologous sequences is a positive control in our computational
# experiment. If these genes are indeed orthologues, a correct tree-building # experiment. If these genes are indeed orthologues, a correct tree-building
# program ought to give us a tree that exactly matches the species tree. # program ought to give us a tree that exactly matches the species tree.
# Evaluating how far off we are from the known correct result gives us a way to # Evaluating how far off we are from the known correct result gives us a way to
# validate our workflow and our algorithm. If we can't get that right, we can't # validate our workflow and our algorithm. If we can't get that right, we can't
# expect to get "real" data right either. Employing such positive controls in # expect to get "real" data right either. Employing such positive controls in
# every computational experiment is essential for research. Not doing so is # every computational experiment is essential for research. Not doing so is
# Cargo Cult Bioinformatics. # Cargo Cult Bioinformatics.
# [END] # [END]

View File

@ -1,168 +1,168 @@
# tocID <- "BIN-PHYLO-Tree_building.R" # tocID <- "BIN-PHYLO-Tree_building.R"
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the BIN-PHYLO-Tree_building unit. # R code accompanying the BIN-PHYLO-Tree_building unit.
# #
# Version: 1.2 # Version: 1.2
# #
# Date: 2017-10 - 2020-09 # Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.2 deprecate save()/load() for saveRDS()/readRDS(); Mac: # 1.2 deprecate save()/load() for saveRDS()/readRDS(); Mac:
# instructions to authorize proml.app # instructions to authorize proml.app
# 1.1 Change from require() to requireNamespace(), # 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout, # use <package>::<function>() idiom throughout,
# 1.0 First 2017 version # 1.0 First 2017 version
# 0.1 First code copied from 2016 material. # 0.1 First code copied from 2016 material.
# #
# #
# TODO: # TODO:
# Add MrBayes # Add MrBayes
# https://cran.r-project.org/web/packages/phangorn/vignettes/IntertwiningTreesAndNetworks.html # https://cran.r-project.org/web/packages/phangorn/vignettes/IntertwiningTreesAndNetworks.html
# #
# == DO NOT SIMPLY source() THIS FILE! ======================================= # == DO NOT SIMPLY source() THIS FILE! =======================================
# #
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> ----------------------------------------------------------- #TOC> -----------------------------------------------------------
#TOC> 1 Calculating Trees 48 #TOC> 1 Calculating Trees 48
#TOC> 1.1 PROMLPATH ... 68 #TOC> 1.1 PROMLPATH ... 68
#TOC> 1.1.1 ... on the Mac 73 #TOC> 1.1.1 ... on the Mac 73
#TOC> 1.1.2 ... on Windows 101 #TOC> 1.1.2 ... on Windows 101
#TOC> 1.1.3 ... on Linux 115 #TOC> 1.1.3 ... on Linux 115
#TOC> 1.1.4 Confirming PROMLPATH 120 #TOC> 1.1.4 Confirming PROMLPATH 120
#TOC> 1.2 Building a maximum likelihood tree 134 #TOC> 1.2 Building a maximum likelihood tree 134
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 Calculating Trees =================================================== # = 1 Calculating Trees ===================================================
# Follow the instructions found at phylip's home on the Web to install. If you # Follow the instructions found at phylip's home on the Web to install. If you
# are on a Windows computer, take note of the installation directory. # are on a Windows computer, take note of the installation directory.
# After you have installed Phylip on your computer, install the R package that # After you have installed Phylip on your computer, install the R package that
# provides an interface to the Phylip functions. # provides an interface to the Phylip functions.
if (! requireNamespace("Rphylip", quietly = TRUE)) { if (! requireNamespace("Rphylip", quietly = TRUE)) {
install.packages("Rphylip") install.packages("Rphylip")
} }
# Package information: # Package information:
# library(help = Rphylip) # basic information # library(help = Rphylip) # basic information
# browseVignettes("Rphylip") # available vignettes # browseVignettes("Rphylip") # available vignettes
# data(package = "Rphylip") # available datasets # data(package = "Rphylip") # available datasets
# This will install Rphylip, as well as its dependency, the package "ape". # This will install Rphylip, as well as its dependency, the package "ape".
# == 1.1 PROMLPATH ... ===================================================== # == 1.1 PROMLPATH ... =====================================================
# The next part may be tricky. You will need to figure out where # The next part may be tricky. You will need to figure out where
# on your computer Phylip has been installed and define the path # on your computer Phylip has been installed and define the path
# to the proml program that calculates a maximum-likelihood tree. # to the proml program that calculates a maximum-likelihood tree.
# === 1.1.1 ... on the Mac # === 1.1.1 ... on the Mac
# On the Mac, the standard installation places a phylip folder # On the Mac, the standard installation places a phylip folder
# in the /Applications directory. That folder contains all the # in the /Applications directory. That folder contains all the
# individual phylip programs as <name>.app files. These are not # individual phylip programs as <name>.app files. These are not
# the actual executables, but "app" files are actually directories # the actual executables, but "app" files are actually directories
# that contain the required resources for a program to run. # that contain the required resources for a program to run.
# The executable is in a subdirectory and you can point Rphylip # The executable is in a subdirectory and you can point Rphylip
# directly to that subdirectory to find the program it needs: # directly to that subdirectory to find the program it needs:
# PROMLPATH <- "/Applications/phylip-3.695/exe/proml.app/Contents/MacOS" # PROMLPATH <- "/Applications/phylip-3.695/exe/proml.app/Contents/MacOS"
# However, RPHYLIP will not be able to run PHYLIP applications immediately, # However, RPHYLIP will not be able to run PHYLIP applications immediately,
# because they have not been "signed" by the PHYLIP developers. The process # because they have not been "signed" by the PHYLIP developers. The process
# will be terminated by your system, with a warning. # will be terminated by your system, with a warning.
# - Navigate to the phylip folder in your /Applications directory # - Navigate to the phylip folder in your /Applications directory
# - Descend into the "exe" folder and find proml.app # - Descend into the "exe" folder and find proml.app
# - Ctrl-click proml.app and choose "Open". A dialogue will show that # - Ctrl-click proml.app and choose "Open". A dialogue will show that
# says: "macOS cannot verify the developer of “proml.app”. # says: "macOS cannot verify the developer of “proml.app”.
# Are you sure you want to open it?" # Are you sure you want to open it?"
# - Click open to continue. You may need to allow access to the terminal # - Click open to continue. You may need to allow access to the terminal
# as well. When the proml terminal session opens, you can type # as well. When the proml terminal session opens, you can type
# Ctrl-c to abort the program and close the window. # Ctrl-c to abort the program and close the window.
# #
# This adds proml.app to the list of known-good programs and you will not # This adds proml.app to the list of known-good programs and you will not
# need to repeat this process. # need to repeat this process.
# #
# === 1.1.2 ... on Windows # === 1.1.2 ... on Windows
# On Windows you need to know where the programs have been installed, and you # On Windows you need to know where the programs have been installed, and you
# need to specify a path that is correct for the Windows OS. Find the folder # need to specify a path that is correct for the Windows OS. Find the folder
# that is named "exe", and right-click to inspect its properties. The path # that is named "exe", and right-click to inspect its properties. The path
# should be listed among them. # should be listed among them.
# If the path looks like "C:\Users\Meng\Programs\phylip-3.695\exe", then your # If the path looks like "C:\Users\Meng\Programs\phylip-3.695\exe", then your
# assignment has to be # assignment has to be
# PROMLPATH <- "C:/Users/Meng/Programs/phylip-3.695/exe" # PROMLPATH <- "C:/Users/Meng/Programs/phylip-3.695/exe"
# (Note: "/", not "\") # (Note: "/", not "\")
# I have heard that your path must not contain spaces, and it is prudent to # I have heard that your path must not contain spaces, and it is prudent to
# avoid other special characters as well. # avoid other special characters as well.
# === 1.1.3 ... on Linux # === 1.1.3 ... on Linux
# If you are running Linux I trust you know what to do. It's probably # If you are running Linux I trust you know what to do. It's probably
# something like # something like
# PROMLPATH <- "/usr/local/phylip-3.695/bin" # PROMLPATH <- "/usr/local/phylip-3.695/bin"
# === 1.1.4 Confirming PROMLPATH # === 1.1.4 Confirming PROMLPATH
# Confirm that the settings are right. # Confirm that the settings are right.
PROMLPATH # returns the path PROMLPATH # returns the path
list.dirs(PROMLPATH) # returns the directories in that path list.dirs(PROMLPATH) # returns the directories in that path
list.files(PROMLPATH) # lists the files [1] "proml" "proml.command" list.files(PROMLPATH) # lists the files [1] "proml" "proml.command"
# If "proml" is NOT among the files that the last command returns, you # If "proml" is NOT among the files that the last command returns, you
# can't continue. Ask on the mailing list for advice. # can't continue. Ask on the mailing list for advice.
# If everything is good, you can add the line that defines PROMLPATH to # If everything is good, you can add the line that defines PROMLPATH to
# myScripts/.myProfile.R - the path will then be automatically set when # myScripts/.myProfile.R - the path will then be automatically set when
# you quit RStudio and return. # you quit RStudio and return.
# == 1.2 Building a maximum likelihood tree ================================ # == 1.2 Building a maximum likelihood tree ================================
# Now read the mfa file you have saved in the BIB-PHYLO-Data_preparation unit, # Now read the mfa file you have saved in the BIB-PHYLO-Data_preparation unit,
# as a "proseq" object with the read.protein() function of the Rphylip package: # as a "proseq" object with the read.protein() function of the Rphylip package:
apsIn <- Rphylip::read.protein("data/APSESphyloSet.mfa") apsIn <- Rphylip::read.protein("data/APSESphyloSet.mfa")
str(apsIn) str(apsIn)
# ... and you are ready to build a tree. # ... and you are ready to build a tree.
# There are many fast options in PHYLIP - we will use the most _accurate_ one # There are many fast options in PHYLIP - we will use the most _accurate_ one
# that it has: proml, a maximum-likelihood tree building program for protein # that it has: proml, a maximum-likelihood tree building program for protein
# data. # data.
# Building maximum-likelihood trees can eat as much computer time # Building maximum-likelihood trees can eat as much computer time
# as you can throw at it. Calculating a tree of 48 APSES domains # as you can throw at it. Calculating a tree of 48 APSES domains
# with default parameters of Rproml() runs for more than half a day # with default parameters of Rproml() runs for more than half a day
# on my computer. But we have only twelve sequences here, so the # on my computer. But we have only twelve sequences here, so the
# process will take us about 5 to 15 minutes. Run this, and enjoy a good cup # process will take us about 5 to 15 minutes. Run this, and enjoy a good cup
# of coffee while you are waiting. # of coffee while you are waiting.
apsTree <- Rphylip::Rproml(apsIn, path=PROMLPATH) apsTree <- Rphylip::Rproml(apsIn, path=PROMLPATH)
# A quick first look: # A quick first look:
plot(apsTree) plot(apsTree)
# save your tree: # save your tree:
saveRDS(apsTree, file = "data/APSEStreeRproml.rds") saveRDS(apsTree, file = "data/APSEStreeRproml.rds")
# If this did not work, ask for advice. # If this did not work, ask for advice.
# [END] # [END]

View File

@ -1,323 +1,323 @@
# tocID <- "BIN-PPI-Analysis.R" # tocID <- "BIN-PPI-Analysis.R"
# #
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the BIN-PPI-Analysis unit. # R code accompanying the BIN-PPI-Analysis unit.
# #
# Version: 1.4 # Version: 1.4
# #
# Date: 2017-08 - 2020-10 # Date: 2017-08 - 2020-10
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.4 Update vector ID's for betweenness centrality. # 1.4 Update vector ID's for betweenness centrality.
# 1.3 Bugfix: called the wrong function on ENSPsel in l. 220 # 1.3 Bugfix: called the wrong function on ENSPsel in l. 220
# 1.2 2020 Updates; Rewrite for new STRING V11; # 1.2 2020 Updates; Rewrite for new STRING V11;
# Deprecate save()/load() for saveRDS()/readRDS() # Deprecate save()/load() for saveRDS()/readRDS()
# 1.1 Change from require() to requireNamespace(), # 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout, # use <package>::<function>() idiom throughout,
# use BiocManager:: not biocLite() # use BiocManager:: not biocLite()
# 1.0 First live version # 1.0 First live version
# 0.1 First code copied from 2016 material. # 0.1 First code copied from 2016 material.
# #
# TODO: # TODO:
# #
# #
# == DO NOT SIMPLY source() THIS FILE! ======================================= # == DO NOT SIMPLY source() THIS FILE! =======================================
# #
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> --------------------------------------------------------------- #TOC> ---------------------------------------------------------------
#TOC> 1 Setup and data 50 #TOC> 1 Setup and data 50
#TOC> 2 Functional Edges in the Human Proteome 86 #TOC> 2 Functional Edges in the Human Proteome 86
#TOC> 2.1 Cliques 129 #TOC> 2.1 Cliques 129
#TOC> 2.2 Communities 170 #TOC> 2.2 Communities 170
#TOC> 2.3 Betweenness Centrality 184 #TOC> 2.3 Betweenness Centrality 184
#TOC> 3 biomaRt 231 #TOC> 3 biomaRt 231
#TOC> 4 Task for submission 302 #TOC> 4 Task for submission 302
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 Setup and data ====================================================== # = 1 Setup and data ======================================================
# Not surprisingly, the analysis of PPI networks needs iGraph: # Not surprisingly, the analysis of PPI networks needs iGraph:
if (! requireNamespace("igraph", quietly = TRUE)) { if (! requireNamespace("igraph", quietly = TRUE)) {
install.packages("igraph") install.packages("igraph")
} }
# Package information: # Package information:
# library(help = igraph) # basic information # library(help = igraph) # basic information
# browseVignettes("igraph") # available vignettes # browseVignettes("igraph") # available vignettes
# data(package = "igraph") # available datasets # data(package = "igraph") # available datasets
# In order for you to explore some real, biological networks, I give you a # In order for you to explore some real, biological networks, I give you a
# dataframe of functional relationships of human proteins that I have downloaded # dataframe of functional relationships of human proteins that I have downloaded
# from the STRING database. The full table has 8.5 million records, here is a # from the STRING database. The full table has 8.5 million records, here is a
# subset of records with combined confidence scores > 980 # subset of records with combined confidence scores > 980
# The selected set of edges with a confidence of > 964 is a dataframe with about # The selected set of edges with a confidence of > 964 is a dataframe with about
# 50,000 edges and 8,400 unique proteins. Incidentally, that's about the size of # 50,000 edges and 8,400 unique proteins. Incidentally, that's about the size of
# a fungal proteome. You can load the saved dataframe here (To read more about # a fungal proteome. You can load the saved dataframe here (To read more about
# what the scores mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ). # what the scores mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).
STRINGedges <- readRDS("./data/STRINGedges.rds") STRINGedges <- readRDS("./data/STRINGedges.rds")
head(STRINGedges) head(STRINGedges)
# Note that STRING has appended the tax-ID for Homo sapiens - 9606 - to the # Note that STRING has appended the tax-ID for Homo sapiens - 9606 - to the
# Ensembl protein identifiers that start with ENSP. We'll remove them: # Ensembl protein identifiers that start with ENSP. We'll remove them:
STRINGedges$a <- gsub("^9606\\.", "", STRINGedges$a) STRINGedges$a <- gsub("^9606\\.", "", STRINGedges$a)
STRINGedges$b <- gsub("^9606\\.", "", STRINGedges$b) STRINGedges$b <- gsub("^9606\\.", "", STRINGedges$b)
head(STRINGedges) head(STRINGedges)
# = 2 Functional Edges in the Human Proteome ============================== # = 2 Functional Edges in the Human Proteome ==============================
# There are many possibilities to explore interesting aspects of biological # There are many possibilities to explore interesting aspects of biological
# networks, we will keep with some very simple procedures here but you have # networks, we will keep with some very simple procedures here but you have
# to be aware that this is barely scratching the surface of possibilities. # to be aware that this is barely scratching the surface of possibilities.
# However, once the network exists in your computer, it is comparatively # However, once the network exists in your computer, it is comparatively
# easy to find information online about the many, many options to analyze. # easy to find information online about the many, many options to analyze.
# Make a graph from this dataframe # Make a graph from this dataframe
?igraph::graph_from_data_frame ?igraph::graph_from_data_frame
gSTR <- igraph::graph_from_data_frame(STRINGedges, directed = FALSE) gSTR <- igraph::graph_from_data_frame(STRINGedges, directed = FALSE)
# CAUTION you DON'T want to plot a graph with 8,000 nodes and 50,000 edges - # CAUTION you DON'T want to plot a graph with 8,000 nodes and 50,000 edges -
# layout of such large graphs is possible, but requires specialized code. Google # layout of such large graphs is possible, but requires specialized code. Google
# for <layout large graphs> if you are curious. Also, consider what one can # for <layout large graphs> if you are curious. Also, consider what one can
# really learn from plotting such a graph ... # really learn from plotting such a graph ...
# Of course simple computations on this graph are reasonably fast: # Of course simple computations on this graph are reasonably fast:
compSTR <- igraph::components(gSTR) compSTR <- igraph::components(gSTR)
summary(compSTR) # our graph is fully connected! summary(compSTR) # our graph is fully connected!
hist(log(igraph::degree(gSTR)), col="#FEE0AF") hist(log(igraph::degree(gSTR)), col="#FEE0AF")
# this actually does look rather scale-free # this actually does look rather scale-free
(freqRank <- table(igraph::degree(gSTR))) (freqRank <- table(igraph::degree(gSTR)))
plot(log10(as.numeric(names(freqRank)) + 1), plot(log10(as.numeric(names(freqRank)) + 1),
log10(as.numeric(freqRank)), type = "b", log10(as.numeric(freqRank)), type = "b",
pch = 21, bg = "#FEE0AF", pch = 21, bg = "#FEE0AF",
xlab = "log(Rank)", ylab = "log(frequency)", xlab = "log(Rank)", ylab = "log(frequency)",
main = "8,400 nodes from the human functional interaction network") main = "8,400 nodes from the human functional interaction network")
# This looks very scale-free indeed. # This looks very scale-free indeed.
(regressionLine <- lm(log10(as.numeric(freqRank)) ~ (regressionLine <- lm(log10(as.numeric(freqRank)) ~
log10(as.numeric(names(freqRank)) + 1))) log10(as.numeric(names(freqRank)) + 1)))
abline(regressionLine, col = "firebrick") abline(regressionLine, col = "firebrick")
# Now explore some more: # Now explore some more:
# == 2.1 Cliques =========================================================== # == 2.1 Cliques ===========================================================
# Let's find the largest cliques. Remember: a clique is a fully connected # Let's find the largest cliques. Remember: a clique is a fully connected
# subgraph, i.e. a subgraph in which every node is connected to every other. # subgraph, i.e. a subgraph in which every node is connected to every other.
# Biological complexes often appear as cliques in interaction graphs. # Biological complexes often appear as cliques in interaction graphs.
igraph::clique_num(gSTR) igraph::clique_num(gSTR)
# The largest clique has 81 members. # The largest clique has 81 members.
(C <- igraph::largest_cliques(gSTR)[[1]]) (C <- igraph::largest_cliques(gSTR)[[1]])
# Pick one of the proteins and find out what this fully connected cluster of 81 # Pick one of the proteins and find out what this fully connected cluster of 81
# proteins is (you can simply Google for any of the IDs). Is this expected? # proteins is (you can simply Google for any of the IDs). Is this expected?
# Plot this ... # Plot this ...
R <- igraph::induced_subgraph(gSTR, C) # a graph from a selected set of vertices R <- igraph::induced_subgraph(gSTR, C) # a graph from a selected set of vertices
# color the vertices along a color spectrum # color the vertices along a color spectrum
vCol <- rainbow(igraph::gorder(R)) # "order" of a graph == number of nodes vCol <- rainbow(igraph::gorder(R)) # "order" of a graph == number of nodes
# color the edges to have the same color as the originating node # color the edges to have the same color as the originating node
eCol <- character() eCol <- character()
for (i in seq_along(vCol)) { for (i in seq_along(vCol)) {
eCol <- c(eCol, rep(vCol[i], igraph::gorder(R))) eCol <- c(eCol, rep(vCol[i], igraph::gorder(R)))
} }
oPar <- par(mar= rep(0,4)) # Turn margins off oPar <- par(mar= rep(0,4)) # Turn margins off
plot(R, plot(R,
layout = igraph::layout_in_circle(R), layout = igraph::layout_in_circle(R),
vertex.size = 3, vertex.size = 3,
vertex.color = vCol, vertex.color = vCol,
edge.color = eCol, edge.color = eCol,
edge.width = 0.1, edge.width = 0.1,
vertex.label = NA) vertex.label = NA)
par(oPar) par(oPar)
# ... well: remember: a clique means every node is connected to every other # ... well: remember: a clique means every node is connected to every other
# node. For 81 nodes that is 81 * 80 / 2 = 3,240 connected pairs. This is what # node. For 81 nodes that is 81 * 80 / 2 = 3,240 connected pairs. This is what
# a matrix model of PPI networks looks like for large complexes. # a matrix model of PPI networks looks like for large complexes.
# == 2.2 Communities ======================================================= # == 2.2 Communities =======================================================
set.seed(112358) # set RNG seed for repeatable randomness set.seed(112358) # set RNG seed for repeatable randomness
gSTRclusters <- igraph::cluster_infomap(gSTR) gSTRclusters <- igraph::cluster_infomap(gSTR)
set.seed(NULL) # reset the RNG set.seed(NULL) # reset the RNG
igraph::modularity(gSTRclusters) # ... measures how separated the different igraph::modularity(gSTRclusters) # ... measures how separated the different
# membership types are from each other # membership types are from each other
tMem <- table(igraph::membership(gSTRclusters)) tMem <- table(igraph::membership(gSTRclusters))
length(tMem) # About 700 communities identified length(tMem) # About 700 communities identified
hist(tMem, breaks = 50, col = "skyblue") # most clusters are small ... hist(tMem, breaks = 50, col = "skyblue") # most clusters are small ...
range(tMem) # ... but one has > 200 members range(tMem) # ... but one has > 200 members
# == 2.3 Betweenness Centrality ============================================ # == 2.3 Betweenness Centrality ============================================
# Let's find the nodes with the ten highest betweenness centralities. # Let's find the nodes with the ten highest betweenness centralities.
# #
BC <- igraph::centr_betw(gSTR) BC <- igraph::centr_betw(gSTR)
# remember: BC$res contains the results # remember: BC$res contains the results
head(BC$res) head(BC$res)
BC$res[1] # betweenness centrality of node 1 in the graph ... BC$res[1] # betweenness centrality of node 1 in the graph ...
# ... which one is node 1? # ... which one is node 1?
igraph::V(gSTR)[1] igraph::V(gSTR)[1]
# to get the ten-highest nodes, we simply label the elements of BC with their # to get the ten-highest nodes, we simply label the elements of BC with their
# index ... # index ...
names(BC$res) <- as.character(1:length(BC$res)) names(BC$res) <- as.character(1:length(BC$res))
# ... and then we sort: # ... and then we sort:
sBC <- sort(BC$res, decreasing = TRUE) sBC <- sort(BC$res, decreasing = TRUE)
head(sBC) head(sBC)
# This ordered vector means: node 3 has the highest betweenness centrality, # This ordered vector means: node 3 has the highest betweenness centrality,
# node 721 has the second highest, etc. # node 721 has the second highest, etc.
(BCsel <- as.numeric(names(sBC)[1:10])) (BCsel <- as.numeric(names(sBC)[1:10]))
# We can use the first ten labels to subset the nodes in gSTR and fetch the # We can use the first ten labels to subset the nodes in gSTR and fetch the
# IDs... # IDs...
(ENSPsel <- names(igraph::V(gSTR)[BCsel])) (ENSPsel <- names(igraph::V(gSTR)[BCsel]))
# Task: # Task:
# ===== # =====
# IMPORTANT, IF YOU INTEND TO SUBMIT YOUR ANALYSIS FOR CREDIT # IMPORTANT, IF YOU INTEND TO SUBMIT YOUR ANALYSIS FOR CREDIT
# We are going to use these IDs to produce some output for a submitted task: # We are going to use these IDs to produce some output for a submitted task:
# therefore I need you to execute the following line, note the "seal" that this # therefore I need you to execute the following line, note the "seal" that this
# returns, and not change myENSPsel later: # returns, and not change myENSPsel later:
myENSPsel <- selectENSP(ENSPsel) myENSPsel <- selectENSP(ENSPsel)
# Next, to find what these proteins are... # Next, to find what these proteins are...
# We could now Google for all of these IDs to learn more about them. But really, # We could now Google for all of these IDs to learn more about them. But really,
# googling for IDs one after the other, that would be lame. Let's instead use # googling for IDs one after the other, that would be lame. Let's instead use
# the very, very useful biomaRt package to translate these Ensembl IDs into # the very, very useful biomaRt package to translate these Ensembl IDs into
# gene symbols. # gene symbols.
# = 3 biomaRt ============================================================= # = 3 biomaRt =============================================================
# IDs are just labels, but for _bio_informatics we need to learn more about the # IDs are just labels, but for _bio_informatics we need to learn more about the
# biological function of the genes or proteins that we retrieve via graph data # biological function of the genes or proteins that we retrieve via graph data
# mining. biomaRt is the tool of choice. It's a package distributed by the # mining. biomaRt is the tool of choice. It's a package distributed by the
# bioconductor project. This here is not a biomaRt tutorial (that's for another # bioconductor project. This here is not a biomaRt tutorial (that's for another
# day), simply a few lines of sample code to get you started on the specific use # day), simply a few lines of sample code to get you started on the specific use
# case of retrieving descriptions for ensembl protein IDs. # case of retrieving descriptions for ensembl protein IDs.
if (! requireNamespace("BiocManager", quietly = TRUE)) { if (! requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager") install.packages("BiocManager")
} }
if (! requireNamespace("biomaRt", quietly = TRUE)) { if (! requireNamespace("biomaRt", quietly = TRUE)) {
BiocManager::install("biomaRt") BiocManager::install("biomaRt")
} }
# Package information: # Package information:
# library(help = biomaRt) # basic information # library(help = biomaRt) # basic information
# browseVignettes("biomaRt") # available vignettes # browseVignettes("biomaRt") # available vignettes
# data(package = "biomaRt") # available datasets # data(package = "biomaRt") # available datasets
# define which dataset to use ... this takes a while for download # define which dataset to use ... this takes a while for download
myMart <- biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl") myMart <- biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl")
# what filters are defined? # what filters are defined?
( filters <- biomaRt::listFilters(myMart) ) ( filters <- biomaRt::listFilters(myMart) )
# and what attributes can we retrieve? # and what attributes can we retrieve?
( attributes <- biomaRt::listAttributes(myMart) ) ( attributes <- biomaRt::listAttributes(myMart) )
# Soooo many options - let's look for the correct name of filters that are # Soooo many options - let's look for the correct name of filters that are
# useful for ENSP IDs ... # useful for ENSP IDs ...
filters[grep("ENSP", filters$description), ] filters[grep("ENSP", filters$description), ]
# ... and the correct attribute names for gene symbols and descriptions ... # ... and the correct attribute names for gene symbols and descriptions ...
attributes[grep("symbol", attributes$description, ignore.case = TRUE), ] attributes[grep("symbol", attributes$description, ignore.case = TRUE), ]
attributes[grep("description", attributes$description, ignore.case = TRUE), ] attributes[grep("description", attributes$description, ignore.case = TRUE), ]
# ... so we can put this together: here is a syntax example: # ... so we can put this together: here is a syntax example:
biomaRt::getBM(filters = "ensembl_peptide_id", biomaRt::getBM(filters = "ensembl_peptide_id",
attributes = c("hgnc_symbol", attributes = c("hgnc_symbol",
"wikigene_description", "wikigene_description",
"interpro_description", "interpro_description",
"phenotype_description"), "phenotype_description"),
values = "ENSP00000000442", values = "ENSP00000000442",
mart = myMart) mart = myMart)
# A simple loop will now get us the information for our 10 most central genes # A simple loop will now get us the information for our 10 most central genes
# from the human subset of STRING. # from the human subset of STRING.
CPdefs <- list() # Since we don't know how many matches one of our queries CPdefs <- list() # Since we don't know how many matches one of our queries
# will return, we'll put the result dataframes into a list. # will return, we'll put the result dataframes into a list.
for (ID in myENSPsel) { for (ID in myENSPsel) {
CPdefs[[ID]] <- biomaRt::getBM(filters = "ensembl_peptide_id", CPdefs[[ID]] <- biomaRt::getBM(filters = "ensembl_peptide_id",
attributes = c("hgnc_symbol", attributes = c("hgnc_symbol",
"wikigene_description", "wikigene_description",
"interpro_description", "interpro_description",
"phenotype_description"), "phenotype_description"),
values = ID, values = ID,
mart = myMart) mart = myMart)
} }
# So what are the proteins with the ten highest betweenness centralities? # So what are the proteins with the ten highest betweenness centralities?
# ... are you surprised? (I am! Really.) # ... are you surprised? (I am! Really.)
# = 4 Task for submission ================================================= # = 4 Task for submission =================================================
# Write a loop that will go through your personalized list of Ensembl IDs and # Write a loop that will go through your personalized list of Ensembl IDs and
# for each ID: # for each ID:
# -- print the ID, # -- print the ID,
# -- print the first row's HGNC symbol, # -- print the first row's HGNC symbol,
# -- print the first row's wikigene description. # -- print the first row's wikigene description.
# -- print the first row's phenotype. # -- print the first row's phenotype.
# #
# Write your thoughts about this group of genes. # Write your thoughts about this group of genes.
# #
# (Hint, you can structure your loop in the same way as the loop that # (Hint, you can structure your loop in the same way as the loop that
# created CPdefs. ) # created CPdefs. )
# Submit the "seal" for your ENSP vector, the ENSP vector itself, the R code # Submit the "seal" for your ENSP vector, the ENSP vector itself, the R code
# for this loop and its output into your report if you are submitting # for this loop and its output into your report if you are submitting
# anything for credit for this unit. Please read the requirements carefully. # anything for credit for this unit. Please read the requirements carefully.
# [END] # [END]

View File

@ -1,252 +1,252 @@
# tocID <- "BIN-SEQA-Composition.R" # tocID <- "BIN-SEQA-Composition.R"
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the BIN-SEQA-Composition unit # R code accompanying the BIN-SEQA-Composition unit
# #
# Version: 1.2 # Version: 1.2
# #
# Date: 2017-11 - 2020-09 # Date: 2017-11 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# 1.2 2020 Maintenance # 1.2 2020 Maintenance
# 1.1 Change from require() to requireNamespace(), # 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout, # use <package>::<function>() idiom throughout,
# use BiocManager:: not biocLite() # use BiocManager:: not biocLite()
# Versions: # Versions:
# 1.0 First live version 2017 # 1.0 First live version 2017
# 0.1 First code copied from BCH441_A03_makeYFOlist.R # 0.1 First code copied from BCH441_A03_makeYFOlist.R
# #
# TODO: # TODO:
# #
# #
# == HOW TO WORK WITH LEARNING UNIT FILES ====================================== # == HOW TO WORK WITH LEARNING UNIT FILES ======================================
# #
# DO NOT SIMPLY source() THESE FILES! # DO NOT SIMPLY source() THESE FILES!
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> ---------------------------------------------------------- #TOC> ----------------------------------------------------------
#TOC> 1 Preparation 48 #TOC> 1 Preparation 48
#TOC> 2 Aggregate properties 69 #TOC> 2 Aggregate properties 69
#TOC> 3 Sequence Composition Enrichment 113 #TOC> 3 Sequence Composition Enrichment 113
#TOC> 3.1 Barplot, and side-by-side barplot 136 #TOC> 3.1 Barplot, and side-by-side barplot 136
#TOC> 3.2 Plotting ratios 171 #TOC> 3.2 Plotting ratios 171
#TOC> 3.3 Plotting log ratios 188 #TOC> 3.3 Plotting log ratios 188
#TOC> 3.4 Sort by frequency 204 #TOC> 3.4 Sort by frequency 204
#TOC> 3.5 Color by amino acid type 221 #TOC> 3.5 Color by amino acid type 221
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 Preparation ========================================================= # = 1 Preparation =========================================================
if (! requireNamespace("seqinr", quietly = TRUE)) { if (! requireNamespace("seqinr", quietly = TRUE)) {
install.packages("seqinr") install.packages("seqinr")
} }
# Package information: # Package information:
# library(help = seqinr) # basic information # library(help = seqinr) # basic information
# browseVignettes("seqinr") # available vignettes # browseVignettes("seqinr") # available vignettes
# data(package = "seqinr") # available datasets # data(package = "seqinr") # available datasets
# Load a reference sequence to work with: # Load a reference sequence to work with:
# If you have done the BIN-Storing_data unit: # If you have done the BIN-Storing_data unit:
source("makeProteinDB.R") source("makeProteinDB.R")
sel <- which(myDB$protein$name == sprintf("MBP1_%s", biCode(MYSPE))) sel <- which(myDB$protein$name == sprintf("MBP1_%s", biCode(MYSPE)))
mySeq <- myDB$protein$sequence[sel] mySeq <- myDB$protein$sequence[sel]
# If not, use the yeast Mbp1 sequence: # If not, use the yeast Mbp1 sequence:
mySeq <- dbSanitizeSequence(fromJSON("./data/MBP1_SACCE.json")$sequence) mySeq <- dbSanitizeSequence(fromJSON("./data/MBP1_SACCE.json")$sequence)
# = 2 Aggregate properties ================================================ # = 2 Aggregate properties ================================================
# Let's try a simple function from seqinr: computing the pI of the sequence # Let's try a simple function from seqinr: computing the pI of the sequence
?seqinr::computePI ?seqinr::computePI
# This takes as input a vector of upper-case AA codes # This takes as input a vector of upper-case AA codes
# We can use the function strsplit() to split the string # We can use the function strsplit() to split the string
# into single characters # into single characters
(s <- strsplit(mySeq, "")) # splitting on the empty string (s <- strsplit(mySeq, "")) # splitting on the empty string
# splits into single characters # splits into single characters
s <- unlist(s) # strsplit() returns a list! Why? s <- unlist(s) # strsplit() returns a list! Why?
# (But we don't need a list now...) # (But we don't need a list now...)
# Alternatively, seqinr provides # Alternatively, seqinr provides
# the function s2c() to convert strings into # the function s2c() to convert strings into
# character vectors (and c2s to convert them back). # character vectors (and c2s to convert them back).
seqinr::s2c(mySeq) seqinr::s2c(mySeq)
seqinr::computePI(seqinr::s2c(mySeq)) # isoelectric point seqinr::computePI(seqinr::s2c(mySeq)) # isoelectric point
seqinr::pmw(seqinr::s2c(mySeq)) # molecular weight seqinr::pmw(seqinr::s2c(mySeq)) # molecular weight
seqinr::AAstat(seqinr::s2c(mySeq)) # This also plots the distribution of seqinr::AAstat(seqinr::s2c(mySeq)) # This also plots the distribution of
# values along the sequence # values along the sequence
# A true Labor of Love has gone into the # A true Labor of Love has gone into the
# compilation of the "aaindex" data: # compilation of the "aaindex" data:
?seqinr::aaindex ?seqinr::aaindex
data(aaindex, package = "seqinr") # "attach" the dataset - i.e. make it data(aaindex, package = "seqinr") # "attach" the dataset - i.e. make it
# accessible as an R object # accessible as an R object
length(aaindex) # no seqinr:: needed for the dataset since we just length(aaindex) # no seqinr:: needed for the dataset since we just
# "attached" it with data() # "attached" it with data()
# Here are all the index descriptions # Here are all the index descriptions
for (i in 1:length(aaindex)) { for (i in 1:length(aaindex)) {
cat(paste(i, ": ", aaindex[[i]]$D, "\n", sep="")) cat(paste(i, ": ", aaindex[[i]]$D, "\n", sep=""))
} }
# = 3 Sequence Composition Enrichment ===================================== # = 3 Sequence Composition Enrichment =====================================
# Let's use one of the indices to calculate and plot amino-acid # Let's use one of the indices to calculate and plot amino-acid
# composition enrichment: # composition enrichment:
aaindex[[459]]$D aaindex[[459]]$D
# #
# Let's construct an enrichment plot to compare average frequencies # Let's construct an enrichment plot to compare average frequencies
# with the amino acid counts in our sequence. # with the amino acid counts in our sequence.
(refData <- aaindex[[459]]$I) # reference frequencies in % (refData <- aaindex[[459]]$I) # reference frequencies in %
names(refData) <- seqinr::a(names(refData)) # change names to single-letter names(refData) <- seqinr::a(names(refData)) # change names to single-letter
# code using seqinr's "a()" function # code using seqinr's "a()" function
sum(refData) sum(refData)
refData # ... in % refData # ... in %
# tabulate the amino acid counts in mySeq # tabulate the amino acid counts in mySeq
(obsData <- table(seqinr::s2c(mySeq))) # counts (obsData <- table(seqinr::s2c(mySeq))) # counts
(obsData <- 100 * (obsData / sum(obsData))) # frequencies (obsData <- 100 * (obsData / sum(obsData))) # frequencies
# == 3.1 Barplot, and side-by-side barplot ================================= # == 3.1 Barplot, and side-by-side barplot =================================
barplot(obsData, col = "#CCCCCC", cex.names = 0.7) barplot(obsData, col = "#CCCCCC", cex.names = 0.7)
abline(h = 100/20, col="#BB0000") abline(h = 100/20, col="#BB0000")
barplot(refData, col = "#BB0000", cex.names = 0.7) barplot(refData, col = "#BB0000", cex.names = 0.7)
abline(h = 100/20, col="#555555") abline(h = 100/20, col="#555555")
# Ok: first problem - the values in obsData are in alphabetical order. But the # Ok: first problem - the values in obsData are in alphabetical order. But the
# values in refData are in alphabetical order of amino acid name: alanine, # values in refData are in alphabetical order of amino acid name: alanine,
# arginine, asparagine, aspartic acid ... A, R, N, D, C ... you will see this # arginine, asparagine, aspartic acid ... A, R, N, D, C ... you will see this
# order a lot - one of the old biochemistry tropes in the field. So we need to # order a lot - one of the old biochemistry tropes in the field. So we need to
# re-order one of the vectors to match the other. That's easy though: # re-order one of the vectors to match the other. That's easy though:
refData refData
(refData <- refData[names(obsData)]) (refData <- refData[names(obsData)])
barplot(refData, col = "#BB0000", cex.names = 0.7) barplot(refData, col = "#BB0000", cex.names = 0.7)
abline(h = 100/20, col="#555555") abline(h = 100/20, col="#555555")
# To compare the values, we want to see them in a barplot, side-by-side ... # To compare the values, we want to see them in a barplot, side-by-side ...
barplot(rbind(obsData, refData), barplot(rbind(obsData, refData),
ylim = c(0, 12), ylim = c(0, 12),
beside = TRUE, beside = TRUE,
col = c("#CCCCCC", "#BB0000"), col = c("#CCCCCC", "#BB0000"),
cex.names = 0.7) cex.names = 0.7)
abline(h = 100/20, col="#00000044") abline(h = 100/20, col="#00000044")
# ... and add a legend # ... and add a legend
legend (x = 1, y = 12, legend (x = 1, y = 12,
legend = c("mySeq", "Average composition"), legend = c("mySeq", "Average composition"),
fill = c("#CCCCCC", "#BB0000"), fill = c("#CCCCCC", "#BB0000"),
cex = 0.7, cex = 0.7,
bty = "n") bty = "n")
# == 3.2 Plotting ratios =================================================== # == 3.2 Plotting ratios ===================================================
# To better compare the values, we'll calculate ratios between # To better compare the values, we'll calculate ratios between
# obsData and refData # obsData and refData
barplot(obsData / refData, barplot(obsData / refData,
col = "#CCCCCC", col = "#CCCCCC",
ylab = "Sequence / Average", ylab = "Sequence / Average",
ylim = c(0, 2.5), ylim = c(0, 2.5),
cex.names = 0.7) cex.names = 0.7)
abline(h = 1, col="#BB0000") abline(h = 1, col="#BB0000")
abline(h = c(1/2, 2), lty = 2, col="#BB000055") abline(h = c(1/2, 2), lty = 2, col="#BB000055")
# ... but ratios are not very good here, since the difference in height on the # ... but ratios are not very good here, since the difference in height on the
# plot now depends on the order we compare in: ratios of 1/2 and 2 (dotted # plot now depends on the order we compare in: ratios of 1/2 and 2 (dotted
# lines) are exactly the same fold-difference ! # lines) are exactly the same fold-difference !
# == 3.3 Plotting log ratios =============================================== # == 3.3 Plotting log ratios ===============================================
# A better way to display this # A better way to display this
# is to plot log(ratios). # is to plot log(ratios).
barplot(log(obsData / refData), barplot(log(obsData / refData),
col = "#CCCCCC", col = "#CCCCCC",
ylab = "log(Sequence / Average)", ylab = "log(Sequence / Average)",
ylim = log(c(1/3, 3)), ylim = log(c(1/3, 3)),
cex.names = 0.7) cex.names = 0.7)
abline(h = log(1), col="#BB0000") abline(h = log(1), col="#BB0000")
abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055") abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055")
# Note how the two-fold difference lines are now the same distance from the # Note how the two-fold difference lines are now the same distance from the
# line of equal ratio. # line of equal ratio.
# == 3.4 Sort by frequency ================================================= # == 3.4 Sort by frequency =================================================
barplot(sort(log(obsData / refData), decreasing = TRUE), barplot(sort(log(obsData / refData), decreasing = TRUE),
ylim = log(c(1/3, 3)), ylim = log(c(1/3, 3)),
col = "#CCCCCC", col = "#CCCCCC",
ylab = "log(Sequence / Average)", ylab = "log(Sequence / Average)",
cex.names = 0.7) cex.names = 0.7)
abline(h = log(1), col="#BB0000") abline(h = log(1), col="#BB0000")
abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055") abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055")
yTxt <- log(0.9) yTxt <- log(0.9)
arrows(4, yTxt, 0, yTxt, length = 0.07) arrows(4, yTxt, 0, yTxt, length = 0.07)
text(5.5, yTxt, "Enriched", cex = 0.7) text(5.5, yTxt, "Enriched", cex = 0.7)
yTxt <- log(1.1) yTxt <- log(1.1)
arrows(20, yTxt, 24, yTxt, length = 0.07) arrows(20, yTxt, 24, yTxt, length = 0.07)
text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7) text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7)
# == 3.5 Color by amino acid type ========================================== # == 3.5 Color by amino acid type ==========================================
# Color the bars by amino acid type. Use AACOLS , defined in the .utilities.R # Color the bars by amino acid type. Use AACOLS , defined in the .utilities.R
# script, or define your own. # script, or define your own.
barplot(rep(1, 20), names.arg = names(AACOLS), col = AACOLS, cex.names = 0.5) barplot(rep(1, 20), names.arg = names(AACOLS), col = AACOLS, cex.names = 0.5)
lR <- sort(log(obsData / refData), decreasing = TRUE) lR <- sort(log(obsData / refData), decreasing = TRUE)
barplot(lR, barplot(lR,
ylim = log(c(1/3, 3)), ylim = log(c(1/3, 3)),
col = AACOLS[names(lR)], col = AACOLS[names(lR)],
ylab = "log(Sequence / Average)", ylab = "log(Sequence / Average)",
cex.names = 0.7) cex.names = 0.7)
abline(h = log(1), col="#00000055") abline(h = log(1), col="#00000055")
abline(h = log(c(1/2, 2)), lty = 2, col="#00000033") abline(h = log(c(1/2, 2)), lty = 2, col="#00000033")
yTxt <- log(0.9) yTxt <- log(0.9)
arrows(4, yTxt, 0, yTxt, length = 0.07) arrows(4, yTxt, 0, yTxt, length = 0.07)
text(5.5, yTxt, "Enriched", cex = 0.7) text(5.5, yTxt, "Enriched", cex = 0.7)
yTxt <- log(1.1) yTxt <- log(1.1)
arrows(20, yTxt, 24, yTxt, length = 0.07) arrows(20, yTxt, 24, yTxt, length = 0.07)
text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7) text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7)
# Task: # Task:
# Interpret this plot. (Can you?) Which types of amino acids are enriched? # Interpret this plot. (Can you?) Which types of amino acids are enriched?
# Depleted? # Depleted?
# [END] # [END]

View File

@ -1,394 +1,394 @@
# tocID <- "BIN-Sequence.R" # tocID <- "BIN-Sequence.R"
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the BIN-Sequence unit. # R code accompanying the BIN-Sequence unit.
# #
# Version: 1.5 # Version: 1.5
# #
# Date: 2017-09 - 2020-09 # Date: 2017-09 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.5 2020 Updates # 1.5 2020 Updates
# 1.4 Change from require() to requireNamespace(), # 1.4 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout, # use <package>::<function>() idiom throughout,
# use BiocManager:: not biocLite() # use BiocManager:: not biocLite()
# 1.3 Update set.seed() usage # 1.3 Update set.seed() usage
# 1.2 Removed irrelevant task. How did that even get in there? smh # 1.2 Removed irrelevant task. How did that even get in there? smh
# 1.1 Add chartr() # 1.1 Add chartr()
# 1.0 First live version 2017. # 1.0 First live version 2017.
# #
# TODO: # TODO:
# #
# #
# == DO NOT SIMPLY source() THIS FILE! ======================================= # == DO NOT SIMPLY source() THIS FILE! =======================================
# #
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> ---------------------------------------------------- #TOC> ----------------------------------------------------
#TOC> 1 Prepare 63 #TOC> 1 Prepare 63
#TOC> 2 Storing Sequence 80 #TOC> 2 Storing Sequence 80
#TOC> 3 String properties 109 #TOC> 3 String properties 109
#TOC> 4 Substrings 116 #TOC> 4 Substrings 116
#TOC> 5 Creating strings: sprintf() 137 #TOC> 5 Creating strings: sprintf() 137
#TOC> 6 Changing strings 172 #TOC> 6 Changing strings 172
#TOC> 6.1.1 Changing case 174 #TOC> 6.1.1 Changing case 174
#TOC> 6.1.2 Reverse 179 #TOC> 6.1.2 Reverse 179
#TOC> 6.1.3 Change characters 183 #TOC> 6.1.3 Change characters 183
#TOC> 6.1.4 Substitute characters 211 #TOC> 6.1.4 Substitute characters 211
#TOC> 6.2 stringi and stringr 231 #TOC> 6.2 stringi and stringr 231
#TOC> 6.3 dbSanitizeSequence() 241 #TOC> 6.3 dbSanitizeSequence() 241
#TOC> 7 Permuting and sampling 253 #TOC> 7 Permuting and sampling 253
#TOC> 7.1 Permutations 260 #TOC> 7.1 Permutations 260
#TOC> 7.2 Sampling 306 #TOC> 7.2 Sampling 306
#TOC> 7.2.1 Equiprobable characters 308 #TOC> 7.2.1 Equiprobable characters 308
#TOC> 7.2.2 Defined probability vector 350 #TOC> 7.2.2 Defined probability vector 350
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 Prepare ============================================================= # = 1 Prepare =============================================================
# Much basic sequence handling is supported by the Bioconductor package # Much basic sequence handling is supported by the Bioconductor package
# Biostrings. # Biostrings.
if (! requireNamespace("BiocManager", quietly = TRUE)) { if (! requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager") install.packages("BiocManager")
} }
if (! requireNamespace("Biostrings", quietly = TRUE)) { if (! requireNamespace("Biostrings", quietly = TRUE)) {
BiocManager::install("Biostrings") BiocManager::install("Biostrings")
} }
# Package information: # Package information:
# library(help = Biostrings) # basic information # library(help = Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes # browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets # data(package = "Biostrings") # available datasets
# = 2 Storing Sequence ==================================================== # = 2 Storing Sequence ====================================================
# Sequences can be represented and stored as vectors of single characters ... # Sequences can be represented and stored as vectors of single characters ...
(v <- c("D", "I", "V", "M", "T", "Q")) (v <- c("D", "I", "V", "M", "T", "Q"))
# ... as strings ... # ... as strings ...
(s <- "DIVMTQ") (s <- "DIVMTQ")
# ... or as more complex objects with rich metadata e.g. as a Biostrings # ... or as more complex objects with rich metadata e.g. as a Biostrings
# DNAString, RNAString, AAString, etc. # DNAString, RNAString, AAString, etc.
(a <- Biostrings::AAString("DIVMTQ")) (a <- Biostrings::AAString("DIVMTQ"))
# ... and all of these representations can be interconverted: # ... and all of these representations can be interconverted:
# string to vector ... # string to vector ...
unlist(strsplit(s, "")) unlist(strsplit(s, ""))
# vector to string ... # vector to string ...
paste(v, sep = "", collapse = "") paste(v, sep = "", collapse = "")
# ... and AAString to plain string. # ... and AAString to plain string.
as.character(a) as.character(a)
# Since operations with character vectors trivially follow all other vector # Since operations with character vectors trivially follow all other vector
# conventions and syntax, and we will look at Biostrings methods in more # conventions and syntax, and we will look at Biostrings methods in more
# detail in a later unit, we will focus on basic strings in the following. # detail in a later unit, we will focus on basic strings in the following.
# = 3 String properties =================================================== # = 3 String properties ===================================================
length(s) # why ??? length(s) # why ???
nchar(s) # Aha! nchar(s) # Aha!
# = 4 Substrings ========================================================== # = 4 Substrings ==========================================================
# Use the substr() function # Use the substr() function
substr(s, 2, 4) substr(s, 2, 4)
# or the similar substring() # or the similar substring()
substring(s, 2, 4) substring(s, 2, 4)
# Note: both functions are vectorized (i.e. they operate on vectors # Note: both functions are vectorized (i.e. they operate on vectors
# of arguments, you don't need to loop over input)... # of arguments, you don't need to loop over input)...
myBiCodes <- c("HOMSA", "MUSMU", "FUGRU", "XENLA") myBiCodes <- c("HOMSA", "MUSMU", "FUGRU", "XENLA")
substr( myBiCodes, 1, 3) substr( myBiCodes, 1, 3)
substring(myBiCodes, 1, 3) substring(myBiCodes, 1, 3)
# ... however only substring() will also use vectors for start and stop # ... however only substring() will also use vectors for start and stop
s <- "gatattgtgatgacccagtaa" # a DNA sequence s <- "gatattgtgatgacccagtaa" # a DNA sequence
(vI <- seq(1, nchar(s), by = 3)) # an index vector (vI <- seq(1, nchar(s), by = 3)) # an index vector
substr( s, vI, vI+2) # ... returns only the first nucleotide triplet substr( s, vI, vI+2) # ... returns only the first nucleotide triplet
substring(s, vI, vI+2) # ... returns all triplets substring(s, vI, vI+2) # ... returns all triplets
# = 5 Creating strings: sprintf() ========================================= # = 5 Creating strings: sprintf() =========================================
# Sprintf is a very smart, very powerful function and has cognates in all # Sprintf is a very smart, very powerful function and has cognates in all
# other programming languages. It has a bit of a learning curve, but this is # other programming languages. It has a bit of a learning curve, but this is
# totally worth it: # totally worth it:
# the function takes a format string, and a list of other arguments. It returns # the function takes a format string, and a list of other arguments. It returns
# a formatted string. Here are some examples - watch carefully for sprintf() # a formatted string. Here are some examples - watch carefully for sprintf()
# calls elsewhere in the code. # calls elsewhere in the code.
sprintf("Just a string.") sprintf("Just a string.")
sprintf("A string and the number %d.", 5) sprintf("A string and the number %d.", 5)
sprintf("More numbers: %d ate %d.", 7, 9) # Sorry sprintf("More numbers: %d ate %d.", 7, 9) # Sorry
sprintf("Pi is ~ %1.2f ...", pi) sprintf("Pi is ~ %1.2f ...", pi)
sprintf("or more accurately ~ %1.11f.", pi) sprintf("or more accurately ~ %1.11f.", pi)
x <- "bottles of beer" x <- "bottles of beer"
N <- 99 N <- 99
sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.", sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.",
N, x, N, x, "one down, and pass it around", N - 1, x) N, x, N, x, "one down, and pass it around", N - 1, x)
# Note that in the last example, the value of the string was displayed with # Note that in the last example, the value of the string was displayed with
# R's usual print-formatting function and therefore the line-break "\n" did # R's usual print-formatting function and therefore the line-break "\n" did
# not actually break the line. To have line breaks, tabs etc, you need to use # not actually break the line. To have line breaks, tabs etc, you need to use
# cat() to display the string: # cat() to display the string:
for (i in N:(N-4)) { for (i in N:(N-4)) {
cat(sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.\n\n", cat(sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.\n\n",
i, x, i, x, "one down, and pass it around", i - 1, x)) i, x, i, x, "one down, and pass it around", i - 1, x))
} }
# sprintf() is vectorized: if one of its parameters is a vector, it # sprintf() is vectorized: if one of its parameters is a vector, it
# will generate one output string for each of the vector's elements: # will generate one output string for each of the vector's elements:
cat(sprintf("\n%s fish", c("one", "two", "red", "blue"))) cat(sprintf("\n%s fish", c("one", "two", "red", "blue")))
# = 6 Changing strings ==================================================== # = 6 Changing strings ====================================================
# === 6.1.1 Changing case # === 6.1.1 Changing case
tolower(s) tolower(s)
toupper(tolower(s)) toupper(tolower(s))
# === 6.1.2 Reverse # === 6.1.2 Reverse
# (This used to work in Biostrings, apparently it doesn't work anymore. Why?) # (This used to work in Biostrings, apparently it doesn't work anymore. Why?)
# Biostrings::str_rev(s) # Biostrings::str_rev(s)
# The following works, of course, but is awkward: # The following works, of course, but is awkward:
s s
paste0(rev(unlist(strsplit(s, ""))), collapse = "") paste0(rev(unlist(strsplit(s, ""))), collapse = "")
# reverse complement # reverse complement
COMP <- c("t", "g", "c", "a") COMP <- c("t", "g", "c", "a")
names(COMP) <- c("a", "c", "g", "t") # mapping the complement via names names(COMP) <- c("a", "c", "g", "t") # mapping the complement via names
s s
paste0(COMP[rev(unlist(strsplit(s, "")))], collapse = "") paste0(COMP[rev(unlist(strsplit(s, "")))], collapse = "")
# === 6.1.3 Change characters # === 6.1.3 Change characters
# chartr(old, new, x) maps all characters in x that appear in "old" to the # chartr(old, new, x) maps all characters in x that appear in "old" to the
# corresponding character in "new." Kind of like the COMP vector above ... # corresponding character in "new." Kind of like the COMP vector above ...
chartr("aeio", "uuuu", "We hold these truths to be self-evident ...") chartr("aeio", "uuuu", "We hold these truths to be self-evident ...")
# One could implement toupper() and tolower() with this - remember that R has # One could implement toupper() and tolower() with this - remember that R has
# character vectors of uppercase and lowercase letters as language constants. # character vectors of uppercase and lowercase letters as language constants.
chartr(paste0(letters, collapse = ""), chartr(paste0(letters, collapse = ""),
paste0(LETTERS, collapse = ""), paste0(LETTERS, collapse = ""),
"Twinkle, twinkle little star, how I wonder what you are.") "Twinkle, twinkle little star, how I wonder what you are.")
# One amusing way to use the function is for a reversible substitution # One amusing way to use the function is for a reversible substitution
# cypher. # cypher.
alBet <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .,;:?0123456789" alBet <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .,;:?0123456789"
set.seed(112358) # set RNG seed for repeatable randomness set.seed(112358) # set RNG seed for repeatable randomness
( myCypher <- paste0(sample(unlist(strsplit(alBet, ""))), collapse = "") ) ( myCypher <- paste0(sample(unlist(strsplit(alBet, ""))), collapse = "") )
set.seed(NULL) # reset the RNG set.seed(NULL) # reset the RNG
# encode ... # encode ...
(x <- chartr(alBet, myCypher, "... seven for a secret, never to be told.")) (x <- chartr(alBet, myCypher, "... seven for a secret, never to be told."))
# decode ... # decode ...
chartr(myCypher, alBet, x) chartr(myCypher, alBet, x)
# (Nb. substitution cyphers are easy to crack!) # (Nb. substitution cyphers are easy to crack!)
# === 6.1.4 Substitute characters # === 6.1.4 Substitute characters
# gsub can change lengths. # gsub can change lengths.
# Example: implementing the binary Fibonacci sequence: # Example: implementing the binary Fibonacci sequence:
# 0 -> 1; 1 -> 10 , in three nested gsub() statements # 0 -> 1; 1 -> 10 , in three nested gsub() statements
( s <- 1 ) ( s <- 1 )
( s <- gsub("2", "10", gsub("0", "1", gsub("1", "2", s))) ) ( s <- gsub("2", "10", gsub("0", "1", gsub("1", "2", s))) )
# Iterate this line a few times ... # Iterate this line a few times ...
# #
# cf. http://www.maths.surrey.ac.uk/hosted-sites/R.Knott/Fibonacci/fibrab.html # cf. http://www.maths.surrey.ac.uk/hosted-sites/R.Knott/Fibonacci/fibrab.html
# for the features of the sequence. # for the features of the sequence.
# I use gsub() often to delete unwanted characters ... # I use gsub() often to delete unwanted characters ...
# ... select something, and substitute the empty string for it. # ... select something, and substitute the empty string for it.
(s <- gsub("-", "", s)) (s <- gsub("-", "", s))
# For example: clean up a sequence # For example: clean up a sequence
# copy/paste from UniProt # copy/paste from UniProt
(s <- " 10 20 30 40 50 (s <- " 10 20 30 40 50
MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ") MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")
# remove numbers # remove numbers
(s <- gsub("[0-9]", "", s)) (s <- gsub("[0-9]", "", s))
# remove "whitespace" (spaces, tabs, line breaks)... # remove "whitespace" (spaces, tabs, line breaks)...
(s <- gsub("\\s", "", s)) (s <- gsub("\\s", "", s))
# == 6.2 stringi and stringr =============================================== # == 6.2 stringi and stringr ===============================================
# But there are also specialized functions e.g. to remove leading/trailing # But there are also specialized functions e.g. to remove leading/trailing
# whitespace which may be important to sanitize user input etc. Have a look at # whitespace which may be important to sanitize user input etc. Have a look at
# the function descriptions for the stringr and the stringi package. stringr is # the function descriptions for the stringr and the stringi package. stringr is
# part of the tidyverse, and for the most part a wrapper for stringi functions. # part of the tidyverse, and for the most part a wrapper for stringi functions.
# https://github.com/tidyverse/stringr # https://github.com/tidyverse/stringr
# == 6.3 dbSanitizeSequence() ============================================== # == 6.3 dbSanitizeSequence() ==============================================
# In our learning units, we use a function dbSanitizeSequence() to clean up # In our learning units, we use a function dbSanitizeSequence() to clean up
# sequences that may be copy/pasted from Web-sources # sequences that may be copy/pasted from Web-sources
cat( s <- ">FASTA header will be removed cat( s <- ">FASTA header will be removed
10 20 30 40 50 10 20 30 40 50
MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR " ) MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR " )
dbSanitizeSequence(s) dbSanitizeSequence(s)
# = 7 Permuting and sampling ============================================== # = 7 Permuting and sampling ==============================================
# An important aspect of working with strings is generating random strings # An important aspect of working with strings is generating random strings
# with given statistical properties: reference items to evaluate significance. # with given statistical properties: reference items to evaluate significance.
# == 7.1 Permutations ====================================================== # == 7.1 Permutations ======================================================
# One way to produce such reference items is to permute a string. A permuted # One way to produce such reference items is to permute a string. A permuted
# string has the same composition as the original, but all positional # string has the same composition as the original, but all positional
# information is lost. The sample() function can be used to permute: # information is lost. The sample() function can be used to permute:
# This is the sequence of the ompA secretion signal # This is the sequence of the ompA secretion signal
(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", ""))) (s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
(x <- sample(s, length(s))) # permuted (x <- sample(s, length(s))) # permuted
# Here's a small example how such permuted strings may be useful. As you look # Here's a small example how such permuted strings may be useful. As you look
# at the ompA sequence, you suspect that the two lysines near the +-charged # at the ompA sequence, you suspect that the two lysines near the +-charged
# N-terminus may not be accidental, but selected for a positively charged # N-terminus may not be accidental, but selected for a positively charged
# N-terminus. What is the chance that such a sequence has two lysines close to # N-terminus. What is the chance that such a sequence has two lysines close to
# the N-terminus simply by chance? Or put differently: what is the average # the N-terminus simply by chance? Or put differently: what is the average
# distance of two lysines in such a sequence to the N-terminus? First, we # distance of two lysines in such a sequence to the N-terminus? First, we
# need an expression that measures the distance. A simple use of the which() # need an expression that measures the distance. A simple use of the which()
# function will do just fine. # function will do just fine.
which(s == "K") # shows they are in position 2 and 3, so ... which(s == "K") # shows they are in position 2 and 3, so ...
mean(which(s == "K")) # ... gives us the average, and ... mean(which(s == "K")) # ... gives us the average, and ...
mean(which(x == "K")) # ... gives us the average of the permuted sequence. mean(which(x == "K")) # ... gives us the average of the permuted sequence.
# So what does the distribution look like? Let's do 10,000 trials. # So what does the distribution look like? Let's do 10,000 trials.
(s <- strsplit("MKKTAIAVALAGFATVAQA", split = "")[[1]])
N <- 10000
set.seed(112358)                   # set RNG seed for repeatable randomness
# One permutation per trial: record the mean position of the two lysines.
# The trials are evaluated in order, so the random draws are consumed in the
# same sequence as in an explicit for-loop.
d <- vapply(seq_len(N),
            function(i) mean(which(sample(s) == "K")),
            FUN.VALUE = numeric(1))
set.seed(NULL)                     # reset the RNG

hist(d, breaks = 20)
abline(v = 2.5, lwd = 2, col = "firebrick")  # mean lysine position in ompA

sum(d <= 2.5)  # 276. 276 of our 10000 samples are just as bunched near the
               # N-terminus or more. That's just below the significance
               # threshold of 5 %. It's a trend, but to be sure we are looking
               # at a biological effect we would need to see more
               # sequences.
# == 7.2 Sampling ========================================================== # == 7.2 Sampling ==========================================================
# === 7.2.1 Equiprobable characters # === 7.2.1 Equiprobable characters
# Assume you need a large random-nucleotide string for some statistical model. # Assume you need a large random-nucleotide string for some statistical model.
# How to create such a string? sample() can easily create it: # How to create such a string? sample() can easily create it:
nuc <- c("A", "C", "G", "T")       # the alphabet to draw from
N <- 100                           # length of the random sequence
set.seed(16818)                    # set RNG seed for repeatable randomness
# Without a prob argument every nucleotide is drawn with equal probability.
v <- sample(x = nuc, size = N, replace = TRUE)
set.seed(NULL)                     # reset the RNG
(mySeq <- paste0(v, collapse = ""))

# What's the GC content?
table(v)                           # counts per nucleotide
sum(table(v)[c("G", "C")])         # 51 is close to expected
# What's the number of CpG motifs? Easy to check with the stringi # What's the number of CpG motifs? Easy to check with the stringi
# stri_match_all() function # stri_match_all() function
if (! requireNamespace("stringi", quietly = TRUE)) { if (! requireNamespace("stringi", quietly = TRUE)) {
install.packages("stringi") install.packages("stringi")
} }
# Package information: # Package information:
# library(help = stringi) # basic information # library(help = stringi) # basic information
# browseVignettes("stringi") # available vignettes # browseVignettes("stringi") # available vignettes
# data(package = "stringi") # available datasets # data(package = "stringi") # available datasets
# Find all occurrences of the dinucleotide "CG" in mySeq.
(x <- stringi::stri_match_all(mySeq, regex = "CG"))
# Count the matches. Do not simply take length(unlist(x)): if the motif does
# not occur at all, stri_match_all() reports that as a single NA, which would
# be counted as one match. Counting only the non-NA entries gives 0 in that
# case.
sum(! is.na(unlist(x)))
# The same number, computed directly: stringi::stri_count_regex(mySeq, "CG")
# Now you could compare that number with yeast DNA sequences, and determine # Now you could compare that number with yeast DNA sequences, and determine
# whether there are more or less CpG motifs than expected by chance. # whether there are more or less CpG motifs than expected by chance.
# (cf. https://en.wikipedia.org/wiki/CpG_site) # (cf. https://en.wikipedia.org/wiki/CpG_site)
# But hold on: is that a fair comparison? sample() gives us all four nucleotides # But hold on: is that a fair comparison? sample() gives us all four nucleotides
# with the same probability. But the yeast genomic DNA GC content is only # with the same probability. But the yeast genomic DNA GC content is only
# 38%. So you would expect fewer CpG motifs based on the statistical properties # 38%. So you would expect fewer CpG motifs based on the statistical properties
# of the smaller number of Cs and Gs - before biology even comes into play. How # of the smaller number of Cs and Gs - before biology even comes into play. How
# do we account for that? # do we account for that?
# === 7.2.2 Defined probability vector # === 7.2.2 Defined probability vector
# This is where we need to know how to create samples with specific probability # This is where we need to know how to create samples with specific probability
# distributions. A crude hack would be to create a sampling source vector with # distributions. A crude hack would be to create a sampling source vector with
# 19 C, 19 G, 31 A and 31 T # 19 C, 19 G, 31 A and 31 T
rep(c("C", "G", "A", "T"), times = c(19, 19, 31, 31))
# ... but that doesn't scale if the numeric accuracy needs to be higher.
#
# However, sample() has an argument that takes care of that: you can
# explicitly specify the probabilities with which each element of the
# sampling vector should be chosen:
nuc <- c("A", "C", "G", "T")
N <- 100
myProb <- c(0.31, 0.19, 0.19, 0.31)    # sampling probabilities, one per
                                       # element of nuc, in the same order
set.seed(16818)                        # set RNG seed for repeatable randomness
v <- sample(x = nuc, size = N, replace = TRUE, prob = myProb)
set.seed(NULL)                         # reset the RNG
(mySeq <- paste0(v, collapse = ""))

# What's the GC content?
table(v)
sum(table(v)[c("G", "C")])             # Close to expected
# What's the number of CpG motifs? # What's the number of CpG motifs?
(x <- stringi::stri_match_all(mySeq, regex = "CG")) (x <- stringi::stri_match_all(mySeq, regex = "CG"))
# ... not a single one in this case. # ... not a single one in this case.
# [END] # [END]

File diff suppressed because it is too large Load Diff

View File

@ -1,349 +1,349 @@
# tocID <- "FND-Genetic_code.R" # tocID <- "FND-Genetic_code.R"
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the FND-Genetic_code unit. # R code accompanying the FND-Genetic_code unit.
# #
# Version: 1.2 # Version: 1.2
# #
# Date: 2017 10 - 2019 01 # Date: 2017 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.2 2020 Maintenance # 1.2 2020 Maintenance
# 1.1 Change from require() to requireNamespace(), # 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout, # use <package>::<function>() idiom throughout,
# use BiocManager:: not biocLite() # use BiocManager:: not biocLite()
# 1.0.1 Comment on "incomplete final line" warning in FASTA # 1.0.1 Comment on "incomplete final line" warning in FASTA
# 1.0 First live version # 1.0 First live version
# #
# TODO: # TODO:
# #
# #
# == DO NOT SIMPLY source() THIS FILE! ======================================= # == DO NOT SIMPLY source() THIS FILE! =======================================
# #
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> ---------------------------------------------------------------- #TOC> ----------------------------------------------------------------
#TOC> 1 Storing the genetic code 45 #TOC> 1 Storing the genetic code 45
#TOC> 1.1 Genetic code in Biostrings 63 #TOC> 1.1 Genetic code in Biostrings 63
#TOC> 2 Working with the genetic code 94 #TOC> 2 Working with the genetic code 94
#TOC> 2.1 Translate a sequence. 129 #TOC> 2.1 Translate a sequence. 129
#TOC> 3 An alternative representation: 3D array 212 #TOC> 3 An alternative representation: 3D array 212
#TOC> 3.1 Print a Genetic code table 246 #TOC> 3.1 Print a Genetic code table 246
#TOC> 4 Tasks 272 #TOC> 4 Tasks 272
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 Storing the genetic code ============================================ # = 1 Storing the genetic code ============================================
# The genetic code maps trinucleotide codons to amino acids. To store it, we # The genetic code maps trinucleotide codons to amino acids. To store it, we
# need some mechanism to associate the two representations. The most # need some mechanism to associate the two representations. The most
# convenient way to do that is a "named vector" which holds the amino acid # convenient way to do that is a "named vector" which holds the amino acid
# code and assigns the codons as names to its elements. # code and assigns the codons as names to its elements.
# A small excerpt of the code: the names are the codons, the values are the
# amino acids ("*" marks a stop).
x <- c(ATG = "M", CAC = "H", CAT = "H", TAA = "*", TAG = "*", TGA = "*")
x

# Then we can access the vector by the codon as name, and retrieve the
# amino acid ...
x["ATG"]
x["CAC"]
x["TAA"]

# ... or the names of elements, to retrieve the codon(s)
names(x)[x == "M"]
names(x)[x == "H"]
names(x)[x == "*"]
# == 1.1 Genetic code in Biostrings ======================================== # == 1.1 Genetic code in Biostrings ========================================
# Conveniently, the standard genetic code as well as its alternatives are # Conveniently, the standard genetic code as well as its alternatives are
# available in the Bioconductor "Biostrings" package: # available in the Bioconductor "Biostrings" package:
if (! requireNamespace("BiocManager", quietly = TRUE)) { if (! requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager") install.packages("BiocManager")
} }
if (! requireNamespace("Biostrings", quietly = TRUE)) { if (! requireNamespace("Biostrings", quietly = TRUE)) {
BiocManager::install("Biostrings") BiocManager::install("Biostrings")
} }
# Package information: # Package information:
# library(help = Biostrings) # basic information # library(help = Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes # browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets # data(package = "Biostrings") # available datasets
# The standard genetic code vector # The standard genetic code vector
Biostrings::GENETIC_CODE Biostrings::GENETIC_CODE
# The table of genetic codes. This information corresponds to this page # The table of genetic codes. This information corresponds to this page
# at the NCBI: # at the NCBI:
# https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes # https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes
Biostrings::GENETIC_CODE_TABLE Biostrings::GENETIC_CODE_TABLE
# Most of the alternative codes are mitochondrial codes. The id of the # Most of the alternative codes are mitochondrial codes. The id of the
# Alternative Yeast Nuclear code is "12" # Alternative Yeast Nuclear code is "12"
Biostrings::getGeneticCode("12") # Alternative Yeast Nuclear Biostrings::getGeneticCode("12") # Alternative Yeast Nuclear
# = 2 Working with the genetic code ======================================= # = 2 Working with the genetic code =======================================
# We'll use Biostrings::GENETIC_CODE a lot in this script, so we'll assign it # We'll use Biostrings::GENETIC_CODE a lot in this script, so we'll assign it
# to a "local" variable, rather than retrieving it from the package all the # to a "local" variable, rather than retrieving it from the package all the
# time. # time.
GC <- Biostrings::GENETIC_CODE

# This is a named vector of characters ...
str(GC)

# ... which also stores the alternative initiation codons TTG and CTG in
# an attribute of the vector. (Alternative initiation codons are sometimes
# used instead of ATG to initiate translation. If translation is not initiated
# at ATG, these are still translated with fMet.)
attributes(GC)[["alt_init_codons"]]

# But the key to using this vector is in the "names", which we use for
# subsetting the list of amino acids in whatever way we need.
names(GC)

# The translation of "TGG" ...
GC["TGG"]

# All stop codons
names(which(GC == "*"))

# All start codons
names(which(GC == "M"))   # ... or
c(names(which(GC == "M")),
  attributes(GC)[["alt_init_codons"]])
# == 2.1 Translate a sequence. ============================================= # == 2.1 Translate a sequence. =============================================
# I have provided a gene sequence in the data directory: # I have provided a gene sequence in the data directory:
# S288C_YDL056W_MBP1_coding.fsa is the yeast Mbp1 FASTA sequence. # S288C_YDL056W_MBP1_coding.fsa is the yeast Mbp1 FASTA sequence.
# read it # read it
mbp1 <- readLines(con = "./data/S288C_YDL056W_MBP1_coding.fsa")

# You will notice that this generates a Warning message:
#   Warning message:
#   In readLines("./data/S288C_YDL056W_MBP1_coding.fsa") :
#     incomplete final line found on './data/S288C_YDL056W_MBP1_coding.fsa'
#
# The reason for this is that the last character of the file is the letter "A"
# and not a "\n" line break. This file is exactly how it was sent from the
# NCBI server; I think good, defensive programming practice would have been to
# include some kind of an end-marker in the file, like a final "\n". This helps
# us recognize an incomplete transmission. Let's parse the actual sequence from
# the file, and then check for completeness.
head(mbp1)

# drop the first line (header)
mbp1 <- tail(mbp1, -1)
head(mbp1)

# concatenate all remaining lines into a single string
mbp1 <- paste0(mbp1, collapse = "")

# how long is it?
nchar(mbp1)

# how many codons?
nchar(mbp1) / 3
# That looks correct for the 833 aa sequence plus 1 stop codon. This gives us a # That looks correct for the 833 aa sequence plus 1 stop codon. This gives us a
# first verification that the file we read is complete, the nucleotides of a # first verification that the file we read is complete, the nucleotides of a
# complete ORF should be divisible by 3. # complete ORF should be divisible by 3.
# Extract the codons. There are many ways to split a long string into chunks # Extract the codons. There are many ways to split a long string into chunks
# of three characters. Here we use the Biostrings codons() function. codons() # of three characters. Here we use the Biostrings codons() function. codons()
# requires an object of type DNAString - a special kind of string with # requires an object of type DNAString - a special kind of string with
# attributes that are useful for Biostrings. Thus we convert the sequence first # attributes that are useful for Biostrings. Thus we convert the sequence first
# with DNAString(), then split it up, then convert it into a plain # with DNAString(), then split it up, then convert it into a plain
# character vector. # character vector.
# Split the sequence into codons: character -> DNAString -> codon views ->
# plain character vector with one codon per element.
mbp1Codons <- as.character(Biostrings::codons(Biostrings::DNAString(mbp1)))
head(mbp1Codons)

# Now translate each codon. Subsetting the genetic code vector GC by name is
# vectorized, so all codons are looked up in one step and the result has
# exactly one element per codon, however long the ORF is. (A hard-coded result
# length would leave empty strings at the end for any shorter sequence.)
# unname() strips the codon names so that we get a plain vector of amino acid
# codes. A codon that is not a name in GC yields NA.
mbp1AA <- unname(GC[mbp1Codons])

head(mbp1Codons)
head(mbp1AA)

tail(mbp1Codons)
tail(mbp1AA)    # Note the stop!
# The TAA "ochre" stop codon is our second verification that the nucleotide # The TAA "ochre" stop codon is our second verification that the nucleotide
# sequence is complete: a stop codon can't appear internally in an ORF. # sequence is complete: a stop codon can't appear internally in an ORF.
# We can work with the mbp1AA vector, for example to tabulate the # We can work with the mbp1AA vector, for example to tabulate the
# amino acid frequencies: # amino acid frequencies:
table(mbp1AA)                            # counts, in alphabetical order
sort(table(mbp1AA), decreasing = TRUE)   # counts, most frequent first

# Or we can paste all elements together into a single string. But let's remove
# the stop, it's not actually a part of the sequence. head() with a negative
# n returns everything except the last n elements, so this drops the final
# element of the vector:
mbp1AA <- head(mbp1AA, -1)
tail(mbp1AA)    # Note the stop is gone!

# paste it together, collapsing the elements using an empty string as the
# separation-character (i.e.: nothing)
(Mbp1 <- paste0(mbp1AA, collapse = ""))
# = 3 An alternative representation: 3D array ============================= # = 3 An alternative representation: 3D array =============================
# We don't use 3D arrays often - usually just 2D tables and data frames, so # We don't use 3D arrays often - usually just 2D tables and data frames, so
# here is a good opportunity to review the syntax of 3D arrays with a # here is a good opportunity to review the syntax of 3D arrays with a
# genetic code cube: # genetic code cube:
# Initialize, using A G C T as the names of the elements in each dimension # Initialize, using A G C T as the names of the elements in each dimension
# A 4 x 4 x 4 character array of empty strings. The same nucleotide labels
# A G C T are used along each of the three dimensions (first, second and third
# codon position).
cCube <- array(data = "",
               dim = rep(4, 3),
               dimnames = rep(list(c("A", "G", "C", "T")), 3))

# Fill it with amino acid codes. Iterating over the dimension names directly
# lets us build the codon and address the matching cell with the same three
# letters.
for (n1 in dimnames(cCube)[[1]]) {
  for (n2 in dimnames(cCube)[[2]]) {
    for (n3 in dimnames(cCube)[[3]]) {
      cCube[n1, n2, n3] <- GC[paste0(n1, n2, n3)]
    }
  }
}

# confirm
cCube["A", "T", "G"]  # methionine
cCube["T", "T", "T"]  # phenylalanine
cCube["T", "A", "G"]  # stop (amber)
# == 3.1 Print a Genetic code table ======================================== # == 3.1 Print a Genetic code table ========================================
# The data structure of our cCube is well suited to print a table. In the # The data structure of our cCube is well suited to print a table. In the
# "standard" way to print the genetic code, we write codons with the same # "standard" way to print the genetic code, we write codons with the same
# second nucleotide in columns, and arrange rows in blocks of same # second nucleotide in columns, and arrange rows in blocks of same
# first nucleotide, varying the third nucleotide fastest. This maximizes the # first nucleotide, varying the third nucleotide fastest. This maximizes the
# similarity of adjacent amino acids in the table if we print the # similarity of adjacent amino acids in the table if we print the
# nucleotides in the order T C A G. It's immediately obvious that the code # nucleotides in the order T C A G. It's immediately obvious that the code
# is not random: the universal genetic code is exceptionally error tolerant in # is not random: the universal genetic code is exceptionally error tolerant in
# the sense that mutations (or single-nucleotide translation errors) are likely # the sense that mutations (or single-nucleotide translation errors) are likely
# to result in an amino acid with similar biophysical properties as the # to result in an amino acid with similar biophysical properties as the
# original. # original.
nuc <- c("T", "C", "A", "G")

# (f and t stand for the first and third codon position; the second position
#  is handled by vectorization, see below ...)
for (f in nuc) {       # one block of lines per first nucleotide
  for (t in nuc) {     # one line per third nucleotide
    # sprintf() is vectorized: passing all of nuc as the second position
    # formats the four cells of this line in one call, one cell per column.
    cat(sprintf("%s%s%s: %s ", f, nuc, t, cCube[f, nuc, t]), sep = "")
    cat("\n")
  }
  cat("\n")            # blank line between blocks
}
# = 4 Tasks =============================================================== # = 4 Tasks ===============================================================
# Task: What do you need to change to print the table with U instead # Task: What do you need to change to print the table with U instead
# of T? Try it. # of T? Try it.
# Task: Point mutations are more often transitions (purine -> purine; # Task: Point mutations are more often transitions (purine -> purine;
# pyrimidine -> pyrimidine) than transversions (purine -> pyrimidine; # pyrimidine -> pyrimidine) than transversions (purine -> pyrimidine;
# pyrimidine -> purine), even though twice as many transversions # pyrimidine -> purine), even though twice as many transversions
# are possible in the code. This is most likely due to a deamination / # are possible in the code. This is most likely due to a deamination /
# tautomerization process that favours C -> T changes. If the code # tautomerization process that favours C -> T changes. If the code
# indeed minimizes the effect of mutations, you would expect that # indeed minimizes the effect of mutations, you would expect that
# codons that differ by a transition code for more similar amino acids # codons that differ by a transition code for more similar amino acids
# than codons that differ by a transversion. Is that true? List the set # than codons that differ by a transversion. Is that true? List the set
# of all amino acid pairs that are encoded by codons with a C -> T # of all amino acid pairs that are encoded by codons with a C -> T
# transition. Then list the set of amino acid pairs with a C -> A # transition. Then list the set of amino acid pairs with a C -> A
# transversion. Which set of pairs is more similar? # transversion. Which set of pairs is more similar?
# Task: How many stop codons do the two mbp1-gene derived amino acid sequences # Task: How many stop codons do the two mbp1-gene derived amino acid sequences
# have if you translate them in the 2. or the 3. frame? # have if you translate them in the 2. or the 3. frame?
# Task: How does the amino acid composition change if you translate the mbp1 # Task: How does the amino acid composition change if you translate the mbp1
# gene with the Alternative Yeast Nuclear code that is used by the # gene with the Alternative Yeast Nuclear code that is used by the
# "GTC clade" of fungi? # "GTC clade" of fungi?
# (cf. https://en.wikipedia.org/wiki/Alternative_yeast_nuclear_code ) # (cf. https://en.wikipedia.org/wiki/Alternative_yeast_nuclear_code )
# Solution: # Solution:
# Fetch the code # Fetch the code
# List the available genetic codes and confirm the name of the code with
# id "12".
Biostrings::GENETIC_CODE_TABLE
Biostrings::GENETIC_CODE_TABLE$name[Biostrings::GENETIC_CODE_TABLE$id == "12"]
altYcode <- Biostrings::getGeneticCode("12")

# what's the difference?
# NOTE(review): this element-wise comparison assumes that both code vectors
# list the codons in the same order - confirm.
(delta <- which(Biostrings::GENETIC_CODE != altYcode))
Biostrings::GENETIC_CODE[delta]
altYcode[delta]

# translate
# Look up all codons at once. The result has exactly one amino acid per codon,
# whatever the length of the sequence, instead of a hard-coded 834 elements.
# unname() strips the codon names to give a plain vector of amino acid codes.
altYAA <- unname(altYcode[mbp1Codons])

# Compare the compositions. Note: altYAA is translated from all codons and
# thus includes the final stop "*". If the stop was already removed from
# mbp1AA earlier in the script, the second table shows one "*" that the first
# does not.
table(mbp1AA)
table(altYAA)
# Task: The genetic code has significant redundancy, i.e. there are up to six # Task: The genetic code has significant redundancy, i.e. there are up to six
# codons that code for the same amino acid. Write code that lists how # codons that code for the same amino acid. Write code that lists how
# many amino acids are present how often i.e. it should tell you that # many amino acids are present how often i.e. it should tell you that
# two amino acids are encoded only with a single codon, three amino # two amino acids are encoded only with a single codon, three amino
# acids have six codons, etc. Solution below, but don't peek. There # acids have six codons, etc. Solution below, but don't peek. There
# are many possible ways to do this. # are many possible ways to do this.
# #
# #
# Solution: # Solution:
# The inner table() counts the codons per amino acid (or stop); the outer
# table() counts how many amino acids share each of these codon counts.
(x <- table(table(Biostrings::GENETIC_CODE)))

# confirm: (codons per amino acid) times (number of such amino acids), summed
# over all groups, must give the total number of codons.
sum(as.numeric(names(x)) * x)
# [END] # [END]

File diff suppressed because it is too large Load Diff

View File

@ -1,224 +1,224 @@
# tocID <- "FND-STA-Information_theory.R" # tocID <- "FND-STA-Information_theory.R"
# #
# ============================================================================== # ==============================================================================
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the FND-STA-Information_theory unit. # R code accompanying the FND-STA-Information_theory unit.
# #
# Version: 0.2.1 # Version: 0.2.1
# #
# Date: 2017 - 2021 # Date: 2017 - 2021
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 0.2.1 Maintenance # 0.2.1 Maintenance
# 0.2 Under development # 0.2 Under development
# 0.1 First code copied from 2016 material. # 0.1 First code copied from 2016 material.
# #
# #
# TODO: # TODO:
# #
# #
# == DO NOT SIMPLY source() THIS FILE! ======================================= # == DO NOT SIMPLY source() THIS FILE! =======================================
# #
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> -------------------------------------- #TOC> --------------------------------------
#TOC> 1 ___Section___ 39 #TOC> 1 ___Section___ 39
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 ___Section___ ======================================================= # = 1 ___Section___ =======================================================
# What level of information is "significant" # What level of information is "significant"
# Assume the background distribution is the database frequencies of # Assume the background distribution is the database frequencies of
# amino acids: # amino acids:
# Background amino acid frequencies (UniProt, October 2017), slightly adjusted
# so that they sum to 1.0. Names are the one-letter amino acid codes.
AAref <- c(A = 0.0904, C = 0.0123, D = 0.0545, E = 0.0617, F = 0.0394,
           G = 0.0724, H = 0.0221, I = 0.0573, K = 0.0504, L = 0.0986,
           M = 0.0240, N = 0.0392, P = 0.0486, Q = 0.0381, R = 0.0570,
           S = 0.0673, T = 0.0558, V = 0.0686, W = 0.0129, Y = 0.0294)

sum(AAref)  # sanity check: the frequencies add up to 1
# Function to calculate Shannon entropy # Function to calculate Shannon entropy
H <- function(pmf) {
  # Calculate Shannon entropy
  # Parameters:
  #   pmf  (numeric)  probability mass function: a vector of states and
  #                   associated probabilities. Each element of
  #                   pmf must be in (0, 1] and sum(pmf) must be 1
  #                   (within the default tolerance of all.equal()).
  # Value:
  #   Shannon entropy in bits.
  # Errors:
  #   stop()s if pmf is not numeric, contains NA, has an element outside
  #   (0, 1], or does not sum to 1 (this includes a zero-length pmf).
  # Examples:
  #   H(c(A=0.25, C=0.25, G=0.25, T=0.25))  # 2 bits entropy in a random
  #                                         # nucleotide sequence
  #   H(1)     # If all elements are the same, entropy is zero
  #

  # Note: all.equal() returns either TRUE or a character vector describing
  # the difference - it never returns FALSE. The result must therefore be
  # tested with !isTRUE(); isFALSE() would silently accept any sum.
  if (!is.numeric(pmf) || anyNA(pmf) ||
      any(pmf <= 0 | pmf > 1) ||
      !isTRUE(all.equal(1.0, sum(pmf)))) {
    stop("Input is not a discrete probability distribution.")
  }

  # log2() gives the entropy in bits directly.
  return(-sum(pmf * log2(pmf)))
}
# Why compare with all.equal() rather than "=="? Exact comparisons of
# floating point numbers are brittle. Six sixths do not add up to exactly 1:
1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6 == 1
print(1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6, digits = 22) # 0.9999999999999998889777
# all.equal() tests for _near_ equality (default tolerance ~ 1.5e-8). It
# returns TRUE on success and a description of the difference otherwise, so
# wrap it in isTRUE() whenever it is used as a condition.

# Entropy of the database frequencies (in bits):
(Href <- H(AAref))

# For comparison: the entropy if all 20 amino acids are equiprobable, i.e.
# log2(20) bits:
H(rep(1/20, 20))
# Set up a simulation to estimate the distribution of Information values # Set up a simulation to estimate the distribution of Information values
# from random sequences drawn from AAref. This is the distribution for the # from random sequences drawn from AAref. This is the distribution for the
# statistical null hypothesis: # statistical null hypothesis:
nObs    <- 15                   # number of observations (e.g. aligned sequences)
# nObs  <- 80
nTrials <- 10000                # number of trials
IObs    <- numeric(nTrials)     # Information observed in each trial

# Tabulation vector: one slot per amino acid, named like AAref.
simCounts <- setNames(numeric(20), names(AAref))

for (i in seq_len(nTrials)) {   # simulate ...
  # draw nObs amino acid letters with the background probabilities of AAref
  AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE)
  x <- table(AAobs)             # tabulate what was drawn

  simCounts[] <- 0              # reset all slots (names are kept) ...
  simCounts[names(x)] <- x      # ... fill in the observed counts ...
  simCounts <- simCounts + 0.5  # ... and add Jeffreys' pseudocounts

  Hobs <- H(simCounts / sum(simCounts))  # counts -> frequencies -> entropy
  IObs[i] <- Href - Hobs                 # information relative to background
}

# evaluate: distribution of information under the null hypothesis, with the
# 5% and 95% quantiles marked
hist(IObs, col = "#C9F4E3", xlim = c(-0.2, 1.0), breaks = 25)
abline(v = quantile(IObs, c(0.05, 0.95)), col = "#AA00CC")
# The purple lines are drawn at the 5% and 95% quantiles of the IObs distribution -
# i.e. an actual observation that lies outside the purple lines is deemed # i.e. an actual observation that lies outside the purple lines is deemed
# "significant"(1)(2). Of course, this is only true to the degree that the # "significant"(1)(2). Of course, this is only true to the degree that the
# database frequencies are a valid model for the null-hypothesis on the # database frequencies are a valid model for the null-hypothesis on the
# sequence position we are considering here. # sequence position we are considering here.
# (1) If we use 5% quantiles, this means a value is significantly larger # (1) If we use 5% quantiles, this means a value is significantly larger
# than expected, and we ignore cases when the value is < 0; if we # than expected, and we ignore cases when the value is < 0; if we
# consider both smaller and larger values, we need to use 2.5% quantiles, # consider both smaller and larger values, we need to use 2.5% quantiles,
# since 5% of all observations lie outside the 0.025 and 0.975 # since 5% of all observations lie outside the 0.025 and 0.975
# quantiles. # quantiles.
# #
# (2) For an actual observation of counts, we calculate its observed # (2) For an actual observation of counts, we calculate its observed
# _empirical_p_Value_ as (nCounts + 1)/(nTotal + 1). # _empirical_p_Value_ as (nCounts + 1)/(nTotal + 1).
# You can probably now appreciate that information is a bit of a shortcut for # You can probably now appreciate that information is a bit of a shortcut for
# biological sequences, and does not really take the different inherent # biological sequences, and does not really take the different inherent
# frequencies based on the character of the amino acids into account. For # frequencies based on the character of the amino acids into account. For
# example, L is the most frequent and C is the least frequent, but if we have an # example, L is the most frequent and C is the least frequent, but if we have an
# alignment of 1000 sequences and we see that the frequencies for L and C are # alignment of 1000 sequences and we see that the frequencies for L and C are
# swapped, that would be _very_ surprising - nevertheless, the information would # swapped, that would be _very_ surprising - nevertheless, the information would
# be 0. In order to take that into account, we should actually compute # be 0. In order to take that into account, we should actually compute
# Kullback-Leibler divergences. # Kullback-Leibler divergences.
# Two distributions: p is the background, q is the background with the
# frequencies of C and L exchanged.
p <- AAref
q <- AAref
q[c("L", "C")] <- AAref[c("C", "L")]

# Entropy only depends on the set of probabilities, not on which outcome
# carries which probability - so both values are the same:
H(p)
H(q)
KLdiv <- function(p, q) {
  # Kullback-Leibler divergence of two discrete probability distributions.
  # Parameters:
  #   p, q  (numeric)  pmfs over the same outcomes, in the same order. Both
  #                    must have the same length and contain no zeros.
  # Value:
  #   sum(p * log(p / q)), computed with the natural logarithm.
  # Errors:
  #   stop()s if the lengths differ or if either vector contains a 0.

  if (length(p) != length(q)) {
    stop("PANIC: input vector lengths differ!")
  }

  if (any(p == 0, q == 0)) {
    stop("PANIC: 0's found in input vectors!")
  }

  logRatio <- log(p / q)
  return(sum(p * logRatio))
}
KLdiv(p, p)   # a distribution does not diverge from itself
KLdiv(p, q)   # the C/L swap, in contrast, gives a divergence > 0

nObs     <- 15                  # number of observations (e.g. aligned sequences)
# nObs   <- 80
nTrials  <- 10000               # number of trials
KLdivObs <- numeric(nTrials)    # KL divergence observed in each trial

# Tabulation vector: one slot per amino acid, named like AAref.
simCounts <- setNames(numeric(20), names(AAref))

for (i in seq_len(nTrials)) {   # simulate ...
  # draw nObs amino acid letters with the background probabilities of AAref
  AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE)
  x <- table(AAobs)             # tabulate what was drawn

  simCounts[] <- 0              # reset all slots (names are kept) ...
  simCounts[names(x)] <- x      # ... fill in the observed counts ...
  simCounts <- simCounts + 0.5  # ... and add Jeffreys' pseudocounts
  simCounts <- simCounts / sum(simCounts)  # counts to frequencies

  # divergence of the simulated frequencies from the background
  KLdivObs[i] <- KLdiv(simCounts, AAref)
}

# evaluate
hist(KLdivObs, col = "#C9F4E3", breaks = 25)
abline(v = quantile(KLdivObs, c(0.05, 0.95)), col = "#AA00CC")
quantile(KLdivObs, 0.992)
# Running the simulation with KL does not give a fundamentally # Running the simulation with KL does not give a fundamentally
# different behaviour - since we are just randomly sampling. But KL would be # different behaviour - since we are just randomly sampling. But KL would be
# more sensitive in case there is biological selection, where the sampling is no # more sensitive in case there is biological selection, where the sampling is no
# longer random. If I run the same simulation, with nObs <- 80 but calculating # longer random. If I run the same simulation, with nObs <- 80 but calculating
# KLdiv instead of information, I get a 5% quantile at 0.15 - but the C/L # KLdiv instead of information, I get a 5% quantile at 0.15 - but the C/L
# frequency swap gives me a KL divergence of 0.18 - this is significant at p = # frequency swap gives me a KL divergence of 0.18 - this is significant at p =
# 0.008 - (remember, Information is 0 in this case). So that's actually quite a # 0.008 - (remember, Information is 0 in this case). So that's actually quite a
# nice addition to the toolbox. # nice addition to the toolbox.
# [END] # [END]

File diff suppressed because it is too large Load Diff

View File

@ -1,351 +1,351 @@
# tocID <- "FND-STA-Significance.R" # tocID <- "FND-STA-Significance.R"
# #
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the FND-STA-Significance unit. # R code accompanying the FND-STA-Significance unit.
# #
# Version: 1.3 # Version: 1.3
# #
# Date: 2017-09 - 2020-09 # Date: 2017-09 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.3 2020 Maintenance. Add sample solution. # 1.3 2020 Maintenance. Add sample solution.
# 1.2 Update set.seed() usage # 1.2 Update set.seed() usage
# 1.1 Corrected treatment of empirical p-value # 1.1 Corrected treatment of empirical p-value
# 1.0 First contents # 1.0 First contents
# #
# TODO: # TODO:
# #
# #
# == DO NOT SIMPLY source() THIS FILE! ======================================= # == DO NOT SIMPLY source() THIS FILE! =======================================
# #
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> ------------------------------------------------------------------ #TOC> ------------------------------------------------------------------
#TOC> 1 Significance and p-value 49 #TOC> 1 Significance and p-value 49
#TOC> 1.1 Significance levels 60 #TOC> 1.1 Significance levels 60
#TOC> 1.2 probability and p-value 77 #TOC> 1.2 probability and p-value 77
#TOC> 1.2.1 p-value illustrated 109 #TOC> 1.2.1 p-value illustrated 109
#TOC> 2 One- or two-sided 165 #TOC> 2 One- or two-sided 165
#TOC> 3 Significance by integration 209 #TOC> 3 Significance by integration 209
#TOC> 4 Significance by simulation or permutation 215 #TOC> 4 Significance by simulation or permutation 215
#TOC> 5 Final tasks 327 #TOC> 5 Final tasks 327
#TOC> 6 Sample solutions 336 #TOC> 6 Sample solutions 336
#TOC> 6.1 338 #TOC> 6.1 338
#TOC> 6.2 342 #TOC> 6.2 342
#TOC> 6.3 346 #TOC> 6.3 346
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 Significance and p-value ============================================ # = 1 Significance and p-value ============================================
# The idea of the probability of an event has a precise mathematical # The idea of the probability of an event has a precise mathematical
# interpretation, but how is it useful to know the probability? Usually we are # interpretation, but how is it useful to know the probability? Usually we are
# interested in whether we should accept or reject a hypothesis based on the # interested in whether we should accept or reject a hypothesis based on the
# observations we have. A rational way to do this is to say: if the probability # observations we have. A rational way to do this is to say: if the probability
# of observing the data is very small under the null-hypothesis, then we will # of observing the data is very small under the null-hypothesis, then we will
# assume the observation is due to something other than the null-hypothesis. But # assume the observation is due to something other than the null-hypothesis. But
# what do we mean by the "probability of our observation"? And what is "very # what do we mean by the "probability of our observation"? And what is "very
# small"? # small"?
# == 1.1 Significance levels =============================================== # == 1.1 Significance levels ===============================================
# A "very small" probability is purely a matter of convention - a cultural # A "very small" probability is purely a matter of convention - a cultural
# convention. In the biomedical field we usually call probabilities of less than
# 0.05 (5%) small enough to reject the null-hypothesis. Thus we call # 0.05 (5%) small enough to reject the null-hypothesis. Thus we call
# observations with a probability of less than 0.05 "significant" and if we want # observations with a probability of less than 0.05 "significant" and if we want
# to highlight this in text or in a graph, we often mark them with an asterisk # to highlight this in text or in a graph, we often mark them with an asterisk
# (*). Also we often call observations with a probability of less than 0.01 # (*). Also we often call observations with a probability of less than 0.01
# "highly significant" and mark them with two asterisks (**). But there is no # "highly significant" and mark them with two asterisks (**). But there is no
# special significance in these numbers, the cutoff point for significance could # special significance in these numbers, the cutoff point for significance could
# also be 0.0498631, or 0.03, or 1/(pi^3). 0.05 is just the value that the # also be 0.0498631, or 0.03, or 1/(pi^3). 0.05 is just the value that the
# British statistician Ronald Fisher happened to propose for this purpose in # British statistician Ronald Fisher happened to propose for this purpose in
# 1925. Incidentally, Fisher later recommended to use different cutoffs for # 1925. Incidentally, Fisher later recommended to use different cutoffs for
# different purposes (cf. # different purposes (cf.
# https://en.wikipedia.org/wiki/Statistical_significance). # https://en.wikipedia.org/wiki/Statistical_significance).
# == 1.2 probability and p-value =========================================== # == 1.2 probability and p-value ===========================================
# But what do we even mean by the probability of an observation? # But what do we even mean by the probability of an observation?
# Assume I am drawing samples from a normal distribution with a mean of 0 and a # Assume I am drawing samples from a normal distribution with a mean of 0 and a
# standard deviation of 1. The sample I get is ... # standard deviation of 1. The sample I get is ...
set.seed(sqrt(5))        # fix the RNG state so the draw below is repeatable
x <- rnorm(n = 1)        # one value from the standard normal distribution
set.seed(NULL)           # re-initialize the RNG
print(x, digits = 22)
# [1] -0.8969145466249813791748
# So what's the probability of that number? Obviously, the probability of # So what's the probability of that number? Obviously, the probability of
# getting exactly this number is very, very, very small. But also obviously, # getting exactly this number is very, very, very small. But also obviously,
# this does not mean that observing this number is in any way significant - we # this does not mean that observing this number is in any way significant - we
# always observe some number. That's not what we mean in this case. There are # always observe some number. That's not what we mean in this case. There are
# several implicit assumptions when we speak of the probability of an # several implicit assumptions when we speak of the probability of an
# observation: # observation:
# 1: the observation can be compared to a probability distribution; # 1: the observation can be compared to a probability distribution;
# 2: that distribution can be integrated between any specific value # 2: that distribution can be integrated between any specific value
# and its upper and lower bounds (or +- infinity). # and its upper and lower bounds (or +- infinity).
# Then what we really mean by the probability of an observation in the context # Then what we really mean by the probability of an observation in the context
# of that distribution is: the probability of observing that value, or a value # of that distribution is: the probability of observing that value, or a value
# more extreme than the one we have. We call this the p-value. Note that we are # more extreme than the one we have. We call this the p-value. Note that we are
# not talking about an individual number anymore, we are talking about the area # not talking about an individual number anymore, we are talking about the area
# under the curve between our observation and the upper (or lower) bound of the # under the curve between our observation and the upper (or lower) bound of the
# curve, as a fraction of the whole. # curve, as a fraction of the whole.
# === 1.2.1 p-value illustrated # === 1.2.1 p-value illustrated
# Let's illustrate. First we draw a million random values from our # Let's illustrate. First we draw a million random values from our
# standard, normal distribution: # standard, normal distribution:
N <- 1e6                  # one million
set.seed(112358)          # set RNG seed for repeatable randomness
r <- rnorm(N)             # N values from a normal distribution
set.seed(NULL)            # reset the RNG

# Let's see what the distribution looks like:
(h <- hist(r))
# The histogram details are now available in the list h - e.g. h$counts

# Where is the value we have drawn previously?
abline(v = x, col = "#EE0000")

# How many values are smaller?
sum(r < x)

# Let's color the bars. The number of breaks below x is also the index of
# the bar that contains x:
idx <- sum(h$breaks < x)

# One color per bar: red for the bars below x, white for the bar that
# contains x, green for the bars above x ...
hCol <- c(rep("#EE000044", idx - 1),
          "#FFFFFFFF",
          rep("#00EE0044", sum(h$breaks > x) - 1))

# ... then plot the histogram, with colored bars ...
hist(r, col = hCol)

# ... split the white bar at x into a red and a green rectangle ...
xMin <- h$breaks[idx]       # left edge of the bar containing x
xMax <- h$breaks[idx + 1]   # right edge of that bar
y    <- h$counts[idx]       # height of that bar
rect(xMin, 0, x,    y, col = "#EE000044", border = TRUE)
rect(x,    0, xMax, y, col = "#00EE0044", border = TRUE)

# ... and a red line for our observation.
abline(v = x, col = "#EE0000", lwd = 2)
# The p-value of our observation is the red area as a fraction of the # The p-value of our observation is the red area as a fraction of the
# whole histogram (red + green). # whole histogram (red + green).
# Task: # Task:
# Explain how the expression sum(r < x) works to give us a count of values # Explain how the expression sum(r < x) works to give us a count of values
# with the property we are looking for. E.g., examine -4:4 < x # with the property we are looking for. E.g., examine -4:4 < x
# Task: # Task:
# Write an expression to estimate the probability that a value # Write an expression to estimate the probability that a value
# drawn from the vector r is less-or-equal to x. The result you get # drawn from the vector r is less-or-equal to x. The result you get
# will depend on the exact values that went into the vector r but it should # will depend on the exact values that went into the vector r but it should
# be close to 0.185 That expression is the p-value associated with x. # be close to 0.185 That expression is the p-value associated with x.
# (Sample solution 6.1) # (Sample solution 6.1)
# = 2 One- or two-sided =================================================== # = 2 One- or two-sided ===================================================
# The shape of our histogram confirms that the rnorm() function has returned # The shape of our histogram confirms that the rnorm() function has returned
# values that appear distributed according to a normal distribution. In a normal # values that appear distributed according to a normal distribution. In a normal
# distribution, readily available tables tell us that 5% of the values (i.e. our # distribution, readily available tables tell us that 5% of the values (i.e. our
# significance level) lie 1.96 (or approximately 2) standard deviations away # significance level) lie 1.96 (or approximately 2) standard deviations away
# from the mean. Is this the case here? How many values in our vector r are # from the mean. Is this the case here? How many values in our vector r are
# larger than 1.96? # larger than 1.96?
sum(r > 1.96)
# [1] 24589
# Wait - that's about 2.5% of 1,000,000, not 5% as expected. Why?
# The answer is: we have to be careful with two-sided distributions. Being
# 1.96 standard deviations away from the mean means being either larger than
# 1.96 or smaller than -1.96. This can give rise to errors. If we are simply
# interested in outliers, no matter whether larger or smaller, then the
# 1.96 SD cutoff for significance is correct. But if we are specifically
# interested in, say, larger values, because a smaller value is not
# meaningful, then the significance cutoff, expressed as standard deviations,
# is relaxed. We can use the quantile function to see what the cutoff values
# are:
quantile(r)
quantile(r, probs = c(0.025, 0.975)) # the symmetric 2.5% boundaries:
# close to ± 1.96, as expected
quantile(r, probs = 0.95) # the single, one-sided 5% boundary:
# close to 1.64 . Check counts to confirm:
sum(r > quantile(r, probs = 0.95))
# [1] 50000
# which is 5%, as expected.
# Task: # Task:
# Use abline() to add the p = 0.05 boundary for smaller values to the histogram. # Use abline() to add the p = 0.05 boundary for smaller values to the histogram.
# (Sample solution 6.2) # (Sample solution 6.2)
# To summarize: when we evaluate the significance of an event, we divide a # To summarize: when we evaluate the significance of an event, we divide a
# probability distribution into two parts at the point where the event was # probability distribution into two parts at the point where the event was
# observed. We then ask whether the integral over the more extreme part is less # observed. We then ask whether the integral over the more extreme part is less
# or more than 5% of the whole. If it is less, we deem the event to be # or more than 5% of the whole. If it is less, we deem the event to be
# significant. # significant.
# #
# = 3 Significance by integration ========================================= # = 3 Significance by integration =========================================
# If the underlying probability distribution can be analytically or numerically # If the underlying probability distribution can be analytically or numerically
# integrated, the significance of an observation can be directly computed.
# = 4 Significance by simulation or permutation =========================== # = 4 Significance by simulation or permutation ===========================
# But whether the integration is correct, or relies on assumptions that may not # But whether the integration is correct, or relies on assumptions that may not
# be warranted for biological data, can be a highly technical question. # be warranted for biological data, can be a highly technical question.
# Fortunately, we can often simply run a simulation, a random resampling, or a # Fortunately, we can often simply run a simulation, a random resampling, or a
# permutation and then count the number of outcomes, just as we did with our # permutation and then count the number of outcomes, just as we did with our
# rnorm() samples. We call this an empirical p-value. (Actually, the "empirical # rnorm() samples. We call this an empirical p-value. (Actually, the "empirical
# p-value" is defined as (Nobs + 1) / (N + 1). ) # p-value" is defined as (Nobs + 1) / (N + 1). )
# Here is an example. Assume you have a protein sequence and # Here is an example. Assume you have a protein sequence and
# you speculate that positively charged residues are close to negatively charged # you speculate that positively charged residues are close to negatively charged
# residues to balance charge locally. A statistic that would capture this is the # residues to balance charge locally. A statistic that would capture this is the
# mean minimum distance between all D,E residues and the closest R,K,H # mean minimum distance between all D,E residues and the closest R,K,H
# residue. Let's compute this for the sequence of yeast Mbp1. # residue. Let's compute this for the sequence of yeast Mbp1.
# Amino acid sequence of yeast Mbp1 as a single string. The sequence is
# written as chunks of 60 residues that are collapsed without a separator.
MBP1 <- paste0(
  c("MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK",
    "ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA",
    "SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR",
    "KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ",
    "QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS",
    "PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY",
    "FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS",
    "IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP",
    "SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT",
    "ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP",
    "VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK",
    "IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR",
    "QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK",
    "IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"),
  collapse = "")
# Break the sequence string up into a vector of single characters ...
v <- strsplit(MBP1, "")[[1]]

# ... and record the positions (indices in v) of the charged residues:
ED  <- grep("[ED]", v)     # acidic:  D, E
RKH <- grep("[RKH]", v)    # basic:   R, K, H

# For every acidic residue, get the distance (in sequence positions) to the
# nearest basic residue. sep holds one distance per element of ED.
sep <- vapply(ED,
              function(pos) min(abs(RKH - pos)),
              numeric(1))

# Task: read and explain this bit of code

# Now that sep is computed, what does it look like?
table(sep)  # these are the minimum distances
            # 24 of D,E residues are adjacent to R,K,H;
            # the longest separation is 28 residues.

# What is the mean separation?
mean(sep)
# The value is 4.1 . Is this significant? Honestly, I would be hard pressed # The value is 4.1 . Is this significant? Honestly, I would be hard pressed
# to solve this analytically. But by permutation it's soooo easy. # to solve this analytically. But by permutation it's soooo easy.
# First, we combine what we have done above into a function: # First, we combine what we have done above into a function:
chSep <- function(v) {
  # Computes the mean minimum separation of oppositely charged residues:
  # for each acidic residue (D, E) the distance in sequence positions to
  # the closest basic residue (R, K, H) is determined, and these minimum
  # distances are averaged. Upper- and lowercase letters are accepted.
  # Parameter: v (char)       a vector of amino acids in the one-letter code
  # Value:     msep (numeric) mean minimum separation, or NA if v does not
  #                           contain at least one residue of each charge
  #                           (the separation is undefined in that case).
  ED  <- grep("[EDed]", v)
  RKH <- grep("[RKHrkh]", v)

  # Without this guard, min() over an empty set yields Inf with a warning
  # for every acidic residue, and mean() of an empty vector yields NaN.
  if (length(ED) == 0 || length(RKH) == 0) {
    return(NA_real_)
  }

  # one minimum distance per acidic residue
  sep <- vapply(ED,
                function(pos) min(abs(RKH - pos)),
                numeric(1))
  return(mean(sep))
}
# Execute the function to define it.

# Confirm that the function gives the same result as the number we
# calculated above:
chSep(v)

# Now we can produce a random permutation of v, and recalculate
set.seed(pi)               # set RNG seed for repeatable randomness
w <- sample(v, length(v))  # This shuffles the vector v. Memorize this
                           # code paradigm. It is very useful.
set.seed(NULL)             # reset the RNG

chSep(w)
# 3.773 ... that's actually less than what we had before.

# Let's do this 10000 times and record the results (takes a few seconds).
# Every iteration shuffles v afresh and stores the statistic of the shuffled
# sequence; chs is the resulting vector of N values.
N <- 10000
chs <- vapply(seq_len(N),
              function(iTrial) chSep(sample(v, length(v))),  # charge
              numeric(1))

# Distribution of the permuted values, with the observed value marked in red
hist(chs, breaks = 50)
abline(v = chSep(v), col = "#EE0000")
# Contrary to our expectations, the actual observed mean minimum charge # Contrary to our expectations, the actual observed mean minimum charge
# separation seems to be larger than what we observe in randomly permuted # separation seems to be larger than what we observe in randomly permuted
# sequences. But is this significant? Your task to find out. # sequences. But is this significant? Your task to find out.
# Task: # Task:
# Calculate the empirical p-value for chSep(v)
# (Sample solution 6.3) # (Sample solution 6.3)
# = 5 Final tasks ========================================================= # = 5 Final tasks =========================================================
# From chs, compute the empirical p-value of a mean minimum charge separation to # From chs, compute the empirical p-value of a mean minimum charge separation to
# be larger or equal to the value observed for the yeast MBP1 sequence. Note # be larger or equal to the value observed for the yeast MBP1 sequence. Note
# the result in your journal. Is it significant? Also note the result of # the result in your journal. Is it significant? Also note the result of
# the following expression for validation: # the following expression for validation:
seal(sum(chs)) seal(sum(chs))
# = 6 Sample solutions ==================================================== # = 6 Sample solutions ====================================================
# == 6.1 ================================================================== # == 6.1 ==================================================================
# #
sum(r <= x) / length(r) sum(r <= x) / length(r)
# == 6.2 ================================================================== # == 6.2 ==================================================================
# #
abline(v = quantile(r, probs = c(0.05))) abline(v = quantile(r, probs = c(0.05)))
# == 6.3 ================================================================== # == 6.3 ==================================================================
# #
( x <- (sum(chs >= chSep(v)) + 1) / (length(chs) + 1) ) ( x <- (sum(chs >= chSep(v)) + 1) / (length(chs) + 1) )
# [END] # [END]

View File

@ -1,3 +1,3 @@
# BCH441-WORK-ABC-units # BCH441-WORK-ABC-units
This is a fork of the project [ABC-units](https://github.com/hyginn/ABC-units) designed for BCH441. This setup allows changes to be committed here but updates pushed to the original repository can be fetched and pulled to keep up to date. This is a fork of the project [ABC-units](https://github.com/hyginn/ABC-units) designed for BCH441. This setup allows changes to be committed here but updates pushed to the original repository can be fetched and pulled to keep up to date.

View File

@ -1,245 +1,245 @@
# tocID <- "RPR-Biostrings.R" # tocID <- "RPR-Biostrings.R"
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Biostrings unit. # R code accompanying the RPR-Biostrings unit.
# #
# Version: 1.2 # Version: 1.2
# #
# Date: 2017-10 - 2020-09 # Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.2 2020 Updates # 1.2 2020 Updates
# 1.1 Change from require() to requireNamespace(), # 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout, # use <package>::<function>() idiom throughout,
# use BiocManager:: not biocLite()
# 1.0 2017 Revisions # 1.0 2017 Revisions
# 0.1 First code copied from 2016 material. # 0.1 First code copied from 2016 material.
# #
# #
# TODO: # TODO:
# #
# #
# == DO NOT SIMPLY source() THIS FILE! ======================================= # == DO NOT SIMPLY source() THIS FILE! =======================================
# #
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> ----------------------------------------------------------------- #TOC> -----------------------------------------------------------------
#TOC> 1 The Biostrings:: Package 56 #TOC> 1 The Biostrings:: Package 56
#TOC> 2 Getting Data into Biostrings:: Objects 88 #TOC> 2 Getting Data into Biostrings:: Objects 88
#TOC> 3 Working with Biostrings:: Objects 110 #TOC> 3 Working with Biostrings:: Objects 110
#TOC> 3.1 Properties 127 #TOC> 3.1 Properties 127
#TOC> 3.2 Subsetting 168 #TOC> 3.2 Subsetting 168
#TOC> 3.3 Operators 180 #TOC> 3.3 Operators 180
#TOC> 3.4 Transformations 187 #TOC> 3.4 Transformations 187
#TOC> 4 Getting Data out of Biostrings:: Objects 194 #TOC> 4 Getting Data out of Biostrings:: Objects 194
#TOC> 5 More 203 #TOC> 5 More 203
#TOC> 5.1 Views 205 #TOC> 5.1 Views 205
#TOC> 5.2 Iranges 219 #TOC> 5.2 Iranges 219
#TOC> 5.3 StringSets 225 #TOC> 5.3 StringSets 225
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# This is a very brief introduction to the Biostrings:: package, other units will # This is a very brief introduction to the Biostrings:: package, other units will
# be using more of the Biostrings:: functions. # be using more of the Biostrings:: functions.
# = 1 The Biostrings:: Package ============================================ # = 1 The Biostrings:: Package ============================================
# The Biostrings:: package is distributed via Bioconductor. If it is not yet
# installed, we first make sure that BiocManager is available (from CRAN) and
# then use it to install Biostrings::.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
if (!requireNamespace("Biostrings", quietly = TRUE)) {
  BiocManager::install("Biostrings")
}

# Examine the package information:
library(help = Biostrings)       # basic information
browseVignettes("Biostrings")    # available vignettes
data(package = "Biostrings")     # available datasets

# At its core, Biostrings:: objects are "classes" of type XString (you can
# think of a "class" in R as a special kind of list), that can take on
# particular flavours for RNA, DNA or amino acid sequence information.
class(Biostrings::RNAString("AUG"))
class(Biostrings::DNAString("ATG"))
class(Biostrings::AAString("M"))

# An essential property of Biostrings:: objects is that they only allow
# letters from the applicable IUPAC alphabet:
Biostrings::RNAString("AUG")
Biostrings::DNAString("AUG")     # Error! No "U" in IUPAC DNA codes
# = 2 Getting Data into Biostrings:: Objects ============================== # = 2 Getting Data into Biostrings:: Objects ==============================
# Example: read a FASTA file, extract the sequence, and convert it to a
# DNAString object. dbSanitizeSequence() is a course helper function that is
# not defined in this script.
rawSeq <- dbSanitizeSequence(
  readLines("./data/S288C_YDL056W_MBP1_coding.fsa"))
biosDNAseq <- Biostrings::DNAString(rawSeq)  # converts the nucleotide
                                             # sequence into an object of
                                             # class DNAString

# Multi FASTA files can be read directly as an "XStringSet" ...
rawMFAfile <- "./data/S288C_YDL056W_MBP1_coding.fsa"
(biosDNASet <- Biostrings::readDNAStringSet(rawMFAfile))

# ... and if you subset one sequence from the set, you get an XString object
# back again.
(Xseq <- biosDNASet[[1]])

biosDNAseq == Xseq           # the comparison evaluates to TRUE ...
identical(biosDNAseq, Xseq)  # ... and indeed the objects are deemed identical.
# = 3 Working with Biostrings:: Objects =================================== # = 3 Working with Biostrings:: Objects ===================================
# Biostrings:: is a highly engineered package that is tightly integrated into # Biostrings:: is a highly engineered package that is tightly integrated into
# the Bioconductor world - unfortunately that brings with it a somewhat # the Bioconductor world - unfortunately that brings with it a somewhat
# undesirable level of computational overhead and dependencies. Using the # undesirable level of computational overhead and dependencies. Using the
# package as we normally do - i.e. calling required functions with their # package as we normally do - i.e. calling required functions with their
# explicit package prefix is therefore not advisable. There are generics # explicit package prefix is therefore not advisable. There are generics
# that won't be properly dispatched. If you only need a small number of
# functions for a very specific context, you will probably get away with # functions for a very specific context, you will probably get away with
# Biostrings::<function>() - but even in the demonstration code of this script # Biostrings::<function>() - but even in the demonstration code of this script
# not everything works out of the box. We'll therefore load the library, # not everything works out of the box. We'll therefore load the library,
# but we'll (redundantly) use the prefix anyway so as to emphasize where # but we'll (redundantly) use the prefix anyway so as to emphasize where
# the functions come from. # the functions come from.
library(Biostrings) library(Biostrings)
# == 3.1 Properties ======================================================== # == 3.1 Properties ========================================================
# Compare the structure of the plain string and of the Biostrings:: object
str(rawSeq)
str(biosDNAseq)

length(rawSeq)      # ... is 1: one string only. To get the number of
                    # characters in a string, you need nchar().
length(biosDNAseq)  # but the length of an XString object is the number of
                    # elements
nchar(rawSeq)
nchar(biosDNAseq)   # ... but nchar() works too.

(uL <- Biostrings::uniqueLetters(biosDNAseq))

# Count frequencies - with strings, you would strsplit() into a character
# vector and then use table(). For Biostrings:: objects, use
# alphabetFrequency():
Biostrings::alphabetFrequency(biosDNAseq)

# letterFrequency() works with a defined alphabet - such as what
# uniqueLetters() returns.
Biostrings::letterFrequency(biosDNAseq, uL)

# GC contents: number of G and C, divided by the sequence length
sum(Biostrings::letterFrequency(biosDNAseq, c("G", "C"))) /
  length(biosDNAseq)

# Dinucleotide counts, and a barplot of the sorted counts
Biostrings::dinucleotideFrequency(biosDNAseq)
barplot(sort(Biostrings::dinucleotideFrequency(biosDNAseq)), cex.names = 0.5)

# Trinucleotide counts
(triNuc <- Biostrings::trinucleotideFrequency(biosDNAseq))
barplot(sort(triNuc), col = "#4499EE33")
triNuc[triNuc == max(triNuc)]   # most frequent trinucleotide(s)
triNuc[triNuc == min(triNuc)]   # least frequent trinucleotide(s)
max(triNuc) / min(triNuc)  # AAA is more than 13 times as frequent as CGT

# compare to a shuffled sequence (plotted on top of the existing barplot):
(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))
barplot(sort(triNuc), col = "#EEEE4433", add = TRUE)
max(triNuc)

# Interpret this plot.

# Another shuffled sequence, this time drawn as a new plot
(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))
barplot(sort(triNuc), col = "#EEEE4433")
max(triNuc)
# == 3.2 Subsetting ======================================================== # == 3.2 Subsetting ========================================================
# XString objects can be subset by position with the "[" operator:
biosDNAseq[4:15]

# ... which may be unexpected, because rawSeq[4:15] would not work on the
# plain character string.

# As an alternative to the "[" operator, use the subseq() function -
# especially for long sequences. This is far more efficient.
Biostrings::subseq(biosDNAseq, start = 1, end = 30)
# == 3.3 Operators ========================================================= # == 3.3 Operators =========================================================
# RNAString and DNAString objects compare U and T as equals!
Biostrings::RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") ==
  Biostrings::DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT")
# == 3.4 Transformations =================================================== # == 3.4 Transformations ===================================================
# Nucleotides 4 to 15, their reverse complement, and their translation
biosDNAseq[4:15]
Biostrings::reverseComplement(biosDNAseq[4:15])
Biostrings::translate(biosDNAseq[4:15])
# = 4 Getting Data out of Biostrings:: Objects ============================ # = 4 Getting Data out of Biostrings:: Objects ============================
# If you need a character object, use toString(): # If you need a character object, use toString():
Biostrings::toString(biosDNAseq[4:15]) Biostrings::toString(biosDNAseq[4:15])
# saveRDS() and readRDS() work like on all other R objects.
# = 5 More ================================================================ # = 5 More ================================================================
# == 5.1 Views ============================================================= # == 5.1 Views =============================================================
# Biostring "Views" are objects that store multiple substrings of one
# Biostring object. Here: three substrings of biosDNAseq, each defined by a
# start and an end position.
(myView <- Biostrings::Views(biosDNAseq,
                             start = c(1, 19, 37),
                             end   = c(15, 30, 45)))

# Views are convenient to store feature annotations: give each substring a
# name, then print name, width and sequence, one feature per line.
names(myView) <- c("Feature-A", "Feature-B", "Feature-C")
cat(sprintf("\n%s\t(%d)\t%s", names(myView), width(myView), myView ))
# == 5.2 Iranges =========================================================== # == 5.2 Iranges ===========================================================
# Biostrings:: IRanges are like Views with a common start point. These can be
# useful for feature annotations. Instead of start/end you store start/width. # useful for feature annotations. Instead of start/end you store start/width.
# == 5.3 StringSets ======================================================== # == 5.3 StringSets ========================================================
# Biostring "StringSets" store multiple sequences.
#
ompA <- Biostrings::AAString("MKKTAIAIAVALAGFATVAQA")
sample(ompA)  # sample can work directly on a Biostring object to shuffle it

# Collect the original sequence and nine shuffled versions as plain strings.
# The vector is allocated at its final length up front, rather than being
# grown by one element in every iteration of the loop.
x <- character(10)
x[1] <- Biostrings::toString(ompA)
for (i in 2:10) {
  x[i] <- Biostrings::toString(sample(ompA))
}

# Convert the ten strings into one AAStringSet and name its elements
shuffledPeptideSet <- Biostrings::AAStringSet(x)
names(shuffledPeptideSet) <- c("ompA", paste0("shuffle.", 1:9))
shuffledPeptideSet

length(shuffledPeptideSet)                         # number of sequences
Biostrings::width(shuffledPeptideSet)              # length of each sequence
Biostrings::alphabetFrequency(shuffledPeptideSet)  # counts per sequence
# [END] # [END]

View File

@ -1,165 +1,165 @@
# tocID <- "RPR-ChimeraX_remote.R" # tocID <- "RPR-ChimeraX_remote.R"
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code demonstrating remote scripting of ChimeraX. # R code demonstrating remote scripting of ChimeraX.
# #
# Version: 1.0.1 # Version: 1.0.1
# #
# Date: 2020-09 # Date: 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.0.1 2021 Minimal updates # 1.0.1 2021 Minimal updates
# 1.0 First ABC units version # 1.0 First ABC units version
# #
# #
# TODO: # TODO:
# %-encode and escape quotes, or just pass-through? # %-encode and escape quotes, or just pass-through?
# #
# #
# == DO NOT SIMPLY source() THIS FILE! ======================================= # == DO NOT SIMPLY source() THIS FILE! =======================================
# #
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> ------------------------------------------------------ #TOC> ------------------------------------------------------
#TOC> 1 ChimeraX REMOTE SCRIPTING 41 #TOC> 1 ChimeraX REMOTE SCRIPTING 41
#TOC> 1.1 Defining a Port 59 #TOC> 1.1 Defining a Port 59
#TOC> 1.2 Open ChimeraX 81 #TOC> 1.2 Open ChimeraX 81
#TOC> 2 WORKED EXAMPLE: SUPERPOSITION 113 #TOC> 2 WORKED EXAMPLE: SUPERPOSITION 113
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 ChimeraX REMOTE SCRIPTING =========================================== # = 1 ChimeraX REMOTE SCRIPTING ===========================================
# One of the cool features of ChimeraX is that it can be driven by Python code, # One of the cool features of ChimeraX is that it can be driven by Python code,
# both within a running session and through Python scripts. What I find even # both within a running session and through Python scripts. What I find even
# cooler though is that ChimeraX can be driven from any programming language via # cooler though is that ChimeraX can be driven from any programming language via
# its remote control function that can listen to commands sent from any other # its remote control function that can listen to commands sent from any other
# application. The interface that is used here is the standard REST (method) - # application. The interface that is used here is the standard REST (method) -
# the GET and POST verbs that ubiquitously underlie the communication of clients
# and servers on the Web. # and servers on the Web.
# In order to establish the communication between this script and ChimeraX, all # In order to establish the communication between this script and ChimeraX, all
# we need to do is: # we need to do is:
# - open ChimeraX; # - open ChimeraX;
# - tell it to listen on a specific "port"; # - tell it to listen on a specific "port";
# - send commands to that port via httr:: # - send commands to that port via httr::
# == 1.1 Defining a Port =================================================== # == 1.1 Defining a Port ===================================================
# The httr:: package needs to be available; install it if it is missing.
if (!requireNamespace("httr", quietly = TRUE)) {
  install.packages("httr")
}
# Package information:
#  library(help = httr)       # basic information
#  browseVignettes("httr")    # available vignettes
#  data(package = "httr")     # available datasets

# We need to think of a port. Any available port number between 49152-65535
# is fine. We'll choose 61803 because that's the fractional part of the
# golden ratio. But one could choose another.
CXPORT <- 61803

# Check that our current version of R supports sockets (default since V 3.3)
capabilities("sockets")   # MUST be TRUE. If not, don't continue.
# == 1.2 Open ChimeraX ===================================================== # == 1.2 Open ChimeraX =====================================================
#  - Open a fresh, new session of a recently updated version of ChimeraX
#  - type:
#
#       remotecontrol rest start port 61803
#
#    ... or whatever the value of CXPORT is.

# Now watch what happens in ChimeraX when you execute the following line.
# The URL is built from CXPORT, so that it addresses the port that was
# defined above even if a value other than 61803 was chosen.
( x <- httr::GET(paste0("http://127.0.0.1:", CXPORT,
                        "/run?command=open+1BM8")) )

# The .utilities.R script includes the function CX(), based on this principle,
# through which you can send commands to ChimeraX
CX("camera sbs")
CX("lighting soft")
CX("color sequential #1 & protein target abc palette powderblue:orchid:white")

# The command echoes Chimera's response if the parameter "quietly" is
# FALSE (default), and we can silence output with quietly = TRUE :
CX("info models #1 attribute num_residues")
CX("info models #1 attribute num_residues", quietly = TRUE)

# Either way, the command also returns Chimera's responses "invisibly";
# i.e. we can use the results by assigning the output to a variable:
hBonds <- CX("hbonds #1 & protein makePseudobonds false log true",
             quietly = TRUE)

# Parse the returned text as a table. Passing it via the "text" argument lets
# read.table() create and close its own text connection; a connection that
# is created with textConnection() in the call would be left open.
# NOTE(review): skip = 9 and column 13 presumably match the header length and
# the distance column of the ChimeraX hbonds log - confirm against the output.
x <- read.table(text = hBonds, skip = 9,
                blank.lines.skip = TRUE, fill = TRUE)
hist(x[,13], main="H-bonds", xlab="D···A (Å)", ylab="counts", col="#c9dcff")
# = 2 WORKED EXAMPLE: SUPERPOSITION ======================================= # = 2 WORKED EXAMPLE: SUPERPOSITION =======================================
# We superimpose the 1BM8 structure with the 1DUX crystal structure to be able # We superimpose the 1BM8 structure with the 1DUX crystal structure to be able
# to explore possible DNA binding regions in 1BM8 # to explore possible DNA binding regions in 1BM8
# The model for 1BM8 is already open as model 1 (#1) # The model for 1BM8 is already open as model 1 (#1)
CX("hide #1 cartoons") # hide model 1 cartoon representation CX("hide #1 cartoons") # hide model 1 cartoon representation
CX("open 1DUX") # assume this is opened as model #2 CX("open 1DUX") # assume this is opened as model #2
CX("hide #2") # hide everything ... CX("hide #2") # hide everything ...
CX("select #2/C") # chain c (protein) CX("select #2/C") # chain c (protein)
CX("show sel cartoons") # ... and show cartoons of chain c (protein) CX("show sel cartoons") # ... and show cartoons of chain c (protein)
CX("color sequential sel target c palette steelblue:darkmagenta") CX("color sequential sel target c palette steelblue:darkmagenta")
CX("view #2/C") # re-center the display CX("view #2/C") # re-center the display
CX("cofr #2/C:62@CA") # set pivot to an interface residue CX("cofr #2/C:62@CA") # set pivot to an interface residue
CX("select #2/A,B & nucleic-acid") # chains A, B are the cognate DNA CX("select #2/A,B & nucleic-acid") # chains A, B are the cognate DNA
CX("style sel stick") CX("style sel stick")
CX("show sel target ab") # show atoms/bonds CX("show sel target ab") # show atoms/bonds
CX("color sequential #2/A & nucleic-acid target ab palette teal:lightcyan") CX("color sequential #2/A & nucleic-acid target ab palette teal:lightcyan")
CX("color sequential #2/B & nucleic-acid target ab palette teal:lightcyan") CX("color sequential #2/B & nucleic-acid target ab palette teal:lightcyan")
CX("surface sel enclose sel") # compute joint accessible surface of both chains CX("surface sel enclose sel") # compute joint accessible surface of both chains
CX("transparency 50") CX("transparency 50")
CX("select clear") CX("select clear")
# Now superimpose the 1BM8 chain onto 1DUX chain C # Now superimpose the 1BM8 chain onto 1DUX chain C
CX("show #1 cartoons") CX("show #1 cartoons")
CX("matchmaker #1/A to #2/C pairing ss") # the actual superposition CX("matchmaker #1/A to #2/C pairing ss") # the actual superposition
# study the general layout, and the position of the 1BM8 secondary structure # study the general layout, and the position of the 1BM8 secondary structure
# elements relative to 1DUX # elements relative to 1DUX
# Let's examine side chain orientations in more detail # Let's examine side chain orientations in more detail
CX("hide #2/C cartoons") # hide the 1DUX protein CX("hide #2/C cartoons") # hide the 1DUX protein
# select all residues in 1BM8 that are within 3.5 A of the DNA chains (a, b) # select all residues in 1BM8 that are within 3.5 A of the DNA chains (a, b)
CX("select zone #2/A,B 3.5 #1 & protein residues true") CX("select zone #2/A,B 3.5 #1 & protein residues true")
CX("~select sel & H") # de-select H atoms CX("~select sel & H") # de-select H atoms
CX("show sel target ab") CX("show sel target ab")
CX("size stickRadius 0.4") CX("size stickRadius 0.4")
CX("select clear") CX("select clear")
# The overall architecture of the Mbp1 APSES domain is a good match for the Elk # The overall architecture of the Mbp1 APSES domain is a good match for the Elk
# transcription factor binding mode; the detailed conformations of side chains # transcription factor binding mode; the detailed conformations of side chains
# would need to change only to a minor degree. There is a very significant # would need to change only to a minor degree. There is a very significant
# degree of structural similarity; remarkable, given that the DNA is not the # degree of structural similarity; remarkable, given that the DNA is not the
# target sequence of the Mbp1 transcription factor, AND the 1BM8 structure was # target sequence of the Mbp1 transcription factor, AND the 1BM8 structure was
# determined without a DNA ligand. # determined without a DNA ligand.
CX("remotecontrol rest stop") # release the socket CX("remotecontrol rest stop") # release the socket
# Done. # Done.
# [END] # [END]

View File

@ -1,322 +1,322 @@
# tocID <- "RPR-FASTA.R" # tocID <- "RPR-FASTA.R"
# #
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the RPR-FASTA unit. # R code accompanying the RPR-FASTA unit.
# #
# Version: 1.1.2 # Version: 1.1.2
# #
# Date: 2017-10 - 2021-09 # Date: 2017-10 - 2021-09
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.1.2 style update # 1.1.2 style update
# 1.1.1 bugfix - wrong function name # 1.1.1 bugfix - wrong function name
# 1.1 2020 Maintenance. Rewrite validation logic. Add data # 1.1 2020 Maintenance. Rewrite validation logic. Add data
# to utilities. Define AACOLS # to utilities. Define AACOLS
# 1.0 New unit. # 1.0 New unit.
# #
# #
# TODO: Make a simple solution first, then extend it to error checking, and # TODO: Make a simple solution first, then extend it to error checking, and
# to handle .mfa files. # to handle .mfa files.
# #
# #
# == DO NOT SIMPLY source() THIS FILE! ======================================= # == DO NOT SIMPLY source() THIS FILE! =======================================
# #
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> ----------------------------------------------------- #TOC> -----------------------------------------------------
#TOC> 1 Reading and validating FASTA 45 #TOC> 1 Reading and validating FASTA 45
#TOC> 1.1 Validating FASTA 81 #TOC> 1.1 Validating FASTA 81
#TOC> 2 Parsing FASTA 227 #TOC> 2 Parsing FASTA 227
#TOC> 3 Interpreting FASTA 247 #TOC> 3 Interpreting FASTA 247
#TOC> 4 Writing FASTA 274 #TOC> 4 Writing FASTA 274
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 Reading and validating FASTA ======================================== # = 1 Reading and validating FASTA ========================================
# FASTA is a text based format, structured in lines that are separated by # FASTA is a text based format, structured in lines that are separated by
# line-feed or paragraph-break characters. Which one of these is used, depends # line-feed or paragraph-break characters. Which one of these is used, depends
# on your operating system. But R's readLines() function knows how to handle # on your operating system. But R's readLines() function knows how to handle
# these correctly, across platforms. Don't try to read such files "by hand". # these correctly, across platforms. Don't try to read such files "by hand".
# Here is the yeast Mbp1 gene, via SGD. # Here is the yeast Mbp1 gene, via SGD.
file.show("./data/S288C_YDL056W_MBP1_coding.fsa") file.show("./data/S288C_YDL056W_MBP1_coding.fsa")
faMBP1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa") faMBP1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
# The warning is generated because the programmer at the NCBI who implemented # The warning is generated because the programmer at the NCBI who implemented
# the code to write this FASTA file neglected to place a line-break character # the code to write this FASTA file neglected to place a line-break character
# after the last sequence character. While this is not technically incorrect, # after the last sequence character. While this is not technically incorrect,
# it is poor practice: the resulting file can't be distinguished from one that # it is poor practice: the resulting file can't be distinguished from one that
# has been truncated in transmission. # has been truncated in transmission.
head(faMBP1) head(faMBP1)
# Note that there are NO line-break characters ("\n") at the end of these # Note that there are NO line-break characters ("\n") at the end of these
# strings, even though they were present in the original file. readLines() # strings, even though they were present in the original file. readLines()
# has "consumed" these characters while reading - but every single line is in # has "consumed" these characters while reading - but every single line is in
# a vector element of its own. # a vector element of its own.
tail(faMBP1) tail(faMBP1)
# Also note that the last line has fewer characters - this means readLines() # Also note that the last line has fewer characters - this means readLines()
# imported the whole line, despite it not being terminated by "\n". # imported the whole line, despite it not being terminated by "\n".
# It's very straightforward to work with such data, for example by collapsing # It's very straightforward to work with such data, for example by collapsing
# everything except the first line into a single string ... # everything except the first line into a single string ...
f <- c(faMBP1[1], paste(faMBP1[-1], sep = "", collapse = "")) f <- c(faMBP1[1], paste(faMBP1[-1], sep = "", collapse = ""))
f[1] f[1]
nchar(f[2]) nchar(f[2])
# == 1.1 Validating FASTA ================================================== # == 1.1 Validating FASTA ==================================================
# The code above is making the assumption that everything from line 2 until # The code above is making the assumption that everything from line 2 until
# the end IS sequence, the whole sequence and nothing but sequence. # the end IS sequence, the whole sequence and nothing but sequence.
# That assumption can break down in many ways: # That assumption can break down in many ways:
# #
# - there could be more than one header line. The specification says otherwise, # - there could be more than one header line. The specification says otherwise,
# but some older files use multiple, consecutive header lines. You don't # but some older files use multiple, consecutive header lines. You don't
# want that to end up in your sequence. # want that to end up in your sequence.
# - this could be not a FASTA file at all. It could be raw sequence, a # - this could be not a FASTA file at all. It could be raw sequence, a
# different sequence file format, or a wholly different file altogether. # different sequence file format, or a wholly different file altogether.
# If you look at the file, you can immediately tell, but if you are # If you look at the file, you can immediately tell, but if you are
# reading the file in a complex workflow, you could easily import wrong # reading the file in a complex workflow, you could easily import wrong
# data into your analysis. # data into your analysis.
# - there could be more than one sequence in the file. Such Multi-FASTA files # - there could be more than one sequence in the file. Such Multi-FASTA files
# occur commonly, as downloads of ORFs from genome regions or other # occur commonly, as downloads of ORFs from genome regions or other
# sets of genes or proteins, or as the input / output for multiple # sets of genes or proteins, or as the input / output for multiple
# sequence alignment programs. # sequence alignment programs.
# #
# Data "from the wild" can (and usually does) have the most unexpected # Data "from the wild" can (and usually does) have the most unexpected
# variations and it is really, really important to be clear about the # variations and it is really, really important to be clear about the
# assumptions that you are making. It is possible to "fix" things, according # assumptions that you are making. It is possible to "fix" things, according
# to the "Robustness Principle" : # to the "Robustness Principle" :
# "Be conservative in what you send, # "Be conservative in what you send,
# be liberal in what you accept". # be liberal in what you accept".
# (cf. https://en.wikipedia.org/wiki/Robustness_principle ) # (cf. https://en.wikipedia.org/wiki/Robustness_principle )
# ... but if you think about this, that's actually a really poor idea, # ... but if you think about this, that's actually a really poor idea,
# which is much more likely to dilute standards, make unwarranted # which is much more likely to dilute standards, make unwarranted
# assumptions, and allow errors to pass silently and corrupt data. # assumptions, and allow errors to pass silently and corrupt data.
# #
# Let's discard this principle on the trash-heap of # Let's discard this principle on the trash-heap of
# things-that-sound-like-a-good-idea-but-aren't. What we do instead is test, # things-that-sound-like-a-good-idea-but-aren't. What we do instead is test,
# identify problems, and follow the principle: "crash early, crash often". Of # identify problems, and follow the principle: "crash early, crash often". Of
# course I can write code that would reformat any possible input as a FASTA # course I can write code that would reformat any possible input as a FASTA
# file - but what good will it do me if it parses the file I receive # file - but what good will it do me if it parses the file I receive
# from a server into FASTA format like: # from a server into FASTA format like:
# #
# >404- Page Not Found</title</head> # >404- Page Not Found</title</head>
# dyh-PagentfndhpThepageyreqesteddesnteistnthisserverCheckthe # dyh-PagentfndhpThepageyreqesteddesnteistnthisserverCheckthe
# spellingrcntacttheadministratrsdyhtml # spellingrcntacttheadministratrsdyhtml
# #
# Therefore, we write ourselves a FASTA checker that will enforce the following: # Therefore, we write ourselves a FASTA checker that will enforce the following:
# (1) a FASTA file contains one or more sequences separated by zero or # (1) a FASTA file contains one or more sequences separated by zero or
# more empty lines # more empty lines
# (2) a sequence contains one header line followed by # (2) a sequence contains one header line followed by
# one or more sequence lines # one or more sequence lines
# (3) a sequence line contains one or more uppercase or lowercase single # (3) a sequence line contains one or more uppercase or lowercase single
# letter amino acid codes, hyphens (gap character), or * (stop). # letter amino acid codes, hyphens (gap character), or * (stop).
# #
# Anything else should generate an error. # Anything else should generate an error.
# (Case 1): Header(s) exist # (Case 1): Header(s) exist
fX <- c("ABC", fX <- c("ABC",
"defghi", "defghi",
"klmnpq") "klmnpq")
sel <- grepl("^>", fX) # "^>" is a regular expression that sel <- grepl("^>", fX) # "^>" is a regular expression that
# means: the exact character ">" at the # means: the exact character ">" at the
# beginning ("^") of the line. # beginning ("^") of the line.
if ( ! any(sel) ) { stop("no header lines in input.") } if ( ! any(sel) ) { stop("no header lines in input.") }
# (Case 2) No adjacent header lines # (Case 2) No adjacent header lines
fX <- c(">ABC", fX <- c(">ABC",
">123", ">123",
"defghi", "defghi",
"klmnpq") "klmnpq")
sel <- grepl("^>", fX) sel <- grepl("^>", fX)
sel <- sel[- length(sel)] & sel[-1] # comparing shifted vectors sel <- sel[- length(sel)] & sel[-1] # comparing shifted vectors
if ( any(sel)) { stop("adjacent header lines in input.") } if ( any(sel)) { stop("adjacent header lines in input.") }
# (Case 3.1) all sequence lines contain only valid characters # (Case 3.1) all sequence lines contain only valid characters
# (constants for valid characters AAVALID, NUCVALID, and NUCAMBIG # (constants for valid characters AAVALID, NUCVALID, and NUCAMBIG
# are defined with the .utilities.R script) # are defined with the .utilities.R script)
AAVALID AAVALID
fX <- c(">ABC", fX <- c(">ABC",
"def ;-) ghi", "def ;-) ghi",
"klmnpq") "klmnpq")
myRegex <- sprintf("[^%s]", AAVALID) # NOT a valid character myRegex <- sprintf("[^%s]", AAVALID) # NOT a valid character
sel <- ! grepl("^>", fX) # NOT headers sel <- ! grepl("^>", fX) # NOT headers
if (any(grepl(myRegex, fX[sel]))) { if (any(grepl(myRegex, fX[sel]))) {
stop("invalid chracter(s) outside of header lines.") stop("invalid chracter(s) outside of header lines.")
} }
# (Case 3.2) all headers are followed directly by # (Case 3.2) all headers are followed directly by
# at least one letter of sequence # at least one letter of sequence
fX <- c(">ABC", fX <- c(">ABC",
"", "",
">123", ">123",
"defghi", "defghi",
"klmnpq") "klmnpq")
sel <- grep("^>", fX) + 1 # indexes of headers + 1 sel <- grep("^>", fX) + 1 # indexes of headers + 1
myRegex <- sprintf("[%s]+", AAVALID) # at least one valid character myRegex <- sprintf("[%s]+", AAVALID) # at least one valid character
if (! all(grepl(myRegex, fX[sel]))) { if (! all(grepl(myRegex, fX[sel]))) {
stop("a header has no adjacent sequence.") stop("a header has no adjacent sequence.")
} }
# Ah, you might ask - couldn't we just have dropped all empty lines, and # Ah, you might ask - couldn't we just have dropped all empty lines, and
# then caught this in Case 2? No - for two reasons: we would still miss headers # then caught this in Case 2? No - for two reasons: we would still miss headers
# at the end of file, and, we would have changed the line numbering - and # at the end of file, and, we would have changed the line numbering - and
# ideally our "production" function will create information about where the # ideally our "production" function will create information about where the
# error is to be found. # error is to be found.
# Now combine this into a function ... # Now combine this into a function ...
# val() - validate a character vector of FASTA lines.
#
# Enforces four rules, in this order, and stops at the first violation:
#   1. at least one header line (a line beginning with ">") exists;
#   2. no two header lines are directly adjacent;
#   3. non-header lines contain no character outside the valid alphabet
#      (empty lines pass, since they contain no character at all);
#   4. every header is directly followed by a line containing at least
#      one valid character.
#
# Parameters:
#   fa     chr  lines of FASTA input, one element per line
#               (e.g. the result of readLines()).
#   valid  chr  one string holding all characters permitted in sequence
#               lines. It is pasted into a regex character class, so a
#               literal "-" must be placed last. Defaults to AAVALID;
#               the default is only evaluated if no value is passed.
# Value:
#   NULL (invisibly). The function is called for its side effect:
#   stop() with a descriptive message if a rule is violated.
val <- function(fa, valid = AAVALID) {

  isHeader <- grepl("^>", fa)     # computed once, reused by all checks

  # Rule 1: header(s) exist
  if (! any(isHeader)) {
    stop("no header lines in input.")
  }

  # Rule 2: compare the vector with itself shifted by one position; a
  # TRUE in both means a header is immediately followed by a header.
  if (any(isHeader[-length(isHeader)] & isHeader[-1])) {
    stop("adjacent header lines in input.")
  }

  # Rule 3: "[^...]" matches any single character NOT in the alphabet
  if (any(grepl(sprintf("[^%s]", valid), fa[! isHeader]))) {
    stop("invalid character(s) outside of header lines.")
  }

  # Rule 4: inspect the line after each header. If a header is the last
  # line, the index is out of range, fa[...] yields NA, grepl() returns
  # FALSE for NA, and the error is raised as intended.
  afterHeader <- which(isHeader) + 1
  if (! all(grepl(sprintf("[%s]+", valid), fa[afterHeader]))) {
    stop("a header has no adjacent sequence.")
  }

  invisible(NULL)
}
# Here is an example # Here is an example
FA <- c(">head1", FA <- c(">head1",
"acdef", "acdef",
"ghi", "ghi",
"", "",
">head2", ">head2",
"kl", "kl",
">head3", ">head3",
"mn", "mn",
"pqrs") "pqrs")
val(FA) # ... should not create an error val(FA) # ... should not create an error
# A somewhat more elaborate validateFA() function was loaded with the # A somewhat more elaborate validateFA() function was loaded with the
# ./.utilities.R script. It needs a bit more bookkeeping, since NCBI multi- # ./.utilities.R script. It needs a bit more bookkeeping, since NCBI multi-
# fasta files have space-characters in their spacer lines. Try it ... # fasta files have space-characters in their spacer lines. Try it ...
validateFA(FA) validateFA(FA)
# = 2 Parsing FASTA ======================================================= # = 2 Parsing FASTA =======================================================
# Once we have validated our assumptions about our input, it's quite # Once we have validated our assumptions about our input, it's quite
# painless to parse it. I have put this together as a function and the function # painless to parse it. I have put this together as a function and the function
# gets loaded from ./.utilities.R # gets loaded from ./.utilities.R
# #
# Let's try this: # Let's try this:
# - the first 3 elements of faMBP1: # - the first 3 elements of faMBP1:
readFASTA(faMBP1[1:3]) readFASTA(faMBP1[1:3])
# - a multi FASTA file of aligned APSES domain sequences: # - a multi FASTA file of aligned APSES domain sequences:
refAPSES <- readFASTA("./data/refAPSES.mfa") refAPSES <- readFASTA("./data/refAPSES.mfa")
# Subset the sequence with "P39678" in the header # Subset the sequence with "P39678" in the header
refAPSES[grep("P39678", refAPSES$head) ,] refAPSES[grep("P39678", refAPSES$head) ,]
# = 3 Interpreting FASTA ================================================== # = 3 Interpreting FASTA ==================================================
# FASTA files are straightforward to interpret - just one thing may be of note: # FASTA files are straightforward to interpret - just one thing may be of note:
# when working with strings, we can use substr(<string>, <start>, <stop>) to # when working with strings, we can use substr(<string>, <start>, <stop>) to
# extract substrings, but more often we expand the string into a vector of # extract substrings, but more often we expand the string into a vector of
# single characters with strsplit(<string>, ""). strsplit() returns a list, # single characters with strsplit(<string>, ""). strsplit() returns a list,
# to accommodate that <string> could be a vector of many elements, therefore # to accommodate that <string> could be a vector of many elements, therefore
# we usually unlist() the result if we use it only on a single string. # we usually unlist() the result if we use it only on a single string.
# Example: How many positively charged residues in "MBP1_SACCE"? # Example: How many positively charged residues in "MBP1_SACCE"?
s <- unlist(strsplit(refAPSES$seq[grep("MBP1_SACCE", refAPSES$head)], "")) s <- unlist(strsplit(refAPSES$seq[grep("MBP1_SACCE", refAPSES$head)], ""))
s s
sum(grepl("[HKR]", s)) # 20 (+) charged residues. grepl() returns TRUE and FALSE sum(grepl("[HKR]", s)) # 20 (+) charged residues. grepl() returns TRUE and FALSE
# for the characters, sum() coerces to 1 and 0 # for the characters, sum() coerces to 1 and 0
# respectively, and that gives us the result. # respectively, and that gives us the result.
100 * sum(grepl("[HKR]", s)) / length(s) # in percent: 20.2 % 100 * sum(grepl("[HKR]", s)) / length(s) # in percent: 20.2 %
# residue distribution # residue distribution
x <- factor(s, levels = names(AACOLS)) x <- factor(s, levels = names(AACOLS))
pie(table(x)[names(AACOLS)], col = AACOLS) pie(table(x)[names(AACOLS)], col = AACOLS)
# = 4 Writing FASTA ======================================================= # = 4 Writing FASTA =======================================================
# Writing FASTA files is mostly just the reverse of reading, with one # Writing FASTA files is mostly just the reverse of reading, with one
# twist: we need to break the long sequence string into chunks of the desired # twist: we need to break the long sequence string into chunks of the desired
# width. The FASTA specification calls for a maximum of 120 characters per line, # width. The FASTA specification calls for a maximum of 120 characters per line,
# but writing out much less than that is common, since it allows us to comfortably # but writing out much less than that is common, since it allows us to comfortably
# view lines on the console, or print them on a sheet of paper (do we still # view lines on the console, or print them on a sheet of paper (do we still
# do that actually?). How do we break a string into chunks? A combination of # do that actually?). How do we break a string into chunks? A combination of
# seq(<from>, <to>, <by>) with substring(<string>, <start>, <stop>) will work # seq(<from>, <to>, <by>) with substring(<string>, <start>, <stop>) will work
# nicely. (Note that substring() is vectorized, whereas substr() is not!) As we # nicely. (Note that substring() is vectorized, whereas substr() is not!) As we
# loop through our FASTA object in memory, we can build the output by c()'ing # loop through our FASTA object in memory, we can build the output by c()'ing
# blocks of header + sequence to each other. For VERY large objects this might # blocks of header + sequence to each other. For VERY large objects this might
# be slow - in that case, we might want to precalculate the size of the output # be slow - in that case, we might want to precalculate the size of the output
# object. But that's more of a hypothetical consideration. # object. But that's more of a hypothetical consideration.
( s <- refAPSES$seq[2] ) ( s <- refAPSES$seq[2] )
nchar(s) nchar(s)
w <- 30 # width of chunk w <- 30 # width of chunk
(starts <- seq(1, nchar(s), by = w)) # starting index of chunk (starts <- seq(1, nchar(s), by = w)) # starting index of chunk
(ends <- c((starts - 1)[-1], nchar(s))) # ending index of chunk (ends <- c((starts - 1)[-1], nchar(s))) # ending index of chunk
# Task: Is this safe? What happens if nchar(s) is shorter than w? # Task: Is this safe? What happens if nchar(s) is shorter than w?
# What happens if nchar(s) is an exact multiple of w? # What happens if nchar(s) is an exact multiple of w?
substring(s, starts, ends) substring(s, starts, ends)
# confirm that the output contains the first and last residue, and both # confirm that the output contains the first and last residue, and both
# residues adjacent to the breaks # residues adjacent to the breaks
# As always, the function has been defined in ".utilities.R" for you to use # As always, the function has been defined in ".utilities.R" for you to use
# any time... type writeFASTA to examine it. # any time... type writeFASTA to examine it.
# Let's try this... # Let's try this...
writeFASTA(refAPSES, width = 40) writeFASTA(refAPSES, width = 40)
# roundtrip for validation: write refAPSES with a different format, # roundtrip for validation: write refAPSES with a different format,
# read it back in - the new dataframe must be identical # read it back in - the new dataframe must be identical
# to the original dataframe. # to the original dataframe.
fname <- tempfile() fname <- tempfile()
writeFASTA(refAPSES, fn = fname, width = 30) writeFASTA(refAPSES, fn = fname, width = 30)
identical(refAPSES, readFASTA(fname)) identical(refAPSES, readFASTA(fname))
# ...works for me :-) # ...works for me :-)
# [END] # [END]

File diff suppressed because it is too large Load Diff

View File

@ -1,385 +1,385 @@
# tocID <- "RPR-Genetic_code_optimality.R" # tocID <- "RPR-Genetic_code_optimality.R"
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Genetic_code_optimality unit. # R code accompanying the RPR-Genetic_code_optimality unit.
# #
# Version: 1.3 # Version: 1.3
# #
# Date: 2017-10 - 2020-09 # Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.3 2020 Maintenance # 1.3 2020 Maintenance
# 1.2 Change from require() to requireNamespace(), # 1.2 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout, # use <package>::<function>() idiom throughout,
# use BiocManager:: not biocLite() # use BiocManager:: not biocLite()
# 1.1 Update set.seed() usage # 1.1 Update set.seed() usage
# 1.0.1 Fixed two bugs discovered by Suan Chin Yeo. # 1.0.1 Fixed two bugs discovered by Suan Chin Yeo.
# 1.0 New material. # 1.0 New material.
# #
# #
# TODO: # TODO:
# #
# #
# == DO NOT SIMPLY source() THIS FILE! ======================================= # == DO NOT SIMPLY source() THIS FILE! =======================================
# #
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> -------------------------------------------------------------- #TOC> --------------------------------------------------------------
#TOC> 1 Designing a computational experiment 58 #TOC> 1 Designing a computational experiment 58
#TOC> 2 Setting up the tools 74 #TOC> 2 Setting up the tools 74
#TOC> 2.1 Natural and alternative genetic codes 77 #TOC> 2.1 Natural and alternative genetic codes 77
#TOC> 2.2 Effect of mutations 135 #TOC> 2.2 Effect of mutations 135
#TOC> 2.2.1 reverse-translate 146 #TOC> 2.2.1 reverse-translate 146
#TOC> 2.2.2 Randomly mutate 171 #TOC> 2.2.2 Randomly mutate 171
#TOC> 2.2.3 Forward- translate 196 #TOC> 2.2.3 Forward- translate 196
#TOC> 2.2.4 measure effect 213 #TOC> 2.2.4 measure effect 213
#TOC> 3 Run the experiment 267 #TOC> 3 Run the experiment 267
#TOC> 4 Task solutions 363 #TOC> 4 Task solutions 363
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# This unit demonstrates R code to simulate alternate genetic codes and evaluate # This unit demonstrates R code to simulate alternate genetic codes and evaluate
# their robustness to code changes. The approaches are quite simple and you # their robustness to code changes. The approaches are quite simple and you
# will be able to come up with obvious refinements; the point of this code is to # will be able to come up with obvious refinements; the point of this code is to
# demonstrate some R programming techniques, in preparation for more # demonstrate some R programming techniques, in preparation for more
# sophisticated questions later. # sophisticated questions later.
# = 1 Designing a computational experiment ================================ # = 1 Designing a computational experiment ================================
# Computational experiments are conducted like wet-lab experiments. We begin # Computational experiments are conducted like wet-lab experiments. We begin
# with a hypothesis, then define the observables that relate to the hypothesis, # with a hypothesis, then define the observables that relate to the hypothesis,
# then define the measures we apply to observations, and finally we interpret # then define the measures we apply to observations, and finally we interpret
# our observations. If we want to learn something about the evolution of the # our observations. If we want to learn something about the evolution of the
# genetic code ... # genetic code ...
# - we construct a hypothesis such as: the genetic code has evolved so as to # - we construct a hypothesis such as: the genetic code has evolved so as to
# minimize the effect of mutations; # minimize the effect of mutations;
# - we define the observables: the effect of mutations in # - we define the observables: the effect of mutations in
# sequences, given the natural and possible alternative codes; # sequences, given the natural and possible alternative codes;
# - we define the measures to quantify the effect of mutations; # - we define the measures to quantify the effect of mutations;
# - then we compute alternatives and interpret the results. # - then we compute alternatives and interpret the results.
# = 2 Setting up the tools ================================================ # = 2 Setting up the tools ================================================
# == 2.1 Natural and alternative genetic codes ============================= # == 2.1 Natural and alternative genetic codes =============================
# Load genetic code tables from the Biostrings package # Load genetic code tables from the Biostrings package
# Biostrings is distributed through Bioconductor, so the BiocManager installer
# has to be present first. Both installs are skipped when the package is
# already available.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

if (!requireNamespace("Biostrings", quietly = TRUE)) {
  BiocManager::install("Biostrings")
}
# Package information: # Package information:
# library(help = Biostrings) # basic information # library(help = Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes # browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets # data(package = "Biostrings") # available datasets
# There are many ways to generate alternative codes. The simplest way is to # There are many ways to generate alternative codes. The simplest way is to
# randomly assign amino acids to codons. A more sophisticated way is to keep the # randomly assign amino acids to codons. A more sophisticated way is to keep the
# redundancy of codons intact, since it may reflect some form of symmetry # redundancy of codons intact, since it may reflect some form of symmetry
# breaking that ignores the third nucleotide of a codon for the most part; # breaking that ignores the third nucleotide of a codon for the most part;
# therefore we only replace the amino acids of the existing code with random # therefore we only replace the amino acids of the existing code with random
# others. Here are two functions that implement these two ideas about alternate # others. Here are two functions that implement these two ideas about alternate
# codes. # codes.
randomGC <- function(GC) {
  # Return a genetic code with randomly assigned amino acids.
  # Parameters:
  #   GC  named chr  character vector of amino acid one-letter codes plus
  #                  "*" (stop), named with the codon triplet. The standard
  #                  code has 64 elements, but any length is accepted.
  # Value:  named chr  a vector with the same names and length as GC, with
  #                    random amino acid assignments in which every symbol
  #                    of the input code is encoded at least once.

  aa <- unique(GC)        # the amino acids (and stop) in the input code
  nCodons <- length(GC)   # size the draw from the input, not a fixed 64

  repeat {
    # GC[] replaces the values but keeps the codon names.
    GC[] <- sample(aa, nCodons, replace = TRUE)
    # A draw may miss some symbols; accept it only once all are present,
    # otherwise sample() again.
    if (length(unique(GC)) >= length(aa)) {
      break
    }
  }

  return(GC)
}
swappedGC <- function(GC) {
  # Return a genetic code with randomly swapped amino acids. The codon
  # redundancy pattern of the input is kept: codons that shared a symbol
  # before still share one afterwards.
  # Parameters:
  #   GC  named chr  character vector of amino acid one-letter codes plus
  #                  "*" (stop), named with the codon triplet. The standard
  #                  code has 64 elements, but any length is accepted.
  # Value:  named chr  a vector with the same names and length as GC in
  #                    which the symbols have been permuted.

  aaOrig <- unique(GC)                      # the symbols in the input code
  aaSwap <- sample(aaOrig, length(aaOrig))  # a random permutation of them
  names(aaSwap) <- aaOrig                   # look-up: original -> replacement

  # GC[] replaces every value regardless of the table's length and keeps the
  # codon names.
  GC[] <- aaSwap[GC]

  return(GC)
}
# == 2.2 Effect of mutations =============================================== # == 2.2 Effect of mutations ===============================================
# To evaluate the effects of mutations we will do the following: # To evaluate the effects of mutations we will do the following:
# - we take an amino acid sequence (Mbp1 will do just nicely); # - we take an amino acid sequence (Mbp1 will do just nicely);
# - we reverse-translate it into a nucleotide sequence; # - we reverse-translate it into a nucleotide sequence;
# - we mutate it randomly; # - we mutate it randomly;
# - we translate it back to amino acids; # - we translate it back to amino acids;
# - we count the number of mutations and evaluate their severity. # - we count the number of mutations and evaluate their severity.
# === 2.2.1 reverse-translate # === 2.2.1 reverse-translate
# To reverse-translate an amino acid vector, we randomly pick one of its # To reverse-translate an amino acid vector, we randomly pick one of its
# codons from a genetic code, and assemble all codons to a sequence. # codons from a genetic code, and assemble all codons to a sequence.
traRev <- function(s, GC) {
  # Reverse-translate an amino acid sequence: for every residue pick one of
  # its codons in GC (at random if there is more than one).
  # Parameters:
  #   s   chr  a sequence vector of one-letter amino acid codes
  #   GC  chr  a genetic code: amino acids named with their codon triplet
  # Value:
  #   A reverse-translated vector of codons, one per element of s
  # Errors:
  #   stops if s contains a symbol (or NA) that no codon in GC encodes

  s <- as.character(s)

  # Fail early with a clear message instead of a cryptic assignment error.
  unknown <- setdiff(s, GC)
  if (length(unknown) > 0) {
    stop("No codon in GC encodes: ", paste(unknown, collapse = ", "),
         call. = FALSE)
  }

  # Group the codons by the symbol they encode once, rather than scanning
  # the whole code for every residue. Codons keep their order from GC.
  codonsByAA <- split(names(GC), GC)

  vC <- character(length(s))
  for (i in seq_along(s)) {
    codon <- codonsByAA[[s[i]]]          # get all codons for this AA
    if (length(codon) > 1) {             # if there's more than one ...
      codon <- sample(codon, 1)          # pick one at random ...
    }
    vC[i] <- codon                       # store it
  }

  return(vC)
}
# === 2.2.2 Randomly mutate # === 2.2.2 Randomly mutate
# To mutate, we split a codon into its three nucleotides, then randomly replace # To mutate, we split a codon into its three nucleotides, then randomly replace
# one of the three with another nucleotide. # one of the three with another nucleotide.
randMut <- function(vC) {
  # Introduce one random point mutation into every codon.
  # Parameter:
  #   vC  chr  a vector of codons
  # Value:  chr  a vector of the same length in which each codon differs
  #              from its counterpart in vC at one position

  bases <- c("A", "C", "G", "T")

  mutateOne <- function(codon) {
    letters3 <- strsplit(codon, "")[[1]]             # the three nucleotides
    pos <- sample(1:3, 1)                            # position to change
    alternatives <- bases[bases != letters3[pos]]    # anything but the original
    letters3[pos] <- sample(alternatives, 1)         # choose the replacement
    paste0(letters3, collapse = "")                  # back to a codon string
  }

  # Codons are processed in order, so the sequence of random draws is the
  # same as when visiting them one by one in a loop.
  vC[] <- vapply(vC, mutateOne, character(1), USE.NAMES = FALSE)

  return(vC)
}
# === 2.2.3 Forward- translate # === 2.2.3 Forward- translate
traFor <- function(vC, GC) {
  # Translate a vector of codons into amino acids.
  # Parameters:
  #   vC  chr  a codon vector
  #   GC  chr  a genetic code: amino acids named with their codon triplet
  # Value:
  #   An unnamed character vector of amino acids, one per codon. Codons that
  #   are not names of GC translate to NA.

  # Indexing a named vector is vectorized: look up all codons in one step
  # instead of one at a time, then drop the codon names from the result.
  vAA <- as.character(unname(GC[vC]))

  return(vAA)
}
# === 2.2.4 measure effect # === 2.2.4 measure effect
# How do we evaluate the effect of the mutation? We'll take a simple ad hoc # How do we evaluate the effect of the mutation? We'll take a simple ad hoc
# approach: we divide amino acids into hydrophobic, hydrophilic, and neutral # approach: we divide amino acids into hydrophobic, hydrophilic, and neutral
# categories, according to their free energy of transfer from water to octanol: # categories, according to their free energy of transfer from water to octanol:
aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
aaNeutral <- c("A", "H", "T", "S", "V", "G")

# Then we will penalize as follows:
#   Changes within one category:                                 0.1
#   Changes from hydrophobic or hydrophilic to neutral or back:  0.3
#   Changes from hydrophobic to hydrophilic or back:             1.0
#   Changes to stop-codon:                                       3.0

evalMut <- function(nat, mut) {
  # Evaluate severity of mutations between amino acid sequence vectors nat and
  # mut in an ad hoc approach based on hydrophobicity changes.
  # Parameters:
  #   nat  chr  the native sequence, one amino acid per element
  #   mut  chr  the mutated sequence, same length as nat; "*" marks a stop
  # Value:
  #   num  the sum of the per-position penalties. Unchanged positions score
  #        0; so does a change from "*" to an amino acid, which the penalty
  #        scheme does not cover.
  # Note:
  #   Changes to a stop codon are penalized with 3.0 as documented. Totals
  #   are therefore higher than those from runs in which such changes were
  #   not scored.

  # Position-wise comparison below would silently recycle a shorter vector.
  stopifnot(length(nat) == length(mut))

  aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
  aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
  aaNeutral <- c("A", "H", "T", "S", "V", "G")

  penalties <- numeric(length(nat))
  lMut <- nat != mut  # logical TRUE for all mutated positions

  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaHphobic)] <- 0.1
  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaHphilic)] <- 1.0
  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaNeutral)] <- 0.3

  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaHphobic)] <- 1.0
  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaHphilic)] <- 0.1
  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaNeutral)] <- 0.3

  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaHphobic)] <- 0.3
  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaHphilic)] <- 0.3
  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaNeutral)] <- 0.1

  # Nonsense mutations: any residue changed to a stop codon.
  penalties[lMut & (mut %in% "*")] <- 3.0

  return(sum(penalties))
}
# A more sophisticated approach could take additional quantities into account, # A more sophisticated approach could take additional quantities into account,
# such as charge, size, or flexibility - and it could add heuristics, such as: # such as charge, size, or flexibility - and it could add heuristics, such as:
# proline is always bad in secondary structure, charged amino acids are terrible # proline is always bad in secondary structure, charged amino acids are terrible
# in the folded core of a protein, replacing a small by a large amino acid in # in the folded core of a protein, replacing a small by a large amino acid in
# the core is very disruptive ... etc. # the core is very disruptive ... etc.
# #
# For our experiment, we should not use a mutation data matrix however: # For our experiment, we should not use a mutation data matrix however:
# empirical mutation probabilities are superbly suited to estimate evolutionary # empirical mutation probabilities are superbly suited to estimate evolutionary
# relationships. Here however, as we are trying to evaluate effects of random # relationships. Here however, as we are trying to evaluate effects of random
# mutations on genetic codes, our reasoning would be circular - we would # mutations on genetic codes, our reasoning would be circular - we would
# discover that the natural genetic code is optimal ... because it is most # discover that the natural genetic code is optimal ... because it is most
# similar to the natural genetic code. That would be Cargo Cult bioinformatics. # similar to the natural genetic code. That would be Cargo Cult bioinformatics.
# = 3 Run the experiment ================================================== # = 3 Run the experiment ==================================================
# Fetch the standard Genetic code from Biostrings:: # Fetch the standard Genetic code from Biostrings::
stdCode <- Biostrings::GENETIC_CODE

# Fetch the nucleotide sequence for MBP1:
myDNA <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")[-1]  # skip header
myDNA <- paste0(myDNA, collapse = "")
myDNA <- as.character(Biostrings::codons(Biostrings::DNAString(myDNA)))
myDNA <- myDNA[-length(myDNA)]  # drop the stop codon

myAA <- traFor(myDNA, stdCode)

# Mutate and evaluate
set.seed(112358)
x <- randMut(myDNA)
set.seed(NULL)
x <- traFor(x, stdCode)
evalMut(myAA, x)  # 166.4

# Try this 200 times, and see how the values are distributed.
N <- 200
valSTDC <- numeric(N)

set.seed(112358)                 # set RNG seed for repeatable randomness
for (i in seq_len(N)) {          # this takes a few seconds ...
  x <- randMut(myDNA)            # mutate
  x <- traFor(x, stdCode)        # translate
  valSTDC[i] <- evalMut(myAA, x) # evaluate
}
set.seed(NULL)                   # reset the RNG

hist(valSTDC,
     breaks = 15,
     col = "palegoldenrod",
     xlim = c(0, 400),
     ylim = c(0, N / 4),
     main = "Standard vs. Synthetic Genetic Code",
     xlab = "Mutation penalty")

# This looks like a normal distribution. Let's assume the effect of mutations
# under the standard genetic code is the mean of this distribution:
effectSTDC <- mean(valSTDC)  # 178.1

# Now we can look at the effects of alternate genetic codes:

set.seed(112358)
# choose a new code
GC <- randomGC(stdCode)
set.seed(NULL)

# reverse translate hypothetical sequence according to the new code
x <- traRev(myAA, GC)

x <- randMut(x)    # randomly mutate hypothetical nucleotide sequence
x <- traFor(x, GC) # translate back, with the new code
evalMut(myAA, x)   # evaluate mutation effects: 298.5

# That seems a fair bit higher than what we saw as "effectSTDC"
# Let's try with different genetic codes. 200 trials - but this time every trial
# is with a different, synthetic genetic code.

N <- 200
valXGC <- numeric(N)

set.seed(1414214)                # set RNG seed for repeatable randomness
for (i in seq_len(N)) {
  GC <- randomGC(stdCode)        # Choose code
  x <- traRev(myAA, GC)          # reverse translate
  x <- randMut(x)                # mutate
  x <- traFor(x, GC)             # translate
  valXGC[i] <- evalMut(myAA, x)  # evaluate
}
set.seed(NULL)                   # reset the RNG

# Overlay the synthetic-code distribution on the histogram drawn above.
hist(valXGC,
     col = "plum",
     breaks = 15,
     add = TRUE)
# These two distributions are very widely separated! # These two distributions are very widely separated!
# Task: Perform the same experiment with the swapped genetic code. # Task: Perform the same experiment with the swapped genetic code.
# Compare the distributions. Interpret the result. # Compare the distributions. Interpret the result.
# These are simple experiments, under assumptions that can be refined in # These are simple experiments, under assumptions that can be refined in
# meaningful ways. Yet, even those simple computational experiments show # meaningful ways. Yet, even those simple computational experiments show
# that the Universal Genetic Code has features that one would predict if # that the Universal Genetic Code has features that one would predict if
# it has evolved under selective pressure to minimize the effects of mutations. # it has evolved under selective pressure to minimize the effects of mutations.
# Gradual change under mutation is beneficial to evolution, disruptive # Gradual change under mutation is beneficial to evolution, disruptive
# change is not. # change is not.
# = 4 Task solutions ====================================================== # = 4 Task solutions ======================================================
# Same experiment as for the random codes, but every trial uses a code in
# which the amino acids were swapped while the codon redundancy is kept.
N <- 200
valSGC <- numeric(N)

set.seed(2718282)                # set RNG seed for repeatable randomness
for (i in seq_len(N)) {
  GC <- swappedGC(stdCode)       # Choose code
  x <- traRev(myAA, GC)          # reverse translate
  x <- randMut(x)                # mutate
  x <- traFor(x, GC)             # translate
  valSGC[i] <- evalMut(myAA, x)  # evaluate
}
set.seed(NULL)                   # reset the RNG

# Semi-transparent fill, so histograms already on the plot stay visible.
hist(valSGC,
     col = "#6688FF88",
     breaks = 15,
     add = TRUE)
# [END] # [END]

View File

@ -1,50 +1,50 @@
# tocID <- "RPR-Introduction.R" # tocID <- "RPR-Introduction.R"
# #
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Introduction unit # R code accompanying the RPR-Introduction unit
# #
# Version: 1.0 # Version: 1.0
# #
# Date: 2020-09-18 # Date: 2020-09-18
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# V 1.0 Updated workflow; live # V 1.0 Updated workflow; live
# V 0.1 First code # V 0.1 First code
# #
# TODO: # TODO:
# #
# #
# == HOW TO WORK WITH LEARNING UNIT FILES ====================================== # == HOW TO WORK WITH LEARNING UNIT FILES ======================================
# #
# DO NOT SIMPLY source() THESE FILES! # DO NOT SIMPLY source() THESE FILES!
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
# === TASK: Local script # === TASK: Local script
# #
# - Open the file myScript.R # - Open the file myScript.R
# #
# - Create a section header with a date. # - Create a section header with a date.
# - Enter an R-expression that will produce the first 11 powers of 2 (starting # - Enter an R-expression that will produce the first 11 powers of 2 (starting
# from 0). Not a loop - a single expression. The first number you get must # from 0). Not a loop - a single expression. The first number you get must
# be 1. The last number you get must be 1024. # be 1. The last number you get must be 1024.
# #
# - Save the file in the myScripts folder, and close it. # - Save the file in the myScripts folder, and close it.
# #
# - Open the file again, select the expression and type Cmd+Enter (or Cmd+R) # - Open the file again, select the expression and type Cmd+Enter (or Cmd+R)
# to execute it. # to execute it.
# #
# - Done # - Done
# (This task is meant to make sure that writing R expressions, saving # (This task is meant to make sure that writing R expressions, saving
# them in scripts, opening script files and executing code in the file works # them in scripts, opening script files and executing code in the file works
# for you. If there is an issue, get in touch.) # for you. If there is an issue, get in touch.)
# [END] # [END]

View File

@ -1,168 +1,168 @@
# tocID <- "RPR-PROSITE_POST.R" # tocID <- "RPR-PROSITE_POST.R"
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Scripting_data_downloads unit. # R code accompanying the RPR-Scripting_data_downloads unit.
# #
# Version: 1.2 # Version: 1.2
# #
# Date: 2017-10 - 2020-09 # Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.2 2020 Maintenance # 1.2 2020 Maintenance
# 1.1 Change from require() to requireNamespace(), # 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout, # use <package>::<function>() idiom throughout,
# 1.0.1 Updates for slightly changed interfaces # 1.0.1 Updates for slightly changed interfaces
# 1.0 First ABC units version # 1.0 First ABC units version
# 0.1 First code copied from 2016 material. # 0.1 First code copied from 2016 material.
# #
# #
# TODO: # TODO:
# #
# #
# == DO NOT SIMPLY source() THIS FILE! ======================================= # == DO NOT SIMPLY source() THIS FILE! =======================================
# #
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> --------------------------------------------------------------------- #TOC> ---------------------------------------------------------------------
#TOC> 1 Constructing a POST command from a Web query 43 #TOC> 1 Constructing a POST command from a Web query 43
#TOC> 1.1 Task - fetchPrositeFeatures() function 148 #TOC> 1.1 Task - fetchPrositeFeatures() function 148
#TOC> 2 Task solutions 156 #TOC> 2 Task solutions 156
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 Constructing a POST command from a Web query ======================== # = 1 Constructing a POST command from a Web query ========================
if (! requireNamespace("httr", quietly = TRUE)) { if (! requireNamespace("httr", quietly = TRUE)) {
install.packages("httr") install.packages("httr")
} }
# Package information: # Package information:
# library(help = httr) # basic information # library(help = httr) # basic information
# browseVignettes("httr") # available vignettes # browseVignettes("httr") # available vignettes
# data(package = "httr") # available datasets # data(package = "httr") # available datasets
# We have reverse engineered the Web form for a ScanProsite request, and can # We have reverse engineered the Web form for a ScanProsite request, and can
# construct a valid POST request from knowing the required field names. The POST # construct a valid POST request from knowing the required field names. The POST
# command is similar to GET(), but we need an explicit request body that # command is similar to GET(), but we need an explicit request body that
# contains a list of key/value pairs # contains a list of key/value pairs
UniProtID <- "P39678" UniProtID <- "P39678"
URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi" URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
response <- httr::POST(URL, response <- httr::POST(URL,
body = list(meta = "opt1", body = list(meta = "opt1",
meta1_protein = "opt1", meta1_protein = "opt1",
seq = UniProtID, seq = UniProtID,
skip = "on", skip = "on",
output = "tabular")) output = "tabular"))
# Send off this request, and you should have a response in a few # Send off this request, and you should have a response in a few
# seconds. Let's check the status first: # seconds. Let's check the status first:
httr::status_code(response) # If this is not 200, something went wrong and it httr::status_code(response) # If this is not 200, something went wrong and it
# makes no sense to continue. If this persists, ask # makes no sense to continue. If this persists, ask
# on the Discussion Board what to do. # on the Discussion Board what to do.
# The text contents of the response is available with the # The text contents of the response is available with the
# content() function: # content() function:
httr::content(response, "text") httr::content(response, "text")
# ... should show you the same as the page contents that you have seen in the # ... should show you the same as the page contents that you have seen in the
# browser. Now we need to extract the data from the page. For this simple # browser. Now we need to extract the data from the page. For this simple
# example we can get away with using regular expressions, but in general we need # example we can get away with using regular expressions, but in general we need
# a real XML parser to parse HTML. We'll cover that in a later unit. Here, we # a real XML parser to parse HTML. We'll cover that in a later unit. Here, we
# strsplit() the response into individual lines, since each of our data elements # strsplit() the response into individual lines, since each of our data elements
# is on its own line, and then capture the contents. The way Prosite has # is on its own line, and then capture the contents. The way Prosite has
# formatted their HTML we can simply split on the "\\n" newline character - but # formatted their HTML we can simply split on the "\\n" newline character - but
# they could write the same valid HTML without any newline-characters at all. # they could write the same valid HTML without any newline-characters at all.
# Understand that we are working with a bit of a "hack" here: exploiting # Understand that we are working with a bit of a "hack" here: exploiting
# empirical assumptions rather than a formal specification. But sometimes quick # empirical assumptions rather than a formal specification. But sometimes quick
# and dirty is fine, because quick. # and dirty is fine, because quick.
lines <- unlist(strsplit(httr::content(response, "text"), "\\n")) lines <- unlist(strsplit(httr::content(response, "text"), "\\n"))
head(lines) head(lines)
# Now we define a query pattern for the lines we want: # Now we define a query pattern for the lines we want:
# we can use the uID, bracketed by two "|" pipe # we can use the uID, bracketed by two "|" pipe
# characters: # characters:
patt <- sprintf("\\|%s\\|", UniProtID) patt <- sprintf("\\|%s\\|", UniProtID)
# ... and select only the lines that match this # ... and select only the lines that match this
# pattern: # pattern:
( lines <- lines[grep(patt, lines)] ) ( lines <- lines[grep(patt, lines)] )
# ... captures the three lines of output. # ... captures the three lines of output.
# Now we break the lines apart into tokens: this is another application of # Now we break the lines apart into tokens: this is another application of
# strsplit(), but this time we split either on "pipe" characters, "|" OR on tabs # strsplit(), but this time we split either on "pipe" characters, "|" OR on tabs
# "\t". Look at the regex "\\t|\\|" in the strsplit() call: # "\t". Look at the regex "\\t|\\|" in the strsplit() call:
unlist(strsplit(lines[1], "\\t|\\|")) unlist(strsplit(lines[1], "\\t|\\|"))
# Its parts are (\\t)=tab (|)=or (\\|)=pipe. Both "t" and "|" need to be escaped # Its parts are (\\t)=tab (|)=or (\\|)=pipe. Both "t" and "|" need to be escaped
# with a backslash. "t" has to be escaped because we want to match a tab (\t), # with a backslash. "t" has to be escaped because we want to match a tab (\t),
# not the literal character "t". And "|" has to be escaped because we mean the # not the literal character "t". And "|" has to be escaped because we mean the
# literal pipe character, not its metacharacter meaning OR. Thus sometimes the # literal pipe character, not its metacharacter meaning OR. Thus sometimes the
# backslash turns a special meaning off, and sometimes it turns a special # backslash turns a special meaning off, and sometimes it turns a special
# meaning on. Unfortunately there's no easy way to tell - you just need to # meaning on. Unfortunately there's no easy way to tell - you just need to
# remember the characters - or have a reference handy. The metacharacters are # remember the characters - or have a reference handy. The metacharacters are
# (){}[]^$?*+.|&- ... and some of them have different meanings depending on # (){}[]^$?*+.|&- ... and some of them have different meanings depending on
# where in the regex they are. # where in the regex they are.
# Let's put the tokens into named slots of a data frame # Let's put the tokens into named slots of a data frame
# Parse every matching ScanProsite line into one row of a data frame.
# Each line is split on tab or pipe characters; the uID, start, end, psID,
# psName and psSeq values are taken from token positions 2, 4, 5, 6, 7 and 11.
# The rows are first collected in a list and then combined with a single
# rbind() call: growing a data frame with rbind() inside a loop copies the
# whole object on every iteration.
rows <- lapply(lines, function(line) {
  tokens <- unlist(strsplit(line, "\\t|\\|"))
  data.frame(uID    = tokens[2],
             start  = as.numeric(tokens[4]),
             end    = as.numeric(tokens[5]),
             psID   = tokens[6],
             psName = tokens[7],
             psSeq  = tokens[11])
})
# do.call(rbind, list()) returns NULL, so fall back to an empty data frame
# when no line matched the query pattern.
features <- if (length(rows) > 0) do.call(rbind, rows) else data.frame()
features
# This forms the base of a function that collects the features automatically # This forms the base of a function that collects the features automatically
# from a PrositeScan result. You can write this! # from a PrositeScan result. You can write this!
# == 1.1 Task - fetchPrositeFeatures() function ============================ # == 1.1 Task - fetchPrositeFeatures() function ============================
# Task: write a function that takes as input a UniProt ID, fetches the # Task: write a function that takes as input a UniProt ID, fetches the
# features it contains from ScanProsite and returns a data frame as given above, or # features it contains from ScanProsite and returns a data frame as given above, or
# an empty data frame if there is an error. # an empty data frame if there is an error.
# = 2 Task solutions ====================================================== # = 2 Task solutions ======================================================
# I have placed such a function into the ABC-dbUtilities.R script: look it up by # I have placed such a function into the ABC-dbUtilities.R script: look it up by
# clicking on dbFetchPrositeFeatures() in the Environment pane. # clicking on dbFetchPrositeFeatures() in the Environment pane.
# Test: # Test:
dbFetchPrositeFeatures("Q5KMQ9") dbFetchPrositeFeatures("Q5KMQ9")
# [END] # [END]

View File

@ -1,135 +1,135 @@
# tocID <- "RPR-Pipe.R" # tocID <- "RPR-Pipe.R"
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# Discussing pipe operators. # Discussing pipe operators.
# #
# Version: 1.0 # Version: 1.0
# #
# Date: 2021 10 # Date: 2021 10
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.0 New code # 1.0 New code
# #
# #
# TODO: # TODO:
# - find more interesting examples # - find more interesting examples
# #
# == DO NOT SIMPLY source() THIS FILE! ======================================= # == DO NOT SIMPLY source() THIS FILE! =======================================
# #
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> ------------------------------------------------ #TOC> ------------------------------------------------
#TOC> 1 Pipe Concept 41 #TOC> 1 Pipe Concept 41
#TOC> 2 Nested Expression 73 #TOC> 2 Nested Expression 73
#TOC> 3 magrittr:: Pipe 78 #TOC> 3 magrittr:: Pipe 78
#TOC> 4 Base R Pipe 93 #TOC> 4 Base R Pipe 93
#TOC> 5 Intermediate Assignment 108 #TOC> 5 Intermediate Assignment 108
#TOC> 6 Postscript 127 #TOC> 6 Postscript 127
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 Pipe Concept ======================================================= # = 1 Pipe Concept =======================================================
# Pipes are actually an awesome idea for any code that implements a workflow - # Pipes are actually an awesome idea for any code that implements a workflow -
# a sequence of operations, each of which transforms data in a specialized way. # a sequence of operations, each of which transforms data in a specialized way.
# #
# This principle is familiar from maths: chained functions. If I have a function # This principle is familiar from maths: chained functions. If I have a function
# y = f(x) and want to use those results as in z = g(y), I can just write # y = f(x) and want to use those results as in z = g(y), I can just write
# z = g(f(x)) # z = g(f(x))
# #
# On the unix command line, pipes were used from the very beginning, implemented # On the unix command line, pipes were used from the very beginning, implemented
# with the "|" pipe character. # with the "|" pipe character.
# #
# In R, the magrittr package provided the %>% operator, and recently the |> # In R, the magrittr package provided the %>% operator, and recently the |>
# operator has been introduced into base R. # operator has been introduced into base R.
# #
# However there are alternatives: intermediate assignment, and nested functions # However there are alternatives: intermediate assignment, and nested functions
# that have always existed in base R anyway. # that have always existed in base R anyway.
# #
# Let us look at an example. In writing this, I found out that virtually # Let us look at an example. In writing this, I found out that virtually
# ALL non-trivial examples I came up with don't translate well into this idiom # ALL non-trivial examples I came up with don't translate well into this idiom
# at all. It is actually quite limited to simple filtering operations on # at all. It is actually quite limited to simple filtering operations on
# data. A more interesting example might be added in the future, let me know if # data. A more interesting example might be added in the future, let me know if
# you have a good idea. # you have a good idea.
# #
# A somewhat contrived example is to sort a list of files by the # A somewhat contrived example is to sort a list of files by the
# length of the file names: # length of the file names:
myFiles <- list.files(pattern = "\\.R$") myFiles <- list.files(pattern = "\\.R$")
# nchar() gives the number of characters in a string, order() produces indices # nchar() gives the number of characters in a string, order() produces indices
# that map an array to its sorted form. # that map an array to its sorted form.
# #
# = 2 Nested Expression =================================================== # = 2 Nested Expression ===================================================
myFiles[order(nchar(myFiles))] myFiles[order(nchar(myFiles))]
# = 3 magrittr:: Pipe ===================================================== # = 3 magrittr:: Pipe =====================================================
if (! requireNamespace("magrittr", quietly = TRUE)) { if (! requireNamespace("magrittr", quietly = TRUE)) {
install.packages("magrittr") install.packages("magrittr")
} }
# Package information: # Package information:
# library(help = magrittr) # basic information # library(help = magrittr) # basic information
# browseVignettes("magrittr") # available vignettes # browseVignettes("magrittr") # available vignettes
# data(package = "magrittr") # available datasets # data(package = "magrittr") # available datasets
library(magrittr) library(magrittr)
myFiles %>% nchar %>% order %>% myFiles[.] myFiles %>% nchar %>% order %>% myFiles[.]
# = 4 Base R Pipe ========================================================= # = 4 Base R Pipe =========================================================
# Since version 4.1, base R now supports a pipe operator without the need # Since version 4.1, base R now supports a pipe operator without the need
# to load a special package. Such an introduction of external functionality # to load a special package. Such an introduction of external functionality
# into the language is very rare. # into the language is very rare.
# #
# Unfortunately it won't (yet) work with the '[' function, so we need to write # Unfortunately it won't (yet) work with the '[' function, so we need to write
# an intermediate function for this example # an intermediate function for this example
# Subsetting helper for use on the right-hand side of the native pipe: the
# piped-in value arrives as the first argument and is used as the subscript.
# Parameters:
#    x   subscript used to select elements
#    v   the object to subset
# Value:
#    the elements of v selected by x
extract <- function(x, v) v[x]
myFiles |> nchar() |> order() |> extract(myFiles) myFiles |> nchar() |> order() |> extract(myFiles)
# = 5 Intermediate Assignment ============================================= # = 5 Intermediate Assignment =============================================
# So what's the problem? As you can see, the piped code may be concise and # So what's the problem? As you can see, the piped code may be concise and
# expressive. But there is also a large amount of implicit assignment and # expressive. But there is also a large amount of implicit assignment and
# processing going on and that is usually a bad idea because it makes code hard # processing going on and that is usually a bad idea because it makes code hard
# to maintain. I am NOT a big fan of the nested syntax, but I don't think that # to maintain. I am NOT a big fan of the nested syntax, but I don't think that
# replacing it with the pipe makes things much better. My preferred idiom is # replacing it with the pipe makes things much better. My preferred idiom is
# to use intermediate assignments. Only then is it convenient to examine # to use intermediate assignments. Only then is it convenient to examine
# the code step by step and validate every single step. And that is the most # the code step by step and validate every single step. And that is the most
# important objective at all: no code is good if it does not compute # important objective at all: no code is good if it does not compute
# correctly. # correctly.
x <- nchar(myFiles) x <- nchar(myFiles)
x <- order(x) x <- order(x)
myFiles[x] myFiles[x]
# = 6 Postscript ========================================================== # = 6 Postscript ==========================================================
# I tried to write an example that strips all comments from a list of files, and # I tried to write an example that strips all comments from a list of files, and
# another example that finds all files that were not yet updated this year # another example that finds all files that were not yet updated this year
# (according to the "# Date:" in the header). Neither example can be well # (according to the "# Date:" in the header). Neither example can be well
# written without intermediate assignments, or at least sapply() functions # written without intermediate assignments, or at least sapply() functions
# that are not simpler at all than the intermediate assignment. # that are not simpler at all than the intermediate assignment.
# [END] # [END]

View File

@ -1,180 +1,180 @@
# tocID <- "RPR-RegEx.R" # tocID <- "RPR-RegEx.R"
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the RPR-RegEx unit # R code accompanying the RPR-RegEx unit
# #
# Version: 1.0 # Version: 1.0
# #
# Date: 2017-08 - 2020-09 # Date: 2017-08 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# V 0.1 Maintenance 2020 # V 0.1 Maintenance 2020
# V 0.1 First code # V 0.1 First code
# #
# TODO: # TODO:
# #
# #
# == HOW TO WORK WITH LEARNING UNIT FILES ====================================== # == HOW TO WORK WITH LEARNING UNIT FILES ======================================
# #
# DO NOT SIMPLY source() THESE FILES! # DO NOT SIMPLY source() THESE FILES!
# #
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> ---------------------------------------------------- #TOC> ----------------------------------------------------
#TOC> 1 A regex example 41 #TOC> 1 A regex example 41
#TOC> 2 Counting lines 108 #TOC> 2 Counting lines 108
#TOC> 2.1 Counting C-alpha atoms only 126 #TOC> 2.1 Counting C-alpha atoms only 126
#TOC> 3 Code Solutions 142 #TOC> 3 Code Solutions 142
#TOC> 3.1 Counting atoms 144 #TOC> 3.1 Counting atoms 144
#TOC> 3.2 Counting C-alpha records 160 #TOC> 3.2 Counting C-alpha records 160
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 A regex example ===================================================== # = 1 A regex example =====================================================
# The canonical FASTA version of yeast Mbp1 at Uniprot # The canonical FASTA version of yeast Mbp1 at Uniprot
s <- ">sp|P39678|MBP1_YEAST Transcription factor MBP1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) GN=MBP1 PE=1 SV=1 s <- ">sp|P39678|MBP1_YEAST Transcription factor MBP1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) GN=MBP1 PE=1 SV=1
MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK
ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA
SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR
KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ
QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS
PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY
FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS
IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP
SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT
ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP
VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK
IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR
QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK
IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA" IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"
nchar(s) nchar(s)
# Must be 969 # Must be 969
# Task: Fetch the Uniprot ID by retrieving the first string that appears between # Task: Fetch the Uniprot ID by retrieving the first string that appears between
# two vertical bars ("pipes") in the header record. # two vertical bars ("pipes") in the header record.
# #
# Develop the regular expression: # Develop the regular expression:
# Just five characters returned, so we know we are using # Just five characters returned, so we know we are using
patt <- "^>(.{5})" # the right functions patt <- "^>(.{5})" # the right functions
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2] regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
patt <- "^>(.*)|" # everything to the pipe character patt <- "^>(.*)|" # everything to the pipe character
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2] regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
# Ooops - "|" is a metacharacter - we must escape it # Ooops - "|" is a metacharacter - we must escape it
patt <- "^>(.*)\|" # using "\|" patt <- "^>(.*)\|" # using "\|"
# Ooops - that's not how we escape: must double the \ to send a literal # Ooops - that's not how we escape: must double the \ to send a literal
# "\" plus the character "|" to the regex engine. # "\" plus the character "|" to the regex engine.
patt <- "^>(.*)\\|" # using "\\|" patt <- "^>(.*)\\|" # using "\\|"
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2] regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
# Good. Now let's first match everything that is not a "|", then match a "|" # Good. Now let's first match everything that is not a "|", then match a "|"
patt <- "^>([^|]*)\\|" patt <- "^>([^|]*)\\|"
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2] regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
# the same thing again, but capture the second match. And insist that there # the same thing again, but capture the second match. And insist that there
# must be at least one character captured # must be at least one character captured
patt <- "^>[^|]*\\|([^|]+)\\|" patt <- "^>[^|]*\\|([^|]+)\\|"
# Analyze this pattern: # Analyze this pattern:
# ^ anchor the match at the beginning of the line # ^ anchor the match at the beginning of the line
# > ">" must be the first character # > ">" must be the first character
# [^|]* all-characters-except-a-vertical-bar, 0 or more times because # [^|]* all-characters-except-a-vertical-bar, 0 or more times because
# we don't know what other versions of the string "sp" # we don't know what other versions of the string "sp"
# might appear. Note that within the brackets "|" is NOT a # might appear. Note that within the brackets "|" is NOT a
# metacharacter. # metacharacter.
# \\| "|" character: outside of square brackets "|" is a metacharacter # \\| "|" character: outside of square brackets "|" is a metacharacter
# and means "OR"; we need to escape it to match a literal "|". # and means "OR"; we need to escape it to match a literal "|".
# ( open parenthesis: capture what comes next ... # ( open parenthesis: capture what comes next ...
# [^|]+ all-characters-except-a-vertical-bar, 1 or more times # [^|]+ all-characters-except-a-vertical-bar, 1 or more times
# ) close parenthesis: stop capturing here # ) close parenthesis: stop capturing here
# \\| second "|" character, escaped # \\| second "|" character, escaped
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2] regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
# = 2 Counting lines ====================================================== # = 2 Counting lines ======================================================
# Task: Write a function that returns the number of atoms in a PDB file. Call it # Task: Write a function that returns the number of atoms in a PDB file. Call it
# atomCount(). Sample data is here: # atomCount(). Sample data is here:
myPDB <- readLines("./data/0TST.pdb") myPDB <- readLines("./data/0TST.pdb")
# Specification: # Specification:
# Read a file from its path given as the only argument. # Read a file from its path given as the only argument.
# Return the number of lines in that file that begin with "ATOM " # Return the number of lines in that file that begin with "ATOM "
# or with "HETATM". # or with "HETATM".
# Try this. Write a function. Solution code is at the end of this file. # Try this. Write a function. Solution code is at the end of this file.
# Don't peek. # Don't peek.
atomCount("./data/0TST.pdb") # must return 6 atomCount("./data/0TST.pdb") # must return 6
# == 2.1 Counting C-alpha atoms only ======================================= # == 2.1 Counting C-alpha atoms only =======================================
# Task: write a function based on the previous one that matches only CA records, # Task: write a function based on the previous one that matches only CA records,
# i.e. it can be used to count the number of amino acids. Don't get # i.e. it can be used to count the number of amino acids. Don't get
# fooled by calcium atoms, or the string CA appearing elsewhere. # fooled by calcium atoms, or the string CA appearing elsewhere.
# cf. https://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM # cf. https://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM
# Specification: # Specification:
# Read a file from its path given as the only argument. # Read a file from its path given as the only argument.
# Return the number of lines in that file that have a C-alpha atom. # Return the number of lines in that file that have a C-alpha atom.
# Try this. Solution code is at the end of this file. Don't peek. # Try this. Solution code is at the end of this file. Don't peek.
CAcount("./data/0TST.pdb") # must return 1 CAcount("./data/0TST.pdb") # must return 1
# = 3 Code Solutions ====================================================== # = 3 Code Solutions ======================================================
# == 3.1 Counting atoms ==================================================== # == 3.1 Counting atoms ====================================================
atomCount <- function(IN) {
  # Count the atom records in a PDB formatted file.
  # Parameters:
  #    IN  chr  path of the file to read
  # Value:
  #    integer  number of lines that match "^ATOM " or "^HETATM"
  # Note: both alternatives of the regex are anchored with "^", so the
  #       keywords only match at the beginning of a line and never
  #       somewhere inside a remark or comment.
  isAtomRecord <- grepl("(^ATOM )|(^HETATM)", readLines(IN))
  sum(isAtomRecord)
}
# == 3.2 Counting C-alpha records ========================================== # == 3.2 Counting C-alpha records ==========================================
CAcount <- function(IN) {
  # Count the C-alpha atoms in a PDB formatted file.
  # Parameters:
  #    IN  chr  path of the file to read
  # Value:
  #    integer  number of ATOM records that have the atom name " CA " in
  #             columns 13 - 16.
  # Note: the pattern is column-exact. "ATOM" plus two blanks fills the
  #       record-name field (columns 1 - 6), ".{6}" skips columns 7 - 12
  #       (serial number and separator), and " CA " must then occupy
  #       columns 13 - 16. A name that starts in column 13, as a calcium
  #       "CA  " does, is therefore not counted. The repeat counts are
  #       written out ({2}, {6}) so that the column arithmetic does not
  #       depend on counting blanks by eye.
  x <- readLines(IN)
  patt <- "^ATOM {2}.{6} CA "
  sum(grepl(patt, x))
}
# [END] # [END]

File diff suppressed because it is too large Load Diff

View File

@ -1,135 +1,135 @@
# tocID <- "RPR-UniProt_GET.R" # tocID <- "RPR-UniProt_GET.R"
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Scripting_data_downloads unit. # R code accompanying the RPR-Scripting_data_downloads unit.
# #
# Version: 1.2 # Version: 1.2
# #
# Date: 2017-10 - 2020-09 # Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.2 2020 Maintenance. Made dbFetchUniProtSeq() vector-safe and # 1.2 2020 Maintenance. Made dbFetchUniProtSeq() vector-safe and
# added FASTA headers as attribute # added FASTA headers as attribute
# 1.1 Change from require() to requireNamespace(), # 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout # use <package>::<function>() idiom throughout
# 1.0 First ABC units version # 1.0 First ABC units version
# 0.1 First code copied from 2016 material. # 0.1 First code copied from 2016 material.
# #
# #
# TODO: # TODO:
# #
# #
# == DO NOT SIMPLY source() THIS FILE! ======================================= # == DO NOT SIMPLY source() THIS FILE! =======================================
# #
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> ---------------------------------------------------------- #TOC> ----------------------------------------------------------
#TOC> 1 UniProt files via GET 43 #TOC> 1 UniProt files via GET 43
#TOC> 1.1 Task - fetchUniProtSeq() function 105 #TOC> 1.1 Task - fetchUniProtSeq() function 105
#TOC> 2 Task solutions 118 #TOC> 2 Task solutions 118
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 UniProt files via GET =============================================== # = 1 UniProt files via GET ===============================================
# Perhaps the simplest example of scripted download is to retrieve a protein # Perhaps the simplest example of scripted download is to retrieve a protein
# FASTA sequence from UniProt. All we need is to construct an URL with the # FASTA sequence from UniProt. All we need is to construct an URL with the
# correct UniProt ID. # correct UniProt ID.
# An interface between R scripts and Web servers is provided by the httr:: # An interface between R scripts and Web servers is provided by the httr::
# package. This sends and receives information via the http protocol, just like # package. This sends and receives information via the http protocol, just like
# a Web browser. Since this is a short and simple request, the GET verb is the # a Web browser. Since this is a short and simple request, the GET verb is the
# right tool: # right tool:
if (! requireNamespace("httr", quietly = TRUE)) { if (! requireNamespace("httr", quietly = TRUE)) {
install.packages("httr") install.packages("httr")
} }
# Package information: # Package information:
# library(help = httr) # basic information # library(help = httr) # basic information
# browseVignettes("httr") # available vignettes # browseVignettes("httr") # available vignettes
# data(package = "httr") # available datasets # data(package = "httr") # available datasets
# The UniProt ID for Mbp1 is ... # The UniProt ID for Mbp1 is ...
UniProtID <- "P39678" UniProtID <- "P39678"
# and the base URL to retrieve data is ... # and the base URL to retrieve data is ...
# http://www.uniprot.org/uniprot/ . We can construct a simple URL to # http://www.uniprot.org/uniprot/ . We can construct a simple URL to
# retrieve a FASTA sequence: # retrieve a FASTA sequence:
(URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID)) (URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID))
# the GET() function from httr will get the data. # the GET() function from httr will get the data.
response <- httr::GET(URL) response <- httr::GET(URL)
str(response) # the response object is a bit complex ... str(response) # the response object is a bit complex ...
as.character(response) # ... but it is easy to pull out the data. as.character(response) # ... but it is easy to pull out the data.
# to process ... # to process ...
x <- as.character(response) x <- as.character(response)
x <- strsplit(x, "\n") x <- strsplit(x, "\n")
dbSanitizeSequence(x) dbSanitizeSequence(x)
# Simple. # Simple.
# But what happens if there is an error, e.g. the uniprot ID does not exist? # But what happens if there is an error, e.g. the uniprot ID does not exist?
response <- httr::GET("http://www.uniprot.org/uniprot/X000000.fasta") response <- httr::GET("http://www.uniprot.org/uniprot/X000000.fasta")
as.character(response) as.character(response)
# this is a large HTML page that tells us the URL was not found. So we need to # this is a large HTML page that tells us the URL was not found. So we need to
# check for errors. The Right Way to do this is to evaluate the status code that # check for errors. The Right Way to do this is to evaluate the status code that
# every Web server returns for every transaction. # every Web server returns for every transaction.
# #
httr::status_code(response) # 404 == Page Not Found httr::status_code(response) # 404 == Page Not Found
# There are many possible codes, but the only code we will be happy with # There are many possible codes, but the only code we will be happy with
# is 200 - OK. # is 200 - OK.
# (cf. https://en.wikipedia.org/wiki/List_of_HTTP_status_codes ) # (cf. https://en.wikipedia.org/wiki/List_of_HTTP_status_codes )
URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID) URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID)
response <- httr::GET(URL) response <- httr::GET(URL)
httr::status_code(response) httr::status_code(response)
# == 1.1 Task - fetchUniProtSeq() function ================================= # == 1.1 Task - fetchUniProtSeq() function =================================
# Task: write a function that # Task: write a function that
# - takes as input a vector of UniProt IDs, # - takes as input a vector of UniProt IDs,
# - fetches the FASTA sequence for each # - fetches the FASTA sequence for each
# - returns a vector of the same length as the input, where an element is: # - returns a vector of the same length as the input, where an element is:
# - ... the sequence, if the query was successful # - ... the sequence, if the query was successful
# - ... NA if there was an error # - ... NA if there was an error
# - each element has the UniProt ID as the name() # - each element has the UniProt ID as the name()
# - bonus: the output has an attribute "headers" that is a vector of the # - bonus: the output has an attribute "headers" that is a vector of the
# FASTA headers ( cf. ?attr ) # FASTA headers ( cf. ?attr )
# = 2 Task solutions ====================================================== # = 2 Task solutions ======================================================
# I have placed such a function - dbFetchUniProtSeq() - into # I have placed such a function - dbFetchUniProtSeq() - into
# "./scripts/ABC-dbUtilities.R": look it up by clicking on dbFetchUniProtSeq() # "./scripts/ABC-dbUtilities.R": look it up by clicking on dbFetchUniProtSeq()
# in the Environment pane. # in the Environment pane.
# Test this: # Test this:
( x <- dbFetchUniProtSeq("P39678") ) ( x <- dbFetchUniProtSeq("P39678") )
names(x)[1] names(x)[1]
attr(x, "headers")[1] attr(x, "headers")[1]
x[1] x[1]
cat(writeFASTA(data.frame(head = attr(x, "headers")[1], seq =x[1]), cat(writeFASTA(data.frame(head = attr(x, "headers")[1], seq =x[1]),
width = 40), sep = "\n") width = 40), sep = "\n")
# [END] # [END]

View File

@ -1,234 +1,234 @@
# tocID <- "RPR-Unit_testing.R" # tocID <- "RPR-Unit_testing.R"
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Unit_testing unit. # R code accompanying the RPR-Unit_testing unit.
# #
# Version: 1.2 # Version: 1.2
# #
# Date: 2017 10 - 2019 01 # Date: 2017 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.2 2020 Updates. Discuss local tests. # 1.2 2020 Updates. Discuss local tests.
# 1.1 Change from require() to requireNamespace() # 1.1 Change from require() to requireNamespace()
# 1.0 New code # 1.0 New code
# #
# #
# TODO: # TODO:
# #
# #
# == DO NOT SIMPLY source() THIS FILE! ======================================= # == DO NOT SIMPLY source() THIS FILE! =======================================
# #
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> ------------------------------------------------- #TOC> -------------------------------------------------
#TOC> 1 Unit Tests with testthat 42 #TOC> 1 Unit Tests with testthat 42
#TOC> 2 Organizing your tests 165 #TOC> 2 Organizing your tests 165
#TOC> 2.1 Testing scripts 189 #TOC> 2.1 Testing scripts 189
#TOC> 2.2 Rethinking testing 202 #TOC> 2.2 Rethinking testing 202
#TOC> 3 Task solutions 220 #TOC> 3 Task solutions 220
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 Unit Tests with testthat ============================================ # = 1 Unit Tests with testthat ============================================
# The testthat package supports writing and executing unit tests in many ways. # The testthat package supports writing and executing unit tests in many ways.
if (! requireNamespace("testthat", quietly = TRUE)) { if (! requireNamespace("testthat", quietly = TRUE)) {
install.packages("testthat") install.packages("testthat")
} }
# Package information: # Package information:
# library(help = testthat) # basic information # library(help = testthat) # basic information
# browseVignettes("testthat") # available vignettes # browseVignettes("testthat") # available vignettes
# data(package = "testthat") # available datasets # data(package = "testthat") # available datasets
# testthat is one of those packages that we either use A LOT in a script, # testthat is one of those packages that we either use A LOT in a script,
# or not at all. Therefore it's more reasonable to depart from our usual # or not at all. Therefore it's more reasonable to depart from our usual
# <package>::<function>() idiom, and load the entire library. In fact, if # <package>::<function>() idiom, and load the entire library. In fact, if
# we author packages, it is common practice to load testthat in the part # we author packages, it is common practice to load testthat in the part
# of the package that automates testing. # of the package that automates testing.
library(testthat) library(testthat)
# An atomic test consists of an expectation about the behaviour of a function or # An atomic test consists of an expectation about the behaviour of a function or
# the existence of an object. testthat provides a number of useful expectations: # the existence of an object. testthat provides a number of useful expectations:
# At the most basic level, you can use expect_true() and expect_false(): # At the most basic level, you can use expect_true() and expect_false():
expect_true(file.exists("./data/S288C_YDL056W_MBP1_coding.fsa")) expect_true(file.exists("./data/S288C_YDL056W_MBP1_coding.fsa"))
expect_true(file.exists("NO-SUCH-FILE.txt")) expect_true(file.exists("NO-SUCH-FILE.txt"))
expect_false(is.integer(NA)) expect_false(is.integer(NA))
# More commonly, you will test for equality of an output with a given result. # More commonly, you will test for equality of an output with a given result.
# But you need to consider what it means for two numbers to be "equal" on a # But you need to consider what it means for two numbers to be "equal" on a
# digital computer. Consider: # digital computer. Consider:
49*(1/49) == 1 # Surprised? Read FAQ 7.31 49*(1/49) == 1 # Surprised? Read FAQ 7.31
# https://cran.r-project.org/doc/FAQ/R-FAQ.html # https://cran.r-project.org/doc/FAQ/R-FAQ.html
49*(1/49) - 1 # NOT zero (but almost) 49*(1/49) - 1 # NOT zero (but almost)
# This is really unpredictable ... # This is really unpredictable ...
0.1 + 0.05 == 0.15 0.1 + 0.05 == 0.15
0.2 + 0.07 == 0.27 0.2 + 0.07 == 0.27
# It's easy to be caught on the wrong foot with numeric comparisons, therefore # It's easy to be caught on the wrong foot with numeric comparisons, therefore
# R uses the function all.equal() to test whether two numbers are equal for # R uses the function all.equal() to test whether two numbers are equal for
# practical purposes up to machine precision. # practical purposes up to machine precision.
49*(1/49) == 1 49*(1/49) == 1
all.equal(49*(1/49), 1) all.equal(49*(1/49), 1)
# The testthat function expect_equal() uses all.equal internally: # The testthat function expect_equal() uses all.equal internally:
expect_equal(49*(1/49), 1) expect_equal(49*(1/49), 1)
# ... which is reasonable, or, if things MUST be exactly the same ... # ... which is reasonable, or, if things MUST be exactly the same ...
expect_identical(49*(1/49), 1) expect_identical(49*(1/49), 1)
# ... but consider: # ... but consider:
expect_identical(2, 2L) # one is typeof() "double", the other is "integer" expect_identical(2, 2L) # one is typeof() "double", the other is "integer"
# Some very useful expectations are expect_warning(), and expect_error(), for # Some very useful expectations are expect_warning(), and expect_error(), for
# constructing tests that check for erroneous output: # constructing tests that check for erroneous output:
as.integer(c("1", "2", "three")) as.integer(c("1", "2", "three"))
expect_warning(as.integer(c("1", "2", "three"))) # Note that the warning is NOT expect_warning(as.integer(c("1", "2", "three"))) # Note that the warning is NOT
# printed. # printed.
1/"x" 1/"x"
expect_warning(1/"x") expect_warning(1/"x")
expect_error(1/"x") # Again: note that the error is NOT printed, as well expect_error(1/"x") # Again: note that the error is NOT printed, as well
# code execution will continue. # code execution will continue.
# Even better, you can check if the warning or error is what you expect it # Even better, you can check if the warning or error is what you expect it
# to be - because it could actually have occurred somewhere else in your code. # to be - because it could actually have occurred somewhere else in your code.
v <- c("1", "x") v <- c("1", "x")
log(v[1:2]) log(v[1:2])
expect_error(log(v[1:2]), "non-numeric argument to mathematical function") expect_error(log(v[1:2]), "non-numeric argument to mathematical function")
expect_error(log(v[1:2]), "non-numeric") # We can abbreviate the error message. expect_error(log(v[1:2]), "non-numeric") # We can abbreviate the error message.
expect_error(log(v[1,2])) # This appears OK, but ... expect_error(log(v[1,2])) # This appears OK, but ...
expect_error(log(v[1,2]), "non-numeric") # ... it's actually a different error! expect_error(log(v[1,2]), "non-numeric") # ... it's actually a different error!
# Producing unit tests simply means: we define a function, and then we check # Producing unit tests simply means: we define a function, and then we check
# whether all tests pass. Consider a function that is loaded on startup from # whether all tests pass. Consider a function that is loaded on startup from
# the .utilities.R script: # the .utilities.R script:
biCode biCode
# We could test it like so: # We could test it like so:
expect_equal(biCode(""), ".....") expect_equal(biCode(""), ".....")
expect_equal(biCode(" "), ".....") expect_equal(biCode(" "), ".....")
expect_equal(biCode("123 12"), ".....") expect_equal(biCode("123 12"), ".....")
expect_equal(biCode("h sapiens"), "H..SA") expect_equal(biCode("h sapiens"), "H..SA")
expect_equal(biCode("homo sapiens"), "HOMSA") expect_equal(biCode("homo sapiens"), "HOMSA")
expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA") expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA")
expect_equal(biCode(c("Phascolarctos cinereus", "Macropus rufus")), expect_equal(biCode(c("Phascolarctos cinereus", "Macropus rufus")),
c("PHACI", "MACRU")) c("PHACI", "MACRU"))
expect_error(biCode(), "argument \"s\" is missing, with no default") expect_error(biCode(), "argument \"s\" is missing, with no default")
# The test_that() function allows to group related tests, include an informative # The test_that() function allows to group related tests, include an informative
# message which test is being executed, and run a number of tests that are # message which test is being executed, and run a number of tests that are
# passed to the function inside a code block - i.e. {...} # passed to the function inside a code block - i.e. {...}
# test_that("<descriptive string>", {<code block>}) # test_that("<descriptive string>", {<code block>})
test_that("NA values are preserved", { test_that("NA values are preserved", {
# biCode() respects vector length: input and output must have the same length. # biCode() respects vector length: input and output must have the same length.
# Therefore NA's can't be simply skipped, but must be properly passed # Therefore NA's can't be simply skipped, but must be properly passed
# into output: # into output:
expect_true(is.na((biCode(NA)))) expect_true(is.na((biCode(NA))))
expect_equal(biCode(c("first", NA, "last")), expect_equal(biCode(c("first", NA, "last")),
c("FIRST", NA, "LAST.")) c("FIRST", NA, "LAST."))
}) })
# Task: Write a function calcGC() that calculates GC content in a sequence. # Task: Write a function calcGC() that calculates GC content in a sequence.
# Hint: you could strsplit() the sequence into a vector, and count # Hint: you could strsplit() the sequence into a vector, and count
# G's and C's; or you could use gsub("[AT]", "", <sequence>) to remove # G's and C's; or you could use gsub("[AT]", "", <sequence>) to remove
# A's and T's, and use nchar() before and after to calculate the content # A's and T's, and use nchar() before and after to calculate the content
# from the length difference. # from the length difference.
# Then write tests that: # Then write tests that:
# confirm that calcGC("AATT") is 0; # confirm that calcGC("AATT") is 0;
# confirm that calcGC("ATGC") is 0.5; # confirm that calcGC("ATGC") is 0.5;
# confirm that calcGC("AC") is 0.5; # confirm that calcGC("AC") is 0.5;
# confirm that calcGC("CGCG") is 1; # confirm that calcGC("CGCG") is 1;
# = 2 Organizing your tests =============================================== # = 2 Organizing your tests ===============================================
# Tests are only useful if they are actually executed and we need to make sure # Tests are only useful if they are actually executed and we need to make sure
# there are no barriers to do that. The testthat package supports automatic # there are no barriers to do that. The testthat package supports automatic
# execution of tests: # execution of tests:
# - put your tests into an R-script, # - put your tests into an R-script,
# - save your tests in a file called "test_<my-function-name>.R" # - save your tests in a file called "test_<my-function-name>.R"
# - execute the test with test_file("test_<my-function-name>.R") ... # - execute the test with test_file("test_<my-function-name>.R") ...
# ... or, if you are working on a project ... # ... or, if you are working on a project ...
# - place the file in a test-directory (e.g. the directory "tests" in this # - place the file in a test-directory (e.g. the directory "tests" in this
# project), # project),
# - execute all your tests with test_dir("<my-test-directory>") # - execute all your tests with test_dir("<my-test-directory>")
# For example I have provided a "tests" directory with this project, and # For example I have provided a "tests" directory with this project, and
# placed the file "test_biCode.R" inside. # placed the file "test_biCode.R" inside.
file.show("./tests/test_biCode.R") file.show("./tests/test_biCode.R")
# Execute the file ... # Execute the file ...
test_file("./tests/test_biCode.R") test_file("./tests/test_biCode.R")
# .. or execute all the test files in the directory: # .. or execute all the test files in the directory:
test_dir("./tests") test_dir("./tests")
# == 2.1 Testing scripts =================================================== # == 2.1 Testing scripts ===================================================
# Scripts need special consideration since we do not necessarily source() them # Scripts need special consideration since we do not necessarily source() them
# entirely. Therefore automated testing is not reasonable. What you can do # entirely. Therefore automated testing is not reasonable. What you can do
# instead is to place a conditional block at the end of your script, that # instead is to place a conditional block at the end of your script, that
# never gets executed - then you can manually execute the code in the block # never gets executed - then you can manually execute the code in the block
# whenever you wish to test your functions. For example: # whenever you wish to test your functions. For example:
if (FALSE) { if (FALSE) {
# ... your tests go here # ... your tests go here
} }
# == 2.2 Rethinking testing ================================================ # == 2.2 Rethinking testing ================================================
# However, it is important to keep in mind that different objectives lead to # However, it is important to keep in mind that different objectives lead to
# different ideas of what works best. There is never a "best" in and of itself, # different ideas of what works best. There is never a "best" in and of itself,
# the question is always: "Best for what?" While automated unit testing is a # the question is always: "Best for what?" While automated unit testing is a
# great way to assure the integrity of packages and larger software artefacts as # great way to assure the integrity of packages and larger software artefacts as
# they are being developed, more loosely conceived aggregates of code - like the # they are being developed, more loosely conceived aggregates of code - like the
# scripts for this course for example - have different objectives and in this # scripts for this course for example - have different objectives and in this
# case I find the testthat approach to actually be inferior. The reason is its # case I find the testthat approach to actually be inferior. The reason is its
# tendency to physically separate code and tests. Keeping assets, and functions # tendency to physically separate code and tests. Keeping assets, and functions
# that operate on those assets separated is always poor design. I have found # that operate on those assets separated is always poor design. I have found
# over time that a more stable approach is to move individual functions into # over time that a more stable approach is to move individual functions into
# their individual scripts, all in one folder, one function (and its helpers) # their individual scripts, all in one folder, one function (and its helpers)
# per file, and examples, demos and tests in an if (FALSE) { ... } block, as # per file, and examples, demos and tests in an if (FALSE) { ... } block, as
# explained above. # explained above.
# = 3 Task solutions ====================================================== # = 3 Task solutions ======================================================
calcGC <- function(s) {
  # Purpose:
  #     Calculate the GC content of nucleotide sequence(s).
  # Parameters:
  #     s  chr  one or more sequence strings. Upper- and lowercase
  #             A, C, G, T are counted; every other character is
  #             discarded before counting.
  # Value:
  #     num  fraction of G and C among the retained nucleotides, one value
  #          per element of s. If an element contains no A/C/G/T characters
  #          at all, the result for that element is NaN (0 / 0).

  # Keep only nucleotide characters, so that gaps, whitespace, ambiguity
  # codes etc. do not enter the denominator.
  s <- gsub("[^agctAGCT]", "", s)

  # Removing A and T leaves only G and C; the ratio of the two lengths is
  # the GC content.
  return(nchar(gsub("[atAT]", "", s)) / nchar(s))
}
expect_equal(calcGC("AATT"), 0) expect_equal(calcGC("AATT"), 0)
expect_equal(calcGC("ATGC"), 0.5) expect_equal(calcGC("ATGC"), 0.5)
expect_equal(calcGC("AC"), 0.5) expect_equal(calcGC("AC"), 0.5)
expect_equal(calcGC("CGCG"), 1) expect_equal(calcGC("CGCG"), 1)
# [END] # [END]

View File

@ -1,166 +1,166 @@
# tocID <- "RPR-eUtils_XML.R" # tocID <- "RPR-eUtils_XML.R"
# #
# Purpose: A Bioinformatics Course: # Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Scripting_data_downloads unit. # R code accompanying the RPR-Scripting_data_downloads unit.
# #
# Version: 1.2.1 # Version: 1.2.1
# #
# Date: 2017-10 - 2021-09 # Date: 2017-10 - 2021-09
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.2.1 2021 Maintenance # 1.2.1 2021 Maintenance
# 1.2 2020 Updates # 1.2 2020 Updates
# 1.1 Change from require() to requireNamespace(), # 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout # use <package>::<function>() idiom throughout
# 1.0 First ABC units version # 1.0 First ABC units version
# 0.1 First code copied from 2016 material. # 0.1 First code copied from 2016 material.
# #
# #
# TODO: # TODO:
# #
# #
# == DO NOT SIMPLY source() THIS FILE! ======================================= # == DO NOT SIMPLY source() THIS FILE! =======================================
# #
# If there are portions you don't understand, use R's help system, Google for an # If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's # answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ... # going on. That's not how it works ...
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> ----------------------------------------------------------- #TOC> -----------------------------------------------------------
#TOC> 1 Working with NCBI eUtils 43 #TOC> 1 Working with NCBI eUtils 43
#TOC> 1.1 Task - fetchNCBItaxData() function 145 #TOC> 1.1 Task - fetchNCBItaxData() function 145
#TOC> 2 Task solutions 152 #TOC> 2 Task solutions 152
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 Working with NCBI eUtils ============================================ # = 1 Working with NCBI eUtils ============================================
# To begin, we load the xml2 package that contains functions # To begin, we load the xml2 package that contains functions
# we need to receive and parse html data. NCBI's eUtils send information in # we need to receive and parse html data. NCBI's eUtils send information in
# XML format so we need to be able to parse XML. # XML format so we need to be able to parse XML.
if (! requireNamespace("xml2", quietly=TRUE)) { if (! requireNamespace("xml2", quietly=TRUE)) {
install.packages("xml2") install.packages("xml2")
} }
# Package information: # Package information:
# library(help = xml2) # basic information # library(help = xml2) # basic information
# browseVignettes("xml2") # available vignettes # browseVignettes("xml2") # available vignettes
# data(package = "xml2") # available datasets # data(package = "xml2") # available datasets
# We will walk through the process with the refSeqID # We will walk through the process with the refSeqID
# of yeast Mbp1 # of yeast Mbp1
refSeqID <- "NP_010227" refSeqID <- "NP_010227"
# First we build a query URL... # First we build a query URL...
eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
# Then we assemble an URL that will search for and get the # Then we assemble an URL that will search for and get the
# unique, NCBI internal identifier, # unique, NCBI internal identifier,
# for our refSeqID... # for our refSeqID...
URL <- paste(eUtilsBase, URL <- paste(eUtilsBase,
"esearch.fcgi?", # ...using the esearch program "esearch.fcgi?", # ...using the esearch program
# that finds an entry in an # that finds an entry in an
# NCBI database # NCBI database
"db=protein", "db=protein",
"&term=", refSeqID, "&term=", refSeqID,
sep="") sep="")
# Copy the URL and paste it into your browser to see # Copy the URL and paste it into your browser to see
# what the response should look like. # what the response should look like.
URL URL
# To fetch a response in R, we use the function read_xml() # To fetch a response in R, we use the function read_xml()
# with our URL as its argument. # with our URL as its argument.
( myXML <- xml2::read_xml(URL) ) ( myXML <- xml2::read_xml(URL) )
# This is XML. We can take the response apart into # This is XML. We can take the response apart into
# its individual components with the as_list() function. # its individual components with the as_list() function.
xml2::as_list(myXML) xml2::as_list(myXML)
# Note how the XML "tree" is represented as a list of # Note how the XML "tree" is represented as a list of
# lists of lists ... # lists of lists ...
# If we know exactly what element we are looking for, # If we know exactly what element we are looking for,
# we can extract it from this structure: # we can extract it from this structure:
xml2::as_list(myXML)[["eSearchResult"]][["IdList"]][["Id"]][[1]] xml2::as_list(myXML)[["eSearchResult"]][["IdList"]][["Id"]][[1]]
# But this is not very robust, it would break with the # But this is not very robust, it would break with the
# slightest change that the NCBI makes to their data format - # slightest change that the NCBI makes to their data format -
# and the NCBI changes things A LOT! # and the NCBI changes things A LOT!
# Somewhat more robust is to specify the type of element # Somewhat more robust is to specify the type of element
# we want - it's the text contained in an <Id>...</Id> # we want - it's the text contained in an <Id>...</Id>
# element, and use the XPath XML parsing language to # element, and use the XPath XML parsing language to
# retrieve it. # retrieve it.
xml2::xml_find_all(myXML, "//Id") # returns a "node set" xml2::xml_find_all(myXML, "//Id") # returns a "node set"
xml2::xml_text(xml2::xml_find_all(myXML, "//Id")) # returns the contents xml2::xml_text(xml2::xml_find_all(myXML, "//Id")) # returns the contents
# of the node set # of the node set
# We will need to do this more than once, so we write a function # We will need to do this more than once, so we write a function
# for it... # for it...
node2text <- function(doc, tag) {
  # An extractor function for the contents of elements
  # between given tags in an XML response.
  # Parameters:
  #     doc  the XML document to search; it is passed unchanged to
  #          xml2::xml_find_all()
  #     tag  chr  the element name to look for, without angle brackets
  #               (e.g. "Id")
  # Value:
  #     Contents of all matching elements are returned in
  #     a vector of strings.

  # The XPath expression "//<tag>" selects matching elements anywhere in
  # the document, regardless of their depth in the tree.
  path <- paste0("//", tag)
  nodes <- xml2::xml_find_all(doc, path)
  return(xml2::xml_text(nodes))
}
# using node2text() ... # using node2text() ...
(GID <- node2text(myXML, "Id")) (GID <- node2text(myXML, "Id"))
# The GI is the pivot for data requests at the # The GI is the pivot for data requests at the
# NCBI. # NCBI.
# Let's first get the associated data for this GI # Let's first get the associated data for this GI
URL <- paste0(eUtilsBase, URL <- paste0(eUtilsBase,
"esummary.fcgi?", "esummary.fcgi?",
"db=protein", "db=protein",
"&id=", "&id=",
GID, GID,
"&version=2.0") "&version=2.0")
(myXML <- xml2::read_xml(URL)) (myXML <- xml2::read_xml(URL))
(taxID <- node2text(myXML, "TaxId")) (taxID <- node2text(myXML, "TaxId"))
(organism <- node2text(myXML, "Organism")) (organism <- node2text(myXML, "Organism"))
# This forms the base of a function that gets taxonomy data # This forms the base of a function that gets taxonomy data
# from an Entrez result. You can write this! # from an Entrez result. You can write this!
# == 1.1 Task - fetchNCBItaxData() function ================================ # == 1.1 Task - fetchNCBItaxData() function ================================
# Task: write a function that takes as input a RefSeq ID, fetches the taxonomy # Task: write a function that takes as input a RefSeq ID, fetches the taxonomy
# information, returns a list with taxID and organism, if the operation is # information, returns a list with taxID and organism, if the operation is
# successful, or a list of length 0 if there is an error. # successful, or a list of length 0 if there is an error.
# = 2 Task solutions ====================================================== # = 2 Task solutions ======================================================
# I have placed such a function into the dbUtilities script: look it up by # I have placed such a function into the dbUtilities script: look it up by
# clicking on dbFetchNCBItaxData() in the Environment pane. # clicking on dbFetchNCBItaxData() in the Environment pane.
# Test: # Test:
dbFetchNCBItaxData("XP_001837394") dbFetchNCBItaxData("XP_001837394")
# Expected output: # Expected output:
# ---------------- # ----------------
# taxID organism # taxID organism
# 1 240176 Coprinopsis cinerea okayama7#130 # 1 240176 Coprinopsis cinerea okayama7#130
# [END] # [END]

View File

@ -1,10 +1,10 @@
HEADER TEST 0TST 0TST 1 HEADER TEST 0TST 0TST 1
REMARK A CATALOGUE OF ATOM AND HETATM RECORDS 0TST 2 REMARK A CATALOGUE OF ATOM AND HETATM RECORDS 0TST 2
ATOM 1 N GLY 1 -6.253 75.745 53.559 1.00 36.34 0TST 3 ATOM 1 N GLY 1 -6.253 75.745 53.559 1.00 36.34 0TST 3
ATOM 2 CA GLY 1 -5.789 75.223 52.264 1.00 44.94 0TST 4 ATOM 2 CA GLY 1 -5.789 75.223 52.264 1.00 44.94 0TST 4
ATOM 3 C GLY 1 -5.592 73.702 52.294 1.00 32.28 0TST 5 ATOM 3 C GLY 1 -5.592 73.702 52.294 1.00 32.28 0TST 5
ATOM 4 O GLY 1 -5.140 73.148 53.304 1.00 19.32 0TST 6 ATOM 4 O GLY 1 -5.140 73.148 53.304 1.00 19.32 0TST 6
TER 5 GLY 1 0TST 7 TER 5 GLY 1 0TST 7
HETATM 6 O HOH 1 -4.169 60.050 40.145 1.00 3.00 0TST 8 HETATM 6 O HOH 1 -4.169 60.050 40.145 1.00 3.00 0TST 8
HETATM 7 CA CA 1 -1.258 -71.579 50.253 1.00 3.00 0TST 9 HETATM 7 CA CA 1 -1.258 -71.579 50.253 1.00 3.00 0TST 9
END 0TST 10 END 0TST 10

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
>2F1C:X|PDBID|CHAIN|SEQUENCE >2F1C:X|PDBID|CHAIN|SEQUENCE
EERNDWHFNIGAMYEIENVEGYGEDMDGLAEPSVYFNAANGPWRIALAYYQEGPVDYSAGKRGTWFDRPELEVHYQFLEN EERNDWHFNIGAMYEIENVEGYGEDMDGLAEPSVYFNAANGPWRIALAYYQEGPVDYSAGKRGTWFDRPELEVHYQFLEN
DDFSFGLTGGFRNYGYHYVDEPGKDTANMQRWKIAPDWDVKLTDDLRFNGWLSMYKFANDLNTTGYADTRVETETGLQYT DDFSFGLTGGFRNYGYHYVDEPGKDTANMQRWKIAPDWDVKLTDDLRFNGWLSMYKFANDLNTTGYADTRVETETGLQYT
FNETVALRVNYYLERGFNMDDSRNNGEFSTQEIRAYLPLTLGNHSVTPYTRIGLDRWSNWDWQDDIEREGHDFNRVGLFY FNETVALRVNYYLERGFNMDDSRNNGEFSTQEIRAYLPLTLGNHSVTPYTRIGLDRWSNWDWQDDIEREGHDFNRVGLFY
GYDFQNGLSVSLEYAFEWQDHDEGDSDKFHYAGVGVNYSFHHHHHH GYDFQNGLSVSLEYAFEWQDHDEGDSDKFHYAGVGVNYSFHHHHHH

View File

@ -1,6 +1,6 @@
>3FG7:A|PDBID|CHAIN|SEQUENCE >3FG7:A|PDBID|CHAIN|SEQUENCE
MAEEHHHHHHHHLEVLFQGPGRPKTHTVGSVAKVEQVKFDATSMHVKPQVAAQQKMVDDGSGEVQVWRIENLELVPVDSK MAEEHHHHHHHHLEVLFQGPGRPKTHTVGSVAKVEQVKFDATSMHVKPQVAAQQKMVDDGSGEVQVWRIENLELVPVDSK
WLGHFYGGDCYLLLYTYLIGEKQHYLLYVWQGSQASQDEITASAYQAVILDQKYNGEPVQIRVPMGKEPPHLMSIFKGRM WLGHFYGGDCYLLLYTYLIGEKQHYLLYVWQGSQASQDEITASAYQAVILDQKYNGEPVQIRVPMGKEPPHLMSIFKGRM
VVYQGGTSRTNNLETGPSTRLFQVQGTGANNTKAFEVPARANFLNSNDVFVLKTQSCCYLWCGKGCSGDEREMAKMVADT VVYQGGTSRTNNLETGPSTRLFQVQGTGANNTKAFEVPARANFLNSNDVFVLKTQSCCYLWCGKGCSGDEREMAKMVADT
ISRTEKQVVVEGQEPANFWMALGGKAPYANTKRLQEENLVITPRLFECSNKTGRFLATEIPDFNQDDLEEDDVFLLDVWD ISRTEKQVVVEGQEPANFWMALGGKAPYANTKRLQEENLVITPRLFECSNKTGRFLATEIPDFNQDDLEEDDVFLLDVWD
QVFFWIGKHANEEEKKAAATTAQEYLKTHPSGRDPETPIIVVKQGHEPPTFTGWFLAWDPFKWSGIHVVPNLSPLSNN QVFFWIGKHANEEEKKAAATTAQEYLKTHPSGRDPETPIIVVKQGHEPPTFTGWFLAWDPFKWSGIHVVPNLSPLSNN

View File

@ -1,20 +1,20 @@
[ [
{ "name" : "MBP1_SACCE", { "name" : "MBP1_SACCE",
"RefSeqID" : "NP_010227", "RefSeqID" : "NP_010227",
"UniProtID" : "P39678", "UniProtID" : "P39678",
"taxonomyID" : 559292, "taxonomyID" : 559292,
"sequence" : [ "sequence" : [
"MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLKETHEKVQGGF", "MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLKETHEKVQGGF",
"GKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHASKVDRKKAIRSASTSAIMET", "GKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHASKVDRKKAIRSASTSAIMET",
"KRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRRKLGVNLQRSQSDMGFPRPAIPNSSISTTQL", "KRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRRKLGVNLQRSQSDMGFPRPAIPNSSISTTQL",
"PSIRSTMGPQSPTLGILEEERHDSRQQQPQQNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQ", "PSIRSTMGPQSPTLGILEEERHDSRQQQPQQNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQ",
"QSSLIQTQQTESMATSVSSSPSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKV", "QSSLIQTQQTESMATSVSSSPSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKV",
"NKYLSKLVDYFISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS", "NKYLSKLVDYFISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS",
"IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTPSAVYYLDVVL", "IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTPSAVYYLDVVL",
"SKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTTISNKEGLTANEIMNQQYEQM", "SKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTTISNKEGLTANEIMNQQYEQM",
"MIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSPVSPSDYITYPSQIATNISRNIPNVVNSMKQ", "MIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSPVSPSDYITYPSQIATNISRNIPNVVNSMKQ",
"MASIYNDLHEQHDNEIKSLQKTLKSISKTKIQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTK", "MASIYNDLHEQHDNEIKSLQKTLKSISKTKIQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTK",
"KLRKRLIRYKRLIKQKLEYRQTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSS", "KLRKRLIRYKRLIKQKLEYRQTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSS",
"LVKKFEDNAKIHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"] "LVKKFEDNAKIHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"]
} }
] ]

View File

@ -1,30 +1,30 @@
>PTPN5-201 cds:protein_coding (ENST00000358540.7) >PTPN5-201 cds:protein_coding (ENST00000358540.7)
ATGAATTATGAGGGAGCCAGGAGTGAGAGAGAGAACCACGCTGCTGATGACTCCGAGGGA ATGAATTATGAGGGAGCCAGGAGTGAGAGAGAGAACCACGCTGCTGATGACTCCGAGGGA
GGGGCCCTGGACATGTGCTGCAGTGAGAGGCTACCGGGTCTCCCCCAGCCGATAGTGATG GGGGCCCTGGACATGTGCTGCAGTGAGAGGCTACCGGGTCTCCCCCAGCCGATAGTGATG
GAGGCACTGGACGAGGCTGAAGGGCTCCAGGACTCACAGAGAGAGATGCCGCCACCCCCT GAGGCACTGGACGAGGCTGAAGGGCTCCAGGACTCACAGAGAGAGATGCCGCCACCCCCT
CCTCCCTCGCCGCCCTCAGATCCAGCTCAGAAGCCACCACCTCGAGGCGCTGGGAGCCAC CCTCCCTCGCCGCCCTCAGATCCAGCTCAGAAGCCACCACCTCGAGGCGCTGGGAGCCAC
TCCCTCACTGTCAGGAGCAGCCTGTGCCTGTTCGCTGCCTCACAGTTCCTGCTTGCCTGT TCCCTCACTGTCAGGAGCAGCCTGTGCCTGTTCGCTGCCTCACAGTTCCTGCTTGCCTGT
GGGGTGCTCTGGTTCAGCGGTTATGGCCACATCTGGTCACAGAACGCCACAAACCTCGTC GGGGTGCTCTGGTTCAGCGGTTATGGCCACATCTGGTCACAGAACGCCACAAACCTCGTC
TCCTCTTTGCTGACGCTCCTGAAACAGCTGGAACCCACGGCCTGGCTTGACTCTGGGACG TCCTCTTTGCTGACGCTCCTGAAACAGCTGGAACCCACGGCCTGGCTTGACTCTGGGACG
TGGGGAGTCCCCAGTCTGCTGCTGGTCTTTCTGTCCGTGGGCCTGGTCCTCGTTACCACC TGGGGAGTCCCCAGTCTGCTGCTGGTCTTTCTGTCCGTGGGCCTGGTCCTCGTTACCACC
CTGGTGTGGCACCTCCTGAGGACACCCCCAGAGCCACCCACCCCACTGCCCCCTGAGGAC CTGGTGTGGCACCTCCTGAGGACACCCCCAGAGCCACCCACCCCACTGCCCCCTGAGGAC
AGGCGCCAGTCAGTGAGCCGCCAGCCCTCCTTCACCTACTCAGAGTGGATGGAGGAGAAG AGGCGCCAGTCAGTGAGCCGCCAGCCCTCCTTCACCTACTCAGAGTGGATGGAGGAGAAG
ATCGAGGATGACTTCCTGGACCTCGACCCGGTGCCCGAGACTCCTGTGTTTGATTGTGTG ATCGAGGATGACTTCCTGGACCTCGACCCGGTGCCCGAGACTCCTGTGTTTGATTGTGTG
ATGGACATCAAGCCTGAGGCTGACCCCACCTCACTCACCGTCAAGTCCATGGGTCTGCAG ATGGACATCAAGCCTGAGGCTGACCCCACCTCACTCACCGTCAAGTCCATGGGTCTGCAG
GAGAGGAGGGGTTCCAATGTCTCCCTGACCCTGGACATGTGCACTCCGGGCTGCAACGAG GAGAGGAGGGGTTCCAATGTCTCCCTGACCCTGGACATGTGCACTCCGGGCTGCAACGAG
GAGGGCTTTGGCTATCTCATGTCCCCACGTGAGGAGTCCGCCCGCGAGTACCTGCTCAGC GAGGGCTTTGGCTATCTCATGTCCCCACGTGAGGAGTCCGCCCGCGAGTACCTGCTCAGC
GCCTCCCGTGTCCTCCAAGCAGAAGAGCTTCATGAAAAGGCCCTGGACCCTTTCCTGCTG GCCTCCCGTGTCCTCCAAGCAGAAGAGCTTCATGAAAAGGCCCTGGACCCTTTCCTGCTG
CAGGCGGAATTCTTTGAAATCCCCATGAACTTTGTGGATCCGAAAGAGTACGACATCCCT CAGGCGGAATTCTTTGAAATCCCCATGAACTTTGTGGATCCGAAAGAGTACGACATCCCT
GGGCTGGTGCGGAAGAACCGGTACAAAACCATACTTCCCAACCCTCACAGCAGAGTGTGT GGGCTGGTGCGGAAGAACCGGTACAAAACCATACTTCCCAACCCTCACAGCAGAGTGTGT
CTGACCTCACCAGACCCTGACGACCCTCTGAGTTCCTACATCAATGCCAACTACATCCGG CTGACCTCACCAGACCCTGACGACCCTCTGAGTTCCTACATCAATGCCAACTACATCCGG
GGCTATGGTGGGGAGGAGAAGGTGTACATCGCCACTCAGGGACCCATCGTCAGCACGGTC GGCTATGGTGGGGAGGAGAAGGTGTACATCGCCACTCAGGGACCCATCGTCAGCACGGTC
GCCGACTTCTGGCGCATGGTGTGGCAGGAGCACACGCCCATCATTGTCATGATCACCAAC GCCGACTTCTGGCGCATGGTGTGGCAGGAGCACACGCCCATCATTGTCATGATCACCAAC
ATCGAGGAGATGAACGAGAAATGCACCGAGTATTGGCCGGAGGAGCAGGTGGCGTACGAC ATCGAGGAGATGAACGAGAAATGCACCGAGTATTGGCCGGAGGAGCAGGTGGCGTACGAC
GGTGTTGAGATCACTGTGCAGAAAGTCATTCACACGGAGGATTACCGGCTGCGACTCATC GGTGTTGAGATCACTGTGCAGAAAGTCATTCACACGGAGGATTACCGGCTGCGACTCATC
TCCCTCAAGAGTGGGACTGAGGAGCGAGGCCTGAAGCATTACTGGTTCACATCCTGGCCC TCCCTCAAGAGTGGGACTGAGGAGCGAGGCCTGAAGCATTACTGGTTCACATCCTGGCCC
GACCAGAAGACCCCAGACCGGGCCCCCCCACTCCTGCACCTGGTGCGGGAGGTGGAGGAG GACCAGAAGACCCCAGACCGGGCCCCCCCACTCCTGCACCTGGTGCGGGAGGTGGAGGAG
GCAGCCCAGCAGGAGGGGCCCCACTGTGCCCCCATCATCGTCCACTGCAGTGCAGGGATT GCAGCCCAGCAGGAGGGGCCCCACTGTGCCCCCATCATCGTCCACTGCAGTGCAGGGATT
GGGAGGACCGGCTGCTTCATTGCCACCAGCATCTGCTGCCAGCAGCTGCGGCAGGAGGGT GGGAGGACCGGCTGCTTCATTGCCACCAGCATCTGCTGCCAGCAGCTGCGGCAGGAGGGT
GTGGTGGACATCCTGAAGACCACGTGCCAGCTCCGTCAGGACAGGGGCGGCATGATCCAG GTGGTGGACATCCTGAAGACCACGTGCCAGCTCCGTCAGGACAGGGGCGGCATGATCCAG
ACATGCGAGCAGTACCAGTTTGTGCACCACGTCATGAGCCTCTACGAAAAGCAGCTGTCC ACATGCGAGCAGTACCAGTTTGTGCACCACGTCATGAGCCTCTACGAAAAGCAGCTGTCC
CACCAGTCCCCAGAATGA CACCAGTCCCCAGAATGA

View File

@ -1,12 +1,12 @@
>RAB39B cds:protein_coding (ENST00000369454.4) >RAB39B cds:protein_coding (ENST00000369454.4)
ATGGAGGCCATCTGGCTGTACCAGTTCCGGCTCATTGTCATCGGGGATTCCACAGTGGGC ATGGAGGCCATCTGGCTGTACCAGTTCCGGCTCATTGTCATCGGGGATTCCACAGTGGGC
AAGTCCTGCCTGATCCGCCGCTTCACCGAGGGTCGCTTTGCCCAGGTTTCTGACCCCACC AAGTCCTGCCTGATCCGCCGCTTCACCGAGGGTCGCTTTGCCCAGGTTTCTGACCCCACC
GTGGGGGTGGATTTTTTCTCCCGCTTGGTGGAGATCGAGCCAGGAAAACGCATCAAGCTC GTGGGGGTGGATTTTTTCTCCCGCTTGGTGGAGATCGAGCCAGGAAAACGCATCAAGCTC
CAGATCTGGGATACCGCGGGTCAAGAGAGGTTCAGATCCATCACTCGCGCCTACTACAGG CAGATCTGGGATACCGCGGGTCAAGAGAGGTTCAGATCCATCACTCGCGCCTACTACAGG
AACTCAGTAGGTGGTCTTCTCTTATTTGACATTACCAACCGCAGGTCCTTCCAGAATGTC AACTCAGTAGGTGGTCTTCTCTTATTTGACATTACCAACCGCAGGTCCTTCCAGAATGTC
CATGAGTGGTTAGAAGAGACCAAAGTACACGTTCAGCCCTACCAAATTGTATTTGTTCTG CATGAGTGGTTAGAAGAGACCAAAGTACACGTTCAGCCCTACCAAATTGTATTTGTTCTG
GTGGGTCACAAGTGTGACCTGGATACACAGAGGCAAGTGACTCGCCACGAGGCCGAGAAA GTGGGTCACAAGTGTGACCTGGATACACAGAGGCAAGTGACTCGCCACGAGGCCGAGAAA
CTGGCTGCTGCATACGGCATGAAGTACATTGAAACGTCAGCCCGAGATGCCATTAATGTG CTGGCTGCTGCATACGGCATGAAGTACATTGAAACGTCAGCCCGAGATGCCATTAATGTG
GAGAAAGCCTTCACAGACCTGACAAGAGACATATATGAGCTGGTTAAAAGGGGGGAGATT GAGAAAGCCTTCACAGACCTGACAAGAGACATATATGAGCTGGTTAAAAGGGGGGAGATT
ACAATCCAGGAGGGCTGGGAAGGGGTGAAGAGTGGATTTGTACCAAATGTGGTTCACTCT ACAATCCAGGAGGGCTGGGAAGGGGTGAAGAGTGGATTTGTACCAAATGTGGTTCACTCT
TCAGAAGAGGTTGTCAAATCAGAGAGGAGATGTTTGTGCTAG TCAGAAGAGGTTGTCAAATCAGAGAGGAGATGTTTGTGCTAG

View File

@ -1,131 +1,131 @@
```{css, echo = FALSE} ```{css, echo = FALSE}
.striped tr:nth-child(even) { .striped tr:nth-child(even) {
background: #eaf1ff; background: #eaf1ff;
} }
.striped { .striped {
padding: 5px; padding: 5px;
} }
``` ```
<small>Random Phobias - .Rmd sample code for BCH441 at the University of Toronto. (c) Boris Steipe 2020</small>
```{r setup, include=FALSE} ```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE) knitr::opts_chunk$set(echo = TRUE)
``` ```
## Phobias! ## ## Phobias! ##
We all have some, but we could always use more. How to know them all? With this code we access the [Wikipedia list of phobias](https://en.wikipedia.org/wiki/List_of_phobias), scrape the contents and assemble a dataframe. Then we write a function to retrieve a random phobia, which we can subsequently ponder on - either to delight in the fact that we don't have that fear, or to add to our daily quota of anxieties <small>(like our well-founded [fear of bad programming practice](http://xkcd.com/292/))</small>. We all have some, but we could always use more. How to know them all? With this code we access the [Wikipedia list of phobias](https://en.wikipedia.org/wiki/List_of_phobias), scrape the contents and assemble a dataframe. Then we write a function to retrieve a random phobia, which we can subsequently ponder on - either to delight in the fact that we don't have that fear, or to add to our daily quota of anxieties <small>(like our well-founded [fear of bad programming practice](http://xkcd.com/292/))</small>.
To load the list, we will "screenscrape" the contents of Wikipedia's [List of Phobias](https://en.wikipedia.org/wiki/List_of_phobias). First, we install the `rvest` library and the `xml2` library from CRAN, if we don't have it. To load the list, we will "screenscrape" the contents of Wikipedia's [List of Phobias](https://en.wikipedia.org/wiki/List_of_phobias). First, we install the `rvest` library and the `xml2` library from CRAN, if we don't have it.
```{r packages} ```{r packages}
# Install rvest and xml2 from CRAN, but only if they are not yet available.
# requireNamespace() is called with quietly = TRUE and its result is used
# solely to decide whether an installation is needed.
invisible(lapply(c("rvest", "xml2"), function(pkg) {
  if (! requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}))
``` ```
As we customarily do, we avoid using the `library()` function to make the package contents accessible, but use the `package::` syntax instead. This makes our code more explicit and maintainable. As we customarily do, we avoid using the `library()` function to make the package contents accessible, but use the `package::` syntax instead. This makes our code more explicit and maintainable.
`xml2` handles reading and parsing of documents. The `rvest` package was designed for screenscraping and has functions to make our life very easy: it accesses the response of an `xml2` query, looks for all HTML formatted tables, parses them with an XPATH expression and returns them as lists from which we can get data frames. `xml2` handles reading and parsing of documents. The `rvest` package was designed for screenscraping and has functions to make our life very easy: it accesses the response of an `xml2` query, looks for all HTML formatted tables, parses them with an XPATH expression and returns them as lists from which we can get data frames.
```{r getPageData, cache=TRUE} ```{r getPageData, cache=TRUE}
# Fetch and parse the Wikipedia page, then extract all HTML tables it contains
# as a list of data frames (one list element per table).
webPage <- xml2::read_html(
  x = "https://en.wikipedia.org/wiki/List_of_phobias"
)
allTables <- rvest::html_table(x = webPage, fill = TRUE)
``` ```
There are ```r length(allTables)``` tables in the list, but the ones we are interested in are data frames with two columns named `Phobia` and `Condition`. There are ```r length(allTables)``` tables in the list, but the ones we are interested in are data frames with two columns named `Phobia` and `Condition`.
```{r collateTables, cache=TRUE} ```{r collateTables, cache=TRUE}
# Collate every table whose header is exactly (Phobia, Condition) into one
# data frame.
#
# identical() is used for the header test because an elementwise `==` recycles
# the shorter vector: a table without columns would pass (all(logical(0)) is
# TRUE), a table that merely repeats the two names would pass, and a table with
# an odd number of columns would raise a recycling warning.
phobiaTable <- data.frame(Phobia = character(), Condition = character())
for (i in seq_along(allTables)) {
  df <- allTables[[i]]
  if (identical(colnames(df), c("Phobia", "Condition"))) {
    phobiaTable <- rbind(phobiaTable, df)
  }
}
``` ```
Done, we collected ```r nrow(phobiaTable)``` phobias. Let's randomly select a few and print them. Done, we collected ```r nrow(phobiaTable)``` phobias. Let's randomly select a few and print them.
<p>&nbsp; <p>&nbsp;
<p> <p>
```{r , ref.label="randRow", echo=FALSE} ```{r , ref.label="randRow", echo=FALSE}
``` ```
**Table**: seven random phobias<br/> **Table**: seven random phobias<br/>
```{r renderPhobiaTable, echo=FALSE, results='asis'} ```{r renderPhobiaTable, echo=FALSE, results='asis'}
# Render up to seven randomly chosen phobias as a striped HTML table.
#
# seq_len() and the min() guard keep sample() well-defined when fewer than
# seven rows were collected: 1:nrow() would yield c(1, 0) for an empty table,
# and asking for more rows than exist is an error.
sel <- sample(seq_len(nrow(phobiaTable)), min(7, nrow(phobiaTable)))
knitr::kable(phobiaTable[sel, ],
             table.attr = "class=\"striped\"",
             format = "html")
``` ```
<p>&nbsp; <p>&nbsp;
<p> <p>
To pick a single random phobia from the list, we take a (pseudo) random sample of size 1 from the row indices of the `phobiaTable` object. Our function thus returns a random row from a matrix or dataframe, and it uses an optional argument: `seed`. This can either be Boolean `FALSE` (the default), or an integer that is used in R's `set.seed()` function.
```{r randRow} ```{r randRow}
randRow <- function(M, seed = FALSE) {
  # Return a random row from a matrix or dataframe M.
  #
  # Parameters:
  #   M     a matrix or dataframe with at least one row
  #   seed  FALSE (default): draw from the current RNG stream; otherwise a
  #         value coercible to integer that is passed to set.seed() to make
  #         the draw reproducible
  # Value:
  #   M[i, ] for one randomly chosen row index i
  #
  # When a seed is given, the caller's RNG state is put back on exit, so a
  # seeded call does not disturb random numbers drawn elsewhere.
  stopifnot(nrow(M) > 0)
  useSeed <- ! identical(seed, FALSE)   # seed = 0 is a valid seed too
  if (useSeed) {
    # .Random.seed lives in the global environment and does not exist until
    # the RNG has been used once. Assigning `.Random.seed <- old` inside a
    # function would only create a local variable, so the state is saved and
    # restored explicitly in the global environment.
    hadSeed <- exists(".Random.seed", envir = globalenv(), inherits = FALSE)
    if (hadSeed) {
      oldSeed <- get(".Random.seed", envir = globalenv(), inherits = FALSE)
    }
    on.exit({
      if (hadSeed) {
        assign(".Random.seed", oldSeed, envir = globalenv())
      } else if (exists(".Random.seed", envir = globalenv(),
                        inherits = FALSE)) {
        rm(list = ".Random.seed", envir = globalenv())
      }
    }, add = TRUE)
    set.seed(as.integer(seed))
  }
  r <- M[sample(seq_len(nrow(M)), 1), ]   # fetch one random row
  return(r)
}
``` ```
<p>&nbsp; <p>&nbsp;
<p> <p>
With this useful tool we can ponder on our favourite phobia of the day. For today, let it be **`r randRow(phobiaTable, seed=1123581321)[2]`**, the `r randRow(phobiaTable, seed=1123581321)[1]`. With this useful tool we can ponder on our favourite phobia of the day. For today, let it be **`r randRow(phobiaTable, seed=1123581321)[2]`**, the `r randRow(phobiaTable, seed=1123581321)[1]`.
_`r randRow(phobiaTable, seed=1123581321)[1]`_! Really!!? Awful. _`r randRow(phobiaTable, seed=1123581321)[1]`_! Really!!? Awful.
<p>&nbsp; <p>&nbsp;
<p> <p>
Finally: let's plot a histogram of phobia name lengths just to illustrate plots. A little preprocessing is required, since some names collate synonyms, like _"Hypnophobia, somniphobia"_. We'll break these up. Finally: let's plot a histogram of phobia name lengths just to illustrate plots. A little preprocessing is required, since some names collate synonyms, like _"Hypnophobia, somniphobia"_. We'll break these up.
```{r preProcess} ```{r preProcess}
# First pass: keep the entries that contain no blank and end with "phobia".
sel <- grepl(".phobia$", phobiaTable$Phobia) & ! grepl(" ", phobiaTable$Phobia)
names <- phobiaTable$Phobia[sel]

# Second pass: the entries we did _not_ select collate several names. Split
# them on ", ", then on " ", then on "/" (in this order), flattening the
# list that strsplit() returns after every split.
x <- Reduce(function(parts, sep) unlist(strsplit(parts, sep)),
            c(", ", " ", "/"),
            phobiaTable$Phobia[! sel])

# Apply the same filter to the fragments and append the hits to "names".
sel <- grepl(".phobia$", x) & ! grepl(" ", x)
names <- c(names, x[sel])
``` ```
Done, we collected ```r length(names)``` names for phobias. Here is a histogram of their lengths. Done, we collected ```r length(names)``` names for phobias. Here is a histogram of their lengths.
```{r showHist} ```{r showHist}
# Histogram of the name lengths. The subtitle reports the first shortest and
# the first longest name together with their character counts.
x <- nchar(names)
pShort <- names[x == min(x)][1]   # first name of minimal length ...
pLong <- names[x == max(x)][1]    # ... and first name of maximal length
hist(x,
     col = "#aef5ee",
     xlab = "name",
     ylab = "counts",
     main = "Length of phobia-names",
     sub = sprintf("Shortest: %s (%d), Longest: %s (%d)",
                   pShort, nchar(pShort), pLong, nchar(pLong)),
     cex.sub = 0.8)
``` ```
That's all. That's all.
<!-- [END] --> <!-- [END] -->

View File

@ -1,43 +1,43 @@
>MBP1 YDL056W SGDID:S000002214 >MBP1 YDL056W SGDID:S000002214
ATGTCTAACCAAATATACTCAGCGAGATATTCGGGGGTTGATGTTTATGAATTCATTCAT ATGTCTAACCAAATATACTCAGCGAGATATTCGGGGGTTGATGTTTATGAATTCATTCAT
TCTACAGGATCTATCATGAAAAGGAAAAAGGATGATTGGGTCAATGCTACACATATTTTA TCTACAGGATCTATCATGAAAAGGAAAAAGGATGATTGGGTCAATGCTACACATATTTTA
AAGGCCGCCAATTTTGCCAAGGCTAAAAGAACAAGGATTCTAGAGAAGGAAGTACTTAAG AAGGCCGCCAATTTTGCCAAGGCTAAAAGAACAAGGATTCTAGAGAAGGAAGTACTTAAG
GAAACTCATGAAAAAGTTCAGGGTGGATTTGGTAAATATCAGGGTACATGGGTCCCACTG GAAACTCATGAAAAAGTTCAGGGTGGATTTGGTAAATATCAGGGTACATGGGTCCCACTG
AACATAGCGAAACAACTGGCAGAAAAATTTAGTGTCTACGATCAGCTGAAACCGTTGTTC AACATAGCGAAACAACTGGCAGAAAAATTTAGTGTCTACGATCAGCTGAAACCGTTGTTC
GACTTTACGCAAACAGATGGGTCTGCTTCTCCACCTCCTGCTCCAAAACATCACCATGCC GACTTTACGCAAACAGATGGGTCTGCTTCTCCACCTCCTGCTCCAAAACATCACCATGCC
TCGAAGGTGGATAGGAAAAAGGCTATTAGAAGTGCAAGTACTTCCGCAATTATGGAAACA TCGAAGGTGGATAGGAAAAAGGCTATTAGAAGTGCAAGTACTTCCGCAATTATGGAAACA
AAAAGAAACAACAAGAAAGCCGAGGAAAATCAATTTCAAAGCAGCAAAATATTGGGAAAT AAAAGAAACAACAAGAAAGCCGAGGAAAATCAATTTCAAAGCAGCAAAATATTGGGAAAT
CCCACGGCTGCACCAAGGAAAAGAGGTAGACCGGTAGGATCTACGAGGGGAAGTAGGCGG CCCACGGCTGCACCAAGGAAAAGAGGTAGACCGGTAGGATCTACGAGGGGAAGTAGGCGG
AAGTTAGGTGTCAATTTACAACGTTCTCAAAGTGATATGGGATTTCCTAGACCGGCGATA AAGTTAGGTGTCAATTTACAACGTTCTCAAAGTGATATGGGATTTCCTAGACCGGCGATA
CCGAATTCTTCAATATCGACAACGCAACTTCCCTCTATTAGATCCACCATGGGACCACAA CCGAATTCTTCAATATCGACAACGCAACTTCCCTCTATTAGATCCACCATGGGACCACAA
TCCCCTACATTGGGTATTCTGGAAGAAGAAAGGCACGATTCTCGACAGCAGCAGCCGCAA TCCCCTACATTGGGTATTCTGGAAGAAGAAAGGCACGATTCTCGACAGCAGCAGCCGCAA
CAAAATAATTCTGCACAGTTCAAAGAAATTGATCTTGAGGACGGCTTATCAAGCGATGTG CAAAATAATTCTGCACAGTTCAAAGAAATTGATCTTGAGGACGGCTTATCAAGCGATGTG
GAACCTTCACAACAATTACAACAAGTTTTTAATCAAAATACTGGATTTGTACCCCAACAA GAACCTTCACAACAATTACAACAAGTTTTTAATCAAAATACTGGATTTGTACCCCAACAA
CAATCTTCCTTGATACAGACACAGCAAACAGAATCAATGGCCACGTCCGTATCTTCCTCT CAATCTTCCTTGATACAGACACAGCAAACAGAATCAATGGCCACGTCCGTATCTTCCTCT
CCTTCATTACCTACGTCACCGGGCGATTTTGCCGATAGTAATCCATTTGAAGAGCGATTT CCTTCATTACCTACGTCACCGGGCGATTTTGCCGATAGTAATCCATTTGAAGAGCGATTT
CCCGGTGGTGGAACATCTCCTATTATTTCCATGATCCCGCGTTATCCTGTAACTTCAAGG CCCGGTGGTGGAACATCTCCTATTATTTCCATGATCCCGCGTTATCCTGTAACTTCAAGG
CCTCAAACATCGGATATTAATGATAAAGTTAACAAATACCTTTCAAAATTGGTTGATTAT CCTCAAACATCGGATATTAATGATAAAGTTAACAAATACCTTTCAAAATTGGTTGATTAT
TTTATTTCCAATGAAATGAAGTCAAATAAGTCCCTACCACAAGTGTTATTGCACCCACCT TTTATTTCCAATGAAATGAAGTCAAATAAGTCCCTACCACAAGTGTTATTGCACCCACCT
CCACACAGCGCTCCCTATATAGATGCTCCAATCGATCCAGAATTACATACTGCCTTCCAT CCACACAGCGCTCCCTATATAGATGCTCCAATCGATCCAGAATTACATACTGCCTTCCAT
TGGGCTTGTTCTATGGGTAATTTACCAATTGCTGAGGCGTTGTACGAAGCCGGAACAAGT TGGGCTTGTTCTATGGGTAATTTACCAATTGCTGAGGCGTTGTACGAAGCCGGAACAAGT
ATCAGATCGACAAATTCTCAAGGCCAAACTCCATTGATGAGAAGTTCCTTATTCCACAAT ATCAGATCGACAAATTCTCAAGGCCAAACTCCATTGATGAGAAGTTCCTTATTCCACAAT
TCATACACTAGAAGAACTTTCCCTAGAATTTTCCAGCTACTGCACGAGACCGTATTTGAT TCATACACTAGAAGAACTTTCCCTAGAATTTTCCAGCTACTGCACGAGACCGTATTTGAT
ATCGATTCGCAATCACAAACAGTAATTCACCATATTGTGAAACGAAAATCAACAACACCT ATCGATTCGCAATCACAAACAGTAATTCACCATATTGTGAAACGAAAATCAACAACACCT
TCTGCAGTTTATTATCTTGATGTTGTGCTATCTAAGATCAAGGATTTTTCCCCACAGTAT TCTGCAGTTTATTATCTTGATGTTGTGCTATCTAAGATCAAGGATTTTTCCCCACAGTAT
AGAATTGAATTACTTTTAAACACACAAGACAAAAATGGCGATACCGCACTTCATATTGCT AGAATTGAATTACTTTTAAACACACAAGACAAAAATGGCGATACCGCACTTCATATTGCT
TCTAAAAATGGAGATGTTGTTTTTTTTAATACACTGGTCAAAATGGGTGCATTAACTACT TCTAAAAATGGAGATGTTGTTTTTTTTAATACACTGGTCAAAATGGGTGCATTAACTACT
ATTTCCAATAAGGAAGGATTAACCGCCAATGAAATAATGAATCAACAATATGAGCAAATG ATTTCCAATAAGGAAGGATTAACCGCCAATGAAATAATGAATCAACAATATGAGCAAATG
ATGATACAAAATGGTACAAATCAACATGTCAATTCTTCAAACACGGACTTGAATATCCAC ATGATACAAAATGGTACAAATCAACATGTCAATTCTTCAAACACGGACTTGAATATCCAC
GTTAATACAAACAACATTGAAACGAAAAATGATGTTAATTCAATGGTAATCATGTCGCCT GTTAATACAAACAACATTGAAACGAAAAATGATGTTAATTCAATGGTAATCATGTCGCCT
GTTTCTCCTTCGGATTACATAACCTATCCATCTCAAATTGCCACCAATATATCAAGAAAT GTTTCTCCTTCGGATTACATAACCTATCCATCTCAAATTGCCACCAATATATCAAGAAAT
ATTCCAAATGTAGTGAATTCTATGAAGCAAATGGCTAGCATATACAACGATCTTCATGAA ATTCCAAATGTAGTGAATTCTATGAAGCAAATGGCTAGCATATACAACGATCTTCATGAA
CAGCATGACAACGAAATAAAAAGTTTGCAAAAAACTTTAAAAAGCATTTCTAAGACGAAA CAGCATGACAACGAAATAAAAAGTTTGCAAAAAACTTTAAAAAGCATTTCTAAGACGAAA
ATACAGGTAAGCCTAAAAACTTTAGAGGTATTGAAAGAGAGCAGTAAAGATGAAAACGGC ATACAGGTAAGCCTAAAAACTTTAGAGGTATTGAAAGAGAGCAGTAAAGATGAAAACGGC
GAAGCTCAGACTAATGATGACTTCGAAATTTTATCTCGTCTACAAGAACAAAATACTAAG GAAGCTCAGACTAATGATGACTTCGAAATTTTATCTCGTCTACAAGAACAAAATACTAAG
AAATTGAGAAAAAGGCTCATACGATACAAACGGTTGATAAAACAAAAGCTGGAATACAGG AAATTGAGAAAAAGGCTCATACGATACAAACGGTTGATAAAACAAAAGCTGGAATACAGG
CAAACGGTTTTATTGAACAAATTAATAGAAGATGAAACTCAGGCTACCACCAATAACACA CAAACGGTTTTATTGAACAAATTAATAGAAGATGAAACTCAGGCTACCACCAATAACACA
GTTGAGAAAGATAATAATACGCTGGAAAGGTTGGAATTGGCTCAAGAACTAACGATGTTG GTTGAGAAAGATAATAATACGCTGGAAAGGTTGGAATTGGCTCAAGAACTAACGATGTTG
CAATTACAAAGGAAAAACAAATTGAGTTCCTTGGTGAAGAAATTTGAAGACAATGCCAAG CAATTACAAAGGAAAAACAAATTGAGTTCCTTGGTGAAGAAATTTGAAGACAATGCCAAG
ATTCATAAATATAGACGGATTATCAGGGAAGGTACGGAAATGAATATTGAAGAAGTAGAT ATTCATAAATATAGACGGATTATCAGGGAAGGTACGGAAATGAATATTGAAGAAGTAGAT
AGTTCGCTGGATGTAATACTACAGACATTGATAGCCAACAATAATAAAAATAAGGGCGCA AGTTCGCTGGATGTAATACTACAGACATTGATAGCCAACAATAATAAAAATAAGGGCGCA
GAACAGATCATCACAATCTCAAACGCGAATAGTCATGCATAA GAACAGATCATCACAATCTCAAACGCGAATAGTCATGCATAA

View File

@ -1,47 +1,47 @@
SGD_features.tab SGD_features.tab
The latest version of the SGD_features.tab file is based on Genome Version R64-2-1. The latest version of the SGD_features.tab file is based on Genome Version R64-2-1.
The SGD_features.tab file is updated weekly (Saturday). The SGD_features.tab file is updated weekly (Saturday).
NOTE: On 4 September 2004, the SGD_features.tab file replaced the previously NOTE: On 4 September 2004, the SGD_features.tab file replaced the previously
used chromosomal_feature.tab file. used chromosomal_feature.tab file.
File contents: File contents:
1. Information on current chromosomal features in SGD, including Dubious ORFs. 1. Information on current chromosomal features in SGD, including Dubious ORFs.
Also contains coordinates of intron, exons, and other subfeatures that are located Also contains coordinates of intron, exons, and other subfeatures that are located
within a chromosomal feature. within a chromosomal feature.
2. The relationship between subfeatures and the feature in which they 2. The relationship between subfeatures and the feature in which they
are located is identified by the feature name in column #7 (parent are located is identified by the feature name in column #7 (parent
feature). For example, the parent feature of the intron found in feature). For example, the parent feature of the intron found in
ACT1/YFL039C will be YFL039C. The parent feature of YFL039C is ACT1/YFL039C will be YFL039C. The parent feature of YFL039C is
chromosome 6. chromosome 6.
3. The coordinates of all features are in chromosomal coordinates. 3. The coordinates of all features are in chromosomal coordinates.
Columns within SGD_features.tab: Columns within SGD_features.tab:
1. Primary SGDID (mandatory) 1. Primary SGDID (mandatory)
2. Feature type (mandatory) 2. Feature type (mandatory)
3. Feature qualifier (optional) 3. Feature qualifier (optional)
4. Feature name (optional) 4. Feature name (optional)
5. Standard gene name (optional) 5. Standard gene name (optional)
6. Alias (optional, multiples separated by |) 6. Alias (optional, multiples separated by |)
7. Parent feature name (optional) 7. Parent feature name (optional)
8. Secondary SGDID (optional, multiples separated by |) 8. Secondary SGDID (optional, multiples separated by |)
9. Chromosome (optional) 9. Chromosome (optional)
10. Start_coordinate (optional) 10. Start_coordinate (optional)
11. Stop_coordinate (optional) 11. Stop_coordinate (optional)
12. Strand (optional) 12. Strand (optional)
13. Genetic position (optional) 13. Genetic position (optional)
14. Coordinate version (optional) 14. Coordinate version (optional)
15. Sequence version (optional) 15. Sequence version (optional)
16. Description (optional) 16. Description (optional)
Note that "chromosome 17" is the mitochondrial chromosome. Note that "chromosome 17" is the mitochondrial chromosome.
The SGD_features.tab file is complemented by GFF3 file saccharomyces_cerevisiae.gff The SGD_features.tab file is complemented by GFF3 file saccharomyces_cerevisiae.gff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,179 +1,179 @@
MUTS_PAM STRAND MOST_SEVERE START MUTS_PAM_SAMPLES REF MUTS_CS ALT AA_CHANGE CHR MUTS_CS_SAMPLES PROTEIN_POS GENE TRANSCRIPT MUTS_PAM STRAND MOST_SEVERE START MUTS_PAM_SAMPLES REF MUTS_CS ALT AA_CHANGE CHR MUTS_CS_SAMPLES PROTEIN_POS GENE TRANSCRIPT
93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000311936 93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000311936
93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000557334 93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000557334
93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000256078 93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000256078
93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000556131 93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000556131
86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000311936 86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000311936
86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000557334 86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000557334
86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000556131 86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000556131
86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000256078 86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000256078
72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000556131 72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000556131
72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000256078 72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000256078
72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000557334 72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000557334
72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000311936 72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000311936
63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000557334 63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000557334
63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000556131 63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000556131
63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000256078 63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000256078
63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000311936 63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000311936
36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000311936 36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000311936
36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000256078 36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000256078
36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000556131 36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000556131
36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000557334 36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000557334
24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000256078 24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000256078
24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000311936 24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000311936
24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000557334 24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000557334
24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000556131 24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000556131
23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000556131 23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000556131
23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000311936 23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000311936
23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000557334 23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000557334
23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000256078 23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000256078
16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000556131 16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000556131
16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000311936 16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000311936
16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000557334 16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000557334
16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000256078 16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000256078
13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000311936 13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000311936
13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000556131 13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000556131
13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000557334 13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000557334
13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000256078 13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000256078
11 + missense_variant 25380275 11 T 11 G Q/H 12 11 61 ENSG00000133703 ENST00000311936 11 + missense_variant 25380275 11 T 11 G Q/H 12 11 61 ENSG00000133703 ENST00000311936
11 + missense_variant 25380275 11 T 11 G Q/H 12 11 61 ENSG00000133703 ENST00000256078 11 + missense_variant 25380275 11 T 11 G Q/H 12 11 61 ENSG00000133703 ENST00000256078
10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000557334 10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000557334
10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000311936 10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000311936
10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000556131 10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000556131
10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000256078 10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000256078
9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000557334 9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000557334
9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000556131 9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000556131
9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000311936 9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000311936
9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000256078 9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000256078
7 + missense_variant 25380276 7 T 7 A Q/L 12 7 61 ENSG00000133703 ENST00000256078 7 + missense_variant 25380276 7 T 7 A Q/L 12 7 61 ENSG00000133703 ENST00000256078
7 + missense_variant 25378562 7 C 7 T A/T 12 7 146 ENSG00000133703 ENST00000256078 7 + missense_variant 25378562 7 C 7 T A/T 12 7 146 ENSG00000133703 ENST00000256078
7 + missense_variant 25378562 7 C 7 T A/T 12 7 146 ENSG00000133703 ENST00000311936 7 + missense_variant 25378562 7 C 7 T A/T 12 7 146 ENSG00000133703 ENST00000311936
7 + missense_variant 25380276 7 T 7 A Q/L 12 7 61 ENSG00000133703 ENST00000311936 7 + missense_variant 25380276 7 T 7 A Q/L 12 7 61 ENSG00000133703 ENST00000311936
5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000311936 5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000311936
5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000256078 5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000256078
5 + missense_variant 25380276 5 T 5 C Q/R 12 5 61 ENSG00000133703 ENST00000311936 5 + missense_variant 25380276 5 T 5 C Q/R 12 5 61 ENSG00000133703 ENST00000311936
5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000557334 5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000557334
5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000556131 5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000556131
5 + missense_variant 25380276 5 T 5 C Q/R 12 5 61 ENSG00000133703 ENST00000256078 5 + missense_variant 25380276 5 T 5 C Q/R 12 5 61 ENSG00000133703 ENST00000256078
4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000256078 4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000256078
4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000557334 4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000557334
4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000311936 4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000311936
4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000556131 4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000556131
3 + missense_variant 25380277 3 G 3 T Q/K 12 3 61 ENSG00000133703 ENST00000256078 3 + missense_variant 25380277 3 G 3 T Q/K 12 3 61 ENSG00000133703 ENST00000256078
3 + missense_variant 25380275 3 T 3 A Q/H 12 3 61 ENSG00000133703 ENST00000256078 3 + missense_variant 25380275 3 T 3 A Q/H 12 3 61 ENSG00000133703 ENST00000256078
3 + missense_variant 25378647 3 T 3 G K/N 12 3 117 ENSG00000133703 ENST00000256078 3 + missense_variant 25378647 3 T 3 G K/N 12 3 117 ENSG00000133703 ENST00000256078
3 + missense_variant 25380275 3 T 3 A Q/H 12 3 61 ENSG00000133703 ENST00000311936 3 + missense_variant 25380275 3 T 3 A Q/H 12 3 61 ENSG00000133703 ENST00000311936
3 + missense_variant 25378647 3 T 3 G K/N 12 3 117 ENSG00000133703 ENST00000311936 3 + missense_variant 25378647 3 T 3 G K/N 12 3 117 ENSG00000133703 ENST00000311936
3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000256078 3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000256078
3 - missense_variant 25380275 3 A 3 C Q/H 12 3 61 ENSG00000133703 ENST00000256078 3 - missense_variant 25380275 3 A 3 C Q/H 12 3 61 ENSG00000133703 ENST00000256078
3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000256078 3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000256078
3 + missense_variant 25380277 3 G 3 T Q/K 12 3 61 ENSG00000133703 ENST00000311936 3 + missense_variant 25380277 3 G 3 T Q/K 12 3 61 ENSG00000133703 ENST00000311936
3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000311936 3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000311936
3 - missense_variant 25380275 3 A 3 C Q/H 12 3 61 ENSG00000133703 ENST00000311936 3 - missense_variant 25380275 3 A 3 C Q/H 12 3 61 ENSG00000133703 ENST00000311936
3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000311936 3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000311936
3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000556131 3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000556131
3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000557334 3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000557334
3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000557334 3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000557334
3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000556131 3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000556131
2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000556131 2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000556131
2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000311936 2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000311936
2 - missense_variant 25380276 2 A 2 G Q/R 12 2 61 ENSG00000133703 ENST00000311936 2 - missense_variant 25380276 2 A 2 G Q/R 12 2 61 ENSG00000133703 ENST00000311936
2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000557334 2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000557334
2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000556131 2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000556131
2 - missense_variant 25378562 2 G 2 A A/T 12 2 146 ENSG00000133703 ENST00000311936 2 - missense_variant 25378562 2 G 2 A A/T 12 2 146 ENSG00000133703 ENST00000311936
2 - missense_variant 25378562 2 G 2 A A/T 12 2 146 ENSG00000133703 ENST00000256078 2 - missense_variant 25378562 2 G 2 A A/T 12 2 146 ENSG00000133703 ENST00000256078
2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000256078 2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000256078
2 - missense_variant 25380276 2 A 2 G Q/R 12 2 61 ENSG00000133703 ENST00000256078 2 - missense_variant 25380276 2 A 2 G Q/R 12 2 61 ENSG00000133703 ENST00000256078
2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000311936 2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000311936
2 + missense_variant 25378561 2 G 2 A A/V 12 2 146 ENSG00000133703 ENST00000311936 2 + missense_variant 25378561 2 G 2 A A/V 12 2 146 ENSG00000133703 ENST00000311936
2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000256078 2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000256078
2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000557334 2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000557334
2 + missense_variant 25378561 2 G 2 A A/V 12 2 146 ENSG00000133703 ENST00000256078 2 + missense_variant 25378561 2 G 2 A A/V 12 2 146 ENSG00000133703 ENST00000256078
1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000557334 1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000557334
1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000557334 1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000557334
1 - missense_variant 25362743 1 A 1 T S/C 12 1 72 ENSG00000133703 ENST00000557334 1 - missense_variant 25362743 1 A 1 T S/C 12 1 72 ENSG00000133703 ENST00000557334
1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000557334 1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000557334
1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000557334 1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000557334
1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000557334 1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000557334
0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000557334 0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000557334
1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000557334 1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000557334
0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000557334 0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000557334
0 + synonymous_variant 25380278 0 A 1 G - 12 1 60 ENSG00000133703 ENST00000311936 0 + synonymous_variant 25380278 0 A 1 G - 12 1 60 ENSG00000133703 ENST00000311936
1 - missense_variant 25378647 1 A 1 T K/N 12 1 117 ENSG00000133703 ENST00000256078 1 - missense_variant 25378647 1 A 1 T K/N 12 1 117 ENSG00000133703 ENST00000256078
1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000256078 1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000256078
1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000256078 1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000256078
1 + missense_variant 25362743 1 A 1 G C/R 12 1 185 ENSG00000133703 ENST00000311936 1 + missense_variant 25362743 1 A 1 G C/R 12 1 185 ENSG00000133703 ENST00000311936
0 + inframe_deletion 25362744 0 CTTTGT 1 - - 12 1 183-184 ENSG00000133703 ENST00000311936 0 + inframe_deletion 25362744 0 CTTTGT 1 - - 12 1 183-184 ENSG00000133703 ENST00000311936
1 + missense_variant 25378557 1 C 1 G K/N 12 1 147 ENSG00000133703 ENST00000311936 1 + missense_variant 25378557 1 C 1 G K/N 12 1 147 ENSG00000133703 ENST00000311936
1 + missense_variant 25378562 1 C 1 G A/P 12 1 146 ENSG00000133703 ENST00000311936 1 + missense_variant 25378562 1 C 1 G A/P 12 1 146 ENSG00000133703 ENST00000311936
1 + missense_variant 25378562 1 C 1 T A/T 12 1 146.0 ENSG00000133703 ENST00000311936 1 + missense_variant 25378562 1 C 1 T A/T 12 1 146.0 ENSG00000133703 ENST00000311936
1 + missense_variant 25378594 1 C 1 G R/T 12 1 135 ENSG00000133703 ENST00000311936 1 + missense_variant 25378594 1 C 1 G R/T 12 1 135 ENSG00000133703 ENST00000311936
1 + missense_variant 25378645 1 C 1 G C/S 12 1 118 ENSG00000133703 ENST00000311936 1 + missense_variant 25378645 1 C 1 G C/S 12 1 118 ENSG00000133703 ENST00000311936
1 + missense_variant 25380240 1 C 1 A R/M 12 1 73.0 ENSG00000133703 ENST00000311936 1 + missense_variant 25380240 1 C 1 A R/M 12 1 73.0 ENSG00000133703 ENST00000311936
1 + missense_variant 25380254 1 C 1 A R/S 12 1 68 ENSG00000133703 ENST00000311936 1 + missense_variant 25380254 1 C 1 A R/S 12 1 68 ENSG00000133703 ENST00000311936
1 + missense_variant 25380271 1 C 1 T E/K 12 1 63.0 ENSG00000133703 ENST00000311936 1 + missense_variant 25380271 1 C 1 T E/K 12 1 63.0 ENSG00000133703 ENST00000311936
1 + missense_variant 25380274 1 C 1 T E/K 12 1 62 ENSG00000133703 ENST00000311936 1 + missense_variant 25380274 1 C 1 T E/K 12 1 62 ENSG00000133703 ENST00000311936
1 + missense_variant 25380275 1 T 1 G Q/H 12 1 61.0 ENSG00000133703 ENST00000311936 1 + missense_variant 25380275 1 T 1 G Q/H 12 1 61.0 ENSG00000133703 ENST00000311936
1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000256078 1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000256078
1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000256078 1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000256078
1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000256078 1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000256078
1 + missense_variant 25380282 1 G 1 C A/G 12 1 59 ENSG00000133703 ENST00000256078 1 + missense_variant 25380282 1 G 1 C A/G 12 1 59 ENSG00000133703 ENST00000256078
1 + missense_variant 25380271 1 C 1 T E/K 12 1 63.0 ENSG00000133703 ENST00000256078 1 + missense_variant 25380271 1 C 1 T E/K 12 1 63.0 ENSG00000133703 ENST00000256078
1 + missense_variant 25380274 1 C 1 T E/K 12 1 62 ENSG00000133703 ENST00000256078 1 + missense_variant 25380274 1 C 1 T E/K 12 1 62 ENSG00000133703 ENST00000256078
1 + missense_variant 25380275 1 T 1 G Q/H 12 1 61.0 ENSG00000133703 ENST00000256078 1 + missense_variant 25380275 1 T 1 G Q/H 12 1 61.0 ENSG00000133703 ENST00000256078
1 + missense_variant 25380277 1 GA 1 TT GQ/GK 12 1 60-61 ENSG00000133703 ENST00000256078 1 + missense_variant 25380277 1 GA 1 TT GQ/GK 12 1 60-61 ENSG00000133703 ENST00000256078
0 + synonymous_variant 25380278 0 A 1 G - 12 1 60 ENSG00000133703 ENST00000256078 0 + synonymous_variant 25380278 0 A 1 G - 12 1 60 ENSG00000133703 ENST00000256078
0 + synonymous_variant 25380278 0 A 1 T - 12 1 60 ENSG00000133703 ENST00000256078 0 + synonymous_variant 25380278 0 A 1 T - 12 1 60 ENSG00000133703 ENST00000256078
1 + missense_variant 25380282 1 G 1 T A/E 12 1 59 ENSG00000133703 ENST00000256078 1 + missense_variant 25380282 1 G 1 T A/E 12 1 59 ENSG00000133703 ENST00000256078
0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000256078 0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000256078
1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000256078 1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000256078
1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000256078 1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000256078
0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000256078 0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000256078
1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000256078 1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000256078
0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000256078 0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000256078
1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000256078 1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000256078
1 + missense_variant 25380277 1 GA 1 TT GQ/GK 12 1 60-61 ENSG00000133703 ENST00000311936 1 + missense_variant 25380277 1 GA 1 TT GQ/GK 12 1 60-61 ENSG00000133703 ENST00000311936
0 + synonymous_variant 25380278 0 A 1 T - 12 1 60 ENSG00000133703 ENST00000311936 0 + synonymous_variant 25380278 0 A 1 T - 12 1 60 ENSG00000133703 ENST00000311936
1 + missense_variant 25380240 1 C 1 A R/M 12 1 73.0 ENSG00000133703 ENST00000256078 1 + missense_variant 25380240 1 C 1 A R/M 12 1 73.0 ENSG00000133703 ENST00000256078
1 + missense_variant 25380282 1 G 1 C A/G 12 1 59 ENSG00000133703 ENST00000311936 1 + missense_variant 25380282 1 G 1 C A/G 12 1 59 ENSG00000133703 ENST00000311936
1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000556131 1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000556131
0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000556131 0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000556131
1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000556131 1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000556131
0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000556131 0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000556131
1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000556131 1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000556131
1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000556131 1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000556131
1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000556131 1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000556131
1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000556131 1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000556131
1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000556131 1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000556131
1 + missense_variant 25362743 1 A 1 G C/R 12 1 72 ENSG00000133703 ENST00000557334 1 + missense_variant 25362743 1 A 1 G C/R 12 1 72 ENSG00000133703 ENST00000557334
0 + inframe_deletion 25362744 0 CTTTGT 1 - - 12 1 70-71 ENSG00000133703 ENST00000557334 0 + inframe_deletion 25362744 0 CTTTGT 1 - - 12 1 70-71 ENSG00000133703 ENST00000557334
1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000557334 1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000557334
1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000557334 1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000557334
0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000557334 0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000557334
1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000557334 1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000557334
0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000556131 0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000556131
1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000556131 1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000556131
1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000556131 1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000556131
1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000311936 1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000311936
1 + missense_variant 25380282 1 G 1 T A/E 12 1 59 ENSG00000133703 ENST00000311936 1 + missense_variant 25380282 1 G 1 T A/E 12 1 59 ENSG00000133703 ENST00000311936
1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000311936 1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000311936
1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000311936 1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000311936
0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000311936 0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000311936
1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000311936 1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000311936
0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000311936 0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000311936
0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000311936 0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000311936
1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000311936 1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000311936
1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000311936 1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000311936
1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000311936 1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000311936
1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000311936 1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000311936
1 - missense_variant 25362743 1 A 1 T S/C 12 1 185 ENSG00000133703 ENST00000311936 1 - missense_variant 25362743 1 A 1 T S/C 12 1 185 ENSG00000133703 ENST00000311936
1 - missense_variant 25378647 1 A 1 T K/N 12 1 117 ENSG00000133703 ENST00000311936 1 - missense_variant 25378647 1 A 1 T K/N 12 1 117 ENSG00000133703 ENST00000311936
1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000311936 1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000311936
1 + missense_variant 25380254 1 C 1 A R/S 12 1 68 ENSG00000133703 ENST00000256078 1 + missense_variant 25380254 1 C 1 A R/S 12 1 68 ENSG00000133703 ENST00000256078
1 + missense_variant 25378645 1 C 1 G C/S 12 1 118 ENSG00000133703 ENST00000256078 1 + missense_variant 25378645 1 C 1 G C/S 12 1 118 ENSG00000133703 ENST00000256078
1 + missense_variant 25378594 1 C 1 G R/T 12 1 135 ENSG00000133703 ENST00000256078 1 + missense_variant 25378594 1 C 1 G R/T 12 1 135 ENSG00000133703 ENST00000256078
1 + missense_variant 25368454 1 C 1 T R/Q 12 1 164 ENSG00000133703 ENST00000256078 1 + missense_variant 25368454 1 C 1 T R/Q 12 1 164 ENSG00000133703 ENST00000256078
1 + missense_variant 25368473 1 T 1 C T/A 12 1 158 ENSG00000133703 ENST00000256078 1 + missense_variant 25368473 1 T 1 C T/A 12 1 158 ENSG00000133703 ENST00000256078
1 + missense_variant 25378557 1 C 1 G K/N 12 1 147 ENSG00000133703 ENST00000256078 1 + missense_variant 25378557 1 C 1 G K/N 12 1 147 ENSG00000133703 ENST00000256078
1 + missense_variant 25378562 1 C 1 G A/P 12 1 146 ENSG00000133703 ENST00000256078 1 + missense_variant 25378562 1 C 1 G A/P 12 1 146 ENSG00000133703 ENST00000256078
1 + missense_variant 25378562 1 C 1 T A/T 12 1 146.0 ENSG00000133703 ENST00000256078 1 + missense_variant 25378562 1 C 1 T A/T 12 1 146.0 ENSG00000133703 ENST00000256078

1 MUTS_PAM STRAND MOST_SEVERE START MUTS_PAM_SAMPLES REF MUTS_CS ALT AA_CHANGE CHR MUTS_CS_SAMPLES PROTEIN_POS GENE TRANSCRIPT
2 93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000311936
3 93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000557334
4 93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000256078
5 93 + missense_variant 25398284 93 C 93 T G/D 12 93 12 ENSG00000133703 ENST00000556131
6 86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000311936
7 86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000557334
8 86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000556131
9 86 + missense_variant 25398284 86 C 86 A G/V 12 86 12 ENSG00000133703 ENST00000256078
10 72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000556131
11 72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000256078
12 72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000557334
13 72 + missense_variant 25398285 72 C 72 A G/C 12 72 12 ENSG00000133703 ENST00000311936
14 63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000557334
15 63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000556131
16 63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000256078
17 63 - missense_variant 25398284 63 G 63 A G/D 12 63 12 ENSG00000133703 ENST00000311936
18 36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000311936
19 36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000256078
20 36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000556131
21 36 - missense_variant 25398284 36 G 36 T G/V 12 36 12 ENSG00000133703 ENST00000557334
22 24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000256078
23 24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000311936
24 24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000557334
25 24 + missense_variant 25398281 24 C 24 T G/D 12 24 13 ENSG00000133703 ENST00000556131
26 23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000556131
27 23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000311936
28 23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000557334
29 23 + missense_variant 25398284 23 C 23 G G/A 12 23 12 ENSG00000133703 ENST00000256078
30 16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000556131
31 16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000311936
32 16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000557334
33 16 - missense_variant 25398285 16 G 16 C G/R 12 16 12 ENSG00000133703 ENST00000256078
34 13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000311936
35 13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000556131
36 13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000557334
37 13 + missense_variant 25398285 13 C 13 G G/R 12 13 12 ENSG00000133703 ENST00000256078
38 11 + missense_variant 25380275 11 T 11 G Q/H 12 11 61 ENSG00000133703 ENST00000311936
39 11 + missense_variant 25380275 11 T 11 G Q/H 12 11 61 ENSG00000133703 ENST00000256078
40 10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000557334
41 10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000311936
42 10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000556131
43 10 + missense_variant 25398282 10 C 10 A G/C 12 10 13 ENSG00000133703 ENST00000256078
44 9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000557334
45 9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000556131
46 9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000311936
47 9 + missense_variant 25398285 9 C 9 T G/S 12 9 12 ENSG00000133703 ENST00000256078
48 7 + missense_variant 25380276 7 T 7 A Q/L 12 7 61 ENSG00000133703 ENST00000256078
49 7 + missense_variant 25378562 7 C 7 T A/T 12 7 146 ENSG00000133703 ENST00000256078
50 7 + missense_variant 25378562 7 C 7 T A/T 12 7 146 ENSG00000133703 ENST00000311936
51 7 + missense_variant 25380276 7 T 7 A Q/L 12 7 61 ENSG00000133703 ENST00000311936
52 5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000311936
53 5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000256078
54 5 + missense_variant 25380276 5 T 5 C Q/R 12 5 61 ENSG00000133703 ENST00000311936
55 5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000557334
56 5 + missense_variant 25398284 5 CC 5 AA G/F 12 5 12 ENSG00000133703 ENST00000556131
57 5 + missense_variant 25380276 5 T 5 C Q/R 12 5 61 ENSG00000133703 ENST00000256078
58 4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000256078
59 4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000557334
60 4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000311936
61 4 + missense_variant 25398284 4 C 4 A G/V 12 4 12.0 ENSG00000133703 ENST00000556131
62 3 + missense_variant 25380277 3 G 3 T Q/K 12 3 61 ENSG00000133703 ENST00000256078
63 3 + missense_variant 25380275 3 T 3 A Q/H 12 3 61 ENSG00000133703 ENST00000256078
64 3 + missense_variant 25378647 3 T 3 G K/N 12 3 117 ENSG00000133703 ENST00000256078
65 3 + missense_variant 25380275 3 T 3 A Q/H 12 3 61 ENSG00000133703 ENST00000311936
66 3 + missense_variant 25378647 3 T 3 G K/N 12 3 117 ENSG00000133703 ENST00000311936
67 3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000256078
68 3 - missense_variant 25380275 3 A 3 C Q/H 12 3 61 ENSG00000133703 ENST00000256078
69 3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000256078
70 3 + missense_variant 25380277 3 G 3 T Q/K 12 3 61 ENSG00000133703 ENST00000311936
71 3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000311936
72 3 - missense_variant 25380275 3 A 3 C Q/H 12 3 61 ENSG00000133703 ENST00000311936
73 3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000311936
74 3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000556131
75 3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000557334
76 3 + missense_variant 25398284 3 C 3 T G/D 12 3 12.0 ENSG00000133703 ENST00000557334
77 3 - missense_variant 25398281 3 G 3 A G/D 12 3 13 ENSG00000133703 ENST00000556131
78 2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000556131
79 2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000311936
80 2 - missense_variant 25380276 2 A 2 G Q/R 12 2 61 ENSG00000133703 ENST00000311936
81 2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000557334
82 2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000556131
83 2 - missense_variant 25378562 2 G 2 A A/T 12 2 146 ENSG00000133703 ENST00000311936
84 2 - missense_variant 25378562 2 G 2 A A/T 12 2 146 ENSG00000133703 ENST00000256078
85 2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000256078
86 2 - missense_variant 25380276 2 A 2 G Q/R 12 2 61 ENSG00000133703 ENST00000256078
87 2 + missense_variant 25398255 2 G 2 T Q/K 12 2 22 ENSG00000133703 ENST00000311936
88 2 + missense_variant 25378561 2 G 2 A A/V 12 2 146 ENSG00000133703 ENST00000311936
89 2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000256078
90 2 - missense_variant 25398285 2 G 2 A G/S 12 2 12 ENSG00000133703 ENST00000557334
91 2 + missense_variant 25378561 2 G 2 A A/V 12 2 146 ENSG00000133703 ENST00000256078
92 1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000557334
93 1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000557334
94 1 - missense_variant 25362743 1 A 1 T S/C 12 1 72 ENSG00000133703 ENST00000557334
95 1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000557334
96 1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000557334
97 1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000557334
98 0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000557334
99 1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000557334
100 0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000557334
101 0 + synonymous_variant 25380278 0 A 1 G - 12 1 60 ENSG00000133703 ENST00000311936
102 1 - missense_variant 25378647 1 A 1 T K/N 12 1 117 ENSG00000133703 ENST00000256078
103 1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000256078
104 1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000256078
105 1 + missense_variant 25362743 1 A 1 G C/R 12 1 185 ENSG00000133703 ENST00000311936
106 0 + inframe_deletion 25362744 0 CTTTGT 1 - - 12 1 183-184 ENSG00000133703 ENST00000311936
107 1 + missense_variant 25378557 1 C 1 G K/N 12 1 147 ENSG00000133703 ENST00000311936
108 1 + missense_variant 25378562 1 C 1 G A/P 12 1 146 ENSG00000133703 ENST00000311936
109 1 + missense_variant 25378562 1 C 1 T A/T 12 1 146.0 ENSG00000133703 ENST00000311936
110 1 + missense_variant 25378594 1 C 1 G R/T 12 1 135 ENSG00000133703 ENST00000311936
111 1 + missense_variant 25378645 1 C 1 G C/S 12 1 118 ENSG00000133703 ENST00000311936
112 1 + missense_variant 25380240 1 C 1 A R/M 12 1 73.0 ENSG00000133703 ENST00000311936
113 1 + missense_variant 25380254 1 C 1 A R/S 12 1 68 ENSG00000133703 ENST00000311936
114 1 + missense_variant 25380271 1 C 1 T E/K 12 1 63.0 ENSG00000133703 ENST00000311936
115 1 + missense_variant 25380274 1 C 1 T E/K 12 1 62 ENSG00000133703 ENST00000311936
116 1 + missense_variant 25380275 1 T 1 G Q/H 12 1 61.0 ENSG00000133703 ENST00000311936
117 1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000256078
118 1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000256078
119 1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000256078
120 1 + missense_variant 25380282 1 G 1 C A/G 12 1 59 ENSG00000133703 ENST00000256078
121 1 + missense_variant 25380271 1 C 1 T E/K 12 1 63.0 ENSG00000133703 ENST00000256078
122 1 + missense_variant 25380274 1 C 1 T E/K 12 1 62 ENSG00000133703 ENST00000256078
123 1 + missense_variant 25380275 1 T 1 G Q/H 12 1 61.0 ENSG00000133703 ENST00000256078
124 1 + missense_variant 25380277 1 GA 1 TT GQ/GK 12 1 60-61 ENSG00000133703 ENST00000256078
125 0 + synonymous_variant 25380278 0 A 1 G - 12 1 60 ENSG00000133703 ENST00000256078
126 0 + synonymous_variant 25380278 0 A 1 T - 12 1 60 ENSG00000133703 ENST00000256078
127 1 + missense_variant 25380282 1 G 1 T A/E 12 1 59 ENSG00000133703 ENST00000256078
128 0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000256078
129 1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000256078
130 1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000256078
131 0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000256078
132 1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000256078
133 0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000256078
134 1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000256078
135 1 + missense_variant 25380277 1 GA 1 TT GQ/GK 12 1 60-61 ENSG00000133703 ENST00000311936
136 0 + synonymous_variant 25380278 0 A 1 T - 12 1 60 ENSG00000133703 ENST00000311936
137 1 + missense_variant 25380240 1 C 1 A R/M 12 1 73.0 ENSG00000133703 ENST00000256078
138 1 + missense_variant 25380282 1 G 1 C A/G 12 1 59 ENSG00000133703 ENST00000311936
139 1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000556131
140 0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000556131
141 1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000556131
142 0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000556131
143 1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000556131
144 1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000556131
145 1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000556131
146 1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000556131
147 1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000556131
148 1 + missense_variant 25362743 1 A 1 G C/R 12 1 72 ENSG00000133703 ENST00000557334
149 0 + inframe_deletion 25362744 0 CTTTGT 1 - - 12 1 70-71 ENSG00000133703 ENST00000557334
150 1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000557334
151 1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000557334
152 0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000557334
153 1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000557334
154 0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000556131
155 1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000556131
156 1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000556131
157 1 + missense_variant 25398281 1 C 1 A G/V 12 1 13 ENSG00000133703 ENST00000311936
158 1 + missense_variant 25380282 1 G 1 T A/E 12 1 59 ENSG00000133703 ENST00000311936
159 1 + missense_variant 25398211 1 T 1 C I/M 12 1 36 ENSG00000133703 ENST00000311936
160 1 + missense_variant 25398213 1 T 1 A I/L 12 1 36 ENSG00000133703 ENST00000311936
161 0 + synonymous_variant 25398250 0 T 1 C - 12 1 23 ENSG00000133703 ENST00000311936
162 1 + missense_variant 25398260 1 G 1 C T/R 12 1 20 ENSG00000133703 ENST00000311936
163 0 + synonymous_variant 25398280 0 G 1 A - 12 1 13 ENSG00000133703 ENST00000311936
164 0 + synonymous_variant 25398283 0 A 1 C - 12 1 12 ENSG00000133703 ENST00000311936
165 1 - missense_variant 25398284 1 G 1 C G/A 12 1 12 ENSG00000133703 ENST00000311936
166 1 + missense_variant 25398285 1 C 1 A G/C 12 1 12.0 ENSG00000133703 ENST00000311936
167 1 + missense_variant 25398285 1 C 1 G G/R 12 1 12.0 ENSG00000133703 ENST00000311936
168 1 + missense_variant 25398306 1 T 1 C K/E 12 1 5 ENSG00000133703 ENST00000311936
169 1 - missense_variant 25362743 1 A 1 T S/C 12 1 185 ENSG00000133703 ENST00000311936
170 1 - missense_variant 25378647 1 A 1 T K/N 12 1 117 ENSG00000133703 ENST00000311936
171 1 - missense_variant 25398282 1 G 1 T G/C 12 1 13 ENSG00000133703 ENST00000311936
172 1 + missense_variant 25380254 1 C 1 A R/S 12 1 68 ENSG00000133703 ENST00000256078
173 1 + missense_variant 25378645 1 C 1 G C/S 12 1 118 ENSG00000133703 ENST00000256078
174 1 + missense_variant 25378594 1 C 1 G R/T 12 1 135 ENSG00000133703 ENST00000256078
175 1 + missense_variant 25368454 1 C 1 T R/Q 12 1 164 ENSG00000133703 ENST00000256078
176 1 + missense_variant 25368473 1 T 1 C T/A 12 1 158 ENSG00000133703 ENST00000256078
177 1 + missense_variant 25378557 1 C 1 G K/N 12 1 147 ENSG00000133703 ENST00000256078
178 1 + missense_variant 25378562 1 C 1 G A/P 12 1 146 ENSG00000133703 ENST00000256078
179 1 + missense_variant 25378562 1 C 1 T A/T 12 1 146.0 ENSG00000133703 ENST00000256078

View File

@ -1,49 +1,49 @@
MUTS_PAM STRAND MOST_SEVERE START MUTS_PAM_SAMPLES REF MUTS_CS ALT AA_CHANGE CHR MUTS_CS_SAMPLES PROTEIN_POS GENE TRANSCRIPT MUTS_PAM STRAND MOST_SEVERE START MUTS_PAM_SAMPLES REF MUTS_CS ALT AA_CHANGE CHR MUTS_CS_SAMPLES PROTEIN_POS GENE TRANSCRIPT
2 + missense_variant 3119330 2 G 2 A R/Q 17 2 139 ENSG00000172146 ENST00000304094 2 + missense_variant 3119330 2 G 2 A R/Q 17 2 139 ENSG00000172146 ENST00000304094
2 + missense_variant 3119138 2 C 2 T S/L 17 2 75 ENSG00000172146 ENST00000304094 2 + missense_variant 3119138 2 C 2 T S/L 17 2 75 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119772 0 C 2 T - 17 2 286 ENSG00000172146 ENST00000304094 0 + synonymous_variant 3119772 0 C 2 T - 17 2 286 ENSG00000172146 ENST00000304094
1 + missense_variant 3119791 1 C 1 T R/W 17 1 293 ENSG00000172146 ENST00000304094 1 + missense_variant 3119791 1 C 1 T R/W 17 1 293 ENSG00000172146 ENST00000304094
1 + missense_variant 3119799 1 G 1 A M/I 17 1 295 ENSG00000172146 ENST00000304094 1 + missense_variant 3119799 1 G 1 A M/I 17 1 295 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119805 0 T 1 C - 17 1 297 ENSG00000172146 ENST00000304094 0 + synonymous_variant 3119805 0 T 1 C - 17 1 297 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119823 0 C 1 T - 17 1 303 ENSG00000172146 ENST00000304094 0 + synonymous_variant 3119823 0 C 1 T - 17 1 303 ENSG00000172146 ENST00000304094
1 + missense_variant 3119786 1 G 1 A R/K 17 1 291 ENSG00000172146 ENST00000304094 1 + missense_variant 3119786 1 G 1 A R/K 17 1 291 ENSG00000172146 ENST00000304094
1 + missense_variant 3119744 1 C 1 G T/R 17 1 277 ENSG00000172146 ENST00000304094 1 + missense_variant 3119744 1 C 1 G T/R 17 1 277 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119691 0 C 1 T - 17 1 259 ENSG00000172146 ENST00000304094 0 + synonymous_variant 3119691 0 C 1 T - 17 1 259 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119589 0 C 1 T - 17 1 225 ENSG00000172146 ENST00000304094 0 + synonymous_variant 3119589 0 C 1 T - 17 1 225 ENSG00000172146 ENST00000304094
1 + missense_variant 3119408 1 G 1 A S/N 17 1 165 ENSG00000172146 ENST00000304094 1 + missense_variant 3119408 1 G 1 A S/N 17 1 165 ENSG00000172146 ENST00000304094
1 + missense_variant 3119431 1 G 1 A E/K 17 1 173 ENSG00000172146 ENST00000304094 1 + missense_variant 3119431 1 G 1 A E/K 17 1 173 ENSG00000172146 ENST00000304094
1 + missense_variant 3119462 1 C 1 T P/L 17 1 183 ENSG00000172146 ENST00000304094 1 + missense_variant 3119462 1 C 1 T P/L 17 1 183 ENSG00000172146 ENST00000304094
1 + stop_gained 3119514 1 C 1 G - 17 1 200 ENSG00000172146 ENST00000304094 1 + stop_gained 3119514 1 C 1 G - 17 1 200 ENSG00000172146 ENST00000304094
1 + missense_variant 3119530 1 T 1 G F/V 17 1 206 ENSG00000172146 ENST00000304094 1 + missense_variant 3119530 1 T 1 G F/V 17 1 206 ENSG00000172146 ENST00000304094
1 + missense_variant 3119581 1 A 1 G T/A 17 1 223 ENSG00000172146 ENST00000304094 1 + missense_variant 3119581 1 A 1 G T/A 17 1 223 ENSG00000172146 ENST00000304094
1 + stop_gained 3119590 1 C 1 T - 17 1 226 ENSG00000172146 ENST00000304094 1 + stop_gained 3119590 1 C 1 T - 17 1 226 ENSG00000172146 ENST00000304094
1 + missense_variant 3119679 1 G 1 T M/I 17 1 255 ENSG00000172146 ENST00000304094 1 + missense_variant 3119679 1 G 1 T M/I 17 1 255 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119592 0 G 1 A - 17 1 226 ENSG00000172146 ENST00000304094 0 + synonymous_variant 3119592 0 G 1 A - 17 1 226 ENSG00000172146 ENST00000304094
1 + missense_variant 3119596 1 C 1 T P/S 17 1 228 ENSG00000172146 ENST00000304094 1 + missense_variant 3119596 1 C 1 T P/S 17 1 228 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119610 0 C 1 T - 17 1 232 ENSG00000172146 ENST00000304094 0 + synonymous_variant 3119610 0 C 1 T - 17 1 232 ENSG00000172146 ENST00000304094
1 + missense_variant 3119627 1 C 1 T S/F 17 1 238 ENSG00000172146 ENST00000304094 1 + missense_variant 3119627 1 C 1 T S/F 17 1 238 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119640 0 C 1 A - 17 1 242 ENSG00000172146 ENST00000304094 0 + synonymous_variant 3119640 0 C 1 A - 17 1 242 ENSG00000172146 ENST00000304094
1 + missense_variant 3119672 1 C 1 T T/I 17 1 253 ENSG00000172146 ENST00000304094 1 + missense_variant 3119672 1 C 1 T T/I 17 1 253 ENSG00000172146 ENST00000304094
1 + missense_variant 3119395 1 C 1 A L/M 17 1 161 ENSG00000172146 ENST00000304094 1 + missense_variant 3119395 1 C 1 A L/M 17 1 161 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119403 0 A 1 G - 17 1 163 ENSG00000172146 ENST00000304094 0 + synonymous_variant 3119403 0 A 1 G - 17 1 163 ENSG00000172146 ENST00000304094
1 + missense_variant 3119386 1 C 1 T P/S 17 1 158 ENSG00000172146 ENST00000304094 1 + missense_variant 3119386 1 C 1 T P/S 17 1 158 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119289 0 C 1 A - 17 1 125 ENSG00000172146 ENST00000304094 0 + synonymous_variant 3119289 0 C 1 A - 17 1 125 ENSG00000172146 ENST00000304094
1 + stop_gained 3118972 1 C 1 T - 17 1 20 ENSG00000172146 ENST00000304094 1 + stop_gained 3118972 1 C 1 T - 17 1 20 ENSG00000172146 ENST00000304094
1 + missense_variant 3118978 1 G 1 A E/K 17 1 22 ENSG00000172146 ENST00000304094 1 + missense_variant 3118978 1 G 1 A E/K 17 1 22 ENSG00000172146 ENST00000304094
1 + missense_variant 3118986 1 A 1 C E/D 17 1 24 ENSG00000172146 ENST00000304094 1 + missense_variant 3118986 1 A 1 C E/D 17 1 24 ENSG00000172146 ENST00000304094
1 + missense_variant 3119002 1 C 1 T L/F 17 1 30 ENSG00000172146 ENST00000304094 1 + missense_variant 3119002 1 C 1 T L/F 17 1 30 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119029 0 T 1 C - 17 1 39 ENSG00000172146 ENST00000304094 0 + synonymous_variant 3119029 0 T 1 C - 17 1 39 ENSG00000172146 ENST00000304094
1 + missense_variant 3119074 1 C 1 T R/C 17 1 54 ENSG00000172146 ENST00000304094 1 + missense_variant 3119074 1 C 1 T R/C 17 1 54 ENSG00000172146 ENST00000304094
1 + missense_variant 3119075 1 G 1 A R/H 17 1 54 ENSG00000172146 ENST00000304094 1 + missense_variant 3119075 1 G 1 A R/H 17 1 54 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119076 0 C 1 T - 17 1 54 ENSG00000172146 ENST00000304094 0 + synonymous_variant 3119076 0 C 1 T - 17 1 54 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119115 0 C 1 T - 17 1 67 ENSG00000172146 ENST00000304094 0 + synonymous_variant 3119115 0 C 1 T - 17 1 67 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119139 0 G 1 A - 17 1 75 ENSG00000172146 ENST00000304094 0 + synonymous_variant 3119139 0 G 1 A - 17 1 75 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3119187 0 C 1 T - 17 1 91 ENSG00000172146 ENST00000304094 0 + synonymous_variant 3119187 0 C 1 T - 17 1 91 ENSG00000172146 ENST00000304094
1 + missense_variant 3119210 1 C 1 T T/M 17 1 99 ENSG00000172146 ENST00000304094 1 + missense_variant 3119210 1 C 1 T T/M 17 1 99 ENSG00000172146 ENST00000304094
1 + missense_variant 3119217 1 G 1 A M/I 17 1 101 ENSG00000172146 ENST00000304094 1 + missense_variant 3119217 1 G 1 A M/I 17 1 101 ENSG00000172146 ENST00000304094
1 + missense_variant 3119264 1 C 1 T A/V 17 1 117 ENSG00000172146 ENST00000304094 1 + missense_variant 3119264 1 C 1 T A/V 17 1 117 ENSG00000172146 ENST00000304094
1 + missense_variant 3119269 1 G 1 A A/T 17 1 119 ENSG00000172146 ENST00000304094 1 + missense_variant 3119269 1 G 1 A A/T 17 1 119 ENSG00000172146 ENST00000304094
1 + missense_variant 3118961 1 G 1 A G/E 17 1 16 ENSG00000172146 ENST00000304094 1 + missense_variant 3118961 1 G 1 A G/E 17 1 16 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3118956 0 C 1 A - 17 1 14 ENSG00000172146 ENST00000304094 0 + synonymous_variant 3118956 0 C 1 A - 17 1 14 ENSG00000172146 ENST00000304094
0 + synonymous_variant 3118944 0 G 1 A - 17 1 10 ENSG00000172146 ENST00000304094 0 + synonymous_variant 3118944 0 G 1 A - 17 1 10 ENSG00000172146 ENST00000304094
1 + missense_variant 3118928 1 A 1 C N/T 17 1 5 ENSG00000172146 ENST00000304094 1 + missense_variant 3118928 1 A 1 C N/T 17 1 5 ENSG00000172146 ENST00000304094

1 MUTS_PAM STRAND MOST_SEVERE START MUTS_PAM_SAMPLES REF MUTS_CS ALT AA_CHANGE CHR MUTS_CS_SAMPLES PROTEIN_POS GENE TRANSCRIPT
2 2 + missense_variant 3119330 2 G 2 A R/Q 17 2 139 ENSG00000172146 ENST00000304094
3 2 + missense_variant 3119138 2 C 2 T S/L 17 2 75 ENSG00000172146 ENST00000304094
4 0 + synonymous_variant 3119772 0 C 2 T - 17 2 286 ENSG00000172146 ENST00000304094
5 1 + missense_variant 3119791 1 C 1 T R/W 17 1 293 ENSG00000172146 ENST00000304094
6 1 + missense_variant 3119799 1 G 1 A M/I 17 1 295 ENSG00000172146 ENST00000304094
7 0 + synonymous_variant 3119805 0 T 1 C - 17 1 297 ENSG00000172146 ENST00000304094
8 0 + synonymous_variant 3119823 0 C 1 T - 17 1 303 ENSG00000172146 ENST00000304094
9 1 + missense_variant 3119786 1 G 1 A R/K 17 1 291 ENSG00000172146 ENST00000304094
10 1 + missense_variant 3119744 1 C 1 G T/R 17 1 277 ENSG00000172146 ENST00000304094
11 0 + synonymous_variant 3119691 0 C 1 T - 17 1 259 ENSG00000172146 ENST00000304094
12 0 + synonymous_variant 3119589 0 C 1 T - 17 1 225 ENSG00000172146 ENST00000304094
13 1 + missense_variant 3119408 1 G 1 A S/N 17 1 165 ENSG00000172146 ENST00000304094
14 1 + missense_variant 3119431 1 G 1 A E/K 17 1 173 ENSG00000172146 ENST00000304094
15 1 + missense_variant 3119462 1 C 1 T P/L 17 1 183 ENSG00000172146 ENST00000304094
16 1 + stop_gained 3119514 1 C 1 G - 17 1 200 ENSG00000172146 ENST00000304094
17 1 + missense_variant 3119530 1 T 1 G F/V 17 1 206 ENSG00000172146 ENST00000304094
18 1 + missense_variant 3119581 1 A 1 G T/A 17 1 223 ENSG00000172146 ENST00000304094
19 1 + stop_gained 3119590 1 C 1 T - 17 1 226 ENSG00000172146 ENST00000304094
20 1 + missense_variant 3119679 1 G 1 T M/I 17 1 255 ENSG00000172146 ENST00000304094
21 0 + synonymous_variant 3119592 0 G 1 A - 17 1 226 ENSG00000172146 ENST00000304094
22 1 + missense_variant 3119596 1 C 1 T P/S 17 1 228 ENSG00000172146 ENST00000304094
23 0 + synonymous_variant 3119610 0 C 1 T - 17 1 232 ENSG00000172146 ENST00000304094
24 1 + missense_variant 3119627 1 C 1 T S/F 17 1 238 ENSG00000172146 ENST00000304094
25 0 + synonymous_variant 3119640 0 C 1 A - 17 1 242 ENSG00000172146 ENST00000304094
26 1 + missense_variant 3119672 1 C 1 T T/I 17 1 253 ENSG00000172146 ENST00000304094
27 1 + missense_variant 3119395 1 C 1 A L/M 17 1 161 ENSG00000172146 ENST00000304094
28 0 + synonymous_variant 3119403 0 A 1 G - 17 1 163 ENSG00000172146 ENST00000304094
29 1 + missense_variant 3119386 1 C 1 T P/S 17 1 158 ENSG00000172146 ENST00000304094
30 0 + synonymous_variant 3119289 0 C 1 A - 17 1 125 ENSG00000172146 ENST00000304094
31 1 + stop_gained 3118972 1 C 1 T - 17 1 20 ENSG00000172146 ENST00000304094
32 1 + missense_variant 3118978 1 G 1 A E/K 17 1 22 ENSG00000172146 ENST00000304094
33 1 + missense_variant 3118986 1 A 1 C E/D 17 1 24 ENSG00000172146 ENST00000304094
34 1 + missense_variant 3119002 1 C 1 T L/F 17 1 30 ENSG00000172146 ENST00000304094
35 0 + synonymous_variant 3119029 0 T 1 C - 17 1 39 ENSG00000172146 ENST00000304094
36 1 + missense_variant 3119074 1 C 1 T R/C 17 1 54 ENSG00000172146 ENST00000304094
37 1 + missense_variant 3119075 1 G 1 A R/H 17 1 54 ENSG00000172146 ENST00000304094
38 0 + synonymous_variant 3119076 0 C 1 T - 17 1 54 ENSG00000172146 ENST00000304094
39 0 + synonymous_variant 3119115 0 C 1 T - 17 1 67 ENSG00000172146 ENST00000304094
40 0 + synonymous_variant 3119139 0 G 1 A - 17 1 75 ENSG00000172146 ENST00000304094
41 0 + synonymous_variant 3119187 0 C 1 T - 17 1 91 ENSG00000172146 ENST00000304094
42 1 + missense_variant 3119210 1 C 1 T T/M 17 1 99 ENSG00000172146 ENST00000304094
43 1 + missense_variant 3119217 1 G 1 A M/I 17 1 101 ENSG00000172146 ENST00000304094
44 1 + missense_variant 3119264 1 C 1 T A/V 17 1 117 ENSG00000172146 ENST00000304094
45 1 + missense_variant 3119269 1 G 1 A A/T 17 1 119 ENSG00000172146 ENST00000304094
46 1 + missense_variant 3118961 1 G 1 A G/E 17 1 16 ENSG00000172146 ENST00000304094
47 0 + synonymous_variant 3118956 0 C 1 A - 17 1 14 ENSG00000172146 ENST00000304094
48 0 + synonymous_variant 3118944 0 G 1 A - 17 1 10 ENSG00000172146 ENST00000304094
49 1 + missense_variant 3118928 1 A 1 C N/T 17 1 5 ENSG00000172146 ENST00000304094

View File

@ -1,113 +1,113 @@
MUTS_PAM STRAND MOST_SEVERE START MUTS_PAM_SAMPLES REF MUTS_CS ALT AA_CHANGE CHR MUTS_CS_SAMPLES PROTEIN_POS GENE TRANSCRIPT MUTS_PAM STRAND MOST_SEVERE START MUTS_PAM_SAMPLES REF MUTS_CS ALT AA_CHANGE CHR MUTS_CS_SAMPLES PROTEIN_POS GENE TRANSCRIPT
5 + missense_variant 112926888 5 G 5 T G/V 12 5 503 ENSG00000179295 ENST00000351677 5 + missense_variant 112926888 5 G 5 T G/V 12 5 503 ENSG00000179295 ENST00000351677
4 + missense_variant 112926270 4 C 4 T T/M 12 4 468 ENSG00000179295 ENST00000351677 4 + missense_variant 112926270 4 C 4 T T/M 12 4 468 ENSG00000179295 ENST00000351677
3 + missense_variant 112888198 3 G 3 A A/T 12 3 72 ENSG00000179295 ENST00000392597 3 + missense_variant 112888198 3 G 3 A A/T 12 3 72 ENSG00000179295 ENST00000392597
3 + missense_variant 112888198 3 G 3 A A/T 12 3 72 ENSG00000179295 ENST00000351677 3 + missense_variant 112888198 3 G 3 A A/T 12 3 72 ENSG00000179295 ENST00000351677
2 + missense_variant 112926910 2 G 2 C Q/H 12 2 510 ENSG00000179295 ENST00000351677 2 + missense_variant 112926910 2 G 2 C Q/H 12 2 510 ENSG00000179295 ENST00000351677
2 + missense_variant 112926909 2 A 2 T Q/L 12 2 510 ENSG00000179295 ENST00000351677 2 + missense_variant 112926909 2 A 2 T Q/L 12 2 510 ENSG00000179295 ENST00000351677
2 + missense_variant 112926900 2 C 2 A T/K 12 2 507 ENSG00000179295 ENST00000351677 2 + missense_variant 112926900 2 C 2 A T/K 12 2 507 ENSG00000179295 ENST00000351677
2 + missense_variant 112891006 2 C 2 T H/Y 12 2 114 ENSG00000179295 ENST00000392597 2 + missense_variant 112891006 2 C 2 T H/Y 12 2 114 ENSG00000179295 ENST00000392597
2 + missense_variant 112888210 2 G 2 A E/K 12 2 76 ENSG00000179295 ENST00000392597 2 + missense_variant 112888210 2 G 2 A E/K 12 2 76 ENSG00000179295 ENST00000392597
2 + missense_variant 112888199 2 C 2 T A/V 12 2 72 ENSG00000179295 ENST00000392597 2 + missense_variant 112888199 2 C 2 T A/V 12 2 72 ENSG00000179295 ENST00000392597
2 + missense_variant 112888199 2 C 2 A A/D 12 2 72 ENSG00000179295 ENST00000392597 2 + missense_variant 112888199 2 C 2 A A/D 12 2 72 ENSG00000179295 ENST00000392597
2 + missense_variant 112891006 2 C 2 T H/Y 12 2 114 ENSG00000179295 ENST00000351677 2 + missense_variant 112891006 2 C 2 T H/Y 12 2 114 ENSG00000179295 ENST00000351677
2 + missense_variant 112888210 2 G 2 A E/K 12 2 76 ENSG00000179295 ENST00000351677 2 + missense_variant 112888210 2 G 2 A E/K 12 2 76 ENSG00000179295 ENST00000351677
2 + missense_variant 112888199 2 C 2 T A/V 12 2 72 ENSG00000179295 ENST00000351677 2 + missense_variant 112888199 2 C 2 T A/V 12 2 72 ENSG00000179295 ENST00000351677
2 + missense_variant 112888199 2 C 2 A A/D 12 2 72 ENSG00000179295 ENST00000351677 2 + missense_variant 112888199 2 C 2 A A/D 12 2 72 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112893822 0 T 1 C - 12 1 82 ENSG00000179295 ENST00000530818 0 + synonymous_variant 112893822 0 T 1 C - 12 1 82 ENSG00000179295 ENST00000530818
1 + missense_variant 112910837 1 C 1 G I/M 12 1 282 ENSG00000179295 ENST00000392597 1 + missense_variant 112910837 1 C 1 G I/M 12 1 282 ENSG00000179295 ENST00000392597
1 + missense_variant 112910844 1 T 1 G F/V 12 1 285.0 ENSG00000179295 ENST00000392597 1 + missense_variant 112910844 1 T 1 G F/V 12 1 285.0 ENSG00000179295 ENST00000392597
0 + synonymous_variant 112915507 0 A 1 G - 12 1 302 ENSG00000179295 ENST00000392597 0 + synonymous_variant 112915507 0 A 1 G - 12 1 302 ENSG00000179295 ENST00000392597
1 + missense_variant 112915523 1 A 1 G N/D 12 1 308 ENSG00000179295 ENST00000392597 1 + missense_variant 112915523 1 A 1 G N/D 12 1 308 ENSG00000179295 ENST00000392597
1 + missense_variant 112915743 1 A 1 G N/S 12 1 339 ENSG00000179295 ENST00000392597 1 + missense_variant 112915743 1 A 1 G N/S 12 1 339 ENSG00000179295 ENST00000392597
1 + missense_variant 112919908 1 T 1 G Y/D 12 1 375 ENSG00000179295 ENST00000392597 1 + missense_variant 112919908 1 T 1 G Y/D 12 1 375 ENSG00000179295 ENST00000392597
1 + frameshift_variant 112920002 1 - 1 T - 12 1 406 ENSG00000179295 ENST00000392597 1 + frameshift_variant 112920002 1 - 1 T - 12 1 406 ENSG00000179295 ENST00000392597
1 + missense_variant 112924286 1 C 1 T T/M 12 1 411 ENSG00000179295 ENST00000392597 1 + missense_variant 112924286 1 C 1 T T/M 12 1 411 ENSG00000179295 ENST00000392597
1 + stop_gained 112924308 1 C 1 A - 12 1 418 ENSG00000179295 ENST00000392597 1 + stop_gained 112924308 1 C 1 A - 12 1 418 ENSG00000179295 ENST00000392597
1 + missense_variant 112924331 1 A 1 T H/L 12 1 426 ENSG00000179295 ENST00000392597 1 + missense_variant 112924331 1 A 1 T H/L 12 1 426 ENSG00000179295 ENST00000392597
1 + missense_variant 112924336 1 G 1 A V/M 12 1 428 ENSG00000179295 ENST00000392597 1 + missense_variant 112924336 1 G 1 A V/M 12 1 428 ENSG00000179295 ENST00000392597
1 + missense_variant 112892383 1 G 1 C V/L 12 1 26 ENSG00000179295 ENST00000530818 1 + missense_variant 112892383 1 G 1 C V/L 12 1 26 ENSG00000179295 ENST00000530818
0 + synonymous_variant 112892409 0 T 1 C - 12 1 34 ENSG00000179295 ENST00000530818 0 + synonymous_variant 112892409 0 T 1 C - 12 1 34 ENSG00000179295 ENST00000530818
1 + stop_gained 112893784 1 G 1 T - 12 1 70 ENSG00000179295 ENST00000530818 1 + stop_gained 112893784 1 G 1 T - 12 1 70 ENSG00000179295 ENST00000530818
0 + synonymous_variant 112893798 0 A 1 G - 12 1 74 ENSG00000179295 ENST00000530818 0 + synonymous_variant 112893798 0 A 1 G - 12 1 74 ENSG00000179295 ENST00000530818
1 + missense_variant 112910775 1 C 1 T L/F 12 1 262 ENSG00000179295 ENST00000392597 1 + missense_variant 112910775 1 C 1 T L/F 12 1 262 ENSG00000179295 ENST00000392597
0 + synonymous_variant 112893822 0 T 1 C - 12 1 237 ENSG00000179295 ENST00000392597 0 + synonymous_variant 112893822 0 T 1 C - 12 1 237 ENSG00000179295 ENST00000392597
0 + synonymous_variant 112893802 0 C 1 A - 12 1 231 ENSG00000179295 ENST00000392597 0 + synonymous_variant 112893802 0 C 1 A - 12 1 231 ENSG00000179295 ENST00000392597
1 + missense_variant 112888211 1 A 1 C E/A 12 1 76 ENSG00000179295 ENST00000392597 1 + missense_variant 112888211 1 A 1 C E/A 12 1 76 ENSG00000179295 ENST00000392597
1 + missense_variant 112888165 1 G 1 T D/Y 12 1 61 ENSG00000179295 ENST00000392597 1 + missense_variant 112888165 1 G 1 T D/Y 12 1 61 ENSG00000179295 ENST00000392597
1 + missense_variant 112888189 1 G 1 A E/K 12 1 69.0 ENSG00000179295 ENST00000392597 1 + missense_variant 112888189 1 G 1 A E/K 12 1 69.0 ENSG00000179295 ENST00000392597
1 + missense_variant 112888189 1 G 1 A E/K 12 1 69 ENSG00000179295 ENST00000392597 1 + missense_variant 112888189 1 G 1 A E/K 12 1 69 ENSG00000179295 ENST00000392597
1 + missense_variant 112888195 1 T 1 C F/L 12 1 71 ENSG00000179295 ENST00000392597 1 + missense_variant 112888195 1 T 1 C F/L 12 1 71 ENSG00000179295 ENST00000392597
1 + missense_variant 112888197 1 T 1 A F/L 12 1 71 ENSG00000179295 ENST00000392597 1 + missense_variant 112888197 1 T 1 A F/L 12 1 71 ENSG00000179295 ENST00000392597
1 + missense_variant 112888211 1 A 1 C E/A 12 1 76.0 ENSG00000179295 ENST00000392597 1 + missense_variant 112888211 1 A 1 C E/A 12 1 76.0 ENSG00000179295 ENST00000392597
1 + missense_variant 112891015 1 C 1 T L/F 12 1 117 ENSG00000179295 ENST00000392597 1 + missense_variant 112891015 1 C 1 T L/F 12 1 117 ENSG00000179295 ENST00000392597
0 + synonymous_variant 112893798 0 A 1 G - 12 1 229 ENSG00000179295 ENST00000392597 0 + synonymous_variant 112893798 0 A 1 G - 12 1 229 ENSG00000179295 ENST00000392597
1 + missense_variant 112891073 1 T 1 A L/H 12 1 136 ENSG00000179295 ENST00000392597 1 + missense_variant 112891073 1 T 1 A L/H 12 1 136 ENSG00000179295 ENST00000392597
0 + synonymous_variant 112891116 0 T 1 C - 12 1 150 ENSG00000179295 ENST00000392597 0 + synonymous_variant 112891116 0 T 1 C - 12 1 150 ENSG00000179295 ENST00000392597
1 + missense_variant 112891129 1 G 1 T D/Y 12 1 155 ENSG00000179295 ENST00000392597 1 + missense_variant 112891129 1 G 1 T D/Y 12 1 155 ENSG00000179295 ENST00000392597
1 + missense_variant 112892383 1 G 1 C V/L 12 1 181 ENSG00000179295 ENST00000392597 1 + missense_variant 112892383 1 G 1 C V/L 12 1 181 ENSG00000179295 ENST00000392597
0 + synonymous_variant 112892409 0 T 1 C - 12 1 189 ENSG00000179295 ENST00000392597 0 + synonymous_variant 112892409 0 T 1 C - 12 1 189 ENSG00000179295 ENST00000392597
1 + stop_gained 112893784 1 G 1 T - 12 1 225 ENSG00000179295 ENST00000392597 1 + stop_gained 112893784 1 G 1 T - 12 1 225 ENSG00000179295 ENST00000392597
0 + synonymous_variant 112893802 0 C 1 A - 12 1 76 ENSG00000179295 ENST00000530818 0 + synonymous_variant 112893802 0 C 1 A - 12 1 76 ENSG00000179295 ENST00000530818
1 + missense_variant 112888163 1 G 1 T G/V 12 1 60 ENSG00000179295 ENST00000392597 1 + missense_variant 112888163 1 G 1 T G/V 12 1 60 ENSG00000179295 ENST00000392597
1 + missense_variant 112888165 1 G 1 A D/N 12 1 61 ENSG00000179295 ENST00000392597 1 + missense_variant 112888165 1 G 1 A D/N 12 1 61 ENSG00000179295 ENST00000392597
1 + missense_variant 112888162 1 G 1 C G/R 12 1 60 ENSG00000179295 ENST00000392597 1 + missense_variant 112888162 1 G 1 C G/R 12 1 60 ENSG00000179295 ENST00000392597
0 + synonymous_variant 112893822 0 T 1 C - 12 1 237 ENSG00000179295 ENST00000351677 0 + synonymous_variant 112893822 0 T 1 C - 12 1 237 ENSG00000179295 ENST00000351677
1 + missense_variant 112888165 1 G 1 T D/Y 12 1 61 ENSG00000179295 ENST00000351677 1 + missense_variant 112888165 1 G 1 T D/Y 12 1 61 ENSG00000179295 ENST00000351677
1 + missense_variant 112888189 1 G 1 A E/K 12 1 69.0 ENSG00000179295 ENST00000351677 1 + missense_variant 112888189 1 G 1 A E/K 12 1 69.0 ENSG00000179295 ENST00000351677
1 + missense_variant 112888189 1 G 1 A E/K 12 1 69 ENSG00000179295 ENST00000351677 1 + missense_variant 112888189 1 G 1 A E/K 12 1 69 ENSG00000179295 ENST00000351677
1 + missense_variant 112888195 1 T 1 C F/L 12 1 71 ENSG00000179295 ENST00000351677 1 + missense_variant 112888195 1 T 1 C F/L 12 1 71 ENSG00000179295 ENST00000351677
1 + missense_variant 112888197 1 T 1 A F/L 12 1 71 ENSG00000179295 ENST00000351677 1 + missense_variant 112888197 1 T 1 A F/L 12 1 71 ENSG00000179295 ENST00000351677
1 + missense_variant 112888211 1 A 1 C E/A 12 1 76.0 ENSG00000179295 ENST00000351677 1 + missense_variant 112888211 1 A 1 C E/A 12 1 76.0 ENSG00000179295 ENST00000351677
1 + missense_variant 112888211 1 A 1 C E/A 12 1 76 ENSG00000179295 ENST00000351677 1 + missense_variant 112888211 1 A 1 C E/A 12 1 76 ENSG00000179295 ENST00000351677
1 + missense_variant 112891015 1 C 1 T L/F 12 1 117 ENSG00000179295 ENST00000351677 1 + missense_variant 112891015 1 C 1 T L/F 12 1 117 ENSG00000179295 ENST00000351677
1 + missense_variant 112891073 1 T 1 A L/H 12 1 136 ENSG00000179295 ENST00000351677 1 + missense_variant 112891073 1 T 1 A L/H 12 1 136 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112891116 0 T 1 C - 12 1 150 ENSG00000179295 ENST00000351677 0 + synonymous_variant 112891116 0 T 1 C - 12 1 150 ENSG00000179295 ENST00000351677
1 + missense_variant 112891129 1 G 1 T D/Y 12 1 155 ENSG00000179295 ENST00000351677 1 + missense_variant 112891129 1 G 1 T D/Y 12 1 155 ENSG00000179295 ENST00000351677
1 + missense_variant 112892383 1 G 1 C V/L 12 1 181 ENSG00000179295 ENST00000351677 1 + missense_variant 112892383 1 G 1 C V/L 12 1 181 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112892409 0 T 1 C - 12 1 189 ENSG00000179295 ENST00000351677 0 + synonymous_variant 112892409 0 T 1 C - 12 1 189 ENSG00000179295 ENST00000351677
1 + stop_gained 112893784 1 G 1 T - 12 1 225 ENSG00000179295 ENST00000351677 1 + stop_gained 112893784 1 G 1 T - 12 1 225 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112893798 0 A 1 G - 12 1 229 ENSG00000179295 ENST00000351677 0 + synonymous_variant 112893798 0 A 1 G - 12 1 229 ENSG00000179295 ENST00000351677
1 + missense_variant 112888165 1 G 1 A D/N 12 1 61 ENSG00000179295 ENST00000351677 1 + missense_variant 112888165 1 G 1 A D/N 12 1 61 ENSG00000179295 ENST00000351677
1 + missense_variant 112888163 1 G 1 T G/V 12 1 60 ENSG00000179295 ENST00000351677 1 + missense_variant 112888163 1 G 1 T G/V 12 1 60 ENSG00000179295 ENST00000351677
1 + missense_variant 112888162 1 G 1 C G/R 12 1 60 ENSG00000179295 ENST00000351677 1 + missense_variant 112888162 1 G 1 C G/R 12 1 60 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112888161 0 T 1 C - 12 1 59 ENSG00000179295 ENST00000351677 0 + synonymous_variant 112888161 0 T 1 C - 12 1 59 ENSG00000179295 ENST00000351677
1 + missense_variant 112884103 1 G 1 A G/D 12 1 13 ENSG00000179295 ENST00000351677 1 + missense_variant 112884103 1 G 1 A G/D 12 1 13 ENSG00000179295 ENST00000351677
1 + missense_variant 112888139 1 C 1 G T/S 12 1 52 ENSG00000179295 ENST00000351677 1 + missense_variant 112888139 1 C 1 G T/S 12 1 52 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112893802 0 C 1 A - 12 1 231 ENSG00000179295 ENST00000351677 0 + synonymous_variant 112893802 0 C 1 A - 12 1 231 ENSG00000179295 ENST00000351677
1 + missense_variant 112910775 1 C 1 T L/F 12 1 262 ENSG00000179295 ENST00000351677 1 + missense_variant 112910775 1 C 1 T L/F 12 1 262 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112888161 0 T 1 C - 12 1 59 ENSG00000179295 ENST00000392597 0 + synonymous_variant 112888161 0 T 1 C - 12 1 59 ENSG00000179295 ENST00000392597
1 + missense_variant 112910837 1 C 1 G I/M 12 1 282 ENSG00000179295 ENST00000351677 1 + missense_variant 112910837 1 C 1 G I/M 12 1 282 ENSG00000179295 ENST00000351677
1 + missense_variant 112926887 1 G 1 C G/R 12 1 503 ENSG00000179295 ENST00000351677 1 + missense_variant 112926887 1 G 1 C G/R 12 1 503 ENSG00000179295 ENST00000351677
1 + missense_variant 112926908 1 C 1 G Q/E 12 1 510.0 ENSG00000179295 ENST00000351677 1 + missense_variant 112926908 1 C 1 G Q/E 12 1 510.0 ENSG00000179295 ENST00000351677
1 + missense_variant 112939963 1 G 1 C G/R 12 1 539 ENSG00000179295 ENST00000351677 1 + missense_variant 112939963 1 G 1 C G/R 12 1 539 ENSG00000179295 ENST00000351677
1 + missense_variant 112939970 1 A 1 T E/V 12 1 541 ENSG00000179295 ENST00000351677 1 + missense_variant 112939970 1 A 1 T E/V 12 1 541 ENSG00000179295 ENST00000351677
1 + missense_variant 112939981 1 A 1 C I/L 12 1 545 ENSG00000179295 ENST00000351677 1 + missense_variant 112939981 1 A 1 C I/L 12 1 545 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112939993 0 C 1 T - 12 1 549 ENSG00000179295 ENST00000351677 0 + synonymous_variant 112939993 0 C 1 T - 12 1 549 ENSG00000179295 ENST00000351677
1 + missense_variant 112939999 1 G 1 A D/N 12 1 551 ENSG00000179295 ENST00000351677 1 + missense_variant 112939999 1 G 1 A D/N 12 1 551 ENSG00000179295 ENST00000351677
1 + missense_variant 112940012 1 G 1 A G/E 12 1 555 ENSG00000179295 ENST00000351677 1 + missense_variant 112940012 1 G 1 A G/E 12 1 555 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112940025 0 T 1 C - 12 1 559 ENSG00000179295 ENST00000351677 0 + synonymous_variant 112940025 0 T 1 C - 12 1 559 ENSG00000179295 ENST00000351677
1 + missense_variant 112940027 1 T 1 C L/P 12 1 560 ENSG00000179295 ENST00000351677 1 + missense_variant 112940027 1 T 1 C L/P 12 1 560 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112940031 0 G 1 A - 12 1 561 ENSG00000179295 ENST00000351677 0 + synonymous_variant 112940031 0 G 1 A - 12 1 561 ENSG00000179295 ENST00000351677
1 + missense_variant 112940036 1 G 1 T C/F 12 1 563 ENSG00000179295 ENST00000351677 1 + missense_variant 112940036 1 G 1 T C/F 12 1 563 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112940052 0 C 1 T - 12 1 568 ENSG00000179295 ENST00000351677 0 + synonymous_variant 112940052 0 C 1 T - 12 1 568 ENSG00000179295 ENST00000351677
1 + missense_variant 112884103 1 G 1 A G/D 12 1 13 ENSG00000179295 ENST00000392597 1 + missense_variant 112884103 1 G 1 A G/D 12 1 13 ENSG00000179295 ENST00000392597
1 + missense_variant 112888139 1 C 1 G T/S 12 1 52 ENSG00000179295 ENST00000392597 1 + missense_variant 112888139 1 C 1 G T/S 12 1 52 ENSG00000179295 ENST00000392597
1 + missense_variant 112926885 1 C 1 T S/L 12 1 502 ENSG00000179295 ENST00000351677 1 + missense_variant 112926885 1 C 1 T S/L 12 1 502 ENSG00000179295 ENST00000351677
1 + missense_variant 112926884 1 T 1 C S/P 12 1 502 ENSG00000179295 ENST00000351677 1 + missense_variant 112926884 1 T 1 C S/P 12 1 502 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112926862 0 C 1 T - 12 1 494 ENSG00000179295 ENST00000351677 0 + synonymous_variant 112926862 0 C 1 T - 12 1 494 ENSG00000179295 ENST00000351677
1 + missense_variant 112924286 1 C 1 T T/M 12 1 411 ENSG00000179295 ENST00000351677 1 + missense_variant 112924286 1 C 1 T T/M 12 1 411 ENSG00000179295 ENST00000351677
1 + missense_variant 112910844 1 T 1 G F/V 12 1 285.0 ENSG00000179295 ENST00000351677 1 + missense_variant 112910844 1 T 1 G F/V 12 1 285.0 ENSG00000179295 ENST00000351677
0 + synonymous_variant 112915507 0 A 1 G - 12 1 302 ENSG00000179295 ENST00000351677 0 + synonymous_variant 112915507 0 A 1 G - 12 1 302 ENSG00000179295 ENST00000351677
1 + missense_variant 112915523 1 A 1 G N/D 12 1 308 ENSG00000179295 ENST00000351677 1 + missense_variant 112915523 1 A 1 G N/D 12 1 308 ENSG00000179295 ENST00000351677
1 + missense_variant 112915743 1 A 1 G N/S 12 1 339 ENSG00000179295 ENST00000351677 1 + missense_variant 112915743 1 A 1 G N/S 12 1 339 ENSG00000179295 ENST00000351677
1 + missense_variant 112919908 1 T 1 G Y/D 12 1 375 ENSG00000179295 ENST00000351677 1 + missense_variant 112919908 1 T 1 G Y/D 12 1 375 ENSG00000179295 ENST00000351677
1 + frameshift_variant 112920002 1 - 1 T - 12 1 406 ENSG00000179295 ENST00000351677 1 + frameshift_variant 112920002 1 - 1 T - 12 1 406 ENSG00000179295 ENST00000351677
1 + stop_gained 112924308 1 C 1 A - 12 1 418 ENSG00000179295 ENST00000351677 1 + stop_gained 112924308 1 C 1 A - 12 1 418 ENSG00000179295 ENST00000351677
1 + missense_variant 112926852 1 C 1 T P/L 12 1 491 ENSG00000179295 ENST00000351677 1 + missense_variant 112926852 1 C 1 T P/L 12 1 491 ENSG00000179295 ENST00000351677
1 + missense_variant 112924331 1 A 1 T H/L 12 1 426 ENSG00000179295 ENST00000351677 1 + missense_variant 112924331 1 A 1 T H/L 12 1 426 ENSG00000179295 ENST00000351677
1 + missense_variant 112924336 1 G 1 A V/M 12 1 428 ENSG00000179295 ENST00000351677 1 + missense_variant 112924336 1 G 1 A V/M 12 1 428 ENSG00000179295 ENST00000351677
1 + missense_variant 112926248 1 G 1 A A/T 12 1 461 ENSG00000179295 ENST00000351677 1 + missense_variant 112926248 1 G 1 A A/T 12 1 461 ENSG00000179295 ENST00000351677
1 + missense_variant 112926249 1 C 1 G A/G 12 1 461 ENSG00000179295 ENST00000351677 1 + missense_variant 112926249 1 C 1 G A/G 12 1 461 ENSG00000179295 ENST00000351677
1 + missense_variant 112926291 1 TT 1 CA L/P 12 1 475 ENSG00000179295 ENST00000351677 1 + missense_variant 112926291 1 TT 1 CA L/P 12 1 475 ENSG00000179295 ENST00000351677
1 + missense_variant 112926839 1 G 1 T D/Y 12 1 487 ENSG00000179295 ENST00000351677 1 + missense_variant 112926839 1 G 1 T D/Y 12 1 487 ENSG00000179295 ENST00000351677

1 MUTS_PAM STRAND MOST_SEVERE START MUTS_PAM_SAMPLES REF MUTS_CS ALT AA_CHANGE CHR MUTS_CS_SAMPLES PROTEIN_POS GENE TRANSCRIPT
2 5 + missense_variant 112926888 5 G 5 T G/V 12 5 503 ENSG00000179295 ENST00000351677
3 4 + missense_variant 112926270 4 C 4 T T/M 12 4 468 ENSG00000179295 ENST00000351677
4 3 + missense_variant 112888198 3 G 3 A A/T 12 3 72 ENSG00000179295 ENST00000392597
5 3 + missense_variant 112888198 3 G 3 A A/T 12 3 72 ENSG00000179295 ENST00000351677
6 2 + missense_variant 112926910 2 G 2 C Q/H 12 2 510 ENSG00000179295 ENST00000351677
7 2 + missense_variant 112926909 2 A 2 T Q/L 12 2 510 ENSG00000179295 ENST00000351677
8 2 + missense_variant 112926900 2 C 2 A T/K 12 2 507 ENSG00000179295 ENST00000351677
9 2 + missense_variant 112891006 2 C 2 T H/Y 12 2 114 ENSG00000179295 ENST00000392597
10 2 + missense_variant 112888210 2 G 2 A E/K 12 2 76 ENSG00000179295 ENST00000392597
11 2 + missense_variant 112888199 2 C 2 T A/V 12 2 72 ENSG00000179295 ENST00000392597
12 2 + missense_variant 112888199 2 C 2 A A/D 12 2 72 ENSG00000179295 ENST00000392597
13 2 + missense_variant 112891006 2 C 2 T H/Y 12 2 114 ENSG00000179295 ENST00000351677
14 2 + missense_variant 112888210 2 G 2 A E/K 12 2 76 ENSG00000179295 ENST00000351677
15 2 + missense_variant 112888199 2 C 2 T A/V 12 2 72 ENSG00000179295 ENST00000351677
16 2 + missense_variant 112888199 2 C 2 A A/D 12 2 72 ENSG00000179295 ENST00000351677
17 0 + synonymous_variant 112893822 0 T 1 C - 12 1 82 ENSG00000179295 ENST00000530818
18 1 + missense_variant 112910837 1 C 1 G I/M 12 1 282 ENSG00000179295 ENST00000392597
19 1 + missense_variant 112910844 1 T 1 G F/V 12 1 285.0 ENSG00000179295 ENST00000392597
20 0 + synonymous_variant 112915507 0 A 1 G - 12 1 302 ENSG00000179295 ENST00000392597
21 1 + missense_variant 112915523 1 A 1 G N/D 12 1 308 ENSG00000179295 ENST00000392597
22 1 + missense_variant 112915743 1 A 1 G N/S 12 1 339 ENSG00000179295 ENST00000392597
23 1 + missense_variant 112919908 1 T 1 G Y/D 12 1 375 ENSG00000179295 ENST00000392597
24 1 + frameshift_variant 112920002 1 - 1 T - 12 1 406 ENSG00000179295 ENST00000392597
25 1 + missense_variant 112924286 1 C 1 T T/M 12 1 411 ENSG00000179295 ENST00000392597
26 1 + stop_gained 112924308 1 C 1 A - 12 1 418 ENSG00000179295 ENST00000392597
27 1 + missense_variant 112924331 1 A 1 T H/L 12 1 426 ENSG00000179295 ENST00000392597
28 1 + missense_variant 112924336 1 G 1 A V/M 12 1 428 ENSG00000179295 ENST00000392597
29 1 + missense_variant 112892383 1 G 1 C V/L 12 1 26 ENSG00000179295 ENST00000530818
30 0 + synonymous_variant 112892409 0 T 1 C - 12 1 34 ENSG00000179295 ENST00000530818
31 1 + stop_gained 112893784 1 G 1 T - 12 1 70 ENSG00000179295 ENST00000530818
32 0 + synonymous_variant 112893798 0 A 1 G - 12 1 74 ENSG00000179295 ENST00000530818
33 1 + missense_variant 112910775 1 C 1 T L/F 12 1 262 ENSG00000179295 ENST00000392597
34 0 + synonymous_variant 112893822 0 T 1 C - 12 1 237 ENSG00000179295 ENST00000392597
35 0 + synonymous_variant 112893802 0 C 1 A - 12 1 231 ENSG00000179295 ENST00000392597
36 1 + missense_variant 112888211 1 A 1 C E/A 12 1 76 ENSG00000179295 ENST00000392597
37 1 + missense_variant 112888165 1 G 1 T D/Y 12 1 61 ENSG00000179295 ENST00000392597
38 1 + missense_variant 112888189 1 G 1 A E/K 12 1 69.0 ENSG00000179295 ENST00000392597
39 1 + missense_variant 112888189 1 G 1 A E/K 12 1 69 ENSG00000179295 ENST00000392597
40 1 + missense_variant 112888195 1 T 1 C F/L 12 1 71 ENSG00000179295 ENST00000392597
41 1 + missense_variant 112888197 1 T 1 A F/L 12 1 71 ENSG00000179295 ENST00000392597
42 1 + missense_variant 112888211 1 A 1 C E/A 12 1 76.0 ENSG00000179295 ENST00000392597
43 1 + missense_variant 112891015 1 C 1 T L/F 12 1 117 ENSG00000179295 ENST00000392597
44 0 + synonymous_variant 112893798 0 A 1 G - 12 1 229 ENSG00000179295 ENST00000392597
45 1 + missense_variant 112891073 1 T 1 A L/H 12 1 136 ENSG00000179295 ENST00000392597
46 0 + synonymous_variant 112891116 0 T 1 C - 12 1 150 ENSG00000179295 ENST00000392597
47 1 + missense_variant 112891129 1 G 1 T D/Y 12 1 155 ENSG00000179295 ENST00000392597
48 1 + missense_variant 112892383 1 G 1 C V/L 12 1 181 ENSG00000179295 ENST00000392597
49 0 + synonymous_variant 112892409 0 T 1 C - 12 1 189 ENSG00000179295 ENST00000392597
50 1 + stop_gained 112893784 1 G 1 T - 12 1 225 ENSG00000179295 ENST00000392597
51 0 + synonymous_variant 112893802 0 C 1 A - 12 1 76 ENSG00000179295 ENST00000530818
52 1 + missense_variant 112888163 1 G 1 T G/V 12 1 60 ENSG00000179295 ENST00000392597
53 1 + missense_variant 112888165 1 G 1 A D/N 12 1 61 ENSG00000179295 ENST00000392597
54 1 + missense_variant 112888162 1 G 1 C G/R 12 1 60 ENSG00000179295 ENST00000392597
55 0 + synonymous_variant 112893822 0 T 1 C - 12 1 237 ENSG00000179295 ENST00000351677
56 1 + missense_variant 112888165 1 G 1 T D/Y 12 1 61 ENSG00000179295 ENST00000351677
57 1 + missense_variant 112888189 1 G 1 A E/K 12 1 69.0 ENSG00000179295 ENST00000351677
58 1 + missense_variant 112888189 1 G 1 A E/K 12 1 69 ENSG00000179295 ENST00000351677
59 1 + missense_variant 112888195 1 T 1 C F/L 12 1 71 ENSG00000179295 ENST00000351677
60 1 + missense_variant 112888197 1 T 1 A F/L 12 1 71 ENSG00000179295 ENST00000351677
61 1 + missense_variant 112888211 1 A 1 C E/A 12 1 76.0 ENSG00000179295 ENST00000351677
62 1 + missense_variant 112888211 1 A 1 C E/A 12 1 76 ENSG00000179295 ENST00000351677
63 1 + missense_variant 112891015 1 C 1 T L/F 12 1 117 ENSG00000179295 ENST00000351677
64 1 + missense_variant 112891073 1 T 1 A L/H 12 1 136 ENSG00000179295 ENST00000351677
65 0 + synonymous_variant 112891116 0 T 1 C - 12 1 150 ENSG00000179295 ENST00000351677
66 1 + missense_variant 112891129 1 G 1 T D/Y 12 1 155 ENSG00000179295 ENST00000351677
67 1 + missense_variant 112892383 1 G 1 C V/L 12 1 181 ENSG00000179295 ENST00000351677
68 0 + synonymous_variant 112892409 0 T 1 C - 12 1 189 ENSG00000179295 ENST00000351677
69 1 + stop_gained 112893784 1 G 1 T - 12 1 225 ENSG00000179295 ENST00000351677
70 0 + synonymous_variant 112893798 0 A 1 G - 12 1 229 ENSG00000179295 ENST00000351677
71 1 + missense_variant 112888165 1 G 1 A D/N 12 1 61 ENSG00000179295 ENST00000351677
72 1 + missense_variant 112888163 1 G 1 T G/V 12 1 60 ENSG00000179295 ENST00000351677
73 1 + missense_variant 112888162 1 G 1 C G/R 12 1 60 ENSG00000179295 ENST00000351677
74 0 + synonymous_variant 112888161 0 T 1 C - 12 1 59 ENSG00000179295 ENST00000351677
75 1 + missense_variant 112884103 1 G 1 A G/D 12 1 13 ENSG00000179295 ENST00000351677
76 1 + missense_variant 112888139 1 C 1 G T/S 12 1 52 ENSG00000179295 ENST00000351677
77 0 + synonymous_variant 112893802 0 C 1 A - 12 1 231 ENSG00000179295 ENST00000351677
78 1 + missense_variant 112910775 1 C 1 T L/F 12 1 262 ENSG00000179295 ENST00000351677
79 0 + synonymous_variant 112888161 0 T 1 C - 12 1 59 ENSG00000179295 ENST00000392597
80 1 + missense_variant 112910837 1 C 1 G I/M 12 1 282 ENSG00000179295 ENST00000351677
81 1 + missense_variant 112926887 1 G 1 C G/R 12 1 503 ENSG00000179295 ENST00000351677
82 1 + missense_variant 112926908 1 C 1 G Q/E 12 1 510.0 ENSG00000179295 ENST00000351677
83 1 + missense_variant 112939963 1 G 1 C G/R 12 1 539 ENSG00000179295 ENST00000351677
84 1 + missense_variant 112939970 1 A 1 T E/V 12 1 541 ENSG00000179295 ENST00000351677
85 1 + missense_variant 112939981 1 A 1 C I/L 12 1 545 ENSG00000179295 ENST00000351677
86 0 + synonymous_variant 112939993 0 C 1 T - 12 1 549 ENSG00000179295 ENST00000351677
87 1 + missense_variant 112939999 1 G 1 A D/N 12 1 551 ENSG00000179295 ENST00000351677
88 1 + missense_variant 112940012 1 G 1 A G/E 12 1 555 ENSG00000179295 ENST00000351677
89 0 + synonymous_variant 112940025 0 T 1 C - 12 1 559 ENSG00000179295 ENST00000351677
90 1 + missense_variant 112940027 1 T 1 C L/P 12 1 560 ENSG00000179295 ENST00000351677
91 0 + synonymous_variant 112940031 0 G 1 A - 12 1 561 ENSG00000179295 ENST00000351677
92 1 + missense_variant 112940036 1 G 1 T C/F 12 1 563 ENSG00000179295 ENST00000351677
93 0 + synonymous_variant 112940052 0 C 1 T - 12 1 568 ENSG00000179295 ENST00000351677
94 1 + missense_variant 112884103 1 G 1 A G/D 12 1 13 ENSG00000179295 ENST00000392597
95 1 + missense_variant 112888139 1 C 1 G T/S 12 1 52 ENSG00000179295 ENST00000392597
96 1 + missense_variant 112926885 1 C 1 T S/L 12 1 502 ENSG00000179295 ENST00000351677
97 1 + missense_variant 112926884 1 T 1 C S/P 12 1 502 ENSG00000179295 ENST00000351677
98 0 + synonymous_variant 112926862 0 C 1 T - 12 1 494 ENSG00000179295 ENST00000351677
99 1 + missense_variant 112924286 1 C 1 T T/M 12 1 411 ENSG00000179295 ENST00000351677
100 1 + missense_variant 112910844 1 T 1 G F/V 12 1 285.0 ENSG00000179295 ENST00000351677
101 0 + synonymous_variant 112915507 0 A 1 G - 12 1 302 ENSG00000179295 ENST00000351677
102 1 + missense_variant 112915523 1 A 1 G N/D 12 1 308 ENSG00000179295 ENST00000351677
103 1 + missense_variant 112915743 1 A 1 G N/S 12 1 339 ENSG00000179295 ENST00000351677
104 1 + missense_variant 112919908 1 T 1 G Y/D 12 1 375 ENSG00000179295 ENST00000351677
105 1 + frameshift_variant 112920002 1 - 1 T - 12 1 406 ENSG00000179295 ENST00000351677
106 1 + stop_gained 112924308 1 C 1 A - 12 1 418 ENSG00000179295 ENST00000351677
107 1 + missense_variant 112926852 1 C 1 T P/L 12 1 491 ENSG00000179295 ENST00000351677
108 1 + missense_variant 112924331 1 A 1 T H/L 12 1 426 ENSG00000179295 ENST00000351677
109 1 + missense_variant 112924336 1 G 1 A V/M 12 1 428 ENSG00000179295 ENST00000351677
110 1 + missense_variant 112926248 1 G 1 A A/T 12 1 461 ENSG00000179295 ENST00000351677
111 1 + missense_variant 112926249 1 C 1 G A/G 12 1 461 ENSG00000179295 ENST00000351677
112 1 + missense_variant 112926291 1 TT 1 CA L/P 12 1 475 ENSG00000179295 ENST00000351677
113 1 + missense_variant 112926839 1 G 1 T D/Y 12 1 487 ENSG00000179295 ENST00000351677

View File

@ -1,39 +1,39 @@
>MBP1_ASPNI AN3154 XP_660758 Q5B8H6 >MBP1_ASPNI AN3154 XP_660758 Q5B8H6
-VYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRI -VYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRI
LEREVQKGVHEKVQGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDY LEREVQKGVHEKVQGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDY
>MBP1_BIPOR COCMIDRAFT_338 XP_007682304 W6ZM86 >MBP1_BIPOR COCMIDRAFT_338 XP_007682304 W6ZM86
KIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRI KIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRI
LEREVQKGVHEKVQGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDY LEREVQKGVHEKVQGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDY
>MBP1_COPCI - XP_001837394 A8NYC6 >MBP1_COPCI - XP_001837394 A8NYC6
QIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRV QIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRV
LEREVQKGEHEKVQGGYGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEF LEREVQKGEHEKVQGGYGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEF
>MBP1_CRYNE - XP_569090 Q5KMQ9 >MBP1_CRYNE - XP_569090 Q5KMQ9
DYVPTSVSPPPAPKHSVA--PPSKARRDKEKETGRTKATPSRTGPTSAAA DYVPTSVSPPPAPKHSVA--PPSKARRDKEKETGRTKATPSRTGPTSAAA
LQAQAQLN-RAKMHDSTPDADASFRSFEERVSLTEDDSSSDTPSPVASV LQAQAQLN-RAKMHDSTPDADASFRSFEERVSLTEDDSSSDTPSPVASV
>MBP1_NEUCR Swi4 XP_955821 Q7RW59 >MBP1_NEUCR Swi4 XP_955821 Q7RW59
-IYSATYSGIPVWEYQFGVDHVMRRRHDDWVNATHILKAAGFDKPARTRI -IYSATYSGIPVWEYQFGVDHVMRRRHDDWVNATHILKAAGFDKPARTRI
LEREVQKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEF LEREVQKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEF
>MBP1_PUCGR PGTG_08863 XP_003327086 E3KED4 >MBP1_PUCGR PGTG_08863 XP_003327086 E3KED4
-IYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRV -IYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRV
LEREIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNF LEREIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNF
>MBP1_SACCE Mbp1 NP_010227 P39678 >MBP1_SACCE Mbp1 NP_010227 P39678
QIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRI QIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRI
LEKEVLKETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDF LEKEVLKETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDF
>MBP1_SCHPO Res2 NP_593032 P41412 >MBP1_SCHPO Res2 NP_593032 P41412
-VHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRV -VHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRV
LERQVQIGAHEKVQGGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILS- LERQVQIGAHEKVQGGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILS-
>MBP1_USTMA UMAG_11222 XP_011392621 A0A0D1DP35 >MBP1_USTMA UMAG_11222 XP_011392621 A0A0D1DP35
-IFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRV -IFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRV
LEREIQKGIHEKVQGGYGKYQGTWIPLDVAIELAERYNIQGLLQPITSY LEREIQKGIHEKVQGGYGKYQGTWIPLDVAIELAERYNIQGLLQPITSY
>MBP1_WALME - XP_006957051 I4YGC0 >MBP1_WALME - XP_006957051 I4YGC0
-IYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRI -IYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRI
LEREVQKGTHEKVQGGYGKYQGTWIPMERSVELARQYRIELLLDPIINY LEREVQKGTHEKVQGGYGKYQGTWIPMERSVELARQYRIELLLDPIINY

View File

@ -1,490 +1,490 @@
[ [
{ "name" : "68476_WALME", { "name" : "68476_WALME",
"RefSeqID" : "XP_006957790", "RefSeqID" : "XP_006957790",
"UniProtID" : "I4YDD8", "UniProtID" : "I4YDD8",
"taxonomyID" : "671144", "taxonomyID" : "671144",
"sequence" : [ "sequence" : [
"MKEEKEKTPPNNITGPPTPAQNILHSTPAAFGTAGTVGQGAGGFGSQLYQSPYVDSQQSVIGSPVTPAPLPKKATLKTPQ", "MKEEKEKTPPNNITGPPTPAQNILHSTPAAFGTAGTVGQGAGGFGSQLYQSPYVDSQQSVIGSPVTPAPLPKKATLKTPQ",
"PRIYSAVYSGVGVYEAMIRGIAVMRRRADGYMNATQILKVAGVDKGRRTKILEREILAGLHEKIQGGYGKYQGTWIPFER", "PRIYSAVYSGVGVYEAMIRGIAVMRRRADGYMNATQILKVAGVDKGRRTKILEREILAGLHEKIQGGYGKYQGTWIPFER",
"GRELALQYGCDHLLAPIFDFNPSVMQPSAGRSAKSPSKKRQNSIVLSPTQERHQSSIIALNTARASGIYVGGADDPNDDG", "GRELALQYGCDHLLAPIFDFNPSVMQPSAGRSAKSPSKKRQNSIVLSPTQERHQSSIIALNTARASGIYVGGADDPNDDG",
"LSKKEKSPVKKSKYDEVPVNVSKRPYVPPPGTNAHILTRTQQSLTALFQQPTTNSDFIPEAVAILDTTSGALHPDLAIDE", "LSKKEKSPVKKSKYDEVPVNVSKRPYVPPPGTNAHILTRTQQSLTALFQQPTTNSDFIPEAVAILDTTSGALHPDLAIDE",
"LGHTALHWAASLGRISNVQQLIKKGADMKRGNIEGETPLERSVLVNDNYDKKTFAYLLQELGSSIRVVDRTGRSILHHIA", "LGHTALHWAASLGRISNVQQLIKKGADMKRGNIEGETPLERSVLVNDNYDKKTFAYLLQELGSSIRVVDRTGRSILHHIA",
"LIAAVNGRSMSAKYYMENVLEYIARYENGEFKSLVDLQDEHGDTALNISARVGNRNLVKMLVDAGANKTVVNKLGLKASD", "LIAAVNGRSMSAKYYMENVLEYIARYENGEFKSLVDLQDEHGDTALNISARVGNRNLVKMLVDAGANKTVVNKLGLKASD",
"FGVEHETLNSVTGDEMLSNLQPPPPLNVDSSASVLENIHNLLNGITQQYTDETSGKNALLFEIQAELKQHSHELADVRKE", "FGVEHETLNSVTGDEMLSNLQPPPPLNVDSSASVLENIHNLLNGITQQYTDETSGKNALLFEIQAELKQHSHELADVRKE",
"IQYWQNKATQMAEVDQKIKNINEAIENEKVQTWSLLGEANADKMEGIETSSSSNTSEIKIPTGDNEESLKQLRKLSKWLE", "IQYWQNKATQMAEVDQKIKNINEAIENEKVQTWSLLGEANADKMEGIETSSSSNTSEIKIPTGDNEESLKQLRKLSKWLE",
"GTQKLTEERVASIDGLSASKEVKYKSIVSVCTGVPVNEVEGMLAQLLEAMESDANADLNKVQEFLAREC"] "GTQKLTEERVASIDGLSASKEVKYKSIVSVCTGVPVNEVEGMLAQLLEAMESDANADLNKVQEFLAREC"]
}, },
{ "name" : "00846_COPCI", { "name" : "00846_COPCI",
"RefSeqID" : "XP_001831299", "RefSeqID" : "XP_001831299",
"UniProtID" : "A8N8X1", "UniProtID" : "A8N8X1",
"taxonomyID" : "240176", "taxonomyID" : "240176",
"sequence" : [ "sequence" : [
"MQASTRPPGSNQPPVKIYNAVYSSVQVYECMVRGIAVMRRRNDSYVNATQILKVAGVDKGRRTKILEKEILPGKHEIVQG", "MQASTRPPGSNQPPVKIYNAVYSSVQVYECMVRGIAVMRRRNDSYVNATQILKVAGVDKGRRTKILEKEILPGKHEIVQG",
"GYGKYQGTWIPLERGRDIAAQYGVAPLLSPLFDFQPSTNSLGALPVSTPGGTASPRPLSASSSYSSMGVAGQYIPSSIPS", "GYGKYQGTWIPLERGRDIAAQYGVAPLLSPLFDFQPSTNSLGALPVSTPGGTASPRPLSASSSYSSMGVAGQYIPSSIPS",
"NLPPAPIMPGSALRLLNQGRAQGLFTPSTTSATLRPAGYHSPGPYGTSYAPSPQPQSSQTPPPGSGLKRNRSEAEVEGYH", "NLPPAPIMPGSALRLLNQGRAQGLFTPSTTSATLRPAGYHSPGPYGTSYAPSPQPQSSQTPPPGSGLKRNRSEAEVEGYH",
"SQPHDVQMADAPPPNTASQPNEDNPSPAKRLRTDGSITTEPASSQGQWQQQQPLPYASQQRSGPGLSQLSGHNGHGSSRP", "SQPHDVQMADAPPPNTASQPNEDNPSPAKRLRTDGSITTEPASSQGQWQQQQPLPYASQQRSGPGLSQLSGHNGHGSSRP",
"PSSLSAPNGNRPAHTNPEDQTRKTRFSSKPSMPRGMDPHMPFKDARRSALIALICHRDDPTSVIDLLREISADHLNPPSF", "PSSLSAPNGNRPAHTNPEDQTRKTRFSSKPSMPRGMDPHMPFKDARRSALIALICHRDDPTSVIDLLREISADHLNPPSF",
"DVDTVLDDQGHTALHLAASMARTQTVDMLIQTGADMHRGNHLGETPLIRACLATPNSDQQSFATLVNYLHDSIWTLDTSK", "DVDTVLDDQGHTALHLAASMARTQTVDMLIQTGADMHRGNHLGETPLIRACLATPNSDQQSFATLVNYLHDSIWTLDTSK",
"KSVVHHIVSLAGVKGRAVVARYYLDQIFYWIAQHEGGDFRSLVDLQDEHGDTAINIAARVGNRSLVRTLLDVGANRVLAN", "KSVVHHIVSLAGVKGRAVVARYYLDQIFYWIAQHEGGDFRSLVDLQDEHGDTAINIAARVGNRSLVRTLLDVGANRVLAN",
"KLGLRPGDFGVETEELSSGLRAEDLISSLRTGPPAPVQKSQDVIADMTSMIQSLSTEFQAEIKSKQDSLDVTQAHLRAAT", "KLGLRPGDFGVETEELSSGLRAEDLISSLRTGPPAPVQKSQDVIADMTSMIQSLSTEFQAEIKSKQDSLDVTQAHLRAAT",
"RELSEQRKQIQTWQARCGDLDQINQRVRNVEKAIAEEDMFDWTGRTELDGKDGKEKGGPAFAYRGSKSTMVGVGGSVDVS", "RELSEQRKQIQTWQARCGDLDQINQRVRNVEKAIAEEDMFDWTGRTELDGKDGKEKGGPAFAYRGSKSTMVGVGGSVDVS",
"FSVESEPPLPTTDTAASLVKLRRLKMWHQRMEELVKGRLKGLQGASAEKEYQCKKIVALCTGIPLDKVEEMLDNLVIAVE", "FSVESEPPLPTTDTAASLVKLRRLKMWHQRMEELVKGRLKGLQGASAEKEYQCKKIVALCTGIPLDKVEEMLDNLVIAVE",
"SEAQVVDIGRVSGFMQKVRDGII"] "SEAQVVDIGRVSGFMQKVRDGII"]
}, },
{ "name" : "8533_BIPOR", { "name" : "8533_BIPOR",
"RefSeqID" : "XP_007691662", "RefSeqID" : "XP_007691662",
"UniProtID" : "W6ZE71", "UniProtID" : "W6ZE71",
"taxonomyID" : "930090", "taxonomyID" : "930090",
"sequence" : [ "sequence" : [
"MSTSHSFPAASPSHQQSALYANSPHGHALMAAPAALNRSFSDMSAFHHHAMDKPQIYTAVYSGVSVYEMEVNRVAVMRRR", "MSTSHSFPAASPSHQQSALYANSPHGHALMAAPAALNRSFSDMSAFHHHAMDKPQIYTAVYSGVSVYEMEVNRVAVMRRR",
"SDGWLNATQILKVAGVDKGKRTKVLEKEILTGEHEKVQGGYGKYQGTWINYRRGREFCRQYGVEDVLRPLLDYDITLDGS", "SDGWLNATQILKVAGVDKGKRTKVLEKEILTGEHEKVQGGYGKYQGTWINYRRGREFCRQYGVEDVLRPLLDYDITLDGS",
"HAPGHAIETPTKEQAMAANRKRFYTQSIDGRTTTQNLTGTFFSNISSTATSALAAMNKVARLNSPAPRPSSSSQRRTSAT", "HAPGHAIETPTKEQAMAANRKRFYTQSIDGRTTTQNLTGTFFSNISSTATSALAAMNKVARLNSPAPRPSSSSQRRTSAT",
"RPSQSQPPLASQDSFRTSSQQSITSEPSFAGHNGQTDSAYATAVDESQEPPRKRIRASHDDSYSQPTAADMSIHPLSSPT", "RPSQSQPPLASQDSFRTSSQQSITSEPSFAGHNGQTDSAYATAVDESQEPPRKRIRASHDDSYSQPTAADMSIHPLSSPT",
"EPSESFDQHHPAQPITLADGDVPTALPPLPYPDTKQDEEKQAMLTDLFADQTRSDFTNHPAILHLSGPDLDMPIDNSSNT", "EPSESFDQHHPAQPITLADGDVPTALPPLPYPDTKQDEEKQAMLTDLFADQTRSDFTNHPAILHLSGPDLDMPIDNSSNT",
"ALHWAATLARVSLIRLLVSKGANMFRGNASGQTALMSAVSVNNSLDHSCFPETLEILAPLIELRDSQGRTILHHIAVTCA", "ALHWAATLARVSLIRLLVSKGANMFRGNASGQTALMSAVSVNNSLDHSCFPETLEILAPLIELRDSQGRTILHHIAVTCA",
"IKGRAASSKYYLEALLEYLVRSNIGGGQPPPFHDTSNHSKPIGLMRFMQEMVNARDKAGNTALNLAARIGNRNIISQLME", "IKGRAASSKYYLEALLEYLVRSNIGGGQPPPFHDTSNHSKPIGLMRFMQEMVNARDKAGNTALNLAARIGNRNIISQLME",
"VQADPTIPNHKGTRPMDFGVGTDLGDGQGIITATSPTKAKAPLSKAEETSREIQPLMSGILQSASLQFTQEARLKQDAID", "VQADPTIPNHKGTRPMDFGVGTDLGDGQGIITATSPTKAKAPLSKAEETSREIQPLMSGILQSASLQFTQEARLKQDAID",
"QTNELITQLSSQQKQEQQKLQTLRARLRQRQDRAKRISNLKRWLEPQRHMLSVNDGAIDLHDKKRIGYADTQGAGLLIKE", "QTNELITQLSSQQKQEQQKLQTLRARLRQRQDRAKRISNLKRWLEPQRHMLSVNDGAIDLHDKKRIGYADTQGAGLLIKE",
"DDLPYELRQAGDHLDRRASDGPIYLSTSVPLDPSTLSQVSHQPQCQNFLLQQLPAASVLRQRIETYTATNTALLKRSRML", "DDLPYELRQAGDHLDRRASDGPIYLSTSVPLDPSTLSQVSHQPQCQNFLLQQLPAASVLRQRIETYTATNTALLKRSRML",
"KEKDGQLEMMYRKVVSLCTKVEENRIEECLEGLVAALDSEEGEGVEVGRVREFLRKVEGVD"] "KEKDGQLEMMYRKVVSLCTKVEENRIEECLEGLVAALDSEEGEGVEVGRVREFLRKVEGVD"]
}, },
{ "name" : "PGTG_02039", { "name" : "PGTG_02039",
"RefSeqID" : "XP_003320997", "RefSeqID" : "XP_003320997",
"UniProtID" : "E3JX03", "UniProtID" : "E3JX03",
"taxonomyID" : "418459", "taxonomyID" : "418459",
"sequence" : [ "sequence" : [
"MAAHKTTNDIPVSSSHHINPESGTGTSSTQAFPIPNIKNNPHVYMAVYSSVPVYEMMVRGIGVMRRRSDSYMNATQILKV", "MAAHKTTNDIPVSSSHHINPESGTGTSSTQAFPIPNIKNNPHVYMAVYSSVPVYEMMVRGIGVMRRRSDSYMNATQILKV",
"AGLDKSKRTRILEREIIQGEHEKIQGGYGRYQGTWVPFTRAQELATQLNVAQLLAPLFDYRPEPNSEVNIRSTNTKPSSS", "AGLDKSKRTRILEREIIQGEHEKIQGGYGRYQGTWVPFTRAQELATQLNVAQLLAPLFDYRPEPNSEVNIRSTNTKPSSS",
"ASRANSHKTTLARQTSRQSLNEKRERSGDTTPLPHDPPEAGPSKRSRLNTPSRQSNGSANTPSSLIDHSHSAMDPDFIIP", "ASRANSHKTTLARQTSRQSLNEKRERSGDTTPLPHDPPEAGPSKRSRLNTPSRQSNGSANTPSSLIDHSHSAMDPDFIIP",
"HSQSQPTAASQCTTSTFAPIHGATVEYPAGPSHLRKSNSSSRSHLEVALKAERNIHTLMALFSNPPDGDELESETHHENP", "HSQSQPTAASQCTTSTFAPIHGATVEYPAGPSHLRKSNSSSRSHLEVALKAERNIHTLMALFSNPPDGDELESETHHENP",
"NSVAEVNEVLEDPELEIDTPIDEHCHTALHWASSLARLGLVRAFLRSGADVNRGNDVGETPLMRSTLVTNNFERESFNQL", "NSVAEVNEVLEDPELEIDTPIDEHCHTALHWASSLARLGLVRAFLRSGADVNRGNDVGETPLMRSTLVTNNFERESFNQL",
"LELLHPSLWTLDNQDRTVLHHICLTASIKGRGESSRYYLECICEWIVNKHGAQFDSQLFDAVDLNGDTALNIAARVGNKH", "LELLHPSLWTLDNQDRTVLHHICLTASIKGRGESSRYYLECICEWIVNKHGAQFDSQLFDAVDLNGDTALNIAARVGNKH",
"LVRMLLDVGADMTIGNNLGLKPIDFGVGAGETSASYTDDMISAPLRRNPTASAPARSSRDIITSITSSVNSLSEDFENEI", "LVRMLLDVGADMTIGNNLGLKPIDFGVGAGETSASYTDDMISAPLRRNPTASAPARSSRDIITSITSSVNSLSEDFENEI",
"RSKTDRLESVRAQLMVATRQLTTQRRQLESLKHDLDERALLELRLKKLRMAIAEEDGFDWTGRSDLDGRPAQAGKLFEQN", "RSKTDRLESVRAQLMVATRQLTTQRRQLESLKHDLDERALLELRLKKLRMAIAEEDGFDWTGRSDLDGRPAQAGKLFEQN",
"GIASTLAGLSASQIQLELEPDPFIPPENNQDSLVYLRRLEKWYVRVLSLLRERIGRMKGSNLEQEAKYLKVIGSFIGNTC", "GIASTLAGLSASQIQLELEPDPFIPPENNQDSLVYLRRLEKWYVRVLSLLRERIGRMKGSNLEQEAKYLKVIGSFIGNTC",
"TNDLSSSGSSMTGRPANQTTSTTQEVPSRATQNVNPADIHDLESMDGHRRKVSTTDAVNKSHEFGRTRSELLKASMIDNK", "TNDLSSSGSSMTGRPANQTTSTTQEVPSRATQNVNPADIHDLESMDGHRRKVSTTDAVNKSHEFGRTRSELLKASMIDNK",
"LLKQLMAAIESDGPELDLNRVAGFMQRVQSGSL"] "LLKQLMAAIESDGPELDLNRVAGFMQRVQSGSL"]
}, },
{ "name" : "MBPA_ASPNI", { "name" : "MBPA_ASPNI",
"RefSeqID" : "XP_664319", "RefSeqID" : "XP_664319",
"UniProtID" : "Q5AYB5", "UniProtID" : "Q5AYB5",
"taxonomyID" : "227321", "taxonomyID" : "227321",
"sequence" : [ "sequence" : [
"MTTSNHHQQRPSLSMSYSQGSIGSANGMSFSQSQMSSLNASQSVASTPRATPPPKSSQQSAMSFNYSNGLPNGARASFSG", "MTTSNHHQQRPSLSMSYSQGSIGSANGMSFSQSQMSSLNASQSVASTPRATPPPKSSQQSAMSFNYSNGLPNGARASFSG",
"FEDMNGYGTMIYHEEFKPQIYRAVYSNVSVYEMEVNGVAVMKRRSDGWLNATQILKVAGVVKARRTKTLEKEIAAGEHEK", "FEDMNGYGTMIYHEEFKPQIYRAVYSNVSVYEMEVNGVAVMKRRSDGWLNATQILKVAGVVKARRTKTLEKEIAAGEHEK",
"VQGGYGKYQGTWVNYQRGVELCREYHVEELLRPLLEYDMNPNGTAASGQDSLDTPTKEQAMAAQRKRLYSGMENRSMSQP", "VQGGYGKYQGTWVNYQRGVELCREYHVEELLRPLLEYDMNPNGTAASGQDSLDTPTKEQAMAAQRKRLYSGMENRSMSQP",
"QQGTFFQNISRTAATAVNAMSKARFESPAARGGDSRRLSVIRKPSQQMGSQDAQPPFGSQQSFYSAASDSGFASNIPTNG", "QQGTFFQNISRTAATAVNAMSKARFESPAARGGDSRRLSVIRKPSQQMGSQDAQPPFGSQQSFYSAASDSGFASNIPTNG",
"RYAPQDAMSFEQEEPMEPPRKRIRSSQAFSLPIDGTSMSMSEPTPTEPNDSFYQDMEPLHHIDEGRHGLDPLPPATTPER", "RYAPQDAMSFEQEEPMEPPRKRIRSSQAFSLPIDGTSMSMSEPTPTEPNDSFYQDMEPLHHIDEGRHGLDPLPPATTPER",
"FQKMKLIMTLFLDKTTKDFSTHPALIQLSGEDLEVPLDEYRNNALHWAAMLARMPLVYALVKKGVNIARLNGAGETALQK", "FQKMKLIMTLFLDKTTKDFSTHPALIQLSGEDLEVPLDEYRNNALHWAAMLARMPLVYALVKKGVNIARLNGAGETALQK",
"AVGTRNNLDYRSFPRLLQVLAPTIDMVDRSGRTILHHIAVMAATGHGGHVSAKHYLEALLEFIVRHGGTSLNQQSNGTAS", "AVGTRNNLDYRSFPRLLQVLAPTIDMVDRSGRTILHHIAVMAATGHGGHVSAKHYLEALLEFIVRHGGTSLNQQSNGTAS",
"QPGMPLSNEVITLGRFISEIVNLRDDQGDTALNLAGRARSVLVPQLLEVGADPHIPNHTGLRPADYGVGVDMVDGSSQPA", "QPGMPLSNEVITLGRFISEIVNLRDDQGDTALNLAGRARSVLVPQLLEVGADPHIPNHTGLRPADYGVGVDMVDGSSQPA",
"GSRSDTFLAQLAKTRKEILEATTAQVTAIVQETLGTFDKELAASLTSKQEKFDHWHAKIRESAKARQIEQKQLDELKRRS", "GSRSDTFLAQLAKTRKEILEATTAQVTAIVQETLGTFDKELAASLTSKQEKFDHWHAKIRESAKARQIEQKQLDELKRRS",
"IDRTETSRRLKNLEKSSTDLLEAHKEILTNLGDTSKPVSLGDADQESGFEIAEFEALFPETFDPASGFSEAQIAYLRKLP", "IDRTETSRRLKNLEKSSTDLLEAHKEILTNLGDTSKPVSLGDADQESGFEIAEFEALFPETFDPASGFSEAQIAYLRKLP",
"SAEILEQRVSCYRAFNKETLDEIDALRSKNVVLGQNYRRMVMACTGWSAEQVDEAAEGLTQCVKELNDNPVPEDEAIEIL", "SAEILEQRVSCYRAFNKETLDEIDALRSKNVVLGQNYRRMVMACTGWSAEQVDEAAEGLTQCVKELNDNPVPEDEAIEIL",
"MRDRGQDW"] "MRDRGQDW"]
}, },
{ "name" : "05520_CRYNE", { "name" : "05520_CRYNE",
"RefSeqID" : "XP_570545", "RefSeqID" : "XP_570545",
"UniProtID" : "Q5KHS0", "UniProtID" : "Q5KHS0",
"taxonomyID" : "214684", "taxonomyID" : "214684",
"sequence" : [ "sequence" : [
"MEPPSNPIQPPVTPSHHSLLSAISPALSEQTPAPIHTLPPHLRPSIPQPHIAPPRPSSVQPTMEEQQRMHHIQQHQQQQH", "MEPPSNPIQPPVTPSHHSLLSAISPALSEQTPAPIHTLPPHLRPSIPQPHIAPPRPSSVQPTMEEQQRMHHIQQHQQQQH",
"FQQQQNDENVFGSVMGAPGHVPGHEAPMSTQPKVYASVYSGVPVFEAMIRGISVMRRASDSWVNATQILKVAGVHKSART", "FQQQQNDENVFGSVMGAPGHVPGHEAPMSTQPKVYASVYSGVPVFEAMIRGISVMRRASDSWVNATQILKVAGVHKSART",
"KILEKEVLNGIHEKIQGGYGKYQGTWVPLDRGRDLAEQYGVGSYLSSVFDFVPSASVIAALPVIRTGTPDRSGQQTPSGL", "KILEKEVLNGIHEKIQGGYGKYQGTWVPLDRGRDLAEQYGVGSYLSSVFDFVPSASVIAALPVIRTGTPDRSGQQTPSGL",
"PGHPNQRVISPFANHGQTTPHMPPPQFIHQGNEQMMNLPPHPSSLAYPTQPKPYFSMPLQHTVGPQYDERHEGMTMTPTM", "PGHPNQRVISPFANHGQTTPHMPPPQFIHQGNEQMMNLPPHPSSLAYPTQPKPYFSMPLQHTVGPQYDERHEGMTMTPTM",
"SMDGLAPPADIARMGFPYNPSDIYIDQYGQPHATYQASPYGKESGHPSKRQRSDAEGSYIESGAAVQQHVEQDEEADDGL", "SMDGLAPPADIARMGFPYNPSDIYIDQYGQPHATYQASPYGKESGHPSKRQRSDAEGSYIESGAAVQQHVEQDEEADDGL",
"DNDSTASDDARDPPPLPSSMLLPHKPIRPKATPANGRIKSRLVQIFNVEGQVNLRSVFGLAPDQLPNFDIDMVIDDQGHS", "DNDSTASDDARDPPPLPSSMLLPHKPIRPKATPANGRIKSRLVQIFNVEGQVNLRSVFGLAPDQLPNFDIDMVIDDQGHS",
"ALHWACALARLSIVQQLIELGADIHRGNYAGETPLIRAVLTSNHAEAGSFTDLLHLLSPSIRTLDHAYRTVLHHIALVAG", "ALHWACALARLSIVQQLIELGADIHRGNYAGETPLIRAVLTSNHAEAGSFTDLLHLLSPSIRTLDHAYRTVLHHIALVAG",
"VKGRVPAARTYMASVLEWVAREQQANNTHSITNPPNPADRNELAPINLRTLVDVQDVHGDTALNVAARVGNKGLVGLLLD", "VKGRVPAARTYMASVLEWVAREQQANNTHSITNPPNPADRNELAPINLRTLVDVQDVHGDTALNVAARVGNKGLVGLLLD",
"AGADKTRANKLGLRPENFGLEIEALKISNGEAVMANLKSEVSKPERKSRDVQKNIATIFESISSTFSSEMLAKQTKLNAT", "AGADKTRANKLGLRPENFGLEIEALKISNGEAVMANLKSEVSKPERKSRDVQKNIATIFESISSTFSSEMLAKQTKLNAT",
"EASVRHATRALADKRQHLHRAQEKLATMQLFEQRSENVRRIMDAIAAGTLLTPAEFTGRTQTMHEKSTGQLPPLAFRHVP", "EASVRHATRALADKRQHLHRAQEKLATMQLFEQRSENVRRIMDAIAAGTLLTPAEFTGRTQTMHEKSTGQLPPLAFRHVP",
"GLALDASSQSQLNGAPPSTPLSVEDQEDIALPERDDPECLVKLRRMALWEDRIAEVLEDKIRAMEGEGVDRAVKYRKLVS", "GLALDASSQSQLNGAPPSTPLSVEDQEDIALPERDDPECLVKLRRMALWEDRIAEVLEDKIRAMEGEGVDRAVKYRKLVS",
"VCAKVPVDKVDSMLDGLVAAVESEGQGLDFSRASNFVNRIKATKS"] "VCAKVPVDKVDSMLDGLVAAVESEGQGLDFSRASNFVNRIKATKS"]
}, },
{ "name" : "RES1_SCHPO", { "name" : "RES1_SCHPO",
"RefSeqID" : "NP_595496", "RefSeqID" : "NP_595496",
"UniProtID" : "P33520", "UniProtID" : "P33520",
"taxonomyID" : "284812", "taxonomyID" : "284812",
"sequence" : [ "sequence" : [
"MYNDQIHKITYSGVEVFEYTINGFPLMKRCHDNWLNATQILKIAELDKPRRTRILEKFAQKGLHEKIQGGCGKYQGTWVP", "MYNDQIHKITYSGVEVFEYTINGFPLMKRCHDNWLNATQILKIAELDKPRRTRILEKFAQKGLHEKIQGGCGKYQGTWVP",
"SERAVELAHEYNVFDLIQPLIEYSGSAFMPMSTFTPQSNRKPTEAYRRNSPVKKSFSRPSHSLLYPYTSSNNMTSTSRMS", "SERAVELAHEYNVFDLIQPLIEYSGSAFMPMSTFTPQSNRKPTEAYRRNSPVKKSFSRPSHSLLYPYTSSNNMTSTSRMS",
"GIHDALSLQSDFTRSPDMPSDSFTGSLHDIKASPFSSNNYAQSLLDYFLLPNTTQPPDFVYDRPSDWDVNAGIDEDGHTA", "GIHDALSLQSDFTRSPDMPSDSFTGSLHDIKASPFSSNNYAQSLLDYFLLPNTTQPPDFVYDRPSDWDVNAGIDEDGHTA",
"LHWAAAMGNLEMMHALLQAGANVVAVNYLQQTSLMRCVMFTMNYDLQTFEVVSELLQSAICMNDSFGQTVFHHIALLASS", "LHWAAAMGNLEMMHALLQAGANVVAVNYLQQTSLMRCVMFTMNYDLQTFEVVSELLQSAICMNDSFGQTVFHHIALLASS",
"KSKMEAARYYMDILLQNLTATQSVDVAAQIINLQDDHGDTALLICARNGAKKCARLLLSFYASSSIPNNQGQYPTDFLSS", "KSKMEAARYYMDILLQNLTATQSVDVAAQIINLQDDHGDTALLICARNGAKKCARLLLSFYASSSIPNNQGQYPTDFLSS",
"KDMSFPENDDSPLNSKIEDNLIDNLKYPQSLDDHLSSKKPISYFSNKLTHQTLPNVFTQLSELSKCHEASLAEKQLTYNL", "KDMSFPENDDSPLNSKIEDNLIDNLKYPQSLDDHLSSKKPISYFSNKLTHQTLPNVFTQLSELSKCHEASLAEKQLTYNL",
"AMEALEQTVRETETCQRLWNERTNNDENYLVNQREDLIHQCKKFLHTLKTARYYLETVQLHQLKKYVTYFSQIWSTDELA", "AMEALEQTVRETETCQRLWNERTNNDENYLVNQREDLIHQCKKFLHTLKTARYYLETVQLHQLKKYVTYFSQIWSTDELA",
"DISETKNLVGHDTKTNRSSLSSKHEVDLFTAENEAAREKLVEQLCSLQAQRKQKINEILNLLSMGMYNTINTDQSGS"] "DISETKNLVGHDTKTNRSSLSSKHEVDLFTAENEAAREKLVEQLCSLQAQRKQKINEILNLLSMGMYNTINTDQSGS"]
}, },
{ "name" : "CDC10_SCHPO", { "name" : "CDC10_SCHPO",
"RefSeqID" : "NP_596132", "RefSeqID" : "NP_596132",
"UniProtID" : "P01129", "UniProtID" : "P01129",
"taxonomyID" : "284812", "taxonomyID" : "284812",
"sequence" : [ "sequence" : [
"MASANFIRQFELGNDSFSYQKRPEDEPSQPLSNRNINKLNDSSTLKDSSSRIFINSQVLRDGRPVELYAVECSGMKYMEL", "MASANFIRQFELGNDSFSYQKRPEDEPSQPLSNRNINKLNDSSTLKDSSSRIFINSQVLRDGRPVELYAVECSGMKYMEL",
"SCGDNVALRRCPDSYFNISQILRLAGTSSSENAKELDDIIESGDYENVDSKHPQIDGVWVPYDRAISIAKRYGVYEILQP", "SCGDNVALRRCPDSYFNISQILRLAGTSSSENAKELDDIIESGDYENVDSKHPQIDGVWVPYDRAISIAKRYGVYEILQP",
"LISFNLDLFPKFSKQQQIESSSISKNLNTSSFNTRSPLRNHNFSNPSKSSKNGVHTINNMQSSPSPSSSFLLPLTQIDSQ", "LISFNLDLFPKFSKQQQIESSSISKNLNTSSFNTRSPLRNHNFSNPSKSSKNGVHTINNMQSSPSPSSSFLLPLTQIDSQ",
"NVKRSNNYLSTSPPILEQRLKRHRIDVSDEDLHPSSQLNDNEASSLFPDTPRLNHSLSFVSLVSSLPPLDQNIMQDYHTS", "NVKRSNNYLSTSPPILEQRLKRHRIDVSDEDLHPSSQLNDNEASSLFPDTPRLNHSLSFVSLVSSLPPLDQNIMQDYHTS",
"KDILTSIFLDVNFADSSALEAKLSDSLDLDVPIDELGHAALHWAAAVAKMPLLQALIHKGANPLRGNLTGETALMRSVLV", "KDILTSIFLDVNFADSSALEAKLSDSLDLDVPIDELGHAALHWAAAVAKMPLLQALIHKGANPLRGNLTGETALMRSVLV",
"TNHLNQNSFGDLLDLLYASLPCTDRAGRTVVHHICLTAGIKGRGSASRYYLETLLNWAKKHASGNNGYMLKDFINYLNHQ", "TNHLNQNSFGDLLDLLYASLPCTDRAGRTVVHHICLTAGIKGRGSASRYYLETLLNWAKKHASGNNGYMLKDFINYLNHQ",
"DKNGDTALNIAARIGNKNIVEVLMQAGASAYIPNRAGLSVANFGIFVENALKQPEDSKQTKVSLMSENLSSKEKTAVPPR", "DKNGDTALNIAARIGNKNIVEVLMQAGASAYIPNRAGLSVANFGIFVENALKQPEDSKQTKVSLMSENLSSKEKTAVPPR",
"QKSRDIIASVTDVISSLDKDFQDEMAAKQSMIDSAYTQLRESTKKLSDLREQLHVSETQRTLFLELRQRCKNLMTSIEEQ", "QKSRDIIASVTDVISSLDKDFQDEMAAKQSMIDSAYTQLRESTKKLSDLREQLHVSETQRTLFLELRQRCKNLMTSIEEQ",
"KSELSNLYESFDPNGIHDSLSLDADAPFTVNENNNKNLSIAELKFQVAAYERNEARLNELANKLWQRNSNIKSKCRRVVS", "KSELSNLYESFDPNGIHDSLSLDADAPFTVNENNNKNLSIAELKFQVAAYERNEARLNELANKLWQRNSNIKSKCRRVVS",
"LCTGVDESRVDSLLESLLQAVESDGQQGEVDMGRVAGFLRVVKEHQA"] "LCTGVDESRVDSLLESLLQAVESDGQQGEVDMGRVAGFLRVVKEHQA"]
}, },
{ "name" : "05338_USTMA", { "name" : "05338_USTMA",
"RefSeqID" : "XP_011392041", "RefSeqID" : "XP_011392041",
"UniProtID" : "A0A0D1BWD8", "UniProtID" : "A0A0D1BWD8",
"taxonomyID" : "237631", "taxonomyID" : "237631",
"sequence" : [ "sequence" : [
"MPLNYFANQDQTASDTYAHEASSFPAPSSILTDTSKPLQPVQEVAASSLVDGVSFTSPHASIIHASKQSPRAASSLSFTT", "MPLNYFANQDQTASDTYAHEASSFPAPSSILTDTSKPLQPVQEVAASSLVDGVSFTSPHASIIHASKQSPRAASSLSFTT",
"SALQRAGLLPANPNMSTTATSGTSAASESLQRVITQGTASAAAINGASTPAHSGPLTPAHLKNLTPAQANAALQNPVGNI", "SALQRAGLLPANPNMSTTATSGTSAASESLQRVITQGTASAAAINGASTPAHSGPLTPAHLKNLTPAQANAALQNPVGNI",
"PTVYLATYSNVPVYEITVRGIAVMRRRGDGWLNATQILKIAGIEKTRRTKILEKSILTGEHEKIQGGYGKFQGTWIPLQR", "PTVYLATYSNVPVYEITVRGIAVMRRRGDGWLNATQILKIAGIEKTRRTKILEKSILTGEHEKIQGGYGKFQGTWIPLQR",
"AQQVAAEYNVSHLLQPILEFDPATADQIPKLYQRKKPAASARNSSASAINDARGSTPSKIYSPAPASLGGPSQQPRFLSL", "AQQVAAEYNVSHLLQPILEFDPATADQIPKLYQRKKPAASARNSSASAINDARGSTPSKIYSPAPASLGGPSQQPRFLSL",
"RPPKETHEQEISSAIFMPPGTAGLLSNGTFVDDRAASALAYPGPPAIPPGSTPAEQAALRSYNVYGYTPQGVPLPSSAAA", "RPPKETHEQEISSAIFMPPGTAGLLSNGTFVDDRAASALAYPGPPAIPPGSTPAEQAALRSYNVYGYTPQGVPLPSSAAA",
"DGNGTEAAATAASTGAGKREASETDQDGASAAKRSRLTSPQQQRRDDGLLLGPSPVKDLNALGPAGGSLRAASAPRGHRI", "DGNGTEAAATAASTGAGKREASETDQDGASAAKRSRLTSPQQQRRDDGLLLGPSPVKDLNALGPAGGSLRAASAPRGHRI",
"TVGPPDAAGRDGAVPRYADRALPPKPYDEGEKRMRDRLVSLFSDDGVLPGVSEATGAGASQSAADEDDDAYVAKLDSLLA", "TVGPPDAAGRDGAVPRYADRALPPKPYDEGEKRMRDRLVSLFSDDGVLPGVSEATGAGASQSAADEDDDAYVAKLDSLLA",
"DLREKASLGGLGASGTDGPKATVDLITDDHGHTALHWASALCRVKLVRTLVARPPWQGGANIHAGNHAGETALHRSVLVT", "DLREKASLGGLGASGTDGPKATVDLITDDHGHTALHWASALCRVKLVRTLVARPPWQGGANIHAGNHAGETALHRSVLVT",
"NSYDASSFPTLLNLLSSSLNTRDFKKRTVLHHISLVAALKGRAASARYYLACVLEHISAEKNSKYKGLIDAQDEDGETAL", "NSYDASSFPTLLNLLSSSLNTRDFKKRTVLHHISLVAALKGRAASARYYLACVLEHISAEKNSKYKGLIDAQDEDGETAL",
"GIVARLGNASMVRMLLDVGARKDLANALGIRPSDWGIESSADGASLTPSQNDGTNTVASLPPLTAADLASQNPSDIISAL", "GIVARLGNASMVRMLLDVGARKDLANALGIRPSDWGIESSADGASLTPSQNDGTNTVASLPPLTAADLASQNPSDIISAL",
"TRPAQVPVMKSSDVRDQLSSTLDDLQSSFERELKEKQDAVSTVQSHLQAATRDLAARRKTVSAAQAKLAEKDEARQRVQN", "TRPAQVPVMKSSDVRDQLSSTLDDLQSSFERELKEKQDAVSTVQSHLQAATRDLAARRKTVSAAQAKLAEKDEARQRVQN",
"LRRAIVAQLGLEEADADLSLEQLVEEAANAASAAPADKSADKMDIDGAEDVKPVRASNLETLIDDILSFDTIQSDLKAVG", "LRRAIVAQLGLEEADADLSLEQLVEEAANAASAAPADKSADKMDIDGAEDVKPVRASNLETLIDDILSFDTIQSDLKAVG",
"TSAVTQEVVEQDELVRLRWLVSFYQSSCDELSSTISELEDSSAKKESQCQQVVAICANIPQDKVESMLDELLTAMESDGP", "TSAVTQEVVEQDELVRLRWLVSFYQSSCDELSSTISELEDSSAKKESQCQQVVAICANIPQDKVESMLDELLTAMESDGP",
"DVDLARVANFMQKVGKTRENGDQPGVGAQLSSSTSLSTAVSSGGTAASSVVPAVERDGEDAKPDA"] "DVDLARVANFMQKVGKTRENGDQPGVGAQLSSSTSLSTAVSSGGTAASSVVPAVERDGEDAKPDA"]
}, },
{ "name" : "SWI4_SACCE", { "name" : "SWI4_SACCE",
"RefSeqID" : "NP_011036", "RefSeqID" : "NP_011036",
"UniProtID" : "P25302", "UniProtID" : "P25302",
"taxonomyID" : "559292", "taxonomyID" : "559292",
"sequence" : [ "sequence" : [
"MPFDVLISNQKDNTNHQNITPISKSVLLAPHSNHPVIEIATYSETDVYECYIRGFETKIVMRRTKDDWINITQVFKIAQF", "MPFDVLISNQKDNTNHQNITPISKSVLLAPHSNHPVIEIATYSETDVYECYIRGFETKIVMRRTKDDWINITQVFKIAQF",
"SKTKRTKILEKESNDMQHEKVQGGYGRFQGTWIPLDSAKFLVNKYEIIDPVVNSILTFQFDPNNPPPKRSKNSILRKTSP", "SKTKRTKILEKESNDMQHEKVQGGYGRFQGTWIPLDSAKFLVNKYEIIDPVVNSILTFQFDPNNPPPKRSKNSILRKTSP",
"GTKITSPSSYNKTPRKKNSSSSTSATTTAANKKGKKNASINQPNPSPLQNLVFQTPQQFQVNSSMNIMNNNDNHTTMNFN", "GTKITSPSSYNKTPRKKNSSSSTSATTTAANKKGKKNASINQPNPSPLQNLVFQTPQQFQVNSSMNIMNNNDNHTTMNFN",
"NDTRHNLINNISNNSNQSTIIQQQKSIHENSFNNNYSATQKPLQFFPIPTNLQNKNVALNNPNNNDSNSYSHNIDNVINS", "NDTRHNLINNISNNSNQSTIIQQQKSIHENSFNNNYSATQKPLQFFPIPTNLQNKNVALNNPNNNDSNSYSHNIDNVINS",
"SNNNNNGNNNNLIIVPDGPMQSQQQQQHHHEYLTNNFNHSMMDSITNGNSKKRRKKLNQSNEQQFYNQQEKIQRHFKLMK", "SNNNNNGNNNNLIIVPDGPMQSQQQQQHHHEYLTNNFNHSMMDSITNGNSKKRRKKLNQSNEQQFYNQQEKIQRHFKLMK",
"QPLLWQSFQNPNDHHNEYCDSNGSNNNNNTVASNGSSIEVFSSNENDNSMNMSSRSMTPFSAGNTSSQNKLENKMTDQEY", "QPLLWQSFQNPNDHHNEYCDSNGSNNNNNTVASNGSSIEVFSSNENDNSMNMSSRSMTPFSAGNTSSQNKLENKMTDQEY",
"KQTILTILSSERSSDVDQALLATLYPAPKNFNINFEIDDQGHTPLHWATAMANIPLIKMLITLNANALQCNKLGFNCITK", "KQTILTILSSERSSDVDQALLATLYPAPKNFNINFEIDDQGHTPLHWATAMANIPLIKMLITLNANALQCNKLGFNCITK",
"SIFYNNCYKENAFDEIISILKICLITPDVNGRLPFHYLIELSVNKSKNPMIIKSYMDSIILSLGQQDYNLLKICLNYQDN", "SIFYNNCYKENAFDEIISILKICLITPDVNGRLPFHYLIELSVNKSKNPMIIKSYMDSIILSLGQQDYNLLKICLNYQDN",
"IGNTPLHLSALNLNFEVYNRLVYLGASTDILNLDNESPASIMNKFNTPAGGSNSRNNNTKADRKLARNLPQKNYYQQQQQ", "IGNTPLHLSALNLNFEVYNRLVYLGASTDILNLDNESPASIMNKFNTPAGGSNSRNNNTKADRKLARNLPQKNYYQQQQQ",
"QQQPQNNVKIPKIIKTQHPDKEDSTADVNIAKTDSEVNESQYLHSNQPNSTNMNTIMEDLSNINSFVTSSVIKDIKSTPS", "QQQPQNNVKIPKIIKTQHPDKEDSTADVNIAKTDSEVNESQYLHSNQPNSTNMNTIMEDLSNINSFVTSSVIKDIKSTPS",
"KILENSPILYRRRSQSISDEKEKAKDNENQVEKKKDPLNSVKTAMPSLESPSSLLPIQMSPLGKYSKPLSQQINKLNTKV", "KILENSPILYRRRSQSISDEKEKAKDNENQVEKKKDPLNSVKTAMPSLESPSSLLPIQMSPLGKYSKPLSQQINKLNTKV",
"SSLQRIMGEEIKNLDNEVVETESSISNNKKRLITIAHQIEDAFDSVSNKTPINSISDLQSRIKETSSKLNSEKQNFIQSL", "SSLQRIMGEEIKNLDNEVVETESSISNNKKRLITIAHQIEDAFDSVSNKTPINSISDLQSRIKETSSKLNSEKQNFIQSL",
"EKSQALKLATIVQDEESKVDMNTNSSSHPEKQEDEEPIPKSTSETSSPKNTKADAKFSNTVQESYDVNETLRLATELTIL", "EKSQALKLATIVQDEESKVDMNTNSSSHPEKQEDEEPIPKSTSETSSPKNTKADAKFSNTVQESYDVNETLRLATELTIL",
"QFKRRMTTLKISEAKSKINSSVKLDKYRNLIGITIENIDSKLDDIEKDLRANA"] "QFKRRMTTLKISEAKSKINSSVKLDKYRNLIGITIENIDSKLDDIEKDLRANA"]
}, },
{ "name" : "SWI6_NEUCR", { "name" : "SWI6_NEUCR",
"RefSeqID" : "XP_962967", "RefSeqID" : "XP_962967",
"UniProtID" : "Q7SBG9", "UniProtID" : "Q7SBG9",
"taxonomyID" : "367110", "taxonomyID" : "367110",
"sequence" : [ "sequence" : [
"MQPPQLGGASQQSQPSSQQSFSMSQSSQSVYRQYTDPPNRLHNDHAVPTIYSATYSGVGVYEMEVNNVAVMRRQKDGWVN", "MQPPQLGGASQQSQPSSQQSFSMSQSSQSVYRQYTDPPNRLHNDHAVPTIYSATYSGVGVYEMEVNNVAVMRRQKDGWVN",
"ATQILKVANIDKGRRTKILEKEIQIGEHEKVQGGYGKYQGTWIPFERGLEVCRQYGVEELLSKLLTHNRGQEGETGNVDT", "ATQILKVANIDKGRRTKILEKEIQIGEHEKVQGGYGKYQGTWIPFERGLEVCRQYGVEELLSKLLTHNRGQEGETGNVDT",
"PTKEQAMAAQRKRMYNASSQENRGIGSTGTFFKNISSTASTAVAAISKARFDSPAPRNRSGPSRAPSFNRQSSMQDVADF", "PTKEQAMAAQRKRMYNASSQENRGIGSTGTFFKNISSTASTAVAAISKARFDSPAPRNRSGPSRAPSFNRQSSMQDVADF",
"PNSQQSLVSTEYATQTQNADSGFGSQTTQPLAGDGLEQPPRKRQRVLTPARSFGGQTPGHQPLDPFNAGNIANGDSGSPT", "PNSQQSLVSTEYATQTQNADSGFGSQTTQPLAGDGLEQPPRKRQRVLTPARSFGGQTPGHQPLDPFNAGNIANGDSGSPT",
"EPSNSFNYDQVTANDGDASYALGPLRPLPYENNADAEAKRGMLMGLFMDANGPEEAIQAALCNVSPQELDSPIDTQSHTA", "EPSNSFNYDQVTANDGDASYALGPLRPLPYENNADAEAKRGMLMGLFMDANGPEEAIQAALCNVSPQELDSPIDTQSHTA",
"LHWAATLSRMPLLRALIHAGANPWRVNACGETALMRACTVTNSMENNTFPELLDLLGCTLDVTDDKGRTVLHHIAVTSAV", "LHWAATLSRMPLLRALIHAGANPWRVNACGETALMRACTVTNSMENNTFPELLDLLGCTLDVTDDKGRTVLHHIAVTSAV",
"KGRHYASRYYLESLLEWVVRQGSAPSSQENGIGDRKGRRMGIARFMSEIVNAQDNSGDTALNVAARVGNRSIISQLLEVG", "KGRHYASRYYLESLLEWVVRQGSAPSSQENGIGDRKGRRMGIARFMSEIVNAQDNSGDTALNVAARVGNRSIISQLLEVG",
"ADPTIPNRANLKPLDFGIGIADAETNDDPAQEKTGATTGSGHKSRETSDEVVRSITHLIGESASIFQNELKKKQESIDTL", "ADPTIPNRANLKPLDFGIGIADAETNDDPAQEKTGATTGSGHKSRETSDEVVRSITHLIGESASIFQNELKKKQESIDTL",
"HSQLRVTSSQVGDARRTLESLQEKLKAQQLAKQKIVNFNRACEEEEQILIELEQRHGRLDVASANAWEMELESALEIVKT", "HSQLRVTSSQVGDARRTLESLQEKLKAQQLAKQKIVNFNRACEEEEQILIELEQRHGRLDVASANAWEMELESALEIVKT",
"QSPKGLDPDSRPSLPSAAVLRARIKALRARSSKTRQAVAALQAQSKEKELKYRRLVSLCTRRPEIEVEALLDTLTRAVES", "QSPKGLDPDSRPSLPSAAVLRARIKALRARSSKTRQAVAALQAQSKEKELKYRRLVSLCTRRPEIEVEALLDTLTRAVES",
"EKPELEIARVRRFLGGVEGVVH"] "EKPELEIARVRRFLGGVEGVVH"]
}, },
{ "name" : "15042_USTMA", { "name" : "15042_USTMA",
"RefSeqID" : "XP_011388143", "RefSeqID" : "XP_011388143",
"UniProtID" : "A0A0D1CVS5", "UniProtID" : "A0A0D1CVS5",
"taxonomyID" : "237631", "taxonomyID" : "237631",
"sequence" : [ "sequence" : [
"MSTASPLHHGHGNGSYANSPAPTGVTGRDAGVAAAAVADSAVRSGSVPASASGSAPGSASGSMYGEAHTQHHTGHHHYSA", "MSTASPLHHGHGNGSYANSPAPTGVTGRDAGVAAAAVADSAVRSGSVPASASGSAPGSASGSMYGEAHTQHHTGHHHYSA",
"HHTHSHGALTSPVNGGHSSSWSPYGYPAAPVYGGSPSPYGHNAYSQYASGYGYANGTAHHVATAPTTPSATSTAYHTGVN", "HHTHSHGALTSPVNGGHSSSWSPYGYPAAPVYGGSPSPYGHNAYSQYASGYGYANGTAHHVATAPTTPSATSTAYHTGVN",
"GMMMHHGQHAGYGYSSHHLGSHTPTHTHTHSSAYFMNGDGAHSHLNSSAHLTSPSYTTAPQYSTQLPLAGRHRVTTTLWE", "GMMMHHGQHAGYGYSSHHLGSHTPTHTHTHSSAYFMNGDGAHSHLNSSAHLTSPSYTTAPQYSTQLPLAGRHRVTTTLWE",
"DEGTLCFQVDARGVCVARRHDNNMINGTKLLNVCGMSRGKRDGILKNEKERIVVKVGAMHLKGVWISFARAKQLAEQNGI", "DEGTLCFQVDARGVCVARRHDNNMINGTKLLNVCGMSRGKRDGILKNEKERIVVKVGAMHLKGVWISFARAKQLAEQNGI",
"ADALYPLFEPNIQSFLYHPDNYPRTAAVIAAAQERQAQRQRAPGGQPSPGANGTSQAPPLMRANTTPSNGDTSTFSSGLS", "ADALYPLFEPNIQSFLYHPDNYPRTAAVIAAAQERQAQRQRAPGGQPSPGANGTSQAPPLMRANTTPSNGDTSTFSSGLS",
"SLGSWTGSHDQGHASAPTTAQPSPSSMHNGATQMHMSLSNHGTASPTYAQSQQQQQQQQQQQQQQQQQQQQQQQQAYPMT", "SLGSWTGSHDQGHASAPTTAQPSPSSMHNGATQMHMSLSNHGTASPTYAQSQQQQQQQQQQQQQQQQQQQQQQQQAYPMT",
"AAQQLARPSVGDRRQSAPISLNNSVGHAENPYGATNLGGAANGGLVNGARKVSGLKRSWNDADDLNGSAAASPTERDMQR", "AAQQLARPSVGDRRQSAPISLNNSVGHAENPYGATNLGGAANGGLVNGARKVSGLKRSWNDADDLNGSAAASPTERDMQR",
"SGSGGSNGLKLDGDDLHSPDSSDDRLAKKTRGMPQRGGGATTAMPSMSTNMLMGVGNGSGIHHE"] "SGSGGSNGLKLDGDDLHSPDSSDDRLAKKTRGMPQRGGGATTAMPSMSTNMLMGVGNGSGIHHE"]
}, },
{ "name" : "04778_USTMA", { "name" : "04778_USTMA",
"RefSeqID" : "XP_011391646", "RefSeqID" : "XP_011391646",
"UniProtID" : "A0A0D1DQM4", "UniProtID" : "A0A0D1DQM4",
"taxonomyID" : "237631", "taxonomyID" : "237631",
"sequence" : [ "sequence" : [
"MNQAPLSATGVNFYISGPRPARLFPTPIHEFRKGKYATAGGESGFMTVFEYDVRGHTMMIDVDTSFVRFTSITQALGKNK", "MNQAPLSATGVNFYISGPRPARLFPTPIHEFRKGKYATAGGESGFMTVFEYDVRGHTMMIDVDTSFVRFTSITQALGKNK",
"VNFGRLVKTCPALDPHITKLKGGYLSIQGTWLPFDLAKELSRRIAWEIRDHLVPLFGYDFPSTCLRPDSEGFGQLAIGMS", "VNFGRLVKTCPALDPHITKLKGGYLSIQGTWLPFDLAKELSRRIAWEIRDHLVPLFGYDFPSTCLRPDSEGFGQLAIGMS",
"QKRARKRHNNGGPHQTSCYGPSLPISIELWQHSTDPLRDLGESSVVGGQAIEHVSAKNSAVQPCYGSSQPATFHYSKGYG", "QKRARKRHNNGGPHQTSCYGPSLPISIELWQHSTDPLRDLGESSVVGGQAIEHVSAKNSAVQPCYGSSQPATFHYSKGYG",
"LESRPWYGQDYLESNSLESMWNSAQAGGGSVGLQVPISTCGATASPCLAAIGANGGSPILSSPPSSNASSSSNQSYTAAG", "LESRPWYGQDYLESNSLESMWNSAQAGGGSVGLQVPISTCGATASPCLAAIGANGGSPILSSPPSSNASSSSNQSYTAAG",
"YGLMVPPTVPSHSVNSEAGANQAEGPTPIDGSRSYASLTAHGYATGYGDANASLSTWNDATHASTFTLHVHAHVHFQPPD", "YGLMVPPTVPSHSVNSEAGANQAEGPTPIDGSRSYASLTAHGYATGYGDANASLSTWNDATHASTFTLHVHAHVHFQPPD",
"PESAQLFTIHDFGSDPFYAEQVERG"] "PESAQLFTIHDFGSDPFYAEQVERG"]
}, },
{ "name" : "STUA_ASPNI", { "name" : "STUA_ASPNI",
"RefSeqID" : "XP_663440", "RefSeqID" : "XP_663440",
"UniProtID" : "P36011", "UniProtID" : "P36011",
"taxonomyID" : "227321", "taxonomyID" : "227321",
"sequence" : [ "sequence" : [
"MASMNQPQPYMDVHSHLSSGQTYASHPATAGALTHYQYPQQPPVLQPTSTYGPASSYSQYPYPNSVASSQSVPPPTTSIS", "MASMNQPQPYMDVHSHLSSGQTYASHPATAGALTHYQYPQQPPVLQPTSTYGPASSYSQYPYPNSVASSQSVPPPTTSIS",
"SQVPAQLLPLPVTNHPVPTHGYGNNSGTPMQGYVYDPTGQMAPPGAKPRVTATLWEDEGSLCYQVEAKGVCVARREDNGM", "SQVPAQLLPLPVTNHPVPTHGYGNNSGTPMQGYVYDPTGQMAPPGAKPRVTATLWEDEGSLCYQVEAKGVCVARREDNGM",
"INGTKLLNVAGMTRGRRDGILKSEKVRNVVKIGPMHLKGVWIPFDRALEFANKEKITDLLYPLFVQHISNLLYHPANQNQ", "INGTKLLNVAGMTRGRRDGILKSEKVRNVVKIGPMHLKGVWIPFDRALEFANKEKITDLLYPLFVQHISNLLYHPANQNQ",
"RNMTVPDSRRLEGPQPVVRTPQAQQPPSLHHHSLQTPVPSHMSQPGGRPSLDRAHTFPTPPARMNSSVPNTQPLSIDTSL", "RNMTVPDSRRLEGPQPVVRTPQAQQPPSLHHHSLQTPVPSHMSQPGGRPSLDRAHTFPTPPARMNSSVPNTQPLSIDTSL",
"SNARSMPTTPATTPPGNNLQGMQSYQPQSGYDSKPYYSAAPSTHPQYAPQQPLPQQSMAQYGHSMPTSSYRDMAPPSSQR", "SNARSMPTTPATTPPGNNLQGMQSYQPQSGYDSKPYYSAAPSTHPQYAPQQPLPQQSMAQYGHSMPTSSYRDMAPPSSQR",
"GSVTEIESDVKTERYGQGTVAKTEPEQEQEYAQPDSGYNTGRGSYYTTNPSVGGLAHDHSQLTPDMTGSPQQNGSGRMTP", "GSVTEIESDVKTERYGQGTVAKTEPEQEQEYAQPDSGYNTGRGSYYTTNPSVGGLAHDHSQLTPDMTGSPQQNGSGRMTP",
"RTSNTAPQWAPGYTTPPRPAAASSLYNIVSDTRGTSGANGSTSDNYSVASNSGYSTGMNGSMGSNKRMRDDDDDRIVPPD", "RTSNTAPQWAPGYTTPPRPAAASSLYNIVSDTRGTSGANGSTSDNYSVASNSGYSTGMNGSMGSNKRMRDDDDDRIVPPD",
"SRGEFDTKRRKTLTETPVGGPVGGVPLGLQPMKAGGSLISARR"] "SRGEFDTKRRKTLTETPVGGPVGGVPLGLQPMKAGGSLISARR"]
}, },
{ "name" : "STUA_NEUCR", { "name" : "STUA_NEUCR",
"RefSeqID" : "XP_960837", "RefSeqID" : "XP_960837",
"UniProtID" : "Q1K6U0", "UniProtID" : "Q1K6U0",
"taxonomyID" : "367110", "taxonomyID" : "367110",
"sequence" : [ "sequence" : [
"MNPNTPADVYYGQMSQGSSMPVTTVPSHSHYASQQPPPLLQPGSTYAHQYGTPQYGYANALSSPASIPPSLPPSMNSMAG", "MNPNTPADVYYGQMSQGSSMPVTTVPSHSHYASQQPPPLLQPGSTYAHQYGTPQYGYANALSSPASIPPSLPPSMNSMAG",
"QSVLPLPGSGSMNPAVYASGGFDTTGQVAPPGMKPRVTATLWEDEGSLCFQVEARGICVARREDNAMINGTKLLNVAGMT", "QSVLPLPGSGSMNPAVYASGGFDTTGQVAPPGMKPRVTATLWEDEGSLCFQVEARGICVARREDNAMINGTKLLNVAGMT",
"RGRRDGILKSEKVRHVVKIGPMHLKGVWIPFERALDFANKEKITELLYPLFVHNIGALLYHPTNQSRTSQVMAAAEQRRK", "RGRRDGILKSEKVRHVVKIGPMHLKGVWIPFERALDFANKEKITELLYPLFVHNIGALLYHPTNQSRTSQVMAAAEQRRK",
"DSHGQLRGPPGLPSLQQHHHHHSMLPGPPSLPSHPSMGRPALDRAHTFPTPPTSASSVMGPMGNSDGYQWSQQSMSGTQG", "DSHGQLRGPPGLPSLQQHHHHHSMLPGPPSLPSHPSMGRPALDRAHTFPTPPTSASSVMGPMGNSDGYQWSQQSMSGTQG",
"NSSLSLDTSLGSNARSMPSTPATTPPGSTIQSMQNYPPVSQSYESSRQMYQGQSAQQAQYQSQQHYSSQPQHQERPVYSQ", "NSSLSLDTSLGSNARSMPSTPATTPPGSTIQSMQNYPPVSQSYESSRQMYQGQSAQQAQYQSQQHYSSQPQHQERPVYSQ",
"SSYIKNDMGPPSGRPTGQSNDASDSKPPTGMIHQGQGQSDPGTHAGSEEDDDANNEAEYTHDSGGYDANRGSYNYNTQAV", "SSYIKNDMGPPSGRPTGQSNDASDSKPPTGMIHQGQGQSDPGTHAGSEEDDDANNEAEYTHDSGGYDANRGSYNYNTQAV",
"NSLPHDHGLAPEIGGSPHQAGSGRATPRTAAAPSSYYSAQGYHTPPRGQPSSSLYNVMSNERTGSNGTQGNEMYAGQADM", "NSLPHDHGLAPEIGGSPHQAGSGRATPRTAAAPSSYYSAQGYHTPPRGQPSSSLYNVMSNERTGSNGTQGNEMYAGQADM",
"PSSLPNGYSAQPSVMNGSSGGLKRGRDDDDDGGRPTTSAPNLGPGMDMKRRKTMMDGGSLPSPTYTATIAQAAPSAIAAH", "PSSLPNGYSAQPSVMNGSSGGLKRGRDDDDDGGRPTTSAPNLGPGMDMKRRKTMMDGGSLPSPTYTATIAQAAPSAIAAH",
"RRR"] "RRR"]
}, },
{ "name" : "PHD1_SACCE", { "name" : "PHD1_SACCE",
"RefSeqID" : "NP_012881", "RefSeqID" : "NP_012881",
"UniProtID" : "P36093", "UniProtID" : "P36093",
"taxonomyID" : "559292", "taxonomyID" : "559292",
"sequence" : [ "sequence" : [
"MYHVPEMRLHYPLVNTQSNAAITPTRSYDNTLPSFNELSHQSTINLPFVQRETPNAYANVAQLATSPTQAKSGYYCRYYA", "MYHVPEMRLHYPLVNTQSNAAITPTRSYDNTLPSFNELSHQSTINLPFVQRETPNAYANVAQLATSPTQAKSGYYCRYYA",
"VPFPTYPQQPQSPYQQAVLPYATIPNSNFQPSSFPVMAVMPPEVQFDGSFLNTLHPHTELPPIIQNTNDTSVARPNNLKS", "VPFPTYPQQPQSPYQQAVLPYATIPNSNFQPSSFPVMAVMPPEVQFDGSFLNTLHPHTELPPIIQNTNDTSVARPNNLKS",
"IAAASPTVTATTRTPGVSSTSVLKPRVITTMWEDENTICYQVEANGISVVRRADNNMINGTKLLNVTKMTRGRRDGILRS", "IAAASPTVTATTRTPGVSSTSVLKPRVITTMWEDENTICYQVEANGISVVRRADNNMINGTKLLNVTKMTRGRRDGILRS",
"EKVREVVKIGSMHLKGVWIPFERAYILAQREQILDHLYPLFVKDIESIVDARKPSNKASLTPKSSPAPIKQEPSDNKHEI", "EKVREVVKIGSMHLKGVWIPFERAYILAQREQILDHLYPLFVKDIESIVDARKPSNKASLTPKSSPAPIKQEPSDNKHEI",
"ATEIKPKSIDALSNGASTQGAGELPHLKINHIDTEAQTSRAKNELS"] "ATEIKPKSIDALSNGASTQGAGELPHLKINHIDTEAQTSRAKNELS"]
}, },
{ "name" : "08099_COPCI", { "name" : "08099_COPCI",
"RefSeqID" : "XP_001836714", "RefSeqID" : "XP_001836714",
"UniProtID" : "A8NVH3", "UniProtID" : "A8NVH3",
"taxonomyID" : "240176", "taxonomyID" : "240176",
"sequence" : [ "sequence" : [
"MSTGMLQETLQTTSASTSGTRFRPYASPNHQVTKGRYITSNDPRGYIPVYEYPLNGQWIMMDIDDGYILWTGIWKALGNS", "MSTGMLQETLQTTSASTSGTRFRPYASPNHQVTKGRYITSNDPRGYIPVYEYPLNGQWIMMDIDDGYILWTGIWKALGNS",
"KADIVKMIDSQPDLAPLIRRVRGGYLKIQGTWMPYEVALKLSRRVAWPIRHDLVPLFGPTFPSTCLSPDQPGYGQVVASS", "KADIVKMIDSQPDLAPLIRRVRGGYLKIQGTWMPYEVALKLSRRVAWPIRHDLVPLFGPTFPSTCLSPDQPGYGQVVASS",
"NVRRRARRNTQATAQPPREAHSNWTVMTPGPMVGLSFPHSQFSRPPLPPLAPTPARSPSDYAPSSHYGNQLDPQDARRYS", "NVRRRARRNTQATAQPPREAHSNWTVMTPGPMVGLSFPHSQFSRPPLPPLAPTPARSPSDYAPSSHYGNQLDPQDARRYS",
"HSPYSPLASPPERKSSISSKALSLEIPPVRPSSSKAREDISLPPLKQPDGADPEMSPYALPPISALEDLRGVDTQDSAAV", "HSPYSPLASPPERKSSISSKALSLEIPPVRPSSSKAREDISLPPLKQPDGADPEMSPYALPPISALEDLRGVDTQDSAAV",
"LRRLRLDDDYPSSSRSSTSQDSIWGRRHSLSAHSPHPRSSDNSRFQPYLSSRSYQDSTLKRSRSPAESYADRRRASDFSQ", "LRRLRLDDDYPSSSRSSTSQDSIWGRRHSLSAHSPHPRSSDNSRFQPYLSSRSYQDSTLKRSRSPAESYADRRRASDFSQ",
"EDSTSAYSPISPATPNSSILSHSSFSDLKKLASSTDTRYNFPRISGRDWAPLKGDTDHIRSSYRSGPSPLELDSDSESSA", "EDSTSAYSPISPATPNSSILSHSSFSDLKKLASSTDTRYNFPRISGRDWAPLKGDTDHIRSSYRSGPSPLELDSDSESSA",
"PHRPW"] "PHRPW"]
}, },
{ "name" : "68479_WALME", { "name" : "68479_WALME",
"RefSeqID" : "XP_006957792", "RefSeqID" : "XP_006957792",
"UniProtID" : "I4YDE0", "UniProtID" : "I4YDE0",
"taxonomyID" : "671144", "taxonomyID" : "671144",
"sequence" : [ "sequence" : [
"MTNKVQELWWEENKTRVWQVEVDNGNYVARRQDNDQINGTKLLNITKITRGKRDGILKNEKSRQVVKTGTITLKGVWIPF", "MTNKVQELWWEENKTRVWQVEVDNGNYVARRQDNDQINGTKLLNITKITRGKRDGILKNEKSRQVVKTGTITLKGVWIPF",
"ERAIILARQFNIEQQLYPLFETNLGDYVENSIGSHQIKRKSLNNLMDSLTTNRELVSKRRSTVSTYNPATSAYVSPYGFS", "ERAIILARQFNIEQQLYPLFETNLGDYVENSIGSHQIKRKSLNNLMDSLTTNRELVSKRRSTVSTYNPATSAYVSPYGFS",
"PQHCYQTEFEDMNQHSGEIQSGRPRNTSSASDWMTNWSTSSSSPVIPATPNTFSPVMNTFQSLALHSPPIPIPNYYYDSS", "PQHCYQTEFEDMNQHSGEIQSGRPRNTSSASDWMTNWSTSSSSPVIPATPNTFSPVMNTFQSLALHSPPIPIPNYYYDSS",
"SSYFPSYHQKQQQQQVQMQMQMHTTASIGGDRQSNEYIQR"] "SSYFPSYHQKQQQQQVQMQMQMHTTASIGGDRQSNEYIQR"]
}, },
{ "name" : "11943_PUCGR", { "name" : "11943_PUCGR",
"RefSeqID" : "XP_003330006", "RefSeqID" : "XP_003330006",
"UniProtID" : "E3KMR2", "UniProtID" : "E3KMR2",
"taxonomyID" : "418459", "taxonomyID" : "418459",
"sequence" : [ "sequence" : [
"MAAAPTSSFLTSMSAQPPRTVQALVNEEVRAPPPVRLYPSQHRVSMTRYATSTDPRGYIPVFEYPLNGQYIMIDCETGMV", "MAAAPTSSFLTSMSAQPPRTVQALVNEEVRAPPPVRLYPSQHRVSMTRYATSTDPRGYIPVFEYPLNGQYIMIDCETGMV",
"HFTGIWKALGHTKADVVKLVESDPTIAPYLRKVRGGYLKIQGTWLPFDTAQTLARRVAWQVRYDLVPLFGPDFPDTCLGP", "HFTGIWKALGHTKADVVKLVESDPTIAPYLRKVRGGYLKIQGTWLPFDTAQTLARRVAWQVRYDLVPLFGPDFPDTCLGP",
"GEPGFGQLLLSAPKPRGRRGAKKAAAAPTVAHERTASPQDNRSQSRPGPYPSQESFGNRCSGRVEAVGAMNGYSPMLSQA", "GEPGFGQLLLSAPKPRGRRGAKKAAAAPTVAHERTASPQDNRSQSRPGPYPSQESFGNRCSGRVEAVGAMNGYSPMLSQA",
"RYSPYTRAPVHRITQLEPLPSLIQPNQSCPHPTADSMYSSHYHQSPRQSMMTSHGAGPYGQQHLTGSTASGMQSTAPLPS", "RYSPYTRAPVHRITQLEPLPSLIQPNQSCPHPTADSMYSSHYHQSPRQSMMTSHGAGPYGQQHLTGSTASGMQSTAPLPS",
"MRPHQAHQSENNFFETYRGPDSFEALSNKWLAPEVANPSLNDSGLLHGEGGCLPPLQYSNNPVLRNGPSGSPTNQYNFPN", "MRPHQAHQSENNFFETYRGPDSFEALSNKWLAPEVANPSLNDSGLLHGEGGCLPPLQYSNNPVLRNGPSGSPTNQYNFPN",
"QIDSAHSSHHIDSNQTQHVHRHAGFPYESQHQSNFRHDLSTEEAAHHPASPSQQPPPSVTYDKAHNSEPQAGSQAANVTA", "QIDSAHSSHHIDSNQTQHVHRHAGFPYESQHQSNFRHDLSTEEAAHHPASPSQQPPPSVTYDKAHNSEPQAGSQAANVTA",
"GCYAASGSNSTGNPAGSPGSHSSHVPKSPTPSSASTSTHMQNSHNPNSHRSPSNTLTNMSNNGGFNSNTQGEEAIQFSVL", "GCYAASGSNSTGNPAGSPGSHSSHVPKSPTPSSASTSTHMQNSHNPNSHRSPSNTLTNMSNNGGFNSNTQGEEAIQFSVL",
"TSPAHLETSGPSENSIPPAQSSDSDWNPAQNTTGLSPSQAPRQ"] "TSPAHLETSGPSENSIPPAQSSDSDWNPAQNTTGLSPSQAPRQ"]
}, },
{ "name" : "03082_PUCGR", { "name" : "03082_PUCGR",
"RefSeqID" : "XP_003321545", "RefSeqID" : "XP_003321545",
"UniProtID" : "E3JYK1", "UniProtID" : "E3JYK1",
"taxonomyID" : "418459", "taxonomyID" : "418459",
"sequence" : [ "sequence" : [
"MILISPTRTLPSPRPIDTDPILNYRHIQPAAAAAAVGPWLGQNQHHHHHHDTLAKSPNITTAPATHSPSELSASPAPSAV", "MILISPTRTLPSPRPIDTDPILNYRHIQPAAAAAAVGPWLGQNQHHHHHHDTLAKSPNITTAPATHSPSELSASPAPSAV",
"STGSSLLDPQSVPHIKIPHSSSPPAIMLPQPSSDDDSSTAEEEQPSAQSSNATLNTPTPHTNAPHQLDSHASSVGLYDLP", "STGSSLLDPQSVPHIKIPHSSSPPAIMLPQPSSDDDSSTAEEEQPSAQSSNATLNTPTPHTNAPHQLDSHASSVGLYDLP",
"PTSSSAPTTSSSSSPFPSNVPSHQQPSPYSSSPHPNQEHHPHHPHHGNQFYQQSPPALHSPLQSAHHPQQSFDARPHSSL", "PTSSSAPTTSSSSSPFPSNVPSHQQPSPYSSSPHPNQEHHPHHPHHGNQFYQQSPPALHSPLQSAHHPQQSFDARPHSSL",
"FAHQHYHSRPQSAPHSTSQFSLDPHVLAAAAANVEVKKWDEENTYYYQVAHKGVTVGRLKGSGLVNGTKLLNLAGISRGK", "FAHQHYHSRPQSAPHSTSQFSLDPHVLAAAAANVEVKKWDEENTYYYQVAHKGVTVGRLKGSGLVNGTKLLNLAGISRGK",
"RDGILKNEKIRKVVKHGTMHLKGVWIAFDRAVFLAEQHSIADKIFPLLVVNLEHYVPIEPPLMAGGTKLGPGSLFHHHHP", "RDGILKNEKIRKVVKHGTMHLKGVWIAFDRAVFLAEQHSIADKIFPLLVVNLEHYVPIEPPLMAGGTKLGPGSLFHHHHP",
"RHPRLLPQPIKFPPSTISLAPASANSFSSTGGWPSGPSSALPSIGYNEPFSAPPIPRSAATADTSPSIYEQAQFQYLNSA", "RHPRLLPQPIKFPPSTISLAPASANSFSSTGGWPSGPSSALPSIGYNEPFSAPPIPRSAATADTSPSIYEQAQFQYLNSA",
"QANNPDLLERRHTLPNNSFHGYNSVPSFGSSQPPPPVSYSFHYNSTHVPGYPPRSSTAESATPNQFEYQSKNHNGNGNGD", "QANNPDLLERRHTLPNNSFHGYNSVPSFGSSQPPPPVSYSFHYNSTHVPGYPPRSSTAESATPNQFEYQSKNHNGNGNGD",
"AAGSYPATLYHSQPAARPVSSTTAQPSPALNSAPLLLGDLSPGSSTQIVDHGAGDFRLSTGTSNGQVKQEGDDESCNEKR", "AAGSYPATLYHSQPAARPVSSTTAQPSPALNSAPLLLGDLSPGSSTQIVDHGAGDFRLSTGTSNGQVKQEGDDESCNEKR",
"LIMEWNPSC"] "LIMEWNPSC"]
}, },
{ "name" : "SOK2_SACCE", { "name" : "SOK2_SACCE",
"RefSeqID" : "NP_013729", "RefSeqID" : "NP_013729",
"UniProtID" : "P53438", "UniProtID" : "P53438",
"taxonomyID" : "559292", "taxonomyID" : "559292",
"sequence" : [ "sequence" : [
"MPIGNPINTNDIKSNRMRQESNMSAVSNSESTIGQSTQQQQQQQQYLGQSVQPLMPVSYQYVVPEQWPYPQYYQQPQSQS", "MPIGNPINTNDIKSNRMRQESNMSAVSNSESTIGQSTQQQQQQQQYLGQSVQPLMPVSYQYVVPEQWPYPQYYQQPQSQS",
"QQQLQSQPQMYQVQESFQSSGSDSNASNPPSTSVGVPSNATATALPNGSAITTKKSNNSTNISNNVPYYYYFPQMQAQQS", "QQQLQSQPQMYQVQESFQSSGSDSNASNPPSTSVGVPSNATATALPNGSAITTKKSNNSTNISNNVPYYYYFPQMQAQQS",
"MAYSYPQAYYYYPANGDGTTNGATPSVTSNQVQNPNLEKTYSTFEQQQQHQQQQQLQAQTYPAQPPKIGNAFSKFSKSGP", "MAYSYPQAYYYYPANGDGTTNGATPSVTSNQVQNPNLEKTYSTFEQQQQHQQQQQLQAQTYPAQPPKIGNAFSKFSKSGP",
"PSDSSSGSMSPNSNRTSRNSNSISSLAQQPPMSNYPQPSTYQYPGFHKTSSIPNSHSPIPPRSLTTPTQGPTSQNGPLSY", "PSDSSSGSMSPNSNRTSRNSNSISSLAQQPPMSNYPQPSTYQYPGFHKTSSIPNSHSPIPPRSLTTPTQGPTSQNGPLSY",
"NLPQVGLLPPQQQQQVSPLYDGNSITPPVKPSTDQETYLTANRHGVSDQQYDSMAKTMNSFQTTTIRHPMPLIATTNATG", "NLPQVGLLPPQQQQQVSPLYDGNSITPPVKPSTDQETYLTANRHGVSDQQYDSMAKTMNSFQTTTIRHPMPLIATTNATG",
"SNTSGTSASIIRPRVTTTMWEDEKTLCYQVEANGISVVRRADNDMVNGTKLLNVTKMTRGRRDGILKAEKIRHVVKIGSM", "SNTSGTSASIIRPRVTTTMWEDEKTLCYQVEANGISVVRRADNDMVNGTKLLNVTKMTRGRRDGILKAEKIRHVVKIGSM",
"HLKGVWIPFERALAIAQREKIADYLYPLFIRDIQSVLKQNNPSNDSSSSSSSTGIKSISPRTYYQPINNYQNPNGPSNIS", "HLKGVWIPFERALAIAQREKIADYLYPLFIRDIQSVLKQNNPSNDSSSSSSSTGIKSISPRTYYQPINNYQNPNGPSNIS",
"AAQLTYSSMNLNNKIIPNNSIPAVSTIAAGEKPLKKCTMPNSNQLEGHTITNLQTLSATMPMKQQLMGNIASPLSYPRNA", "AAQLTYSSMNLNNKIIPNNSIPAVSTIAAGEKPLKKCTMPNSNQLEGHTITNLQTLSATMPMKQQLMGNIASPLSYPRNA",
"TMNSASTLGITPADSKPLTPSPTTTNTNQSSESNVGSIHTGITLPRVESESASHSKWSKEADSGNTVPDNQTLKEPRSSQ", "TMNSASTLGITPADSKPLTPSPTTTNTNQSSESNVGSIHTGITLPRVESESASHSKWSKEADSGNTVPDNQTLKEPRSSQ",
"LPISALTSTDTDKIKTSTSDEATQPNEPSEAEPVKESESSKSQVDGAGDVSNEEIAADDTKKQEK"] "LPISALTSTDTDKIKTSTSDEATQPNEPSEAEPVKESESSKSQVDGAGDVSNEEIAADDTKKQEK"]
}, },
{ "name" : "14426_COPCI", { "name" : "14426_COPCI",
"RefSeqID" : "XP_002911429", "RefSeqID" : "XP_002911429",
"UniProtID" : "D6RMB0", "UniProtID" : "D6RMB0",
"taxonomyID" : "240176", "taxonomyID" : "240176",
"sequence" : [ "sequence" : [
"MTARPPLPLRHANPSLRDGNATIPPVKYQILSCQGKDILVGRLKIDTTDGGHAFILRRFDTQAISLTTMFRAAFPTASEA", "MTARPPLPLRHANPSLRDGNATIPPVKYQILSCQGKDILVGRLKIDTTDGGHAFILRRFDTQAISLTTMFRAAFPTASEA",
"EEKDEINYVKANFDLFGNNGSSKEPHITRLAGTWVNRDTAGQLAHDYNMVDLINTMVEAEPDPNGQYRRSNKSAQNNNPP", "EEKDEINYVKANFDLFGNNGSSKEPHITRLAGTWVNRDTAGQLAHDYNMVDLINTMVEAEPDPNGQYRRSNKSAQNNNPP",
"TNAPEPTPATNVHATRSPAKQSPKPPSKTLPTPSPGSGDAQPPAPKRRREGSPATFTSGIPVASSPAVPKTPGPRRSTRT", "TNAPEPTPATNVHATRSPAKQSPKPPSKTLPTPSPGSGDAQPPAPKRRREGSPATFTSGIPVASSPAVPKTPGPRRSTRT",
"KSPAPSRVPQPLTATKPRSRASVAPPSPKKRPVDLPKSSPIKAEEDTAVEDNVAGNELYAQDISEQKKLIADLKAAASSK", "KSPAPSRVPQPLTATKPRSRASVAPPSPKKRPVDLPKSSPIKAEEDTAVEDNVAGNELYAQDISEQKKLIADLKAAASSK",
"KPADTVKEDDDQQMEEEGQGPSKLKRIRQDEEKPLQFEFKEPEREERQIATNRRVGRFDMQPERKSLAWGIAAFAFGMTA", "KPADTVKEDDDQQMEEEGQGPSKLKRIRQDEEKPLQFEFKEPEREERQIATNRRVGRFDMQPERKSLAWGIAAFAFGMTA",
"ITYLPNFL"] "ITYLPNFL"]
}, },
{ "name" : "BQT4_SCHPO", { "name" : "BQT4_SCHPO",
"RefSeqID" : "NP_596166", "RefSeqID" : "NP_596166",
"UniProtID" : "O60158", "UniProtID" : "O60158",
"taxonomyID" : "284812", "taxonomyID" : "284812",
"sequence" : [ "sequence" : [
"MTENEKSRSLPAERNPLYKDDTLDHTPLIPKCRAQVIEFPDGPATFVRLKCTNPESKVPHFLMRMAKDSSISATSMFRSA", "MTENEKSRSLPAERNPLYKDDTLDHTPLIPKCRAQVIEFPDGPATFVRLKCTNPESKVPHFLMRMAKDSSISATSMFRSA",
"FPKATQEEEDLEMRWIRDNLNPIEDKRVAGLWVPPADALALAKDYSMTPFINALLEASSTPSTYATPSRPTAQKSETSEG", "FPKATQEEEDLEMRWIRDNLNPIEDKRVAGLWVPPADALALAKDYSMTPFINALLEASSTPSTYATPSRPTAQKSETSEG",
"EPESSTSATTTSVARRTRQRLAEHLENSKKTILQHDNKEEDKEIHSEENETKDEIKSEKKEPEIKKQEGGSSTEKVGQPS", "EPESSTSATTTSVARRTRQRLAEHLENSKKTILQHDNKEEDKEIHSEENETKDEIKSEKKEPEIKKQEGGSSTEKVGQPS",
"SSDDKAKGSTSKDQPSEEEEKTSDIQDRKIKTPIKPSLLGKIRSSVNKGMTDVASQVNRGMTDVASQVNKGVNGVASQVN", "SSDDKAKGSTSKDQPSEEEEKTSDIQDRKIKTPIKPSLLGKIRSSVNKGMTDVASQVNRGMTDVASQVNKGVNGVASQVN",
"KGMNGVANQVNKGVTGVASQVRKPVGKLEKKFENLEKSIGDTLKSSIRSSPKSKKRSREDFEENEDYNAMVPVKRSRITK", "KGMNGVANQVNKGVTGVASQVRKPVGKLEKKFENLEKSIGDTLKSSIRSSPKSKKRSREDFEENEDYNAMVPVKRSRITK",
"LESEVYYEKRKVRALGGIAIGLGVGAILPFLF"] "LESEVYYEKRKVRALGGIAIGLGVGAILPFLF"]
}, },
{ "name" : "PGTG_05590", { "name" : "PGTG_05590",
"RefSeqID" : "XP_003323688", "RefSeqID" : "XP_003323688",
"UniProtID" : "E3K4V4", "UniProtID" : "E3K4V4",
"taxonomyID" : "418459", "taxonomyID" : "418459",
"sequence" : [ "sequence" : [
"MPKSSSCCEPEQKQSIPTNANPISAGGAGLDIRLAGMRSAHATLRGCSFSPYMVTQHPPLRDSVNRNKQQPTNNSTNPYT", "MPKSSSCCEPEQKQSIPTNANPISAGGAGLDIRLAGMRSAHATLRGCSFSPYMVTQHPPLRDSVNRNKQQPTNNSTNPYT",
"KKASRMSQTNLYKSNNPPNLPQDEFNQTLVNYQGKLRSIRIQDININGHTITIARIKIPSPEKLSSHLIKRFDTNAISAS", "KKASRMSQTNLYKSNNPPNLPQDEFNQTLVNYQGKLRSIRIQDININGHTITIARIKIPSPEKLSSHLIKRFDTNAISAS",
"SFFRSAFPHSTEEEEAIQMRYLHQIYDTHTAGAVEFGSARKLTGVWVPIENAAELAEVYGLTRFAEPLLAFPNPKENPRS", "SFFRSAFPHSTEEEEAIQMRYLHQIYDTHTAGAVEFGSARKLTGVWVPIENAAELAEVYGLTRFAEPLLAFPNPKENPRS",
"PTGTKIGGEDESSTTQTPKASQQSKLTGQISVTRSSKRSRAGPLSFGNTSPSSFSLNSFNKPPTETNKSGTHDDSKSTND", "PTGTKIGGEDESSTTQTPKASQQSKLTGQISVTRSSKRSRAGPLSFGNTSPSSFSLNSFNKPPTETNKSGTHDDSKSTND",
"ENDEKPASPTDRVAGRGARNSPSKKPTTVDENHEHTEHEDHQLIGTDELAQRAKQEALKLVSELKNSQPCTQSSLESPTN", "ENDEKPASPTDRVAGRGARNSPSKKPTTVDENHEHTEHEDHQLIGTDELAQRAKQEALKLVSELKNSQPCTQSSLESPTN",
"TLETELTRTTSPAKSNKVTRKRSSDEVSFEGEEQGEDEDEERTADETATHRSFLPKLLWRKSAAQAHPNSKKHKRTQLGG", "TLETELTRTTSPAKSNKVTRKRSSDEVSFEGEEQGEDEDEERTADETATHRSFLPKLLWRKSAAQAHPNSKKHKRTQLGG",
"GGSSSSSSKSFVPLLTNSATPSVDDSSSTHNPNKRNLAIAGIVIAGAAA"] "GGSSSSSSKSFVPLLTNSATPSVDDSSSTHNPNKRNLAIAGIVIAGAAA"]
}, },
{ "name" : "06560_NEUCR", { "name" : "06560_NEUCR",
"RefSeqID" : "XP_962267", "RefSeqID" : "XP_962267",
"UniProtID" : "Q7S9H5", "UniProtID" : "Q7S9H5",
"taxonomyID" : "367110", "taxonomyID" : "367110",
"sequence" : [ "sequence" : [
"MAQVARHLPARRNPLMLEDVPSHTDLASRRRLGQTQLTPRMVTAVPGAEVDPSSLLAFDYAHLRAPLPKGIVSGIFKSSP", "MAQVARHLPARRNPLMLEDVPSHTDLASRRRLGQTQLTPRMVTAVPGAEVDPSSLLAFDYAHLRAPLPKGIVSGIFKSSP",
"PSYFLMRRSQDGYISATGMFKATFPYASQEEEEAERKYIKSIPTTSSEETAGNVWIPPEQALILAEEYQITPWIRALLDP", "PSYFLMRRSQDGYISATGMFKATFPYASQEEEEAERKYIKSIPTTSSEETAGNVWIPPEQALILAEEYQITPWIRALLDP",
"SDIAVTATDSSAPKQIAPPPKFFGAQPPLVAPTPPTTRSTRSRPSSRRSSSPAKSTTTSKRGTTPRNTKRTVTTEASATT", "SDIAVTATDSSAPKQIAPPPKFFGAQPPLVAPTPPTTRSTRSRPSSRRSSSPAKSTTTSKRGTTPRNTKRTVTTEASATT",
"VTTTATATAVPSAETPATSFADSQAPTLINGEIPTSTPINTVPVTKIQTTEAELKVESIEKEPVVVLEPIEEEPKIKVRV", "VTTTATATAVPSAETPATSFADSQAPTLINGEIPTSTPINTVPVTKIQTTEAELKVESIEKEPVVVLEPIEEEPKIKVRV",
"DEDVKLDKDGEEVKHTKVELEVPLMAGEPPSKEEARKMIEEAKAMVEAAVKADAEAAAALVEASKAGAEDEKAEDEAKAE", "DEDVKLDKDGEEVKHTKVELEVPLMAGEPPSKEEARKMIEEAKAMVEAAVKADAEAAAALVEASKAGAEDEKAEDEAKAE",
"TEATKEEEADSKGKRKAEKISVDEDEKAADEAEQPRQAKRVKTEAELRKDRIRKRAYLGLTATFAVGALGALLPIITPYV", "TEATKEEEADSKGKRKAEKISVDEDEKAADEAEQPRQAKRVKTEAELRKDRIRKRAYLGLTATFAVGALGALLPIITPYV",
"ANVL"] "ANVL"]
}, },
{ "name" : "81480_BIPOR", { "name" : "81480_BIPOR",
"RefSeqID" : "XP_007682909", "RefSeqID" : "XP_007682909",
"UniProtID" : "W6ZKJ4", "UniProtID" : "W6ZKJ4",
"taxonomyID" : "930090", "taxonomyID" : "930090",
"sequence" : [ "sequence" : [
"MVVDRVLPERKNPLLEPTDSTSIEILIERRRLGQTNLGVKAGVSGIANATKPENMGTFDYAHLRVPLPKDLTGSGIFSRN", "MVVDRVLPERKNPLLEPTDSTSIEILIERRRLGQTNLGVKAGVSGIANATKPENMGTFDYAHLRVPLPKDLTGSGIFSRN",
"RMSAFPESYFLMRRSSDGYISATGMFKAAFPWASLQEEDLERKYQKTFPSAGDEEVAGSVWIAPEEALALSEEYSMRHWI", "RMSAFPESYFLMRRSSDGYISATGMFKAAFPWASLQEEDLERKYQKTFPSAGDEEVAGSVWIAPEEALALSEEYSMRHWI",
"EALLDPAPIEKGGKDKSNAAIQMPPRFDVANAQPATLPTFGFRQTRARSARSVSPSKAMTPGRKYATPRKGRSTRSAMKP", "EALLDPAPIEKGGKDKSNAAIQMPPRFDVANAQPATLPTFGFRQTRARSARSVSPSKAMTPGRKYATPRKGRSTRSAMKP",
"DATHADDMFRPIEAVTPSTALQNSIARRIAPAETIASSIEGEVKEVEQEVKAALDAEKKPEPELEVQEGTVHIEVKQTVE", "DATHADDMFRPIEAVTPSTALQNSIARRIAPAETIASSIEGEVKEVEQEVKAALDAEKKPEPELEVQEGTVHIEVKQTVE",
"TNGDTEKTSTSVTVDVPHDHAALPEPEDPTAMIEEAKRMVAEAQKLEGGSPSVTRSSKRGIEEVLDEEDLADERLNKLAK", "TNGDTEKTSTSVTVDVPHDHAALPEPEDPTAMIEEAKRMVAEAQKLEGGSPSVTRSSKRGIEEVLDEEDLADERLNKLAK",
"KAYTTEQKMTKEKVTRRALVGLGVMAAIGTAFQYFV"] "KAYTTEQKMTKEKVTRRALVGLGVMAAIGTAFQYFV"]
}, },
{ "name" : "01622_ASPNI", { "name" : "01622_ASPNI",
"RefSeqID" : "XP_657766", "RefSeqID" : "XP_657766",
"UniProtID" : "Q5BH18", "UniProtID" : "Q5BH18",
"taxonomyID" : "227321", "taxonomyID" : "227321",
"sequence" : [ "sequence" : [
"MVRSLPKKNNPFVTPDAAPPYEELLMRRRLGKTNLAVKPTQVGTSNATKPENLGPFEYAHLRAPLPKDLKGSEIFPSHSP", "MVRSLPKKNNPFVTPDAAPPYEELLMRRRLGKTNLAVKPTQVGTSNATKPENLGPFEYAHLRAPLPKDLKGSEIFPSHSP",
"QQHPETYFLMRRSKDGYVSATGMFKIAFPWAKLEEERSEREYLKTRPETSEDEIAGNVWISPVLALELAAEYKMYDWVRA", "QQHPETYFLMRRSKDGYVSATGMFKIAFPWAKLEEERSEREYLKTRPETSEDEIAGNVWISPVLALELAAEYKMYDWVRA",
"LLDPTEIIQSPSSAKKQITPPPKFELPPIQAPEALVPSSRTRSRRSASPSKKAGTPRKPRQTKAQKEAAVAATNEANATL", "LLDPTEIIQSPSSAKKQITPPPKFELPPIQAPEALVPSSRTRSRRSASPSKKAGTPRKPRQTKAQKEAAVAATNEANATL",
"QSALDDTVSNADGEINGDVLPSVEDKREPETSPVKGKKAAAKAKKQAVSEEDQEDKVKIEIKSDAAEGSDVQAAQTTISV", "QSALDDTVSNADGEINGDVLPSVEDKREPETSPVKGKKAAAKAKKQAVSEEDQEDKVKIEIKSDAAEGSDVQAAQTTISV",
"EMPISLPEAPSAEDTQEMIAKAKEMVKEAVKLQQEPAESSATAKKRGAEEAELGEEEEDEETKTLRTKRAKVLEEKLKRE", "EMPISLPEAPSAEDTQEMIAKAKEMVKEAVKLQQEPAESSATAKKRGAEEAELGEEEEDEETKTLRTKRAKVLEEKLKRE",
"RVRNRALMGVTAAFALAKPALVLLEA"] "RVRNRALMGVTAAFALAKPALVLLEA"]
}, },
{ "name" : "05405_ASPNI", { "name" : "05405_ASPNI",
"RefSeqID" : "XP_663009", "RefSeqID" : "XP_663009",
"UniProtID" : "Q5B225", "UniProtID" : "Q5B225",
"taxonomyID" : "227321", "taxonomyID" : "227321",
"sequence" : [ "sequence" : [
"MASIQFLLNPLPSLPSSDRCPLPTPSPTISSSTAMLRSPRQKKQKMAKDAPIFQRGKPRGEVRYPPYEDRDGKFSCQHQD", "MASIQFLLNPLPSLPSSDRCPLPTPSPTISSSTAMLRSPRQKKQKMAKDAPIFQRGKPRGEVRYPPYEDRDGKFSCQHQD",
"FRIHPLGNIADYPRHIPYNSDKKSFQERTGRESFEVFQYTFQLPGEEKQWTVMWDYNIGLVRTTHLFKCNDYSKTTPAKM", "FRIHPLGNIADYPRHIPYNSDKKSFQERTGRESFEVFQYTFQLPGEEKQWTVMWDYNIGLVRTTHLFKCNDYSKTTPAKM",
"LNQNPGLRDICHSITGGALAAQGYWMPYEAAKAIAATFCWKIRFALTPLFGDNFPDLCIHPDDRARFGRMVIDPGIVRIA", "LNQNPGLRDICHSITGGALAAQGYWMPYEAAKAIAATFCWKIRFALTPLFGDNFPDLCIHPDDRARFGRMVIDPGIVRIA",
"TEKANLYRMLELRCSTTNSLRADYVLRPSSAPDIDRTDPNLERDRVALGRHILPKSHRHHHHRSKTSPSTNTSLVGYGSS", "TEKANLYRMLELRCSTTNSLRADYVLRPSSAPDIDRTDPNLERDRVALGRHILPKSHRHHHHRSKTSPSTNTSLVGYGSS",
"PEVEYYSCGTEPYCVSPESPIRSSFTPVNTPRSTDIYPSSSSTNFLRSPHELLASLSSSASIARARIERASKISGARVIP", "PEVEYYSCGTEPYCVSPESPIRSSFTPVNTPRSTDIYPSSSSTNFLRSPHELLASLSSSASIARARIERASKISGARVIP",
"SSVPSNVTSITTKGRDNTGHSALMEESDIDADAETDSGHEHDLDFELSSSDESSTSSTVSSSTSSASLGFAANSRNRPYR", "SSVPSNVTSITTKGRDNTGHSALMEESDIDADAETDSGHEHDLDFELSSSDESSTSSTVSSSTSSASLGFAANSRNRPYR",
"DDDEPHRDTDEEMVDYRAPKRIATAGARDRRWGRGRRVIHQEHSDIETSRRARKHAQRSSNARLVCEMTAAHALISLLHD", "DDDEPHRDTDEEMVDYRAPKRIATAGARDRRWGRGRRVIHQEHSDIETSRRARKHAQRSSNARLVCEMTAAHALISLLHD",
"ATGSDVDVDTHNRLECGRSPDGGVKNNLKGSYFGIRLNHNPSTESGQKRRRASA"] "ATGSDVDVDTHNRLECGRSPDGGVKNNLKGSYFGIRLNHNPSTESGQKRRRASA"]
}, },
{ "name" : "105954_BIPOR", { "name" : "105954_BIPOR",
"RefSeqID" : "XP_007691967", "RefSeqID" : "XP_007691967",
"UniProtID" : "W6Z1H5", "UniProtID" : "W6Z1H5",
"taxonomyID" : "930090", "taxonomyID" : "930090",
"sequence" : [ "sequence" : [
"MNIQDLLNPSCGDRHDHRRSESATPPSRPVAILPALRRQKIPKDAPIFSEGNRTVGIVNFAPHEAGNDEELLAQHCRFQI", "MNIQDLLNPSCGDRHDHRRSESATPPSRPVAILPALRRQKIPKDAPIFSEGNRTVGIVNFAPHEAGNDEELLAQHCRFQI",
"YPLGEISRKGVRHIPYNSDKKDFLEKTGRDAFEMFQYTYKLPGEDKPYVVVWDYNVGLVRMTPFFKSCKYSKTIPAKTLR", "YPLGEISRKGVRHIPYNSDKKDFLEKTGRDAFEMFQYTYKLPGEDKPYVVVWDYNVGLVRMTPFFKSCKYSKTIPAKTLR",
"ENPGLKDISYSITGGALVCQGYWIPYQAARAIAATFCYDIRWALTPVFGNDFPSICLTPDDPSFAKFVIDPAIVRYCTEE", "ENPGLKDISYSITGGALVCQGYWIPYQAARAIAATFCYDIRWALTPVFGNDFPSICLTPDDPSFAKFVIDPAIVRYCTEE",
"TTKFRELGSAYEVHRPVAPTQVEAPTSRSDQPLSTSIVRQRRARPIDIESGYGTDTERNDRCLFSPEVSPRTRFTPINRP", "TTKFRELGSAYEVHRPVAPTQVEAPTSRSDQPLSTSIVRQRRARPIDIESGYGTDTERNDRCLFSPEVSPRTRFTPINRP",
"RSPYSPRTAESSFVSSPVSIRAPPGLHTPTSTPYEHSGEVFRAKRSHSKVAFCEHPADEAVIRPPTAATVDSAHGCEMCV", "RSPYSPRTAESSFVSSPVSIRAPPGLHTPTSTPYEHSGEVFRAKRSHSKVAFCEHPADEAVIRPPTAATVDSAHGCEMCV",
"GDDNHSHLDMDAAEMLLSLRTADSAMPPSKRTRRGS"] "GDDNHSHLDMDAAEMLLSLRTADSAMPPSKRTRRGS"]
}, },
{ "name" : "69819_WALME", { "name" : "69819_WALME",
"RefSeqID" : "XP_006959479", "RefSeqID" : "XP_006959479",
"UniProtID" : "I4Y911", "UniProtID" : "I4Y911",
"taxonomyID" : "671144", "taxonomyID" : "671144",
"sequence" : [ "sequence" : [
"MTSPGLPKDFNELLDKSEIPSPKWQQITRDDRPITIARLKLPHPREKHTFILRRYDCNGISFGSLFKAAYPYATDEEEKI", "MTSPGLPKDFNELLDKSEIPSPKWQQITRDDRPITIARLKLPHPREKHTFILRRYDCNGISFGSLFKAAYPYATDEEEKI",
"ESGFVKKNYDVTLVPTEEYQERKLAKLAGFWIPIAIAEELGQRYAMAEYVDALAKADTPDLTDFKKRSSNRQTSEDIKSS", "ESGFVKKNYDVTLVPTEEYQERKLAKLAGFWIPIAIAEELGQRYAMAEYVDALAKADTPDLTDFKKRSSNRQTSEDIKSS",
"PAKAQASLESPAKSASKIPTPTKNPAPRRSARHQSRSPSPSPLTHNLTPGKKKAKKAPKEAVIEESVEETIVVDKKESPL", "PAKAQASLESPAKSASKIPTPTKNPAPRRSARHQSRSPSPSPLTHNLTPGKKKAKKAPKEAVIEESVEETIVVDKKESPL",
"KKALNDDQVLADIERAKDLVDDIKQSKNLSQSSPVKVVKEEVLETIQPSVSTESLEGEGKRKRELEDETGNEIKVVSFGQ", "KKALNDDQVLADIERAKDLVDDIKQSKNLSQSSPVKVVKEEVLETIQPSVSTESLEGEGKRKRELEDETGNEIKVVSFGQ",
"NPPANPEEIQQRPVVQRRGVAAAVGAFALGVGFAASNILPRFLF"] "NPPANPEEIQQRPVVQRRGVAAAVGAFALGVGFAASNILPRFLF"]
}, },
{ "name" : "02840_CRYNE", { "name" : "02840_CRYNE",
"RefSeqID" : "XP_568872", "RefSeqID" : "XP_568872",
"UniProtID" : "Q5KM59", "UniProtID" : "Q5KM59",
"taxonomyID" : "214684", "taxonomyID" : "214684",
"sequence" : [ "sequence" : [
"MSHPAADAPPPYPGTTDDAQYDLTPLPHTANRPRLPEDKRNPHLNNLPEDTKIVKFQTIVRENKEIVVGRIKVPTENANG", "MSHPAADAPPPYPGTTDDAQYDLTPLPHTANRPRLPEDKRNPHLNNLPEDTKIVKFQTIVRENKEIVVGRIKVPTENANG",
"THHAFILRRYDTNAISLTTMYKVAFPSATEEEEKREMDWVKSSFDTRGTNGGRDSEVVRLAGQWVSRNLAIHIAPAYNLV", "THHAFILRRYDTNAISLTTMYKVAFPSATEEEEKREMDWVKSSFDTRGTNGGRDSEVVRLAGQWVSRNLAIHIAPAYNLV",
"QLVAALSRAVPDPNVAYRKSQRSQAAADELARTKAKQSQAPSSVPAISNVPVRKPQAAIPSMATEISSPASKRQRKDSVT", "QLVAALSRAVPDPNVAYRKSQRSQAAADELARTKAKQSQAPSSVPAISNVPVRKPQAAIPSMATEISSPASKRQRKDSVT",
"EASGSATQTITEAQPSADTSETDDTRHITIEATTTITSPSGANVDMDAEIEQAKQLVKDLRQEIQLRNEAGDSLEDQGVA", "EASGSATQTITEAQPSADTSETDDTRHITIEATTTITSPSGANVDMDAEIEQAKQLVKDLRQEIQLRNEAGDSLEDQGVA",
"VADDVRGVKRGKHEDEAVVISGGAGGKDRVVRTNKRIPQTAGGDVGQRFGWGAFVFSIGLGASLTLFSQYASSLL"] "VADDVRGVKRGKHEDEAVVISGGAGGKDRVVRTNKRIPQTAGGDVGQRFGWGAFVFSIGLGASLTLFSQYASSLL"]
}, },
{ "name" : "11055_USTMA", { "name" : "11055_USTMA",
"RefSeqID" : "XP_011390537", "RefSeqID" : "XP_011390537",
"UniProtID" : "A0A0D1DZM8", "UniProtID" : "A0A0D1DZM8",
"taxonomyID" : "237631", "taxonomyID" : "237631",
"sequence" : [ "sequence" : [
"MPAAASARKSTPTRKSTPRRARSSSVTSNASTGVPASPSASPRKTKKQKEAAAAAAAAVAAAAATAEQVNDDESDLLRPK", "MPAAASARKSTPTRKSTPRRARSSSVTSNASTGVPASPSASPRKTKKQKEAAAAAAAAVAAAAATAEQVNDDESDLLRPK",
"LPTKRNPRLKEVDEAVVKLQIIKREGHNIIIGRVKLPTVNGQDHAFLLKRFDTNAMAASSMFRLAFPFADGTAEAAEMRF", "LPTKRNPRLKEVDEAVVKLQIIKREGHNIIIGRVKLPTVNGQDHAFLLKRFDTNAMAASSMFRLAFPFADGTAEAAEMRF",
"LDTKYDTNRANGGYIVEEVKVPETPKKRGRTRKTAENSKKESTPDTESVSADKQIRVLPEGSTGVRLQGTWIPAEDAIEV", "LDTKYDTNRANGGYIVEEVKVPETPKKRGRTRKTAENSKKESTPDTESVSADKQIRVLPEGSTGVRLQGTWIPAEDAIEV",
"AEDYGIAKYALALIHATAEHAEDGGAPILTSEPVAEVKTPRKRQRVSAAAATASDTPDSPQLVQRVTRLENADGSISKVR", "AEDYGIAKYALALIHATAEHAEDGGAPILTSEPVAEVKTPRKRQRVSAAAATASDTPDSPQLVQRVTRLENADGSISKVR",
"VESTLEAPSSNGVPVALSQAEIEEQIAQAKALAAGIQQSITAGSGSASTRGQKRRAVNDRPTAEIDPLADDEDYSESGRV", "VESTLEAPSSNGVPVALSQAEIEEQIAQAKALAAGIQQSITAGSGSASTRGQKRRAVNDRPTAEIDPLADDEDYSESGRV",
"VRAFRRGTRVARRRPIATTAGAVAAAGAVGAGALAWVSGGNPEVAIQTLQASMQSIGLQNLQNLGLQNLQQIGTQLGAHL", "VRAFRRGTRVARRRPIATTAGAVAAAGAVGAGALAWVSGGNPEVAIQTLQASMQSIGLQNLQNLGLQNLQQIGTQLGAHL",
"ASILPW"] "ASILPW"]
}, },
{ "name" : "XBP1_NEUCR", { "name" : "XBP1_NEUCR",
"RefSeqID" : "XP_962373", "RefSeqID" : "XP_962373",
"UniProtID" : "Q7S9W7", "UniProtID" : "Q7S9W7",
"taxonomyID" : "367110", "taxonomyID" : "367110",
"sequence" : [ "sequence" : [
"MLNQNPGLKDIAYSITGGAIKAQGYWMPYACAKAVCATFCYQIAGALIPLFGPDFPSECISPGEPRYGIMIIKPELISDT", "MLNQNPGLKDIAYSITGGAIKAQGYWMPYACAKAVCATFCYQIAGALIPLFGPDFPSECISPGEPRYGIMIIKPELISDT",
"MRKAQELYRRYGNWGGGCTSSSPARRPLRTASSGSQERHHHHPYPNQEHLDHQQQQQRTVCSRRCPAEENSCVDARPQLR", "MRKAQELYRRYGNWGGGCTSSSPARRPLRTASSGSQERHHHHPYPNQEHLDHQQQQQRTVCSRRCPAEENSCVDARPQLR",
"GISAPMPPAGEWTPPLLRSSAGRPRPVMPTSTHSSISYPERAPHRSAWTAVNHQPPNNSLDRYSLKRPLPSNEPDESVSH", "GISAPMPPAGEWTPPLLRSSAGRPRPVMPTSTHSSISYPERAPHRSAWTAVNHQPPNNSLDRYSLKRPLPSNEPDESVSH",
"SNWPSRSQAPNPWLTAIPRSPRKTSSSPWASQPGSASRSRAGSIDSMASQHPQGLPSPSLILSSPSSSMVSLSSSNSPSP", "SNWPSRSQAPNPWLTAIPRSPRKTSSSPWASQPGSASRSRAGSIDSMASQHPQGLPSPSLILSSPSSSMVSLSSSNSPSP",
"RPQLPPISQLCSLPVPSGRRRLPNGRPSRVGGDATSSHSRQDHSTCGAYQFSAGYQRALTPPSSTSAPMHWRSQRRPSLQ", "RPQLPPISQLCSLPVPSGRRRLPNGRPSRVGGDATSSHSRQDHSTCGAYQFSAGYQRALTPPSSTSAPMHWRSQRRPSLQ",
"DQHEHEHIEDTQPRRIAVEANMECGDDNESHLHLPLPLPRTSSSASIVADKNANDTTSDNSSSRNFNSASIGSGRDDGQT", "DQHEHEHIEDTQPRRIAVEANMECGDDNESHLHLPLPLPRTSSSASIVADKNANDTTSDNSSSRNFNSASIGSGRDDGQT",
"SLAARKTAALTLLHLRQQEEEKEAAAAAAAAAAAAYSSTKRPESPSSSLSSPVSPPPTSGQPSPTLSAVVTATNLRRGTT", "SLAARKTAALTLLHLRQQEEEKEAAAAAAAAAAAAYSSTKRPESPSSSLSSPVSPPPTSGQPSPTLSAVVTATNLRRGTT",
"TATATAVIDTTEPLAPPPSPSSNYLGSPISTSIASSSSSFSPSTSCNGTRENSVVANEMTRYAGQEADAGGPRHCNGDAD", "TATATAVIDTTEPLAPPPSPSSNYLGSPISTSIASSSSSFSPSTSCNGTRENSVVANEMTRYAGQEADAGGPRHCNGDAD",
"DEGDYEHEQQYRRKRRRLLLVGRAKSF"] "DEGDYEHEQQYRRKRRRLLLVGRAKSF"]
}, },
{ "name" : "XBP1_SACCE", { "name" : "XBP1_SACCE",
"RefSeqID" : "NP_012165", "RefSeqID" : "NP_012165",
"UniProtID" : "P40489", "UniProtID" : "P40489",
"taxonomyID" : "559292", "taxonomyID" : "559292",
"sequence" : [ "sequence" : [
"MKYPAFSINSDTVHLTDNPLDDYQRLYLVSVLDRDSPPASFSAGLNIRKVNYKSSIAAQFTHPNFIISARDAGNGEEAAA", "MKYPAFSINSDTVHLTDNPLDDYQRLYLVSVLDRDSPPASFSAGLNIRKVNYKSSIAAQFTHPNFIISARDAGNGEEAAA",
"QNVLNCFEYQFPNLQTIQSLVHEQTLLSQLASSATPHSALHLHDKNILMGKIILPSRSNKTPVSASPTKQEKKALSTASR", "QNVLNCFEYQFPNLQTIQSLVHEQTLLSQLASSATPHSALHLHDKNILMGKIILPSRSNKTPVSASPTKQEKKALSTASR",
"ENATSSLTKNQQFKLTKMDHNLINDKLINPNNCVIWSHDSGYVFMTGIWRLYQDVMKGLINLPRGDSVSTSQQQFFCKAE", "ENATSSLTKNQQFKLTKMDHNLINDKLINPNNCVIWSHDSGYVFMTGIWRLYQDVMKGLINLPRGDSVSTSQQQFFCKAE",
"FEKILSFCFYNHSSFTSEESSSVLLSSSTSSPPKRRTSTGSTFLDANASSSSTSSTQANNYIDFHWNNIKPELRDLICQS", "FEKILSFCFYNHSSFTSEESSSVLLSSSTSSPPKRRTSTGSTFLDANASSSSTSSTQANNYIDFHWNNIKPELRDLICQS",
"YKDFLINELGPDQIDLPNLNPANFTKRIRGGYIKIQGTWLPMEISRLLCLRFCFPIRYFLVPIFGPDFPKDCESWYLAHQ", "YKDFLINELGPDQIDLPNLNPANFTKRIRGGYIKIQGTWLPMEISRLLCLRFCFPIRYFLVPIFGPDFPKDCESWYLAHQ",
"NVTFASSTTGAGAATAATAAANTSTNFTSTAVARPRQKPRPRPRQRSTSMSHSKAQKLVIEDALPSFDSFVENLGLSSND", "NVTFASSTTGAGAATAATAAANTSTNFTSTAVARPRQKPRPRPRQRSTSMSHSKAQKLVIEDALPSFDSFVENLGLSSND",
"KNFIKKNSKRQKSSTYTSQTSSPIGPRDPTVQILSNLASFYNTHGHRYSYPGNIYIPQQRYSLPPPNQLSSPQRQLNYTY", "KNFIKKNSKRQKSSTYTSQTSSPIGPRDPTVQILSNLASFYNTHGHRYSYPGNIYIPQQRYSLPPPNQLSSPQRQLNYTY",
"DHIHPVPSQYQSPRHYNVPSSPIAPAPPTFPQPYGDDHYHFLKYASEVYKQQNQRPAHNTNTNMDTSFSPRANNSLNNFK", "DHIHPVPSQYQSPRHYNVPSSPIAPAPPTFPQPYGDDHYHFLKYASEVYKQQNQRPAHNTNTNMDTSFSPRANNSLNNFK",
"FKTNSKQ"] "FKTNSKQ"]
} }
] ]

View File

@ -1,116 +1,116 @@
[ [
{"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"}, {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"},
{"pName" : "MBP1_SACCE", "fName" : "KilA-N", "start" : "22", "end" : "105"}, {"pName" : "MBP1_SACCE", "fName" : "KilA-N", "start" : "22", "end" : "105"},
{"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "108", "end" : "122"}, {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "108", "end" : "122"},
{"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "236", "end" : "241"}, {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "236", "end" : "241"},
{"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "279", "end" : "307"}, {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "279", "end" : "307"},
{"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "700", "end" : "717"}, {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "700", "end" : "717"},
{"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "394", "end" : "423"}, {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "394", "end" : "423"},
{"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "427", "end" : "463"}, {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "427", "end" : "463"},
{"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "512", "end" : "541"}, {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "512", "end" : "541"},
{"pName" : "MBP1_SACCE", "fName" : "Swi6 fold", "start" : "381", "end" : "547"}, {"pName" : "MBP1_SACCE", "fName" : "Swi6 fold", "start" : "381", "end" : "547"},
{"pName" : "MBP1_SACCE", "fName" : "coiled coil", "start" : "633", "end" : "655"}, {"pName" : "MBP1_SACCE", "fName" : "coiled coil", "start" : "633", "end" : "655"},
{"pName" : "MBP1_ASPNI", "fName" : "APSES fold", "start" : "9", "end" : "106"}, {"pName" : "MBP1_ASPNI", "fName" : "APSES fold", "start" : "9", "end" : "106"},
{"pName" : "MBP1_ASPNI", "fName" : "KilA-N", "start" : "26", "end" : "109"}, {"pName" : "MBP1_ASPNI", "fName" : "KilA-N", "start" : "26", "end" : "109"},
{"pName" : "MBP1_ASPNI", "fName" : "low complexity", "start" : "529", "end" : "534"}, {"pName" : "MBP1_ASPNI", "fName" : "low complexity", "start" : "529", "end" : "534"},
{"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "260", "end" : "289"}, {"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "260", "end" : "289"},
{"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "381", "end" : "413"}, {"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "381", "end" : "413"},
{"pName" : "MBP1_ASPNI", "fName" : "Swi6 fold", "start" : "193", "end" : "402"}, {"pName" : "MBP1_ASPNI", "fName" : "Swi6 fold", "start" : "193", "end" : "402"},
{"pName" : "MBP1_ASPNI", "fName" : "coiled coil", "start" : "509", "end" : "572"}, {"pName" : "MBP1_ASPNI", "fName" : "coiled coil", "start" : "509", "end" : "572"},
{"pName" : "MBP1_BIPOR", "fName" : "APSES fold", "start" : "8", "end" : "106"}, {"pName" : "MBP1_BIPOR", "fName" : "APSES fold", "start" : "8", "end" : "106"},
{"pName" : "MBP1_BIPOR", "fName" : "KilA-N", "start" : "26", "end" : "109"}, {"pName" : "MBP1_BIPOR", "fName" : "KilA-N", "start" : "26", "end" : "109"},
{"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "134", "end" : "152"}, {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "134", "end" : "152"},
{"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "267", "end" : "278"}, {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "267", "end" : "278"},
{"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "670", "end" : "685"}, {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "670", "end" : "685"},
{"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "266", "end" : "295"}, {"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "266", "end" : "295"},
{"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "387", "end" : "416"}, {"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "387", "end" : "416"},
{"pName" : "MBP1_BIPOR", "fName" : "Swi6 fold", "start" : "253", "end" : "421"}, {"pName" : "MBP1_BIPOR", "fName" : "Swi6 fold", "start" : "253", "end" : "421"},
{"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "659", "end" : "681"}, {"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "659", "end" : "681"},
{"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "500", "end" : "590"}, {"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "500", "end" : "590"},
{"pName" : "MBP1_NEUCR", "fName" : "APSES fold", "start" : "14", "end" : "114"}, {"pName" : "MBP1_NEUCR", "fName" : "APSES fold", "start" : "14", "end" : "114"},
{"pName" : "MBP1_NEUCR", "fName" : "KilA-N", "start" : "34", "end" : "117"}, {"pName" : "MBP1_NEUCR", "fName" : "KilA-N", "start" : "34", "end" : "117"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "130", "end" : "141"}, {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "130", "end" : "141"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "253", "end" : "266"}, {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "253", "end" : "266"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "514", "end" : "525"}, {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "514", "end" : "525"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "554", "end" : "564"}, {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "554", "end" : "564"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "601", "end" : "618"}, {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "601", "end" : "618"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "620", "end" : "629"}, {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "620", "end" : "629"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "636", "end" : "652"}, {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "636", "end" : "652"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "658", "end" : "672"}, {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "658", "end" : "672"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "725", "end" : "735"}, {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "725", "end" : "735"},
{"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "752", "end" : "771"}, {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "752", "end" : "771"},
{"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "268", "end" : "297"}, {"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "268", "end" : "297"},
{"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "390", "end" : "419"}, {"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "390", "end" : "419"},
{"pName" : "MBP1_NEUCR", "fName" : "Swi6 fold", "start" : "270", "end" : "426"}, {"pName" : "MBP1_NEUCR", "fName" : "Swi6 fold", "start" : "270", "end" : "426"},
{"pName" : "MBP1_NEUCR", "fName" : "coiled coil", "start" : "500", "end" : "550"}, {"pName" : "MBP1_NEUCR", "fName" : "coiled coil", "start" : "500", "end" : "550"},
{"pName" : "MBP1_SCHPO", "fName" : "APSES fold", "start" : "8", "end" : "104"}, {"pName" : "MBP1_SCHPO", "fName" : "APSES fold", "start" : "8", "end" : "104"},
{"pName" : "MBP1_SCHPO", "fName" : "KilA-N", "start" : "25", "end" : "113"}, {"pName" : "MBP1_SCHPO", "fName" : "KilA-N", "start" : "25", "end" : "113"},
{"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "111", "end" : "125"}, {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "111", "end" : "125"},
{"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "136", "end" : "145"}, {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "136", "end" : "145"},
{"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "176", "end" : "191"}, {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "176", "end" : "191"},
{"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "422", "end" : "447"}, {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "422", "end" : "447"},
{"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "247", "end" : "276"}, {"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "247", "end" : "276"},
{"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "368", "end" : "397"}, {"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "368", "end" : "397"},
{"pName" : "MBP1_SCHPO", "fName" : "Swi6 fold", "start" : "234", "end" : "400"}, {"pName" : "MBP1_SCHPO", "fName" : "Swi6 fold", "start" : "234", "end" : "400"},
{"pName" : "MBP1_SCHPO", "fName" : "coiled coil", "start" : "457", "end" : "538"}, {"pName" : "MBP1_SCHPO", "fName" : "coiled coil", "start" : "457", "end" : "538"},
{"pName" : "MBP1_COPCI", "fName" : "APSES fold", "start" : "5", "end" : "103"}, {"pName" : "MBP1_COPCI", "fName" : "APSES fold", "start" : "5", "end" : "103"},
{"pName" : "MBP1_COPCI", "fName" : "KilA-N", "start" : "23", "end" : "106"}, {"pName" : "MBP1_COPCI", "fName" : "KilA-N", "start" : "23", "end" : "106"},
{"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "170", "end" : "191"}, {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "170", "end" : "191"},
{"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "435", "end" : "450"}, {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "435", "end" : "450"},
{"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "611", "end" : "626"}, {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "611", "end" : "626"},
{"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "270", "end" : "299"}, {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "270", "end" : "299"},
{"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "389", "end" : "418"}, {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "389", "end" : "418"},
{"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "474", "end" : "509"}, {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "474", "end" : "509"},
{"pName" : "MBP1_COPCI", "fName" : "Swi6 fold", "start" : "257", "end" : "429"}, {"pName" : "MBP1_COPCI", "fName" : "Swi6 fold", "start" : "257", "end" : "429"},
{"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "500", "end" : "570"}, {"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "500", "end" : "570"},
{"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "651", "end" : "678"}, {"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "651", "end" : "678"},
{"pName" : "MBP1_CRYNE", "fName" : "APSES fold", "start" : "16", "end" : "114"}, {"pName" : "MBP1_CRYNE", "fName" : "APSES fold", "start" : "16", "end" : "114"},
{"pName" : "MBP1_CRYNE", "fName" : "KilA-N", "start" : "34", "end" : "117"}, {"pName" : "MBP1_CRYNE", "fName" : "KilA-N", "start" : "34", "end" : "117"},
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "66", "end" : "85"}, {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "66", "end" : "85"},
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "413", "end" : "423"}, {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "413", "end" : "423"},
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "633", "end" : "644"}, {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "633", "end" : "644"},
{"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "697", "end" : "709"}, {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "697", "end" : "709"},
{"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "477", "end" : "506"}, {"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "477", "end" : "506"},
{"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "618", "end" : "647"}, {"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "618", "end" : "647"},
{"pName" : "MBP1_CRYNE", "fName" : "Swi6 fold", "start" : "452", "end" : "663"}, {"pName" : "MBP1_CRYNE", "fName" : "Swi6 fold", "start" : "452", "end" : "663"},
{"pName" : "MBP1_PUCGR", "fName" : "APSES fold", "start" : "90", "end" : "187"}, {"pName" : "MBP1_PUCGR", "fName" : "APSES fold", "start" : "90", "end" : "187"},
{"pName" : "MBP1_PUCGR", "fName" : "KilA-N", "start" : "107", "end" : "190"}, {"pName" : "MBP1_PUCGR", "fName" : "KilA-N", "start" : "107", "end" : "190"},
{"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "208", "end" : "227"}, {"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "208", "end" : "227"},
{"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "273", "end" : "291"}, {"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "273", "end" : "291"},
{"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "442", "end" : "471"}, {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "442", "end" : "471"},
{"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "475", "end" : "509"}, {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "475", "end" : "509"},
{"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "561", "end" : "590"}, {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "561", "end" : "590"},
{"pName" : "MBP1_PUCGR", "fName" : "Swi6 fold", "start" : "429", "end" : "601"}, {"pName" : "MBP1_PUCGR", "fName" : "Swi6 fold", "start" : "429", "end" : "601"},
{"pName" : "MBP1_PUCGR", "fName" : "coiled coil", "start" : "827", "end" : "863"}, {"pName" : "MBP1_PUCGR", "fName" : "coiled coil", "start" : "827", "end" : "863"},
{"pName" : "MBP1_USTMA", "fName" : "APSES fold", "start" : "7", "end" : "104"}, {"pName" : "MBP1_USTMA", "fName" : "APSES fold", "start" : "7", "end" : "104"},
{"pName" : "MBP1_USTMA", "fName" : "KilA-N", "start" : "24", "end" : "107"}, {"pName" : "MBP1_USTMA", "fName" : "KilA-N", "start" : "24", "end" : "107"},
{"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "106", "end" : "116"}, {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "106", "end" : "116"},
{"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "161", "end" : "183"}, {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "161", "end" : "183"},
{"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "666", "end" : "681"}, {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "666", "end" : "681"},
{"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "688", "end" : "700"}, {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "688", "end" : "700"},
{"pName" : "MBP1_USTMA", "fName" : "AT hook", "start" : "134", "end" : "146"}, {"pName" : "MBP1_USTMA", "fName" : "AT hook", "start" : "134", "end" : "146"},
{"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "245", "end" : "274"}, {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "245", "end" : "274"},
{"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "278", "end" : "314"}, {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "278", "end" : "314"},
{"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "364", "end" : "393"}, {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "364", "end" : "393"},
{"pName" : "MBP1_USTMA", "fName" : "Swi6 fold", "start" : "232", "end" : "404"}, {"pName" : "MBP1_USTMA", "fName" : "Swi6 fold", "start" : "232", "end" : "404"},
{"pName" : "MBP1_USTMA", "fName" : "coiled coil", "start" : "590", "end" : "618"}, {"pName" : "MBP1_USTMA", "fName" : "coiled coil", "start" : "590", "end" : "618"},
{"pName" : "MBP1_WALME", "fName" : "APSES fold", "start" : "6", "end" : "103"}, {"pName" : "MBP1_WALME", "fName" : "APSES fold", "start" : "6", "end" : "103"},
{"pName" : "MBP1_WALME", "fName" : "KilA-N", "start" : "23", "end" : "106"}, {"pName" : "MBP1_WALME", "fName" : "KilA-N", "start" : "23", "end" : "106"},
{"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "149", "end" : "162"}, {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "149", "end" : "162"},
{"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "171", "end" : "188"}, {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "171", "end" : "188"},
{"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "618", "end" : "628"}, {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "618", "end" : "628"},
{"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "634", "end" : "660"}, {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "634", "end" : "660"},
{"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "250", "end" : "279"}, {"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "250", "end" : "279"},
{"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "369", "end" : "398"}, {"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "369", "end" : "398"},
{"pName" : "MBP1_WALME", "fName" : "Swi6 fold", "start" : "237", "end" : "409"}, {"pName" : "MBP1_WALME", "fName" : "Swi6 fold", "start" : "237", "end" : "409"},
{"pName" : "MBP1_WALME", "fName" : "coiled coil", "start" : "461", "end" : "585"} {"pName" : "MBP1_WALME", "fName" : "coiled coil", "start" : "461", "end" : "585"}
] ]

View File

@ -1,47 +1,47 @@
[ [
{ "name" : "APSES fold", { "name" : "APSES fold",
"description " : "DNA binding domain by similarity to structure", "description " : "DNA binding domain by similarity to structure",
"sourceDB" : "PDB", "sourceDB" : "PDB",
"accession" : "1BM8_A_1_99"}, "accession" : "1BM8_A_1_99"},
{ "name" : "KilA-N", { "name" : "KilA-N",
"description " : "DNA binding domain by Pfam annotation", "description " : "DNA binding domain by Pfam annotation",
"sourceDB" : "Pfam", "sourceDB" : "Pfam",
"accession" : "PF04383"}, "accession" : "PF04383"},
{ "name" : "AT hook", { "name" : "AT hook",
"description " : "DNA interaction motif by SMART annotation", "description " : "DNA interaction motif by SMART annotation",
"sourceDB" : "SMART", "sourceDB" : "SMART",
"accession" : null}, "accession" : null},
{ "name" : "low complexity", { "name" : "low complexity",
"description " : "SEG annotation by SMART", "description " : "SEG annotation by SMART",
"sourceDB" : "SMART", "sourceDB" : "SMART",
"accession" : null}, "accession" : null},
{ "name" : "Ankyrin fold", { "name" : "Ankyrin fold",
"description " : "Ankyrin domain by SMART annotation", "description " : "Ankyrin domain by SMART annotation",
"sourceDB" : "SMART", "sourceDB" : "SMART",
"accession" : "SM00248"}, "accession" : "SM00248"},
{ "name" : "Swi6 fold", { "name" : "Swi6 fold",
"description " : "Swi6 fold by similarity to structure", "description " : "Swi6 fold by similarity to structure",
"sourceDB" : "PDB", "sourceDB" : "PDB",
"accession" : "1SW6_B"}, "accession" : "1SW6_B"},
{ "name" : "coiled coil", { "name" : "coiled coil",
"description " : "Coiled coil by SMART annotation", "description " : "Coiled coil by SMART annotation",
"sourceDB" : "SMART", "sourceDB" : "SMART",
"accession" : null}, "accession" : null},
{ "name" : "McInerny 2011", { "name" : "McInerny 2011",
"description " : "Yeast cell cycle review", "description " : "Yeast cell cycle review",
"sourceDB" : "PubMed", "sourceDB" : "PubMed",
"accession" : "21310294"} "accession" : "21310294"}
] ]

View File

@ -1,155 +1,155 @@
[ [
{ "name" : "MBP1_SCHPO", { "name" : "MBP1_SCHPO",
"RefSeqID" : "NP_593032", "RefSeqID" : "NP_593032",
"UniProtID" : "P41412", "UniProtID" : "P41412",
"taxonomyID" : 284812, "taxonomyID" : 284812,
"sequence" : [ "sequence" : [
"MAPRSSAVHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRVLERQVQIGAHEKVQ", "MAPRSSAVHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRVLERQVQIGAHEKVQ",
"GGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILSLDIDEGKAIAPKKKQTKQKKPSVRGRRGRKPSSLSS", "GGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILSLDIDEGKAIAPKKKQTKQKKPSVRGRRGRKPSSLSS",
"STLHSVNEKQPNSSISPTIESSMNKVNLPGAEEQVSATPLPASPNALLSPNDNTIKPVEELGMLEAPLDK", "STLHSVNEKQPNSSISPTIESSMNKVNLPGAEEQVSATPLPASPNALLSPNDNTIKPVEELGMLEAPLDK",
"YEESLLDFFLHPEEGRIPSFLYSPPPDFQVNSVIDDDGHTSLHWACSMGHIEMIKLLLRANADIGVCNRL", "YEESLLDFFLHPEEGRIPSFLYSPPPDFQVNSVIDDDGHTSLHWACSMGHIEMIKLLLRANADIGVCNRL",
"SQTPLMRSVIFTNNYDCQTFGQVLELLQSTIYAVDTNGQSIFHHIVQSTSTPSKVAAAKYYLDCILEKLI", "SQTPLMRSVIFTNNYDCQTFGQVLELLQSTIYAVDTNGQSIFHHIVQSTSTPSKVAAAKYYLDCILEKLI",
"SIQPFENVVRLVNLQDSNGDTSLLIAARNGAMDCVNSLLSYNANPSIPNRQRRTASEYLLEADKKPHSLL", "SIQPFENVVRLVNLQDSNGDTSLLIAARNGAMDCVNSLLSYNANPSIPNRQRRTASEYLLEADKKPHSLL",
"QSNSNASHSAFSFSGISPAIISPSCSSHAFVKAIPSISSKFSQLAEEYESQLREKEEDLIRANRLKQDTL", "QSNSNASHSAFSFSGISPAIISPSCSSHAFVKAIPSISSKFSQLAEEYESQLREKEEDLIRANRLKQDTL",
"NEISRTYQELTFLQKNNPTYSQSMENLIREAQETYQQLSKRLLIWLEARQIFDLERSLKPHTSLSISFPS", "NEISRTYQELTFLQKNNPTYSQSMENLIREAQETYQQLSKRLLIWLEARQIFDLERSLKPHTSLSISFPS",
"DFLKKEDGLSLNNDFKKPACNNVTNSDEYEQLINKLTSLQASRKKDTLYIRKLYEELGIDDTVNSYRRLI", "DFLKKEDGLSLNNDFKKPACNNVTNSDEYEQLINKLTSLQASRKKDTLYIRKLYEELGIDDTVNSYRRLI",
"AMSCGINPEDLSLEILDAVEEALTREK"] "AMSCGINPEDLSLEILDAVEEALTREK"]
}, },
{ "name" : "MBP1_ASPNI", { "name" : "MBP1_ASPNI",
"RefSeqID" : "XP_660758", "RefSeqID" : "XP_660758",
"UniProtID" : "Q5B8H6", "UniProtID" : "Q5B8H6",
"taxonomyID" : 227321, "taxonomyID" : 227321,
"sequence" : [ "sequence" : [
"MAAVDFSNVYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRILEREVQKGVHEKV", "MAAVDFSNVYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRILEREVQKGVHEKV",
"QGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDYVAGDRSPPPAPKHTSAASKPRAPKINKRVVKEDV", "QGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDYVAGDRSPPPAPKHTSAASKPRAPKINKRVVKEDV",
"FSAVNHHRSMGPPSFHHEHYDVNTGLDEDESIEQATLESSSMIADEDMISMSQNGPYSSRKRKRGINEVA", "FSAVNHHRSMGPPSFHHEHYDVNTGLDEDESIEQATLESSSMIADEDMISMSQNGPYSSRKRKRGINEVA",
"AMSLSEQEHILYGDQLLDYFMTVGDAPEATRIPPPQPPANFQVDRPIDDSGNTALHWACAMGDLEIVKDL", "AMSLSEQEHILYGDQLLDYFMTVGDAPEATRIPPPQPPANFQVDRPIDDSGNTALHWACAMGDLEIVKDL",
"LRRGADMKALSIHEETPLVRAVLFTNNYEKRTFPALLDLLLDTISFRDWFGATLFHHIAQTTKSKGKWKS", "LRRGADMKALSIHEETPLVRAVLFTNNYEKRTFPALLDLLLDTISFRDWFGATLFHHIAQTTKSKGKWKS",
"SRYYCEVALEKLRTTFSPEEVDLLLSCQDSVGDTAVLVAARNGVFRLVDLLLSRCPRAGDLVNKRGETAS", "SRYYCEVALEKLRTTFSPEEVDLLLSCQDSVGDTAVLVAARNGVFRLVDLLLSRCPRAGDLVNKRGETAS",
"SIMQRAHLAERDIPPPPSSITMGNDHIDGEVGAPTSLEPQSVTLHHESSPATAQLLSQIGAIMAEASRKL", "SIMQRAHLAERDIPPPPSSITMGNDHIDGEVGAPTSLEPQSVTLHHESSPATAQLLSQIGAIMAEASRKL",
"TSSYGAAKPSQKDSDDVANPEALYEQLEQDRQKIRRQYDALAAKEAAEESSDAQLGRYEQMRDNYESLLE", "TSSYGAAKPSQKDSDDVANPEALYEQLEQDRQKIRRQYDALAAKEAAEESSDAQLGRYEQMRDNYESLLE",
"QIQRARLKERLASTPVPTQTAVIGSSSPEQDRLLTTFQLSRALCSEQKIRRAAVKELAQQRADAGVSTKF", "QIQRARLKERLASTPVPTQTAVIGSSSPEQDRLLTTFQLSRALCSEQKIRRAAVKELAQQRADAGVSTKF",
"DVHRKLVALATGLKEEELDPMAAELAETLEFDRMNGKGVGPESPEADHKDSASLPFPGPVVSVDA"] "DVHRKLVALATGLKEEELDPMAAELAETLEFDRMNGKGVGPESPEADHKDSASLPFPGPVVSVDA"]
}, },
{ "name" : "MBP1_BIPOR", { "name" : "MBP1_BIPOR",
"RefSeqID" : "XP_007682304", "RefSeqID" : "XP_007682304",
"UniProtID" : "W6ZM86", "UniProtID" : "W6ZM86",
"taxonomyID" : 930090, "taxonomyID" : 930090,
"sequence" : [ "sequence" : [
"MPPAPDGKIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRILEREVQKGVHEKV", "MPPAPDGKIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRILEREVQKGVHEKV",
"QGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDYVPGDRSPPPAPKHATAASNRMKPPRQTAAAVAAA", "QGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDYVPGDRSPPPAPKHATAASNRMKPPRQTAAAVAAA",
"AVAAAAAAAAVANHNALMSNSRSQASEDPYENSQRSQIYREDTPDNETVISESMLGDADLMDMSQYSADG", "AVAAAAAAAAVANHNALMSNSRSQASEDPYENSQRSQIYREDTPDNETVISESMLGDADLMDMSQYSADG",
"NRKRKRGMDQMSLLDQQHQIWADQLLDYFMLLDHEAAVSWPEPPPSINLDRPIDEKGHAAMHWAAAMGDV", "NRKRKRGMDQMSLLDQQHQIWADQLLDYFMLLDHEAAVSWPEPPPSINLDRPIDEKGHAAMHWAAAMGDV",
"GVVKELIHRGARLDCLSNNLETPLMRAVMFTNNFDKETMPSMVKIFQQTVHRTDWFGSTVFHHIAATTSS", "GVVKELIHRGARLDCLSNNLETPLMRAVMFTNNFDKETMPSMVKIFQQTVHRTDWFGSTVFHHIAATTSS",
"SNKYVCARWYLDCIINKLSETWIPEEVTRLLNAADQNGDTAIMIAARNGARKCVRSLLGRNVAVDIPNKK", "SNKYVCARWYLDCIINKLSETWIPEEVTRLLNAADQNGDTAIMIAARNGARKCVRSLLGRNVAVDIPNKK",
"GETADDLIRELNQRRRMHGRTRQASSSPFAPAPEHRLNGHVPHFDGGPLMSVPVPSMAVRESVQYRSQTA", "GETADDLIRELNQRRRMHGRTRQASSSPFAPAPEHRLNGHVPHFDGGPLMSVPVPSMAVRESVQYRSQTA",
"SHLMTKVAPTLLEKCEELATAYEAELQEKEAEFFDAERVVKRRQAELEAVRKQVAELQSMSKGLHIDLND", "SHLMTKVAPTLLEKCEELATAYEAELQEKEAEFFDAERVVKRRQAELEAVRKQVAELQSMSKGLHIDLND",
"EEAERQQEDELRLLVEEAESLLEIEQKAELRRLCSSMPQQNSDSSPVDITEKMRLALLLHRAQLERRELV", "EEAERQQEDELRLLVEEAESLLEIEQKAELRRLCSSMPQQNSDSSPVDITEKMRLALLLHRAQLERRELV",
"REVVGNLSVAGMSEKQGTYKKLIAKALGEREEDVESMLPEILQELEEAETQERAEGLDGSPV"] "REVVGNLSVAGMSEKQGTYKKLIAKALGEREEDVESMLPEILQELEEAETQERAEGLDGSPV"]
}, },
{ "name" : "MBP1_NEUCR", { "name" : "MBP1_NEUCR",
"RefSeqID" : "XP_955821", "RefSeqID" : "XP_955821",
"UniProtID" : "Q7RW59", "UniProtID" : "Q7RW59",
"taxonomyID" : 367110, "taxonomyID" : 367110,
"sequence" : [ "sequence" : [
"MVKENVGGNPEPGIYSATYSGIPVWEYQFGVDLKEHVMRRRHDDWVNATHILKAAGFDKPARTRILEREV", "MVKENVGGNPEPGIYSATYSGIPVWEYQFGVDLKEHVMRRRHDDWVNATHILKAAGFDKPARTRILEREV",
"QKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEFQPGNESPPPAPRHASKPKAPKVKPAV", "QKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEFQPGNESPPPAPRHASKPKAPKVKPAV",
"PTWGSKSAKNANPPQPGTFLPPGRKGLPAQAPDYNDADTHMHDDDTPDNLTVASASYMAEDDRYDHSHFS", "PTWGSKSAKNANPPQPGTFLPPGRKGLPAQAPDYNDADTHMHDDDTPDNLTVASASYMAEDDRYDHSHFS",
"TGHRKRKRDELIEDMTEQQHAVYGDELLDYFLLSRNEQPAVRPDPPPNFKPDWPIDNERHTCLHWASAMG", "TGHRKRKRDELIEDMTEQQHAVYGDELLDYFLLSRNEQPAVRPDPPPNFKPDWPIDNERHTCLHWASAMG",
"DVDVMRQLKKFGASLDAQNVRGETPFMRAVNFTNCFEKQTFPQVMKELFSTIDCRDLSGCTVIHHAAVMK", "DVDVMRQLKKFGASLDAQNVRGETPFMRAVNFTNCFEKQTFPQVMKELFSTIDCRDLSGCTVIHHAAVMK",
"IGRVNSQSCSRYYLDIILNRLQETHHPEFVQQLLDAQDNDGNTAVHLAAMRDARKCIRALLGRGASTDIP", "IGRVNSQSCSRYYLDIILNRLQETHHPEFVQQLLDAQDNDGNTAVHLAAMRDARKCIRALLGRGASTDIP",
"NKQGIRAEELIKELNASISKSRSNLPQRSSSPFAPDTQRHDAFHEAISESMVTSRKNSQPNYSSDAANTV", "NKQGIRAEELIKELNASISKSRSNLPQRSSSPFAPDTQRHDAFHEAISESMVTSRKNSQPNYSSDAANTV",
"QNRITPLVLQKLKDLTATYDSEFKEKDDAEKEARRILNKTQSELKALTASIDDYNSRLDTDDVAAKTAAE", "QNRITPLVLQKLKDLTATYDSEFKEKDDAEKEARRILNKTQSELKALTASIDDYNSRLDTDDVAAKTAAE",
"MATARHKVLAFVTHQNRISVQEAVKQELAALDRANAVTNGTSTKSKSSSPSKKPKLSPIPDQKDKPPKDE", "MATARHKVLAFVTHQNRISVQEAVKQELAALDRANAVTNGTSTKSKSSSPSKKPKLSPIPDQKDKPPKDE",
"NETESEAEHPDPPAAQAHQQQPGPSSQDTEVEDQDREEEEDDYTHRLSLAAELRSILQEQRSAENDYVEA", "NETESEAEHPDPPAAQAHQQQPGPSSQDTEVEDQDREEEEDDYTHRLSLAAELRSILQEQRSAENDYVEA",
"RGMLGTGERIDKYKHLLMSCLPPDEQENLEENLEEMIKLMEQEDESVTDLPAGAVGGGGGGNAADGSGGG", "RGMLGTGERIDKYKHLLMSCLPPDEQENLEENLEEMIKLMEQEDESVTDLPAGAVGGGGGGNAADGSGGG",
"GQPSNGRRESVLPALRGGNGDGEMSRRGSRTAAAAAAQVDGEREINGRAGAERTERIQEIAAV"] "GQPSNGRRESVLPALRGGNGDGEMSRRGSRTAAAAAAQVDGEREINGRAGAERTERIQEIAAV"]
}, },
{ "name" : "MBP1_COPCI", { "name" : "MBP1_COPCI",
"RefSeqID" : "XP_001837394", "RefSeqID" : "XP_001837394",
"UniProtID" : "A8NYC6", "UniProtID" : "A8NYC6",
"taxonomyID" : 240176, "taxonomyID" : 240176,
"sequence" : [ "sequence" : [
"MPEAQIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRVLEREVQKGEHEKVQGG", "MPEAQIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRVLEREVQKGEHEKVQGG",
"YGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEFTPAAKSPPLAPKHLVATAGNRPVRKPLTTDLSAAVIN", "YGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEFTPAAKSPPLAPKHLVATAGNRPVRKPLTTDLSAAVIN",
"TRSTRKQVADGVGEESDHDTHSLRGSEDGSMTPSPSEASSSSRTPSPIHSPGTYHSNGLDGPSSGGRNRY", "TRSTRKQVADGVGEESDHDTHSLRGSEDGSMTPSPSEASSSSRTPSPIHSPGTYHSNGLDGPSSGGRNRY",
"RQSNDRYDEDDDASRHNGMGDPRSYGDQILEYFISDTNQIPPILITPPPDFDPNMAIDDDGHTSLHWACA", "RQSNDRYDEDDDASRHNGMGDPRSYGDQILEYFISDTNQIPPILITPPPDFDPNMAIDDDGHTSLHWACA",
"MGRIRIVKLLLSAGADIFKVNKAGQTALMRSVMFANNYDVRKFPELYELLHRSTLNIDNSNRTVFHHVVD", "MGRIRIVKLLLSAGADIFKVNKAGQTALMRSVMFANNYDVRKFPELYELLHRSTLNIDNSNRTVFHHVVD",
"VAMSKGKTHAARYYMETILTRLADYPKELADVINFQDEDGETALTMAARCRSKRLVKLLIDHGADPKINN", "VAMSKGKTHAARYYMETILTRLADYPKELADVINFQDEDGETALTMAARCRSKRLVKLLIDHGADPKINN",
"HDGKNAEDYILEDERFRSSPAPSSRVAAMSYRNAQVAYPPPGAPSTYSFAPANHDRPPLHYSAAAQKAST", "HDGKNAEDYILEDERFRSSPAPSSRVAAMSYRNAQVAYPPPGAPSTYSFAPANHDRPPLHYSAAAQKAST",
"RCVNDMASMLDSLAASFDQELRDKERDMAQAQALLTNIQAEILESQRTVLQLRQQAEGLSQAKQRLADLE", "RCVNDMASMLDSLAASFDQELRDKERDMAQAQALLTNIQAEILESQRTVLQLRQQAEGLSQAKQRLADLE",
"NALQDKMGRRYRLGFEKWIKDEETREKVIRDAANGDLVLTPATTSYTVDEDGDSDSGSNGDKNKGKRKAQ", "NALQDKMGRRYRLGFEKWIKDEETREKVIRDAANGDLVLTPATTSYTVDEDGDSDSGSNGDKNKGKRKAQ",
"VQQEEVSDLVELYSNIPTDPEELRKQCEALREEVSQSRKRRKAMFDELVTFQAEAGTSGRMSDYRRLIAA", "VQQEEVSDLVELYSNIPTDPEELRKQCEALREEVSQSRKRRKAMFDELVTFQAEAGTSGRMSDYRRLIAA",
"GCGGLEPLEIDSVLGMLLETLEAEDPSSTSATWSGSKGQQTG"] "GCGGLEPLEIDSVLGMLLETLEAEDPSSTSATWSGSKGQQTG"]
}, },
{ "name" : "MBP1_CRYNE", { "name" : "MBP1_CRYNE",
"RefSeqID" : "XP_569090", "RefSeqID" : "XP_569090",
"UniProtID" : "Q5KMQ9", "UniProtID" : "Q5KMQ9",
"taxonomyID" : 214684, "taxonomyID" : 214684,
"sequence" : [ "sequence" : [
"MGKKVIASGGDNGPNTIYKATYSGVPVYEMVCRDVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREV", "MGKKVIASGGDNGPNTIYKATYSGVPVYEMVCRDVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREV",
"QKGEHEKVQGGYGKYQGTWIPIERGLALAKQYGVEDILRPIIDYVPTSVSPPPAPKHSVAPPSKARRDKE", "QKGEHEKVQGGYGKYQGTWIPIERGLALAKQYGVEDILRPIIDYVPTSVSPPPAPKHSVAPPSKARRDKE",
"KETGRTKATPSRTGPTSAAALQAQAQLNRAKMHDSTPDADASFRSFEERVSLTPEDDSSSDTPSPVASVM", "KETGRTKATPSRTGPTSAAALQAQAQLNRAKMHDSTPDADASFRSFEERVSLTPEDDSSSDTPSPVASVM",
"TDQDMEVDKMGMHMSMPNVTLSQNMEELGAGSRKRSAAMMMEDEDQFGQLRSIRGNSAVHTPHGTPRHLG", "TDQDMEVDKMGMHMSMPNVTLSQNMEELGAGSRKRSAAMMMEDEDQFGQLRSIRGNSAVHTPHGTPRHLG",
"IGMPPEPIGPEQYTDIILNYFVSETSQIPSILVSPPHDFDPNAPIDDDGHTALHWACAMGRVRVVKLLLT", "IGMPPEPIGPEQYTDIILNYFVSETSQIPSILVSPPHDFDPNAPIDDDGHTALHWACAMGRVRVVKLLLT",
"AGASIFAGNNAEQTPLMRSVMFSNNYDMRKFPELYELLHRSTLNIDKQNRTVFHHIANLALTKGKTHAAK", "AGASIFAGNNAEQTPLMRSVMFSNNYDMRKFPELYELLHRSTLNIDKQNRTVFHHIANLALTKGKTHAAK",
"YYMETILARLADYPQELADVINFQDEEGETALTIAARARSRRLVKALLDHGANPKIKNRDSRSAEDYILE", "YYMETILARLADYPQELADVINFQDEEGETALTIAARARSRRLVKALLDHGANPKIKNRDSRSAEDYILE",
"DERFRSSPVPAPNGGIGKASTSAAAEKPLFAPQLYFSEAARLCGGQALTDITSHMQSLARSFDAELQGKE", "DERFRSSPVPAPNGGIGKASTSAAAEKPLFAPQLYFSEAARLCGGQALTDITSHMQSLARSFDAELQGKE",
"RDILQAKALLTNIHTEVTENGRSITAITNQAAPLEEKRRELEALQASLKTRVKDALKKGYIGWLEGELVR", "RDILQAKALLTNIHTEVTENGRSITAITNQAAPLEEKRRELEALQASLKTRVKDALKKGYIGWLEGELVR",
"EQRWENGELEGNEEEKAAVQALRDVPTGGQEVVQAEEEKLRWEIEEKRKRRAMFVEKFVRAQTEAGTSEQ", "EQRWENGELEGNEEEKAAVQALRDVPTGGQEVVQAEEEKLRWEIEEKRKRRAMFVEKFVRAQTEAGTSEQ",
"IAKYRKLVSAGLGGVSTNEVDELMNQLLEGLEEENDNQVYNTTAGESGPSSWVQ"] "IAKYRKLVSAGLGGVSTNEVDELMNQLLEGLEEENDNQVYNTTAGESGPSSWVQ"]
}, },
{ "name" : "MBP1_PUCGR", { "name" : "MBP1_PUCGR",
"RefSeqID" : "XP_003327086", "RefSeqID" : "XP_003327086",
"UniProtID" : "E3KED4", "UniProtID" : "E3KED4",
"taxonomyID" : 418459, "taxonomyID" : 418459,
"sequence" : [ "sequence" : [
"MAYGGSIQPLRPPSRESATLHLHQPDLTVTSPPLSLTHCPPCVYSHFTHTPTSLIVIQVSLHSLLDQETY", "MAYGGSIQPLRPPSRESATLHLHQPDLTVTSPPLSLTHCPPCVYSHFTHTPTSLIVIQVSLHSLLDQETY",
"HLLPSRSPPTVSVRMGTTTIYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRVLE", "HLLPSRSPPTVSVRMGTTTIYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRVLE",
"REIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNFQPSSNESPPLAPKHVTALSTRVK", "REIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNFQPSSNESPPLAPKHVTALSTRVK",
"VSKVSAASAARAARAVVPSLPSTSGLGGRNTNNSWSNFDSDNEPGLPPAASSRESNGNWATQSKLARSSN", "VSKVSAASAARAARAVVPSLPSTSGLGGRNTNNSWSNFDSDNEPGLPPAASSRESNGNWATQSKLARSSN",
"LARARANINNSHPEDLPVPAPDQLQASPLPSMQTADPENDNSLTPSELSLPSRTPSPIEDLPLTVNTASS", "LARARANINNSHPEDLPVPAPDQLQASPLPSMQTADPENDNSLTPSELSLPSRTPSPIEDLPLTVNTASS",
"QSTRNKGKSRDLPDDEDLSRGQKRKYDTSLVEDTSYSDGADDQYINGNPSNAASAKYAKLILDYFVSESS", "QSTRNKGKSRDLPDDEDLSRGQKRKYDTSLVEDTSYSDGADDQYINGNPSNAASAKYAKLILDYFVSESS",
"QIPNFLNDPPSDFDPNVVIDDDGHTALHWACAMGRIKIIKLLLTCGADIFRANNAGQTALMRAVMFTNNH", "QIPNFLNDPPSDFDPNVVIDDDGHTALHWACAMGRIKIIKLLLTCGADIFRANNAGQTALMRAVMFTNNH",
"DLRTFPELFESFSGSVINIDRTDRTVFHYVIDIALTKGKVPAARYYLETILSQLSEYPKELIDILNFQDE", "DLRTFPELFESFSGSVINIDRTDRTVFHYVIDIALTKGKVPAARYYLETILSQLSEYPKELIDILNFQDE",
"DGETALTLAARCRSKKLVKILLDHGANPKTANRDGKSAEDYILEDDKFRALSPTPCSSGPIRQLDQNSPG", "DGETALTLAARCRSKKLVKILLDHGANPKTANRDGKSAEDYILEDDKFRALSPTPCSSGPIRQLDQNSPG",
"GTSNRSDFVDLVDPVPIDSNLIPQRSPNASPPHYSETGQRVTKQLLPEVTSMIELLATTFDTELQDKERD", "GTSNRSDFVDLVDPVPIDSNLIPQRSPNASPPHYSETGQRVTKQLLPEVTSMIELLATTFDTELQDKERD",
"LDHAVGLLSNIEKEYLEGQRKILNYERMLSDFGEKKLALGDLEKELNDKLGKRYRFGWEKYVRDEEERAR", "LDHAVGLLSNIEKEYLEGQRKILNYERMLSDFGEKKLALGDLEKELNDKLGKRYRFGWEKYVRDEEERAR",
"RITEQRSKYLQELSIEDRKLLDSSNLRFADPSKQEVLMKLQADERENSDLLNLIRTNSTDVESECDLLRE", "RITEQRSKYLQELSIEDRKLLDSSNLRFADPSKQEVLMKLQADERENSDLLNLIRTNSTDVESECDLLRE",
"SVQKLSEERERLFKEFINLSSENTGGENEEDDGANHTSANTSRLNNYRKLISLGCGGIGLDEVDEVIESL", "SVQKLSEERERLFKEFINLSSENTGGENEEDDGANHTSANTSRLNNYRKLISLGCGGIGLDEVDEVIESL",
"NEGIDVNELNDNGFLTEQDEELGNHQNYHNIHTQGR"] "NEGIDVNELNDNGFLTEQDEELGNHQNYHNIHTQGR"]
}, },
{ "name" : "MBP1_USTMA", { "name" : "MBP1_USTMA",
"RefSeqID" : "XP_011392621", "RefSeqID" : "XP_011392621",
"UniProtID" : "A0A0D1DP35", "UniProtID" : "A0A0D1DP35",
"taxonomyID" : 237631, "taxonomyID" : 237631,
"sequence" : [ "sequence" : [
"MSGDKTIFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRVLEREIQKGIHEKVQG", "MSGDKTIFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRVLEREIQKGIHEKVQG",
"GYGKYQGTWIPLDVAIELAERYNIQGLLQPITSYVPSAADSPPPAPKHTISTSNRSKKIIPADPGALGRS", "GYGKYQGTWIPLDVAIELAERYNIQGLLQPITSYVPSAADSPPPAPKHTISTSNRSKKIIPADPGALGRS",
"RRATSIETESEVIGAAPNNVSEGSMSPSPSDISSSSRTPSPLPADRAHPLHANHALAGYNGRDANNHARY", "RRATSIETESEVIGAAPNNVSEGSMSPSPSDISSSSRTPSPLPADRAHPLHANHALAGYNGRDANNHARY",
"ADIILDYFVTENTTVPSLLINPPPDFNPDMSIDDDEHTALHWACAMGRIRVVKLLLSAGADIFRVNSNQQ", "ADIILDYFVTENTTVPSLLINPPPDFNPDMSIDDDEHTALHWACAMGRIRVVKLLLSAGADIFRVNSNQQ",
"TALMRATMFSNNYDLRKFPELFELLHRSILNIDRNDRTVFHHVVDLALSRGKPHAARYYMETMINRLADY", "TALMRATMFSNNYDLRKFPELFELLHRSILNIDRNDRTVFHHVVDLALSRGKPHAARYYMETMINRLADY",
"GDQLADILNFQDDEGETPLTMAARARSKRLVRLLLEHGADPKIRNKEGKNAEDYIIEDERFRSSPSRTGP", "GDQLADILNFQDDEGETPLTMAARARSKRLVRLLLEHGADPKIRNKEGKNAEDYIIEDERFRSSPSRTGP",
"AGIELGADGLPVLPTSSLHTSEAGQRTAGRAVTLMSNLLHSLADSYDSEINTAEKKLTQAHGLLKQIQTE", "AGIELGADGLPVLPTSSLHTSEAGQRTAGRAVTLMSNLLHSLADSYDSEINTAEKKLTQAHGLLKQIQTE",
"IEDSAKVAEALHHEAQGVDEERKRVDSLQLALKHAINKRARDDLERRWSEGKQAIKRARLQAGLEPGALS", "IEDSAKVAEALHHEAQGVDEERKRVDSLQLALKHAINKRARDDLERRWSEGKQAIKRARLQAGLEPGALS",
"TSNATNAPATGDQKSKDDAKSLIEALPAGTNVKTAIAELRKQLSQVQANKTELVDKFVARAREQGTGRTM", "TSNATNAPATGDQKSKDDAKSLIEALPAGTNVKTAIAELRKQLSQVQANKTELVDKFVARAREQGTGRTM",
"AAYRRLIAAGCGGIAPDEVDAVVGVLCELLQESHTGARAGAGGERDDRARDVAMMLKGAGAAALAANAGA", "AAYRRLIAAGCGGIAPDEVDAVVGVLCELLQESHTGARAGAGGERDDRARDVAMMLKGAGAAALAANAGA",
"P"] "P"]
}, },
{ "name" : "MBP1_WALME", { "name" : "MBP1_WALME",
"RefSeqID" : "XP_006957051", "RefSeqID" : "XP_006957051",
"UniProtID" : "I4YGC0", "UniProtID" : "I4YGC0",
"taxonomyID" : 671144, "taxonomyID" : 671144,
"sequence" : [ "sequence" : [
"MSAPPIYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRILEREVQKGTHEKVQGG", "MSAPPIYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRILEREVQKGTHEKVQGG",
"YGKYQGTWIPMERSVELARQYRIELLLDPIINYLPGPQSPPLAPKHATNVGSRARKSTAPAAQTLPSTSK", "YGKYQGTWIPMERSVELARQYRIELLLDPIINYLPGPQSPPLAPKHATNVGSRARKSTAPAAQTLPSTSK",
"VFHPLSSTKHPAKLAAATNAKAEISDGEDASIPSSPSFKSNSSRTPSPIRINARKRKLEDEATIPSSAID", "VFHPLSSTKHPAKLAAATNAKAEISDGEDASIPSSPSFKSNSSRTPSPIRINARKRKLEDEATIPSSAID",
"GSISYEDIILDYFISESTQIPALLIHPPSDFNPNMSIDDEGHTAMHWACAMGKVRVVKLLLSAGADIFRV", "GSISYEDIILDYFISESTQIPALLIHPPSDFNPNMSIDDEGHTAMHWACAMGKVRVVKLLLSAGADIFRV",
"NHSEQTALMRSVMFSNNYDIRKFPQLYELLHRSTLNLDKHDRTVLHHIVDLALTKSKTHAARYYMECVLS", "NHSEQTALMRSVMFSNNYDIRKFPQLYELLHRSTLNLDKHDRTVLHHIVDLALTKSKTHAARYYMECVLS",
"KLANYPDELADVINFQDDEGESALTLAARARSKRLVKLLLEHGADSKLPNKDGKTAEDYILEDERFRQSP", "KLANYPDELADVINFQDDEGESALTLAARARSKRLVKLLLEHGADSKLPNKDGKTAEDYILEDERFRQSP",
"LLNSNHLRLHPPDTSIYAPPAHLFNSETSQNIANTSMSSVANLLESLAQSYDKEITQKERDYQQAQVILR", "LLNSNHLRLHPPDTSIYAPPAHLFNSETSQNIANTSMSSVANLLESLAQSYDKEITQKERDYQQAQVILR",
"NIKTDIVEAKSNIEKMTIDSSEFEHLKHKLRELEMKLEEHSNDVYNKGWEEYSRNVDDPAIDAPSDNVQE", "NIKTDIVEAKSNIEKMTIDSSEFEHLKHKLRELEMKLEEHSNDVYNKGWEEYSRNVDDPAIDAPSDNVQE",
"ECASLRNKIKDLQEKRISSMQELIKRQKEVGTGKKMSEYRKLISVGCGIPTTEIDAVLEMLLESLESENA", "ECASLRNKIKDLQEKRISSMQELIKRQKEVGTGKKMSEYRKLISVGCGIPTTEIDAVLEMLLESLESENA",
"NKKAALASGISGALSSTSSAPSQATTSAPTGVATPGAPVPASSEKAGLLPPAPVMQ"] "NKKAALASGISGALSSTSSAPSQATTSAPTGVATPGAPVPASSEKAGLLPPAPVMQ"]
} }
] ]

View File

@ -1,22 +1,22 @@
[ [
{ "ID" : 227321, { "ID" : 227321,
"species" : "Aspergillus nidulans FGSC A4"}, "species" : "Aspergillus nidulans FGSC A4"},
{ "ID" : 930090, { "ID" : 930090,
"species" : "Bipolaris oryzae ATCC 44560"}, "species" : "Bipolaris oryzae ATCC 44560"},
{ "ID" : 240176, { "ID" : 240176,
"species" : "Coprinopsis cinerea okayama7#130"}, "species" : "Coprinopsis cinerea okayama7#130"},
{ "ID" : 214684, { "ID" : 214684,
"species" : "Cryptococcus neoformans var. neoformans JEC21"}, "species" : "Cryptococcus neoformans var. neoformans JEC21"},
{ "ID" : 367110, { "ID" : 367110,
"species" : "Neurospora crassa OR74A"}, "species" : "Neurospora crassa OR74A"},
{ "ID" : 418459, { "ID" : 418459,
"species" : "Puccinia graminis f. sp. tritici CRL 75-36-700-3"}, "species" : "Puccinia graminis f. sp. tritici CRL 75-36-700-3"},
{ "ID" : 559292, { "ID" : 559292,
"species" : "Saccharomyces cerevisiae S288C"}, "species" : "Saccharomyces cerevisiae S288C"},
{ "ID" : 284812, { "ID" : 284812,
"species" : "Schizosaccharomyces pombe 972h-"}, "species" : "Schizosaccharomyces pombe 972h-"},
{ "ID" : 237631, { "ID" : 237631,
"species" : "Ustilago maydis 521"}, "species" : "Ustilago maydis 521"},
{ "ID" : 671144, { "ID" : 671144,
"species" : "Wallemia mellicola CBS 633.66"} "species" : "Wallemia mellicola CBS 633.66"}
] ]

View File

@ -1,115 +1,115 @@
ID protein.ID feature.ID start end note ID protein.ID feature.ID start end note
# MBP1_SACCE # MBP1_SACCE
NA ref_pro_4 ref_ftr_1 4 102 APSES fold NA ref_pro_4 ref_ftr_1 4 102 APSES fold
NA ref_pro_4 ref_ftr_2 22 105 KilA-N NA ref_pro_4 ref_ftr_2 22 105 KilA-N
NA ref_pro_4 ref_ftr_4 108 122 low complexity NA ref_pro_4 ref_ftr_4 108 122 low complexity
NA ref_pro_4 ref_ftr_4 236 241 low complexity NA ref_pro_4 ref_ftr_4 236 241 low complexity
NA ref_pro_4 ref_ftr_4 279 307 low complexity NA ref_pro_4 ref_ftr_4 279 307 low complexity
NA ref_pro_4 ref_ftr_4 700 717 low complexity NA ref_pro_4 ref_ftr_4 700 717 low complexity
NA ref_pro_4 ref_ftr_4 700 717 low complexity NA ref_pro_4 ref_ftr_4 700 717 low complexity
NA ref_pro_4 ref_ftr_5 394 423 Ankyrin NA ref_pro_4 ref_ftr_5 394 423 Ankyrin
NA ref_pro_4 ref_ftr_5 427 463 Ankyrin NA ref_pro_4 ref_ftr_5 427 463 Ankyrin
NA ref_pro_4 ref_ftr_5 512 541 Ankyrin NA ref_pro_4 ref_ftr_5 512 541 Ankyrin
NA ref_pro_4 ref_ftr_6 381 547 Swi6 fold NA ref_pro_4 ref_ftr_6 381 547 Swi6 fold
NA ref_pro_4 ref_ftr_7 633 655 coiled coil NA ref_pro_4 ref_ftr_7 633 655 coiled coil
# MBP1_ASPNI # MBP1_ASPNI
NA ref_pro_1 ref_ftr_1 9 106 APSES fold NA ref_pro_1 ref_ftr_1 9 106 APSES fold
NA ref_pro_1 ref_ftr_2 26 109 KilA-N NA ref_pro_1 ref_ftr_2 26 109 KilA-N
NA ref_pro_1 ref_ftr_4 529 534 low complexity NA ref_pro_1 ref_ftr_4 529 534 low complexity
NA ref_pro_1 ref_ftr_5 260 289 Ankyrin NA ref_pro_1 ref_ftr_5 260 289 Ankyrin
NA ref_pro_1 ref_ftr_5 381 413 Ankyrin NA ref_pro_1 ref_ftr_5 381 413 Ankyrin
NA ref_pro_1 ref_ftr_6 193 402 Swi6 fold NA ref_pro_1 ref_ftr_6 193 402 Swi6 fold
NA ref_pro_1 ref_ftr_7 509 572 coiled coil NA ref_pro_1 ref_ftr_7 509 572 coiled coil
# MBP1_BIPOR # MBP1_BIPOR
NA ref_pro_2 ref_ftr_1 8 106 APSES fold NA ref_pro_2 ref_ftr_1 8 106 APSES fold
NA ref_pro_2 ref_ftr_2 26 109 KilA-N NA ref_pro_2 ref_ftr_2 26 109 KilA-N
NA ref_pro_2 ref_ftr_4 134 152 low complexity NA ref_pro_2 ref_ftr_4 134 152 low complexity
NA ref_pro_2 ref_ftr_4 267 278 low complexity NA ref_pro_2 ref_ftr_4 267 278 low complexity
NA ref_pro_2 ref_ftr_4 670 685 low complexity NA ref_pro_2 ref_ftr_4 670 685 low complexity
NA ref_pro_2 ref_ftr_5 266 295 Ankyrin NA ref_pro_2 ref_ftr_5 266 295 Ankyrin
NA ref_pro_2 ref_ftr_5 387 416 Ankyrin NA ref_pro_2 ref_ftr_5 387 416 Ankyrin
NA ref_pro_2 ref_ftr_6 253 421 Swi6 fold NA ref_pro_2 ref_ftr_6 253 421 Swi6 fold
NA ref_pro_2 ref_ftr_7 659 681 coiled coil NA ref_pro_2 ref_ftr_7 659 681 coiled coil
NA ref_pro_2 ref_ftr_7 500 590 coiled coil NA ref_pro_2 ref_ftr_7 500 590 coiled coil
# MBP1_NEUCR # MBP1_NEUCR
NA ref_pro_3 ref_ftr_1 14 114 APSES fold NA ref_pro_3 ref_ftr_1 14 114 APSES fold
NA ref_pro_3 ref_ftr_2 34 117 KilA-N NA ref_pro_3 ref_ftr_2 34 117 KilA-N
NA ref_pro_3 ref_ftr_4 130 141 low complexity NA ref_pro_3 ref_ftr_4 130 141 low complexity
NA ref_pro_3 ref_ftr_4 253 266 low complexity NA ref_pro_3 ref_ftr_4 253 266 low complexity
NA ref_pro_3 ref_ftr_4 514 525 low complexity NA ref_pro_3 ref_ftr_4 514 525 low complexity
NA ref_pro_3 ref_ftr_4 554 564 low complexity NA ref_pro_3 ref_ftr_4 554 564 low complexity
NA ref_pro_3 ref_ftr_4 601 618 low complexity NA ref_pro_3 ref_ftr_4 601 618 low complexity
NA ref_pro_3 ref_ftr_4 620 629 low complexity NA ref_pro_3 ref_ftr_4 620 629 low complexity
NA ref_pro_3 ref_ftr_4 636 652 low complexity NA ref_pro_3 ref_ftr_4 636 652 low complexity
NA ref_pro_3 ref_ftr_4 658 672 low complexity NA ref_pro_3 ref_ftr_4 658 672 low complexity
NA ref_pro_3 ref_ftr_4 725 735 low complexity NA ref_pro_3 ref_ftr_4 725 735 low complexity
NA ref_pro_3 ref_ftr_4 752 771 low complexity NA ref_pro_3 ref_ftr_4 752 771 low complexity
NA ref_pro_3 ref_ftr_5 268 297 Ankyrin NA ref_pro_3 ref_ftr_5 268 297 Ankyrin
NA ref_pro_3 ref_ftr_5 390 419 Ankyrin NA ref_pro_3 ref_ftr_5 390 419 Ankyrin
NA ref_pro_3 ref_ftr_6 270 426 Swi6 fold NA ref_pro_3 ref_ftr_6 270 426 Swi6 fold
NA ref_pro_3 ref_ftr_7 500 550 coiled coil NA ref_pro_3 ref_ftr_7 500 550 coiled coil
# MBP1_SCHPO # MBP1_SCHPO
NA ref_pro_5 ref_ftr_1 8 104 APSES fold NA ref_pro_5 ref_ftr_1 8 104 APSES fold
NA ref_pro_5 ref_ftr_2 25 113 KilA-N NA ref_pro_5 ref_ftr_2 25 113 KilA-N
NA ref_pro_5 ref_ftr_4 111 125 low complexity NA ref_pro_5 ref_ftr_4 111 125 low complexity
NA ref_pro_5 ref_ftr_4 136 145 low complexity NA ref_pro_5 ref_ftr_4 136 145 low complexity
NA ref_pro_5 ref_ftr_4 176 191 low complexity NA ref_pro_5 ref_ftr_4 176 191 low complexity
NA ref_pro_5 ref_ftr_4 422 447 low complexity NA ref_pro_5 ref_ftr_4 422 447 low complexity
NA ref_pro_5 ref_ftr_5 247 276 Ankyrin NA ref_pro_5 ref_ftr_5 247 276 Ankyrin
NA ref_pro_5 ref_ftr_5 368 397 Ankyrin NA ref_pro_5 ref_ftr_5 368 397 Ankyrin
NA ref_pro_5 ref_ftr_6 234 400 Swi6 fold NA ref_pro_5 ref_ftr_6 234 400 Swi6 fold
NA ref_pro_5 ref_ftr_7 457 538 coiled coil NA ref_pro_5 ref_ftr_7 457 538 coiled coil
# MBP1_COPCI # MBP1_COPCI
NA ref_pro_6 ref_ftr_1 5 103 APSES fold NA ref_pro_6 ref_ftr_1 5 103 APSES fold
NA ref_pro_6 ref_ftr_2 23 106 KilA-N NA ref_pro_6 ref_ftr_2 23 106 KilA-N
NA ref_pro_6 ref_ftr_4 170 191 low complexity NA ref_pro_6 ref_ftr_4 170 191 low complexity
NA ref_pro_6 ref_ftr_4 435 450 low complexity NA ref_pro_6 ref_ftr_4 435 450 low complexity
NA ref_pro_6 ref_ftr_4 611 626 low complexity NA ref_pro_6 ref_ftr_4 611 626 low complexity
NA ref_pro_6 ref_ftr_5 270 299 Ankyrin NA ref_pro_6 ref_ftr_5 270 299 Ankyrin
NA ref_pro_6 ref_ftr_5 389 418 Ankyrin NA ref_pro_6 ref_ftr_5 389 418 Ankyrin
NA ref_pro_6 ref_ftr_5 474 509 Ankyrin NA ref_pro_6 ref_ftr_5 474 509 Ankyrin
NA ref_pro_6 ref_ftr_6 257 429 Swi6 fold NA ref_pro_6 ref_ftr_6 257 429 Swi6 fold
NA ref_pro_6 ref_ftr_7 500 570 coiled coil NA ref_pro_6 ref_ftr_7 500 570 coiled coil
NA ref_pro_6 ref_ftr_7 651 678 coiled coil NA ref_pro_6 ref_ftr_7 651 678 coiled coil
# MBP1_CRYNE # MBP1_CRYNE
NA ref_pro_7 ref_ftr_1 113 211 APSES fold NA ref_pro_7 ref_ftr_1 113 211 APSES fold
NA ref_pro_7 ref_ftr_2 131 215 KilA-N NA ref_pro_7 ref_ftr_2 131 215 KilA-N
NA ref_pro_7 ref_ftr_4 66 85 low complexity NA ref_pro_7 ref_ftr_4 66 85 low complexity
NA ref_pro_7 ref_ftr_4 413 423 low complexity NA ref_pro_7 ref_ftr_4 413 423 low complexity
NA ref_pro_7 ref_ftr_4 633 644 low complexity NA ref_pro_7 ref_ftr_4 633 644 low complexity
NA ref_pro_7 ref_ftr_4 697 709 low complexity NA ref_pro_7 ref_ftr_4 697 709 low complexity
NA ref_pro_7 ref_ftr_5 477 506 Ankyrin NA ref_pro_7 ref_ftr_5 477 506 Ankyrin
NA ref_pro_7 ref_ftr_5 618 647 Ankyrin NA ref_pro_7 ref_ftr_5 618 647 Ankyrin
NA ref_pro_7 ref_ftr_6 452 663 Swi6 fold NA ref_pro_7 ref_ftr_6 452 663 Swi6 fold
# MBP1_PUCGR # MBP1_PUCGR
NA ref_pro_8 ref_ftr_1 90 187 APSES fold NA ref_pro_8 ref_ftr_1 90 187 APSES fold
NA ref_pro_8 ref_ftr_2 107 190 KilA-N NA ref_pro_8 ref_ftr_2 107 190 KilA-N
NA ref_pro_8 ref_ftr_4 208 227 low complexity NA ref_pro_8 ref_ftr_4 208 227 low complexity
NA ref_pro_8 ref_ftr_4 273 291 low complexity NA ref_pro_8 ref_ftr_4 273 291 low complexity
NA ref_pro_8 ref_ftr_5 442 471 Ankyrin NA ref_pro_8 ref_ftr_5 442 471 Ankyrin
NA ref_pro_8 ref_ftr_5 475 509 Ankyrin NA ref_pro_8 ref_ftr_5 475 509 Ankyrin
NA ref_pro_8 ref_ftr_5 561 590 Ankyrin NA ref_pro_8 ref_ftr_5 561 590 Ankyrin
NA ref_pro_8 ref_ftr_6 429 601 Swi6 fold NA ref_pro_8 ref_ftr_6 429 601 Swi6 fold
NA ref_pro_8 ref_ftr_7 827 863 coiled coil NA ref_pro_8 ref_ftr_7 827 863 coiled coil
# MBP1_USTMA # MBP1_USTMA
NA ref_pro_9 ref_ftr_1 7 104 APSES fold NA ref_pro_9 ref_ftr_1 7 104 APSES fold
NA ref_pro_9 ref_ftr_2 24 107 KilA-N NA ref_pro_9 ref_ftr_2 24 107 KilA-N
NA ref_pro_9 ref_ftr_4 106 116 low complexity NA ref_pro_9 ref_ftr_4 106 116 low complexity
NA ref_pro_9 ref_ftr_4 161 183 low complexity NA ref_pro_9 ref_ftr_4 161 183 low complexity
NA ref_pro_9 ref_ftr_4 657 672 low complexity NA ref_pro_9 ref_ftr_4 657 672 low complexity
NA ref_pro_9 ref_ftr_4 776 796 low complexity NA ref_pro_9 ref_ftr_4 776 796 low complexity
NA ref_pro_9 ref_ftr_5 245 274 Ankyrin NA ref_pro_9 ref_ftr_5 245 274 Ankyrin
NA ref_pro_9 ref_ftr_5 355 384 Ankyrin NA ref_pro_9 ref_ftr_5 355 384 Ankyrin
NA ref_pro_9 ref_ftr_6 232 395 Swi6 fold NA ref_pro_9 ref_ftr_6 232 395 Swi6 fold
NA ref_pro_9 ref_ftr_7 581 609 coiled coil NA ref_pro_9 ref_ftr_7 581 609 coiled coil
# MBP1_WALME # MBP1_WALME
NA ref_pro_10 ref_ftr_1 6 103 APSES fold NA ref_pro_10 ref_ftr_1 6 103 APSES fold
NA ref_pro_10 ref_ftr_2 23 106 KilA-N NA ref_pro_10 ref_ftr_2 23 106 KilA-N
NA ref_pro_10 ref_ftr_4 149 162 low complexity NA ref_pro_10 ref_ftr_4 149 162 low complexity
NA ref_pro_10 ref_ftr_4 171 188 low complexity NA ref_pro_10 ref_ftr_4 171 188 low complexity
NA ref_pro_10 ref_ftr_4 618 628 low complexity NA ref_pro_10 ref_ftr_4 618 628 low complexity
NA ref_pro_10 ref_ftr_4 634 660 low complexity NA ref_pro_10 ref_ftr_4 634 660 low complexity
NA ref_pro_10 ref_ftr_5 250 279 Ankyrin NA ref_pro_10 ref_ftr_5 250 279 Ankyrin
NA ref_pro_10 ref_ftr_5 369 398 Ankyrin NA ref_pro_10 ref_ftr_5 369 398 Ankyrin
NA ref_pro_10 ref_ftr_6 237 409 Swi6 fold NA ref_pro_10 ref_ftr_6 237 409 Swi6 fold
NA ref_pro_10 ref_ftr_7 461 585 coiled coil NA ref_pro_10 ref_ftr_7 461 585 coiled coil

View File

@ -1,37 +1,37 @@
# functionTemplate.R # functionTemplate.R
# #
# Purpose: (General) # Purpose: (General)
# #
# ToDo: # ToDo:
# Notes: # Notes:
# #
# ============================================================================== # ==============================================================================
myFunction <- function(a, b=1) { myFunction <- function(a, b=1) {
# Purpose: # Purpose:
# Describe ... # Describe ...
# Version: # Version:
# Date: # Date:
# Author: # Author:
# #
# Parameters: # Parameters:
# a: ... # a: ...
# b: ... # b: ...
# Value: # Value:
# result: ... # result: ...
# Example: <example invocation> # Example: <example invocation>
# code ... # code ...
return(result) return(result)
} }
# ==== TESTS ================================================================= # ==== TESTS =================================================================
# Enter your function tests here... # Enter your function tests here...
if (FALSE) { if (FALSE) {
# test ... # test ...
} }
# [END] # [END]

View File

@ -1,21 +1,21 @@
# .myProfile.R # .myProfile.R
# This contains information which the course framework needs from time to time # This contains information which the course framework needs from time to time
# to personalize assignments, validate submissions etc. Make sure that # to personalize assignments, validate submissions etc. Make sure that
# the information correctly matches our official records. # the information correctly matches our official records.
# myEmail char A string with your eMail address. Use your official # myEmail char A string with your eMail address. Use your official
# UofT eMail address. # UofT eMail address.
# myStudentNumber numeric Your UofT student number. Take care to have this # myStudentNumber numeric Your UofT student number. Take care to have this
# correct. # correct.
# #
# NOTE: # NOTE:
# After you have updated this script, move the file to your "myScripts" folder. # After you have updated this script, move the file to your "myScripts" folder.
# Utility scripts will look for it on the path: "./myScripts/.myProfile.R" # Utility scripts will look for it on the path: "./myScripts/.myProfile.R"
# #
# ============================================================================== # ==============================================================================
# options(stringsAsFactors = FALSE) # options(stringsAsFactors = FALSE)
myEMail <- "yh.deng@mail.utoronto.ca" # e.g. "u.franklin@utoronto.ca" myEMail <- "yh.deng@mail.utoronto.ca" # e.g. "u.franklin@utoronto.ca"
myStudentNumber <- 1005845285 # e.g. 1003141592 myStudentNumber <- 1005845285 # e.g. 1003141592
MYSPE <- "Cutaneotrichosporon oleaginosum" MYSPE <- "Cutaneotrichosporon oleaginosum"
# [END] # [END]

View File

@ -1,54 +1,51 @@
myFA <- readFASTA("data/RAB39B_HSa_coding.fa") gen_mutations <- function(seq, N) {
myFA <- rbind(myFA, readFASTA("data/PTPN5_HSa_coding.fa")) sealKey() # See: http://steipe.biochemistry.utoronto.ca/abc/index.php/BCH441_Code_submisson_instructions
myFA <- rbind(myFA, readFASTA("data/PTPN11_HSa_coding.fa")) stats <- c()
myFA <- rbind(myFA, readFASTA("data/KRAS_HSa_coding.fa")) stats <- cbind(stats, c(0, 0, 0))
rownames(myFA)<-c("RAB39B", "PTPN5", "PTPN11", "KRAS") # Assign row names rownames(stats) <- c("silent", "missense", "nonsense")
colnames(stats) <- c("occurrences")
gen_mutations <- function(seq, N) { # Actual function
stats <- c() for (i in 1:N) {
stats <- cbind(stats, c(0, 0, 0)) original_seq <- Biostrings::DNAString(seq)
rownames(stats) <- c("silent", "missense", "nonsense") aa_seq <- Biostrings::translate(original_seq, no.init.codon = TRUE)
colnames(stats) <- c("occurrences")
# Actual function mut_seq <- Biostrings::DNAString(seq)
for (i in 1:217) { mut_index <- sample(1:length(original_seq), 1, replace = TRUE)
# select index for mutation possible_mutations <- Biostrings::DNA_BASES
working_seq <- Biostrings::DNAString(seq) possible_mutations <- possible_mutations[possible_mutations != as.character(unlist(original_seq[mut_index]))]
aa_seq <- Biostrings::translate(working_seq, no.init.codon = TRUE) mut_seq <- Biostrings::replaceLetterAt(mut_seq, mut_index, sample(possible_mutations, 1, replace = TRUE))
mut_action <- sample(c("ins", "del", "sub"), 1, TRUE) mut_aa <- Biostrings::translate(mut_seq, no.init.codon = TRUE)
mut_seq <- Biostrings::DNAString(seq)
if (mut_action == "sub") {
mut_index <- sample(1:length(working_seq), 1, replace = TRUE) term_aa <- regexpr(pattern = "\\*", aa_seq)
possible_mutations <- Biostrings::DNA_BASES term_mut_aa <- as.integer(regexpr(pattern = "\\*", mut_aa))
possible_mutations <- possible_mutations[possible_mutations != as.character(unlist(working_seq[mut_index]))] if ((term_aa == -1 && term_mut_aa != -1) || (term_mut_aa != -1 && term_mut_aa < term_aa)) {
mut_change <- sample(possible_mutations, 1, replace = TRUE) stats["nonsense", "occurrences"] <- 1 + stats["nonsense", "occurrences"]
mut_seq <- Biostrings::replaceLetterAt(mut_seq, mut_index, mut_change) } else if (mut_aa == aa_seq) {
} else if (mut_action == "ins") { stats["silent", "occurrences"] <- 1 + stats["silent", "occurrences"]
mut_index <- sample(1:length(working_seq) - 2, 1, replace = TRUE) } else {
possible_mutations <- Biostrings::DNA_BASES stats["missense", "occurrences"] <- 1 + stats["missense", "occurrences"]
mut_seq <- Biostrings::DNAString(paste(substring(working_seq, 1, mut_index - 1), sample(possible_mutations, 1), substring(working_seq, mut_index), sep = "")) }
} else { }
mut_index <- sample(1:length(working_seq), 1, replace = TRUE) sealKey()
mut_seq <- mut_seq[-mut_index] return(stats)
} }
mut_seq <- Biostrings::DNAString(substring(mut_seq, 1, length(mut_seq) - (length(mut_seq) %% 3)))
mut_aa <- Biostrings::translate(mut_seq, no.init.codon = TRUE) gen_mutations("ATGATGATGATGATGATG", 1000)
gen_mutations("CCCCCCCCCCCCCCCCCC", 500)
# Note: we need silent, nonsense, and missense gen_mutations("TATTACTATTACTATTAC", 500)
mut_aa_stop <- match("*", Biostrings::as.matrix(mut_aa)) gen_mutations("TGGTGGTGGTGGTGGTGGTGGTGG", 500)
aa_seq_stop <- match("*", Biostrings::as.matrix(aa_seq)) gen_mutations("TGTTGTTGTTGTTGTTGTTGTTGT", 500)
if (!is.na(mut_aa_stop) & (is.na(aa_seq_stop) | mut_aa_stop < aa_seq_stop)) { gen_mutations("TGTTGTTGTTGTTGTTGTTGTTGA", 500)
stats["nonsense", "occurrences"] <- 1 + stats["nonsense", "occurrences"]
} else if (mut_aa == aa_seq) {
stats["silent", "occurrences"] <- 1 + stats["silent", "occurrences"] myFA <- readFASTA("data/RAB39B_HSa_coding.fa")
} else { myFA <- rbind(myFA, readFASTA("data/PTPN5_HSa_coding.fa"))
stats["missense", "occurrences"] <- 1 + stats["missense", "occurrences"] myFA <- rbind(myFA, readFASTA("data/PTPN11_HSa_coding.fa"))
} myFA <- rbind(myFA, readFASTA("data/KRAS_HSa_coding.fa"))
} rownames(myFA)<-c("RAB39B", "PTPN5", "PTPN11", "KRAS") # Assign row names
return(stats)
} gen_mutations(myFA["RAB39B", 2], 10000)
N_test <- 1200 gen_mutations(myFA["PTPN5", 2], 10000)
gen_mutations("ATGATGATGATGATGATG", N_test) gen_mutations(myFA["PTPN11", 2], 10000)
gen_mutations("CCCCCCCCCCCCCCCCCC", N_test) gen_mutations(myFA["KRAS", 2], 10000)
gen_mutations("TATTACTATTACTATTAC", N_test)
gen_mutations("TGGTGGTGGTGGTGGTGGTGGTGG", N_test)
gen_mutations("TGTTGTTGTTGTTGTTGTTGTTGT", N_test)

View File

@ -1,41 +1,41 @@
# == 1.3 Task: submit for credit (part 1/2) ================================ # == 1.3 Task: submit for credit (part 1/2) ================================
# == Submission - Code to add another philosopher to the datamodel: # == Submission - Code to add another philosopher to the datamodel:
pID <- autoincrement(philDB$person) pID <- autoincrement(philDB$person)
immanuelKant <- data.frame(id = pID, immanuelKant <- data.frame(id = pID,
name = "Immanuel Kant", name = "Immanuel Kant",
born = "1724", born = "1724",
died = "1804", died = "1804",
school = "Enlightenment Philosophy") school = "Enlightenment Philosophy")
philDB$person <- rbind(philDB$person, immanuelKant) philDB$person <- rbind(philDB$person, immanuelKant)
bID = autoincrement(philDB$books) bID = autoincrement(philDB$books)
immanuelKantWork <- data.frame(id = bID, immanuelKantWork <- data.frame(id = bID,
title = "Critique of Pure Reason", title = "Critique of Pure Reason",
published = "1781") published = "1781")
philDB$books <- rbind(philDB$books, immanuelKantWork) philDB$books <- rbind(philDB$books, immanuelKantWork)
philDB$works <- rbind(philDB$works, data.frame(id = autoincrement(philDB$works), personID = pID, bookID = bID)) philDB$works <- rbind(philDB$works, data.frame(id = autoincrement(philDB$works), personID = pID, bookID = bID))
bID = autoincrement(philDB$books) bID = autoincrement(philDB$books)
immanuelKantWork <- data.frame(id = bID, immanuelKantWork <- data.frame(id = bID,
title = "Critique of Judgement", title = "Critique of Judgement",
published = "1790") published = "1790")
philDB$books <- rbind(philDB$books, immanuelKantWork) philDB$books <- rbind(philDB$books, immanuelKantWork)
philDB$works <- rbind(philDB$works, data.frame(id = autoincrement(philDB$works), personID = pID, bookID = bID)) philDB$works <- rbind(philDB$works, data.frame(id = autoincrement(philDB$works), personID = pID, bookID = bID))
# == Submission: Code to list the philosophical schools in alphabetical order as well as their respective books in alphabetical order. # == Submission: Code to list the philosophical schools in alphabetical order as well as their respective books in alphabetical order.
schools <- unique(philDB$person$school) schools <- unique(philDB$person$school)
schools <- sort(schools) schools <- sort(schools)
for (s in schools) { for (s in schools) {
cat(sprintf("%s\n", s)) cat(sprintf("%s\n", s))
authors = which(philDB$person$school == s) authors = which(philDB$person$school == s)
for (author in authors) { for (author in authors) {
works = which(philDB$works$personID == author) works = which(philDB$works$personID == author)
for (work in works) { for (work in works) {
bookId = which(philDB$books$id == philDB$works$bookID[work]) bookId = which(philDB$books$id == philDB$works$bookID[work])
cat(sprintf("\t%s - (%s)\n", philDB$books$title[bookId], philDB$books$published[bookId])) cat(sprintf("\t%s - (%s)\n", philDB$books$title[bookId], philDB$books$published[bookId]))
} }
} }
} }

View File

@ -1,4 +1,4 @@
[{ [{
"ID": 879819, "ID": 879819,
"species": "Cutaneotrichosporon oleaginosum"} "species": "Cutaneotrichosporon oleaginosum"}
] ]

View File

@ -1,19 +1,19 @@
[ [
{ "name" : "MBP1_CUTOL", { "name" : "MBP1_CUTOL",
"RefSeqID" : "XP_018278493.1", "RefSeqID" : "XP_018278493.1",
"UniProtID" : "A0A0J0XLN0", "UniProtID" : "A0A0J0XLN0",
"taxonomyID" : 879819, "taxonomyID" : 879819,
"sequence" : [ "sequence" : [
"MGKKAAAAGDGGPNTIYKATYSGVPVFEFICRNVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREVQ", "MGKKAAAAGDGGPNTIYKATYSGVPVFEFICRNVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREVQ",
"KGEHEKVQGGYGKYQGTWVPIERGLALAKQYNVEDLLRPIIDFVPRESVSPPPAPKHAVAPPTKRNKEPK", "KGEHEKVQGGYGKYQGTWVPIERGLALAKQYNVEDLLRPIIDFVPRESVSPPPAPKHAVAPPTKRNKEPK",
"PKEGLVPIKSAGVLSGTGRHQTPDSVGEDVESEVMDDMSESQTPSPLNGTSLLPAVDERSIDGMDIDGFS", "PKEGLVPIKSAGVLSGTGRHQTPDSVGEDVESEVMDDMSESQTPSPLNGTSLLPAVDERSIDGMDIDGFS",
"MMNGGGHARKRSAAMMDDEDEYEQLKRARGNSAVHTPPPPGQSPRYGGMQHPLTQDEYNDIVLNYFVSEA", "MMNGGGHARKRSAAMMDDEDEYEQLKRARGNSAVHTPPPPGQSPRYGGMQHPLTQDEYNDIVLNYFVSEA",
"TQIPAVMTNPPYNWDPNGIIDDDHHTALHWAAAMGRTRVIKLLLSAGARIFDKNNLDQTPLMRSVMFTNN", "TQIPAVMTNPPYNWDPNGIIDDDHHTALHWAAAMGRTRVIKLLLSAGARIFDKNNLDQTPLMRSVMFTNN",
"YDLRKFPEVFELLHRSTLNIDKNNRTVFHHIANLALYKGKTHAARYYMEVILSRLADYPQELADVINFAD", "YDLRKFPEVFELLHRSTLNIDKNNRTVFHHIANLALYKGKTHAARYYMEVILSRLADYPQELADVINFAD",
"EDGETALTLAARARSKRIVKALLDHGADPKLRNRDHKSAEDYILEDERFRSSPDVMLNRTQPSAAPRNPT", "EDGETALTLAARARSKRIVKALLDHGADPKLRNRDHKSAEDYILEDERFRSSPDVMLNRTQPSAAPRNPT",
"SLGAAVFSQGLPPQLYNSEAARLASGPHSSDILQQMQALARSFEAEKLNKERDVLEAKAMLTSIHTEVND", "SLGAAVFSQGLPPQLYNSEAARLASGPHSSDILQQMQALARSFEAEKLNKERDVLEAKAMLTSIHTEVND",
"AGRTLHNLGEQMKPLEAKQGELDGLVERLQSKLQKDLARGARKWKAADEGRENRWKNGDDPSQAGEDYSD", "AGRTLHNLGEQMKPLEAKQGELDGLVERLQSKLQKDLARGARKWKAADEGRENRWKNGDDPSQAGEDYSD",
"LPELTAIPDNAEAEEERLRGEIEKMRARRGELVTRLVKAQTQTGTTDKMAQYRRLITAGCGGDINPGEID", "LPELTAIPDNAEAEEERLRGEIEKMRARRGELVTRLVKAQTQTGTTDKMAQYRRLITAGCGGDINPGEID",
"DIVGQLLDMLENEAQSGRPAPPPQAAPSWVTS"] "DIVGQLLDMLENEAQSGRPAPPPQAAPSWVTS"]
} }
] ]

View File

@ -1,8 +1,8 @@
README - myScripts folder: README - myScripts folder:
========================== ==========================
The "myScripts" folder is a place to keep your personal files The "myScripts" folder is a place to keep your personal files
safe. No files will be submitted into this folder on the GitHub master safe. No files will be submitted into this folder on the GitHub master
copy. Therefore everything you put into this folder is safe from being copy. Therefore everything you put into this folder is safe from being
inadvertently overwritten by a file with the same name that would be inadvertently overwritten by a file with the same name that would be
downloaded in a GitHub "pull" request. downloaded in a GitHub "pull" request.

View File

@ -1,4 +1,4 @@
source("./scripts/ABC-createRefDB.R") source("./scripts/ABC-createRefDB.R")
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./myScripts/MBP1_CUTOL.json")) myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./myScripts/MBP1_CUTOL.json"))
myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./myScripts/CUTOLtaxonomy.json")) myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./myScripts/CUTOLtaxonomy.json"))

View File

@ -1,38 +1,38 @@
# myScript.R # myScript.R
# #
# --- As you work with this file, you can delete the instructions below -------- # --- As you work with this file, you can delete the instructions below --------
# Write your notes and code experiments into this document. Save it # Write your notes and code experiments into this document. Save it
# from time to time - however I recommend that you do not _commit_ # from time to time - however I recommend that you do not _commit_
# your saved version. # your saved version.
# #
# As long as you do not _commit_ this script to version control, # As long as you do not _commit_ this script to version control,
# you can _pull_ updated versions of the entire project from GitHub # you can _pull_ updated versions of the entire project from GitHub
# by using the RStudio version control interface. However, once # by using the RStudio version control interface. However, once
# you _commit_ any file in your local version, RStudio will require # you _commit_ any file in your local version, RStudio will require
# you to resolve conflicts before you can _pull_ updates. # you to resolve conflicts before you can _pull_ updates.
# --- As you work with this file, you can delete the instructions above -------- # --- As you work with this file, you can delete the instructions above --------
# #
## Purpose: <...> ## Purpose: <...>
# #
# Version: <...> # Version: <...>
# #
# Date: <...> # Date: <...>
# Author: <Name> (<namee@mail.utoronto.ca>) # Author: <Name> (<namee@mail.utoronto.ca>)
# #
# Versions: # Versions:
# #
# <number> <Features> # <number> <Features>
# #
# TODO: # TODO:
# <...> # <...>
# #
# ==================================================================== # ====================================================================
# [END] # [END]

File diff suppressed because it is too large Load Diff

View File

@ -1,75 +1,75 @@
# scriptTemplate.R # scriptTemplate.R
# #
# Purpose: # Purpose:
# Version: # Version:
# Date: # Date:
# Author: # Author:
# #
# Input: # Input:
# Output: # Output:
# Dependencies: # Dependencies:
# #
# ToDo: # ToDo:
# Notes: # Notes:
# #
# ============================================================================== # ==============================================================================
setwd("<your/project/directory>") setwd("<your/project/directory>")
# ==== PARAMETERS ============================================================ # ==== PARAMETERS ============================================================
# Define and explain all parameters. No "magic numbers" in your code below. # Define and explain all parameters. No "magic numbers" in your code below.
# ==== PACKAGES ============================================================== # ==== PACKAGES ==============================================================
# Check that required packages have been installed. Install if needed. # Check that required packages have been installed. Install if needed.
if (! requireNamespace("seqinr", quietly=TRUE)) { if (! requireNamespace("seqinr", quietly=TRUE)) {
install.packages("seqinr") install.packages("seqinr")
} }
# Package information: # Package information:
# library(help = seqinr) # basic information # library(help = seqinr) # basic information
# browseVignettes("seqinr") # available vignettes # browseVignettes("seqinr") # available vignettes
# data(package = "seqinr") # available datasets # data(package = "seqinr") # available datasets
# Note: use package functions with the :: operator - eg. # Note: use package functions with the :: operator - eg.
# seqinr::aaa("K") # seqinr::aaa("K")
# ==== FUNCTIONS ============================================================= # ==== FUNCTIONS =============================================================
# Define functions or source external files # Define functions or source external files
source("<myUtilityFunctionsScript.R>") source("<myUtilityFunctionsScript.R>")
myFunction <- function(a, b=1) { myFunction <- function(a, b=1) {
# Purpose: # Purpose:
# Describe ... # Describe ...
# Parameters: # Parameters:
# a: ... # a: ...
# b: ... # b: ...
# Value: # Value:
# result: ... # result: ...
# code ... # code ...
return(result) return(result)
} }
# ==== PROCESS =============================================================== # ==== PROCESS ===============================================================
# Enter the step-by-step process of your project here. Strive to write your # Enter the step-by-step process of your project here. Strive to write your
# code so that you can simply run this entire file and re-create all # code so that you can simply run this entire file and re-create all
# intermediate results. # intermediate results.
# ==== TESTS ================================================================= # ==== TESTS =================================================================
# Enter your function tests here... # Enter your function tests here...
# [END] # [END]

View File

@ -1,30 +1,30 @@
# ABC-createRefDB.R # ABC-createRefDB.R
# #
# Create a reference protein database for Mbp1-like proteins # Create a reference protein database for Mbp1-like proteins
# #
# Boris Steipe for ABC learning units # Boris Steipe for ABC learning units
# #
# For the species, see: # For the species, see:
# http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi # http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
# #
# For the data model, see # For the data model, see
# https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0 # https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0
# For the schema, see dbInit() in ./scripts/ABC-dbUtilities.R # For the schema, see dbInit() in ./scripts/ABC-dbUtilities.R
# #
# ============================================================================== # ==============================================================================
myDB <- dbInit() myDB <- dbInit()
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/MBP1_SACCE.json")) myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/MBP1_SACCE.json"))
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refMBP1Proteins.json")) myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refMBP1Proteins.json"))
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refAPSES_PSI-BLAST.json")) myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refAPSES_PSI-BLAST.json"))
myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./data/refTaxonomy.json")) myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./data/refTaxonomy.json"))
myDB <- dbAddFeature(myDB, jsonlite::fromJSON("./data/refFeatures.json")) myDB <- dbAddFeature(myDB, jsonlite::fromJSON("./data/refFeatures.json"))
myDB <- dbAddAnnotation( myDB, jsonlite::fromJSON("./data/refAnnotations.json")) myDB <- dbAddAnnotation( myDB, jsonlite::fromJSON("./data/refAnnotations.json"))
# [END] # [END]

File diff suppressed because it is too large Load Diff

View File

@ -1,443 +1,443 @@
# tocID <- "scripts/ABC-makeMYSPElist.R" # tocID <- "scripts/ABC-makeMYSPElist.R"
# #
# Purpose: Create a list of genome sequenced fungi with protein annotations and # Purpose: Create a list of genome sequenced fungi with protein annotations and
# Mbp1 homologues. # Mbp1 homologues.
# #
# Version: 1.4 # Version: 1.4
# #
# Date: 2016 09 - 2021 09 # Date: 2016 09 - 2021 09
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions # Versions
# 1.4 New retrieval logic # 1.4 New retrieval logic
# 1.3 Rewrite to change datasource. NCBI has not been updated # 1.3 Rewrite to change datasource. NCBI has not been updated
# since 2012. Use ensembl fungi as initial source. # since 2012. Use ensembl fungi as initial source.
# 1.2 Change from require() to requireNamespace() # 1.2 Change from require() to requireNamespace()
# 1.1.2 Moved BLAST.R to ./scripts directory # 1.1.2 Moved BLAST.R to ./scripts directory
# 1.1 Update 2017 # 1.1 Update 2017
# 1.0 First code 2016 # 1.0 First code 2016
# #
# TODO: # TODO:
# #
# ============================================================================== # ==============================================================================
# #
# DO NOT source() THIS FILE! # DO NOT source() THIS FILE!
# #
# This file is code I provide for your deeper understanding of a process and # This file is code I provide for your deeper understanding of a process and
# to provide you with useful sample code. It is not actually necessary for # to provide you with useful sample code. It is not actually necessary for
# you to run this code, but I encourage you to read it carefully and discuss # you to run this code, but I encourage you to read it carefully and discuss
# if there are parts you don't understand. # if there are parts you don't understand.
# #
# Run the commands that interact with the NCBI servers only if you want to # Run the commands that interact with the NCBI servers only if you want to
# experiment specifically with the code and/or parameters. I have commented out # experiment specifically with the code and/or parameters. I have commented out
# those parts. If you only want to study the general workflow, just load() # those parts. If you only want to study the general workflow, just load()
# the respective intermediate results. # the respective intermediate results.
# #
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> -------------------------------------------------------- #TOC> --------------------------------------------------------
#TOC> 1 The strategy 55 #TOC> 1 The strategy 55
#TOC> 2 PACKAGES AND INITIALIZATIONS 67 #TOC> 2 PACKAGES AND INITIALIZATIONS 67
#TOC> 3 ENSEMBL FUNGI 75 #TOC> 3 ENSEMBL FUNGI 75
#TOC> 3.1 Import 78 #TOC> 3.1 Import 78
#TOC> 4 BLAST SEARCH 155 #TOC> 4 BLAST SEARCH 155
#TOC> 4.1 find homologous proteins 161 #TOC> 4.1 find homologous proteins 161
#TOC> 4.2 Identify species in "hits" 192 #TOC> 4.2 Identify species in "hits" 192
#TOC> 5 MERGE ENSEMBL AND BLAST RESULTS 282 #TOC> 5 MERGE ENSEMBL AND BLAST RESULTS 282
#TOC> 6 STUDENT NUMBERS 375 #TOC> 6 STUDENT NUMBERS 375
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 The strategy ======================================================== # = 1 The strategy ========================================================
# This script will create a list of "MYSPE" species and save it in an R object # This script will create a list of "MYSPE" species and save it in an R object
# MYSPEspecies that is stored in the data subdirectory of this project from # MYSPEspecies that is stored in the data subdirectory of this project from
# where it can be loaded. The strategy is as follows: we download a list of # where it can be loaded. The strategy is as follows: we download a list of
# annotated fungal genomes from ensembl.fungi. All these are genome-sequenced # annotated fungal genomes from ensembl.fungi. All these are genome-sequenced
# species that have been annotated. # species that have been annotated.
# Next we perform a BLAST search, to identify fungal species that have # Next we perform a BLAST search, to identify fungal species that have
# genes that are homologous to yeast MBP1. # genes that are homologous to yeast MBP1.
# #
# ... # ...
# = 2 PACKAGES AND INITIALIZATIONS ======================================== # = 2 PACKAGES AND INITIALIZATIONS ========================================
# httr provides interfaces to Webservers on the Internet # httr provides interfaces to Webservers on the Internet
if (! requireNamespace("httr", quietly = TRUE)) { if (! requireNamespace("httr", quietly = TRUE)) {
install.packages("httr") install.packages("httr")
} }
# = 3 ENSEMBL FUNGI ======================================================= # = 3 ENSEMBL FUNGI =======================================================
# == 3.1 Import ============================================================ # == 3.1 Import ============================================================
# Navigate to https://fungi.ensembl.org and click on the link to the full # Navigate to https://fungi.ensembl.org and click on the link to the full
# list of all species: https://fungi.ensembl.org/species.html # list of all species: https://fungi.ensembl.org/species.html
# On the page, click on the spreadsheet symbol top right and choose # On the page, click on the spreadsheet symbol top right and choose
# "download whole table". The file will be named "Species.csv", in your # "download whole table". The file will be named "Species.csv", in your
# usual downloads folder. Move it to the data folder, and read it. # usual downloads folder. Move it to the data folder, and read it.
sDat <- read.csv("./data/Species.csv") sDat <- read.csv("./data/Species.csv")
str(sDat) str(sDat)
# The most obvious way to partition these is according to Classification ... # The most obvious way to partition these is according to Classification ...
# (poking around a bit in the UniProt taxonomy database shows that the # (poking around a bit in the UniProt taxonomy database shows that the
# classification used here is the taxonomic rank of "order"). # classification used here is the taxonomic rank of "order").
# how many classifications do we have? # how many classifications do we have?
length(unique(sDat$Classification)) # 66 length(unique(sDat$Classification)) # 66
# To have a good set for the class, we should have about 100. # To have a good set for the class, we should have about 100.
# Let's see for which of these we can find Mbp1 homologues. # Let's see for which of these we can find Mbp1 homologues.
# First, we'll keep only the columns for name, classification, and taxID, and # First, we'll keep only the columns for name, classification, and taxID, and
# drop the rest ... # drop the rest ...
sDat <- sDat[ , c("Name", "Classification", "Taxon.ID")] sDat <- sDat[ , c("Name", "Classification", "Taxon.ID")]
colnames(sDat) <- c("name", "order", "taxID") colnames(sDat) <- c("name", "order", "taxID")
# Next, we make an extra column: genus - the first part of the binomial name. # Next, we make an extra column: genus - the first part of the binomial name.
# We'll use the gsub() function, and for that we need a "regular expression" # We'll use the gsub() function, and for that we need a "regular expression"
# that matches to all characters from the first blank to the end of the string: # that matches to all characters from the first blank to the end of the string:
myPatt <- "\\s.*$" # one whitespace (\\s) ... myPatt <- "\\s.*$" # one whitespace (\\s) ...
# followed by any character (.) 0..n times (*) ... # followed by any character (.) 0..n times (*) ...
# until the end of the string # until the end of the string
# using gsub() we substitute all matching characters with the empty string "" - # using gsub() we substitute all matching characters with the empty string "" -
# this deletes the matching characters # this deletes the matching characters
# Test this: # Test this:
gsub(myPatt, "", "Genus") # one word: unchanged gsub(myPatt, "", "Genus") # one word: unchanged
gsub(myPatt, "", "gEnus species") # two words: return only first gsub(myPatt, "", "gEnus species") # two words: return only first
gsub(myPatt, "", "geNus species strain 123") # many words: return only first gsub(myPatt, "", "geNus species strain 123") # many words: return only first
# apply this to the "name" column and add the result as a separate column # apply this to the "name" column and add the result as a separate column
# called "genus" # called "genus"
sDat$genus <- gsub(myPatt, "", sDat$name) sDat$genus <- gsub(myPatt, "", sDat$name)
# what do we get? # what do we get?
c(head(unique(sDat$genus)), c(head(unique(sDat$genus)),
tail(unique(sDat$genus))) # inspect the first and last few. Note that there tail(unique(sDat$genus))) # inspect the first and last few. Note that there
# is a problem that we have to keep in mind. # is a problem that we have to keep in mind.
# (Always inspect your results!) # (Always inspect your results!)
# Drop all rows for which the genus contains special characters - # Drop all rows for which the genus contains special characters -
# like "[Candida]" # like "[Candida]"
sDat <- sDat[ ! grepl("[^a-zA-Z]", sDat$genus) , ] sDat <- sDat[ ! grepl("[^a-zA-Z]", sDat$genus) , ]
length(table(sDat$genus)) # how many genera? length(table(sDat$genus)) # how many genera?
hist(table(sDat$genus), col = "#E9F4FF") # Distribution ... hist(table(sDat$genus), col = "#E9F4FF") # Distribution ...
# most genera have very few, but # most genera have very few, but
# some have very many species. # some have very many species.
sort(table(sDat$genus), decreasing = TRUE)[1:10] # Top ten... sort(table(sDat$genus), decreasing = TRUE)[1:10] # Top ten...
# We should have at least one species from each taxonomic order, but we can # We should have at least one species from each taxonomic order, but we can
# add a few genera until we have about 100 validated species. # add a few genera until we have about 100 validated species.
# Let's add a column for species, by changing our regular expression a bit, # Let's add a column for species, by changing our regular expression a bit,
# using ^ (start of string), \\S (NOT a whitespace), # using ^ (start of string), \\S (NOT a whitespace),
# and + (one or more matches), capturing the match (...), and returning # and + (one or more matches), capturing the match (...), and returning
# it as the substitution (\\1) ... # it as the substitution (\\1) ...
myPatt <- "^(\\S+\\s\\S+)\\s.*$" myPatt <- "^(\\S+\\s\\S+)\\s.*$"
sDat$species <- gsub(myPatt, "\\1", sDat$name) sDat$species <- gsub(myPatt, "\\1", sDat$name)
# And we reorder the columns, just for aesthetics: # And we reorder the columns, just for aesthetics:
sDat <- sDat[ , c("name", "species", "genus", "order", "taxID")] sDat <- sDat[ , c("name", "species", "genus", "order", "taxID")]
# Final check: # Final check:
any(grepl("[^a-zA-Z -]", sDat$species)) # FALSE means no special characters any(grepl("[^a-zA-Z -]", sDat$species)) # FALSE means no special characters
# #
# Now we check which of these have Mbp1 homologues ... # Now we check which of these have Mbp1 homologues ...
# = 4 BLAST SEARCH ======================================================== # = 4 BLAST SEARCH ========================================================
# We run a BLAST search to find all proteins related to yeast Mbp1 in any # We run a BLAST search to find all proteins related to yeast Mbp1 in any
# fungus. With the results, we'll annotate our sDat table. # fungus. With the results, we'll annotate our sDat table.
# == 4.1 find homologous proteins ========================================== # == 4.1 find homologous proteins ==========================================
# #
# Use BLAST to fetch proteins related to Mbp1 and identify the species that # Use BLAST to fetch proteins related to Mbp1 and identify the species that
# contain them. # contain them.
# Scripting against NCBI APIs is not exactly enjoyable - there is usually a fair # Scripting against NCBI APIs is not exactly enjoyable - there is usually a fair
# amount of error handling involved that is not supported by the API in a # amount of error handling involved that is not supported by the API in a
# principled way but requires rather ad hoc solutions. The code I threw together # principled way but requires rather ad hoc solutions. The code I threw together
# to make a BLAST interface (demo-quality, not research-quality) is in the file # to make a BLAST interface (demo-quality, not research-quality) is in the file
# ./scripts/BLAST.R Feel encouraged to study how this works. It's a pretty # ./scripts/BLAST.R Feel encouraged to study how this works. It's a pretty
# standard task of communicating with servers and parsing responses - everyday # standard task of communicating with servers and parsing responses - everyday
# fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST # fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST
# parser in currently available packages. # parser in currently available packages.
# #
# DON'T use this for BLAST searches unless you have read the NCBI policy # DON'T use this for BLAST searches unless you have read the NCBI policy
# for automated tasks. If you indiscriminately pound on the NCBI's BLAST # for automated tasks. If you indiscriminately pound on the NCBI's BLAST
# server, they will blacklist your IP-address. See: # server, they will blacklist your IP-address. See:
# https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo # https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
# #
# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq # Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
# BLASThits <- BLAST("NP_010227", # Yeast Mbp1 RefSeq ID # BLASThits <- BLAST("NP_010227", # Yeast Mbp1 RefSeq ID
# db = "refseq_protein", # database to search in # db = "refseq_protein", # database to search in
# nHits = 3000, # 945 hits in 2020 # nHits = 3000, # 945 hits in 2020
# E = 0.01, # # E = 0.01, #
# limits = "txid4751[ORGN]") # = fungi # limits = "txid4751[ORGN]") # = fungi
# saveRDS(BLASThits, file="data/BLASThits.rds") # saveRDS(BLASThits, file="data/BLASThits.rds")
# #
# NO NEED TO ACTUALLY RUN THIS: you can load the results from the data directory # NO NEED TO ACTUALLY RUN THIS: you can load the results from the data directory
# #
BLASThits <- readRDS(file = "data/BLASThits.rds") BLASThits <- readRDS(file = "data/BLASThits.rds")
# == 4.2 Identify species in "hits" ======================================== # == 4.2 Identify species in "hits" ========================================
# This is a very big list that can't be usefully analyzed manually. Here # This is a very big list that can't be usefully analyzed manually. Here
# we are only interested in the species names that it contains. # we are only interested in the species names that it contains.
# How many hits in the list? # How many hits in the list?
length(BLASThits$hits) # 1,134 length(BLASThits$hits) # 1,134
# Let's look at a hit somewhere down the list # Let's look at a hit somewhere down the list
# Inspect the structure of one hit. The list element is spelled out as $hits:
# the shorter $hit only worked through partial name matching, which silently
# breaks as soon as another element starting with "hit" is added to the list.
str(BLASThits$hits[[277]])
# A fair amount of parsing has gone into the BLAST.R code to prepare the results # A fair amount of parsing has gone into the BLAST.R code to prepare the results
# in a useful way. The species information is in the $species element of every # in a useful way. The species information is in the $species element of every
# hit. # hit.
# Run a loop to extract all the species names into a vector. We subset ... # Run a loop to extract all the species names into a vector. We subset ...
# BLASThits$hits ... the list of hits, from which we choose ... # BLASThits$hits ... the list of hits, from which we choose ...
# BLASThits$hits[[i]] ... the i-th hit, and get ... # BLASThits$hits[[i]] ... the i-th hit, and get ...
# BLASThits$hits[[i]]$species ... the species element from that. # BLASThits$hits[[i]]$species ... the species element from that.
# Subsetting FTW. # Subsetting FTW.
# Collect the species name of every hit into one character vector.
# vapply() allocates the result once (no growing inside a loop) and verifies
# that each hit contributes exactly one character string.
BLASTspecies <- vapply(BLASThits$hits,
                       function(hit) hit$species,
                       character(1),
                       USE.NAMES = FALSE)

# You can confirm that BLASTspecies has the expected size.
length(BLASTspecies)

# If we delete some of these later on, we still want to remember which hit
# they came from. Thus we name() the elements with their index, which is the
# same as the index of the hit in BLASThits.
# seq_along() is used instead of 1:length(): it stays correct for an empty
# vector, where 1:length() would produce c(1, 0).
names(BLASTspecies) <- seq_along(BLASTspecies)
# let's plot the distribution of E-values # let's plot the distribution of E-values
# Collect the E-value of every hit, in the order of BLASThits$hits.
# vapply() preallocates the result and stops with an error if a hit does not
# provide exactly one numeric E-value, instead of silently coercing the vector.
eVals <- vapply(BLASThits$hits,
                function(hit) hit$E,
                numeric(1),
                USE.NAMES = FALSE)

range(eVals)      # smallest and largest E-value
sum(eVals == 0)   # number of hits whose E-value is reported as exactly 0
# let's plot the log of all values > 0 to see how they are distributed # let's plot the log of all values > 0 to see how they are distributed
# plotting only one vector of numbers plots their index as x, and # plotting only one vector of numbers plots their index as x, and
# their value as y ... # their value as y ...
plot(log(eVals[eVals > 0]), col = "#CC0000") plot(log(eVals[eVals > 0]), col = "#CC0000")
# This is very informative: I would suspect that the first ten or so are # This is very informative: I would suspect that the first ten or so are
# virtually identical to the yeast protein, then we have about 800 hits with # virtually identical to the yeast protein, then we have about 800 hits with
# decreasing similarity, and then about 200 more that may actually be false # decreasing similarity, and then about 200 more that may actually be false
# positives. Also - we plotted them by index, that means the table is SORTED: # positives. Also - we plotted them by index, that means the table is SORTED:
# Lower E-values strictly come before higher E-values. # Lower E-values strictly come before higher E-values.
# Again, some species appear more than once, e.g. ... # Again, some species appear more than once, e.g. ...
sum(BLASTspecies == "Saccharomyces cerevisiae") sum(BLASTspecies == "Saccharomyces cerevisiae")
# ... corresponding to the five homologous gene sequences (paralogues) of yeast. # ... corresponding to the five homologous gene sequences (paralogues) of yeast.
# Therefore we remove duplicates. Removing duplicates will leave the FIRST # Therefore we remove duplicates. Removing duplicates will leave the FIRST
# in a list alone, and only remove the SUBSEQUENT ones. Which means, from each # in a list alone, and only remove the SUBSEQUENT ones. Which means, from each
# species, we will retain only the protein that has the highest similarity # species, we will retain only the protein that has the highest similarity
# to yeast Mbp1, not any of its more distant paralogues. # to yeast Mbp1, not any of its more distant paralogues.
sel <- ! duplicated(BLASTspecies) sel <- ! duplicated(BLASTspecies)
BLASTspecies <- BLASTspecies[sel] BLASTspecies <- BLASTspecies[sel]
length(BLASTspecies) length(BLASTspecies)
# i.e. we got rid of about two thirds of the hits. # i.e. we got rid of about two thirds of the hits.
tail(BLASTspecies) # see how the names are useful! tail(BLASTspecies) # see how the names are useful!
# again - there are some special characters ... # again - there are some special characters ...
# what are they? # what are they?
BLASTspecies[grep("[^a-zA-Z ]", BLASTspecies)] BLASTspecies[grep("[^a-zA-Z ]", BLASTspecies)]
# remove the brackets ... # remove the brackets ...
BLASTspecies <- gsub("\\[|\\]", "", BLASTspecies) BLASTspecies <- gsub("\\[|\\]", "", BLASTspecies)
# drop any new duplicates ... # drop any new duplicates ...
BLASTspecies <- BLASTspecies[ ! duplicated(BLASTspecies)] BLASTspecies <- BLASTspecies[ ! duplicated(BLASTspecies)]
# check the number again: # check the number again:
length(BLASTspecies) length(BLASTspecies)
# Think a bit about this: what may be the biological reason to find that # Think a bit about this: what may be the biological reason to find that
# on average, in 388 fungi across the entire phylogenetic tree, we have # on average, in 388 fungi across the entire phylogenetic tree, we have
# three sequences that are homologous to yeast Mbp1? # three sequences that are homologous to yeast Mbp1?
# Let's look at the distribution of E-values in this selection (Subsetting FTW): # Let's look at the distribution of E-values in this selection (Subsetting FTW):
# we plot all values that are TRUE in the vector "sel" that we created above, # we plot all values that are TRUE in the vector "sel" that we created above,
# AND greater than 0 # AND greater than 0
plot(log(eVals[sel & eVals > 0]), col = "#00CC00") plot(log(eVals[sel & eVals > 0]), col = "#00CC00")
# = 5 MERGE ENSEMBL AND BLAST RESULTS ===================================== # = 5 MERGE ENSEMBL AND BLAST RESULTS =====================================
# Next we add the blast result to our sDat dataframe. We'll store the index, # Next we add the blast result to our sDat dataframe. We'll store the index,
# the E-value, and the Query-bounds from which we can estimate which domains # the E-value, and the Query-bounds from which we can estimate which domains
# of Mbp1 are actually covered by the hit. (True orthologues MUST align with # of Mbp1 are actually covered by the hit. (True orthologues MUST align with
# Mbp1's N-terminal APSES domain.) # Mbp1's N-terminal APSES domain.)
# #
# First we pull the hits we wanted from the BLASTspecies: # First we pull the hits we wanted from the BLASTspecies:
iHits <- as.numeric(names(BLASTspecies)) iHits <- as.numeric(names(BLASTspecies))
length(iHits) # one index for each TRUE in sel length(iHits) # one index for each TRUE in sel
# add columns to sDat # add columns to sDat
l <- nrow(sDat) l <- nrow(sDat)
sDat$iHit <- numeric(l) # index of the hit in the BLAST results sDat$iHit <- numeric(l) # index of the hit in the BLAST results
sDat$eVal <- numeric(l) # E-value of the hit sDat$eVal <- numeric(l) # E-value of the hit
sDat$lAli <- numeric(l) # length of the aligned region sDat$lAli <- numeric(l) # length of the aligned region
# extract and merge # extract and merge
# Copy the hit index, E-value and alignment length of every selected hit into
# the rows of sDat that belong to the same species.
#
# The species name is taken from BLASTspecies, not from the raw hit:
# BLASTspecies holds the cleaned names (square brackets removed above), while
# BLASThits$hits[[iHit]]$species still contains the brackets and so could not
# match sDat$species, which was checked to be free of special characters.
# iHits was derived from names(BLASTspecies), so the two vectors are parallel
# and can be walked with one common index.
for (j in seq_along(iHits)) {
  iHit <- iHits[j]                      # index of the hit in BLASThits$hits
  thisSp <- BLASTspecies[[j]]           # cleaned species name of that hit
  sel <- sDat$species == thisSp         # rows of sDat for this species
  sDat$iHit[sel] <- iHit
  sDat$eVal[sel] <- BLASThits$hits[[iHit]]$E
  sDat$lAli[sel] <- BLASThits$hits[[iHit]]$lengthAli
}
# Are all reference species accounted for? # Are all reference species accounted for?
selA <- sDat$iHit != 0 # all rows which matched to a BLAST hit selA <- sDat$iHit != 0 # all rows which matched to a BLAST hit
REFspecies %in% sDat$species[selA] # yes, all there REFspecies %in% sDat$species[selA] # yes, all there
selB <- sDat$species %in% REFspecies # all rows which have one of REF species selB <- sDat$species %in% REFspecies # all rows which have one of REF species
sum(selA & selB) # How many rows? sum(selA & selB) # How many rows?
# sDat of course includes all duplicates. Some may be multiply sequenced, some # sDat of course includes all duplicates. Some may be multiply sequenced, some
# may be different strains. We'll use the same strategy as before and keep # may be different strains. We'll use the same strategy as before and keep
# only the best hit: order the rows by E-value, then drop all rows which # only the best hit: order the rows by E-value, then drop all rows which
# are duplicated. # are duplicated.
# drop all rows without BLAST hits ... # drop all rows without BLAST hits ...
sDat <- sDat[ ! (sDat$iHit == 0) , ] sDat <- sDat[ ! (sDat$iHit == 0) , ]
# order sDat by E-value ... # order sDat by E-value ...
sDat <- sDat[order(sDat$eVal, decreasing = FALSE) , ] sDat <- sDat[order(sDat$eVal, decreasing = FALSE) , ]
# drop all rows with duplicated species ... # drop all rows with duplicated species ...
sDat <- sDat[ ! duplicated(sDat$species) , ] sDat <- sDat[ ! duplicated(sDat$species) , ]
# Let's look at the E-values ... # Let's look at the E-values ...
plot(log(sDat$eVal[sDat$eVal > 0]), col = "#00CC00") plot(log(sDat$eVal[sDat$eVal > 0]), col = "#00CC00")
# and alignment lengths ... # and alignment lengths ...
plot(sDat$lAli, col = "#00DDAA") plot(sDat$lAli, col = "#00DDAA")
# How many ... # How many ...
length(unique(sDat$name)) length(unique(sDat$name))
length(unique(sDat$species)) length(unique(sDat$species))
length(unique(sDat$genus)) length(unique(sDat$genus))
length(unique(sDat$order)) length(unique(sDat$order))
# I need an extra species for admin purposes later on ... # I need an extra species for admin purposes later on ...
# Set the "Sporothrix schenckii" rows aside in SPOSCdat and remove them from
# sDat. A logical mask is used instead of integer positions: with grep(), a
# search without any match returns integer(0), and sDat[-integer(0), ] would
# silently drop ALL rows of sDat rather than none.
sel <- grepl("Sporothrix schenckii", sDat$species)
SPOSCdat <- sDat[sel, ]
sDat <- sDat[ ! sel, ]
# To get the final dataset, we remove the reference species with their # To get the final dataset, we remove the reference species with their
# entire orders ... # entire orders ...
REForders <- unique(sDat$order[sDat$species %in% REFspecies]) REForders <- unique(sDat$order[sDat$species %in% REFspecies])
sel <- sDat$order %in% REForders sel <- sDat$order %in% REForders
REFdat <- sDat[sel , ] REFdat <- sDat[sel , ]
sDat <- sDat[ ! sel , ] sDat <- sDat[ ! sel , ]
# REFdat should now contain only the REFspecies ... # REFdat should now contain only the REFspecies ...
( REFdat <- REFdat[REFdat$species %in% REFspecies , ] ) ( REFdat <- REFdat[REFdat$species %in% REFspecies , ] )
# ... but all of them # ... but all of them
sum(REFspecies %in% REFdat$species) sum(REFspecies %in% REFdat$species)
# ... and we have enough left in sDat to prune sDat to unique genus # ... and we have enough left in sDat to prune sDat to unique genus
sDat <- sDat[ ! duplicated(sDat$genus) , ] sDat <- sDat[ ! duplicated(sDat$genus) , ]
nrow(sDat) # 84 nrow(sDat) # 84
# I add back "Sporothrix schenckii" ... # I add back "Sporothrix schenckii" ...
sDat <- rbind(SPOSCdat, sDat) sDat <- rbind(SPOSCdat, sDat)
# ... and save for future use. # ... and save for future use.
# saveRDS(sDat, file = "data/sDat.rds") # saveRDS(sDat, file = "data/sDat.rds")
# saveRDS(REFdat, file = "data/REFdat.rds") # saveRDS(REFdat, file = "data/REFdat.rds")
# = 6 STUDENT NUMBERS ===================================================== # = 6 STUDENT NUMBERS =====================================================
# #
# An asymmetric function to retrieve a MYSPE species # An asymmetric function to retrieve a MYSPE species
# #
sDat <- readRDS(file = "data/sDat.rds") sDat <- readRDS(file = "data/sDat.rds")
students <- read.csv("../BCH441-2021-students.csv") students <- read.csv("../BCH441-2021-students.csv")
sN <- students$Integration.ID sN <- students$Integration.ID
sN <- sN[! is.na(sN)] sN <- sN[! is.na(sN)]
sN <- as.character(sN) sN <- as.character(sN)
sN <- c("1003141593", sN) # will map to "Sporothrix schenckii" sN <- c("1003141593", sN) # will map to "Sporothrix schenckii"
set.seed(112358) set.seed(112358)
theseSpecies <- sDat[sample(1:nrow(sDat)), ] theseSpecies <- sDat[sample(1:nrow(sDat)), ]
all(sort(theseSpecies$name) == sort(sDat$name)) all(sort(theseSpecies$name) == sort(sDat$name))
nrow((theseSpecies)) nrow((theseSpecies))
# Locate "Sporothrix schenckii" in the shuffled table and move it to the top,
# so that the first key is the one that maps to it.
(iX <- grep("Sporothrix schenckii", theseSpecies$name))
# Guard against "no match": with iX == integer(0), theseSpecies[-iX, ] would
# select zero rows and the whole table would silently become empty.
stopifnot(length(iX) >= 1)
theseSpecies <- rbind(theseSpecies[iX, ], theseSpecies[-iX, ])
# Build MYSPEdat: a table of N rows in which every row is keyed (via its row
# name) by the md5 hash of a number, and carries one species.
rndMin <- 992000000     # lower bound of the random filler keys
rndMax <- 1020000000    # upper bound of the random filler keys
N <- 10000              # number of rows (keys) in the final table

# Draw more random keys than needed, drop those that collide with one of the
# keys in sN, and keep exactly N.
keys <- as.character(sample(rndMin:rndMax, N + 1000))
keys <- keys[! (keys %in% sN)]
# Fail loudly if too few keys are left; keys[1:N] would otherwise pad with NA.
stopifnot(length(keys) >= N)
keys <- keys[1:N]
# The first length(sN) slots hold the keys in sN, in their original order, so
# that sN[k] is paired with row k of the recycled species table below.
keys[seq_along(sN)] <- sN

# Recycle the rows of theseSpecies until there are N rows. This gives the same
# row sequence as repeatedly rbind()-ing the table and truncating, but without
# copying the growing data frame on every iteration.
MYSPEdat <- theseSpecies[rep_len(seq_len(nrow(theseSpecies)), N), ]

# Use the md5 hash of each key as row name. All hashes are computed first and
# assigned in one step, instead of N separate rownames()<- replacements.
rownames(MYSPEdat) <- vapply(keys,
                             function(key) digest::digest(key, algo = "md5"),
                             character(1),
                             USE.NAMES = FALSE)

# Re-initialize the RNG, then shuffle the rows so that the row order of the
# table reveals nothing about the key order.
set.seed(NULL)
MYSPEdat <- MYSPEdat[sample(seq_len(N)), ]
# saveRDS(MYSPEdat, file = "data/MYSPEdat.rds") # saveRDS(MYSPEdat, file = "data/MYSPEdat.rds")
# === validate # === validate
# Look up the species for every key in sN and collect the results in x.
# Stops at the first key for which getMYSPE() does not return exactly one
# value, and names that key in the error message.
x <- character(length(sN))              # preallocate: one species per key
for (i in seq_along(sN)) {
  n <- sN[i]
  sp <- getMYSPE(n)
  if (length(sp) != 1) {
    stop("getMYSPE() did not return exactly one species for key ", n,
         call. = FALSE)
  }
  x[i] <- sp
}
# === species for late-comers # === species for late-comers
y <- unique(MYSPEdat$species) y <- unique(MYSPEdat$species)
print(y[!(y %in% x)]) print(y[!(y %in% x)])
# === validate # === validate
l <- length(sN) l <- length(sN)
sp <- character(l) sp <- character(l)
for(i in 1:l) { for(i in 1:l) {
sp[i] <- getMYSPE(sN[i]) sp[i] <- getMYSPE(sN[i])
} }
any(duplicated(sp)) any(duplicated(sp))
length(unique(sp)) length(unique(sp))
which(! sDat$species %in% sp) # these can be assigned to late-comers which(! sDat$species %in% sp) # these can be assigned to late-comers
# Done. # Done.
# [END] # [END]

View File

@ -1,168 +1,168 @@
# tocID <- "scripts/ABC-makeSTRINGedges.R" # tocID <- "scripts/ABC-makeSTRINGedges.R"
# #
# Create a subnetwork of high-confidence human STRING edges. # Create a subnetwork of high-confidence human STRING edges.
# #
# Notes: # Notes:
# #
# The large source- datafile is NOT posted to github. If you want to # The large source- datafile is NOT posted to github. If you want to
# experiment with the original data, download it and place it into your # experiment with the original data, download it and place it into your
# local ./data directory. # local ./data directory.
# #
# STRING data source: # STRING data source:
# Download page: # Download page:
# https://string-db.org/cgi/download.pl?species_text=Homo+sapiens # https://string-db.org/cgi/download.pl?species_text=Homo+sapiens
# Data: (127.6 Mb) # Data: (127.6 Mb)
# https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz # https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz
# #
# Version: 1.0 # Version: 1.0
# #
# Date: 2020-09 # Date: 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.0 Rewrite # 1.0 Rewrite
# #
# TODO: # TODO:
# #
# ============================================================================== # ==============================================================================
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> ------------------------------------------------- #TOC> -------------------------------------------------
#TOC> 1 Initialize 44 #TOC> 1 Initialize 44
#TOC> 2 Read STRING Data 51 #TOC> 2 Read STRING Data 51
#TOC> 3 Define cutoff and subset 63 #TOC> 3 Define cutoff and subset 63
#TOC> 4 Drop duplicates 103 #TOC> 4 Drop duplicates 103
#TOC> 5 Simple statistics 127 #TOC> 5 Simple statistics 127
#TOC> 6 Write to file 160 #TOC> 6 Write to file 160
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# = 1 Initialize ========================================================== # = 1 Initialize ==========================================================
if (! requireNamespace("readr", quietly = TRUE)) { if (! requireNamespace("readr", quietly = TRUE)) {
install.packages("readr") install.packages("readr")
} }
# = 2 Read STRING Data ==================================================== # = 2 Read STRING Data ====================================================
# Read STRING Data (needs to be downloaded from database, see URL in Notes) # Read STRING Data (needs to be downloaded from database, see URL in Notes)
# The .gz compressed version is 127.6MB, the uncompressed version is probably # The .gz compressed version is 127.6MB, the uncompressed version is probably
# 848 Mb. Fortunately readr:: can read from compressed # 848 Mb. Fortunately readr:: can read from compressed
# files, and does so automatically, based on the file extension. # files, and does so automatically, based on the file extension.
( fn <- file.path("~", "9606.protein.links.full.v11.0.txt.gz") ) ( fn <- file.path("~", "9606.protein.links.full.v11.0.txt.gz") )
STR <- readr::read_delim(fn, delim = " ") STR <- readr::read_delim(fn, delim = " ")
nrow(STR) # 11,759,454 rows nrow(STR) # 11,759,454 rows
head(STR) head(STR)
# = 3 Define cutoff and subset ============================================ # = 3 Define cutoff and subset ============================================
# approximate distribution of combined_score # approximate distribution of combined_score
hist(sample(STR$combined_score, 10000), breaks = 50, col = "#6699FF") hist(sample(STR$combined_score, 10000), breaks = 50, col = "#6699FF")
# Let's table the counts >= 850 and plot them for better resolution. # Let's table the counts >= 850 and plot them for better resolution.
myTb <- table(STR$combined_score[STR$combined_score >= 850]) myTb <- table(STR$combined_score[STR$combined_score >= 850])
is.unsorted(as.integer(names(myTb))) # Good - they are all in order is.unsorted(as.integer(names(myTb))) # Good - they are all in order
plot(myTb, type = "b", cex = 0.5, col = "#BB0000") plot(myTb, type = "b", cex = 0.5, col = "#BB0000")
myTb[myTb == max(myTb)] # Apparently there is an algorithmic effect that myTb[myTb == max(myTb)] # Apparently there is an algorithmic effect that
# frequently assigns a combined score of 0.900 # frequently assigns a combined score of 0.900
# Let's plot these counts as cumulative sums, in reverse order, scaled # Let's plot these counts as cumulative sums, in reverse order, scaled
# as combined scores. # as combined scores.
# Cumulative number of edges as the score cutoff is lowered from the top.
# The x-values are read from the names of the table (the actual scores)
# instead of being reconstructed as 0.999, 0.998, ...: the reconstruction is
# only correct if every single score value down to 850 occurs in the data.
revTb <- rev(myTb)                         # counts, highest score first
myX <- as.integer(names(revTb)) / 1000     # scores scaled to [0, 1]
cumTb <- cumsum(revTb)                     # cumulative counts, decreasing score

plot(myX,
     cumTb,
     xlim = c(1.0, 0.85),                  # reverse x-axis
     type = "l",
     main = "STRING interactions for 9606 (top 600,000)",
     xlab = "combined_score",
     ylab = "cumulative counts",
     col = "#CC0000")
abline(h = seq(50000, sum(myTb), by = 50000), lwd = 0.5, col = "#DDDDFF")

# What's the cutoff for 100,000 edges? The name of the first element that
# reaches 100,000 is the corresponding combined_score.
which(cumTb >= 100000)[1]   # p = 0.964
# confirm # confirm
sum(STR$combined_score >= 964) # 101,348 sum(STR$combined_score >= 964) # 101,348
abline(v = 0.964, lwd = 0.5, col = "#DDDDFF") abline(v = 0.964, lwd = 0.5, col = "#DDDDFF")
# subset the table, and use only the protein IDs and the combined_score # subset the table, and use only the protein IDs and the combined_score
STR <- STR[STR$combined_score >= 964, STR <- STR[STR$combined_score >= 964,
c("protein1", "protein2", "combined_score")] c("protein1", "protein2", "combined_score")]
colnames(STR) <- c("a", "b", "score") colnames(STR) <- c("a", "b", "score")
# = 4 Drop duplicates ==================================================== # = 4 Drop duplicates ====================================================
# identify duplicate interactions by creating keys in a defined alphabetical # identify duplicate interactions by creating keys in a defined alphabetical
# sort order, then checking for duplicated(). # sort order, then checking for duplicated().
# e.g if we have (X:U, U:X), we change U:X to X:U and now find that # e.g if we have (X:U, U:X), we change U:X to X:U and now find that
# (X:U, X:U) has a duplicate. # (X:U, X:U) has a duplicate.
AB <- STR$a < STR$b # logical vector: genes we need to swap AB <- STR$a < STR$b # logical vector: genes we need to swap
tmp <- STR$b # copy column b tmp <- STR$b # copy column b
STR$b[AB] <- STR$a[AB] # copy a's into b STR$b[AB] <- STR$a[AB] # copy a's into b
STR$a[AB] <- tmp[AB] # copy tmp's into a STR$a[AB] <- tmp[AB] # copy tmp's into a
all(STR$a >= STR$b) # confirm: TRUE all(STR$a >= STR$b) # confirm: TRUE
# now, make combined keys, like this: # now, make combined keys, like this:
paste0(STR$a[1:10], ":", STR$b[1:10]) paste0(STR$a[1:10], ":", STR$b[1:10])
tmp <- paste0(STR$a, ":", STR$b) tmp <- paste0(STR$a, ":", STR$b)
sum(duplicated(tmp)) # That's half of them ... i.e. STRING reports sum(duplicated(tmp)) # That's half of them ... i.e. STRING reports
# both a:b and b:a ! # both a:b and b:a !
# drop all duplicated interactions from tmp # drop all duplicated interactions from tmp
STR <- STR[ ! duplicated(tmp), ] # 50,674 interactions remain STR <- STR[ ! duplicated(tmp), ] # 50,674 interactions remain
# = 5 Simple statistics =================================================== # = 5 Simple statistics ===================================================
# how many unique genes? # how many unique genes?
length(unique(c(STR$a, STR$b))) # 8,445 length(unique(c(STR$a, STR$b))) # 8,445
# how many self-edges? # how many self-edges?
sum(STR$a == STR$b) # none sum(STR$a == STR$b) # none
# log(rank) / log(frequency) # log(rank) / log(frequency)
myTbl <- table(c(STR$a, STR$b)) myTbl <- table(c(STR$a, STR$b))
myTbl <- myTbl[order(myTbl, decreasing = TRUE)] myTbl <- myTbl[order(myTbl, decreasing = TRUE)]
hist(myTbl, breaks = 40, col = "#FFEEBB") hist(myTbl, breaks = 40, col = "#FFEEBB")
# number of singletons # number of singletons
sum(myTbl == 1) # almost a quarter sum(myTbl == 1) # almost a quarter
# maximum? # maximum?
myTbl[which(myTbl == max(myTbl))] # 9606.ENSP00000360532: 465 myTbl[which(myTbl == max(myTbl))] # 9606.ENSP00000360532: 465
# Google: CDC5L # Google: CDC5L
# Zipf-plot # Zipf-plot
plot(log(1:length(myTbl)), log(as.numeric(myTbl)), plot(log(1:length(myTbl)), log(as.numeric(myTbl)),
type = "b", cex = 0.7, type = "b", cex = 0.7,
main = "STRINGedges - degrees", main = "STRINGedges - degrees",
xlab = "log(rank)", xlab = "log(rank)",
ylab = "log(frequency)", ylab = "log(frequency)",
col = "#FFBB88") col = "#FFBB88")
sprintf("Average number of interactions: %5.2f", sprintf("Average number of interactions: %5.2f",
nrow(STR) / length(unique(c(STR$a, STR$b)))) nrow(STR) / length(unique(c(STR$a, STR$b))))
# = 6 Write to file ======================================================= # = 6 Write to file =======================================================
saveRDS(STR, file = "./data/STRINGedges.rds") saveRDS(STR, file = "./data/STRINGedges.rds")
# STRINGedges <- readRDS("./data/STRINGedges.rds") # use this to restore the # STRINGedges <- readRDS("./data/STRINGedges.rds") # use this to restore the
# object when needed # object when needed
# [END] # [END]

View File

@ -1,167 +1,167 @@
# tocID <- "scripts/ABC-makeScCCnet.R" # tocID <- "scripts/ABC-makeScCCnet.R"
# #
# Create a subnetwork of high-confidence yeast genes with a "mitotic cell cycle" # Create a subnetwork of high-confidence yeast genes with a "mitotic cell cycle"
# GOSlim annotation. # GOSlim annotation.
# #
# Boris Steipe for ABC learning units # Boris Steipe for ABC learning units
# #
# Notes: # Notes:
# #
# The large source- datafiles are NOT posted to github. If you want to # The large source- datafiles are NOT posted to github. If you want to
# experiment with your own code, download them and place them into your # experiment with your own code, download them and place them into your
# local ./data directory. # local ./data directory.
# #
# STRING data source: # STRING data source:
# Download page: # Download page:
# https://string-db.org/cgi/download.pl?species_text=Saccharomyces+cerevisiae # https://string-db.org/cgi/download.pl?species_text=Saccharomyces+cerevisiae
# Data: (20.1 mb) # Data: (20.1 mb)
# https://stringdb-static.org/download/protein.links.full.v11.0/4932.protein.links.full.v11.0.txt.gz # https://stringdb-static.org/download/protein.links.full.v11.0/4932.protein.links.full.v11.0.txt.gz
# #
# GOSlim data source: (Note: this has moved from GO to SGD) # GOSlim data source: (Note: this has moved from GO to SGD)
# Info page: https://www.yeastgenome.org/downloads # Info page: https://www.yeastgenome.org/downloads
# Info page: http://sgd-archive.yeastgenome.org/curation/literature/ # Info page: http://sgd-archive.yeastgenome.org/curation/literature/
# Data: (3 mb) # Data: (3 mb)
# http://sgd-archive.yeastgenome.org/curation/literature/go_slim_mapping.tab # http://sgd-archive.yeastgenome.org/curation/literature/go_slim_mapping.tab
# #
# #
# Version: 1.2 # Version: 1.2
# #
# Date: 2017-10 - 2020-09 # Date: 2017-10 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca) # Author: Boris Steipe (boris.steipe@utoronto.ca)
# #
# Versions: # Versions:
# 1.2 2020 Update. GO Slim Yeast now at SGD # 1.2 2020 Update. GO Slim Yeast now at SGD
# 1.1 Change from require() to requireNamespace(), # 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout # use <package>::<function>() idiom throughout
# 1.0 First code copied from 2016 material. # 1.0 First code copied from 2016 material.
# #
# TODO: # TODO:
# #
# ============================================================================== # ==============================================================================
# SRCDIR <- "./instructor" # SRCDIR <- "./instructor"
#TOC> ========================================================================== #TOC> ==========================================================================
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> --------------------------------------------------------------- #TOC> ---------------------------------------------------------------
#TOC> 1 INITIALIZE 58 #TOC> 1 INITIALIZE 58
#TOC> 2 STRING FUNCTIONAL INTERACTION DATA 66 #TOC> 2 STRING FUNCTIONAL INTERACTION DATA 66
#TOC> 3 GOSlim FUNCTIONAL ANNOTATIONS 96 #TOC> 3 GOSlim FUNCTIONAL ANNOTATIONS 96
#TOC> 3.1 Intersect interactions and annotations 122 #TOC> 3.1 Intersect interactions and annotations 122
#TOC> 4 DEFINE THE CELL-CYCLE NETWORK 128 #TOC> 4 DEFINE THE CELL-CYCLE NETWORK 128
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
# =    1  INITIALIZE  ==========================================================

# Directory that holds the downloaded source data files.
SRCDIR <- "./data"

# readr:: functions are used below; install the package if it is missing.
if (! requireNamespace("readr", quietly = TRUE)) {
  install.packages("readr")
}


# =    2  STRING FUNCTIONAL INTERACTION DATA  ==================================

# The STRING data must be downloaded first (see the URL in the Notes).
# The .gz compressed version is 20MB, the uncompressed version is 110MB.
# There is no need to uncompress it: readr:: reads compressed files
# automatically, based on the file extension.
( fn <- file.path(SRCDIR, "4932.protein.links.full.v11.0.txt.gz") )
STR <- readr::read_delim(fn, delim = " ")

# Keep only the two ID columns and the combined_score column.
STR <- STR[ , c("protein1", "protein2", "combined_score")]

# head(STR)
# sum(STR$combined_score > 909)  # 100270 edges

# Keep the (roughly) 100,000 highest-confidence edges.
STR <- STR[STR[["combined_score"]] > 909, ]
head(STR)

# IDs are formatted like 4932.YAL005C ... drop the "4932." prefix. The pattern
# is anchored at the start of the string, so it matches at most once per ID.
STR[["protein1"]] <- sub("^4932\\.", "", STR[["protein1"]])
STR[["protein2"]] <- sub("^4932\\.", "", STR[["protein2"]])
head(STR)

# Vector of all yeast systematic gene names that occur in the edge list.
myIntxGenes <- union(STR[["protein1"]], STR[["protein2"]])
length(myIntxGenes)
sample(myIntxGenes, 10)  # choose 10 at random (sanity check)


# =    3  GOSlim FUNCTIONAL ANNOTATIONS  =======================================
#
# The GOSlim data must be downloaded first (see the URL in the Notes).
( fn <- file.path(SRCDIR, "go_slim_mapping.tab") )
Gsl <- readr::read_tsv(
  fn,
  col_names = c("ID", "name", "SGDId", "Ontology",
                "termName", "termID", "status")
)
head(Gsl)

# Which cell cycle term names does it contain?
myGslTermNames <- unique(Gsl[["termName"]])  # 169 unique terms
grep("cycle", myGslTermNames, value = TRUE)
# [1] "regulation of cell cycle"  "mitotic cell cycle"  "meiotic cell cycle"

# Use "mitotic cell cycle" as the GOslim term to subset with.
scCCgenes <- unique(Gsl[["ID"]][Gsl[["termName"]] == "mitotic cell cycle"])
length(scCCgenes)  # 324 genes annotated to that term


# ==   3.1  Intersect interactions and annotations  ============================

sum(scCCgenes %in% myIntxGenes)  # 307 of these have high-confidence
#                                # functional interactions


# =    4  DEFINE THE CELL-CYCLE NETWORK  =======================================
#
# Define scCCnet ... the S. cerevisiae Cell Cycle network.
# Keep all rows for which BOTH genes are in the GOslim cell cycle set.
#
scCCnet <- STR[(STR[["protein1"]] %in% scCCgenes) &
               (STR[["protein2"]] %in% scCCgenes), ]

# How many genes are there?
length(union(scCCnet[["protein1"]], scCCnet[["protein2"]]))  # 283

# Each edge is listed twice (A-B and B-A) - remove the duplicates.
# Step 1: build one key per row. Sorting the two names puts them into a
#         defined order, and joining them with "." then yields the same string
#         for either orientation: c("A", "B") and c("B", "A") both give "A.B".
#         Rows that share a key are duplicates.
x <- mapply(function(a, b) paste(sort(c(a, b)), collapse = "."),
            scCCnet[["protein1"]],
            scCCnet[["protein2"]],
            USE.NAMES = FALSE)
head(x)  # "YAL016W.YGR040W" "YAL016W.YOR014W" "YAL016W.YDL188C" ... etc.
sum(duplicated(x))  # 1453

# Step 2: drop every row whose key has already been seen.
scCCnet <- scCCnet[! duplicated(x), ]

# Confirm we didn't lose genes.
length(union(scCCnet[["protein1"]], scCCnet[["protein2"]]))  # 283, no change
nrow(scCCnet)

# Network has 283 nodes, 1453 edges

saveRDS(scCCnet, file = "./data/scCCnet.rds")
# scCCnet <- readRDS("./data/scCCnet.rds")  # <<<- use this to restore the
#                                           #      object when needed


# [END]

View File

@ -1,135 +1,135 @@
# tocID <- "scripts/ABC-writeALN.R" # tocID <- "scripts/ABC-writeALN.R"
# #
# ToDo: calculate consensus line # ToDo: calculate consensus line
# append sequence numbers # append sequence numbers
# Notes: # Notes:
# #
# ============================================================================== # ==============================================================================
writeALN <- function(ali,
                     range,
                     note = "",
                     myCon = stdout(),
                     blockWidth = 60) {
  # Purpose:
  #     Write an MsaAAMultipleAlignment (or DNA/RNA variant), an AAStringSet
  #     (or DNA/RNA variant), or a named character vector of sequences to
  #     stdout() or a file in CLUSTAL W format.
  # Version:   2.0
  # Date:      2017 10
  # Author:    Boris Steipe
  #
  # Parameters:
  #     ali         Msa*MultipleAlignment, *StringSet, or character vector.
  #                     Names are used as sequence labels.
  #     range       num  a two-integer vector of start and end positions if
  #                        only a range of the MSA should be written, e.g.
  #                        a domain. Defaults to the full alignment length.
  #                        Sequences shorter than the end position are
  #                        right-padded with "-".
  #     note        chr  text that is appended to the "CLUSTAL W format. "
  #                        header line. Default "".
  #     myCon       a connection (cf. the con argument for writeLines).
  #                   Defaults to stdout()
  #     blockWidth  int  width of sequence block. Default 60 characters. A
  #                        warning is issued for larger values.
  # Value:
  #     NULL (invisible)  the function is invoked for its side effect of
  #                       printing an alignment to stdout() or file.

  blockWidth <- as.integer(blockWidth)
  if (length(blockWidth) != 1L || is.na(blockWidth)) {
    stop("PANIC: parameter \"blockWidth\" must be numeric.")
  }
  if (blockWidth < 1) {
    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
  }
  if (blockWidth > 60) {
    warning("Programs that read CLUSTAL format might not expect blockWidth > 60.")
  }

  # Extract the raw data from the objects depending on their respective class
  # and put it into a named vector of strings. inherits() is used rather than
  # class(ali) == "..." because class() can return more than one element, and
  # a condition of length > 1 is an error in if().

  # Extract XStringSet from MsaXMultipleAlignment ...
  if (inherits(ali, c("MsaAAMultipleAlignment",
                      "MsaDNAMultipleAlignment",
                      "MsaRNAMultipleAlignment"))) {
    ali <- ali@unmasked
  }

  # Process XStringSet
  if (inherits(ali, c("AAStringSet", "DNAStringSet", "RNAStringSet"))) {
    sSet <- as.character(ali)  # we use as.character(), not toString() thus
                               # we don't _have_ to load Biostrings
  } else if (inherits(ali, "character")) {
    sSet <- ali
  } else {
    stop(paste("Input object of class",
               paste(class(ali), collapse = "/"),
               "can't be handled by this function."))
  }

  if (length(sSet) == 0L) {
    stop("PANIC: input contains no sequences.")
  }

  if (missing(range)) {
    range <- c(1L, max(nchar(sSet)))
    if (range[2] < range[1]) {
      stop("PANIC: all input sequences are empty.")
    }
  } else {
    range <- as.integer(range)
    if (length(range) != 2 ||
        any(is.na(range))  ||
        range[1] > range[2] ||
        range[1] < 1) {
      stop("PANIC: \"range\" parameter must contain valid start and end index.")
    }
  }

  # Right-pad any sequence with "-" that is shorter than range[2].
  # Assigning into sSet[] keeps the names of the vector.
  nPad <- pmax(range[2] - nchar(sSet), 0L)
  sSet[] <- paste0(sSet, strrep("-", nPad))

  # Right-pad sequence names to the longest name plus two spaces. Unnamed
  # input gets empty labels.
  sNames <- names(sSet)
  if (is.null(sNames)) {
    sNames <- character(length(sSet))
  } else {
    len <- max(nchar(sNames)) + 2L
    sNames <- paste0(sNames, strrep(" ", len - nchar(sNames)))
  }

  # Header line, followed by a blank line
  header <- paste0("CLUSTAL W format. ", note)
  header[2] <- ""

  # One block per blockWidth columns: one line per sequence, then a blank
  # consensus line and a blank separator line.
  iStarts <- seq(range[1], range[2], by = blockWidth)
  iEnds <- c(iStarts[-1] - 1L, range[2])
  blocks <- lapply(seq_along(iStarts), function(i) {
    c(paste0(sNames, substring(sSet, iStarts[i], iEnds[i])), "", "")
  })

  txt <- c(header, unlist(blocks, use.names = FALSE))
  writeLines(txt, con = myCon)
}

# ====  TESTS  =================================================================
# Enter your function tests here...
if (FALSE) {
  # Two blocks of five columns; the shorter sequence is padded with "-".
  testSeq <- c(seqA = "ACDEFGHIKL", seqB = "ACDEF")
  writeALN(testSeq, blockWidth = 5)

  # Only columns 2 to 4, with a note in the header line.
  writeALN(testSeq, range = c(2, 4), note = "test domain")
}
# [END] # [END]

View File

@ -1,121 +1,121 @@
# ABC-writeMFA.R # ABC-writeMFA.R
# #
# ToDo: # ToDo:
# Notes: 2.1 bugfix: empty notes caused superfluous blank after header. # Notes: 2.1 bugfix: empty notes caused superfluous blank after header.
# #
# #
# ============================================================================== # ==============================================================================
writeMFA <- function(ali,
                     range,
                     note = "",
                     myCon = stdout(),
                     blockWidth = 80) {
  # Purpose:
  #     Write an MsaAAMultipleAlignment (or DNA/RNA variant), an AAStringSet
  #     (or DNA/RNA variant), or a named character vector of sequences to
  #     stdout() or a file in multi-FASTA format.
  # Version:   2.1
  # Date:      2017 10
  # Author:    Boris Steipe
  #
  # Parameters:
  #     ali         Msa*MultipleAlignment, *StringSet, or character vector.
  #                     Names are used for the FASTA headers.
  #     range       num  a two-integer vector of start and end positions if
  #                        only a range of the MSA should be written, e.g.
  #                        a domain. Defaults to the full sequence length.
  #     note        chr  a vector of character that is appended to the name
  #                        of a sequence in the FASTA header. Recycling of
  #                        shorter vectors applies, thus a vector of length one
  #                        is added to all headers. Empty notes add nothing.
  #     myCon       a connection (cf. the con argument for writeLines).
  #                   Defaults to stdout()
  #     blockWidth  int  width of sequence block. Default 80 characters.
  # Value:
  #     NULL (invisible)  the function is invoked for its side effect of
  #                       printing an alignment to stdout() or file.

  blockWidth <- as.integer(blockWidth)
  if (length(blockWidth) != 1L || is.na(blockWidth)) {
    stop("PANIC: parameter \"blockWidth\" must be numeric.")
  }
  if (! blockWidth > 0){
    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
  }

  # Extract the raw data from the objects depending on their respective class
  # and put it into a named vector of strings. inherits() is used rather than
  # class(ali) == "..." because class() can return more than one element, and
  # a condition of length > 1 is an error in if().

  # Extract XStringSet from MsaXMultipleAlignment ...
  if (inherits(ali, c("MsaAAMultipleAlignment",
                      "MsaDNAMultipleAlignment",
                      "MsaRNAMultipleAlignment"))) {
    ali <- ali@unmasked
  }

  # Process XStringSet
  if (inherits(ali, c("AAStringSet", "DNAStringSet", "RNAStringSet"))) {
    sSet <- as.character(ali)  # we use as.character(), not toString() thus
                               # we don't _have_ to load Biostrings
  } else if (inherits(ali, "character")) {
    sSet <- ali
  } else {
    stop(paste("Input object of class",
               paste(class(ali), collapse = "/"),
               "can't be handled by this function."))
  }

  if (missing(range)) {
    # An empty input, or only empty sequences, gives an end position of 0
    range <- c(1L, if (length(sSet) > 0L) max(nchar(sSet)) else 0L)
  } else {
    range <- as.integer(range)
    if (length(range) != 2 ||
        any(is.na(range))  ||
        range[1] > range[2] ||
        range[1] < 1) {
      stop("PANIC: \"range\" parameter must contain valid start and end index.")
    }
  }

  # Construct the header lines: the note is recycled over all sequences and
  # appended to a name only where it is not empty, so that an empty note
  # does not leave a superfluous blank after the name.
  headers <- names(sSet)
  if (is.null(headers)) {
    headers <- character(length(sSet))
  }
  if (length(note) == 0L) {
    note <- ""
  }
  note <- rep_len(as.character(note), length(sSet))
  hasNote <- ! is.na(note) & note != ""
  headers[hasNote] <- paste(headers[hasNote], note[hasNote])

  # Block boundaries are the same for every sequence. If there is nothing to
  # write (range[2] < range[1]), a single empty block results and is dropped
  # below.
  if (range[2] >= range[1]) {
    iStarts <- seq(range[1], range[2], by = blockWidth)
  } else {
    iStarts <- range[1]
  }
  iEnds <- c(iStarts[-1] - 1L, range[2])

  # Process each sequence: FASTA header, the sequence in blocks of blockWidth
  # per line, and an empty line for readability.
  records <- lapply(seq_along(sSet), function(i) {
    thisSeq <- substring(sSet[i], iStarts, iEnds)  # collect all blocks
    thisSeq <- thisSeq[nchar(thisSeq) > 0]         # drop empty blocks
    c(sprintf(">%s", headers[i]), thisSeq, "")
  })

  txt <- as.character(unlist(records, use.names = FALSE))
  writeLines(txt, con = myCon)
}

# ====  TESTS  =================================================================
# Enter your function tests here...
if (FALSE) {
  # Blocks of five characters per line, no note.
  testSeq <- c(seqA = "ACDEFGHIKL", seqB = "ACDEF")
  writeMFA(testSeq, blockWidth = 5)

  # One note per sequence; the empty note leaves the second header unchanged.
  writeMFA(testSeq, note = c("first domain", ""))
}
# [END] # [END]

View File

@ -1,384 +1,384 @@
# BLAST.R # BLAST.R
# #
# Purpose: Send off one BLAST search and return parsed list of results # Purpose: Send off one BLAST search and return parsed list of results
# This script uses the BLAST URL-API # This script uses the BLAST URL-API
# (Application Programming Interface) at the NCBI. # (Application Programming Interface) at the NCBI.
# Read about the constraints here: # Read about the constraints here:
# https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo # https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
# #
# #
# Version: 3.2 # Version: 3.2
# Date: 2016 09 - 2020 09 # Date: 2016 09 - 2020 09
# Author: Boris Steipe # Author: Boris Steipe
# #
# Versions: # Versions:
# 3.2 2020 updates # 3.2 2020 updates
# 3.1 Change from require() to requireNamespace(), # 3.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout # use <package>::<function>() idiom throughout
# 3.0 parsing logic had not been fully implemented; Fixed. # 3.0 parsing logic had not been fully implemented; Fixed.
# 2.1 bugfix in BLAST(), bug was blanking non-split deflines; # 2.1 bugfix in BLAST(), bug was blanking non-split deflines;
# refactored parseBLASTalignment() to handle lists with multiple hits. # refactored parseBLASTalignment() to handle lists with multiple hits.
# 2.0 Completely rewritten because the interface completely changed. # 2.0 Completely rewritten because the interface completely changed.
# Code adapted in part from NCBI Perl sample code: # Code adapted in part from NCBI Perl sample code:
# $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $ # $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
# 1.0 first version posted for BCH441 2016, based on BLAST - API # 1.0 first version posted for BCH441 2016, based on BLAST - API
# #
# ToDo: Return the organism/strain name in the output, and propagate # ToDo: Return the organism/strain name in the output, and propagate
# into MYSPE selection script. # into MYSPE selection script.
# #
# Notes: This is somewhat pedestrian, but apparently there are currently # Notes: This is somewhat pedestrian, but apparently there are currently
# no R packages that contain such code. # no R packages that contain such code.
# #
# ============================================================================== # ==============================================================================
if (! requireNamespace("httr", quietly = TRUE)) { if (! requireNamespace("httr", quietly = TRUE)) {
install.packages("httr") install.packages("httr")
} }
BLAST <- function(Q, BLAST <- function(Q,
db = "refseq_protein", db = "refseq_protein",
nHits = 30, nHits = 30,
E = 0.1, E = 0.1,
limits = "", limits = "",
rid = "", rid = "",
query = "", query = "",
quietly = FALSE, quietly = FALSE,
myTimeout = 120) { myTimeout = 120) {
# Purpose: # Purpose:
# Basic BLAST search # Basic BLAST search
# #
# Parameters: # Parameters:
# Q: query - either a valid ID or a sequence # Q: query - either a valid ID or a sequence
# db: "refseq_protein" by default, # db: "refseq_protein" by default,
# other legal values include: "nr", "pdb", "swissprot" ... # other legal values include: "nr", "pdb", "swissprot" ...
# nHits: number of hits to maximally return # nHits: number of hits to maximally return
# E: E-value cutoff. Do not return hits whose score would be expected # E: E-value cutoff. Do not return hits whose score would be expected
# to occur E or more times in a database of random sequence. # to occur E or more times in a database of random sequence.
# limits: a valid ENTREZ filter # limits: a valid ENTREZ filter
# rid: a request ID - to retrieve earlier search results # rid: a request ID - to retrieve earlier search results
# query: the actual query string (needed when retrieving results # query: the actual query string (needed when retrieving results
# with an rid) # with an rid)
# quietly: controls printing of wait-time progress bar # quietly: controls printing of wait-time progress bar
# timeout: how much longer _after_ rtoe to wait for a result # timeout: how much longer _after_ rtoe to wait for a result
# before giving up (seconds) # before giving up (seconds)
# Value: # Value:
# result: list of process status or resulting hits, and some metadata # result: list of process status or resulting hits, and some metadata
EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done
results <- list() results <- list()
results$query = query results$query = query
results$rid <- rid results$rid <- rid
results$rtoe <- 0 results$rtoe <- 0
if (rid == "") { # If no rid is available, spawn a search. if (rid == "") { # If no rid is available, spawn a search.
# Else, proceed directly to retrieval. # Else, proceed directly to retrieval.
# prepare query, GET(), and parse rid and rtoe from BLAST server response # prepare query, GET(), and parse rid and rtoe from BLAST server response
results$query <- paste0("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi", results$query <- paste0("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
"?", "?",
"CMD=Put", "CMD=Put",
"&PROGRAM=", "blastp", "&PROGRAM=", "blastp",
"&QUERY=", URLencode(Q), "&QUERY=", URLencode(Q),
"&DATABASE=", db, "&DATABASE=", db,
"&MATRIX=", "BLOSUM62", "&MATRIX=", "BLOSUM62",
"&EXPECT=", as.character(E), "&EXPECT=", as.character(E),
"&HITLIST_SIZE=", as.character(nHits), "&HITLIST_SIZE=", as.character(nHits),
"&ALIGNMENTS=", as.character(nHits), "&ALIGNMENTS=", as.character(nHits),
"&FORMAT_TYPE=Text") "&FORMAT_TYPE=Text")
if (limits != "") { if (limits != "") {
results$query <- paste0( results$query <- paste0(
results$query, results$query,
"&ENTREZ_QUERY=", limits) "&ENTREZ_QUERY=", limits)
} }
# send it off ... # send it off ...
response <- httr::GET(results$query) response <- httr::GET(results$query)
if (httr::http_status(response)$category != "Success" ) { if (httr::http_status(response)$category != "Success" ) {
stop(sprintf("PANIC: Can't send query. BLAST server status error: %s", stop(sprintf("PANIC: Can't send query. BLAST server status error: %s",
httr::http_status(response)$message)) httr::http_status(response)$message))
} }
txt <- httr::content(response, "text", encoding = "UTF-8") txt <- httr::content(response, "text", encoding = "UTF-8")
patt <- "RID = (\\w+)" # match the request id patt <- "RID = (\\w+)" # match the request id
results$rid <- regmatches(txt, regexec(patt, txt))[[1]][2] results$rid <- regmatches(txt, regexec(patt, txt))[[1]][2]
patt <- "RTOE = (\\d+)" # match the expected completion time patt <- "RTOE = (\\d+)" # match the expected completion time
results$rtoe <- as.numeric(regmatches(txt, regexec(patt, txt))[[1]][2]) results$rtoe <- as.numeric(regmatches(txt, regexec(patt, txt))[[1]][2])
# Now we wait ... # Now we wait ...
if (quietly) { if (quietly) {
Sys.sleep(results$rtoe) Sys.sleep(results$rtoe)
} else { } else {
cat(sprintf("BLAST is processing %s:\n", results$rid)) cat(sprintf("BLAST is processing %s:\n", results$rid))
waitTimer(results$rtoe) waitTimer(results$rtoe)
} }
} # done sending query and retrieving rid, rtoe } # done sending query and retrieving rid, rtoe
# Enter an infinite loop to check for result availability # Enter an infinite loop to check for result availability
checkStatus <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi", checkStatus <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
"?", "?",
"CMD=Get", "CMD=Get",
"&RID=", results$rid, "&RID=", results$rid,
"&FORMAT_TYPE=Text", "&FORMAT_TYPE=Text",
"&FORMAT_OBJECT=SearchInfo", "&FORMAT_OBJECT=SearchInfo",
sep = "") sep = "")
while (TRUE) { while (TRUE) {
# Check whether the result is ready # Check whether the result is ready
response <- httr::GET(checkStatus) response <- httr::GET(checkStatus)
if (httr::http_status(response)$category != "Success" ) { if (httr::http_status(response)$category != "Success" ) {
stop(sprintf("PANIC: Can't check status. BLAST server status error: %s", stop(sprintf("PANIC: Can't check status. BLAST server status error: %s",
httr::http_status(response)$message)) httr::http_status(response)$message))
} }
txt <- httr::content(response, "text", encoding = "UTF-8") txt <- httr::content(response, "text", encoding = "UTF-8")
if (length(grep("Status=WAITING", txt)) > 0) { if (length(grep("Status=WAITING", txt)) > 0) {
myTimeout <- myTimeout - EXTRAWAIT myTimeout <- myTimeout - EXTRAWAIT
if (myTimeout <= 0) { # abort if (myTimeout <= 0) { # abort
cat("BLAST search not concluded before timeout. Aborting.\n") cat("BLAST search not concluded before timeout. Aborting.\n")
cat(sprintf("%s BLASThits <- BLAST(rid=\"%s\")\n", cat(sprintf("%s BLASThits <- BLAST(rid=\"%s\")\n",
"Trying checking back later with >", "Trying checking back later with >",
results$rid)) results$rid))
return(results) return(results)
} }
if (quietly) { if (quietly) {
Sys.sleep(EXTRAWAIT) Sys.sleep(EXTRAWAIT)
} else { } else {
cat(sprintf("Status: Waiting. Wait %d more seconds (max. %d more)", cat(sprintf("Status: Waiting. Wait %d more seconds (max. %d more)",
EXTRAWAIT, EXTRAWAIT,
myTimeout)) myTimeout))
waitTimer(EXTRAWAIT) waitTimer(EXTRAWAIT)
next next
} }
} else if (length(grep("Status=FAILED", txt)) > 0) { } else if (length(grep("Status=FAILED", txt)) > 0) {
cat("BLAST search returned status \"FAILED\". Aborting.\n") cat("BLAST search returned status \"FAILED\". Aborting.\n")
return(results) return(results)
} else if (length(grep("Status=UNKNOWN", txt)) > 0) { } else if (length(grep("Status=UNKNOWN", txt)) > 0) {
cat("BLAST search returned status \"UNKNOWN\".\n") cat("BLAST search returned status \"UNKNOWN\".\n")
cat("This probably means the rid has expired. Aborting.\n") cat("This probably means the rid has expired. Aborting.\n")
return(results) return(results)
} else if (length(grep("Status=READY", txt)) > 0) { # Done } else if (length(grep("Status=READY", txt)) > 0) { # Done
if (length(grep("ThereAreHits=yes", txt)) == 0) { # No hits if (length(grep("ThereAreHits=yes", txt)) == 0) { # No hits
cat("BLAST search ready but no hits found. Aborting.\n") cat("BLAST search ready but no hits found. Aborting.\n")
return(results) return(results)
} else { } else {
break # done ... retrieve search result break # done ... retrieve search result
} }
} }
} # end result-check loop } # end result-check loop
# retrieve results from BLAST server # retrieve results from BLAST server
retrieve <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi", retrieve <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
"?", "?",
"&CMD=Get", "&CMD=Get",
"&RID=", results$rid, "&RID=", results$rid,
"&FORMAT_TYPE=Text", "&FORMAT_TYPE=Text",
sep = "") sep = "")
response <- httr::GET(retrieve) response <- httr::GET(retrieve)
if (httr::http_status(response)$category != "Success" ) { if (httr::http_status(response)$category != "Success" ) {
stop(sprintf("PANIC: Can't retrieve. BLAST server status error: %s", stop(sprintf("PANIC: Can't retrieve. BLAST server status error: %s",
httr::http_status(response)$message)) httr::http_status(response)$message))
} }
txt <- httr::content(response, "text", encoding = "UTF-8") txt <- httr::content(response, "text", encoding = "UTF-8")
# txt contains the whole set of results. Process: # txt contains the whole set of results. Process:
# First, we strsplit() on linebreaks: # First, we strsplit() on linebreaks:
txt <- unlist(strsplit(txt, "\n")) txt <- unlist(strsplit(txt, "\n"))
# The alignments range from the first line that begins with ">" ... # The alignments range from the first line that begins with ">" ...
iFirst <- grep("^>", txt)[1] iFirst <- grep("^>", txt)[1]
# ... to the last line that begins with "Sbjct" # ... to the last line that begins with "Sbjct"
x <- grep("^Sbjct", txt) x <- grep("^Sbjct", txt)
iLast <- x[length(x)] iLast <- x[length(x)]
# Get the alignments block # Get the alignments block
txt <- txt[iFirst:iLast] txt <- txt[iFirst:iLast]
# Drop empty lines # Drop empty lines
txt <- txt[!(nchar(txt) == 0)] txt <- txt[!(nchar(txt) == 0)]
# A line that ends "]" but does not begin ">" seems to be a split # A line that ends "]" but does not begin ">" seems to be a split
# defline ... eg. # defline ... eg.
# [1] ">XP_013349208.1 AUEXF2481DRAFT_695809 [Aureobasidium subglaciale " # [1] ">XP_013349208.1 AUEXF2481DRAFT_695809 [Aureobasidium subglaciale "
# [2] "EXF-2481]" # [2] "EXF-2481]"
# Merge these lines to the preceding lines and delete them. # Merge these lines to the preceding lines and delete them.
# #
x <- which(grepl("]$", txt) & !(grepl("^>", txt))) x <- which(grepl("]$", txt) & !(grepl("^>", txt)))
if (length(x) > 0) { if (length(x) > 0) {
txt[x-1] <- paste0(txt[x-1], txt[x]) txt[x-1] <- paste0(txt[x-1], txt[x])
txt <- txt[-x] txt <- txt[-x]
} }
# Special case: there may be multiple deflines when the BLAST hit is to # Special case: there may be multiple deflines when the BLAST hit is to
# redundant, identical sequences. Keep only the first instance. # redundant, identical sequences. Keep only the first instance.
iKeep <- ! grepl("^>", txt) iKeep <- ! grepl("^>", txt)
x <- rle(iKeep) x <- rle(iKeep)
x$positions <- cumsum(x$lengths) x$positions <- cumsum(x$lengths)
i <- which(x$lengths > 1 & x$values == FALSE) i <- which(x$lengths > 1 & x$values == FALSE)
if (length(i) > 0) { if (length(i) > 0) {
firsts <- x$positions[i] - x$lengths[i] + 1 firsts <- x$positions[i] - x$lengths[i] + 1
iKeep[firsts] <- TRUE iKeep[firsts] <- TRUE
txt <- txt[iKeep] txt <- txt[iKeep]
} }
# After this preprocessing the following should be true: # After this preprocessing the following should be true:
# - Every alignment block begins with a defline in which the # - Every alignment block begins with a defline in which the
# first character is ">" # first character is ">"
# - There is only one defline in each block. # - There is only one defline in each block.
# - Lines are not split. # - Lines are not split.
# Make a dataframe of first and last indices of alignment blocks # Make a dataframe of first and last indices of alignment blocks
x <- grep("^>", txt) x <- grep("^>", txt)
blocks <- data.frame(iFirst = x, blocks <- data.frame(iFirst = x,
iLast = c((x[-1] - 1), length(txt))) iLast = c((x[-1] - 1), length(txt)))
# Build the hits list by parsing the blocks # Build the hits list by parsing the blocks
results$hits <- list() results$hits <- list()
for (i in seq_len(nrow(blocks))) { for (i in seq_len(nrow(blocks))) {
thisBlock <- txt[blocks$iFirst[i]:blocks$iLast[i]] thisBlock <- txt[blocks$iFirst[i]:blocks$iLast[i]]
results$hits[[i]] <- parseBLASTalignment(thisBlock) results$hits[[i]] <- parseBLASTalignment(thisBlock)
} }
return(results) return(results)
} }
parseBLASTalignment <- function(hit) {
  # Parse data from a character vector containing a single BLAST hit, i.e.
  # one alignment block of a text-format BLAST report.
  #
  # Parameters:
  #   hit  char   one BLAST hit as char vector. The first element is taken
  #               as the defline; the remaining elements are searched for
  #               the statistics lines ("Expect =", "Length=", "Identities =",
  #               "Gaps =") and for the alignment rows that start with
  #               "Query ", each followed by a midline and a "Sbjct" row.
  # Value:
  #   list  $def          chr  defline
  #         $accession    chr  accession number
  #         $organism     chr  complete organism definition
  #         $species      chr  binomial species
  #         $E            num  E value
  #         $lengthAli    num  value of the "Length=" line
  #         $nIdentities  num  number of identities
  #         $nGaps        num  number of gaps
  #         $Qbounds      num  2-element vector of query start-end
  #         $Sbounds      num  2-element vector of subject start-end
  #         $Qseq         chr  query sequence
  #         $midSeq       chr  midline string
  #         $Sseq         chr  subject sequence
  #
  # Tokens that can not be found in hit are returned as NA. If hit contains
  # no "Query " rows, the three sequence strings are empty ("").
  #
  # NOTE(review): $lengthAli is read from the "Length=" line; in NCBI text
  # reports that line appears to state the length of the subject sequence
  # rather than the length of the alignment - confirm before relying on it.
  # NOTE(review): all "Query " rows of the block are concatenated; a hit
  # with more than one HSP would presumably be merged into one sequence.

  getToken <- function(patt, v) {
    # Return the first capture group of patt, taken from the first element
    # of character vector v that matches patt; NA if nothing matches.
    v <- v[grep(patt, v)]
    if (length(v) == 0) {
      return(NA)
    }
    regmatches(v[1], regexec(patt, v[1]))[[1]][2]
  }

  h <- list()

  # FASTA defline
  h$def <- hit[1]

  # accession number (ID), use the first if there are several, separated
  # by "|"
  patt <- "^>(.+?)(\\s|\\|)"   # from ">" to space or "|"
  h$accession <- regmatches(h$def, regexec(patt, h$def))[[1]][2]

  # organism: the text between square brackets in the defline
  patt <- "\\[(.+)]"
  h$organism <- regmatches(h$def, regexec(patt, h$def))[[1]][2]

  # species: first two words of the organism definition. A missing organism
  # must be handled first: strsplit(NA, ...) returns a length-1 NA vector,
  # which would otherwise be turned into the string "NA sp.".
  if (is.na(h$organism)) {
    h$species <- NA_character_
  } else {
    x <- unlist(strsplit(h$organism, "\\s+"))
    if (length(x) >= 2) {
      h$species <- paste(x[1], x[2])
    } else if (length(x) == 1) {
      h$species <- paste(x[1], "sp.")
    } else {
      h$species <- NA_character_
    }
  }

  # E-value
  h$E <- as.numeric(getToken("Expect\\s*=(.+?), Method", hit))

  # value of the "Length=" line
  h$lengthAli <- as.numeric(getToken("^\\s*Length\\s*=(.+)$", hit))

  # number of identities
  h$nIdentities <- as.numeric(getToken("^\\s*Identities\\s*=(.+?)/", hit))

  # number of gaps
  h$nGaps <- as.numeric(getToken("\\s*Gaps\\s*=(.+?)/", hit))

  # split up alignment section: every "Query " row is followed by its
  # midline and its "Sbjct" row
  idx <- grep("^Query ", hit)
  Que <- hit[idx]
  Mid <- hit[idx + 1]
  Sbj <- hit[idx + 2]

  # first and last positions: the start is the number that follows the row
  # label in the first row, the end is the trailing number of the last row
  h$Qbounds <- c(start = 0, end = 0)
  h$Qbounds[1] <- as.numeric(getToken("^Query\\s*(\\d+)", Que[1]))
  h$Qbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Que[length(Que)]))

  h$Sbounds <- c(start = 0, end = 0)
  h$Sbounds[1] <- as.numeric(getToken("^Sbjct\\s*(\\d+)", Sbj[1]))
  h$Sbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Sbj[length(Sbj)]))

  # aligned sequences: locate the columns of the aligned string in each
  # "Query" row and cut the same columns out of the midline and the "Sbjct"
  # row, so that the three strings stay in register
  patt <- "^\\s*Query\\s*\\d+\\s*([A-Za-z-]+)"   # capture aligned string
  for (i in seq_along(Que)) {
    m <- regexec(patt, Que[i])
    iFirst <- m[[1]][2]
    iLast <- iFirst + attr(m[[1]], which = "match.length")[2] - 1
    Que[i] <- substring(Que[i], iFirst, iLast)
    Mid[i] <- substring(Mid[i], iFirst, iLast)
    Sbj[i] <- substring(Sbj[i], iFirst, iLast)
  }

  h$Qseq   <- paste0(Que, collapse = "")
  h$midSeq <- paste0(Mid, collapse = "")
  h$Sseq   <- paste0(Sbj, collapse = "")

  return(h)
}
# ==== TESTS =================================================================== # ==== TESTS ===================================================================
if (FALSE) {
  # Example session. The guard keeps this code from running when the file
  # is sourced; execute the lines by hand to try out BLAST().

  # Query, variant 1: a literal amino acid sequence, assembled from three
  # fragments ...
  q <- paste0("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI",    # Mbp1 APSES domain
              "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
              "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP")

  # ... variant 2: a sequence identifier (this overwrites variant 1).
  q <- "NP_010227"    # refseq ID

  # Run the search, restricted to fungi.
  fungi <- "txid4751[ORGN]"    # Fungi
  test <- BLAST(q, nHits = 100, E = 0.001, rid = "", limits = fungi)

  # Inspect the structure of the result and count the parsed hits.
  str(test)
  length(test$hits)
}
# [END] # [END]

View File

@ -1,32 +1,32 @@
# test_biCode.R # test_biCode.R
# #
# Label for the group of tests in this file.
context("biCode() utility function tests")

# Well-formed names: the expectations pin an upper-case, five-character
# code; input is accepted as a single string or as a character vector.
test_that("expected input is processed correctly", {
  expect_equal(biCode("homo sapiens"), "HOMSA")
  expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA")

  marsupials <- c("Phascolarctos cinereus", "Macropus rufus")
  expect_equal(biCode(marsupials), c("PHACI", "MACRU"))
})

# Input that does not supply enough letters: missing positions are
# expected to be filled with ".".
test_that("unexpected input is managed", {
  allDots <- "....."
  expect_equal(biCode(""), allDots)
  expect_equal(biCode(" "), allDots)
  expect_equal(biCode("123 12"), allDots)
  expect_equal(biCode("h sapiens"), "H..SA")
})

# NA goes in, NA comes out - both for a lone NA and inside a vector.
test_that("NA values are preserved", {
  expect_true(is.na(biCode(NA)))

  withMissing <- c("first", NA, "last")
  expect_equal(biCode(withMissing), c("FIRST", NA, "LAST."))
})

# Calling biCode() without its argument must fail with R's standard
# "missing argument" message.
test_that("Missing argument throws an error", {
  expect_error(biCode(), "argument \"s\" is missing, with no default")
})
# [END] # [END]